Example #1
0
/*
 * Reset the IOAT channel described by fipe_ioat_ctrl and then mark the
 * channel as "not in use".  Does not return until the status register
 * reports that the reset has completed.
 */
static void
fipe_ioat_cancel(void)
{
	ddi_acc_handle_t handle = fipe_ioat_ctrl.ioat_reg_handle;
	uint8_t *addr = fipe_ioat_ctrl.ioat_reg_addr;
	uint32_t status;

	/*
	 * A single reset is sometimes not reliable, so the reset command
	 * is reissued on every pass until bit 0 of the status is set.
	 */
	for (;;) {
		/* Issue reset channel command. */
		ddi_put8(handle, (uint8_t *)(addr + FIPE_IOAT_CHAN_CMD), 0x20);

		/* Query command status. */
		status = ddi_get32(handle,
		    (uint32_t *)(addr + FIPE_IOAT_CHAN_STS_LO));
		if ((status & 0x1) != 0)
			break;		/* Reset channel completed. */

		SMT_PAUSE();
	}

	/* Put channel into "not in use" state. */
	ddi_put16(handle, (uint16_t *)(addr + FIPE_IOAT_CHAN_CTRL), 0);
}
Example #2
0
/*
 * Disable power saving.  Spins for fipe_gbl_ctrl.lock; if power saving is
 * seen to be inactive while the lock is unavailable, returns at once.
 * Otherwise, with the lock held, clears pm_active and calls
 * fipe_mc_restore() and fipe_ioat_cancel() if power saving was active.
 */
static void
fipe_disable(void)
{
	/*
	 * Taking the lock also implicitly has the same effect as calling
	 * membar_sync().
	 */
	for (;;) {
		if (mutex_tryenter(&fipe_gbl_ctrl.lock) != 0)
			break;

		/*
		 * Power saving is inactive: just return, all dirty
		 * house-keeping work will be handled in fipe_enable().
		 */
		if (fipe_gbl_ctrl.pm_active == B_FALSE)
			return;

		(void) SMT_PAUSE();
	}

	/* Nothing to undo if power saving is not active. */
	if (!fipe_gbl_ctrl.pm_active) {
		mutex_exit(&fipe_gbl_ctrl.lock);
		return;
	}

	/*
	 * Clear pm_active before doing anything else so that other CPUs
	 * stop waiting on the flag as early as possible.
	 */
	fipe_gbl_ctrl.pm_active = B_FALSE;
	membar_producer();
	fipe_mc_restore();
	fipe_ioat_cancel();

	mutex_exit(&fipe_gbl_ctrl.lock);
}
Example #3
0
/*
 * Spin until lock_spin_try() succeeds on 'lp'.  While the lock is observed
 * held the priority level is dropped back to 'old_pil'; it is raised to
 * 'new_pil' again before each acquisition attempt.
 *
 *	lp		lock to acquire
 *	new_pil		priority level to raise to (asserted > LOCK_LEVEL)
 *	old_pil_addr	receives the priority level that splr() reported
 *			when the lock was finally taken
 *	old_pil		priority level to spin at while waiting
 *
 * Returns without acquiring the lock if panicstr is set, either on entry
 * or while spinning.  Panics if the lock is contended with only one CPU.
 */
void
lock_set_spl_spin(lock_t *lp, int new_pil, ushort_t *old_pil_addr, int old_pil)
{
	int spin_count = 1;
	int backoff;	/* current backoff */
	int backctr;	/* ctr for backoff */

	/* Do not spin for locks once the system is panicking. */
	if (panicstr)
		return;

	/* With a single CPU nobody else could ever release the lock. */
	if (ncpus == 1)
		panic("lock_set_spl: %p lock held and only one CPU", lp);

	ASSERT(new_pil > LOCK_LEVEL);

	/*
	 * If plat_lock_delay has an address (presumably a weak symbol that
	 * a platform may or may not provide), it owns the backoff state and
	 * starts from 0; otherwise use the generic exponential backoff.
	 */
	if (&plat_lock_delay) {
		backoff = 0;
	} else {
		backoff = BACKOFF_BASE;
	}
	do {
		/* Spin at old_pil rather than at the raised level. */
		splx(old_pil);
		while (LOCK_HELD(lp)) {
			if (panicstr) {
				/*
				 * Give up on the lock, but still raise to
				 * new_pil and report the previous level.
				 */
				*old_pil_addr = (ushort_t)splr(new_pil);
				return;
			}
			spin_count++;
			/*
			 * Add an exponential backoff delay before trying again
			 * to touch the mutex data structure.
			 * spin_count test and call to nulldev are to prevent
			 * compiler optimizer from eliminating the delay loop.
			 */
			if (&plat_lock_delay) {
				plat_lock_delay(&backoff);
			} else {
				for (backctr = backoff; backctr; backctr--) {
					if (!spin_count) (void) nulldev();
				}
				backoff = backoff << 1;		/* double it */
				if (backoff > BACKOFF_CAP) {
					backoff = BACKOFF_CAP;
				}

				SMT_PAUSE();
			}
		}
		/* Lock looks free: raise to new_pil before trying for it. */
		old_pil = splr(new_pil);
	} while (!lock_spin_try(lp));

	*old_pil_addr = (ushort_t)old_pil;

	/* spin_count starts at 1 and only grows, so this normally fires. */
	if (spin_count) {
		LOCKSTAT_RECORD(LS_LOCK_SET_SPL_SPIN, lp, spin_count);
	}

	LOCKSTAT_RECORD(LS_LOCK_SET_SPL_ACQUIRE, lp, spin_count);
}
/*
 * Wakeup-check callback: briefly opens an interrupt window so that any
 * pending interrupt can be delivered.  'arg' is not used.
 */
/*ARGSUSED*/
static void
acpi_cpu_check_wakeup(void *arg)
{
	/*
	 * Toggle interrupt flag to detect pending interrupts.
	 * If interrupt happened, do_interrupt() will notify CPU idle
	 * notification framework so no need to call cpu_idle_exit() here.
	 */
	sti();
	/* The window is a single pause between sti() and cli(). */
	SMT_PAUSE();
	cli();
}
Example #5
0
/*
 * Mark this CPU as paused by storing PAUSE_WAIT through 'safe', then spin
 * until the value becomes PAUSE_IDLE (start_cpus() wakes us up).  While
 * spinning, a request to enter the safe phase is honoured by calling
 * enter_safe_phase(); a later start_cpus() then ends the wait.
 */
void
mach_cpu_pause(volatile char *safe)
{
	*safe = PAUSE_WAIT;
	membar_enter();

	for (;;) {
		if (*safe == PAUSE_IDLE)
			break;

		/* Asked to park for power-off or suspend? */
		if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE)
			enter_safe_phase();

		SMT_PAUSE();
	}
}
/*
 * When we apply priority inheritance, we must grab the owner's thread lock
 * while already holding the waiter's thread lock.  If both thread locks are
 * turnstile locks, this can lead to deadlock: while we hold L1 and try to
 * grab L2, some unrelated thread may be applying priority inheritance to
 * some other blocking chain, holding L2 and trying to grab L1.  The most
 * obvious solution -- do a lock_try() for the owner lock -- isn't quite
 * sufficient because it can cause livelock: each thread may hold one lock,
 * try to grab the other, fail, bail out, and try again, looping forever.
 * To prevent livelock we must define a winner, i.e. define an arbitrary
 * lock ordering on the turnstile locks.  For simplicity we declare that
 * virtual address order defines lock order, i.e. if L1 < L2, then the
 * correct lock ordering is L1, L2.  Thus the thread that holds L1 and
 * wants L2 should spin until L2 is available, but the thread that holds
 * L2 and can't get L1 on the first try must drop L2 and return failure.
 * Moreover, the losing thread must not reacquire L2 until the winning
 * thread has had a chance to grab it; to ensure this, the losing thread
 * must grab L1 after dropping L2, thus spinning until the winner is done.
 * Complicating matters further, note that the owner's thread lock pointer
 * can change (i.e. be pointed at a different lock) while we're trying to
 * grab it.  If that happens, we must unwind our state and try again.
 *
 * On success, returns 1 with both locks held.
 * On failure, returns 0 with neither lock held.
 */
static int
turnstile_interlock(lock_t *wlp, lock_t *volatile *olpp)
{
	/*
	 * wlp:  the waiter's thread lock; must already be held.
	 * olpp: address of the owner's thread lock pointer; it is re-read
	 *       on every pass because it can be repointed under us.
	 */
	ASSERT(LOCK_HELD(wlp));

	for (;;) {
		/* Snapshot the owner's lock pointer for this attempt. */
		volatile lock_t *olp = *olpp;

		/*
		 * If the locks are identical, there's nothing to do.
		 */
		if (olp == wlp)
			return (1);
		if (lock_try((lock_t *)olp)) {
			/*
			 * If 'olp' is still the right lock, return success.
			 * Otherwise, drop 'olp' and try the dance again.
			 */
			if (olp == *olpp)
				return (1);
			lock_clear((lock_t *)olp);
		} else {
			hrtime_t spin_time = 0;
			/*
			 * If we're grabbing the locks out of order, we lose.
			 * Drop the waiter's lock, and then grab and release
			 * the owner's lock to ensure that we won't retry
			 * until the winner is done (as described above).
			 *
			 * The ordering rule is applied only when 'olp' is
			 * at or above the start of turnstile_table.
			 */
			if (olp >= (lock_t *)turnstile_table && olp < wlp) {
				lock_clear(wlp);
				lock_set((lock_t *)olp);
				lock_clear((lock_t *)olp);
				return (0);
			}
			/*
			 * We're grabbing the locks in the right order,
			 * so spin until the owner's lock either becomes
			 * available or spontaneously changes.
			 */
			spin_time =
			    LOCKSTAT_START_TIME(LS_TURNSTILE_INTERLOCK_SPIN);
			while (olp == *olpp && LOCK_HELD(olp)) {
				/*
				 * Once the system is panicking, stop waiting
				 * and report success without the owner lock.
				 */
				if (panicstr)
					return (1);
				SMT_PAUSE();
			}
			LOCKSTAT_RECORD_TIME(LS_TURNSTILE_INTERLOCK_SPIN,
			    olp, spin_time);
			/* Fall out to the top of the loop and retry. */
		}
	}
}
Example #7
0
/*
 * Simple C support for the cases where spin locks miss on the first try.
 *
 * Spins, with backoff, until 'lp' is seen free and lock_spin_try()
 * succeeds.  Returns without the lock if panicstr is set on entry or
 * while spinning.  Panics if the lock is contended with only one CPU.
 */
void
lock_set_spin(lock_t *lp)
{
	int spin_count = 1;
	int backoff;	/* current backoff */
	int backctr;	/* ctr for backoff */

	/* Do not spin for locks once the system is panicking. */
	if (panicstr)
		return;

	/* With a single CPU nobody else could ever release the lock. */
	if (ncpus == 1)
		panic("lock_set: %p lock held and only one CPU", lp);

	/*
	 * If plat_lock_delay has an address (presumably a weak symbol that
	 * a platform may or may not provide), it owns the backoff state and
	 * starts from 0; otherwise use the generic exponential backoff.
	 */
	if (&plat_lock_delay) {
		backoff = 0;
	} else {
		backoff = BACKOFF_BASE;
	}

	/* Only attempt the acquisition when the lock is seen free. */
	while (LOCK_HELD(lp) || !lock_spin_try(lp)) {
		if (panicstr)
			return;
		spin_count++;
		/*
		 * Add an exponential backoff delay before trying again
		 * to touch the mutex data structure.
		 * the spin_count test and call to nulldev are to prevent
		 * the compiler optimizer from eliminating the delay loop.
		 */
		if (&plat_lock_delay) {
			plat_lock_delay(&backoff);
		} else {
			/* delay */
			for (backctr = backoff; backctr; backctr--) {
				if (!spin_count) (void) nulldev();
			}

			backoff = backoff << 1;		/* double it */
			if (backoff > BACKOFF_CAP) {
				backoff = BACKOFF_CAP;
			}
			SMT_PAUSE();
		}
	}

	/* spin_count starts at 1 and only grows, so this normally fires. */
	if (spin_count) {
		LOCKSTAT_RECORD(LS_LOCK_SET_SPIN, lp, spin_count);
	}

	LOCKSTAT_RECORD0(LS_LOCK_SET_ACQUIRE, lp);
}
Example #8
0
/*
 * Pause the other CPUs, then keep requesting the safe phase until no other
 * CPU is left in CPU_PHASE_NONE or CPU_PHASE_WAIT_SAFE.  The caller must
 * hold cpu_lock.
 */
void
mp_enter_barrier(void)
{
	hrtime_t last_poke_time = 0;
	int poke_allowed;
	int done;
	int i;

	ASSERT(MUTEX_HELD(&cpu_lock));

	pause_cpus(NULL);

	do {
		/* Assume everybody is ready until a straggler is found. */
		done = 1;

		/*
		 * CPUs already asked to enter the safe phase are only
		 * re-poked once POKE_TIMEOUT has elapsed since the last
		 * round of pokes.
		 */
		poke_allowed =
		    (xpv_gethrtime() - last_poke_time > POKE_TIMEOUT) ? 1 : 0;
		if (poke_allowed)
			last_poke_time = xpv_gethrtime();

		for (i = 0; i < NCPU; i++) {
			cpu_t *cp = cpu_get(i);

			/* Consider only CPUs that exist and are not us. */
			if (cp != NULL && cp != CPU) {
				switch (cpu_phase[i]) {
				case CPU_PHASE_WAIT_SAFE:
					/* Already asked; maybe nudge again. */
					if (poke_allowed)
						poke_cpu(i);
					done = 0;
					break;

				case CPU_PHASE_NONE:
					/* First request for this CPU. */
					cpu_phase[i] = CPU_PHASE_WAIT_SAFE;
					poke_cpu(i);
					done = 0;
					break;

				case CPU_PHASE_SAFE:
				case CPU_PHASE_POWERED_OFF:
					break;
				}
			}
		}

		SMT_PAUSE();
	} while (!done);
}
Example #9
0
/*
 * Park this CPU at a point where it can be safely powered-off or
 * suspended.  Interrupts are cleared for the duration, the CPU's phase is
 * published as CPU_PHASE_SAFE, and the CPU spins until that phase value
 * changes.  Nothing can wake this CPU out of the loop.
 */
static void
enter_safe_phase(void)
{
	ulong_t flags;

	flags = intr_clear();

	/* The spin is entered only on the direct return from setjmp(). */
	if (setjmp(&curthread->t_pcb) == 0) {
		cpu_phase[CPU->cpu_id] = CPU_PHASE_SAFE;
		for (;;) {
			if (cpu_phase[CPU->cpu_id] != CPU_PHASE_SAFE)
				break;
			SMT_PAUSE();
		}
	}

	ASSERT(!interrupts_enabled());

	intr_restore(flags);
}
Example #10
0
/*
 * Park the calling CPU: publish PAUSE_WAIT through 'safe' and spin until
 * the value is changed to PAUSE_IDLE.
 */
void
mach_cpu_pause(volatile char *safe)
{
	/* Announce that this cpu is now safe. */
	*safe = PAUSE_WAIT;
	membar_enter(); /* make sure stores are flushed */

	/* Wait until we are allowed to continue. */
	for (;;) {
		if (*safe == PAUSE_IDLE)
			break;
		SMT_PAUSE();
	}
}
/*
 * Called by a CPU which has just been onlined.  It is expected that the CPU
 * performing the online operation will call tsc_sync_master().
 *
 * TSC sync is disabled in the context of virtualization. See comments
 * above tsc_sync_master.
 *
 * Each of the SYNC_ITERATIONS rounds is a handshake driven through
 * tsc_sync_go: this side posts TSC_SYNC_GO, samples the TSC until
 * master_tsc becomes non-zero, publishes the sample in slave_tsc, posts
 * TSC_SYNC_DONE and then waits for TSC_SYNC_STOP.
 */
void
tsc_sync_slave(void)
{
	ulong_t flags;
	hrtime_t s1;
	tsc_sync_t *tsc = tscp;
	int cnt;
	int hwtype;

	/* Nothing to do if sync is not needed or we run on Xen HVM/VMware. */
	hwtype = get_hwenv();
	if (!tsc_master_slave_sync_needed || hwtype == HW_XEN_HVM ||
	    hwtype == HW_VMWARE)
		return;

	/* Saved here, restored by restore_int_flag() on the way out. */
	flags = clear_int_flag();

	for (cnt = 0; cnt < SYNC_ITERATIONS; cnt++) {
		/* Re-fill the cache line */
		s1 = tsc->master_tsc;
		membar_enter();
		/* Tell the master this side is ready for the round. */
		tsc_sync_go = TSC_SYNC_GO;
		do {
			/*
			 * Do not put an SMT_PAUSE here. For instance,
			 * if the master and slave are really the same
			 * hyper-threaded CPU, then you want the master
			 * to yield to the slave as quickly as possible here,
			 * but not the other way.
			 */
			s1 = tsc_read();
		} while (tsc->master_tsc == 0);
		/* s1 is the last TSC read before master_tsc was seen set. */
		tsc->slave_tsc = s1;
		membar_enter();
		tsc_sync_go = TSC_SYNC_DONE;

		/* Wait for the other side to finish with this round. */
		while (tsc_sync_go != TSC_SYNC_STOP)
			SMT_PAUSE();
	}

	restore_int_flag(flags);
}
Example #12
0
/*
 * Wakeup-check callback for the mwait/IPI idle path.  'arg' points at the
 * CPU's 32-bit mwait word.  If that word no longer holds MWAIT_WAKEUP_IPI
 * the CPU idle notification system is told that the CPU has left idle;
 * otherwise a brief interrupt window is opened instead.
 */
static void
acpi_cpu_mwait_ipi_check_wakeup(void *arg)
{
	volatile uint32_t *mwait_word;

	ASSERT(arg != NULL);
	mwait_word = (volatile uint32_t *)arg;

	if (*mwait_word == MWAIT_WAKEUP_IPI) {
		/*
		 * Toggle interrupt flag to detect pending interrupts.
		 * If interrupt happened, do_interrupt() will notify CPU idle
		 * notification framework so no need to call cpu_idle_exit()
		 * here.
		 */
		sti();
		SMT_PAUSE();
		cli();
		return;
	}

	/* CPU has been awakened, notify CPU idle notification system. */
	cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
}
Example #13
0
/*
 * mutex_vector_enter() is called from the assembly mutex_enter() routine
 * if the lock is held or is not of type MUTEX_ADAPTIVE.
 *
 * Spin mutexes are handed to lock_set_spl().  For adaptive mutexes the
 * caller spins (with backoff) while the owner is found running on some
 * CPU, and blocks on the mutex's turnstile otherwise.  Returns with the
 * mutex owned by curthread, except that it returns without the mutex once
 * panicstr is set.
 */
void
mutex_vector_enter(mutex_impl_t *lp)
{
	kthread_id_t	owner;
	hrtime_t	sleep_time = 0;	/* how long we slept */
	uint_t		spin_count = 0;	/* how many times we spun */
	cpu_t 		*cpup, *last_cpu;
	extern cpu_t	*cpu_list;
	turnstile_t	*ts;
	volatile mutex_impl_t *vlp = (volatile mutex_impl_t *)lp;
	int		backoff;	/* current backoff */
	int		backctr;	/* ctr for backoff */
	int		sleep_count = 0;

	ASSERT_STACK_ALIGNED();

	/* Spin mutex: acquire the embedded spin lock at m_minspl. */
	if (MUTEX_TYPE_SPIN(lp)) {
		lock_set_spl(&lp->m_spin.m_spinlock, lp->m_spin.m_minspl,
		    &lp->m_spin.m_oldspl);
		return;
	}

	/* Neither spin nor adaptive: the mutex is not valid. */
	if (!MUTEX_TYPE_ADAPTIVE(lp)) {
		mutex_panic("mutex_enter: bad mutex", lp);
		return;
	}

	/*
	 * Adaptive mutexes must not be acquired from above LOCK_LEVEL.
	 * We can migrate after loading CPU but before checking CPU_ON_INTR,
	 * so we must verify by disabling preemption and loading CPU again.
	 */
	cpup = CPU;
	if (CPU_ON_INTR(cpup) && !panicstr) {
		kpreempt_disable();
		if (CPU_ON_INTR(CPU))
			mutex_panic("mutex_enter: adaptive at high PIL", lp);
		kpreempt_enable();
	}

	CPU_STATS_ADDQ(cpup, sys, mutex_adenters, 1);

	/*
	 * If plat_lock_delay has an address (presumably a weak symbol that
	 * a platform may or may not provide), it owns the backoff state and
	 * starts from 0; otherwise use the generic exponential backoff.
	 */
	if (&plat_lock_delay) {
		backoff = 0;
	} else {
		backoff = BACKOFF_BASE;
	}

	for (;;) {
spin:
		/* Every pass, including the first, is counted and delayed. */
		spin_count++;
		/*
		 * Add an exponential backoff delay before trying again
		 * to touch the mutex data structure.
		 * the spin_count test and call to nulldev are to prevent
		 * the compiler optimizer from eliminating the delay loop.
		 */
		if (&plat_lock_delay) {
			plat_lock_delay(&backoff);
		} else {
			for (backctr = backoff; backctr; backctr--) {
				if (!spin_count) (void) nulldev();
			};    /* delay */
			backoff = backoff << 1;			/* double it */
			if (backoff > BACKOFF_CAP) {
				backoff = BACKOFF_CAP;
			}

			SMT_PAUSE();
		}

		/* Give up (without the mutex) once the system is panicking. */
		if (panicstr)
			return;

		/* No owner: try to take the mutex, retry the loop on a miss. */
		if ((owner = MUTEX_OWNER(vlp)) == NULL) {
			if (mutex_adaptive_tryenter(lp))
				break;
			continue;
		}

		if (owner == curthread)
			mutex_panic("recursive mutex_enter", lp);

		/*
		 * If lock is held but owner is not yet set, spin.
		 * (Only relevant for platforms that don't have cas.)
		 */
		if (owner == MUTEX_NO_OWNER)
			continue;

		/*
		 * When searching the other CPUs, start with the one where
		 * we last saw the owner thread.  If owner is running, spin.
		 *
		 * We must disable preemption at this point to guarantee
		 * that the list doesn't change while we traverse it
		 * without the cpu_lock mutex.  While preemption is
		 * disabled, we must revalidate our cached cpu pointer.
		 */
		kpreempt_disable();
		if (cpup->cpu_next == NULL)
			cpup = cpu_list;
		last_cpu = cpup;	/* mark end of search */
		do {
			if (cpup->cpu_thread == owner) {
				kpreempt_enable();
				goto spin;
			}
		} while ((cpup = cpup->cpu_next) != last_cpu);
		kpreempt_enable();

		/*
		 * The owner appears not to be running, so block.
		 * See the Big Theory Statement for memory ordering issues.
		 */
		ts = turnstile_lookup(lp);
		MUTEX_SET_WAITERS(lp);
		membar_enter();

		/*
		 * Recheck whether owner is running after waiters bit hits
		 * global visibility (above).  If owner is running, spin.
		 *
		 * Since we are at ipl DISP_LEVEL, kernel preemption is
		 * disabled, however we still need to revalidate our cached
		 * cpu pointer to make sure the cpu hasn't been deleted.
		 */
		if (cpup->cpu_next == NULL)
			last_cpu = cpup = cpu_list;
		do {
			if (cpup->cpu_thread == owner) {
				turnstile_exit(lp);
				goto spin;
			}
		} while ((cpup = cpup->cpu_next) != last_cpu);
		membar_consumer();

		/*
		 * If owner and waiters bit are unchanged, block.
		 */
		if (MUTEX_OWNER(vlp) == owner && MUTEX_HAS_WAITERS(vlp)) {
			/* Accumulate elapsed time across turnstile_block(). */
			sleep_time -= gethrtime();
			(void) turnstile_block(ts, TS_WRITER_Q, lp,
			    &mutex_sobj_ops, NULL, NULL);
			sleep_time += gethrtime();
			sleep_count++;
		} else {
			/* State changed under us: release and retry. */
			turnstile_exit(lp);
		}
	}

	ASSERT(MUTEX_OWNER(lp) == curthread);

	if (sleep_time != 0) {
		/*
		 * Note, sleep time is the sum of all the sleeping we
		 * did.
		 */
		LOCKSTAT_RECORD(LS_MUTEX_ENTER_BLOCK, lp, sleep_time);
	}

	/*
	 * We do not count a sleep as a spin.
	 */
	if (spin_count > sleep_count)
		LOCKSTAT_RECORD(LS_MUTEX_ENTER_SPIN, lp,
		    spin_count - sleep_count);

	LOCKSTAT_RECORD0(LS_MUTEX_ENTER_ACQUIRE, lp);
}
Example #14
0
/*
 * Push out a priority cross call.
 *
 * First gives every ready CPU a bounded amount of time to consume any
 * earlier priority request (forcibly retiring requests that are still
 * pending afterwards), then publishes func/arg1..arg3 in xc_priority_data
 * and signals each ready CPU in 'set' other than the caller.
 */
static void
xc_priority_common(
	xc_func_t func,
	xc_arg_t arg1,
	xc_arg_t arg2,
	xc_arg_t arg3,
	ulong_t *set)
{
	struct cpu *target;
	int spins;
	int id;

	/*
	 * Wait briefly for any previous xc_priority to have finished.
	 */
	for (id = 0; id < max_ncpus; ++id) {
		target = cpu[id];
		if (target == NULL)
			continue;
		if ((target->cpu_flags & CPU_READY) == 0)
			continue;

		/*
		 * The value of 40000 here is from old kernel code. It
		 * really should be changed to some time based value, since
		 * under a hypervisor, there's no guarantee a remote CPU
		 * is even scheduled.
		 */
		spins = 0;
		while (BT_TEST(xc_priority_set, id) && spins < 40000) {
			SMT_PAUSE();
			++spins;
		}

		if (!BT_TEST(xc_priority_set, id))
			continue;

		/*
		 * Some CPU did not respond to a previous priority request.
		 * It's probably deadlocked with interrupts blocked or some
		 * such problem. We'll just erase the previous request -
		 * which was most likely a kmdb_enter that has already
		 * expired - and plow ahead.
		 */
		XC_BT_CLEAR(xc_priority_set, id);
		if (target->cpu_m.xc_work_cnt > 0)
			xc_decrement(&target->cpu_m);
	}

	/*
	 * fill in cross call data
	 */
	xc_priority_data.xc_func = func;
	xc_priority_data.xc_a1 = arg1;
	xc_priority_data.xc_a2 = arg2;
	xc_priority_data.xc_a3 = arg3;

	/*
	 * Post messages to all CPUs involved that are CPU_READY
	 * We'll always IPI, plus bang on the xc_msgbox for i86_mwait()
	 */
	for (id = 0; id < max_ncpus; ++id) {
		if (!BT_TEST(set, id))
			continue;

		target = cpu[id];
		if (target == NULL)
			continue;
		if ((target->cpu_flags & CPU_READY) == 0)
			continue;
		if (target == CPU)
			continue;

		(void) xc_increment(&target->cpu_m);
		XC_BT_SET(xc_priority_set, id);
		send_dirint(id, XC_HI_PIL);

		/* Swap xc_msgbox with its own value, ten times over. */
		for (spins = 0; spins < 10; ++spins) {
			(void) casptr(&target->cpu_m.xc_msgbox,
			    target->cpu_m.xc_msgbox, target->cpu_m.xc_msgbox);
		}
	}
}
Example #15
0
/*
 * Cross-call service routine for the current CPU.
 *
 * While this CPU's xc_work_cnt is non-zero, messages are pulled from its
 * xc_msgbox and processed according to their xc_command.  While waiting
 * for a message, a pending priority request for this CPU (its bit in
 * xc_priority_set) is always serviced first.
 *
 * arg1 and arg2 are not used.  Returns DDI_INTR_CLAIMED if there was any
 * work outstanding on entry to the loop, DDI_INTR_UNCLAIMED otherwise.
 */
/*ARGSUSED*/
uint_t
xc_serv(caddr_t arg1, caddr_t arg2)
{
	struct machcpu *mcpup = &(CPU->cpu_m);
	xc_msg_t *msg;
	xc_data_t *data;
	xc_msg_t *xc_waiters = NULL;	/* WAITING messages collected so far */
	uint32_t num_waiting = 0;	/* number of entries on xc_waiters */
	xc_func_t func;
	xc_arg_t a1;
	xc_arg_t a2;
	xc_arg_t a3;
	uint_t rc = DDI_INTR_UNCLAIMED;

	while (mcpup->xc_work_cnt != 0) {
		rc = DDI_INTR_CLAIMED;

		/*
		 * We may have to wait for a message to arrive.
		 */
		for (msg = NULL; msg == NULL;
		    msg = xc_extract(&mcpup->xc_msgbox)) {

			/*
			 * Always check for and handle a priority message.
			 * The function and arguments are copied out before
			 * the request bit is cleared.
			 */
			if (BT_TEST(xc_priority_set, CPU->cpu_id)) {
				func = xc_priority_data.xc_func;
				a1 = xc_priority_data.xc_a1;
				a2 = xc_priority_data.xc_a2;
				a3 = xc_priority_data.xc_a3;
				XC_BT_CLEAR(xc_priority_set, CPU->cpu_id);
				xc_decrement(mcpup);
				func(a1, a2, a3);
				/* That may have been the only work pending. */
				if (mcpup->xc_work_cnt == 0)
					return (rc);
			}

			/*
			 * wait for a message to arrive
			 */
			SMT_PAUSE();
		}


		/*
		 * process the message
		 */
		switch (msg->xc_command) {

		/*
		 * ASYNC gives back the message immediately, then we do the
		 * function and return with no more waiting.  The call data
		 * is copied out before the message is handed back.
		 */
		case XC_MSG_ASYNC:
			data = &cpu[msg->xc_master]->cpu_m.xc_data;
			func = data->xc_func;
			a1 = data->xc_a1;
			a2 = data->xc_a2;
			a3 = data->xc_a3;
			msg->xc_command = XC_MSG_DONE;
			xc_insert(&cpu[msg->xc_master]->cpu_m.xc_msgbox, msg);
			if (func != NULL)
				(void) (*func)(a1, a2, a3);
			xc_decrement(mcpup);
			break;

		/*
		 * SYNC messages do the call, then send it back to the master
		 * in WAITING mode
		 */
		case XC_MSG_SYNC:
			data = &cpu[msg->xc_master]->cpu_m.xc_data;
			if (data->xc_func != NULL)
				(void) (*data->xc_func)(data->xc_a1,
				    data->xc_a2, data->xc_a3);
			msg->xc_command = XC_MSG_WAITING;
			xc_insert(&cpu[msg->xc_master]->cpu_m.xc_msgbox, msg);
			break;

		/*
		 * WAITING messages are collected by the master until all
		 * have arrived. Once all arrive, we release them back to
		 * the slaves
		 */
		case XC_MSG_WAITING:
			xc_insert(&xc_waiters, msg);
			if (++num_waiting < mcpup->xc_wait_cnt)
				break;
			while ((msg = xc_extract(&xc_waiters)) != NULL) {
				msg->xc_command = XC_MSG_RELEASED;
				xc_insert(&cpu[msg->xc_slave]->cpu_m.xc_msgbox,
				    msg);
				--num_waiting;
			}
			/* The list and the counter must drain together. */
			if (num_waiting != 0)
				panic("wrong number waiting");
			mcpup->xc_wait_cnt = 0;
			break;

		/*
		 * CALL messages do the function and then, like RELEASED,
		 * send the message back to the master as DONE.
		 */
		case XC_MSG_CALL:
			data = &cpu[msg->xc_master]->cpu_m.xc_data;
			if (data->xc_func != NULL)
				(void) (*data->xc_func)(data->xc_a1,
				    data->xc_a2, data->xc_a3);
			/*FALLTHROUGH*/
		case XC_MSG_RELEASED:
			msg->xc_command = XC_MSG_DONE;
			xc_insert(&cpu[msg->xc_master]->cpu_m.xc_msgbox, msg);
			xc_decrement(mcpup);
			break;

		/*
		 * DONE means a slave has completely finished up.
		 * Once we collect all the DONE messages, we'll exit
		 * processing too.
		 */
		case XC_MSG_DONE:
			msg->xc_command = XC_MSG_FREE;
			xc_insert(&mcpup->xc_free, msg);
			xc_decrement(mcpup);
			break;

		/* A FREE message should never be found in a message box. */
		case XC_MSG_FREE:
			panic("free message 0x%p in msgbox", (void *)msg);
			break;

		default:
			panic("bad message 0x%p in msgbox", (void *)msg);
			break;
		}
	}
	return (rc);
}
/*
 * Called by the master in the TSC sync operation (usually the boot CPU).
 * If the slave is discovered to have a skew, gethrtimef will be changed to
 * point to tsc_gethrtime_delta(). Calculating skews is precise only when
 * the master and slave TSCs are read simultaneously; however, there is no
 * algorithm that can read both CPUs in perfect simultaneity. The proposed
 * algorithm is an approximate method based on the behaviour of cache
 * management. The slave CPU continuously reads TSC and then reads a global
 * variable which the master CPU updates. The moment the master's update reaches
 * the slave's visibility (being forced by an mfence operation) we use the TSC
 * reading taken on the slave. A corresponding TSC read will be taken on the
 * master as soon as possible after finishing the mfence operation. But the
 * delay between causing the slave to notice the invalid cache line and the
 * completion of mfence is not repeatable. This error is heuristically assumed
 * to be 1/4th of the total write time as being measured by the two TSC reads
 * on the master sandwiching the mfence. Furthermore, due to the nature of
 * bus arbitration, contention on memory bus, etc., the time taken for the write
 * to reflect globally can vary a lot. So instead of taking a single reading,
 * a set of readings are taken and the one with least write time is chosen
 * to calculate the final skew.
 *
 * TSC sync is disabled in the context of virtualization because the CPUs
 * assigned to the guest are virtual CPUs which means the real CPUs on which
 * guest runs keep changing during life time of guest OS. So we would end up
 * calculating TSC skews for a set of CPUs during boot whereas the guest
 * might migrate to a different set of physical CPUs at a later point of
 * time.
 */
void
tsc_sync_master(processorid_t slave)
{
	ulong_t flags, source, min_write_time = ~0UL;
	hrtime_t write_time, x, mtsc_after;
	/*
	 * Delta chosen from the best sample so far.  It is read after the
	 * loop, so start from a defined value rather than relying on the
	 * conditional assignment inside the loop having run.
	 */
	hrtime_t tdelta = 0;
	tsc_sync_t *tsc = tscp;
	int cnt;
	int hwtype;

	/* Nothing to do if sync is not needed or we run on Xen HVM/VMware. */
	hwtype = get_hwenv();
	if (!tsc_master_slave_sync_needed || hwtype == HW_XEN_HVM ||
	    hwtype == HW_VMWARE)
		return;

	/* Saved here, restored by restore_int_flag() on the way out. */
	flags = clear_int_flag();
	source = CPU->cpu_id;

	for (cnt = 0; cnt < SYNC_ITERATIONS; cnt++) {
		/* Wait for the slave to announce it is ready. */
		while (tsc_sync_go != TSC_SYNC_GO)
			SMT_PAUSE();

		/*
		 * Publish our TSC, force it out, and read the TSC again;
		 * the difference is the measured write time.
		 */
		tsc->master_tsc = tsc_read();
		membar_enter();
		mtsc_after = tsc_read();
		/* Wait for the slave to publish its own reading. */
		while (tsc_sync_go != TSC_SYNC_DONE)
			SMT_PAUSE();
		write_time =  mtsc_after - tsc->master_tsc;
		/* Keep only the sample with the least write time so far. */
		if (write_time <= min_write_time) {
			min_write_time = write_time;
			/*
			 * Apply heuristic adjustment only if the calculated
			 * delta is > 1/4th of the write time.
			 */
			x = tsc->slave_tsc - mtsc_after;
			if (x < 0)
				x = -x;
			if (x > (min_write_time/4))
				/*
				 * Subtract 1/4th of the measured write time
				 * from the master's TSC value, as an estimate
				 * of how late the mfence completion came
				 * after the slave noticed the cache line
				 * change.
				 */
				tdelta = tsc->slave_tsc -
				    (mtsc_after - (min_write_time/4));
			else
				tdelta = tsc->slave_tsc - mtsc_after;
			tsc_sync_tick_delta[slave] =
			    tsc_sync_tick_delta[source] - tdelta;
		}

		/* Reset the shared state and release the slave. */
		tsc->master_tsc = tsc->slave_tsc = write_time = 0;
		membar_enter();
		tsc_sync_go = TSC_SYNC_STOP;
	}
	/* Fold the magnitude of the chosen delta into the global extremes. */
	if (tdelta < 0)
		tdelta = -tdelta;
	if (tdelta > largest_tsc_delta)
		largest_tsc_delta = tdelta;
	if (min_write_time < shortest_write_time)
		shortest_write_time = min_write_time;
	/*
	 * Enable delta variants of tsc functions if the largest of all chosen
	 * deltas is > smallest of the write time.
	 */
	if (largest_tsc_delta > shortest_write_time) {
		gethrtimef = tsc_gethrtime_delta;
		gethrtimeunscaledf = tsc_gethrtimeunscaled_delta;
	}
	restore_int_flag(flags);
}
/*
 * Put the current thread to sleep on the synchronization object 'sobj'
 * and will its priority to every thread along the blocking chain.
 *
 *	ts		turnstile already associated with sobj, or NULL if
 *			this is the first thread to block on it (the
 *			thread's own t_ts is then put on the hash chain)
 *	qnum		index of the turnstile sleep queue to insert into
 *	sobj		the synchronization object being blocked on
 *	sobj_ops	operations vector for sobj
 *	mp		non-NULL exactly when sobj_ops is of type
 *			SOBJ_USER_PI (asserted); dropped before sleeping
 *	lwptp		lwp timer, dereferenced only on SOBJ_USER_PI paths
 *
 * The caller must hold the turnstile chain lock for sobj (asserted).
 *
 * Returns 0 for kernel objects.  For SOBJ_USER_PI objects returns 0,
 * EDEADLK (cycle in the blocking chain), EINTR or ETIME.
 */
int
turnstile_block(turnstile_t *ts, int qnum, void *sobj, sobj_ops_t *sobj_ops,
    kmutex_t *mp, lwp_timer_t *lwptp)
{
	kthread_t *owner;
	kthread_t *t = curthread;
	proc_t *p = ttoproc(t);
	klwp_t *lwp = ttolwp(t);
	turnstile_chain_t *tc = &TURNSTILE_CHAIN(sobj);
	int error = 0;
	int loser = 0;	/* set while we hold turnstile_loser_lock */

	ASSERT(DISP_LOCK_HELD(&tc->tc_lock));
	ASSERT(mp == NULL || IS_UPI(mp));
	ASSERT((SOBJ_TYPE(sobj_ops) == SOBJ_USER_PI) ^ (mp == NULL));

	thread_lock_high(t);

	if (ts == NULL) {
		/*
		 * This is the first thread to block on this sobj.
		 * Take its attached turnstile and add it to the hash chain.
		 */
		ts = t->t_ts;
		ts->ts_sobj = sobj;
		ts->ts_next = tc->tc_first;
		tc->tc_first = ts;
		ASSERT(ts->ts_waiters == 0);
	} else {
		/*
		 * Another thread has already donated its turnstile
		 * to block on this sobj, so ours isn't needed.
		 * Stash it on the active turnstile's freelist.
		 */
		turnstile_t *myts = t->t_ts;
		myts->ts_free = ts->ts_free;
		ts->ts_free = myts;
		t->t_ts = ts;
		ASSERT(ts->ts_sobj == sobj);
		ASSERT(ts->ts_waiters > 0);
	}

	/*
	 * Put the thread to sleep.
	 */
	ASSERT(t != CPU->cpu_idle_thread);
	ASSERT(CPU_ON_INTR(CPU) == 0);
	ASSERT(t->t_wchan0 == NULL && t->t_wchan == NULL);
	ASSERT(t->t_state == TS_ONPROC);

	/* Only sleeps on user-level PI objects are marked T_WAKEABLE. */
	if (SOBJ_TYPE(sobj_ops) == SOBJ_USER_PI) {
		curthread->t_flag |= T_WAKEABLE;
	}
	CL_SLEEP(t);		/* assign kernel priority */
	THREAD_SLEEP(t, &tc->tc_lock);
	t->t_wchan = sobj;
	t->t_sobj_ops = sobj_ops;
	DTRACE_SCHED(sleep);

	if (lwp != NULL) {
		lwp->lwp_ru.nvcsw++;
		(void) new_mstate(t, LMS_SLEEP);
		if (SOBJ_TYPE(sobj_ops) == SOBJ_USER_PI) {
			lwp->lwp_asleep = 1;
			lwp->lwp_sysabort = 0;
			/*
			 * make wchan0 non-zero to conform to the rule that
			 * threads blocking for user-level objects have a
			 * non-zero wchan0: this prevents spurious wake-ups
			 * by, for example, /proc.
			 */
			t->t_wchan0 = (caddr_t)1;
		}
	}
	ts->ts_waiters++;
	sleepq_insert(&ts->ts_sleepq[qnum], t);

	/* Blocking on a mutex that nobody owns is a fatal inconsistency. */
	if (SOBJ_TYPE(sobj_ops) == SOBJ_MUTEX &&
	    SOBJ_OWNER(sobj_ops, sobj) == NULL)
		panic("turnstile_block(%p): unowned mutex", (void *)ts);

	/*
	 * Follow the blocking chain to its end, willing our priority to
	 * everyone who's in our way.
	 */
	while (t->t_sobj_ops != NULL &&
	    (owner = SOBJ_OWNER(t->t_sobj_ops, t->t_wchan)) != NULL) {
		if (owner == curthread) {
			/* For kernel objects a cycle is always fatal. */
			if (SOBJ_TYPE(sobj_ops) != SOBJ_USER_PI) {
				panic("Deadlock: cycle in blocking chain");
			}
			/*
			 * If the cycle we've encountered ends in mp,
			 * then we know it isn't a 'real' cycle because
			 * we're going to drop mp before we go to sleep.
			 * Moreover, since we've come full circle we know
			 * that we must have willed priority to everyone
			 * in our way.  Therefore, we can break out now.
			 */
			if (t->t_wchan == (void *)mp)
				break;

			if (loser)
				lock_clear(&turnstile_loser_lock);
			/*
			 * For SOBJ_USER_PI, a cycle is an application
			 * deadlock which needs to be communicated
			 * back to the application.
			 */
			thread_unlock_nopreempt(t);
			mutex_exit(mp);
			setrun(curthread);
			swtch(); /* necessary to transition state */
			curthread->t_flag &= ~T_WAKEABLE;
			if (lwptp->lwpt_id != 0)
				(void) lwp_timer_dequeue(lwptp);
			setallwatch();
			lwp->lwp_asleep = 0;
			lwp->lwp_sysabort = 0;
			return (EDEADLK);
		}
		if (!turnstile_interlock(t->t_lockp, &owner->t_lockp)) {
			/*
			 * If we failed to grab the owner's thread lock,
			 * turnstile_interlock() will have dropped t's
			 * thread lock, so at this point we don't even know
			 * that 't' exists anymore.  The simplest solution
			 * is to restart the entire priority inheritance dance
			 * from the beginning of the blocking chain, since
			 * we *do* know that 'curthread' still exists.
			 * Application of priority inheritance is idempotent,
			 * so it's OK that we're doing it more than once.
			 * Note also that since we've dropped our thread lock,
			 * we may already have been woken up; if so, our
			 * t_sobj_ops will be NULL, the loop will terminate,
			 * and the call to swtch() will be a no-op.  Phew.
			 *
			 * There is one further complication: if two (or more)
			 * threads keep trying to grab the turnstile locks out
			 * of order and keep losing the race to another thread,
			 * these "dueling losers" can livelock the system.
			 * Therefore, once we get into this rare situation,
			 * we serialize all the losers.
			 */
			if (loser == 0) {
				loser = 1;
				lock_set(&turnstile_loser_lock);
			}
			t = curthread;
			thread_lock_high(t);
			continue;
		}

		/*
		 * We now have the owner's thread lock.  If we are traversing
		 * from non-SOBJ_USER_PI ops to SOBJ_USER_PI ops, then we know
		 * that we have caught the thread while in the TS_SLEEP state,
		 * but holding mp.  We know that this situation is transient
		 * (mp will be dropped before the holder actually sleeps on
		 * the SOBJ_USER_PI sobj), so we will spin waiting for mp to
		 * be dropped.  Then, as in the turnstile_interlock() failure
		 * case, we will restart the priority inheritance dance.
		 */
		if (SOBJ_TYPE(t->t_sobj_ops) != SOBJ_USER_PI &&
		    owner->t_sobj_ops != NULL &&
		    SOBJ_TYPE(owner->t_sobj_ops) == SOBJ_USER_PI) {
			kmutex_t *upi_lock = (kmutex_t *)t->t_wchan;

			ASSERT(IS_UPI(upi_lock));
			ASSERT(SOBJ_TYPE(t->t_sobj_ops) == SOBJ_MUTEX);

			/* Release every lock we hold before spinning. */
			if (t->t_lockp != owner->t_lockp)
				thread_unlock_high(owner);
			thread_unlock_high(t);
			if (loser)
				lock_clear(&turnstile_loser_lock);

			while (mutex_owner(upi_lock) == owner) {
				SMT_PAUSE();
				continue;
			}

			/* Retake the loser lock (if held) and start over. */
			if (loser)
				lock_set(&turnstile_loser_lock);
			t = curthread;
			thread_lock_high(t);
			continue;
		}

		/* Will t's priority to the owner and step down the chain. */
		turnstile_pi_inherit(t->t_ts, owner, DISP_PRIO(t));
		if (t->t_lockp != owner->t_lockp)
			thread_unlock_high(t);
		t = owner;
	}

	if (loser)
		lock_clear(&turnstile_loser_lock);

	/*
	 * Note: 't' and 'curthread' were synonymous before the loop above,
	 * but now they may be different.  ('t' is now the last thread in
	 * the blocking chain.)
	 */
	if (SOBJ_TYPE(sobj_ops) == SOBJ_USER_PI) {
		ushort_t s = curthread->t_oldspl;
		int timedwait = 0;
		uint_t imm_timeout = 0;
		clock_t tim = -1;

		thread_unlock_high(t);
		if (lwptp->lwpt_id != 0) {
			/*
			 * We enqueued a timeout.  If it has already fired,
			 * lwptp->lwpt_imm_timeout has been set with cas,
			 * so fetch it with cas.
			 */
			timedwait = 1;
			imm_timeout =
			    atomic_cas_uint(&lwptp->lwpt_imm_timeout, 0, 0);
		}
		mutex_exit(mp);
		splx(s);

		/*
		 * If a signal is pending, the thread must return, or the
		 * timeout has already fired, make ourselves runnable before
		 * the call to swtch().
		 */
		if (ISSIG(curthread, JUSTLOOKING) ||
		    MUSTRETURN(p, curthread) || imm_timeout)
			setrun(curthread);
		swtch();
		curthread->t_flag &= ~T_WAKEABLE;
		if (timedwait)
			tim = lwp_timer_dequeue(lwptp);
		setallwatch();
		/* EINTR takes precedence over ETIME. */
		if (ISSIG(curthread, FORREAL) || lwp->lwp_sysabort ||
		    MUSTRETURN(p, curthread))
			error = EINTR;
		else if (imm_timeout || (timedwait && tim == -1))
			error = ETIME;
		lwp->lwp_sysabort = 0;
		lwp->lwp_asleep = 0;
	} else {
		thread_unlock_nopreempt(t);
		swtch();
	}

	return (error);
}