int
xb_read(void *data, unsigned len)
{
	volatile struct xenstore_domain_interface *intf =
	    xs_domain_interface(xb_addr);
	XENSTORE_RING_IDX cons, prod;
	extern int do_polled_io;

	while (len != 0) {
		unsigned int avail;
		const char *src;

		mutex_enter(&xb_wait_lock);
		while (intf->rsp_cons == intf->rsp_prod) {
			if (interrupts_unleashed && !do_polled_io) {
				if (cv_wait_sig(&xb_wait_cv,
				    &xb_wait_lock) == 0) {
					mutex_exit(&xb_wait_lock);
					return (EINTR);
				}
			} else { /* polled mode needed for early probes */
				(void) HYPERVISOR_yield();
			}
		}
		mutex_exit(&xb_wait_lock);

		/* Read indexes, then verify. */
		cons = intf->rsp_cons;
		prod = intf->rsp_prod;
		membar_enter();
		if (!check_indexes(cons, prod))
			return (EIO);

		src = get_input_chunk(cons, prod, (char *)intf->rsp, &avail);
		if (avail == 0)
			continue;
		if (avail > len)
			avail = len;

		/* We must read header before we read data. */
		membar_consumer();

		(void) memcpy(data, src, avail);
		data = (void *)((uintptr_t)data + avail);
		len -= avail;

		/* Other side must not see free space until we've copied out */
		membar_enter();
		intf->rsp_cons += avail;

		/* Implies mb(): they will see new header. */
		ec_notify_via_evtchn(xen_info->store_evtchn);
	}

	return (0);
}
int
pthread_rwlock_tryrdlock(pthread_rwlock_t *ptr)
{
	uintptr_t owner, next;

	if (__predict_false(__uselibcstub))
		return __libc_rwlock_tryrdlock_stub(ptr);

#ifdef ERRORCHECK
	if (ptr->ptr_magic != _PT_RWLOCK_MAGIC)
		return EINVAL;
#endif

	/*
	 * Don't get a readlock if there is a writer or if there are waiting
	 * writers; i.e. prefer writers to readers.  This strategy is
	 * dictated by SUSv3.
	 */
	for (owner = (uintptr_t)ptr->ptr_owner;; owner = next) {
		if ((owner & (RW_WRITE_LOCKED | RW_WRITE_WANTED)) != 0)
			return EBUSY;
		next = rw_cas(ptr, owner, owner + RW_READ_INCR);
		if (owner == next) {
			/* Got it! */
#ifndef PTHREAD__ATOMIC_IS_MEMBAR
			membar_enter();
#endif
			return 0;
		}
	}
}
static int
systrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
	switch (cmd) {
	case DDI_ATTACH:
		break;
	case DDI_RESUME:
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}

	systrace_probe = (void (*)())dtrace_probe;
	membar_enter();

	if (ddi_create_minor_node(devi, "systrace", S_IFCHR, 0,
	    DDI_PSEUDO, NULL) == DDI_FAILURE ||
	    dtrace_register("syscall", &systrace_attr, DTRACE_PRIV_USER, NULL,
	    &systrace_pops, NULL, &systrace_id) != 0) {
		systrace_probe = systrace_stub;
		ddi_remove_minor_node(devi, NULL);
		return (DDI_FAILURE);
	}

	ddi_report_dev(devi);
	systrace_devi = devi;

	return (DDI_SUCCESS);
}
int
mtx_enter_try(struct mutex *mtx)
{
	struct cpu_info *owner, *ci = curcpu();
	int s;

	if (mtx->mtx_wantipl != IPL_NONE)
		s = splraise(mtx->mtx_wantipl);

	owner = atomic_cas_ptr(&mtx->mtx_owner, NULL, ci);
#ifdef DIAGNOSTIC
	if (__predict_false(owner == ci))
		panic("mtx %p: locking against myself", mtx);
#endif
	if (owner == NULL) {
		if (mtx->mtx_wantipl != IPL_NONE)
			mtx->mtx_oldipl = s;
#ifdef DIAGNOSTIC
		ci->ci_mutex_level++;
#endif
		membar_enter();
		return (1);
	}

	if (mtx->mtx_wantipl != IPL_NONE)
		splx(s);

	return (0);
}
void
mcs_rwlock::downgrade()
{
    membar_exit();  // this is, for all intents and purposes, a release
    w_assert1(*&_holders == WRITER);
    *&_holders = READER;
    membar_enter(); // but it's also an acquire
}
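/*
 * The downgrade() above spells out the convention the snippets in this
 * listing rely on: membar_exit() acts as the release fence issued before a
 * lock word is published or cleared, and membar_enter() as the acquire fence
 * issued once the lock word has been observed as held.  Below is a minimal,
 * hedged sketch of that pairing as a stand-alone spinlock -- not taken from
 * any of the code above -- assuming an illumos/NetBSD-style <sys/atomic.h>
 * that provides atomic_cas_32(), membar_enter() and membar_exit().  The
 * demo_lock word and the helper names are invented for illustration only.
 */
#include <sys/types.h>
#include <sys/atomic.h>

static volatile uint32_t demo_lock;	/* 0 = free, 1 = held */

static void
demo_lock_enter(void)
{
	/* Spin until the CAS moves the lock word from free to held. */
	while (atomic_cas_32(&demo_lock, 0, 1) != 0)
		continue;
	/*
	 * Acquire side: keep the critical section's loads and stores from
	 * being performed before we own the lock word (cf. mtx_enter_try()
	 * and shmif_lockbus() in this listing).
	 */
	membar_enter();
}

static void
demo_lock_exit(void)
{
	/*
	 * Release side: make every store done inside the critical section
	 * visible before the lock word is seen as free again.
	 */
	membar_exit();
	demo_lock = 0;
}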
int
pthread_rwlock_trywrlock(pthread_rwlock_t *ptr)
{
	uintptr_t owner, next;
	pthread_t self;

	if (__predict_false(__uselibcstub))
		return __libc_rwlock_trywrlock_stub(ptr);

#ifdef ERRORCHECK
	if (ptr->ptr_magic != _PT_RWLOCK_MAGIC)
		return EINVAL;
#endif

	self = pthread__self();

	for (owner = (uintptr_t)ptr->ptr_owner;; owner = next) {
		if (owner != 0)
			return EBUSY;
		next = rw_cas(ptr, owner, (uintptr_t)self | RW_WRITE_LOCKED);
		if (owner == next) {
			/* Got it! */
#ifndef PTHREAD__ATOMIC_IS_MEMBAR
			membar_enter();
#endif
			return 0;
		}
	}
}
bool
mcs_rwlock::attempt_read()
{
    unsigned int old_value = *&_holders;
    if (old_value & WRITER ||
        old_value != atomic_cas_32(&_holders, old_value, old_value + READER))
        return false;

    membar_enter();
    return true;
}
/*
 * Called by a CPU which has just been onlined.  It is expected that the CPU
 * performing the online operation will call tsc_sync_master().
 *
 * TSC sync is disabled in the context of virtualization.  See comments
 * above tsc_sync_master.
 */
void
tsc_sync_slave(void)
{
	ulong_t flags;
	hrtime_t s1;
	tsc_sync_t *tsc = tscp;
	int cnt;
	int hwtype;

	hwtype = get_hwenv();
	if (!tsc_master_slave_sync_needed || hwtype == HW_XEN_HVM ||
	    hwtype == HW_VMWARE)
		return;

	flags = clear_int_flag();

	for (cnt = 0; cnt < SYNC_ITERATIONS; cnt++) {
		/* Re-fill the cache line */
		s1 = tsc->master_tsc;
		membar_enter();
		tsc_sync_go = TSC_SYNC_GO;
		do {
			/*
			 * Do not put an SMT_PAUSE here.  For instance,
			 * if the master and slave are really the same
			 * hyper-threaded CPU, then you want the master
			 * to yield to the slave as quickly as possible here,
			 * but not the other way.
			 */
			s1 = tsc_read();
		} while (tsc->master_tsc == 0);
		tsc->slave_tsc = s1;
		membar_enter();
		tsc_sync_go = TSC_SYNC_DONE;

		while (tsc_sync_go != TSC_SYNC_STOP)
			SMT_PAUSE();
	}

	restore_int_flag(flags);
}
/*
 * Spin until either start_cpus() wakes us up, or we get a request to
 * enter the safe phase (followed by a later start_cpus()).
 */
void
mach_cpu_pause(volatile char *safe)
{
	*safe = PAUSE_WAIT;
	membar_enter();

	while (*safe != PAUSE_IDLE) {
		if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE)
			enter_safe_phase();

		SMT_PAUSE();
	}
}
int
xb_write(const void *data, unsigned len)
{
	volatile struct xenstore_domain_interface *intf =
	    xs_domain_interface(xb_addr);
	XENSTORE_RING_IDX cons, prod;
	extern int do_polled_io;

	while (len != 0) {
		void *dst;
		unsigned int avail;

		mutex_enter(&xb_wait_lock);
		while ((intf->req_prod - intf->req_cons) ==
		    XENSTORE_RING_SIZE) {
			if (interrupts_unleashed && !do_polled_io) {
				if (cv_wait_sig(&xb_wait_cv,
				    &xb_wait_lock) == 0) {
					mutex_exit(&xb_wait_lock);
					return (EINTR);
				}
			} else { /* polled mode needed for early probes */
				(void) HYPERVISOR_yield();
			}
		}
		mutex_exit(&xb_wait_lock);

		/* Read indexes, then verify. */
		cons = intf->req_cons;
		prod = intf->req_prod;
		membar_enter();
		if (!check_indexes(cons, prod))
			return (EIO);

		dst = get_output_chunk(cons, prod, (char *)intf->req, &avail);
		if (avail == 0)
			continue;
		if (avail > len)
			avail = len;

		(void) memcpy(dst, data, avail);
		data = (void *)((uintptr_t)data + avail);
		len -= avail;

		/* Other side must not see new header until data is there. */
		membar_producer();
		intf->req_prod += avail;

		/* This implies mb() before other side sees interrupt. */
		ec_notify_via_evtchn(xen_info->store_evtchn);
	}

	return (0);
}
void
mach_cpu_pause(volatile char *safe)
{
	/*
	 * This cpu is now safe.
	 */
	*safe = PAUSE_WAIT;
	membar_enter();		/* make sure stores are flushed */

	/*
	 * Now we wait.  When we are allowed to continue, safe
	 * will be set to PAUSE_IDLE.
	 */
	while (*safe != PAUSE_IDLE)
		SMT_PAUSE();
}
/*
 * This locking needs work and will misbehave severely if:
 * 1) the backing memory has to be paged in
 * 2) some lockholder exits while holding the lock
 */
static void
shmif_lockbus(struct shmif_mem *busmem)
{
	int i = 0;

	while (__predict_false(atomic_cas_32(&busmem->shm_lock,
	    LOCK_UNLOCKED, LOCK_LOCKED) == LOCK_LOCKED)) {
		if (__predict_false(++i > LOCK_COOLDOWN)) {
			/* wait 1ms */
			rumpuser_clock_sleep(RUMPUSER_CLOCK_RELWALL,
			    0, 1000*1000);
			i = 0;
		}
		continue;
	}
	membar_enter();
}
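/*
 * shmif_lockbus() above only shows the acquire half of the bus lock.  Below
 * is a hedged sketch of the release half such an acquire-only spinlock needs;
 * it is not the driver's own unlock routine, just the general pattern,
 * assuming NetBSD's <sys/atomic.h> (atomic_swap_32(), membar_exit()) and
 * reusing struct shmif_mem and LOCK_UNLOCKED from the snippet above.
 */
static void
shmif_unlockbus_sketch(struct shmif_mem *busmem)
{

	/* Make all bus writes visible before the lock word is cleared. */
	membar_exit();
	(void) atomic_swap_32(&busmem->shm_lock, LOCK_UNLOCKED);
}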
void
mcs_rwlock::acquire_read()
{
    /* attempt to CAS first.  If no writers around, or no intervening
     * add'l readers, we're done
     */
    if (!attempt_read()) {
        /* There seem to be writers around, or other readers intervened in
         * our attempt_read() above.
         * Join the queue and wait for them to leave.
         */
        {
            CRITICAL_SECTION(cs, (parent_lock*) this);
            _add_when_writer_leaves(READER);
        }
        membar_enter();
    }
}
void
mcs_rwlock::acquire_write()
{
    /* always join the queue first.
     *
     * 1. We don't want to race with other writers.
     *
     * 2. We don't want to make readers deal with the gap between
     *    us updating _holders and actually acquiring the MCS lock.
     */
    CRITICAL_SECTION(cs, (parent_lock*) this);
    _add_when_writer_leaves(WRITER);
    w_assert1(has_writer()); // me!

    // now wait for existing readers to clear out
    if (has_reader())
        _spin_on_readers();

    // done!
    membar_enter();
}
void
occ_rwlock::acquire_read()
{
    int count = atomic_add_32_nv(&_active_count, READER);
    while (count & WRITER) {
        // block
        count = atomic_add_32_nv(&_active_count, -READER);
        {
            CRITICAL_SECTION(cs, _read_write_mutex);

            // nasty race: we could have fooled a writer into sleeping...
            if (count == WRITER)
                DO_PTHREAD(pthread_cond_signal(&_write_cond));

            while (*&_active_count & WRITER) {
                DO_PTHREAD(pthread_cond_wait(&_read_cond, &_read_write_mutex));
            }
        }
        count = atomic_add_32_nv(&_active_count, READER);
    }
    membar_enter();
}
bool
mcs_rwlock::_attempt_write(unsigned int expected)
{
    /* succeeds iff we are the only reader (if expected==READER)
     * or if there are no readers or writers (if expected==0).
     *
     * How do we know the only reader is us?
     * A: we rely on these facts: this is called with expected==READER only
     * from attempt_upgrade(), which is called from latch only in the case
     * in which we hold the latch in LATCH_SH mode and are requesting it in
     * LATCH_EX mode.
     *
     * If there is a writer waiting, we have to get in line like everyone
     * else.  No need for a membar because we already hold the latch.
     */
    ext_qnode me = QUEUE_EXT_QNODE_INITIALIZER;

    if (*&_holders != expected || !attempt(&me))
        return false;

    // at this point, we've called mcs_lock::attempt(&me), and
    // have acquired the parent/mcs lock.
    // The following line replaces our reader bit with a writer bit.
    bool result = (expected == atomic_cas_32(&_holders, expected, WRITER));
    release(me); // parent/mcs lock
    membar_enter();
    return result;
}
static int
cpupart_move_cpu(cpu_t *cp, cpupart_t *newpp, int forced)
{
	cpupart_t *oldpp;
	cpu_t *ncp, *newlist;
	kthread_t *t;
	int move_threads = 1;
	lgrp_id_t lgrpid;
	proc_t *p;
	int lgrp_diff_lpl;
	lpl_t *cpu_lpl;
	int ret;
	boolean_t unbind_all_threads = (forced != 0);

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(newpp != NULL);

	oldpp = cp->cpu_part;
	ASSERT(oldpp != NULL);
	ASSERT(oldpp->cp_ncpus > 0);

	if (newpp == oldpp) {
		/*
		 * Don't need to do anything.
		 */
		return (0);
	}

	cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_OUT);

	if (!disp_bound_partition(cp, 0)) {
		/*
		 * Don't need to move threads if there are no threads in
		 * the partition.  Note that threads can't enter the
		 * partition while we're holding cpu_lock.
		 */
		move_threads = 0;
	} else if (oldpp->cp_ncpus == 1) {
		/*
		 * The last CPU is removed from a partition which has threads
		 * running in it.  Some of these threads may be bound to this
		 * CPU.
		 *
		 * Attempt to unbind threads from the CPU and from the
		 * processor set.  Note that no threads should be bound to
		 * this CPU since cpupart_move_threads will refuse to move
		 * bound threads to other CPUs.
		 */
		(void) cpu_unbind(oldpp->cp_cpulist->cpu_id, B_FALSE);
		(void) cpupart_unbind_threads(oldpp, B_FALSE);

		if (!disp_bound_partition(cp, 0)) {
			/*
			 * No bound threads in this partition any more
			 */
			move_threads = 0;
		} else {
			/*
			 * There are still threads bound to the partition
			 */
			cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
			return (EBUSY);
		}
	}

	/*
	 * If forced flag is set unbind any threads from this CPU.
	 * Otherwise unbind soft-bound threads only.
	 */
	if ((ret = cpu_unbind(cp->cpu_id, unbind_all_threads)) != 0) {
		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
		return (ret);
	}

	/*
	 * Stop further threads weak binding to this cpu.
	 */
	cpu_inmotion = cp;
	membar_enter();

	/*
	 * Notify the Processor Groups subsystem that the CPU
	 * will be moving cpu partitions.  This is done before
	 * CPUs are paused to provide an opportunity for any
	 * needed memory allocations.
	 */
	pg_cpupart_out(cp, oldpp);
	pg_cpupart_in(cp, newpp);

again:
	if (move_threads) {
		int loop_count;
		/*
		 * Check for threads strong or weak bound to this CPU.
		 */
		for (loop_count = 0; disp_bound_threads(cp, 0); loop_count++) {
			if (loop_count >= 5) {
				cpu_state_change_notify(cp->cpu_id,
				    CPU_CPUPART_IN);
				pg_cpupart_out(cp, newpp);
				pg_cpupart_in(cp, oldpp);
				cpu_inmotion = NULL;
				return (EBUSY);	/* some threads still bound */
			}
			delay(1);
		}
	}

	/*
	 * Before we actually start changing data structures, notify
	 * the cyclic subsystem that we want to move this CPU out of its
	 * partition.
	 */
	if (!cyclic_move_out(cp)) {
		/*
		 * This CPU must be the last CPU in a processor set with
		 * a bound cyclic.
		 */
		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
		pg_cpupart_out(cp, newpp);
		pg_cpupart_in(cp, oldpp);
		cpu_inmotion = NULL;
		return (EBUSY);
	}

	pause_cpus(cp);

	if (move_threads) {
		/*
		 * The thread on cpu before the pause thread may have read
		 * cpu_inmotion before we raised the barrier above.  Check
		 * again.
		 */
		if (disp_bound_threads(cp, 1)) {
			start_cpus();
			goto again;
		}
	}

	/*
	 * Now that CPUs are paused, let the PG subsystem perform
	 * any necessary data structure updates.
	 */
	pg_cpupart_move(cp, oldpp, newpp);

	/* save this cpu's lgroup -- it'll be the same in the new partition */
	lgrpid = cp->cpu_lpl->lpl_lgrpid;
	cpu_lpl = cp->cpu_lpl;

	/*
	 * let the lgroup framework know cp has left the partition
	 */
	lgrp_config(LGRP_CONFIG_CPUPART_DEL, (uintptr_t)cp, lgrpid);

	/* move out of old partition */
	oldpp->cp_ncpus--;
	if (oldpp->cp_ncpus > 0) {
		ncp = cp->cpu_prev_part->cpu_next_part = cp->cpu_next_part;
		cp->cpu_next_part->cpu_prev_part = cp->cpu_prev_part;
		if (oldpp->cp_cpulist == cp) {
			oldpp->cp_cpulist = ncp;
		}
	} else {
		ncp = oldpp->cp_cpulist = NULL;
		cp_numparts_nonempty--;
		ASSERT(cp_numparts_nonempty != 0);
	}
	oldpp->cp_gen++;

	/* move into new partition */
	newlist = newpp->cp_cpulist;
	if (newlist == NULL) {
		newpp->cp_cpulist = cp->cpu_next_part = cp->cpu_prev_part = cp;
		cp_numparts_nonempty++;
		ASSERT(cp_numparts_nonempty != 0);
	} else {
		cp->cpu_next_part = newlist;
		cp->cpu_prev_part = newlist->cpu_prev_part;
		newlist->cpu_prev_part->cpu_next_part = cp;
		newlist->cpu_prev_part = cp;
	}
	cp->cpu_part = newpp;
	newpp->cp_ncpus++;
	newpp->cp_gen++;

	ASSERT(bitset_is_null(&newpp->cp_haltset));
	ASSERT(bitset_is_null(&oldpp->cp_haltset));

	/*
	 * let the lgroup framework know cp has entered the partition
	 */
	lgrp_config(LGRP_CONFIG_CPUPART_ADD, (uintptr_t)cp, lgrpid);

	/*
	 * If necessary, move threads off processor.
	 */
	if (move_threads) {
		ASSERT(ncp != NULL);

		/*
		 * Walk thru the active process list to look for
		 * threads that need to have a new home lgroup,
		 * or the last CPU they run on is the same CPU
		 * being moved out of the partition.
		 */
		for (p = practive; p != NULL; p = p->p_next) {
			t = p->p_tlist;
			if (t == NULL)
				continue;
			lgrp_diff_lpl = 0;

			do {
				ASSERT(t->t_lpl != NULL);

				/*
				 * Update the count of how many threads are
				 * in this CPU's lgroup but have a different
				 * lpl.
				 */
				if (t->t_lpl != cpu_lpl &&
				    t->t_lpl->lpl_lgrpid == lgrpid)
					lgrp_diff_lpl++;
				/*
				 * If the lgroup that t is assigned to no
				 * longer has any CPUs in t's partition,
				 * we'll have to choose a new lgroup for t.
				 */
				if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
				    t->t_cpupart)) {
					lgrp_move_thread(t,
					    lgrp_choose(t, t->t_cpupart), 0);
				}

				/*
				 * make sure lpl points to our own partition
				 */
				ASSERT(t->t_lpl >= t->t_cpupart->cp_lgrploads &&
				    (t->t_lpl < t->t_cpupart->cp_lgrploads +
				    t->t_cpupart->cp_nlgrploads));

				ASSERT(t->t_lpl->lpl_ncpu > 0);

				/* Update CPU last ran on if it was this CPU */
				if (t->t_cpu == cp && t->t_cpupart == oldpp &&
				    t->t_bound_cpu != cp) {
					t->t_cpu = disp_lowpri_cpu(ncp,
					    t->t_lpl, t->t_pri, NULL);
				}
				t = t->t_forw;
			} while (t != p->p_tlist);

			/*
			 * Didn't find any threads in the same lgroup as this
			 * CPU with a different lpl, so remove the lgroup from
			 * the process lgroup bitmask.
			 */
			if (lgrp_diff_lpl)
				klgrpset_del(p->p_lgrpset, lgrpid);
		}

		/*
		 * Walk thread list looking for threads that need to be
		 * rehomed, since there are some threads that are not in
		 * their process's p_tlist.
		 */
		t = curthread;

		do {
			ASSERT(t != NULL && t->t_lpl != NULL);

			/*
			 * If the lgroup that t is assigned to no
			 * longer has any CPUs in t's partition,
			 * we'll have to choose a new lgroup for t.
			 * Also, choose best lgroup for home when
			 * thread has specified lgroup affinities,
			 * since there may be an lgroup with more
			 * affinity available after moving CPUs
			 * around.
			 */
			if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
			    t->t_cpupart) || t->t_lgrp_affinity) {
				lgrp_move_thread(t,
				    lgrp_choose(t, t->t_cpupart), 1);
			}

			/* make sure lpl points to our own partition */
			ASSERT((t->t_lpl >= t->t_cpupart->cp_lgrploads) &&
			    (t->t_lpl < t->t_cpupart->cp_lgrploads +
			    t->t_cpupart->cp_nlgrploads));

			ASSERT(t->t_lpl->lpl_ncpu > 0);

			/* Update CPU last ran on if it was this CPU */
			if (t->t_cpu == cp && t->t_cpupart == oldpp &&
			    t->t_bound_cpu != cp) {
				t->t_cpu = disp_lowpri_cpu(ncp, t->t_lpl,
				    t->t_pri, NULL);
			}

			t = t->t_next;
		} while (t != curthread);

		/*
		 * Clear off the CPU's run queue, and the kp queue if the
		 * partition is now empty.
		 */
		disp_cpu_inactive(cp);

		/*
		 * Make cp switch to a thread from the new partition.
		 */
		cp->cpu_runrun = 1;
		cp->cpu_kprunrun = 1;
	}

	cpu_inmotion = NULL;
	start_cpus();

	/*
	 * Let anyone interested know that cpu has been added to the set.
	 */
	cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);

	/*
	 * Now let the cyclic subsystem know that it can reshuffle cyclics
	 * bound to the new processor set.
	 */
	cyclic_move_in(cp);

	return (0);
}
/*
 * rw_vector_enter:
 *
 *	Acquire a rwlock.
 */
void
rw_vector_enter(krwlock_t *rw, const krw_t op)
{
	uintptr_t owner, incr, need_wait, set_wait, curthread, next;
	turnstile_t *ts;
	int queue;
	lwp_t *l;
	LOCKSTAT_TIMER(slptime);
	LOCKSTAT_TIMER(slpcnt);
	LOCKSTAT_TIMER(spintime);
	LOCKSTAT_COUNTER(spincnt);
	LOCKSTAT_FLAG(lsflag);

	l = curlwp;
	curthread = (uintptr_t)l;

	RW_ASSERT(rw, !cpu_intr_p());
	RW_ASSERT(rw, curthread != 0);
	RW_WANTLOCK(rw, op);

	if (panicstr == NULL) {
		LOCKDEBUG_BARRIER(&kernel_lock, 1);
	}

	/*
	 * We play a slight trick here.  If we're a reader, we want to
	 * increment the read count.  If we're a writer, we want to
	 * set the owner field and the WRITE_LOCKED bit.
	 *
	 * In the latter case, we expect those bits to be zero,
	 * therefore we can use an add operation to set them, which
	 * means an add operation for both cases.
	 */
	if (__predict_true(op == RW_READER)) {
		incr = RW_READ_INCR;
		set_wait = RW_HAS_WAITERS;
		need_wait = RW_WRITE_LOCKED | RW_WRITE_WANTED;
		queue = TS_READER_Q;
	} else {
		RW_DASSERT(rw, op == RW_WRITER);
		incr = curthread | RW_WRITE_LOCKED;
		set_wait = RW_HAS_WAITERS | RW_WRITE_WANTED;
		need_wait = RW_WRITE_LOCKED | RW_THREAD;
		queue = TS_WRITER_Q;
	}

	LOCKSTAT_ENTER(lsflag);

	KPREEMPT_DISABLE(curlwp);
	for (owner = rw->rw_owner; ;) {
		/*
		 * Read the lock owner field.  If the need-to-wait
		 * indicator is clear, then try to acquire the lock.
		 */
		if ((owner & need_wait) == 0) {
			next = rw_cas(rw, owner, (owner + incr) &
			    ~RW_WRITE_WANTED);
			if (__predict_true(next == owner)) {
				/* Got it! */
				membar_enter();
				break;
			}

			/*
			 * Didn't get it -- spin around again (we'll
			 * probably sleep on the next iteration).
			 */
			owner = next;
			continue;
		}
		if (__predict_false(panicstr != NULL)) {
			kpreempt_enable();
			return;
		}
		if (__predict_false(RW_OWNER(rw) == curthread)) {
			rw_abort(rw, __func__, "locking against myself");
		}
		/*
		 * If the lock owner is running on another CPU, and
		 * there are no existing waiters, then spin.
		 */
		if (rw_oncpu(owner)) {
			LOCKSTAT_START_TIMER(lsflag, spintime);
			u_int count = SPINLOCK_BACKOFF_MIN;
			do {
				KPREEMPT_ENABLE(curlwp);
				SPINLOCK_BACKOFF(count);
				KPREEMPT_DISABLE(curlwp);
				owner = rw->rw_owner;
			} while (rw_oncpu(owner));
			LOCKSTAT_STOP_TIMER(lsflag, spintime);
			LOCKSTAT_COUNT(spincnt, 1);
			if ((owner & need_wait) == 0)
				continue;
		}

		/*
		 * Grab the turnstile chain lock.  Once we have that, we
		 * can adjust the waiter bits and sleep queue.
		 */
		ts = turnstile_lookup(rw);

		/*
		 * Mark the rwlock as having waiters.  If the set fails,
		 * then we may not need to sleep and should spin again.
		 * Reload rw_owner because turnstile_lookup() may have
		 * spun on the turnstile chain lock.
		 */
		owner = rw->rw_owner;
		if ((owner & need_wait) == 0 || rw_oncpu(owner)) {
			turnstile_exit(rw);
			continue;
		}
		next = rw_cas(rw, owner, owner | set_wait);
		if (__predict_false(next != owner)) {
			turnstile_exit(rw);
			owner = next;
			continue;
		}

		LOCKSTAT_START_TIMER(lsflag, slptime);
		turnstile_block(ts, queue, rw, &rw_syncobj);
		LOCKSTAT_STOP_TIMER(lsflag, slptime);
		LOCKSTAT_COUNT(slpcnt, 1);

		/*
		 * No need for a memory barrier because of context switch.
		 * If not handed the lock, then spin again.
		 */
		if (op == RW_READER || (rw->rw_owner & RW_THREAD) == curthread)
			break;

		owner = rw->rw_owner;
	}
	KPREEMPT_ENABLE(curlwp);

	LOCKSTAT_EVENT(lsflag, rw, LB_RWLOCK |
	    (op == RW_WRITER ? LB_SLEEP1 : LB_SLEEP2), slpcnt, slptime);
	LOCKSTAT_EVENT(lsflag, rw, LB_RWLOCK | LB_SPIN, spincnt, spintime);
	LOCKSTAT_EXIT(lsflag);

	RW_DASSERT(rw, (op != RW_READER && RW_OWNER(rw) == curthread) ||
	    (op == RW_READER && RW_COUNT(rw) != 0));
	RW_LOCKED(rw, op);
}
/*
 * mutex_vector_enter() is called from the assembly mutex_enter() routine
 * if the lock is held or is not of type MUTEX_ADAPTIVE.
 */
void
mutex_vector_enter(mutex_impl_t *lp)
{
	kthread_id_t	owner;
	hrtime_t	sleep_time = 0;	/* how long we slept */
	uint_t		spin_count = 0;	/* how many times we spun */
	cpu_t		*cpup, *last_cpu;
	extern cpu_t	*cpu_list;
	turnstile_t	*ts;
	volatile mutex_impl_t *vlp = (volatile mutex_impl_t *)lp;
	int		backoff;	/* current backoff */
	int		backctr;	/* ctr for backoff */
	int		sleep_count = 0;

	ASSERT_STACK_ALIGNED();

	if (MUTEX_TYPE_SPIN(lp)) {
		lock_set_spl(&lp->m_spin.m_spinlock, lp->m_spin.m_minspl,
		    &lp->m_spin.m_oldspl);
		return;
	}

	if (!MUTEX_TYPE_ADAPTIVE(lp)) {
		mutex_panic("mutex_enter: bad mutex", lp);
		return;
	}

	/*
	 * Adaptive mutexes must not be acquired from above LOCK_LEVEL.
	 * We can migrate after loading CPU but before checking CPU_ON_INTR,
	 * so we must verify by disabling preemption and loading CPU again.
	 */
	cpup = CPU;
	if (CPU_ON_INTR(cpup) && !panicstr) {
		kpreempt_disable();
		if (CPU_ON_INTR(CPU))
			mutex_panic("mutex_enter: adaptive at high PIL", lp);
		kpreempt_enable();
	}

	CPU_STATS_ADDQ(cpup, sys, mutex_adenters, 1);

	if (&plat_lock_delay) {
		backoff = 0;
	} else {
		backoff = BACKOFF_BASE;
	}

	for (;;) {
spin:
		spin_count++;
		/*
		 * Add an exponential backoff delay before trying again
		 * to touch the mutex data structure.
		 * The spin_count test and call to nulldev are to prevent
		 * the compiler optimizer from eliminating the delay loop.
		 */
		if (&plat_lock_delay) {
			plat_lock_delay(&backoff);
		} else {
			/* delay */
			for (backctr = backoff; backctr; backctr--) {
				if (!spin_count)
					(void) nulldev();
			}

			backoff = backoff << 1;	/* double it */
			if (backoff > BACKOFF_CAP) {
				backoff = BACKOFF_CAP;
			}

			SMT_PAUSE();
		}

		if (panicstr)
			return;

		if ((owner = MUTEX_OWNER(vlp)) == NULL) {
			if (mutex_adaptive_tryenter(lp))
				break;
			continue;
		}

		if (owner == curthread)
			mutex_panic("recursive mutex_enter", lp);

		/*
		 * If lock is held but owner is not yet set, spin.
		 * (Only relevant for platforms that don't have cas.)
		 */
		if (owner == MUTEX_NO_OWNER)
			continue;

		/*
		 * When searching the other CPUs, start with the one where
		 * we last saw the owner thread.  If owner is running, spin.
		 *
		 * We must disable preemption at this point to guarantee
		 * that the list doesn't change while we traverse it
		 * without the cpu_lock mutex.  While preemption is
		 * disabled, we must revalidate our cached cpu pointer.
		 */
		kpreempt_disable();
		if (cpup->cpu_next == NULL)
			cpup = cpu_list;
		last_cpu = cpup;	/* mark end of search */
		do {
			if (cpup->cpu_thread == owner) {
				kpreempt_enable();
				goto spin;
			}
		} while ((cpup = cpup->cpu_next) != last_cpu);
		kpreempt_enable();

		/*
		 * The owner appears not to be running, so block.
		 * See the Big Theory Statement for memory ordering issues.
		 */
		ts = turnstile_lookup(lp);
		MUTEX_SET_WAITERS(lp);
		membar_enter();

		/*
		 * Recheck whether owner is running after waiters bit hits
		 * global visibility (above).  If owner is running, spin.
		 *
		 * Since we are at ipl DISP_LEVEL, kernel preemption is
		 * disabled, however we still need to revalidate our cached
		 * cpu pointer to make sure the cpu hasn't been deleted.
		 */
		if (cpup->cpu_next == NULL)
			last_cpu = cpup = cpu_list;
		do {
			if (cpup->cpu_thread == owner) {
				turnstile_exit(lp);
				goto spin;
			}
		} while ((cpup = cpup->cpu_next) != last_cpu);
		membar_consumer();

		/*
		 * If owner and waiters bit are unchanged, block.
		 */
		if (MUTEX_OWNER(vlp) == owner && MUTEX_HAS_WAITERS(vlp)) {
			sleep_time -= gethrtime();
			(void) turnstile_block(ts, TS_WRITER_Q, lp,
			    &mutex_sobj_ops, NULL, NULL);
			sleep_time += gethrtime();
			sleep_count++;
		} else {
			turnstile_exit(lp);
		}
	}

	ASSERT(MUTEX_OWNER(lp) == curthread);

	if (sleep_time != 0) {
		/*
		 * Note, sleep time is the sum of all the sleeping we
		 * did.
		 */
		LOCKSTAT_RECORD(LS_MUTEX_ENTER_BLOCK, lp, sleep_time);
	}

	/*
	 * We do not count a sleep as a spin.
	 */
	if (spin_count > sleep_count)
		LOCKSTAT_RECORD(LS_MUTEX_ENTER_SPIN, lp,
		    spin_count - sleep_count);

	LOCKSTAT_RECORD0(LS_MUTEX_ENTER_ACQUIRE, lp);
}
int
atomicGet(volatile atomic_t& val)
{
    long temp = val.l;
    membar_enter();
    return temp;
}
/*
 * Called by the master in the TSC sync operation (usually the boot CPU).
 * If the slave is discovered to have a skew, gethrtimef will be changed to
 * point to tsc_gethrtime_delta().  Calculating skews is precise only when
 * the master and slave TSCs are read simultaneously; however, there is no
 * algorithm that can read both CPUs in perfect simultaneity.  The proposed
 * algorithm is an approximate method based on the behaviour of cache
 * management.  The slave CPU continuously reads TSC and then reads a global
 * variable which the master CPU updates.  The moment the master's update
 * becomes visible to the slave (being forced by an mfence operation), we use
 * the TSC reading taken on the slave.  A corresponding TSC read will be
 * taken on the master as soon as possible after finishing the mfence
 * operation.  But the delay between causing the slave to notice the invalid
 * cache line and the completion of mfence is not repeatable.  This error is
 * heuristically assumed to be 1/4th of the total write time as measured by
 * the two TSC reads on the master sandwiching the mfence.  Furthermore, due
 * to the nature of bus arbitration, contention on the memory bus, etc., the
 * time taken for the write to reflect globally can vary a lot.  So instead
 * of taking a single reading, a set of readings are taken and the one with
 * the least write time is chosen to calculate the final skew.
 *
 * TSC sync is disabled in the context of virtualization because the CPUs
 * assigned to the guest are virtual CPUs, which means the real CPUs on which
 * the guest runs keep changing during the lifetime of the guest OS.  So we
 * would end up calculating TSC skews for a set of CPUs during boot whereas
 * the guest might migrate to a different set of physical CPUs at a later
 * point of time.
 */
void
tsc_sync_master(processorid_t slave)
{
	ulong_t flags, source, min_write_time = ~0UL;
	hrtime_t write_time, x, mtsc_after, tdelta;
	tsc_sync_t *tsc = tscp;
	int cnt;
	int hwtype;

	hwtype = get_hwenv();
	if (!tsc_master_slave_sync_needed || hwtype == HW_XEN_HVM ||
	    hwtype == HW_VMWARE)
		return;

	flags = clear_int_flag();
	source = CPU->cpu_id;

	for (cnt = 0; cnt < SYNC_ITERATIONS; cnt++) {
		while (tsc_sync_go != TSC_SYNC_GO)
			SMT_PAUSE();

		tsc->master_tsc = tsc_read();
		membar_enter();
		mtsc_after = tsc_read();
		while (tsc_sync_go != TSC_SYNC_DONE)
			SMT_PAUSE();
		write_time = mtsc_after - tsc->master_tsc;
		if (write_time <= min_write_time) {
			min_write_time = write_time;
			/*
			 * Apply heuristic adjustment only if the calculated
			 * delta is > 1/4th of the write time.
			 */
			x = tsc->slave_tsc - mtsc_after;
			if (x < 0)
				x = -x;
			if (x > (min_write_time/4))
				/*
				 * Subtract 1/4th of the measured write time
				 * from the master's TSC value, as an estimate
				 * of how late the mfence completion came
				 * after the slave noticed the cache line
				 * change.
				 */
				tdelta = tsc->slave_tsc -
				    (mtsc_after - (min_write_time/4));
			else
				tdelta = tsc->slave_tsc - mtsc_after;
			tsc_sync_tick_delta[slave] =
			    tsc_sync_tick_delta[source] - tdelta;
		}

		tsc->master_tsc = tsc->slave_tsc = write_time = 0;
		membar_enter();
		tsc_sync_go = TSC_SYNC_STOP;
	}
	if (tdelta < 0)
		tdelta = -tdelta;
	if (tdelta > largest_tsc_delta)
		largest_tsc_delta = tdelta;
	if (min_write_time < shortest_write_time)
		shortest_write_time = min_write_time;

	/*
	 * Enable delta variants of tsc functions if the largest of all chosen
	 * deltas is > smallest of the write time.
	 */
	if (largest_tsc_delta > shortest_write_time) {
		gethrtimef = tsc_gethrtime_delta;
		gethrtimeunscaledf = tsc_gethrtimeunscaled_delta;
	}
	restore_int_flag(flags);
}
static int
machtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
	switch (cmd) {
	case DDI_ATTACH:
		break;
	case DDI_RESUME:
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}

#if !defined(__APPLE__)
	machtrace_probe = (void (*)())dtrace_probe;
	membar_enter();

	if (ddi_create_minor_node(devi, "machtrace", S_IFCHR, 0,
	    DDI_PSEUDO, NULL) == DDI_FAILURE ||
	    dtrace_register("mach_trap", &machtrace_attr, DTRACE_PRIV_USER,
	    NULL, &machtrace_pops, NULL, &machtrace_id) != 0) {
		machtrace_probe = systrace_stub;
#else
	machtrace_probe = dtrace_probe;
	membar_enter();

	if (ddi_create_minor_node(devi, "machtrace", S_IFCHR, 0,
	    DDI_PSEUDO, 0) == DDI_FAILURE ||
	    dtrace_register("mach_trap", &machtrace_attr, DTRACE_PRIV_USER,
	    NULL, &machtrace_pops, NULL, &machtrace_id) != 0) {
		machtrace_probe = (void (*))&systrace_stub;
#endif /* __APPLE__ */
		ddi_remove_minor_node(devi, NULL);
		return (DDI_FAILURE);
	}

	ddi_report_dev(devi);
	machtrace_devi = devi;

	return (DDI_SUCCESS);
}

d_open_t _systrace_open;

int
_systrace_open(dev_t dev, int flags, int devtype, struct proc *p)
{
#pragma unused(dev,flags,devtype,p)
	return 0;
}

#define SYSTRACE_MAJOR -24 /* let the kernel pick the device number */

/*
 * A struct describing which functions will get invoked for certain
 * actions.
 */
static struct cdevsw systrace_cdevsw =
{
	_systrace_open,		/* open */
	eno_opcl,		/* close */
	eno_rdwrt,		/* read */
	eno_rdwrt,		/* write */
	eno_ioctl,		/* ioctl */
	(stop_fcn_t *)nulldev,	/* stop */
	(reset_fcn_t *)nulldev,	/* reset */
	NULL,			/* tty's */
	eno_select,		/* select */
	eno_mmap,		/* mmap */
	eno_strat,		/* strategy */
	eno_getc,		/* getc */
	eno_putc,		/* putc */
	0			/* type */
};

static int gSysTraceInited = 0;

void systrace_init(void);

void
systrace_init(void)
{
	if (0 == gSysTraceInited) {
		int majdevno = cdevsw_add(SYSTRACE_MAJOR, &systrace_cdevsw);

		if (majdevno < 0) {
			printf("systrace_init: failed to allocate a major number!\n");
			gSysTraceInited = 0;
			return;
		}

		systrace_attach((dev_info_t *)(uintptr_t)majdevno, DDI_ATTACH);
		machtrace_attach((dev_info_t *)(uintptr_t)majdevno, DDI_ATTACH);

		gSysTraceInited = 1;
	} else
		panic("systrace_init: called twice!\n");
}
#undef SYSTRACE_MAJOR
#endif /* __APPLE__ */

static uint64_t
systrace_getarg(void *arg, dtrace_id_t id, void *parg, int argno, int aframes)
{
#pragma unused(arg,id,parg,aframes)	/* __APPLE__ */
	uint64_t val = 0;
	syscall_arg_t *stack = (syscall_arg_t *)NULL;

	uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());

	if (uthread)
		stack = (syscall_arg_t *)uthread->t_dtrace_syscall_args;

	if (!stack)
		return (0);

	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
	/* dtrace_probe arguments arg0 .. arg4 are 64bits wide */
	val = (uint64_t)*(stack + argno);
	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
	return (val);
}
static int
pthread__rwlock_wrlock(pthread_rwlock_t *ptr, const struct timespec *ts)
{
	uintptr_t owner, next;
	pthread_mutex_t *interlock;
	pthread_t self;
	int error;

	self = pthread__self();

#ifdef ERRORCHECK
	if (ptr->ptr_magic != _PT_RWLOCK_MAGIC)
		return EINVAL;
#endif

	for (owner = (uintptr_t)ptr->ptr_owner;; owner = next) {
		/*
		 * Read the lock owner field.  If the need-to-wait
		 * indicator is clear, then try to acquire the lock.
		 */
		if ((owner & RW_THREAD) == 0) {
			next = rw_cas(ptr, owner,
			    (uintptr_t)self | RW_WRITE_LOCKED);
			if (owner == next) {
				/* Got it! */
#ifndef PTHREAD__ATOMIC_IS_MEMBAR
				membar_enter();
#endif
				return 0;
			}

			/*
			 * Didn't get it -- spin around again (we'll
			 * probably sleep on the next iteration).
			 */
			continue;
		}

		if ((owner & RW_THREAD) == (uintptr_t)self)
			return EDEADLK;

		/* If held write locked and no waiters, spin. */
		if (pthread__rwlock_spin(owner)) {
			while (pthread__rwlock_spin(owner)) {
				owner = (uintptr_t)ptr->ptr_owner;
			}
			next = owner;
			continue;
		}

		/*
		 * Grab the interlock.  Once we have that, we
		 * can adjust the waiter bits and sleep queue.
		 */
		interlock = pthread__hashlock(ptr);
		pthread_mutex_lock(interlock);

		/*
		 * Mark the rwlock as having waiters.  If the set fails,
		 * then we may not need to sleep and should spin again.
		 */
		next = rw_cas(ptr, owner,
		    owner | RW_HAS_WAITERS | RW_WRITE_WANTED);
		if (owner != next) {
			pthread_mutex_unlock(interlock);
			continue;
		}

		/* The waiters bit is set - it's safe to sleep. */
		PTQ_INSERT_TAIL(&ptr->ptr_wblocked, self, pt_sleep);
		self->pt_rwlocked = _RW_WANT_WRITE;
		self->pt_sleepobj = &ptr->ptr_wblocked;
		self->pt_early = pthread__rwlock_early;
		error = pthread__park(self, interlock, &ptr->ptr_wblocked,
		    ts, 0, &ptr->ptr_wblocked);

		/* Did we get the lock? */
		if (self->pt_rwlocked == _RW_LOCKED) {
#ifndef PTHREAD__ATOMIC_IS_MEMBAR
			membar_enter();
#endif
			return 0;
		}
		if (error != 0)
			return error;

		pthread__errorfunc(__FILE__, __LINE__, __func__,
		    "direct handoff failure");
	}
}