/* CAS-less unlock, not quite as efficient and will make sure every vcore runs
 * (since we don't have a convenient way to make sure our qnode->next runs
 * yet, other than making sure everyone runs).
 *
 * To use this without ensuring all vcores run, you'll need the unlock code to
 * save pred to a specific field in the qnode and check both its initial pred
 * as well as its run time pred (who could be a usurper).  It's all possible,
 * but a little more difficult to follow.  Also, I'm adjusting this comment
 * months after writing it originally, so it is probably not sufficient, but
 * necessary. */
void __mcs_pdro_unlock_no_cas(struct mcs_pdro_lock *lock,
                              struct mcs_pdro_qnode *qnode)
{
	struct mcs_pdro_qnode *old_tail, *usurper;

	/* Check if someone is already waiting on us to unlock */
	if (qnode->next == 0) {
		cmb();	/* no need for CPU mbs, since there's an atomic_swap() */
		/* Unlock it */
		old_tail = atomic_swap_ptr((void**)&lock->lock, 0);
		/* no one else was already waiting, so we successfully unlocked and
		 * can return */
		if (old_tail == qnode)
			return;
		/* someone else was already waiting on the lock (last one on the
		 * list), and we accidentally took them off.  Try to put it back. */
		usurper = atomic_swap_ptr((void**)&lock->lock, old_tail);
		/* since someone else was waiting, they should have made themselves
		 * our next.  spin (very briefly!) til it happens. */
		while (qnode->next == 0) {
			/* make sure old_tail isn't preempted.  best we can do for now
			 * is to make sure all vcores run, and thereby get our next. */
			for (int i = 0; i < max_vcores(); i++)
				ensure_vcore_runs(i);
			cpu_relax();
		}
		if (usurper) {
			/* a usurper is someone who snuck in before we could put the old
			 * tail back.  They now have the lock.  Let's put whoever is
			 * supposed to be next as their next one.
			 *
			 * First, we need to change our next's pred.  There's a slight
			 * race here, so our next will need to make sure both us and
			 * pred are done */
			/* I was trying to do something so we didn't need to ensure all
			 * vcores run, using more space in the qnode to figure out who
			 * our pred was at lock time (guessing actually, since there's a
			 * race, etc). */
			//qnode->next->pred = usurper;
			//wmb();
			usurper->next = qnode->next;
			/* could imagine another wmb() and a flag so our next knows to
			 * no longer check us too. */
		} else {
			/* No usurper meant we put things back correctly, so we should
			 * just pass the lock / unlock whoever is next */
			qnode->next->locked = 0;
		}
	} else {
		/* mb()s necessary since we didn't call an atomic_swap() */
		wmb();	/* need to make sure any previous writes don't pass unlocking */
		rwmb();	/* need to make sure any reads happen before the unlocking */
		/* simply unlock whoever is next */
		qnode->next->locked = 0;
	}
}
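For contrast with the CAS-less version above, here is a minimal sketch of the CAS-based unlock it replaces, assuming an atomic_cas_ptr(addr, expected, new) that returns true on success. It omits the PDR ensure-vcores-run logic, so it only shows the basic MCS tail handoff, not a preemption-safe routine.

/* Hedged sketch only: standard MCS-style unlock with a CAS fast path. */
void __mcs_pdro_unlock_cas_sketch(struct mcs_pdro_lock *lock,
                                  struct mcs_pdro_qnode *qnode)
{
	/* No known successor: try to swing the tail from us back to 0. */
	if (qnode->next == 0) {
		cmb();
		if (atomic_cas_ptr((void**)&lock->lock, qnode, 0))
			return;	/* we were still the tail; lock is now free */
		/* A successor swapped itself in; wait for it to link to us. */
		while (qnode->next == 0)
			cpu_relax();
	}
	/* Pass the lock directly to our successor. */
	qnode->next->locked = 0;
}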
/* Internal version of the locking function, doesn't care if notifs are
 * disabled.  While spinning, we'll check to see if other vcores involved in
 * the locking are running.  If we change to that vcore, we'll continue when
 * our vcore gets restarted.  If the change fails, it is because the vcore is
 * running, and we'll continue.
 *
 * It's worth noting that changing to another vcore won't hurt correctness.
 * Even if they are no longer the lockholder, they will be checking preemption
 * messages and will help break out of the deadlock.  So long as we don't
 * spin uncontrollably, we're okay. */
void __mcs_pdro_lock(struct mcs_pdro_lock *lock, struct mcs_pdro_qnode *qnode)
{
	struct mcs_pdro_qnode *predecessor;
	uint32_t pred_vcoreid;

	/* Now the actual lock */
	qnode->next = 0;
	cmb();	/* swap provides a CPU mb() */
	predecessor = atomic_swap_ptr((void**)&lock->lock, qnode);
	if (predecessor) {
		qnode->locked = 1;
		/* Read-in the vcoreid before releasing them.  We won't need to worry
		 * about their qnode memory being freed/reused (they can't til we
		 * fill in the 'next' slot), which is a bit of a performance win.
		 * This also cuts down on cache-line contention when we ensure they
		 * run, which helps a lot too. */
		pred_vcoreid = ACCESS_ONCE(predecessor->vcoreid);
		wmb();	/* order the locked write before the next write */
		predecessor->next = qnode;
		/* no need for a wrmb(), since this will only get unlocked after they
		 * read our previous write */
		while (qnode->locked) {
			/* We don't know who the lock holder is (it hurts performance
			 * via 'true' sharing to track it).  Instead we'll make sure our
			 * pred is running, which trickles up to the lock holder. */
			ensure_vcore_runs(pred_vcoreid);
			cpu_relax();
		}
	}
}
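For reference, here is a hedged reconstruction of the qnode and lock layout these PDRO routines appear to assume, based purely on the fields they touch (next, locked, vcoreid, lock); the real definitions may differ.

/* Assumed layout, reconstructed from usage above; not authoritative. */
struct mcs_pdro_qnode {
	struct mcs_pdro_qnode *next;	/* successor in the MCS queue */
	int locked;			/* spin while nonzero */
	uint32_t vcoreid;		/* owner of this qnode, for PDR recovery */
};

struct mcs_pdro_lock {
	struct mcs_pdro_qnode *lock;	/* tail of the queue; 0 when free */
};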
static inline void
rw_swap(krwlock_t *rw, uintptr_t o, uintptr_t n)
{

	RW_INHERITDEBUG(n, o);
	n = (uintptr_t)atomic_swap_ptr((volatile void *)&rw->rw_owner,
	    (void *)n);
	RW_DASSERT(rw, n == o);
}
void *wfl_remove(struct wfl *list)
{
	for (struct wfl_entry *p = list->head; p != NULL; p = p->next) {
		if (p->data != NULL) {
			void *data = atomic_swap_ptr(&p->data, 0);
			if (data != NULL)
				return data;
		}
	}
	return NULL;
}
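wfl_remove() only makes sense against a particular list layout. The sketch below reconstructs a plausible one from the fields it uses (head, next, data) and pairs it with a slot-reusing insert. The struct definitions and wfl_insert() are assumptions for illustration, as is atomic_cas_ptr(addr, expected, new) returning true on success; the real list implementation may differ.

#include <stdlib.h>

/* Assumed layout; illustrative only. */
struct wfl_entry {
	void *data;			/* NULL marks a reusable empty slot */
	struct wfl_entry *next;
};

struct wfl {
	struct wfl_entry *head;
};

/* Sketch of an insert that pairs with the swap-based remove above. */
void wfl_insert(struct wfl *list, void *data)
{
	/* First try to reuse a slot that a remover emptied. */
	for (struct wfl_entry *p = list->head; p != NULL; p = p->next) {
		if (p->data == NULL && atomic_cas_ptr(&p->data, NULL, data))
			return;
	}
	/* No free slot: push a fresh entry at the head.  The CAS loop keeps
	 * concurrent inserters from losing each other's entries. */
	struct wfl_entry *e = malloc(sizeof(*e));
	e->data = data;
	do {
		e->next = list->head;
	} while (!atomic_cas_ptr((void**)&list->head, e->next, e));
}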
void
rump_unschedule_cpu1(struct lwp *l, void *interlock)
{
	struct rumpcpu *rcpu;
	struct cpu_info *ci;
	void *old;

	ci = l->l_cpu;
	ci->ci_curlwp = ci->ci_data.cpu_onproc = NULL;
	rcpu = cpuinfo_to_rumpcpu(ci);

	KASSERT(rcpu->rcpu_ci == ci);

	/*
	 * Make sure all stores are seen before the CPU release.  This
	 * is relevant only in the non-fastpath scheduling case, but
	 * we don't know here if that's going to happen, so need to
	 * expect the worst.
	 *
	 * If the scheduler interlock was requested by the caller, we
	 * need to obtain it before we release the CPU.  Otherwise, we
	 * risk a race condition where another thread is scheduled onto
	 * the rump kernel CPU before our current thread can grab the
	 * interlock.
	 */
	if (interlock == rcpu->rcpu_mtx)
		rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
	else
		membar_exit();

	/* Release the CPU. */
	old = atomic_swap_ptr(&rcpu->rcpu_prevlwp, l);

	/* No waiters?  No problems.  We're outta here. */
	if (old == RCPULWP_BUSY) {
		return;
	}

	KASSERT(old == RCPULWP_WANTED);

	/*
	 * Ok, things weren't so snappy.
	 *
	 * Snailpath: take lock and signal anyone waiting for this CPU.
	 */

	if (interlock != rcpu->rcpu_mtx)
		rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
	if (rcpu->rcpu_wanted)
		rumpuser_cv_broadcast(rcpu->rcpu_cv);
	if (interlock != rcpu->rcpu_mtx)
		rumpuser_mutex_exit(rcpu->rcpu_mtx);
}
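Both this release path and rump_schedule_cpu_interlock() later in this section coordinate through the rcpu_prevlwp slot. The sketch below spells out that three-state protocol; the sentinel values are illustrative assumptions, not the authoritative NetBSD definitions.

/*
 * Assumed protocol for rcpu_prevlwp, reconstructed from the two routines
 * in this section.  The slot holds either:
 *   - the previous lwp: the CPU is free, and that lwp gets the fastpath;
 *   - RCPULWP_BUSY:     an lwp currently owns the CPU;
 *   - RCPULWP_WANTED:   a waiter is parked on rcpu_cv and must be woken.
 * The sentinel values below are assumptions for illustration.
 */
#define	RCPULWP_BUSY	((void *)-1)
#define	RCPULWP_WANTED	((void *)-2)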
static void
pefs_aesni_uninit(struct pefs_alg *pa)
{
	struct fpu_kern_ctx *fpu_ctx;
	u_int cpuid;

	CPU_FOREACH(cpuid) {
		fpu_ctx = (void *)atomic_swap_ptr(
		    (volatile void *)DPCPU_ID_PTR(cpuid, pefs_aesni_fpu),
		    (uintptr_t)NULL);
		if (fpu_ctx != NULL)
			fpu_kern_free_ctx(fpu_ctx);
	}
}
void lt_rwlock_write_unlock(lt_rwlock_t * lock)
{
	lt_thread_list_t * place_holder = NULL;

	/* As of this point, arriving readers should go into readers[1], the
	 * readers in readers[0] should go there too, and readers[1] should
	 * be empty.
	 * Hence, we use place-holder lists for arriving readers, which we will
	 * add to readers[1] once we're sure the scheduler will take the proper
	 * list.
	 *
	 * place_holder: empty list of threads;
	 *
	 * we assume readers[1] is empty */
	assert(lt_thread_list_empty(lock->readers[1]));
	/* so we put the place-holder at readers[0] */
	atomic_swap_ptr(lock->readers[0], place_holder);
	/* place_holder now contains readers[0];
	 * we now de-queue ourselves */
	assert(lt_thread_eq(lt_thread_queue_deq(lock->writers),
	                    lt_thread_self()) == 0);
	/* if the writers queue is empty, the scheduler will schedule into
	 * readers[1]; if not, it will continue scheduling into readers[0],
	 * which is now the place-holder. */
	/* we add whatever we swapped out to readers[1] */
	lt_thread_list_move(lock->readers[1], place_holder);
	/* we swap again.  As readers[0] only contains sleeping threads, we're
	 * sure all the threads in there are asleep and won't think they're in
	 * readers[1] */
	atomic_swap_ptr(place_holder, lock->readers[0]);
	/* we move to readers[1] whatever arrived in readers[0] between the time
	 * we put the place-holder in place and the time we removed ourselves
	 * from the writers queue (which may have made it possible to schedule
	 * into readers[1]).  If the scheduler is still scheduling into
	 * readers[0], these threads just got lucky */
	lt_thread_list_move(lock->readers[1], place_holder);
	/* now, we wake everyone in readers[1] */
	lt_thread_list_for_each(lock->readers[1], lt_thread_wake);
	/* and we're done */
}
/* Similar to the original PDR lock, this tracks the lockholder for better
 * recovery from preemptions.  Under heavy contention, changing to the
 * lockholder instead of pred makes it more likely to have a vcore outside the
 * MCS chain handle the preemption.  If that never happens, performance will
 * suffer.
 *
 * Simply checking the lockholder causes a lot of unnecessary traffic, so we
 * first look for signs of preemption in read-mostly locations (by comparison,
 * the lockholder changes on every lock/unlock).
 *
 * We also use the "qnodes are in the lock" style, which is slightly slower
 * than using the stack in regular MCS/MCSPDR locks, but it speeds PDR up a
 * bit by not having to read other qnodes' memory to determine their vcoreid.
 * The slowdown may be due to some weird caching/prefetch settings (like
 * Adjacent Cacheline Prefetch).
 *
 * Note that these locks, like all PDR locks, have opportunities to
 * accidentally ensure some vcore runs that isn't in the chain.  Whenever we
 * read lockholder or even pred, that particular vcore might subsequently
 * unlock and then get preempted (or change_to someone else) before we ensure
 * they run.  If this happens and there is another VC in the MCS chain, it
 * will make sure the right cores run.  If there are no other vcores in the
 * chain, it is up to the rest of the vcore/event handling system to deal with
 * this, which should happen when one of the other vcores handles the
 * preemption message generated by our change_to. */
void __mcs_pdr_lock(struct mcs_pdr_lock *lock, struct mcs_pdr_qnode *qnode)
{
	struct mcs_pdr_qnode *predecessor;
	uint32_t pred_vcoreid;
	struct mcs_pdr_qnode *qnode0 = qnode - vcore_id();
	seq_ctr_t seq;

	qnode->next = 0;
	cmb();	/* swap provides a CPU mb() */
	predecessor = atomic_swap_ptr((void**)&lock->lock, qnode);
	if (predecessor) {
		qnode->locked = 1;
		pred_vcoreid = predecessor - qnode0;	/* can compute this whenever */
		wmb();	/* order the locked write before the next write */
		predecessor->next = qnode;
		seq = ACCESS_ONCE(__procinfo.coremap_seqctr);
		/* no need for a wrmb(), since this will only get unlocked after they
		 * read our pred->next write */
		while (qnode->locked) {
			/* Check to see if anything is amiss.  If someone in the chain
			 * is preempted, then someone will notice.  Simply checking our
			 * pred isn't that great of an indicator of preemption.  The
			 * reason is that the offline vcore is most likely the
			 * lockholder (under heavy lock contention), and we want someone
			 * farther back in the chain to notice (someone that will stay
			 * preempted long enough for a vcore outside the chain to
			 * recover them).  Checking the seqctr will tell us of any
			 * preempts since we started, so if a storm starts while we're
			 * spinning, we can join in and try to save the lockholder
			 * before its successor gets it.
			 *
			 * Also, if we're the lockholder, then we need to let our pred
			 * run so they can hand us the lock. */
			if (vcore_is_preempted(pred_vcoreid) ||
			    seq != __procinfo.coremap_seqctr) {
				if (lock->lockholder_vcoreid == MCSPDR_NO_LOCKHOLDER ||
				    lock->lockholder_vcoreid == vcore_id())
					ensure_vcore_runs(pred_vcoreid);
				else
					ensure_vcore_runs(lock->lockholder_vcoreid);
			}
			cpu_relax();
		}
	} else {
		lock->lockholder_vcoreid = vcore_id();
	}
}
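The matching release is not shown here. Below is a hedged sketch of what an unlock under the same conventions could look like, assuming atomic_cas_ptr(addr, expected, new) returns true on success and that qnodes sit in a per-vcore array so pointer subtraction yields a vcoreid. PDR recovery while waiting for a late successor is omitted, so this shows only the handoff logic, not the real routine.

/* Sketch only: pass the lockholder hint to our successor, then unlock it. */
void __mcs_pdr_unlock_sketch(struct mcs_pdr_lock *lock,
                             struct mcs_pdr_qnode *qnode)
{
	struct mcs_pdr_qnode *qnode0 = qnode - vcore_id();

	if (qnode->next == 0) {
		cmb();
		/* Still the tail: clear the lock and the lockholder hint.  The
		 * hint write can race with the next acquirer setting it; it is
		 * treated as best-effort. */
		if (atomic_cas_ptr((void**)&lock->lock, qnode, 0)) {
			lock->lockholder_vcoreid = MCSPDR_NO_LOCKHOLDER;
			return;
		}
		/* A successor is linking in; wait for its 'next' write.  (A real
		 * PDR unlock would also ensure the successor runs here.) */
		while (qnode->next == 0)
			cpu_relax();
	}
	/* Advertise the next lockholder, then hand over the lock. */
	lock->lockholder_vcoreid = qnode->next - qnode0;
	wmb();	/* order our writes before releasing the successor */
	qnode->next->locked = 0;
}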
//  Perform atomic 'exchange pointers' operation.  Pointer is set
//  to the 'val' value.  Old value is returned.
inline T *xchg (T *val_)
{
#if defined ZMQ_ATOMIC_PTR_WINDOWS
    return (T*) InterlockedExchangePointer ((PVOID*) &ptr, val_);
#elif defined ZMQ_ATOMIC_PTR_INTRINSIC
    return (T*) __atomic_exchange_n (&ptr, val_, __ATOMIC_ACQ_REL);
#elif defined ZMQ_ATOMIC_PTR_CXX11
    return ptr.exchange (val_, std::memory_order_acq_rel);
#elif defined ZMQ_ATOMIC_PTR_ATOMIC_H
    return (T*) atomic_swap_ptr (&ptr, val_);
#elif defined ZMQ_ATOMIC_PTR_TILE
    return (T*) arch_atomic_exchange (&ptr, val_);
#elif defined ZMQ_ATOMIC_PTR_X86
    T *old;
    __asm__ volatile (
        "lock; xchg %0, %2"
        : "=r" (old), "=m" (ptr)
        : "m" (ptr), "0" (val_));
    return old;
#elif defined ZMQ_ATOMIC_PTR_ARM
    T *old;
    unsigned int flag;
    __asm__ volatile (
        " dmb sy\n\t"
        "1: ldrex %1, [%3]\n\t"
        " strex %0, %4, [%3]\n\t"
        " teq %0, #0\n\t"
        " bne 1b\n\t"
        " dmb sy\n\t"
        : "=&r" (flag), "=&r" (old), "+Qo" (ptr)
        : "r" (&ptr), "r" (val_)
        : "cc");
    return old;
#elif defined ZMQ_ATOMIC_PTR_MUTEX
    sync.lock ();
    T *old = (T*) ptr;
    ptr = val_;
    sync.unlock ();
    return old;
#else
#error atomic_ptr is not implemented for this platform
#endif
}
static void
pefs_aesni_enter(struct pefs_session *xses)
{
	struct pefs_aesni_ses *ses = &xses->o.ps_aesni;

	if (is_fpu_kern_thread(0)) {
		ses->fpu_saved = 0;
		return;
	}

	critical_enter();
	ses->fpu_ctx = (void *)atomic_swap_ptr(
	    (volatile void *)DPCPU_PTR(pefs_aesni_fpu), (uintptr_t)NULL);
	if (ses->fpu_ctx != NULL) {
		ses->td = curthread;
		ses->fpu_cpuid = curcpu;
		fpu_kern_enter(ses->td, ses->fpu_ctx, FPU_KERN_NORMAL);
		ses->fpu_saved = 1;
	} else {
		ses->fpu_saved = -1;
	}
	critical_exit();
}
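For symmetry with the enter path, here is a hedged sketch of a leave routine that would hand the borrowed per-CPU FPU context back. The fields mirror pefs_aesni_enter() above, but the exact logic (including how fpu_saved <= 0 is treated) is an assumption rather than the real pefs code.

/* Sketch only: return the per-CPU FPU context taken in pefs_aesni_enter(). */
static void
pefs_aesni_leave_sketch(struct pefs_session *xses)
{
	struct pefs_aesni_ses *ses = &xses->o.ps_aesni;

	if (ses->fpu_saved <= 0)
		return;		/* FPU kern thread, or the fallback path */

	fpu_kern_leave(ses->td, ses->fpu_ctx);

	/* Hand the context back to the CPU it was taken from.  If that slot
	 * was refilled in the meantime, free our copy instead of leaking it. */
	if (atomic_cmpset_ptr((volatile uintptr_t *)DPCPU_ID_PTR(ses->fpu_cpuid,
	    pefs_aesni_fpu), (uintptr_t)NULL, (uintptr_t)ses->fpu_ctx) == 0)
		fpu_kern_free_ctx(ses->fpu_ctx);
}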
//  Perform atomic 'exchange pointers' operation.  Pointer is set
//  to the 'val' value.  Old value is returned.
inline T *xchg (T *val_)
{
#if defined XS_ATOMIC_PTR_WINDOWS
    return (T*) InterlockedExchangePointer ((PVOID*) &ptr, val_);
#elif defined XS_ATOMIC_PTR_ATOMIC_H
    return (T*) atomic_swap_ptr (&ptr, val_);
#elif defined XS_ATOMIC_PTR_X86
    T *old;
    __asm__ volatile (
        "lock; xchg %0, %2"
        : "=r" (old), "=m" (ptr)
        : "m" (ptr), "0" (val_));
    return old;
#elif defined XS_ATOMIC_PTR_MUTEX
    sync.lock ();
    T *old = (T*) ptr;
    ptr = val_;
    sync.unlock ();
    return old;
#else
#error atomic_ptr is not implemented for this platform
#endif
}
/*
 * Acquire a lock waiting (spin or sleep) for it to become available.
 */
void
_lock_acquire(struct lock *lck, struct lockuser *lu, int prio)
{
	int i;
	int lval;

	/**
	 * XXX - We probably want to remove these checks to optimize
	 *       performance.  It is also a bug if any one of the
	 *       checks fail, so it's probably better to just let it
	 *       SEGV and fix it.
	 */
#if 0
	if (lck == NULL || lu == NULL || lck->l_head == NULL)
		return;
#endif
	if ((lck->l_type & LCK_PRIORITY) != 0) {
		LCK_ASSERT(lu->lu_myreq->lr_locked == 1);
		LCK_ASSERT(lu->lu_myreq->lr_watcher == NULL);
		LCK_ASSERT(lu->lu_myreq->lr_owner == lu);
		LCK_ASSERT(lu->lu_watchreq == NULL);
		lu->lu_priority = prio;
	}
	/*
	 * Atomically swap the head of the lock request with
	 * this request.
	 */
	atomic_swap_ptr((void *)&lck->l_head, lu->lu_myreq,
	    (void *)&lu->lu_watchreq);

	if (lu->lu_watchreq->lr_locked != 0) {
		atomic_store_rel_ptr(
		    (volatile uintptr_t *)(void *)&lu->lu_watchreq->lr_watcher,
		    (uintptr_t)lu);
		if ((lck->l_wait == NULL) ||
		    ((lck->l_type & LCK_ADAPTIVE) == 0)) {
			while (lu->lu_watchreq->lr_locked != 0)
				;	/* spin, then yield? */
		} else {
			/*
			 * Spin for a bit before invoking the wait function.
			 *
			 * We should be a little smarter here.  If we're
			 * running on a single processor, then the lock
			 * owner got preempted and spinning will accomplish
			 * nothing but waste time.  If we're running on
			 * multiple processors, the owner could be running
			 * on another CPU and we might acquire the lock if
			 * we spin for a bit.
			 *
			 * The other thing to keep in mind is that threads
			 * acquiring these locks are considered to be in
			 * critical regions; they will not be preempted by
			 * the _UTS_ until they release the lock.  It is
			 * therefore safe to assume that if a lock can't
			 * be acquired, it is currently held by a thread
			 * running in another KSE.
			 */
			for (i = 0; i < MAX_SPINS; i++) {
				if (lu->lu_watchreq->lr_locked == 0)
					return;
				if (lu->lu_watchreq->lr_active == 0)
					break;
			}
			atomic_swap_int(&lu->lu_watchreq->lr_locked,
			    2, &lval);
			if (lval == 0)
				lu->lu_watchreq->lr_locked = 0;
			else
				lck->l_wait(lck, lu);
		}
	}
	lu->lu_myreq->lr_active = 1;
}
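The request-chain fields used above (lu_myreq, lu_watchreq, lr_locked, lr_watcher, lr_active) imply roughly the following layout. This is a hedged reconstruction from the usage in _lock_acquire(), not the library's actual headers.

/* Assumed layout, reconstructed from the calls above; illustrative only. */
struct lockreq {
	struct lockuser	*lr_watcher;	/* thread spinning/sleeping on this request */
	struct lockuser	*lr_owner;	/* thread that enqueued this request */
	int		 lr_locked;	/* 0 means the watcher may take the lock */
	int		 lr_active;	/* watcher is currently running on a CPU */
};

struct lockuser {
	struct lockreq	*lu_myreq;	/* request we publish for the next locker */
	struct lockreq	*lu_watchreq;	/* predecessor's request we wait on */
	int		 lu_priority;	/* used when LCK_PRIORITY is set */
};

struct lock {
	struct lockreq	*l_head;	/* most recently enqueued request */
	int		 l_type;	/* LCK_PRIORITY, LCK_ADAPTIVE, ... */
	void		(*l_wait)(struct lock *, struct lockuser *);
					/* sleep hook used after spinning */
};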
static inline mcs_lock_qnode_t *mcs_qnode_swap(mcs_lock_qnode_t **addr,
                                               mcs_lock_qnode_t *val)
{
	return (mcs_lock_qnode_t*)atomic_swap_ptr((void**)addr, val);
}
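A helper like this is typically the swap step of a plain MCS acquire. The sketch below shows that usage, assuming an mcs_lock_t whose 'lock' field is the queue tail and qnodes with 'next'/'locked' fields, mirroring the other MCS code in this section; it is illustrative, not the real acquire routine.

/* Sketch only: plain MCS acquire built on mcs_qnode_swap(). */
void mcs_lock_lock_sketch(mcs_lock_t *lock, mcs_lock_qnode_t *qnode)
{
	mcs_lock_qnode_t *predecessor;

	qnode->next = 0;
	cmb();	/* the swap below provides the CPU mb() */
	predecessor = mcs_qnode_swap(&lock->lock, qnode);
	if (predecessor) {
		qnode->locked = 1;
		wmb();	/* order the locked write before linking in */
		predecessor->next = qnode;
		while (qnode->locked)
			cpu_relax();
	}
}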
void* atomicExchange(void* volatile& val, void* exch)
{
	return atomic_swap_ptr(&val, exch);
}
/*
 * Schedule a CPU.  This optimizes for the case where we schedule
 * the same thread often, and we have nCPU >= nFrequently-Running-Thread
 * (where CPU is virtual rump cpu, not host CPU).
 */
void
rump_schedule_cpu_interlock(struct lwp *l, void *interlock)
{
	struct rumpcpu *rcpu;
	struct cpu_info *ci;
	void *old;
	bool domigrate;
	bool bound = l->l_pflag & LP_BOUND;

	l->l_stat = LSRUN;

	/*
	 * First, try fastpath: if we were the previous user of the
	 * CPU, everything is in order cachewise and we can just
	 * proceed to use it.
	 *
	 * If we are a different thread (i.e. CAS fails), we must go
	 * through a memory barrier to ensure we get a truthful
	 * view of the world.
	 */

	KASSERT(l->l_target_cpu != NULL);
	rcpu = cpuinfo_to_rumpcpu(l->l_target_cpu);
	if (atomic_cas_ptr(&rcpu->rcpu_prevlwp, l, RCPULWP_BUSY) == l) {
		if (interlock == rcpu->rcpu_mtx)
			rumpuser_mutex_exit(rcpu->rcpu_mtx);
		SCHED_FASTPATH(rcpu);
		/* jones, you're the man */
		goto fastlane;
	}

	/*
	 * Else, it's the slowpath for us.  First, determine if we
	 * can migrate.
	 */
	if (ncpu == 1)
		domigrate = false;
	else
		domigrate = true;

	/* Take lock.  This acts as a load barrier too. */
	if (interlock != rcpu->rcpu_mtx)
		rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);

	for (;;) {
		SCHED_SLOWPATH(rcpu);
		old = atomic_swap_ptr(&rcpu->rcpu_prevlwp, RCPULWP_WANTED);

		/* CPU is free? */
		if (old != RCPULWP_BUSY && old != RCPULWP_WANTED) {
			if (atomic_cas_ptr(&rcpu->rcpu_prevlwp,
			    RCPULWP_WANTED, RCPULWP_BUSY) == RCPULWP_WANTED) {
				break;
			}
		}

		/*
		 * Do we want to migrate once?
		 * This may need a slightly better algorithm, or we
		 * might cache pingpong eternally for non-frequent
		 * threads.
		 */
		if (domigrate && !bound) {
			domigrate = false;
			SCHED_MIGRATED(rcpu);
			rumpuser_mutex_exit(rcpu->rcpu_mtx);
			rcpu = getnextcpu();
			rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
			continue;
		}

		/* Want CPU, wait until it's released and retry. */
		rcpu->rcpu_wanted++;
		rumpuser_cv_wait_nowrap(rcpu->rcpu_cv, rcpu->rcpu_mtx);
		rcpu->rcpu_wanted--;
	}
	rumpuser_mutex_exit(rcpu->rcpu_mtx);

 fastlane:
	ci = rcpu->rcpu_ci;
	l->l_cpu = l->l_target_cpu = ci;
	l->l_mutex = rcpu->rcpu_ci->ci_schedstate.spc_mutex;
	l->l_ncsw++;
	l->l_stat = LSONPROC;

	/*
	 * No interrupts, so ci_curlwp === cpu_onproc.
	 * Okay, we could make an attempt to not set cpu_onproc
	 * in the case that an interrupt is scheduled immediately
	 * after a user proc, but leave that for later.
	 */
	ci->ci_curlwp = ci->ci_data.cpu_onproc = l;
}