/*
 * Routine:	lck_mtx_sleep_deadline
 */
wait_result_t
lck_mtx_sleep_deadline(
	lck_mtx_t		*lck,
	lck_sleep_action_t	lck_sleep_action,
	event_t			event,
	wait_interrupt_t	interruptible,
	uint64_t		deadline)
{
	wait_result_t	res;

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_SLEEP_DEADLINE_CODE) | DBG_FUNC_START,
		     (int)lck, (int)lck_sleep_action, (int)event, (int)interruptible, 0);

	if ((lck_sleep_action & ~LCK_SLEEP_MASK) != 0)
		panic("Invalid lock sleep action %x\n", lck_sleep_action);

	res = assert_wait_deadline(event, interruptible, deadline);
	if (res == THREAD_WAITING) {
		lck_mtx_unlock(lck);
		res = thread_block(THREAD_CONTINUE_NULL);
		if (!(lck_sleep_action & LCK_SLEEP_UNLOCK))
			lck_mtx_lock(lck);
	}
	else
	if (lck_sleep_action & LCK_SLEEP_UNLOCK)
		lck_mtx_unlock(lck);

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_SLEEP_DEADLINE_CODE) | DBG_FUNC_END,
		     (int)res, 0, 0, 0, 0);

	return res;
}
/* * Routine: lck_mtx_unlock_wakeup * * Invoked on unlock when there is contention. * * Called with the interlock locked. */ void lck_mtx_unlock_wakeup ( lck_mtx_t *lck, thread_t holder) { thread_t thread = current_thread(); lck_mtx_t *mutex; if (lck->lck_mtx_tag != LCK_MTX_TAG_INDIRECT) mutex = lck; else mutex = &lck->lck_mtx_ptr->lck_mtx; if (thread != holder) panic("lck_mtx_unlock_wakeup: mutex %p holder %p\n", mutex, holder); KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_START, (int)lck, (int)holder, 0, 0, 0); assert(mutex->lck_mtx_waiters > 0); thread_wakeup_one((event_t)(((unsigned int*)lck)+(sizeof(lck_mtx_t)-1)/sizeof(unsigned int))); if (thread->promotions > 0) { spl_t s = splsched(); thread_lock(thread); if ( --thread->promotions == 0 && (thread->sched_flags & TH_SFLAG_PROMOTED) ) { thread->sched_flags &= ~TH_SFLAG_PROMOTED; if (thread->sched_flags & TH_SFLAG_RW_PROMOTED) { /* Thread still has a RW lock promotion */ } else if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) { KERNEL_DEBUG_CONSTANT( MACHDBG_CODE(DBG_MACH_SCHED,MACH_DEMOTE) | DBG_FUNC_NONE, thread->sched_pri, DEPRESSPRI, 0, lck, 0); set_sched_pri(thread, DEPRESSPRI); } else { if (thread->priority < thread->sched_pri) { KERNEL_DEBUG_CONSTANT( MACHDBG_CODE(DBG_MACH_SCHED,MACH_DEMOTE) | DBG_FUNC_NONE, thread->sched_pri, thread->priority, 0, lck, 0); } SCHED(compute_priority)(thread, FALSE); } } thread_unlock(thread); splx(s); } KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_END, 0, 0, 0, 0, 0); }
/* * Routine: lck_mtx_sleep */ wait_result_t lck_mtx_sleep( lck_mtx_t *lck, lck_sleep_action_t lck_sleep_action, event_t event, wait_interrupt_t interruptible) { wait_result_t res; thread_t thread = current_thread(); KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_SLEEP_CODE) | DBG_FUNC_START, VM_KERNEL_UNSLIDE_OR_PERM(lck), (int)lck_sleep_action, VM_KERNEL_UNSLIDE_OR_PERM(event), (int)interruptible, 0); if ((lck_sleep_action & ~LCK_SLEEP_MASK) != 0) panic("Invalid lock sleep action %x\n", lck_sleep_action); if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) { /* * We overload the RW lock promotion to give us a priority ceiling * during the time that this thread is asleep, so that when it * is re-awakened (and not yet contending on the mutex), it is * runnable at a reasonably high priority. */ thread->rwlock_count++; } res = assert_wait(event, interruptible); if (res == THREAD_WAITING) { lck_mtx_unlock(lck); res = thread_block(THREAD_CONTINUE_NULL); if (!(lck_sleep_action & LCK_SLEEP_UNLOCK)) { if ((lck_sleep_action & LCK_SLEEP_SPIN)) lck_mtx_lock_spin(lck); else if ((lck_sleep_action & LCK_SLEEP_SPIN_ALWAYS)) lck_mtx_lock_spin_always(lck); else lck_mtx_lock(lck); } } else if (lck_sleep_action & LCK_SLEEP_UNLOCK) lck_mtx_unlock(lck); if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) { if ((thread->rwlock_count-- == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) { /* sched_flags checked without lock, but will be rechecked while clearing */ lck_rw_clear_promotion(thread); } } KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_SLEEP_CODE) | DBG_FUNC_END, (int)res, 0, 0, 0, 0); return res; }
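/*
 * Illustrative sketch (not part of the original source): a typical caller of
 * lck_mtx_sleep() re-checks its predicate in a loop, using the address of the
 * condition as the wait event.  example_mtx is assumed to have been allocated
 * with lck_mtx_alloc_init() at init time; the predicate and function names are
 * hypothetical, only the lck_* and thread_wakeup() calls are real interfaces.
 */
static lck_mtx_t	*example_mtx;		/* assumed set up at init time */
static int		example_data_ready;	/* hypothetical predicate */

static void
example_wait_for_data(void)
{
	lck_mtx_lock(example_mtx);
	while (example_data_ready == 0) {
		/* drops and re-takes example_mtx around the block */
		(void) lck_mtx_sleep(example_mtx, LCK_SLEEP_DEFAULT,
		    (event_t)&example_data_ready, THREAD_UNINT);
	}
	example_data_ready = 0;
	lck_mtx_unlock(example_mtx);
}

static void
example_post_data(void)
{
	lck_mtx_lock(example_mtx);
	example_data_ready = 1;
	lck_mtx_unlock(example_mtx);
	thread_wakeup((event_t)&example_data_ready);
}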
/* * Routine: lck_mtx_sleep_deadline */ wait_result_t lck_mtx_sleep_deadline( lck_mtx_t *lck, lck_sleep_action_t lck_sleep_action, event_t event, wait_interrupt_t interruptible, uint64_t deadline) { wait_result_t res; thread_t thread = current_thread(); KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_SLEEP_DEADLINE_CODE) | DBG_FUNC_START, VM_KERNEL_UNSLIDE_OR_PERM(lck), (int)lck_sleep_action, VM_KERNEL_UNSLIDE_OR_PERM(event), (int)interruptible, 0); if ((lck_sleep_action & ~LCK_SLEEP_MASK) != 0) panic("Invalid lock sleep action %x\n", lck_sleep_action); if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) { /* * See lck_mtx_sleep(). */ thread->rwlock_count++; } res = assert_wait_deadline(event, interruptible, deadline); if (res == THREAD_WAITING) { lck_mtx_unlock(lck); res = thread_block(THREAD_CONTINUE_NULL); if (!(lck_sleep_action & LCK_SLEEP_UNLOCK)) { if ((lck_sleep_action & LCK_SLEEP_SPIN)) lck_mtx_lock_spin(lck); else lck_mtx_lock(lck); } } else if (lck_sleep_action & LCK_SLEEP_UNLOCK) lck_mtx_unlock(lck); if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) { if ((thread->rwlock_count-- == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) { /* sched_flags checked without lock, but will be rechecked while clearing */ lck_rw_clear_promotion(thread); } } KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_SLEEP_DEADLINE_CODE) | DBG_FUNC_END, (int)res, 0, 0, 0, 0); return res; }
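/*
 * Illustrative sketch (not in the original source): waiting with a timeout.
 * The deadline argument is an absolute mach time; clock_interval_to_deadline()
 * is the usual way callers build one.  Names prefixed example_ are
 * hypothetical.
 */
static wait_result_t
example_wait_with_timeout(lck_mtx_t *mtx, int *flag, uint32_t timeout_ms)
{
	uint64_t	deadline;
	wait_result_t	res = THREAD_AWAKENED;

	clock_interval_to_deadline(timeout_ms, NSEC_PER_MSEC, &deadline);

	lck_mtx_lock(mtx);
	while (*flag == 0 && res != THREAD_TIMED_OUT) {
		res = lck_mtx_sleep_deadline(mtx, LCK_SLEEP_DEFAULT,
		    (event_t)flag, THREAD_UNINT, deadline);
	}
	lck_mtx_unlock(mtx);

	return res;
}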
boolean_t
thread_funnel_set(
	funnel_t	*fnl,
	boolean_t	funneled)
{
	thread_t	cur_thread;
	boolean_t	funnel_state_prev;
	boolean_t	intr;

	cur_thread = current_thread();
	funnel_state_prev = ((cur_thread->funnel_state & TH_FN_OWNED) == TH_FN_OWNED);

	if (funnel_state_prev != funneled) {
		intr = ml_set_interrupts_enabled(FALSE);

		if (funneled == TRUE) {
			if (cur_thread->funnel_lock)
				panic("Funnel lock called when holding one %p", cur_thread->funnel_lock);
			KERNEL_DEBUG(0x6032428 | DBG_FUNC_NONE, fnl, 1, 0, 0, 0);
			funnel_lock(fnl);
			KERNEL_DEBUG(0x6032434 | DBG_FUNC_NONE, fnl, 1, 0, 0, 0);
			cur_thread->funnel_state |= TH_FN_OWNED;
			cur_thread->funnel_lock = fnl;
		} else {
			if (cur_thread->funnel_lock->fnl_mutex != fnl->fnl_mutex)
				panic("Funnel unlock when not holding funnel");
			cur_thread->funnel_state &= ~TH_FN_OWNED;
			KERNEL_DEBUG(0x603242c | DBG_FUNC_NONE, fnl, 1, 0, 0, 0);
			cur_thread->funnel_lock = THR_FUNNEL_NULL;
			funnel_unlock(fnl);
		}
		(void)ml_set_interrupts_enabled(intr);
	} else {
		/*
		 * If we are trying to acquire the funnel recursively,
		 * check that the funnel we already hold is the same one.
		 */
		if (funneled && (fnl->fnl_mutex != cur_thread->funnel_lock->fnl_mutex)) {
			panic("thread_funnel_set: already holding a different funnel");
		}
	}

	return (funnel_state_prev);
}
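/*
 * Illustrative sketch (not in the original source): the historical
 * save/restore pattern used by funnel clients.  kernel_flock is the global
 * kernel funnel in older XNU releases; the work done while funneled here is
 * hypothetical.
 */
static void
example_do_funneled_work(void)
{
	boolean_t	funnel_state;

	funnel_state = thread_funnel_set(kernel_flock, TRUE);	/* take the funnel */

	/* ... code that requires the funnel ... */

	(void) thread_funnel_set(kernel_flock, funnel_state);	/* restore prior state */
}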
/*
 * Routine:	lck_mtx_lock_spinwait
 *
 * Invoked trying to acquire a mutex when there is contention but
 * the holder is running on another processor.  We spin for up to a
 * maximum time waiting for the lock to be released.
 *
 * Called with the interlock unlocked.
 */
void
lck_mtx_lock_spinwait(
	lck_mtx_t		*lck)
{
	thread_t		holder;
	volatile lck_mtx_t	*mutex;
	uint64_t		deadline;

	if (lck->lck_mtx_tag != LCK_MTX_TAG_INDIRECT)
		mutex = lck;
	else
		mutex = &lck->lck_mtx_ptr->lck_mtx;

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN) | DBG_FUNC_NONE,
		     (int)lck, (int)mutex->lck_mtx_locked, 0, 0, 0);

	deadline = mach_absolute_time() + MutexSpin;
	/*
	 * Spin while:
	 *   - mutex is locked, and
	 *   - it's locked as a spin lock, or
	 *   - owner is running on another processor, and
	 *   - owner (processor) is not idling, and
	 *   - we haven't spun for long enough.
	 */
	while ((holder = (thread_t) mutex->lck_mtx_locked) != NULL) {
		if ((holder == (thread_t)MUTEX_LOCKED_AS_SPIN) ||
		    ((holder->machine.specFlags & OnProc) != 0 &&
		     (holder->state & TH_IDLE) == 0 &&
		     mach_absolute_time() < deadline)) {
			cpu_pause();
			continue;
		}
		break;
	}
#if	CONFIG_DTRACE
	/*
	 * We've already kept a count via deadline of how long we spun.
	 * If dtrace is active, then we compute backwards to decide how
	 * long we spun.
	 *
	 * Note that we record a different probe id depending on whether
	 * this is a direct or indirect mutex.  This allows us to
	 * penalize only lock groups that have debug/stats enabled
	 * with dtrace processing if desired.
	 */
	if (lck->lck_mtx_tag != LCK_MTX_TAG_INDIRECT) {
		LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, lck,
		    mach_absolute_time() - (deadline - MutexSpin));
	} else {
		LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, lck,
		    mach_absolute_time() - (deadline - MutexSpin));
	}
	/* The lockstat acquire event is recorded by the assembly code beneath us. */
#endif
}
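/*
 * Illustrative sketch (not in the original source) of the bounded-spin idiom
 * used above: spin on a condition only until an absolute-time deadline, then
 * give up so the caller can fall back to blocking.  example_condition_true()
 * and the max_spin_abs parameter are hypothetical.
 */
static boolean_t
example_spin_until(boolean_t (*example_condition_true)(void), uint64_t max_spin_abs)
{
	uint64_t	deadline = mach_absolute_time() + max_spin_abs;

	while (!example_condition_true()) {
		if (mach_absolute_time() >= deadline)
			return FALSE;	/* spun long enough: caller should block instead */
		cpu_pause();		/* be polite to the other logical CPU while spinning */
	}
	return TRUE;
}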
/* * Routine: lck_rw_lock_exclusive_to_shared */ void lck_rw_lock_exclusive_to_shared( lck_rw_t *lck) { boolean_t wakeup_readers = FALSE; boolean_t wakeup_writers = FALSE; boolean_t istate; KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START, (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0); istate = lck_interlock_lock(lck); lck->lck_rw_shared_count++; if (lck->lck_rw_want_upgrade) lck->lck_rw_want_upgrade = FALSE; else lck->lck_rw_want_write = FALSE; if (lck->lck_w_waiting) { lck->lck_w_waiting = FALSE; wakeup_writers = TRUE; } if (!(lck->lck_rw_priv_excl && wakeup_writers == TRUE) && lck->lck_r_waiting) { lck->lck_r_waiting = FALSE; wakeup_readers = TRUE; } lck_interlock_unlock(lck, istate); if (wakeup_readers) thread_wakeup(RW_LOCK_READER_EVENT(lck)); if (wakeup_writers) thread_wakeup(RW_LOCK_WRITER_EVENT(lck)); KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END, (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0); #if CONFIG_DTRACE LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0); #endif }
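/*
 * Illustrative sketch (not in the original source): a writer that publishes an
 * update and then downgrades to shared so it can keep reading without blocking
 * other readers.  The mutation and scan steps are hypothetical; the lck_rw_*
 * calls are real interfaces.
 */
static void
example_insert_then_scan(lck_rw_t *rw)
{
	lck_rw_lock_exclusive(rw);
	/* ... mutate the protected structure ... */

	lck_rw_lock_exclusive_to_shared(rw);	/* now holding the lock shared */
	/* ... read-only scan alongside other readers ... */

	lck_rw_unlock_shared(rw);
}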
void lck_rw_lock_shared_gen(lck_rw_t *lck) { int i; wait_result_t res; lck_rw_ilk_lock(lck); while ((lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) && ((lck->lck_rw_shared_count == 0) || (lck->lck_rw_priv_excl))) { i = lock_wait_time[1]; KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_CODE) | DBG_FUNC_START, (int)lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, i, 0); if (i != 0) { lck_rw_ilk_unlock(lck); while (--i != 0 && (lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) && ((lck->lck_rw_shared_count == 0) || (lck->lck_rw_priv_excl))) continue; lck_rw_ilk_lock(lck); } if ((lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) && ((lck->lck_rw_shared_count == 0) || (lck->lck_rw_priv_excl))) { lck->lck_rw_waiting = TRUE; res = assert_wait((event_t)(((unsigned int*)lck)+((sizeof(lck_rw_t)-1)/sizeof(unsigned int))), THREAD_UNINT); if (res == THREAD_WAITING) { lck_rw_ilk_unlock(lck); res = thread_block(THREAD_CONTINUE_NULL); lck_rw_ilk_lock(lck); } } KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_CODE) | DBG_FUNC_END, (int)lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, res, 0); } lck->lck_rw_shared_count++; lck_rw_ilk_unlock(lck); }
void throttle_lowpri_io(boolean_t ok_to_sleep) { int i; int max_try_num; struct uthread *ut; struct _throttle_io_info_t *info; ut = get_bsdthread_info(current_thread()); if ((ut->uu_lowpri_window == 0) || (ut->uu_throttle_info == NULL)) goto done; info = ut->uu_throttle_info; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START, ut->uu_lowpri_window, ok_to_sleep, 0, 0, 0); if (ok_to_sleep == TRUE) { max_try_num = lowpri_max_waiting_msecs / LOWPRI_SLEEP_INTERVAL * MAX(1, info->numthreads_throttling); for (i=0; i<max_try_num; i++) { if (throttle_io_will_be_throttled_internal(ut->uu_lowpri_window, info)) { IOSleep(LOWPRI_SLEEP_INTERVAL); DEBUG_ALLOC_THROTTLE_INFO("sleeping because of info = %p\n", info, info ); } else { break; } } } KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END, ut->uu_lowpri_window, i*5, 0, 0, 0); SInt32 oldValue; oldValue = OSDecrementAtomic(&info->numthreads_throttling); if (oldValue <= 0) { panic("%s: numthreads negative", __func__); } done: ut->uu_lowpri_window = 0; if (ut->uu_throttle_info) throttle_info_rel(ut->uu_throttle_info); ut->uu_throttle_info = NULL; }
/* * Routine: lck_mtx_unlock_wakeup * * Invoked on unlock when there is contention. * * Called with the interlock locked. */ void lck_mtx_unlock_wakeup ( lck_mtx_t *lck, thread_t holder) { thread_t thread = current_thread(); lck_mtx_t *mutex; __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck); if (lck->lck_mtx_tag != LCK_MTX_TAG_INDIRECT) mutex = lck; else mutex = &lck->lck_mtx_ptr->lck_mtx; if (thread != holder) panic("lck_mtx_unlock_wakeup: mutex %p holder %p\n", mutex, holder); KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_START, trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(holder), 0, 0, 0); assert(mutex->lck_mtx_waiters > 0); if (mutex->lck_mtx_waiters > 1) thread_wakeup_one_with_pri(LCK_MTX_EVENT(lck), lck->lck_mtx_pri); else thread_wakeup_one(LCK_MTX_EVENT(lck)); if (thread->promotions > 0) { spl_t s = splsched(); thread_lock(thread); if (--thread->promotions == 0 && (thread->sched_flags & TH_SFLAG_PROMOTED)) lck_mtx_clear_promoted(thread, trace_lck); thread_unlock(thread); splx(s); } KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_END, 0, 0, 0, 0, 0); }
void
lck_mtx_unlockspin_wakeup (
	lck_mtx_t			*lck)
{
	assert(lck->lck_mtx_waiters > 0);
	thread_wakeup_one((event_t)(((unsigned int*)lck)+(sizeof(lck_mtx_t)-1)/sizeof(unsigned int)));

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_NONE, (int)lck, 0, 0, 1, 0);
#if CONFIG_DTRACE
	/*
	 * When there are waiters, we skip the hot-patch spot in the
	 * fastpath, so we record it here.
	 */
	LOCKSTAT_RECORD(LS_LCK_MTX_UNLOCK_RELEASE, lck, 0);
#endif
}
void
lck_mtx_unlockspin_wakeup (
	lck_mtx_t			*lck)
{
	assert(lck->lck_mtx_waiters > 0);
	thread_wakeup_one(LCK_MTX_EVENT(lck));

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_NONE,
		     VM_KERNEL_UNSLIDE_OR_PERM(lck), 0, 0, 1, 0);
#if CONFIG_DTRACE
	/*
	 * When there are waiters, we skip the hot-patch spot in the
	 * fastpath, so we record it here.
	 */
	LOCKSTAT_RECORD(LS_LCK_MTX_UNLOCK_RELEASE, lck, 0);
#endif
}
static int
pmThreadGetUrgency(uint64_t *rt_period, uint64_t *rt_deadline)
{
	int		urgency;
	uint64_t	arg1, arg2;

	urgency = thread_get_urgency(current_processor()->next_thread, &arg1, &arg2);

	if (urgency == THREAD_URGENCY_REAL_TIME) {
		if (rt_period != NULL)
			*rt_period = arg1;
		if (rt_deadline != NULL)
			*rt_deadline = arg2;
	}

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_GET_URGENCY), urgency, arg1, arg2, 0, 0);

	return(urgency);
}
/* * Routine: lck_mtx_lock_wait * * Invoked in order to wait on contention. * * Called with the interlock locked and * returns it unlocked. */ void lck_mtx_lock_wait ( lck_mtx_t *lck, thread_t holder) { thread_t self = current_thread(); lck_mtx_t *mutex; integer_t priority; spl_t s = splsched(); #if CONFIG_DTRACE uint64_t sleep_start = 0; if (lockstat_probemap[LS_LCK_MTX_LOCK_BLOCK] || lockstat_probemap[LS_LCK_MTX_EXT_LOCK_BLOCK]) { sleep_start = mach_absolute_time(); } #endif if (lck->lck_mtx_tag != LCK_MTX_TAG_INDIRECT) mutex = lck; else mutex = &lck->lck_mtx_ptr->lck_mtx; KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START, (int)lck, (int)holder, 0, 0, 0); priority = self->sched_pri; if (priority < self->priority) priority = self->priority; if (priority < BASEPRI_DEFAULT) priority = BASEPRI_DEFAULT; thread_lock(holder); if (mutex->lck_mtx_pri == 0) holder->promotions++; holder->sched_mode |= TH_MODE_PROMOTED; if ( mutex->lck_mtx_pri < priority && holder->sched_pri < priority ) { KERNEL_DEBUG_CONSTANT( MACHDBG_CODE(DBG_MACH_SCHED,MACH_PROMOTE) | DBG_FUNC_NONE, holder->sched_pri, priority, (int)holder, (int)lck, 0); set_sched_pri(holder, priority); } thread_unlock(holder); splx(s); if (mutex->lck_mtx_pri < priority) mutex->lck_mtx_pri = priority; if (self->pending_promoter[self->pending_promoter_index] == NULL) { self->pending_promoter[self->pending_promoter_index] = mutex; mutex->lck_mtx_waiters++; } else if (self->pending_promoter[self->pending_promoter_index] != mutex) { self->pending_promoter[++self->pending_promoter_index] = mutex; mutex->lck_mtx_waiters++; } assert_wait((event_t)(((unsigned int*)lck)+((sizeof(lck_mtx_t)-1)/sizeof(unsigned int))), THREAD_UNINT); lck_mtx_ilk_unlock(mutex); thread_block(THREAD_CONTINUE_NULL); KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END, 0, 0, 0, 0, 0); #if CONFIG_DTRACE /* * Record the Dtrace lockstat probe for blocking, block time * measured from when we were entered. */ if (sleep_start) { if (lck->lck_mtx_tag != LCK_MTX_TAG_INDIRECT) { LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_BLOCK, lck, mach_absolute_time() - sleep_start); } else { LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_BLOCK, lck, mach_absolute_time() - sleep_start); } } #endif }
boolean_t lck_rw_lock_shared_to_exclusive( lck_rw_t *lck) { int i; boolean_t do_wakeup = FALSE; wait_result_t res; #if MACH_LDEBUG int decrementer; #endif /* MACH_LDEBUG */ boolean_t istate; #if CONFIG_DTRACE uint64_t wait_interval = 0; int slept = 0; int readers_at_sleep = 0; #endif istate = lck_interlock_lock(lck); lck->lck_rw_shared_count--; if (lck->lck_rw_want_upgrade) { KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_START, (int)lck, lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0); /* * Someone else has requested upgrade. * Since we've released a read lock, wake * him up. */ if (lck->lck_w_waiting && (lck->lck_rw_shared_count == 0)) { lck->lck_w_waiting = FALSE; do_wakeup = TRUE; } lck_interlock_unlock(lck, istate); if (do_wakeup) thread_wakeup(RW_LOCK_WRITER_EVENT(lck)); KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_END, (int)lck, lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0); return (FALSE); } lck->lck_rw_want_upgrade = TRUE; #if MACH_LDEBUG decrementer = DECREMENTER_TIMEOUT; #endif /* MACH_LDEBUG */ while (lck->lck_rw_shared_count != 0) { #if CONFIG_DTRACE if (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] && wait_interval == 0) { wait_interval = mach_absolute_time(); readers_at_sleep = lck->lck_rw_shared_count; } else { wait_interval = -1; } #endif i = lock_wait_time[lck->lck_rw_can_sleep ? 1 : 0]; KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX1_CODE) | DBG_FUNC_START, (int)lck, lck->lck_rw_shared_count, i, 0, 0); if (i != 0) { lck_interlock_unlock(lck, istate); #if MACH_LDEBUG if (!--decrementer) Debugger("timeout - lck_rw_shared_count"); #endif /* MACH_LDEBUG */ while (--i != 0 && lck->lck_rw_shared_count != 0) lck_rw_lock_pause(istate); istate = lck_interlock_lock(lck); } if (lck->lck_rw_can_sleep && lck->lck_rw_shared_count != 0) { lck->lck_w_waiting = TRUE; res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT); if (res == THREAD_WAITING) { lck_interlock_unlock(lck, istate); res = thread_block(THREAD_CONTINUE_NULL); #if CONFIG_DTRACE slept = 1; #endif istate = lck_interlock_lock(lck); } } KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX1_CODE) | DBG_FUNC_END, (int)lck, lck->lck_rw_shared_count, 0, 0, 0); } lck_interlock_unlock(lck, istate); #if CONFIG_DTRACE /* * We infer whether we took the sleep/spin path above by checking readers_at_sleep. */ if (wait_interval != 0 && wait_interval != (unsigned) -1 && readers_at_sleep) { if (slept == 0) { LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lck, mach_absolute_time() - wait_interval, 0); } else { LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lck, mach_absolute_time() - wait_interval, 1, (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep); } } LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lck, 1); #endif return (TRUE); }
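/*
 * Illustrative sketch (not in the original source): callers of
 * lck_rw_lock_shared_to_exclusive() must handle the FALSE case, where the
 * lock has been dropped entirely and any state observed under the shared hold
 * may be stale.  The modification step is hypothetical.
 */
static void
example_upgrade_and_modify(lck_rw_t *rw)
{
	lck_rw_lock_shared(rw);

	if (!lck_rw_lock_shared_to_exclusive(rw)) {
		/* upgrade failed: the lock was released, start over exclusively */
		lck_rw_lock_exclusive(rw);
		/* re-validate anything decided while the lock was held shared */
	}

	/* ... modify the protected structure ... */
	lck_rw_unlock_exclusive(rw);
}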
/* * Routine: lck_rw_lock_shared_gen */ void lck_rw_lock_shared_gen( lck_rw_t *lck) { int i; wait_result_t res; #if MACH_LDEBUG int decrementer; #endif /* MACH_LDEBUG */ boolean_t istate; #if CONFIG_DTRACE uint64_t wait_interval = 0; int slept = 0; int readers_at_sleep; #endif istate = lck_interlock_lock(lck); #if CONFIG_DTRACE readers_at_sleep = lck->lck_rw_shared_count; #endif #if MACH_LDEBUG decrementer = DECREMENTER_TIMEOUT; #endif /* MACH_LDEBUG */ while ((lck->lck_rw_want_write || lck->lck_rw_want_upgrade) && ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) { i = lock_wait_time[lck->lck_rw_can_sleep ? 1 : 0]; KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_CODE) | DBG_FUNC_START, (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, i, 0); #if CONFIG_DTRACE if ((lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] || lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK]) && wait_interval == 0) { wait_interval = mach_absolute_time(); } else { wait_interval = -1; } #endif if (i != 0) { lck_interlock_unlock(lck, istate); #if MACH_LDEBUG if (!--decrementer) Debugger("timeout - wait no writers"); #endif /* MACH_LDEBUG */ while (--i != 0 && (lck->lck_rw_want_write || lck->lck_rw_want_upgrade) && ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) lck_rw_lock_pause(istate); istate = lck_interlock_lock(lck); } if (lck->lck_rw_can_sleep && (lck->lck_rw_want_write || lck->lck_rw_want_upgrade) && ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) { lck->lck_r_waiting = TRUE; res = assert_wait(RW_LOCK_READER_EVENT(lck), THREAD_UNINT); if (res == THREAD_WAITING) { lck_interlock_unlock(lck, istate); res = thread_block(THREAD_CONTINUE_NULL); #if CONFIG_DTRACE slept = 1; #endif istate = lck_interlock_lock(lck); } } KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_CODE) | DBG_FUNC_END, (int)lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, res, 0); } lck->lck_rw_shared_count++; lck_interlock_unlock(lck, istate); #if CONFIG_DTRACE if (wait_interval != 0 && wait_interval != (unsigned) -1) { if (slept == 0) { LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0); } else { LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_BLOCK, lck, mach_absolute_time() - wait_interval, 0, (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep); } } LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0); #endif }
u_int16_t inet_cksum(struct mbuf *m, unsigned int nxt, unsigned int skip, unsigned int len) { u_short *w; u_int32_t sum = 0; int mlen = 0; int byte_swapped = 0; union s_util s_util; union l_util l_util; KERNEL_DEBUG(DBG_FNC_IN_CKSUM | DBG_FUNC_START, len,0,0,0,0); /* sanity check */ if ((m->m_flags & M_PKTHDR) && m->m_pkthdr.len < skip + len) { panic("inet_cksum: mbuf len (%d) < off+len (%d+%d)\n", m->m_pkthdr.len, skip, len); } /* include pseudo header checksum? */ if (nxt != 0) { struct ip *iph; if (m->m_len < sizeof (struct ip)) panic("inet_cksum: bad mbuf chain"); iph = mtod(m, struct ip *); sum = in_pseudo(iph->ip_src.s_addr, iph->ip_dst.s_addr, htonl(len + nxt)); } if (skip != 0) { for (; skip && m; m = m->m_next) { if (m->m_len > skip) { mlen = m->m_len - skip; w = (u_short *)(m->m_data+skip); goto skip_start; } else { skip -= m->m_len; } } } for (;m && len; m = m->m_next) { if (m->m_len == 0) continue; w = mtod(m, u_short *); if (mlen == -1) { /* * The first byte of this mbuf is the continuation * of a word spanning between this mbuf and the * last mbuf. * * s_util.c[0] is already saved when scanning previous * mbuf. */ s_util.c[1] = *(char *)w; sum += s_util.s; w = (u_short *)((char *)w + 1); mlen = m->m_len - 1; len--; } else { mlen = m->m_len; } skip_start: if (len < mlen) mlen = len; len -= mlen; /* * Force to even boundary. */ if ((1 & (uintptr_t) w) && (mlen > 0)) { REDUCE; sum <<= 8; s_util.c[0] = *(u_char *)w; w = (u_short *)((char *)w + 1); mlen--; byte_swapped = 1; } /* * Unroll the loop to make overhead from * branches &c small. */ while ((mlen -= 32) >= 0) { sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3]; sum += w[4]; sum += w[5]; sum += w[6]; sum += w[7]; sum += w[8]; sum += w[9]; sum += w[10]; sum += w[11]; sum += w[12]; sum += w[13]; sum += w[14]; sum += w[15]; w += 16; } mlen += 32; while ((mlen -= 8) >= 0) { sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3]; w += 4; } mlen += 8; if (mlen == 0 && byte_swapped == 0) continue; REDUCE; while ((mlen -= 2) >= 0) { sum += *w++; } if (byte_swapped) { REDUCE; sum <<= 8; byte_swapped = 0; if (mlen == -1) { s_util.c[1] = *(char *)w; sum += s_util.s; mlen = 0; } else mlen = -1; } else if (mlen == -1) s_util.c[0] = *(char *)w; } if (len) printf("cksum: out of data by %d\n", len); if (mlen == -1) { /* The last mbuf has odd # of bytes. Follow the standard (the odd byte may be shifted left by 8 bits or not as determined by endian-ness of the machine) */ s_util.c[1] = 0; sum += s_util.s; } REDUCE; KERNEL_DEBUG(DBG_FNC_IN_CKSUM | DBG_FUNC_END, 0,0,0,0,0); return (~sum & 0xffff); }
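/*
 * Illustrative sketch (not in the original source): what the REDUCE step in
 * inet_cksum() accomplishes.  The running 32-bit sum is folded back into 16
 * bits, end-around carry included, before being complemented.  This helper is
 * a standalone restatement, not the kernel's macro.
 */
static u_int16_t
example_fold_cksum(u_int32_t sum)
{
	/* fold carries out of the low 16 bits back in (doing it twice suffices) */
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);
	return (u_int16_t)(~sum & 0xffff);
}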
boolean_t lck_rw_lock_shared_to_exclusive_gen(lck_rw_t *lck) { int i; boolean_t do_wakeup = FALSE; wait_result_t res; lck_rw_ilk_lock(lck); lck->lck_rw_shared_count--; if (lck->lck_rw_want_upgrade) { KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_START, (int)lck, lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0); /* * Someone else has requested upgrade. * Since we've released a read lock, wake * him up. */ if (lck->lck_rw_waiting && (lck->lck_rw_shared_count == 0)) { lck->lck_rw_waiting = FALSE; do_wakeup = TRUE; } lck_rw_ilk_unlock(lck); if (do_wakeup) thread_wakeup((event_t)(((unsigned int*)lck)+((sizeof(lck_rw_t)-1)/sizeof(unsigned int)))); KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_END, (int)lck, lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0); return (FALSE); } lck->lck_rw_want_upgrade = TRUE; while (lck->lck_rw_shared_count != 0) { i = lock_wait_time[1]; KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX1_CODE) | DBG_FUNC_START, (int)lck, lck->lck_rw_shared_count, i, 0, 0); if (i != 0) { lck_rw_ilk_unlock(lck); while (--i != 0 && lck->lck_rw_shared_count != 0) continue; lck_rw_ilk_lock(lck); } if (lck->lck_rw_shared_count != 0) { lck->lck_rw_waiting = TRUE; res = assert_wait((event_t)(((unsigned int*)lck)+((sizeof(lck_rw_t)-1)/sizeof(unsigned int))), THREAD_UNINT); if (res == THREAD_WAITING) { lck_rw_ilk_unlock(lck); res = thread_block(THREAD_CONTINUE_NULL); lck_rw_ilk_lock(lck); } } KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX1_CODE) | DBG_FUNC_END, (int)lck, lck->lck_rw_shared_count, 0, 0, 0); } lck_rw_ilk_unlock(lck); return (TRUE); }
void lck_rw_lock_exclusive_gen(lck_rw_t *lck) { int i; wait_result_t res; lck_rw_ilk_lock(lck); /* * Try to acquire the lck_rw_want_excl bit. */ while (lck->lck_rw_want_excl) { KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EXCLUSIVE_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0); i = lock_wait_time[1]; if (i != 0) { lck_rw_ilk_unlock(lck); while (--i != 0 && lck->lck_rw_want_excl) continue; lck_rw_ilk_lock(lck); } if (lck->lck_rw_want_excl) { lck->lck_rw_waiting = TRUE; res = assert_wait((event_t)(((unsigned int*)lck)+((sizeof(lck_rw_t)-1)/sizeof(unsigned int))), THREAD_UNINT); if (res == THREAD_WAITING) { lck_rw_ilk_unlock(lck); res = thread_block(THREAD_CONTINUE_NULL); lck_rw_ilk_lock(lck); } } KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EXCLUSIVE_CODE) | DBG_FUNC_END, (int)lck, res, 0, 0, 0); } lck->lck_rw_want_excl = TRUE; /* Wait for readers (and upgrades) to finish */ while ((lck->lck_rw_shared_count != 0) || lck->lck_rw_want_upgrade) { i = lock_wait_time[1]; KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EXCLUSIVE1_CODE) | DBG_FUNC_START, (int)lck, lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, i, 0); if (i != 0) { lck_rw_ilk_unlock(lck); while (--i != 0 && (lck->lck_rw_shared_count != 0 || lck->lck_rw_want_upgrade)) continue; lck_rw_ilk_lock(lck); } if (lck->lck_rw_shared_count != 0 || lck->lck_rw_want_upgrade) { lck->lck_rw_waiting = TRUE; res = assert_wait((event_t)(((unsigned int*)lck)+((sizeof(lck_rw_t)-1)/sizeof(unsigned int))), THREAD_UNINT); if (res == THREAD_WAITING) { lck_rw_ilk_unlock(lck); res = thread_block(THREAD_CONTINUE_NULL); lck_rw_ilk_lock(lck); } } KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EXCLUSIVE1_CODE) | DBG_FUNC_END, (int)lck, lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, res, 0); } lck_rw_ilk_unlock(lck); }
/* * Routine: lck_mtx_lock_wait * * Invoked in order to wait on contention. * * Called with the interlock locked and * returns it unlocked. */ void lck_mtx_lock_wait ( lck_mtx_t *lck, thread_t holder) { thread_t self = current_thread(); lck_mtx_t *mutex; __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck); __kdebug_only uintptr_t trace_holder = VM_KERNEL_UNSLIDE_OR_PERM(holder); integer_t priority; spl_t s = splsched(); #if CONFIG_DTRACE uint64_t sleep_start = 0; if (lockstat_probemap[LS_LCK_MTX_LOCK_BLOCK] || lockstat_probemap[LS_LCK_MTX_EXT_LOCK_BLOCK]) { sleep_start = mach_absolute_time(); } #endif if (lck->lck_mtx_tag != LCK_MTX_TAG_INDIRECT) mutex = lck; else mutex = &lck->lck_mtx_ptr->lck_mtx; KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START, trace_lck, trace_holder, 0, 0, 0); priority = self->sched_pri; if (priority < self->base_pri) priority = self->base_pri; if (priority < BASEPRI_DEFAULT) priority = BASEPRI_DEFAULT; /* Do not promote past promotion ceiling */ priority = MIN(priority, MAXPRI_PROMOTE); thread_lock(holder); if (mutex->lck_mtx_pri == 0) holder->promotions++; holder->sched_flags |= TH_SFLAG_PROMOTED; if (mutex->lck_mtx_pri < priority && holder->sched_pri < priority) { KERNEL_DEBUG_CONSTANT( MACHDBG_CODE(DBG_MACH_SCHED,MACH_PROMOTE) | DBG_FUNC_NONE, holder->sched_pri, priority, trace_holder, trace_lck, 0); set_sched_pri(holder, priority); } thread_unlock(holder); splx(s); if (mutex->lck_mtx_pri < priority) mutex->lck_mtx_pri = priority; if (self->pending_promoter[self->pending_promoter_index] == NULL) { self->pending_promoter[self->pending_promoter_index] = mutex; mutex->lck_mtx_waiters++; } else if (self->pending_promoter[self->pending_promoter_index] != mutex) { self->pending_promoter[++self->pending_promoter_index] = mutex; mutex->lck_mtx_waiters++; } assert_wait(LCK_MTX_EVENT(mutex), THREAD_UNINT); lck_mtx_ilk_unlock(mutex); thread_block(THREAD_CONTINUE_NULL); KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END, 0, 0, 0, 0, 0); #if CONFIG_DTRACE /* * Record the Dtrace lockstat probe for blocking, block time * measured from when we were entered. */ if (sleep_start) { if (lck->lck_mtx_tag != LCK_MTX_TAG_INDIRECT) { LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_BLOCK, lck, mach_absolute_time() - sleep_start); } else { LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_BLOCK, lck, mach_absolute_time() - sleep_start); } } #endif }
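/*
 * Illustrative sketch (not in the original source): the priority the waiter
 * hands to the holder above is its own effective priority, raised to at least
 * BASEPRI_DEFAULT and capped at MAXPRI_PROMOTE.  Restated as a helper for
 * clarity; it mirrors the computation in lck_mtx_lock_wait() rather than
 * being a separate kernel interface.
 */
static integer_t
example_promotion_priority(thread_t self)
{
	integer_t priority = self->sched_pri;

	if (priority < self->base_pri)
		priority = self->base_pri;
	if (priority < BASEPRI_DEFAULT)
		priority = BASEPRI_DEFAULT;

	return MIN(priority, MAXPRI_PROMOTE);	/* never promote past the ceiling */
}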
kern_return_t copypv(addr64_t src64, addr64_t snk64, unsigned int size, int which) { unsigned int lop, csize; int bothphys = 0; KERNEL_DEBUG(0xeff7004c | DBG_FUNC_START, (unsigned)src64, (unsigned)snk64, size, which, 0); if ((which & (cppvPsrc | cppvPsnk)) == 0 ) /* Make sure that only one is virtual */ panic("copypv: no more than 1 parameter may be virtual\n"); /* Not allowed */ if ((which & (cppvPsrc | cppvPsnk)) == (cppvPsrc | cppvPsnk)) bothphys = 1; /* both are physical */ while (size) { if (bothphys) { lop = (unsigned int)(PAGE_SIZE - (snk64 & (PAGE_SIZE - 1))); /* Assume sink smallest */ if (lop > (unsigned int)(PAGE_SIZE - (src64 & (PAGE_SIZE - 1)))) lop = (unsigned int)(PAGE_SIZE - (src64 & (PAGE_SIZE - 1))); /* No, source is smaller */ } else { /* * only need to compute the resid for the physical page * address... we don't care about where we start/finish in * the virtual since we just call the normal copyin/copyout */ if (which & cppvPsrc) lop = (unsigned int)(PAGE_SIZE - (src64 & (PAGE_SIZE - 1))); else lop = (unsigned int)(PAGE_SIZE - (snk64 & (PAGE_SIZE - 1))); } csize = size; /* Assume we can copy it all */ if (lop < size) csize = lop; /* Nope, we can't do it all */ #if 0 /* * flush_dcache64 is currently a nop on the i386... * it's used when copying to non-system memory such * as video capture cards... on PPC there was a need * to flush due to how we mapped this memory... not * sure if it's needed on i386. */ if (which & cppvFsrc) flush_dcache64(src64, csize, 1); /* If requested, flush source before move */ if (which & cppvFsnk) flush_dcache64(snk64, csize, 1); /* If requested, flush sink before move */ #endif if (bothphys) bcopy_phys(src64, snk64, csize); /* Do a physical copy, virtually */ else { if (copyio_phys(src64, snk64, csize, which)) return (KERN_FAILURE); } #if 0 if (which & cppvFsrc) flush_dcache64(src64, csize, 1); /* If requested, flush source after move */ if (which & cppvFsnk) flush_dcache64(snk64, csize, 1); /* If requested, flush sink after move */ #endif size -= csize; /* Calculate what is left */ snk64 += csize; /* Bump sink to next physical address */ src64 += csize; /* Bump source to next physical address */ } KERNEL_DEBUG(0xeff7004c | DBG_FUNC_END, (unsigned)src64, (unsigned)snk64, size, which, 0); return KERN_SUCCESS; }
static int copyio(int copy_type, user_addr_t user_addr, char *kernel_addr, vm_size_t nbytes, vm_size_t *lencopied, int use_kernel_map) { thread_t thread; pmap_t pmap; vm_size_t bytes_copied; int error = 0; boolean_t istate = FALSE; boolean_t recursive_CopyIOActive; #if KDEBUG int debug_type = 0xeff70010; debug_type += (copy_type << 2); #endif thread = current_thread(); KERNEL_DEBUG(debug_type | DBG_FUNC_START, (unsigned)(user_addr >> 32), (unsigned)user_addr, nbytes, thread->machine.copyio_state, 0); if (nbytes == 0) goto out; pmap = thread->map->pmap; if ((copy_type != COPYINPHYS) && (copy_type != COPYOUTPHYS) && ((vm_offset_t)kernel_addr < VM_MIN_KERNEL_AND_KEXT_ADDRESS)) { panic("Invalid copy parameter, copy type: %d, kernel address: %p", copy_type, kernel_addr); } /* Sanity and security check for addresses to/from a user */ if (((pmap != kernel_pmap) && (use_kernel_map == 0)) && ((nbytes && (user_addr+nbytes <= user_addr)) || ((user_addr + nbytes) > vm_map_max(thread->map)))) { error = EFAULT; goto out; } /* * If the no_shared_cr3 boot-arg is set (true), the kernel runs on * its own pmap and cr3 rather than the user's -- so that wild accesses * from kernel or kexts can be trapped. So, during copyin and copyout, * we need to switch back to the user's map/cr3. The thread is flagged * "CopyIOActive" at this time so that if the thread is pre-empted, * we will later restore the correct cr3. */ recursive_CopyIOActive = thread->machine.specFlags & CopyIOActive; thread->machine.specFlags |= CopyIOActive; user_access_enable(); if (no_shared_cr3) { istate = ml_set_interrupts_enabled(FALSE); if (get_cr3_base() != pmap->pm_cr3) set_cr3_raw(pmap->pm_cr3); } /* * Ensure that we're running on the target thread's cr3. */ if ((pmap != kernel_pmap) && !use_kernel_map && (get_cr3_base() != pmap->pm_cr3)) { panic("copyio(%d,%p,%p,%ld,%p,%d) cr3 is %p expects %p", copy_type, (void *)user_addr, kernel_addr, nbytes, lencopied, use_kernel_map, (void *) get_cr3_raw(), (void *) pmap->pm_cr3); } if (no_shared_cr3) (void) ml_set_interrupts_enabled(istate); KERNEL_DEBUG(0xeff70044 | DBG_FUNC_NONE, (unsigned)user_addr, (unsigned)kernel_addr, nbytes, 0, 0); switch (copy_type) { case COPYIN: error = _bcopy((const void *) user_addr, kernel_addr, nbytes); break; case COPYOUT: error = _bcopy(kernel_addr, (void *) user_addr, nbytes); break; case COPYINPHYS: error = _bcopy((const void *) user_addr, PHYSMAP_PTOV(kernel_addr), nbytes); break; case COPYOUTPHYS: error = _bcopy((const void *) PHYSMAP_PTOV(kernel_addr), (void *) user_addr, nbytes); break; case COPYINSTR: error = _bcopystr((const void *) user_addr, kernel_addr, (int) nbytes, &bytes_copied); /* * lencopied should be updated on success * or ENAMETOOLONG... but not EFAULT */ if (error != EFAULT) *lencopied = bytes_copied; if (error) { #if KDEBUG nbytes = *lencopied; #endif break; } if (*(kernel_addr + bytes_copied - 1) == 0) { /* * we found a NULL terminator... 
we're done */ #if KDEBUG nbytes = *lencopied; #endif break; } else { /* * no more room in the buffer and we haven't * yet come across a NULL terminator */ #if KDEBUG nbytes = *lencopied; #endif error = ENAMETOOLONG; break; } break; } user_access_disable(); if (!recursive_CopyIOActive) { thread->machine.specFlags &= ~CopyIOActive; } if (no_shared_cr3) { istate = ml_set_interrupts_enabled(FALSE); if (get_cr3_raw() != kernel_pmap->pm_cr3) set_cr3_raw(kernel_pmap->pm_cr3); (void) ml_set_interrupts_enabled(istate); } out: KERNEL_DEBUG(debug_type | DBG_FUNC_END, (unsigned)user_addr, (unsigned)kernel_addr, (unsigned)nbytes, error, 0); return (error); }
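/*
 * Illustrative sketch (not in the original source): copyio() is the backend
 * for the copyin()/copyout() family.  A typical in-kernel caller looks like
 * this; example_args_t and the surrounding call shape are hypothetical.
 */
typedef struct {
	int	example_op;
	int	example_flags;
} example_args_t;

static int
example_fetch_args(user_addr_t uaddr, example_args_t *kargs)
{
	int error;

	/* copy the user buffer into kernel space; returns EFAULT on a bad address */
	error = copyin(uaddr, kargs, sizeof (*kargs));
	if (error)
		return error;

	/* ... validate kargs before use ... */
	return 0;
}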
/* * Routine: lck_rw_lock_exclusive */ void lck_rw_lock_exclusive( lck_rw_t *lck) { int i; wait_result_t res; #if MACH_LDEBUG int decrementer; #endif /* MACH_LDEBUG */ boolean_t istate; #if CONFIG_DTRACE uint64_t wait_interval = 0; int slept = 0; int readers_at_sleep; #endif istate = lck_interlock_lock(lck); #if CONFIG_DTRACE readers_at_sleep = lck->lck_rw_shared_count; #endif #if MACH_LDEBUG decrementer = DECREMENTER_TIMEOUT; #endif /* MACH_LDEBUG */ /* * Try to acquire the lck_rw_want_write bit. */ while (lck->lck_rw_want_write) { KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EXCLUSIVE_CODE) | DBG_FUNC_START, (int)lck, 0, 0, 0, 0); /* * Either sleeping or spinning is happening, start * a timing of our delay interval now. */ #if CONFIG_DTRACE if ((lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] || lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK]) && wait_interval == 0) { wait_interval = mach_absolute_time(); } else { wait_interval = -1; } #endif i = lock_wait_time[lck->lck_rw_can_sleep ? 1 : 0]; if (i != 0) { lck_interlock_unlock(lck, istate); #if MACH_LDEBUG if (!--decrementer) Debugger("timeout - lck_rw_want_write"); #endif /* MACH_LDEBUG */ while (--i != 0 && lck->lck_rw_want_write) lck_rw_lock_pause(istate); istate = lck_interlock_lock(lck); } if (lck->lck_rw_can_sleep && lck->lck_rw_want_write) { lck->lck_w_waiting = TRUE; res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT); if (res == THREAD_WAITING) { lck_interlock_unlock(lck, istate); res = thread_block(THREAD_CONTINUE_NULL); #if CONFIG_DTRACE slept = 1; #endif istate = lck_interlock_lock(lck); } } KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EXCLUSIVE_CODE) | DBG_FUNC_END, (int)lck, res, 0, 0, 0); } lck->lck_rw_want_write = TRUE; /* Wait for readers (and upgrades) to finish */ #if MACH_LDEBUG decrementer = DECREMENTER_TIMEOUT; #endif /* MACH_LDEBUG */ while ((lck->lck_rw_shared_count != 0) || lck->lck_rw_want_upgrade) { i = lock_wait_time[lck->lck_rw_can_sleep ? 1 : 0]; KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EXCLUSIVE1_CODE) | DBG_FUNC_START, (int)lck, lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, i, 0); #if CONFIG_DTRACE /* * Either sleeping or spinning is happening, start * a timing of our delay interval now. If we set it * to -1 we don't have accurate data so we cannot later * decide to record a dtrace spin or sleep event. */ if ((lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] || lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK]) && wait_interval == 0) { wait_interval = mach_absolute_time(); } else { wait_interval = (unsigned) -1; } #endif if (i != 0) { lck_interlock_unlock(lck, istate); #if MACH_LDEBUG if (!--decrementer) Debugger("timeout - wait for readers"); #endif /* MACH_LDEBUG */ while (--i != 0 && (lck->lck_rw_shared_count != 0 || lck->lck_rw_want_upgrade)) lck_rw_lock_pause(istate); istate = lck_interlock_lock(lck); } if (lck->lck_rw_can_sleep && (lck->lck_rw_shared_count != 0 || lck->lck_rw_want_upgrade)) { lck->lck_w_waiting = TRUE; res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT); if (res == THREAD_WAITING) { lck_interlock_unlock(lck, istate); res = thread_block(THREAD_CONTINUE_NULL); #if CONFIG_DTRACE slept = 1; #endif istate = lck_interlock_lock(lck); } } KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EXCLUSIVE1_CODE) | DBG_FUNC_END, (int)lck, lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, res, 0); } lck_interlock_unlock(lck, istate); #if CONFIG_DTRACE /* * Decide what latencies we suffered that are Dtrace events. 
* If we have set wait_interval, then we either spun or slept. * At least we get out from under the interlock before we record * which is the best we can do here to minimize the impact * of the tracing. * If we have set wait_interval to -1, then dtrace was not enabled when we * started sleeping/spinning so we don't record this event. */ if (wait_interval != 0 && wait_interval != (unsigned) -1) { if (slept == 0) { LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_EXCL_SPIN, lck, mach_absolute_time() - wait_interval, 1); } else { /* * For the blocking case, we also record if when we blocked * it was held for read or write, and how many readers. * Notice that above we recorded this before we dropped * the interlock so the count is accurate. */ LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_EXCL_BLOCK, lck, mach_absolute_time() - wait_interval, 1, (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep); } } LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lck, 1); #endif }
/* * Modify the packet so that the payload is encrypted. * The mbuf (m) must start with IPv4 or IPv6 header. * On failure, free the given mbuf and return NULL. * * on invocation: * m nexthdrp md * v v v * IP ......... payload * during the encryption: * m nexthdrp mprev md * v v v v * IP ............... esp iv payload pad padlen nxthdr * <--><-><------><---------------> * esplen plen extendsiz * ivlen * <-----> esphlen * <-> hlen * <-----------------> espoff */ static int esp_output( struct mbuf *m, u_char *nexthdrp, struct mbuf *md, int af, struct secasvar *sav) { struct mbuf *n; struct mbuf *mprev; struct esp *esp; struct esptail *esptail; const struct esp_algorithm *algo; u_int32_t spi; u_int8_t nxt = 0; size_t plen; /*payload length to be encrypted*/ size_t espoff; size_t esphlen; /* sizeof(struct esp/newesp) + ivlen */ int ivlen; int afnumber; size_t extendsiz; int error = 0; struct ipsecstat *stat; struct udphdr *udp = NULL; int udp_encapsulate = (sav->flags & SADB_X_EXT_NATT && (af == AF_INET || af == AF_INET6) && (esp_udp_encap_port & 0xFFFF) != 0); KERNEL_DEBUG(DBG_FNC_ESPOUT | DBG_FUNC_START, sav->ivlen,0,0,0,0); switch (af) { #if INET case AF_INET: afnumber = 4; stat = &ipsecstat; break; #endif #if INET6 case AF_INET6: afnumber = 6; stat = &ipsec6stat; break; #endif default: ipseclog((LOG_ERR, "esp_output: unsupported af %d\n", af)); KERNEL_DEBUG(DBG_FNC_ESPOUT | DBG_FUNC_END, 1,0,0,0,0); return 0; /* no change at all */ } /* some sanity check */ if ((sav->flags & SADB_X_EXT_OLD) == 0 && !sav->replay) { switch (af) { #if INET case AF_INET: { struct ip *ip; ip = mtod(m, struct ip *); ipseclog((LOG_DEBUG, "esp4_output: internal error: " "sav->replay is null: %x->%x, SPI=%u\n", (u_int32_t)ntohl(ip->ip_src.s_addr), (u_int32_t)ntohl(ip->ip_dst.s_addr), (u_int32_t)ntohl(sav->spi))); IPSEC_STAT_INCREMENT(ipsecstat.out_inval); break; } #endif /*INET*/ #if INET6 case AF_INET6: ipseclog((LOG_DEBUG, "esp6_output: internal error: " "sav->replay is null: SPI=%u\n", (u_int32_t)ntohl(sav->spi))); IPSEC_STAT_INCREMENT(ipsec6stat.out_inval); break; #endif /*INET6*/ default: panic("esp_output: should not reach here"); } m_freem(m); KERNEL_DEBUG(DBG_FNC_ESPOUT | DBG_FUNC_END, 2,0,0,0,0); return EINVAL; } algo = esp_algorithm_lookup(sav->alg_enc); if (!algo) { ipseclog((LOG_ERR, "esp_output: unsupported algorithm: " "SPI=%u\n", (u_int32_t)ntohl(sav->spi))); m_freem(m); KERNEL_DEBUG(DBG_FNC_ESPOUT | DBG_FUNC_END, 3,0,0,0,0); return EINVAL; } spi = sav->spi; ivlen = sav->ivlen; /* should be okey */ if (ivlen < 0) { panic("invalid ivlen"); } { /* * insert ESP header. * XXX inserts ESP header right after IPv4 header. should * chase the header chain. 
* XXX sequential number */ #if INET struct ip *ip = NULL; #endif #if INET6 struct ip6_hdr *ip6 = NULL; #endif size_t esplen; /* sizeof(struct esp/newesp) */ size_t hlen = 0; /* ip header len */ if (sav->flags & SADB_X_EXT_OLD) { /* RFC 1827 */ esplen = sizeof(struct esp); } else { /* RFC 2406 */ if (sav->flags & SADB_X_EXT_DERIV) esplen = sizeof(struct esp); else esplen = sizeof(struct newesp); } esphlen = esplen + ivlen; for (mprev = m; mprev && mprev->m_next != md; mprev = mprev->m_next) ; if (mprev == NULL || mprev->m_next != md) { ipseclog((LOG_DEBUG, "esp%d_output: md is not in chain\n", afnumber)); m_freem(m); KERNEL_DEBUG(DBG_FNC_ESPOUT | DBG_FUNC_END, 4,0,0,0,0); return EINVAL; } plen = 0; for (n = md; n; n = n->m_next) plen += n->m_len; switch (af) { #if INET case AF_INET: ip = mtod(m, struct ip *); #ifdef _IP_VHL hlen = IP_VHL_HL(ip->ip_vhl) << 2; #else hlen = ip->ip_hl << 2; #endif break; #endif #if INET6 case AF_INET6: ip6 = mtod(m, struct ip6_hdr *); hlen = sizeof(*ip6); break; #endif } /* make the packet over-writable */ mprev->m_next = NULL; if ((md = ipsec_copypkt(md)) == NULL) { m_freem(m); error = ENOBUFS; goto fail; } mprev->m_next = md; /* * Translate UDP source port back to its original value. * SADB_X_EXT_NATT_MULTIPLEUSERS is only set for transort mode. */ if ((sav->flags & SADB_X_EXT_NATT_MULTIPLEUSERS) != 0) { /* if not UDP - drop it */ if (ip->ip_p != IPPROTO_UDP) { IPSEC_STAT_INCREMENT(ipsecstat.out_inval); m_freem(m); error = EINVAL; goto fail; } udp = mtod(md, struct udphdr *); /* if src port not set in sav - find it */ if (sav->natt_encapsulated_src_port == 0) if (key_natt_get_translated_port(sav) == 0) { m_freem(m); error = EINVAL; goto fail; } if (sav->remote_ike_port == htons(udp->uh_dport)) { /* translate UDP port */ udp->uh_dport = sav->natt_encapsulated_src_port; udp->uh_sum = 0; /* don't need checksum with ESP auth */ } else { /* drop the packet - can't translate the port */ IPSEC_STAT_INCREMENT(ipsecstat.out_inval); m_freem(m); error = EINVAL; goto fail; } } espoff = m->m_pkthdr.len - plen; if (udp_encapsulate) { esphlen += sizeof(struct udphdr); espoff += sizeof(struct udphdr); } /* * grow the mbuf to accomodate ESP header. * before: IP ... payload * after: IP ... [UDP] ESP IV payload */ if (M_LEADINGSPACE(md) < esphlen || (md->m_flags & M_EXT) != 0) { MGET(n, M_DONTWAIT, MT_DATA); if (!n) { m_freem(m); error = ENOBUFS; goto fail; } n->m_len = esphlen; mprev->m_next = n; n->m_next = md; m->m_pkthdr.len += esphlen; if (udp_encapsulate) { udp = mtod(n, struct udphdr *); esp = (struct esp *)(void *)((caddr_t)udp + sizeof(struct udphdr)); } else { esp = mtod(n, struct esp *); } } else {
// LP64todo - fix this! 'n' should be int64_t? int uiomove64(const addr64_t c_cp, int n, struct uio *uio) { addr64_t cp = c_cp; #if LP64KERN uint64_t acnt; #else u_int acnt; #endif int error = 0; #if DIAGNOSTIC if (uio->uio_rw != UIO_READ && uio->uio_rw != UIO_WRITE) panic("uiomove: mode"); #endif #if LP64_DEBUG if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) { panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__); } #endif /* LP64_DEBUG */ while (n > 0 && uio_resid(uio)) { acnt = uio_iov_len(uio); if (acnt == 0) { uio_next_iov(uio); uio->uio_iovcnt--; continue; } if (n > 0 && acnt > (uint64_t)n) acnt = n; switch (uio->uio_segflg) { case UIO_USERSPACE64: case UIO_USERISPACE64: // LP64 - 3rd argument in debug code is 64 bit, expected to be 32 bit if (uio->uio_rw == UIO_READ) { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYOUT)) | DBG_FUNC_START, (int)cp, (int)uio->uio_iovs.iov64p->iov_base, acnt, 0,0); error = copyout( CAST_DOWN(caddr_t, cp), uio->uio_iovs.iov64p->iov_base, acnt ); KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYOUT)) | DBG_FUNC_END, (int)cp, (int)uio->uio_iovs.iov64p->iov_base, acnt, 0,0); } else { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYIN)) | DBG_FUNC_START, (int)uio->uio_iovs.iov64p->iov_base, (int)cp, acnt, 0,0); error = copyin(uio->uio_iovs.iov64p->iov_base, CAST_DOWN(caddr_t, cp), acnt); KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYIN)) | DBG_FUNC_END, (int)uio->uio_iovs.iov64p->iov_base, (int)cp, acnt, 0,0); } if (error) return (error); break; case UIO_USERSPACE32: case UIO_USERISPACE32: case UIO_USERSPACE: case UIO_USERISPACE: if (uio->uio_rw == UIO_READ) { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYOUT)) | DBG_FUNC_START, (int)cp, (int)uio->uio_iovs.iov32p->iov_base, acnt, 0,0); error = copyout( CAST_DOWN(caddr_t, cp), CAST_USER_ADDR_T(uio->uio_iovs.iov32p->iov_base), acnt ); KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYOUT)) | DBG_FUNC_END, (int)cp, (int)uio->uio_iovs.iov32p->iov_base, acnt, 0,0); } else { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYIN)) | DBG_FUNC_START, (int)uio->uio_iovs.iov32p->iov_base, (int)cp, acnt, 0,0); error = copyin(CAST_USER_ADDR_T(uio->uio_iovs.iov32p->iov_base), CAST_DOWN(caddr_t, cp), acnt); KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYIN)) | DBG_FUNC_END, (int)uio->uio_iovs.iov32p->iov_base, (int)cp, acnt, 0,0); } if (error) return (error); break; case UIO_SYSSPACE32: case UIO_SYSSPACE: if (uio->uio_rw == UIO_READ) error = copywithin(CAST_DOWN(caddr_t, cp), (caddr_t)uio->uio_iovs.iov32p->iov_base, acnt); else error = copywithin((caddr_t)uio->uio_iovs.iov32p->iov_base, CAST_DOWN(caddr_t, cp), acnt); break; case UIO_PHYS_USERSPACE64: if (uio->uio_rw == UIO_READ) { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYOUT)) | DBG_FUNC_START, (int)cp, (int)uio->uio_iovs.iov64p->iov_base, acnt, 1,0); error = copypv((addr64_t)cp, uio->uio_iovs.iov64p->iov_base, acnt, cppvPsrc | cppvNoRefSrc); if (error) /* Copy physical to virtual */ error = EFAULT; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYOUT)) | DBG_FUNC_END, (int)cp, (int)uio->uio_iovs.iov64p->iov_base, acnt, 1,0); } else { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYIN)) | DBG_FUNC_START, (int)uio->uio_iovs.iov64p->iov_base, (int)cp, acnt, 1,0); error = copypv(uio->uio_iovs.iov64p->iov_base, (addr64_t)cp, acnt, cppvPsnk | cppvNoRefSrc | cppvNoModSnk); if (error) /* Copy virtual to physical */ error = EFAULT; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYIN)) | DBG_FUNC_END, (int)uio->uio_iovs.iov64p->iov_base, (int)cp, acnt, 1,0); } if (error) return 
(error); break; case UIO_PHYS_USERSPACE32: case UIO_PHYS_USERSPACE: if (uio->uio_rw == UIO_READ) { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYOUT)) | DBG_FUNC_START, (int)cp, (int)uio->uio_iovs.iov32p->iov_base, acnt, 1,0); error = copypv((addr64_t)cp, (addr64_t)uio->uio_iovs.iov32p->iov_base, acnt, cppvPsrc | cppvNoRefSrc); if (error) /* Copy physical to virtual */ error = EFAULT; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYOUT)) | DBG_FUNC_END, (int)cp, (int)uio->uio_iovs.iov32p->iov_base, acnt, 1,0); } else { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYIN)) | DBG_FUNC_START, (int)uio->uio_iovs.iov32p->iov_base, (int)cp, acnt, 1,0); error = copypv((addr64_t)uio->uio_iovs.iov32p->iov_base, (addr64_t)cp, acnt, cppvPsnk | cppvNoRefSrc | cppvNoModSnk); if (error) /* Copy virtual to physical */ error = EFAULT; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYIN)) | DBG_FUNC_END, (int)uio->uio_iovs.iov32p->iov_base, (int)cp, acnt, 1,0); } if (error) return (error); break; case UIO_PHYS_SYSSPACE32: case UIO_PHYS_SYSSPACE: if (uio->uio_rw == UIO_READ) { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYOUT)) | DBG_FUNC_START, (int)cp, (int)uio->uio_iovs.iov32p->iov_base, acnt, 2,0); error = copypv((addr64_t)cp, uio->uio_iovs.iov32p->iov_base, acnt, cppvKmap | cppvPsrc | cppvNoRefSrc); if (error) /* Copy physical to virtual */ error = EFAULT; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYOUT)) | DBG_FUNC_END, (int)cp, (int)uio->uio_iovs.iov32p->iov_base, acnt, 2,0); } else { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYIN)) | DBG_FUNC_START, (int)uio->uio_iovs.iov32p->iov_base, (int)cp, acnt, 2,0); error = copypv(uio->uio_iovs.iov32p->iov_base, (addr64_t)cp, acnt, cppvKmap | cppvPsnk | cppvNoRefSrc | cppvNoModSnk); if (error) /* Copy virtual to physical */ error = EFAULT; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, DBG_UIO_COPYIN)) | DBG_FUNC_END, (int)uio->uio_iovs.iov32p->iov_base, (int)cp, acnt, 2,0); } if (error) return (error); break; default: break; } uio_iov_base_add(uio, acnt); #if LP64KERN uio_iov_len_add(uio, -((int64_t)acnt)); uio_setresid(uio, (uio_resid(uio) - ((int64_t)acnt))); #else uio_iov_len_add(uio, -((int)acnt)); uio_setresid(uio, (uio_resid(uio) - ((int)acnt))); #endif uio->uio_offset += acnt; cp += acnt; n -= acnt; } return (error); }
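/*
 * Illustrative sketch (not in the original source): a typical read handler
 * moves kernel data to the caller with uiomove(), which funnels into
 * uiomove64() above for the common case.  example_buf and example_len are
 * hypothetical driver state.
 */
static int
example_read(struct uio *uio, char *example_buf, int example_len)
{
	int amount = example_len;

	if (amount > uio_resid(uio))
		amount = (int)uio_resid(uio);
	if (amount <= 0)
		return 0;

	/* copies out (UIO_READ) or in (UIO_WRITE) according to uio->uio_rw */
	return uiomove(example_buf, amount, uio);
}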