void
pmap_pcid_activate(pmap_t tpmap, int ccpu)
{
	pcid_t		new_pcid = tpmap->pmap_pcid_cpus[ccpu];
	pmap_t		last_pmap;
	boolean_t	pcid_conflict = FALSE, pending_flush = FALSE;

	pmap_assert(cpu_datap(ccpu)->cpu_pmap_pcid_enabled);
	if (__improbable(new_pcid == PMAP_PCID_INVALID_PCID)) {
		new_pcid = tpmap->pmap_pcid_cpus[ccpu] = pmap_pcid_allocate_pcid(ccpu);
	}
	pmap_assert(new_pcid != PMAP_PCID_INVALID_PCID);
#ifdef	PCID_ASSERT
	cpu_datap(ccpu)->cpu_last_pcid = cpu_datap(ccpu)->cpu_active_pcid;
#endif
	cpu_datap(ccpu)->cpu_active_pcid = new_pcid;

	pending_flush = (tpmap->pmap_pcid_coherency_vector[ccpu] != 0);
	if (__probable(pending_flush == FALSE)) {
		last_pmap = cpu_datap(ccpu)->cpu_pcid_last_pmap_dispatched[new_pcid];
		pcid_conflict = ((last_pmap != NULL) && (tpmap != last_pmap));
	}
	if (__improbable(pending_flush || pcid_conflict)) {
		pmap_pcid_validate_cpu(tpmap, ccpu);
	}
	/* Consider making this a unique id */
	cpu_datap(ccpu)->cpu_pcid_last_pmap_dispatched[new_pcid] = tpmap;

	pmap_assert(new_pcid < PMAP_PCID_MAX_PCID);
	pmap_assert(((tpmap == kernel_pmap) && new_pcid == 0) ||
	    ((new_pcid != PMAP_PCID_INVALID_PCID) && (new_pcid != 0)));
#if	PMAP_ASSERT
	pcid_record_array[ccpu % PCID_RECORD_SIZE] = tpmap->pm_cr3 | new_pcid |
	    (((uint64_t)(!(pending_flush || pcid_conflict))) << 63);
	pml4_entry_t *pml4 = pmap64_pml4(tpmap, 0ULL);
	/* Diagnostic to detect pagetable anchor corruption */
	if (pml4[KERNEL_PML4_INDEX] != kernel_pmap->pm_pml4[KERNEL_PML4_INDEX])
		__asm__ volatile("int3");
#endif	/* PMAP_ASSERT */

	set_cr3_composed(tpmap->pm_cr3, new_pcid, !(pending_flush || pcid_conflict));

	if (!pending_flush) {
		/* We did not previously observe a pending invalidation for this
		 * ASID. However, the load from the coherency vector
		 * could've been reordered ahead of the store to the
		 * active_cr3 field (in the context switch path, our
		 * caller). Re-consult the pending invalidation vector
		 * after the CR3 write. We rely on MOV CR3's documented
		 * serializing property to avoid insertion of an expensive
		 * barrier. (DRK)
		 */
		pending_flush = (tpmap->pmap_pcid_coherency_vector[ccpu] != 0);
		if (__improbable(pending_flush != 0)) {
			pmap_pcid_validate_cpu(tpmap, ccpu);
			set_cr3_composed(tpmap->pm_cr3, new_pcid, FALSE);
		}
	}
	cpu_datap(ccpu)->cpu_pmap_pcid_coherentp = &(tpmap->pmap_pcid_coherency_vector[ccpu]);
#if	DEBUG
	KERNEL_DEBUG_CONSTANT(0x9c1d0000, tpmap, new_pcid, pending_flush, pcid_conflict, 0);
#endif
}
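/*
 * Illustrative sketch (not part of the original source): forming a composed
 * CR3 value when PCIDs are in use.  Architecturally, CR3 bits 11:0 carry the
 * PCID once CR4.PCIDE is set, and setting bit 63 of the value written to CR3
 * asks the processor to preserve (not flush) TLB entries tagged with that
 * PCID -- the "no flush" case the code above selects when there is neither a
 * pending invalidation nor a PCID/pmap conflict.  The helper name below is
 * hypothetical.
 */
static inline void
example_write_cr3_with_pcid(uint64_t cr3_base, uint16_t pcid, boolean_t preserve)
{
	uint64_t value = (cr3_base & ~0xFFFULL) | (pcid & 0xFFFULL);

	if (preserve) {
		value |= (1ULL << 63);	/* retain TLB entries for this PCID */
	}
	__asm__ volatile("mov %0, %%cr3" : : "r" (value) : "memory");
}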
static unsigned int NOINLINE
hw_lock_lock_contended(hw_lock_t lock, uintptr_t data, uint64_t timeout, boolean_t do_panic)
{
	uint64_t	end = 0;
	uintptr_t	holder = lock->lock_data;
	int		i;

	if (timeout == 0)
		timeout = LOCK_PANIC_TIMEOUT;
#if CONFIG_DTRACE
	uint64_t begin;
	boolean_t dtrace_enabled = lockstat_probemap[LS_LCK_SPIN_LOCK_SPIN] != 0;
	if (__improbable(dtrace_enabled))
		begin = mach_absolute_time();
#endif
	for ( ; ; ) {
		for (i = 0; i < LOCK_SNOOP_SPINS; i++) {
			cpu_pause();
#if (!__ARM_ENABLE_WFE_) || (LOCK_PRETEST)
			holder = ordered_load_hw(lock);
			if (holder != 0)
				continue;
#endif
			if (atomic_compare_exchange(&lock->lock_data, 0, data,
			    memory_order_acquire_smp, TRUE)) {
#if CONFIG_DTRACE
				if (__improbable(dtrace_enabled)) {
					uint64_t spintime = mach_absolute_time() - begin;
					if (spintime > dtrace_spin_threshold)
						LOCKSTAT_RECORD2(LS_LCK_SPIN_LOCK_SPIN, lock, spintime, dtrace_spin_threshold);
				}
#endif
				return 1;
			}
		}
		if (end == 0) {
			end = ml_get_timebase() + timeout;
		} else if (ml_get_timebase() >= end)
			break;
	}
	if (do_panic) {
		// Capture the actual time spent blocked, which may be higher than the timeout
		// if a misbehaving interrupt stole this thread's CPU time.
		panic("Spinlock timeout after %llu ticks, %p = %lx",
		    (ml_get_timebase() - end + timeout), lock, holder);
	}
	return 0;
}
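/*
 * Minimal sketch of the snoop-then-CAS pattern used above, written against
 * C11 atomics rather than the kernel's primitives (an assumption made for
 * illustration): read the lock word cheaply while spinning, attempt the
 * acquire compare-exchange only when it appears free, and bound the total
 * wait with a caller-supplied deadline.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

static bool
example_spin_lock_with_deadline(_Atomic uintptr_t *lock, uintptr_t self,
    uint64_t deadline, uint64_t (*now)(void))
{
	for (;;) {
		uintptr_t expected = 0;

		/* Cheap relaxed read first; CAS only when the word looks free. */
		if (atomic_load_explicit(lock, memory_order_relaxed) == 0 &&
		    atomic_compare_exchange_strong_explicit(lock, &expected, self,
		    memory_order_acquire, memory_order_relaxed)) {
			return true;		/* acquired */
		}
		if (now() >= deadline) {
			return false;		/* timed out; caller may panic */
		}
	}
}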
size_t
__strlcpy_chk(char *dst, char const *src, size_t s, size_t chk_size)
{
	if (__improbable(chk_size < s))
		panic("__strlcpy_chk object size check failed: dst %p, src %p, (%zu < %zu)",
		    dst, src, chk_size, s);
	return strlcpy(dst, src, s);
}
void *
__memset_chk(void *dst, int c, size_t s, size_t chk_size)
{
	if (__improbable(chk_size < s))
		panic("__memset_chk object size check failed: dst %p, c %c, (%zu < %zu)",
		    dst, c, chk_size, s);
	return memset(dst, c, s);
}
void *
__memmove_chk(void *dst, void const *src, size_t s, size_t chk_size)
{
	if (__improbable(chk_size < s))
		panic("__memmove_chk object size check failed: dst %p, src %p, (%zu < %zu)",
		    dst, src, chk_size, s);
	return memmove(dst, src, s);
}
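/*
 * Sketch of how a fortified build might route callers into the checked
 * variants above: the compiler substitutes the destination's statically
 * known size via __builtin_object_size(), and the _chk routine panics when
 * the requested length exceeds it.  The macro is illustrative only, not the
 * kernel's actual header machinery.
 */
#define example_fortified_memset(dst, c, n) \
	__memset_chk((dst), (c), (n), __builtin_object_size((dst), 0))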
void
vm_map_store_copy_insert(
	vm_map_t	map,
	vm_map_entry_t	after_where,
	vm_map_copy_t	copy)
{
	if (__improbable(vm_debug_events)) {
		vm_map_entry_t entry;
		for (entry = vm_map_copy_first_entry(copy); entry != vm_map_copy_to_entry(copy); entry = entry->vme_next) {
			DTRACE_VM4(map_entry_link_copy, vm_map_t, map, vm_map_entry_t, entry, vm_address_t, entry->links.start, vm_address_t, entry->links.end);
		}
	}

	if (map->holelistenabled) {
		vm_map_entry_t entry = NULL;

		entry = vm_map_copy_first_entry(copy);
		while (entry != vm_map_copy_to_entry(copy)) {
			vm_map_store_update_first_free(map, entry, TRUE);
			entry = entry->vme_next;
		}
	}

	vm_map_store_copy_insert_ll(map, after_where, copy);
#ifdef VM_MAP_STORE_USE_RB
	if (vm_map_store_has_RB_support( &map->hdr )) {
		vm_map_store_copy_insert_rb(map, after_where, copy);
	}
#endif
}
void
thread_tell_urgency(int urgency,
    uint64_t rt_period,
    uint64_t rt_deadline,
    thread_t nthread)
{
	uint64_t	urgency_notification_time_start, delta;
	boolean_t	urgency_assert = (urgency_notification_assert_abstime_threshold != 0);
	assert(get_preemption_level() > 0 || ml_get_interrupts_enabled() == FALSE);
#if	DEBUG
	urgency_stats[cpu_number() % 64][urgency]++;
#endif
	if (!pmInitDone
	    || pmDispatch == NULL
	    || pmDispatch->pmThreadTellUrgency == NULL)
		return;

	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED,MACH_URGENCY) | DBG_FUNC_START, urgency, rt_period, rt_deadline, 0, 0);

	if (__improbable((urgency_assert == TRUE)))
		urgency_notification_time_start = mach_absolute_time();

	current_cpu_datap()->cpu_nthread = nthread;
	pmDispatch->pmThreadTellUrgency(urgency, rt_period, rt_deadline);

	if (__improbable((urgency_assert == TRUE))) {
		delta = mach_absolute_time() - urgency_notification_time_start;

		if (__improbable(delta > urgency_notification_max_recorded)) {
			/* This is not synchronized, but it doesn't matter
			 * if we (rarely) miss an event, as it is statistically
			 * unlikely that it will never recur.
			 */
			urgency_notification_max_recorded = delta;

			if (__improbable((delta > urgency_notification_assert_abstime_threshold) && !machine_timeout_suspended()))
				panic("Urgency notification callout %p exceeded threshold, 0x%llx abstime units", pmDispatch->pmThreadTellUrgency, delta);
		}
	}

	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED,MACH_URGENCY) | DBG_FUNC_END, urgency, rt_period, rt_deadline, 0, 0);
}
pcid_t
pcid_for_pmap_cpu_tuple(pmap_t cpmap, thread_t cthread, int ccpu)
{
	pmap_t active_pmap = cpmap;

	if (__improbable(cpmap->pagezero_accessible)) {
		if ((cthread->machine.specFlags & CopyIOActive) == 0) {
			active_pmap = kernel_pmap;
		}
	}

	return active_pmap->pmap_pcid_cpus[ccpu];
}
void
_vm_map_store_entry_unlink(
	struct vm_map_header * mapHdr,
	vm_map_entry_t entry)
{
	if (__improbable(vm_debug_events))
		DTRACE_VM4(map_entry_unlink, vm_map_t, (char *)mapHdr - sizeof (lck_rw_t), vm_map_entry_t, entry, vm_address_t, entry->links.start, vm_address_t, entry->links.end);

	vm_map_store_entry_unlink_ll(mapHdr, entry);
#ifdef VM_MAP_STORE_USE_RB
	if (vm_map_store_has_RB_support( mapHdr )) {
		vm_map_store_entry_unlink_rb(mapHdr, entry);
	}
#endif
}
static boolean_t
timer_call_enter_internal(
	timer_call_t 		call,
	timer_call_param_t	param1,
	uint64_t 		deadline,
	uint32_t 		flags)
{
	mpqueue_head_t		*queue;
	mpqueue_head_t		*old_queue;
	spl_t			s;
	uint64_t 		slop = 0;

	s = splclock();

	call->soft_deadline = deadline;
	call->flags = flags;

	if ((flags & TIMER_CALL_CRITICAL) == 0 &&
	    mach_timer_coalescing_enabled) {
		slop = timer_call_slop(deadline);
		deadline += slop;
	}

#if defined(__i386__) || defined(__x86_64__)
	uint64_t ctime = mach_absolute_time();
	if (__improbable(deadline < ctime)) {
		uint64_t delta = (ctime - deadline);

		past_deadline_timers++;
		past_deadline_deltas += delta;
		if (delta > past_deadline_longest)
			past_deadline_longest = deadline;
		if (delta < past_deadline_shortest)
			past_deadline_shortest = delta;

		deadline = ctime + past_deadline_timer_adjustment;
		call->soft_deadline = deadline;
	}
#endif

	queue = timer_queue_assign(deadline);

	old_queue = timer_call_enqueue_deadline_unlocked(call, queue, deadline);

	CE(call)->param1 = param1;

	splx(s);

	return (old_queue != NULL);
}
void
_vm_map_store_entry_link(
	struct vm_map_header * mapHdr,
	vm_map_entry_t after_where,
	vm_map_entry_t entry)
{
	assert(entry->vme_start < entry->vme_end);

	if (__improbable(vm_debug_events))
		DTRACE_VM4(map_entry_link, vm_map_t, (char *)mapHdr - sizeof (lck_rw_t), vm_map_entry_t, entry, vm_address_t, entry->links.start, vm_address_t, entry->links.end);

	vm_map_store_entry_link_ll(mapHdr, after_where, entry);
#ifdef VM_MAP_STORE_USE_RB
	if (vm_map_store_has_RB_support( mapHdr )) {
		vm_map_store_entry_link_rb(mapHdr, after_where, entry);
	}
#endif
#if MAP_ENTRY_INSERTION_DEBUG
	fastbacktrace(&entry->vme_insertion_bt[0],
	    (sizeof (entry->vme_insertion_bt) / sizeof (uintptr_t)));
#endif
}
int hfs_vnop_lookup(struct vnop_lookup_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode *vp; struct cnode *cp; struct cnode *dcp; struct hfsmount *hfsmp; int error; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct proc *p = vfs_context_proc(ap->a_context); int flags = cnp->cn_flags; int force_casesensitive_lookup = proc_is_forcing_hfs_case_sensitivity(p); int cnode_locked; *vpp = NULL; dcp = VTOC(dvp); hfsmp = VTOHFS(dvp); /* * Lookup an entry in the cache * * If the lookup succeeds, the vnode is returned in *vpp, * and a status of -1 is returned. * * If the lookup determines that the name does not exist * (negative cacheing), a status of ENOENT is returned. * * If the lookup fails, a status of zero is returned. */ error = cache_lookup(dvp, vpp, cnp); if (error != -1) { if ((error == ENOENT) && (cnp->cn_nameiop != CREATE)) goto exit; /* found a negative cache entry */ goto lookup; /* did not find it in the cache */ } /* * We have a name that matched * cache_lookup returns the vp with an iocount reference already taken */ error = 0; vp = *vpp; cp = VTOC(vp); /* We aren't allowed to vend out vp's via lookup to the hidden directory */ if (cp->c_cnid == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid || cp->c_cnid == hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid) { /* Drop the iocount from cache_lookup */ vnode_put (vp); error = ENOENT; goto exit; } /* * If this is a hard-link vnode then we need to update * the name (of the link), the parent ID, the cnid, the * text encoding and the catalog hint. This enables * getattrlist calls to return the correct link info. */ /* * Alternatively, if we are forcing a case-sensitive lookup * on a case-insensitive volume, the namecache entry * may have been for an incorrect case. Since we cannot * determine case vs. normalization, redrive the catalog * lookup based on any byte mismatch. */ if (((flags & ISLASTCN) && (cp->c_flag & C_HARDLINK)) || (force_casesensitive_lookup && !(hfsmp->hfs_flags & HFS_CASE_SENSITIVE))) { int stale_link = 0; hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS); if ((cp->c_parentcnid != dcp->c_cnid) || (cnp->cn_namelen != cp->c_desc.cd_namelen) || (bcmp(cnp->cn_nameptr, cp->c_desc.cd_nameptr, cp->c_desc.cd_namelen) != 0)) { struct cat_desc desc; struct cat_attr lookup_attr; int lockflags; if (force_casesensitive_lookup && !(hfsmp->hfs_flags & HFS_CASE_SENSITIVE)) { /* * Since the name in the cnode doesn't match our lookup * string exactly, do a full lookup. */ hfs_unlock (cp); vnode_put(vp); goto lookup; } /* * Get an updated descriptor */ desc.cd_nameptr = (const u_int8_t *)cnp->cn_nameptr; desc.cd_namelen = cnp->cn_namelen; desc.cd_parentcnid = dcp->c_fileid; desc.cd_hint = dcp->c_childhint; desc.cd_encoding = 0; desc.cd_cnid = 0; desc.cd_flags = S_ISDIR(cp->c_mode) ? CD_ISDIR : 0; /* * Because lookups call replace_desc to put a new descriptor in * the cnode we are modifying it is possible that this cnode's * descriptor is out of date for the parent ID / name that * we are trying to look up. (It may point to a different hardlink). * * We need to be cautious that when re-supplying the * descriptor below that the results of the catalog lookup * still point to the same raw inode for the hardlink. This would * not be the case if we found something in the cache above but * the vnode it returned no longer has a valid hardlink for the * parent ID/filename combo we are requesting. (This is because * hfs_unlink does not directly trigger namecache removal). 
* * As a result, before vending out the vnode (and replacing * its descriptor) verify that the fileID is the same by comparing * the in-cnode attributes vs. the one returned from the lookup call * below. If they do not match, treat this lookup as if we never hit * in the cache at all. */ lockflags = hfs_systemfile_lock(VTOHFS(dvp), SFL_CATALOG, HFS_SHARED_LOCK); error = cat_lookup(VTOHFS(vp), &desc, 0, 0, &desc, &lookup_attr, NULL, NULL); hfs_systemfile_unlock(VTOHFS(dvp), lockflags); /* * Note that cat_lookup may fail to find something with the name provided in the * stack-based descriptor above. In that case, an ENOENT is a legitimate errno * to be placed in error, which will get returned in the fastpath below. */ if (error == 0) { if (lookup_attr.ca_fileid == cp->c_attr.ca_fileid) { /* It still points to the right raw inode. Replacing the descriptor is fine */ replace_desc (cp, &desc); /* * Save the origin info for file and directory hardlinks. Directory hardlinks * need the origin for '..' lookups, and file hardlinks need it to ensure that * competing lookups do not cause us to vend different hardlinks than the ones requested. * We want to restrict saving the cache entries to LOOKUP namei operations, since * we're really doing this to protect getattr. */ if (cnp->cn_nameiop == LOOKUP) { hfs_savelinkorigin(cp, dcp->c_fileid); } } else { /* If the fileID does not match then do NOT replace the descriptor! */ stale_link = 1; } } } hfs_unlock (cp); if (stale_link) { /* * If we had a stale_link, then we need to pretend as though * we never found this vnode and force a lookup through the * traditional path. Drop the iocount acquired through * cache_lookup above and force a cat lookup / getnewvnode */ vnode_put(vp); goto lookup; } if (error) { /* * If the cat_lookup failed then the caller will not expect * a vnode with an iocount on it. */ vnode_put(vp); } } goto exit; lookup: /* * The vnode was not in the name cache or it was stale. * * So we need to do a real lookup. */ cnode_locked = 0; error = hfs_lookup(dvp, vpp, cnp, &cnode_locked, force_casesensitive_lookup); if (cnode_locked) hfs_unlock(VTOC(*vpp)); exit: { uthread_t ut = (struct uthread *)get_bsdthread_info(current_thread()); /* * check to see if we issued any I/O while completing this lookup and * this thread/task is throttleable... if so, throttle now * * this allows us to throttle in between multiple meta data reads that * might result due to looking up a long pathname (since we'll have to * re-enter hfs_vnop_lookup for each component of the pathnam not in * the VFS cache), instead of waiting until the entire path lookup has * completed and throttling at the systemcall return */ if (__improbable(ut->uu_lowpri_window)) { throttle_lowpri_io(1); } } return (error); }
void
mach_call_munger64(x86_saved_state_t *state)
{
	int call_number;
	int argc;
	mach_call_t mach_call;
	struct mach_call_args args = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
	x86_saved_state64_t	*regs;

#if PROC_REF_DEBUG
	struct uthread *ut = get_bsdthread_info(current_thread());

	uthread_reset_proc_refcount(ut);
#endif

	assert(is_saved_state64(state));
	regs = saved_state64(state);

	call_number = (int)(regs->rax & SYSCALL_NUMBER_MASK);

	DEBUG_KPRINT_SYSCALL_MACH(
		"mach_call_munger64: code=%d(%s)\n",
		call_number, mach_syscall_name_table[call_number]);

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
		MACHDBG_CODE(DBG_MACH_EXCP_SC,(call_number)) | DBG_FUNC_START,
		regs->rdi, regs->rsi, regs->rdx, regs->r10, 0);

	if (call_number < 0 || call_number >= mach_trap_count) {
		i386_exception(EXC_SYSCALL, regs->rax, 1);
		/* NOTREACHED */
	}
	mach_call = (mach_call_t)mach_trap_table[call_number].mach_trap_function;

	if (mach_call == (mach_call_t)kern_invalid) {
		i386_exception(EXC_SYSCALL, regs->rax, 1);
		/* NOTREACHED */
	}
	argc = mach_trap_table[call_number].mach_trap_arg_count;
	if (argc) {
		int args_in_regs = MIN(6, argc);

		memcpy(&args.arg1, &regs->rdi, args_in_regs * sizeof(syscall_arg_t));

		if (argc > 6) {
			int copyin_count;

			assert(argc <= 9);
			copyin_count = (argc - 6) * (int)sizeof(syscall_arg_t);

			if (copyin((user_addr_t)(regs->isf.rsp + sizeof(user_addr_t)), (char *)&args.arg7, copyin_count)) {
				regs->rax = KERN_INVALID_ARGUMENT;

				thread_exception_return();
				/* NOTREACHED */
			}
		}
	}

#ifdef MACH_BSD
	mach_kauth_cred_uthread_update();
#endif

	regs->rax = (uint64_t)mach_call((void *)&args);

	DEBUG_KPRINT_SYSCALL_MACH( "mach_call_munger64: retval=0x%llx\n", regs->rax);

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
		MACHDBG_CODE(DBG_MACH_EXCP_SC,(call_number)) | DBG_FUNC_END,
		regs->rax, 0, 0, 0, 0);

	throttle_lowpri_io(1);

#if PROC_REF_DEBUG
	if (__improbable(uthread_get_proc_refcount(ut) != 0)) {
		panic("system call returned with uu_proc_refcount != 0");
	}
#endif

	thread_exception_return();
	/* NOTREACHED */
}
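/*
 * Sketch of the argument-split arithmetic above: a 64-bit Mach trap takes at
 * most nine arguments, the first six arriving in registers (captured
 * contiguously starting at the saved rdi) and the remainder on the user
 * stack just above the return address, which is why copyin() starts at
 * rsp + sizeof(user_addr_t).  This helper is hypothetical and only restates
 * the bookkeeping.
 */
static inline int
example_trap_stack_bytes(int argc, int *args_in_regs)
{
	*args_in_regs = (argc < 6) ? argc : 6;			/* MIN(6, argc) */
	return (argc - *args_in_regs) * (int)sizeof(uint64_t);	/* copyin length */
}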
void
mach_call_munger(x86_saved_state_t *state)
{
	int argc;
	int call_number;
	mach_call_t mach_call;
	kern_return_t retval;
	struct mach_call_args args = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
	x86_saved_state32_t	*regs;

#if PROC_REF_DEBUG
	struct uthread *ut = get_bsdthread_info(current_thread());

	uthread_reset_proc_refcount(ut);
#endif

	assert(is_saved_state32(state));
	regs = saved_state32(state);

	call_number = -(regs->eax);

	DEBUG_KPRINT_SYSCALL_MACH(
		"mach_call_munger: code=%d(%s)\n",
		call_number, mach_syscall_name_table[call_number]);
#if DEBUG_TRACE
	kprintf("mach_call_munger(0x%08x) code=%d\n", regs, call_number);
#endif

	if (call_number < 0 || call_number >= mach_trap_count) {
		i386_exception(EXC_SYSCALL, call_number, 1);
		/* NOTREACHED */
	}
	mach_call = (mach_call_t)mach_trap_table[call_number].mach_trap_function;

	if (mach_call == (mach_call_t)kern_invalid) {
		DEBUG_KPRINT_SYSCALL_MACH(
			"mach_call_munger: kern_invalid 0x%x\n", regs->eax);
		i386_exception(EXC_SYSCALL, call_number, 1);
		/* NOTREACHED */
	}

	argc = mach_trap_table[call_number].mach_trap_arg_count;
	if (argc) {
		retval = mach_call_arg_munger32(regs->uesp, &args, &mach_trap_table[call_number]);
		if (retval != KERN_SUCCESS) {
			regs->eax = retval;

			DEBUG_KPRINT_SYSCALL_MACH(
				"mach_call_munger: retval=0x%x\n", retval);

			thread_exception_return();
			/* NOTREACHED */
		}
	}

#ifdef MACH_BSD
	mach_kauth_cred_uthread_update();
#endif

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
		MACHDBG_CODE(DBG_MACH_EXCP_SC, (call_number)) | DBG_FUNC_START,
		args.arg1, args.arg2, args.arg3, args.arg4, 0);

	retval = mach_call(&args);

	DEBUG_KPRINT_SYSCALL_MACH("mach_call_munger: retval=0x%x\n", retval);

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
		MACHDBG_CODE(DBG_MACH_EXCP_SC,(call_number)) | DBG_FUNC_END,
		retval, 0, 0, 0, 0);

	regs->eax = retval;

	throttle_lowpri_io(1);

#if PROC_REF_DEBUG
	if (__improbable(uthread_get_proc_refcount(ut) != 0)) {
		panic("system call returned with uu_proc_refcount != 0");
	}
#endif

	thread_exception_return();
	/* NOTREACHED */
}
void unix_syscall64(x86_saved_state_t *state) { thread_t thread; void *vt; unsigned int code; struct sysent *callp; int args_in_regs; boolean_t args_start_at_rdi; int error; struct proc *p; struct uthread *uthread; x86_saved_state64_t *regs; assert(is_saved_state64(state)); regs = saved_state64(state); #if DEBUG if (regs->rax == 0x2000800) thread_exception_return(); #endif thread = current_thread(); uthread = get_bsdthread_info(thread); /* Get the approriate proc; may be different from task's for vfork() */ if (__probable(!(uthread->uu_flag & UT_VFORK))) p = (struct proc *)get_bsdtask_info(current_task()); else p = current_proc(); /* Verify that we are not being called from a task without a proc */ if (__improbable(p == NULL)) { regs->rax = EPERM; regs->isf.rflags |= EFL_CF; task_terminate_internal(current_task()); thread_exception_return(); /* NOTREACHED */ } code = regs->rax & SYSCALL_NUMBER_MASK; DEBUG_KPRINT_SYSCALL_UNIX( "unix_syscall64: code=%d(%s) rip=%llx\n", code, syscallnames[code >= NUM_SYSENT ? 63 : code], regs->isf.rip); callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code]; vt = (void *)uthread->uu_arg; if (__improbable(callp == sysent)) { /* * indirect system call... system call number * passed as 'arg0' */ code = regs->rdi; callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code]; args_start_at_rdi = FALSE; args_in_regs = 5; } else { args_start_at_rdi = TRUE; args_in_regs = 6; } if (callp->sy_narg != 0) { assert(callp->sy_narg <= 8); /* size of uu_arg */ args_in_regs = MIN(args_in_regs, callp->sy_narg); memcpy(vt, args_start_at_rdi ? ®s->rdi : ®s->rsi, args_in_regs * sizeof(syscall_arg_t)); if (code != 180) { uint64_t *ip = (uint64_t *)vt; KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, (int)(*ip), (int)(*(ip+1)), (int)(*(ip+2)), (int)(*(ip+3)), 0); } if (__improbable(callp->sy_narg > args_in_regs)) { int copyin_count; copyin_count = (callp->sy_narg - args_in_regs) * sizeof(syscall_arg_t); error = copyin((user_addr_t)(regs->isf.rsp + sizeof(user_addr_t)), (char *)&uthread->uu_arg[args_in_regs], copyin_count); if (error) { regs->rax = error; regs->isf.rflags |= EFL_CF; thread_exception_return(); /* NOTREACHED */ } } } else KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, 0, 0, 0, 0, 0); /* * Delayed binding of thread credential to process credential, if we * are not running with an explicitly set thread credential. */ kauth_cred_uthread_update(uthread, p); uthread->uu_rval[0] = 0; uthread->uu_rval[1] = 0; uthread->uu_flag |= UT_NOTCANCELPT; uthread->syscall_code = code; #ifdef JOE_DEBUG uthread->uu_iocount = 0; uthread->uu_vpindex = 0; #endif AUDIT_SYSCALL_ENTER(code, p, uthread); error = (*(callp->sy_call))((void *) p, vt, &(uthread->uu_rval[0])); AUDIT_SYSCALL_EXIT(code, p, uthread, error); #ifdef JOE_DEBUG if (uthread->uu_iocount) printf("system call returned with uu_iocount != 0\n"); #endif #if CONFIG_DTRACE uthread->t_dtrace_errno = error; #endif /* CONFIG_DTRACE */ if (__improbable(error == ERESTART)) { /* * all system calls come through via the syscall instruction * in 64 bit mode... 
its 2 bytes in length * move the user's pc back to repeat the syscall: */ pal_syscall_restart( thread, state ); } else if (error != EJUSTRETURN) { if (__improbable(error)) { regs->rax = error; regs->isf.rflags |= EFL_CF; /* carry bit */ } else { /* (not error) */ switch (callp->sy_return_type) { case _SYSCALL_RET_INT_T: regs->rax = uthread->uu_rval[0]; regs->rdx = uthread->uu_rval[1]; break; case _SYSCALL_RET_UINT_T: regs->rax = ((u_int)uthread->uu_rval[0]); regs->rdx = ((u_int)uthread->uu_rval[1]); break; case _SYSCALL_RET_OFF_T: case _SYSCALL_RET_ADDR_T: case _SYSCALL_RET_SIZE_T: case _SYSCALL_RET_SSIZE_T: case _SYSCALL_RET_UINT64_T: regs->rax = *((uint64_t *)(&uthread->uu_rval[0])); regs->rdx = 0; break; case _SYSCALL_RET_NONE: break; default: panic("unix_syscall: unknown return type"); break; } regs->isf.rflags &= ~EFL_CF; } } DEBUG_KPRINT_SYSCALL_UNIX( "unix_syscall64: error=%d retval=(%llu,%llu)\n", error, regs->rax, regs->rdx); uthread->uu_flag &= ~UT_NOTCANCELPT; if (__improbable(uthread->uu_lowpri_window)) { /* * task is marked as a low priority I/O type * and the I/O we issued while in this system call * collided with normal I/O operations... we'll * delay in order to mitigate the impact of this * task on the normal operation of the system */ throttle_lowpri_io(1); } if (__probable(code != 180)) KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0); thread_exception_return(); /* NOTREACHED */ }
/*
 * Event timer interrupt.
 *
 * XXX a drawback of this implementation is that events serviced earlier must not set deadlines
 *     that occur before the entire chain completes.
 *
 * XXX a better implementation would use a set of generic callouts and iterate over them
 */
void
timer_intr(int	user_mode,
	   uint64_t	rip)
{
	uint64_t	abstime;
	rtclock_timer_t	*mytimer;
	cpu_data_t	*pp;
	int64_t		latency;
	uint64_t	pmdeadline;
	boolean_t	timer_processed = FALSE;

	pp = current_cpu_datap();

	SCHED_STATS_TIMER_POP(current_processor());

	abstime = mach_absolute_time();		/* Get the time now */

	/* has a pending clock timer expired? */
	mytimer = &pp->rtclock_timer;		/* Point to the event timer */

	if ((timer_processed = ((mytimer->deadline <= abstime) ||
		    (abstime >= (mytimer->queue.earliest_soft_deadline))))) {
		/*
		 * Log interrupt service latency (-ve value expected by tool)
		 * a non-PM event is expected next.
		 * The requested deadline may be earlier than when it was set
		 * - use MAX to avoid reporting bogus latencies.
		 */
		latency = (int64_t) (abstime - MAX(mytimer->deadline,
		    mytimer->when_set));
		/* Log zero timer latencies when opportunistically processing
		 * coalesced timers.
		 */
		if (latency < 0) {
			TCOAL_DEBUG(0xEEEE0000, abstime, mytimer->queue.earliest_soft_deadline, abstime - mytimer->queue.earliest_soft_deadline, 0, 0);
			latency = 0;
		}

		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			DECR_TRAP_LATENCY | DBG_FUNC_NONE,
			-latency,
			((user_mode != 0) ? rip : VM_KERNEL_UNSLIDE(rip)),
			user_mode, 0, 0);

		mytimer->has_expired = TRUE;	/* Remember that we popped */
		mytimer->deadline = timer_queue_expire(&mytimer->queue, abstime);
		mytimer->has_expired = FALSE;

		/* Get the time again since we ran a bit */
		abstime = mach_absolute_time();
		mytimer->when_set = abstime;
	}

	/* is it time for power management state change? */
	if ((pmdeadline = pmCPUGetDeadline(pp)) && (pmdeadline <= abstime)) {
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			DECR_PM_DEADLINE | DBG_FUNC_START,
			0, 0, 0, 0, 0);
		pmCPUDeadline(pp);
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			DECR_PM_DEADLINE | DBG_FUNC_END,
			0, 0, 0, 0, 0);
		timer_processed = TRUE;
	}

	/* schedule our next deadline */
	x86_lcpu()->rtcDeadline = EndOfAllTime;
	timer_resync_deadlines();

	if (__improbable(timer_processed == FALSE))
		spurious_timers++;
}
static boolean_t
timer_call_enter_internal(
	timer_call_t 		call,
	timer_call_param_t	param1,
	uint64_t 		deadline,
	uint64_t 		leeway,
	uint32_t 		flags,
	boolean_t		ratelimited)
{
	mpqueue_head_t		*queue = NULL;
	mpqueue_head_t		*old_queue;
	spl_t			s;
	uint64_t 		slop;
	uint32_t		urgency;
	uint64_t		sdeadline, ttd;

	s = splclock();

	sdeadline = deadline;
	uint64_t ctime = mach_absolute_time();

	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
	    DECR_TIMER_ENTER | DBG_FUNC_START,
	    VM_KERNEL_UNSLIDE_OR_PERM(call),
	    VM_KERNEL_UNSLIDE_OR_PERM(param1), deadline, flags, 0);

	urgency = (flags & TIMER_CALL_URGENCY_MASK);

	boolean_t slop_ratelimited = FALSE;
	slop = timer_call_slop(deadline, ctime, urgency, current_thread(), &slop_ratelimited);

	if ((flags & TIMER_CALL_LEEWAY) != 0 && leeway > slop)
		slop = leeway;

	if (UINT64_MAX - deadline <= slop) {
		deadline = UINT64_MAX;
	} else {
		deadline += slop;
	}

	if (__improbable(deadline < ctime)) {
		uint64_t delta = (ctime - deadline);

		past_deadline_timers++;
		past_deadline_deltas += delta;
		if (delta > past_deadline_longest)
			past_deadline_longest = deadline;
		if (delta < past_deadline_shortest)
			past_deadline_shortest = delta;

		deadline = ctime + past_deadline_timer_adjustment;
		sdeadline = deadline;
	}

	if (ratelimited || slop_ratelimited) {
		flags |= TIMER_CALL_RATELIMITED;
	} else {
		flags &= ~TIMER_CALL_RATELIMITED;
	}

	ttd = sdeadline - ctime;
#if CONFIG_DTRACE
	DTRACE_TMR7(callout__create, timer_call_func_t, TCE(call)->func,
	    timer_call_param_t, TCE(call)->param0, uint32_t, flags,
	    (deadline - sdeadline),
	    (ttd >> 32), (unsigned) (ttd & 0xFFFFFFFF), call);
#endif

	/* Program timer callout parameters under the appropriate per-CPU or
	 * longterm queue lock. The callout may have been previously enqueued
	 * and in-flight on this or another timer queue.
	 */
	if (!ratelimited && !slop_ratelimited) {
		queue = timer_longterm_enqueue_unlocked(call, ctime, deadline, &old_queue, sdeadline, ttd, param1, flags);
	}

	if (queue == NULL) {
		queue = timer_queue_assign(deadline);
		old_queue = timer_call_enqueue_deadline_unlocked(call, queue, deadline, sdeadline, ttd, param1, flags);
	}

#if TIMER_TRACE
	TCE(call)->entry_time = ctime;
#endif

	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
	    DECR_TIMER_ENTER | DBG_FUNC_END,
	    VM_KERNEL_UNSLIDE_OR_PERM(call),
	    (old_queue != NULL), deadline, queue->count, 0);

	splx(s);

	return (old_queue != NULL);
}
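/*
 * Sketch of the overflow-safe slop addition above: coalescing leeway is
 * added to the hard deadline, but the sum is clamped to UINT64_MAX so a
 * large slop can never wrap a uint64_t deadline around to a small value.
 */
static inline uint64_t
example_deadline_add_slop(uint64_t deadline, uint64_t slop)
{
	if (UINT64_MAX - deadline <= slop) {
		return UINT64_MAX;	/* saturate rather than wrap */
	}
	return deadline + slop;
}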
/* * Function: unix_syscall * * Inputs: regs - pointer to i386 save area * * Outputs: none */ void unix_syscall(x86_saved_state_t *state) { thread_t thread; void *vt; unsigned int code; struct sysent *callp; int error; vm_offset_t params; struct proc *p; struct uthread *uthread; x86_saved_state32_t *regs; boolean_t is_vfork; assert(is_saved_state32(state)); regs = saved_state32(state); #if DEBUG if (regs->eax == 0x800) thread_exception_return(); #endif thread = current_thread(); uthread = get_bsdthread_info(thread); /* Get the approriate proc; may be different from task's for vfork() */ is_vfork = uthread->uu_flag & UT_VFORK; if (__improbable(is_vfork != 0)) p = current_proc(); else p = (struct proc *)get_bsdtask_info(current_task()); /* Verify that we are not being called from a task without a proc */ if (__improbable(p == NULL)) { regs->eax = EPERM; regs->efl |= EFL_CF; task_terminate_internal(current_task()); thread_exception_return(); /* NOTREACHED */ } code = regs->eax & I386_SYSCALL_NUMBER_MASK; DEBUG_KPRINT_SYSCALL_UNIX("unix_syscall: code=%d(%s) eip=%u\n", code, syscallnames[code >= NUM_SYSENT ? 63 : code], (uint32_t)regs->eip); params = (vm_offset_t) (regs->uesp + sizeof (int)); regs->efl &= ~(EFL_CF); callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code]; if (__improbable(callp == sysent)) { code = fuword(params); params += sizeof(int); callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code]; } vt = (void *)uthread->uu_arg; if (callp->sy_arg_bytes != 0) { #if CONFIG_REQUIRES_U32_MUNGING sy_munge_t *mungerp; #else #error U32 syscalls on x86_64 kernel requires munging #endif uint32_t nargs; assert((unsigned) callp->sy_arg_bytes <= sizeof (uthread->uu_arg)); nargs = callp->sy_arg_bytes; error = copyin((user_addr_t) params, (char *) vt, nargs); if (error) { regs->eax = error; regs->efl |= EFL_CF; thread_exception_return(); /* NOTREACHED */ } if (__probable(code != 180)) { int *ip = (int *)vt; KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, *ip, *(ip+1), *(ip+2), *(ip+3), 0); } #if CONFIG_REQUIRES_U32_MUNGING mungerp = callp->sy_arg_munge32; if (mungerp != NULL) (*mungerp)(vt); #endif } else KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, 0, 0, 0, 0, 0); /* * Delayed binding of thread credential to process credential, if we * are not running with an explicitly set thread credential. */ kauth_cred_uthread_update(uthread, p); uthread->uu_rval[0] = 0; uthread->uu_rval[1] = 0; uthread->uu_flag |= UT_NOTCANCELPT; uthread->syscall_code = code; #ifdef JOE_DEBUG uthread->uu_iocount = 0; uthread->uu_vpindex = 0; #endif AUDIT_SYSCALL_ENTER(code, p, uthread); error = (*(callp->sy_call))((void *) p, (void *) vt, &(uthread->uu_rval[0])); AUDIT_SYSCALL_EXIT(code, p, uthread, error); #ifdef JOE_DEBUG if (uthread->uu_iocount) printf("system call returned with uu_iocount != 0\n"); #endif #if CONFIG_DTRACE uthread->t_dtrace_errno = error; #endif /* CONFIG_DTRACE */ if (__improbable(error == ERESTART)) { /* * Move the user's pc back to repeat the syscall: * 5 bytes for a sysenter, or 2 for an int 8x. 
* The SYSENTER_TF_CS covers single-stepping over a sysenter * - see debug trap handler in idt.s/idt64.s */ pal_syscall_restart(thread, state); } else if (error != EJUSTRETURN) { if (__improbable(error)) { regs->eax = error; regs->efl |= EFL_CF; /* carry bit */ } else { /* (not error) */ /* * We split retval across two registers, in case the * syscall had a 64-bit return value, in which case * eax/edx matches the function call ABI. */ regs->eax = uthread->uu_rval[0]; regs->edx = uthread->uu_rval[1]; } } DEBUG_KPRINT_SYSCALL_UNIX( "unix_syscall: error=%d retval=(%u,%u)\n", error, regs->eax, regs->edx); uthread->uu_flag &= ~UT_NOTCANCELPT; if (__improbable(uthread->uu_lowpri_window)) { /* * task is marked as a low priority I/O type * and the I/O we issued while in this system call * collided with normal I/O operations... we'll * delay in order to mitigate the impact of this * task on the normal operation of the system */ throttle_lowpri_io(1); } if (__probable(code != 180)) KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0); if (__improbable(!is_vfork && callp->sy_call == (sy_call_t *)execve && !error)) { pal_execve_return(thread); } thread_exception_return(); /* NOTREACHED */ }
vm_offset_t
gzalloc_alloc(zone_t zone, boolean_t canblock)
{
	vm_offset_t addr = 0;

	if (__improbable(gzalloc_mode &&
	    (((zone->elem_size >= gzalloc_min) &&
	    (zone->elem_size <= gzalloc_max))) &&
	    (zone->gzalloc_exempt == 0))) {

		if (get_preemption_level() != 0) {
			if (canblock == TRUE) {
				pdzalloc_count++;
			}
			else
				return 0;
		}

		vm_offset_t rounded_size = round_page(zone->elem_size + GZHEADER_SIZE);
		vm_offset_t residue = rounded_size - zone->elem_size;
		vm_offset_t gzaddr = 0;
		gzhdr_t *gzh;

		if (!kmem_ready || (vm_page_zone == ZONE_NULL)) {
			/* Early allocations are supplied directly from the
			 * reserve.
			 */
			if (gzalloc_reserve_size < rounded_size)
				panic("gzalloc reserve exhausted");
			gzaddr = gzalloc_reserve;
			/* No guard page for these early allocations, just
			 * waste an additional page.
			 */
			gzalloc_reserve += rounded_size + PAGE_SIZE;
			gzalloc_reserve_size -= rounded_size + PAGE_SIZE;
			OSAddAtomic64((SInt32) (rounded_size), &gzalloc_early_alloc);
		}
		else {
			kern_return_t kr = kernel_memory_allocate(gzalloc_map,
			    &gzaddr, rounded_size + (1*PAGE_SIZE),
			    0, KMA_KOBJECT | gzalloc_guard);
			if (kr != KERN_SUCCESS)
				panic("gzalloc: kernel_memory_allocate for size 0x%llx failed with %d", (uint64_t)rounded_size, kr);
		}

		if (gzalloc_uf_mode) {
			gzaddr += PAGE_SIZE;
			/* The "header" becomes a "footer" in underflow
			 * mode.
			 */
			gzh = (gzhdr_t *) (gzaddr + zone->elem_size);
			addr = gzaddr;
		} else {
			gzh = (gzhdr_t *) (gzaddr + residue - GZHEADER_SIZE);
			addr = (gzaddr + residue);
		}

		/* Fill with a pattern on allocation to trap uninitialized
		 * data use. Since the element size may be "rounded up"
		 * by higher layers such as the kalloc layer, this may
		 * also identify overruns between the originally requested
		 * size and the rounded size via visual inspection.
		 * TBD: plumb through the originally requested size,
		 * prior to rounding by kalloc/IOMalloc etc.
		 * We also add a signature and the zone of origin in a header
		 * prefixed to the allocation.
		 */
		memset((void *)gzaddr, gzalloc_fill_pattern, rounded_size);

		gzh->gzone = (kmem_ready && vm_page_zone) ? zone : GZDEADZONE;
		gzh->gzsize = (uint32_t) zone->elem_size;
		gzh->gzsig = GZALLOC_SIGNATURE;

		lock_zone(zone);
		zone->count++;
		zone->sum_count++;
		zone->cur_size += rounded_size;
		unlock_zone(zone);

		OSAddAtomic64((SInt32) rounded_size, &gzalloc_allocated);
		OSAddAtomic64((SInt32) (rounded_size - zone->elem_size), &gzalloc_wasted);
	}
	return addr;
}
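/*
 * Sketch of the guard-zone layout computed above (overflow mode), using
 * plain integer types so it stands alone: the element is pushed to the end
 * of its page-rounded allocation so that the first byte written past
 * elem_size lands on the guard page, while the header sits immediately
 * before the returned element address.  The helper name is hypothetical.
 */
static inline uintptr_t
example_gz_element_addr(uintptr_t gzaddr, size_t elem_size, size_t header_size,
    size_t page_size)
{
	size_t rounded = (elem_size + header_size + page_size - 1) & ~(page_size - 1);
	size_t residue = rounded - elem_size;	/* header plus alignment padding */

	/* element ends exactly at gzaddr + rounded, adjacent to the guard page */
	return gzaddr + residue;
}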
static boolean_t
timer_call_enter_internal(
	timer_call_t 		call,
	timer_call_param_t	param1,
	uint64_t 		deadline,
	uint64_t 		leeway,
	uint32_t 		flags,
	boolean_t		ratelimited)
{
	mpqueue_head_t		*queue = NULL;
	mpqueue_head_t		*old_queue;
	spl_t			s;
	uint64_t 		slop;
	uint32_t		urgency;

	s = splclock();

	call->soft_deadline = deadline;
	call->flags = flags;

	uint64_t ctime = mach_absolute_time();

	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
	    DECR_TIMER_ENTER | DBG_FUNC_START,
	    call, param1, deadline, flags, 0);

	urgency = (flags & TIMER_CALL_URGENCY_MASK);

	boolean_t slop_ratelimited = FALSE;
	slop = timer_call_slop(deadline, ctime, urgency, current_thread(), &slop_ratelimited);

	if ((flags & TIMER_CALL_LEEWAY) != 0 && leeway > slop)
		slop = leeway;

	if (UINT64_MAX - deadline <= slop) {
		deadline = UINT64_MAX;
	} else {
		deadline += slop;
	}

	if (__improbable(deadline < ctime)) {
		uint64_t delta = (ctime - deadline);

		past_deadline_timers++;
		past_deadline_deltas += delta;
		if (delta > past_deadline_longest)
			past_deadline_longest = deadline;
		if (delta < past_deadline_shortest)
			past_deadline_shortest = delta;

		deadline = ctime + past_deadline_timer_adjustment;
		call->soft_deadline = deadline;
	}

	/* Bit 0 of the "soft" deadline indicates that
	 * this particular timer call requires rate-limiting
	 * behaviour. Maintain the invariant deadline >= soft_deadline by
	 * setting bit 0 of "deadline".
	 */
	deadline |= 1;
	if (ratelimited || slop_ratelimited) {
		call->soft_deadline |= 1ULL;
	} else {
		call->soft_deadline &= ~0x1ULL;
	}

	call->ttd = call->soft_deadline - ctime;

#if CONFIG_DTRACE
	DTRACE_TMR7(callout__create, timer_call_func_t, CE(call)->func,
	    timer_call_param_t, CE(call)->param0, uint32_t, call->flags,
	    (deadline - call->soft_deadline),
	    (call->ttd >> 32), (unsigned) (call->ttd & 0xFFFFFFFF), call);
#endif

	if (!ratelimited && !slop_ratelimited) {
		queue = timer_longterm_enqueue_unlocked(call, ctime, deadline, &old_queue);
	}

	if (queue == NULL) {
		queue = timer_queue_assign(deadline);
		old_queue = timer_call_enqueue_deadline_unlocked(call, queue, deadline);
	}

	CE(call)->param1 = param1;
#if TIMER_TRACE
	CE(call)->entry_time = ctime;
#endif

	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
	    DECR_TIMER_ENTER | DBG_FUNC_END,
	    call, (old_queue != NULL), call->soft_deadline, queue->count, 0);

	splx(s);

	return (old_queue != NULL);
}
boolean_t
gzalloc_free(zone_t zone, void *addr)
{
	boolean_t gzfreed = FALSE;
	kern_return_t kr;

	if (__improbable(gzalloc_mode &&
	    (((zone->elem_size >= gzalloc_min) &&
	    (zone->elem_size <= gzalloc_max))) &&
	    (zone->gzalloc_exempt == 0))) {
		gzhdr_t *gzh;
		vm_offset_t rounded_size = round_page(zone->elem_size + GZHEADER_SIZE);
		vm_offset_t residue = rounded_size - zone->elem_size;
		vm_offset_t saddr;
		vm_offset_t free_addr = 0;

		if (gzalloc_uf_mode) {
			gzh = (gzhdr_t *)((vm_offset_t)addr + zone->elem_size);
			saddr = (vm_offset_t) addr - PAGE_SIZE;
		} else {
			gzh = (gzhdr_t *)((vm_offset_t)addr - GZHEADER_SIZE);
			saddr = ((vm_offset_t)addr) - residue;
		}

		assert((saddr & PAGE_MASK) == 0);

		if (gzalloc_consistency_checks) {
			if (gzh->gzsig != GZALLOC_SIGNATURE) {
				panic("GZALLOC signature mismatch for element %p, expected 0x%x, found 0x%x", addr, GZALLOC_SIGNATURE, gzh->gzsig);
			}

			if (gzh->gzone != zone && (gzh->gzone != GZDEADZONE))
				panic("%s: Mismatched zone or under/overflow, current zone: %p, recorded zone: %p, address: %p", __FUNCTION__, zone, gzh->gzone, (void *)addr);
			/* Partially redundant given the zone check, but may flag header corruption */
			if (gzh->gzsize != zone->elem_size) {
				panic("Mismatched zfree or under/overflow for zone %p, recorded size: 0x%x, element size: 0x%x, address: %p\n", zone, gzh->gzsize, (uint32_t) zone->elem_size, (void *)addr);
			}
		}

		if (!kmem_ready || gzh->gzone == GZDEADZONE) {
			/* For now, just leak frees of early allocations
			 * performed before kmem is fully configured.
			 * They don't seem to get freed currently;
			 * consider ml_static_mfree in the future.
			 */
			OSAddAtomic64((SInt32) (rounded_size), &gzalloc_early_free);
			return TRUE;
		}

		if (get_preemption_level() != 0) {
			pdzfree_count++;
		}

		if (gzfc_size) {
			/* Either write protect or unmap the newly freed
			 * allocation
			 */
			kr = vm_map_protect(
				gzalloc_map,
				saddr,
				saddr + rounded_size + (1 * PAGE_SIZE),
				gzalloc_prot,
				FALSE);
			if (kr != KERN_SUCCESS)
				panic("%s: vm_map_protect: %p, 0x%x", __FUNCTION__, (void *)saddr, kr);
		} else {
			free_addr = saddr;
		}

		lock_zone(zone);

		/* Insert newly freed element into the protected free element
		 * cache, and rotate out the LRU element.
		 */
		if (gzfc_size) {
			if (zone->gz.gzfc_index >= gzfc_size) {
				zone->gz.gzfc_index = 0;
			}
			free_addr = zone->gz.gzfc[zone->gz.gzfc_index];
			zone->gz.gzfc[zone->gz.gzfc_index++] = saddr;
		}

		if (free_addr) {
			zone->count--;
			zone->cur_size -= rounded_size;
		}

		unlock_zone(zone);

		if (free_addr) {
			kr = vm_map_remove(
				gzalloc_map,
				free_addr,
				free_addr + rounded_size + (1 * PAGE_SIZE),
				VM_MAP_REMOVE_KUNWIRE);
			if (kr != KERN_SUCCESS)
				panic("gzfree: vm_map_remove: %p, 0x%x", (void *)free_addr, kr);

			OSAddAtomic64((SInt32)rounded_size, &gzalloc_freed);
			OSAddAtomic64(-((SInt32) (rounded_size - zone->elem_size)), &gzalloc_wasted);
		}

		gzfreed = TRUE;
	}
	return gzfreed;
}
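/*
 * Sketch of the free-element quarantine rotation above: each freed address
 * enters a small ring, and the entry it displaces is the one that is finally
 * unmapped.  Holding freed elements write-protected in the ring for a while
 * delays reuse and helps catch use-after-free.  Names here are illustrative.
 */
static inline uintptr_t
example_quarantine_rotate(uintptr_t *ring, unsigned *index, unsigned ring_size,
    uintptr_t newly_freed)
{
	uintptr_t evicted;

	if (*index >= ring_size) {
		*index = 0;
	}
	evicted = ring[*index];		/* 0 until the ring has filled once */
	ring[*index] = newly_freed;
	(*index)++;
	return evicted;			/* caller unmaps this address, if any */
}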