/*
 * mac_soft_ring_bind
 *
 * Bind the worker thread of a soft ring to the CPU identified by cpuid.
 * The caller must hold cpu_lock.
 *
 * Returns the cpu_t for cpuid once the binding has been applied.  Returns
 * NULL, leaving the ring untouched, when binding is switched off
 * (mac_soft_ring_thread_bind == 0), when cpu_get() finds no such CPU, or
 * when that CPU is not online.
 */
cpu_t *
mac_soft_ring_bind(mac_soft_ring_t *ringp, processorid_t cpuid)
{
	cpu_t		*target;
	boolean_t	rebinding;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (mac_soft_ring_thread_bind == 0) {
		DTRACE_PROBE1(mac__soft__ring__no__cpu__bound,
		    mac_soft_ring_t *, ringp);
		return (NULL);
	}

	target = cpu_get(cpuid);
	if (target == NULL)
		return (NULL);
	if (!cpu_is_online(target))
		return (NULL);

	/*
	 * Record the new binding under the ring lock and remember whether
	 * a previous binding (s_ring_cpuid != -1) has to be undone first.
	 */
	mutex_enter(&ringp->s_ring_lock);
	rebinding = (ringp->s_ring_cpuid != -1) ? B_TRUE : B_FALSE;
	ringp->s_ring_cpuid = cpuid;
	ringp->s_ring_state |= S_RING_BOUND;
	mutex_exit(&ringp->s_ring_lock);

	if (rebinding)
		thread_affinity_clear(ringp->s_ring_worker);

	DTRACE_PROBE2(mac__soft__ring__cpu__bound, mac_soft_ring_t *,
	    ringp, processorid_t, cpuid);

	thread_affinity_set(ringp->s_ring_worker, cpuid);

	return (target);
}
/*
 * Top level routine to direct suspend/resume of a domain.
 *
 * The routine runs start to finish across the suspend: everything before
 * the HYPERVISOR_suspend() call is the "going down" half, everything after
 * it is the "coming back" half executed once the domain is resumed.
 *
 * Outline:
 *   1. Refuse (warn and return) if xen_hypervisor_supports_solaris()
 *      reports that suspend is not supported.
 *   2. Suspend devices and xenbus, and look up the MFN backing xen_info.
 *   3. Take cpu_lock, bind to CPU 0, disable preemption, suspend the other
 *      CPUs (when ncpus > 1) and take ec_lock.
 *   4. Suspend event channels, grant tables and time, and disable
 *      interrupts.
 *   5. HYPERVISOR_suspend().
 *   6. Undo the above in reverse order, then resynchronise time, rebuild
 *      the MFN list and resume xenbus and devices.
 *
 * Hypercall failures and post-resume inconsistencies around the suspend
 * point are handled by HYPERVISOR_shutdown(SHUTDOWN_crash).
 */
void
xen_suspend_domain(void)
{
	extern void rtcsync(void);
	extern hrtime_t hres_last_tick;
	mfn_t start_info_mfn;
	ulong_t flags;
	pfn_t pfn;
	int i;

	/*
	 * Check that we are happy to suspend on this hypervisor.
	 * Nothing has been suspended or locked yet, so a plain return is
	 * sufficient here.
	 */
	if (xen_hypervisor_supports_solaris(XEN_SUSPEND_CHECK) == 0) {
		cpr_err(CE_WARN, "Cannot suspend on this hypervisor "
		    "version: v%lu.%lu%s, need at least version v3.0.4 or "
		    "-xvm based hypervisor", XENVER_CURRENT(xv_major),
		    XENVER_CURRENT(xv_minor), XENVER_CURRENT(xv_ver));
		return;
	}

	/*
	 * XXPV - Are we definitely OK to suspend by the time we've connected
	 * the handler?
	 */

	cpr_err(CE_NOTE, "Domain suspending for save/migrate");

	SUSPEND_DEBUG("xen_suspend_domain\n");

	/*
	 * suspend interrupts and devices
	 * XXPV - we use suspend/resume for both save/restore domains (like sun
	 * cpr) and for migration.  Would be nice to know the difference if
	 * possible.  For save/restore where down time may be a long time, we
	 * may want to do more of the things that cpr does.  (i.e. notify user
	 * processes, shrink memory footprint for faster restore, etc.)
	 */
	xen_suspend_devices();
	SUSPEND_DEBUG("xenbus_suspend\n");
	xenbus_suspend();

	/*
	 * Translate the kernel virtual address of xen_info into the machine
	 * frame number that is later handed to HYPERVISOR_suspend().
	 */
	pfn = hat_getpfnum(kas.a_hat, (caddr_t)xen_info);
	start_info_mfn = pfn_to_mfn(pfn);

	/*
	 * XXPV: cpu hotplug can hold this under a xenbus watch. Are we safe
	 * wrt xenbus being suspended here?
	 */
	mutex_enter(&cpu_lock);

	/*
	 * Suspend must be done on vcpu 0, as no context for other CPUs is
	 * saved.
	 *
	 * XXPV - add to taskq API ?
	 */
	thread_affinity_set(curthread, 0);
	kpreempt_disable();

	SUSPEND_DEBUG("xen_start_migrate\n");
	xen_start_migrate();
	if (ncpus > 1)
		suspend_cpus();

	/*
	 * We can grab the ec_lock as it's a spinlock with a high SPL. Hence
	 * any holder would have dropped it to get through suspend_cpus().
	 */
	mutex_enter(&ec_lock);

	/*
	 * From here on in, we can't take locks.
	 */
	SUSPEND_DEBUG("ec_suspend\n");
	ec_suspend();
	SUSPEND_DEBUG("gnttab_suspend\n");
	gnttab_suspend();

	/* Interrupts stay off until the matching intr_restore(flags) below. */
	flags = intr_clear();

	xpv_time_suspend();

	/*
	 * Currently, the hypervisor incorrectly fails to bring back
	 * powered-down VCPUs.  Thus we need to record any powered-down VCPUs
	 * to prevent any attempts to operate on them.  But we have to do this
	 * *after* the very first time we do ec_suspend().
	 *
	 * The scan starts at index 1; index 0 is the CPU this thread was
	 * bound to above.
	 */
	for (i = 1; i < ncpus; i++) {
		if (cpu[i] == NULL)
			continue;

		if (cpu_get_state(cpu[i]) == P_POWEROFF)
			CPUSET_ATOMIC_ADD(cpu_suspend_lost_set, i);
	}

	/*
	 * The dom0 save/migrate code doesn't automatically translate
	 * these into PFNs, but expects them to be, so we do it here.
	 * We don't use mfn_to_pfn() because so many OS services have
	 * been disabled at this point.
	 */
	xen_info->store_mfn = mfn_to_pfn_mapping[xen_info->store_mfn];
	xen_info->console.domU.mfn =
	    mfn_to_pfn_mapping[xen_info->console.domU.mfn];

	/*
	 * Sanity check: the event channel upcall mask is expected to be
	 * non-zero at this point; crash the domain otherwise.
	 */
	if (CPU->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask == 0) {
		prom_printf("xen_suspend_domain(): "
		    "CPU->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask not set\n");
		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
	}

	/*
	 * Install an empty (0) mapping at the shared info virtual address
	 * before suspending; it is re-established after the resume.
	 */
	if (HYPERVISOR_update_va_mapping((uintptr_t)HYPERVISOR_shared_info,
	    0, UVMF_INVLPG)) {
		prom_printf("xen_suspend_domain(): "
		    "HYPERVISOR_update_va_mapping() failed\n");
		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
	}

	SUSPEND_DEBUG("HYPERVISOR_suspend\n");

	/*
	 * At this point we suspend and sometime later resume.
	 */
	if (HYPERVISOR_suspend(start_info_mfn)) {
		prom_printf("xen_suspend_domain(): "
		    "HYPERVISOR_suspend() failed\n");
		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
	}

	/*
	 * Point HYPERVISOR_shared_info to its new value.
	 */
	if (HYPERVISOR_update_va_mapping((uintptr_t)HYPERVISOR_shared_info,
	    xen_info->shared_info | PT_NOCONSIST | PT_VALID | PT_WRITABLE,
	    UVMF_INVLPG))
		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);

	/*
	 * The page count reported after the resume must match mfn_count;
	 * a mismatch is treated as fatal.
	 */
	if (xen_info->nr_pages != mfn_count) {
		prom_printf("xen_suspend_domain(): number of pages"
		    " changed, was 0x%lx, now 0x%lx\n", mfn_count,
		    xen_info->nr_pages);
		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
	}

	xpv_time_resume();

	/* NOTE(review): presumably invalidates a cached maximum MFN - confirm. */
	cached_max_mfn = 0;

	SUSPEND_DEBUG("gnttab_resume\n");
	gnttab_resume();

	/* XXPV: add a note that this must be lockless. */
	SUSPEND_DEBUG("ec_resume\n");
	ec_resume();

	intr_restore(flags);

	if (ncpus > 1)
		resume_cpus();

	mutex_exit(&ec_lock);
	xen_end_migrate();
	mutex_exit(&cpu_lock);

	/*
	 * Now we can take locks again.
	 */

	/*
	 * Force the tick value used for tv_nsec in hres_tick() to be up to
	 * date. rtcsync() will reset the hrestime value appropriately.
	 */
	hres_last_tick = xpv_gethrtime();

	/*
	 * XXPV: we need to have resumed the CPUs since this takes locks, but
	 * can remote CPUs see bad state? Presumably yes. Should probably nest
	 * taking of todlock inside of cpu_lock, or vice versa, then provide an
	 * unlocked version.  Probably need to call clkinitf to reset cpu freq
	 * and re-calibrate if we migrated to a different speed cpu.  Also need
	 * to make a (re)init_cpu_info call to update processor info structs
	 * and device tree info.  That remains to be written at the moment.
	 */
	rtcsync();

	rebuild_mfn_list();

	SUSPEND_DEBUG("xenbus_resume\n");
	xenbus_resume();
	SUSPEND_DEBUG("xenbus_resume_devices\n");
	xen_resume_devices();

	/* Undo the CPU 0 binding and preemption disable taken before suspend. */
	thread_affinity_clear(curthread);
	kpreempt_enable();

	SUSPEND_DEBUG("finished xen_suspend_domain\n");

	/*
	 * We have restarted our suspended domain, update the hypervisor
	 * details. NB: This must be done at the end of this function,
	 * since we need the domain to be completely resumed before
	 * these functions will work correctly.
	 */
	xen_set_version(XENVER_CURRENT_IDX);

	/*
	 * We can check and report a warning, but we don't stop the
	 * process.
	 */
	if (xen_hypervisor_supports_solaris(XEN_SUSPEND_CHECK) == 0)
		cmn_err(CE_WARN, "Found hypervisor version: v%lu.%lu%s "
		    "but need at least version v3.0.4",
		    XENVER_CURRENT(xv_major), XENVER_CURRENT(xv_minor),
		    XENVER_CURRENT(xv_ver));

	cmn_err(CE_NOTE, "domain restore/migrate completed");
}
kern_return_t thread_policy_set_internal( thread_t thread, thread_policy_flavor_t flavor, thread_policy_t policy_info, mach_msg_type_number_t count) { kern_return_t result = KERN_SUCCESS; spl_t s; thread_mtx_lock(thread); if (!thread->active) { thread_mtx_unlock(thread); return (KERN_TERMINATED); } switch (flavor) { case THREAD_EXTENDED_POLICY: { boolean_t timeshare = TRUE; if (count >= THREAD_EXTENDED_POLICY_COUNT) { thread_extended_policy_t info; info = (thread_extended_policy_t)policy_info; timeshare = info->timeshare; } sched_mode_t mode = (timeshare == TRUE) ? TH_MODE_TIMESHARE : TH_MODE_FIXED; s = splsched(); thread_lock(thread); thread_set_user_sched_mode_and_recompute_pri(thread, mode); thread_unlock(thread); splx(s); sfi_reevaluate(thread); break; } case THREAD_TIME_CONSTRAINT_POLICY: { thread_time_constraint_policy_t info; if (count < THREAD_TIME_CONSTRAINT_POLICY_COUNT) { result = KERN_INVALID_ARGUMENT; break; } info = (thread_time_constraint_policy_t)policy_info; if ( info->constraint < info->computation || info->computation > max_rt_quantum || info->computation < min_rt_quantum ) { result = KERN_INVALID_ARGUMENT; break; } s = splsched(); thread_lock(thread); thread->realtime.period = info->period; thread->realtime.computation = info->computation; thread->realtime.constraint = info->constraint; thread->realtime.preemptible = info->preemptible; thread_set_user_sched_mode_and_recompute_pri(thread, TH_MODE_REALTIME); thread_unlock(thread); splx(s); sfi_reevaluate(thread); break; } case THREAD_PRECEDENCE_POLICY: { thread_precedence_policy_t info; if (count < THREAD_PRECEDENCE_POLICY_COUNT) { result = KERN_INVALID_ARGUMENT; break; } info = (thread_precedence_policy_t)policy_info; s = splsched(); thread_lock(thread); thread->importance = info->importance; thread_recompute_priority(thread); thread_unlock(thread); splx(s); break; } case THREAD_AFFINITY_POLICY: { thread_affinity_policy_t info; if (!thread_affinity_is_supported()) { result = KERN_NOT_SUPPORTED; 
break; } if (count < THREAD_AFFINITY_POLICY_COUNT) { result = KERN_INVALID_ARGUMENT; break; } info = (thread_affinity_policy_t) policy_info; /* * Unlock the thread mutex here and * return directly after calling thread_affinity_set(). * This is necessary for correct lock ordering because * thread_affinity_set() takes the task lock. */ thread_mtx_unlock(thread); return thread_affinity_set(thread, info->affinity_tag); } case THREAD_THROUGHPUT_QOS_POLICY: { thread_throughput_qos_policy_t info = (thread_throughput_qos_policy_t) policy_info; int tqos; if (count < THREAD_LATENCY_QOS_POLICY_COUNT) { result = KERN_INVALID_ARGUMENT; break; } if ((result = qos_throughput_policy_validate(info->thread_throughput_qos_tier)) != KERN_SUCCESS) { break; } tqos = qos_extract(info->thread_throughput_qos_tier); thread->effective_policy.t_through_qos = tqos; } break; case THREAD_LATENCY_QOS_POLICY: { thread_latency_qos_policy_t info = (thread_latency_qos_policy_t) policy_info; int lqos; if (count < THREAD_THROUGHPUT_QOS_POLICY_COUNT) { result = KERN_INVALID_ARGUMENT; break; } if ((result = qos_latency_policy_validate(info->thread_latency_qos_tier)) != KERN_SUCCESS) { break; } lqos = qos_extract(info->thread_latency_qos_tier); /* The expected use cases (opt-in) of per-thread latency QoS would seem to * preclude any requirement at present to re-evaluate timers on a thread level * latency QoS change. 
*/ thread->effective_policy.t_latency_qos = lqos; } break; case THREAD_QOS_POLICY: case THREAD_QOS_POLICY_OVERRIDE: { thread_qos_policy_t info = (thread_qos_policy_t)policy_info; if (count < THREAD_QOS_POLICY_COUNT) { result = KERN_INVALID_ARGUMENT; break; } if (info->qos_tier < 0 || info->qos_tier >= THREAD_QOS_LAST) { result = KERN_INVALID_ARGUMENT; break; } if (info->tier_importance > 0 || info->tier_importance < THREAD_QOS_MIN_TIER_IMPORTANCE) { result = KERN_INVALID_ARGUMENT; break; } if (info->qos_tier == THREAD_QOS_UNSPECIFIED && info->tier_importance != 0) { result = KERN_INVALID_ARGUMENT; break; } /* * Going into task policy requires the task mutex, * because of the way synchronization against the IO policy * subsystem works. * * We need to move thread policy to the thread mutex instead. * <rdar://problem/15831652> separate thread policy from task policy */ if (flavor == THREAD_QOS_POLICY_OVERRIDE) { int strongest_override = info->qos_tier; if (info->qos_tier != THREAD_QOS_UNSPECIFIED && thread->requested_policy.thrp_qos_override != THREAD_QOS_UNSPECIFIED) strongest_override = MAX(thread->requested_policy.thrp_qos_override, info->qos_tier); thread_mtx_unlock(thread); /* There is a race here. To be closed in <rdar://problem/15831652> separate thread policy from task policy */ proc_set_task_policy(thread->task, thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS_OVERRIDE, strongest_override); return (result); } thread_mtx_unlock(thread); proc_set_task_policy2(thread->task, thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS_AND_RELPRIO, info->qos_tier, -info->tier_importance); thread_mtx_lock(thread); if (!thread->active) { thread_mtx_unlock(thread); return (KERN_TERMINATED); } break; } default: result = KERN_INVALID_ARGUMENT; break; } thread_mtx_unlock(thread); return (result); }
kern_return_t thread_policy_set( thread_t thread, thread_policy_flavor_t flavor, thread_policy_t policy_info, mach_msg_type_number_t count) { kern_return_t result = KERN_SUCCESS; spl_t s; if (thread == THREAD_NULL) return (KERN_INVALID_ARGUMENT); thread_mtx_lock(thread); if (!thread->active) { thread_mtx_unlock(thread); return (KERN_TERMINATED); } if (thread->static_param) { thread_mtx_unlock(thread); return (KERN_SUCCESS); } switch (flavor) { case THREAD_EXTENDED_POLICY: { boolean_t timeshare = TRUE; if (count >= THREAD_EXTENDED_POLICY_COUNT) { thread_extended_policy_t info; info = (thread_extended_policy_t)policy_info; timeshare = info->timeshare; } s = splsched(); thread_lock(thread); if (!(thread->sched_mode & TH_MODE_FAILSAFE)) { integer_t oldmode = (thread->sched_mode & TH_MODE_TIMESHARE); thread->sched_mode &= ~TH_MODE_REALTIME; if (timeshare && !oldmode) { thread->sched_mode |= TH_MODE_TIMESHARE; if ((thread->state & (TH_RUN|TH_IDLE)) == TH_RUN) sched_share_incr(); } else if (!timeshare && oldmode) { thread->sched_mode &= ~TH_MODE_TIMESHARE; if ((thread->state & (TH_RUN|TH_IDLE)) == TH_RUN) sched_share_decr(); } thread_recompute_priority(thread); } else { thread->safe_mode &= ~TH_MODE_REALTIME; if (timeshare) thread->safe_mode |= TH_MODE_TIMESHARE; else thread->safe_mode &= ~TH_MODE_TIMESHARE; } thread_unlock(thread); splx(s); break; } case THREAD_TIME_CONSTRAINT_POLICY: { thread_time_constraint_policy_t info; if (count < THREAD_TIME_CONSTRAINT_POLICY_COUNT) { result = KERN_INVALID_ARGUMENT; break; } info = (thread_time_constraint_policy_t)policy_info; if ( info->constraint < info->computation || info->computation > max_rt_quantum || info->computation < min_rt_quantum ) { result = KERN_INVALID_ARGUMENT; break; } s = splsched(); thread_lock(thread); thread->realtime.period = info->period; thread->realtime.computation = info->computation; thread->realtime.constraint = info->constraint; thread->realtime.preemptible = info->preemptible; if (!(thread->sched_mode 
& TH_MODE_FAILSAFE)) { if (thread->sched_mode & TH_MODE_TIMESHARE) { thread->sched_mode &= ~TH_MODE_TIMESHARE; if ((thread->state & (TH_RUN|TH_IDLE)) == TH_RUN) sched_share_decr(); } thread->sched_mode |= TH_MODE_REALTIME; thread_recompute_priority(thread); } else { thread->safe_mode &= ~TH_MODE_TIMESHARE; thread->safe_mode |= TH_MODE_REALTIME; } thread_unlock(thread); splx(s); break; } case THREAD_PRECEDENCE_POLICY: { thread_precedence_policy_t info; if (count < THREAD_PRECEDENCE_POLICY_COUNT) { result = KERN_INVALID_ARGUMENT; break; } info = (thread_precedence_policy_t)policy_info; s = splsched(); thread_lock(thread); thread->importance = info->importance; thread_recompute_priority(thread); thread_unlock(thread); splx(s); break; } case THREAD_AFFINITY_POLICY: { thread_affinity_policy_t info; if (!thread_affinity_is_supported()) { result = KERN_NOT_SUPPORTED; break; } if (count < THREAD_AFFINITY_POLICY_COUNT) { result = KERN_INVALID_ARGUMENT; break; } info = (thread_affinity_policy_t) policy_info; /* * Unlock the thread mutex here and * return directly after calling thread_affinity_set(). * This is necessary for correct lock ordering because * thread_affinity_set() takes the task lock. */ thread_mtx_unlock(thread); return thread_affinity_set(thread, info->affinity_tag); } default: result = KERN_INVALID_ARGUMENT; break; } thread_mtx_unlock(thread); return (result); }
/*
 * Top level routine to direct suspend/resume of a domain.
 *
 * The routine runs start to finish across the suspend: everything before
 * the HYPERVISOR_shutdown(SHUTDOWN_suspend) call is the "going down" half,
 * everything after it is the "coming back" half.
 *
 * Outline:
 *   1. Locate the xpvd nexus (if not already known) and suspend its
 *      children, then suspend xenbus.
 *   2. Take cpu_lock, bind to CPU 0, disable preemption, pause the other
 *      CPUs (when ncpus > 1), take ec_lock and disable interrupts.
 *   3. HYPERVISOR_shutdown(SHUTDOWN_suspend).
 *   4. Re-register the shared info page, resume grant tables and event
 *      channels, then undo step 2 in reverse order.
 *   5. Resynchronise time and resume xenbus and the PV devices.
 *
 * Failure to re-register the shared info page panics the system.
 */
void
xen_suspend_domain(void)
{
	extern void rtcsync(void);
	extern void ec_resume(void);
	extern kmutex_t ec_lock;
	struct xen_add_to_physmap xatp;
	ulong_t flags;
	int err;

	cmn_err(CE_NOTE, "Domain suspending for save/migrate");

	SUSPEND_DEBUG("xen_suspend_domain\n");

	/*
	 * We only want to suspend the PV devices, since the emulated devices
	 * are suspended by saving the emulated device state.  The PV devices
	 * are all children of the xpvd nexus device.  So we search the
	 * device tree for the xpvd node to use as the root of the tree to
	 * be suspended.
	 *
	 * NOTE(review): check_xpvd is presumably what sets xpvd_dip during
	 * the walk - confirm against its definition.
	 */
	if (xpvd_dip == NULL)
		ddi_walk_devs(ddi_root_node(), check_xpvd, NULL);

	/*
	 * suspend interrupts and devices
	 */
	if (xpvd_dip != NULL)
		(void) xen_suspend_devices(ddi_get_child(xpvd_dip));
	else
		cmn_err(CE_WARN, "No PV devices found to suspend");
	SUSPEND_DEBUG("xenbus_suspend\n");
	xenbus_suspend();

	mutex_enter(&cpu_lock);

	/*
	 * Suspend on vcpu 0
	 */
	thread_affinity_set(curthread, 0);
	kpreempt_disable();

	if (ncpus > 1)
		pause_cpus(NULL, NULL);
	/*
	 * We can grab the ec_lock as it's a spinlock with a high SPL. Hence
	 * any holder would have dropped it to get through pause_cpus().
	 */
	mutex_enter(&ec_lock);

	/*
	 * From here on in, we can't take locks.
	 */

	/* Interrupts stay off until the matching intr_restore(flags) below. */
	flags = intr_clear();

	SUSPEND_DEBUG("HYPERVISOR_suspend\n");
	/*
	 * At this point we suspend and sometime later resume.
	 * Note that this call may return with an indication of a cancelled
	 * suspend; for now, no matter what the return value is, we do a full
	 * resume of all suspended drivers, etc.
	 */
	(void) HYPERVISOR_shutdown(SHUTDOWN_suspend);

	/*
	 * Point HYPERVISOR_shared_info to the proper place.
	 * The shared info page is (re)added to the physmap at the frame
	 * recorded in xen_shared_info_frame.
	 */
	xatp.domid = DOMID_SELF;
	xatp.idx = 0;
	xatp.space = XENMAPSPACE_shared_info;
	xatp.gpfn = xen_shared_info_frame;
	if ((err = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) != 0)
		panic("Could not set shared_info page. error: %d", err);

	SUSPEND_DEBUG("gnttab_resume\n");
	gnttab_resume();

	SUSPEND_DEBUG("ec_resume\n");
	ec_resume();

	intr_restore(flags);

	if (ncpus > 1)
		start_cpus();

	mutex_exit(&ec_lock);
	mutex_exit(&cpu_lock);

	/*
	 * Now we can take locks again.
	 */

	rtcsync();

	SUSPEND_DEBUG("xenbus_resume\n");
	xenbus_resume();
	SUSPEND_DEBUG("xen_resume_devices\n");
	if (xpvd_dip != NULL)
		(void) xen_resume_devices(ddi_get_child(xpvd_dip), 0);

	/* Undo the CPU 0 binding and preemption disable taken before suspend. */
	thread_affinity_clear(curthread);
	kpreempt_enable();

	SUSPEND_DEBUG("finished xen_suspend_domain\n");

	cmn_err(CE_NOTE, "domain restore/migrate completed");
}