static void
cpu_idle_stop(cpu_t *cp)
{
    cpupm_mach_state_t *mach_state =
        (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
    cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
    cpu_acpi_cstate_t *cstate;
    uint_t cpu_max_cstates, i;

    /*
     * Put the CPUs in a safe idle routine so that we can disable
     * deep c-states on them.
     */
    pause_cpus(NULL);
    cp->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
    start_cpus();

    cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
    if (cstate) {
        cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
        for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
            if (cstate->cs_ksp != NULL)
                kstat_delete(cstate->cs_ksp);
            cstate++;
        }
    }
    cpupm_free_ms_cstate(cp);
    cpupm_remove_domains(cp, CPUPM_C_STATES, &cpupm_cstate_domains);
    cpu_acpi_free_cstate_data(handle);
}
/*
 * Flush inflight message buffers.
 */
int
xc_flush_cpu(struct cpu *cpup)
{
    int i;

    ASSERT((cpup->cpu_flags & CPU_READY) == 0);

    /*
     * Pause all working CPUs, which ensures that there's no CPU in
     * function xc_common().
     * This is used to work around a race condition window in xc_common()
     * between checking CPU_READY flag and increasing working item count.
     */
    pause_cpus(cpup);
    start_cpus();

    for (i = 0; i < XC_FLUSH_MAX_WAITS; i++) {
        if (cpup->cpu_m.xc_work_cnt == 0) {
            break;
        }
        DELAY(1);
    }
    for (; i < XC_FLUSH_MAX_WAITS; i++) {
        if (!BT_TEST(xc_priority_set, cpup->cpu_id)) {
            break;
        }
        DELAY(1);
    }

    return (i >= XC_FLUSH_MAX_WAITS ? ETIME : 0);
}
/*
 * This routine is a special form of pause_cpus().  It ensures that
 * prom functions are callable while the cpus are paused.
 */
void
promsafe_pause_cpus(void)
{
    pause_cpus(NULL);

    /* If some other cpu is entering or is in the prom, spin */
    while (prom_cpu || mutex_owner(&prom_mutex)) {

        start_cpus();
        mutex_enter(&prom_mutex);

        /* Wait for other cpu to exit prom */
        while (prom_cpu)
            cv_wait(&prom_cv, &prom_mutex);

        mutex_exit(&prom_mutex);
        pause_cpus(NULL);
    }

    /* At this point all cpus are paused and none are in the prom */
}
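A minimal, hypothetical caller sketch (not from the source tree) illustrating how promsafe_pause_cpus() is intended to be used: on return, every other CPU is paused and none is in the PROM, so PROM services can be called safely until start_cpus() resumes them.

static void
example_prom_safe_call(void)
{
    /* Hypothetical helper, for illustration only. */
    promsafe_pause_cpus();

    /* All other cpus are paused and none are in the prom. */
    prom_printf("prom services are safe to call here\n");

    start_cpus();
}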
void
mp_enter_barrier(void)
{
    hrtime_t last_poke_time = 0;
    int poke_allowed = 0;
    int done = 0;
    int i;

    ASSERT(MUTEX_HELD(&cpu_lock));

    pause_cpus(NULL);

    while (!done) {
        done = 1;
        poke_allowed = 0;

        if (xpv_gethrtime() - last_poke_time > POKE_TIMEOUT) {
            last_poke_time = xpv_gethrtime();
            poke_allowed = 1;
        }

        for (i = 0; i < NCPU; i++) {
            cpu_t *cp = cpu_get(i);

            if (cp == NULL || cp == CPU)
                continue;

            switch (cpu_phase[i]) {
            case CPU_PHASE_NONE:
                cpu_phase[i] = CPU_PHASE_WAIT_SAFE;
                poke_cpu(i);
                done = 0;
                break;

            case CPU_PHASE_WAIT_SAFE:
                if (poke_allowed)
                    poke_cpu(i);
                done = 0;
                break;

            case CPU_PHASE_SAFE:
            case CPU_PHASE_POWERED_OFF:
                break;
            }
        }

        SMT_PAUSE();
    }
}
/*ARGSUSED*/
void
mdboot(int cmd, int fcn, char *mdep, boolean_t invoke_cb)
{
    processorid_t bootcpuid = 0;
    static int is_first_quiesce = 1;
    static int is_first_reset = 1;
    int reset_status = 0;
    static char fallback_str[] = "Falling back to regular reboot.\n";

    if (fcn == AD_FASTREBOOT && !newkernel.fi_valid)
        fcn = AD_BOOT;

    if (!panicstr) {
        kpreempt_disable();
        if (fcn == AD_FASTREBOOT) {
            mutex_enter(&cpu_lock);
            if (CPU_ACTIVE(cpu_get(bootcpuid))) {
                affinity_set(bootcpuid);
            }
            mutex_exit(&cpu_lock);
        } else {
            affinity_set(CPU_CURRENT);
        }
    }

    if (force_shutdown_method != AD_UNKNOWN)
        fcn = force_shutdown_method;

    /*
     * XXX - rconsvp is set to NULL to ensure that output messages
     * are sent to the underlying "hardware" device using the
     * monitor's printf routine since we are in the process of
     * either rebooting or halting the machine.
     */
    rconsvp = NULL;

    /*
     * Print the reboot message now, before pausing other cpus.
     * There is a race condition in the printing support that
     * can deadlock multiprocessor machines.
     */
    if (!(fcn == AD_HALT || fcn == AD_POWEROFF))
        prom_printf("rebooting...\n");

    if (IN_XPV_PANIC())
        reset();

    /*
     * We can't bring up the console from above lock level, so do it now.
     */
    pm_cfb_check_and_powerup();

    /* make sure there are no more changes to the device tree */
    devtree_freeze();

    if (invoke_cb)
        (void) callb_execute_class(CB_CL_MDBOOT, NULL);

    /*
     * Clear any unresolved UEs from memory.
     */
    page_retire_mdboot();

#if defined(__xpv)
    /*
     * XXPV  Should probably think some more about how we deal
     *       with panicking before it's really safe to panic.
     *       On hypervisors, we reboot very quickly.  Perhaps panic
     *       should only attempt to recover by rebooting if,
     *       say, we were able to mount the root filesystem,
     *       or if we successfully launched init(1m).
     */
    if (panicstr && proc_init == NULL)
        (void) HYPERVISOR_shutdown(SHUTDOWN_poweroff);
#endif

    /*
     * stop other cpus and raise our priority.  since there is only
     * one active cpu after this, and our priority will be too high
     * for us to be preempted, we're essentially single threaded
     * from here on out.
     */
    (void) spl6();
    if (!panicstr) {
        mutex_enter(&cpu_lock);
        pause_cpus(NULL, NULL);
        mutex_exit(&cpu_lock);
    }

    /*
     * If the system is panicking, the preloaded kernel is valid,
     * fastreboot_onpanic has been set, and the system has been up for
     * longer than fastreboot_onpanic_uptime (default 10 minutes),
     * choose Fast Reboot.
     */
    if (fcn == AD_BOOT && panicstr && newkernel.fi_valid &&
        fastreboot_onpanic &&
        (panic_lbolt - lbolt_at_boot) > fastreboot_onpanic_uptime) {
        fcn = AD_FASTREBOOT;
    }

    /*
     * Try to quiesce devices.
     */
    if (is_first_quiesce) {
        /*
         * Clear is_first_quiesce before calling quiesce_devices()
         * so that if quiesce_devices() causes panics, it will not
         * be invoked again.
         */
        is_first_quiesce = 0;

        quiesce_active = 1;
        quiesce_devices(ddi_root_node(), &reset_status);
        if (reset_status == -1) {
            if (fcn == AD_FASTREBOOT && !force_fastreboot) {
                prom_printf("Driver(s) not capable of fast "
                    "reboot.\n");
                prom_printf(fallback_str);
                fastreboot_capable = 0;
                fcn = AD_BOOT;
            } else if (fcn != AD_FASTREBOOT)
                fastreboot_capable = 0;
        }
        quiesce_active = 0;
    }

    /*
     * Try to reset devices.  reset_leaves() should only be called
     * a) when there are no other threads that could be accessing devices,
     *    and
     * b) on a system that's not capable of fast reboot (fastreboot_capable
     *    being 0), or on a system where quiesce_devices() failed to
     *    complete (quiesce_active being 1).
     */
    if (is_first_reset && (!fastreboot_capable || quiesce_active)) {
        /*
         * Clear is_first_reset before calling reset_devices()
         * so that if reset_devices() causes panics, it will not
         * be invoked again.
         */
        is_first_reset = 0;
        reset_leaves();
    }

    /* Verify newkernel checksum */
    if (fastreboot_capable && fcn == AD_FASTREBOOT &&
        fastboot_cksum_verify(&newkernel) != 0) {
        fastreboot_capable = 0;
        prom_printf("Fast reboot: checksum failed for the new "
            "kernel.\n");
        prom_printf(fallback_str);
    }

    (void) spl8();

    if (fastreboot_capable && fcn == AD_FASTREBOOT) {
        /*
         * psm_shutdown is called within fast_reboot()
         */
        fast_reboot();
    } else {
        (*psm_shutdownf)(cmd, fcn);

        if (fcn == AD_HALT || fcn == AD_POWEROFF)
            halt((char *)NULL);
        else
            prom_reboot("");
    }
    /*NOTREACHED*/
}
/*
 * Promote PG above its current parent.
 * This is only legal if PG has an equal or greater number of CPUs than its
 * parent.
 *
 * This routine operates on the CPU specific processor group data (for the
 * CPUs in the PG being promoted), and may be invoked from a context where
 * one CPU's PG data is under construction. In this case the argument
 * "pgdata", if not NULL, is a reference to the CPU's under-construction
 * PG data.
 */
static void
cmt_hier_promote(pg_cmt_t *pg, cpu_pg_t *pgdata)
{
    pg_cmt_t *parent;
    group_t *children;
    cpu_t *cpu;
    group_iter_t iter;
    pg_cpu_itr_t cpu_iter;
    int r;
    int err;

    ASSERT(MUTEX_HELD(&cpu_lock));

    parent = pg->cmt_parent;
    if (parent == NULL) {
        /*
         * Nothing to do
         */
        return;
    }

    ASSERT(PG_NUM_CPUS((pg_t *)pg) >= PG_NUM_CPUS((pg_t *)parent));

    /*
     * We're changing around the hierarchy, which is actively traversed
     * by the dispatcher. Pause CPUs to ensure exclusivity.
     */
    pause_cpus(NULL);

    /*
     * If necessary, update the parent's sibling set, replacing parent
     * with PG.
     */
    if (parent->cmt_siblings) {
        if (group_remove(parent->cmt_siblings, parent, GRP_NORESIZE)
            != -1) {
            r = group_add(parent->cmt_siblings, pg, GRP_NORESIZE);
            ASSERT(r != -1);
        }
    }

    /*
     * If the parent is at the top of the hierarchy, replace its entry
     * in the root lgroup's group of top level PGs.
     */
    if (parent->cmt_parent == NULL &&
        parent->cmt_siblings != &cmt_root->cl_pgs) {
        if (group_remove(&cmt_root->cl_pgs, parent, GRP_NORESIZE)
            != -1) {
            r = group_add(&cmt_root->cl_pgs, pg, GRP_NORESIZE);
            ASSERT(r != -1);
        }
    }

    /*
     * We assume (and therefore assert) that the PG being promoted is an
     * only child of its parent. Update the parent's children set,
     * replacing PG's entry with the parent (since the parent is becoming
     * the child). Then have PG and the parent swap children sets.
     */
    ASSERT(GROUP_SIZE(parent->cmt_children) <= 1);
    if (group_remove(parent->cmt_children, pg, GRP_NORESIZE) != -1) {
        r = group_add(parent->cmt_children, parent, GRP_NORESIZE);
        ASSERT(r != -1);
    }

    children = pg->cmt_children;
    pg->cmt_children = parent->cmt_children;
    parent->cmt_children = children;

    /*
     * Update the sibling references for PG and its parent
     */
    pg->cmt_siblings = parent->cmt_siblings;
    parent->cmt_siblings = pg->cmt_children;

    /*
     * Update any cached lineages in the per CPU pg data.
     */
    PG_CPU_ITR_INIT(pg, cpu_iter);
    while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
        int idx;
        pg_cmt_t *cpu_pg;
        cpu_pg_t *pgd;  /* CPU's PG data */

        /*
         * The CPU whose lineage is under construction still
         * references the bootstrap CPU PG data structure.
         */
        if (pg_cpu_is_bootstrapped(cpu))
            pgd = pgdata;
        else
            pgd = cpu->cpu_pg;

        /*
         * Iterate over the CPU's PGs updating the children
         * of the PG being promoted, since they have a new parent.
         */
        group_iter_init(&iter);
        while ((cpu_pg = group_iterate(&pgd->cmt_pgs, &iter)) != NULL) {
            if (cpu_pg->cmt_parent == pg) {
                cpu_pg->cmt_parent = parent;
            }
        }

        /*
         * Update the CMT load balancing lineage
         */
        if ((idx = group_find(&pgd->cmt_pgs, (void *)pg)) == -1) {
            /*
             * Unless this is the CPU whose lineage is being
             * constructed, the PG being promoted should be
             * in the lineage.
             */
            ASSERT(pg_cpu_is_bootstrapped(cpu));
            continue;
        }

        ASSERT(idx > 0);
        ASSERT(GROUP_ACCESS(&pgd->cmt_pgs, idx - 1) == parent);

        /*
         * Have the child and the parent swap places in the CPU's
         * lineage
         */
        group_remove_at(&pgd->cmt_pgs, idx);
        group_remove_at(&pgd->cmt_pgs, idx - 1);
        err = group_add_at(&pgd->cmt_pgs, parent, idx);
        ASSERT(err == 0);
        err = group_add_at(&pgd->cmt_pgs, pg, idx - 1);
        ASSERT(err == 0);
    }

    /*
     * Update the parent references for PG and its parent
     */
    pg->cmt_parent = parent->cmt_parent;
    parent->cmt_parent = pg;

    start_cpus();
}
int
dr_suspend(dr_sr_handle_t *srh)
{
    dr_handle_t *handle;
    int force;
    int dev_errs_idx;
    uint64_t dev_errs[DR_MAX_ERR_INT];
    int rc = DDI_SUCCESS;

    handle = srh->sr_dr_handlep;

    force = dr_cmd_flags(handle) & SBD_FLAG_FORCE;

    /*
     * update the signature block
     */
    CPU_SIGNATURE(OS_SIG, SIGST_QUIESCE_INPROGRESS, SIGSUBST_NULL,
        CPU->cpu_id);

    i_ndi_block_device_tree_changes(&handle->h_ndi);

    prom_printf("\nDR: suspending user threads...\n");
    srh->sr_suspend_state = DR_SRSTATE_USER;
    if (((rc = dr_stop_user_threads(srh)) != DDI_SUCCESS) &&
        dr_check_user_stop_result) {
        dr_resume(srh);
        return (rc);
    }

    if (!force) {
        struct dr_ref drc = {0};

        prom_printf("\nDR: checking devices...\n");
        dev_errs_idx = 0;

        drc.arr = dev_errs;
        drc.idx = &dev_errs_idx;
        drc.len = DR_MAX_ERR_INT;

        /*
         * Since the root node can never go away, it
         * doesn't have to be held.
         */
        ddi_walk_devs(ddi_root_node(), dr_check_unsafe_major, &drc);
        if (dev_errs_idx) {
            handle->h_err = drerr_int(ESBD_UNSAFE, dev_errs,
                dev_errs_idx, 1);
            dr_resume(srh);
            return (DDI_FAILURE);
        }
        PR_QR("done\n");
    } else {
        prom_printf("\nDR: dr_suspend invoked with force flag\n");
    }

#ifndef SKIP_SYNC
    /*
     * This sync swaps out all user pages
     */
    vfs_sync(SYNC_ALL);
#endif

    /*
     * special treatment for lock manager
     */
    lm_cprsuspend();

#ifndef SKIP_SYNC
    /*
     * sync the file system in case we never make it back
     */
    sync();
#endif

    /*
     * now suspend drivers
     */
    prom_printf("DR: suspending drivers...\n");
    srh->sr_suspend_state = DR_SRSTATE_DRIVER;
    srh->sr_err_idx = 0;
    /* No parent to hold busy */
    if ((rc = dr_suspend_devices(ddi_root_node(), srh)) != DDI_SUCCESS) {
        if (srh->sr_err_idx && srh->sr_dr_handlep) {
            (srh->sr_dr_handlep)->h_err = drerr_int(ESBD_SUSPEND,
                srh->sr_err_ints, srh->sr_err_idx, 1);
        }
        dr_resume(srh);
        return (rc);
    }

    drmach_suspend_last();

    /*
     * finally, grab all cpus
     */
    srh->sr_suspend_state = DR_SRSTATE_FULL;

    /*
     * if watchdog was activated, disable it
     */
    if (watchdog_activated) {
        mutex_enter(&tod_lock);
        tod_ops.tod_clear_watchdog_timer();
        mutex_exit(&tod_lock);
        srh->sr_flags |= SR_FLAG_WATCHDOG;
    } else {
        srh->sr_flags &= ~(SR_FLAG_WATCHDOG);
    }

    /*
     * Update the signature block.
     * This must be done before cpus are paused, since on Starcat the
     * cpu signature update acquires an adaptive mutex in the iosram
     * driver.  Blocking with cpus paused can lead to deadlock.
     */
    CPU_SIGNATURE(OS_SIG, SIGST_QUIESCED, SIGSUBST_NULL, CPU->cpu_id);

    mutex_enter(&cpu_lock);
    pause_cpus(NULL);
    dr_stop_intr();

    return (rc);
}
int
sbdp_suspend(sbdp_sr_handle_t *srh)
{
    int force;
    int rc = DDI_SUCCESS;

    force = (srh && (srh->sr_flags & SBDP_IOCTL_FLAG_FORCE));

    /*
     * if no force flag, check for unsafe drivers
     */
    if (force) {
        SBDP_DBG_QR("\nsbdp_suspend invoked with force flag");
    }

    /*
     * update the signature block
     */
    CPU_SIGNATURE(OS_SIG, SIGST_QUIESCE_INPROGRESS, SIGSUBST_NULL,
        CPU->cpu_id);

    /*
     * first, stop all user threads
     */
    SBDP_DBG_QR("SBDP: suspending user threads...\n");
    SR_SET_STATE(srh, SBDP_SRSTATE_USER);
    if (((rc = sbdp_stop_user_threads(srh)) != DDI_SUCCESS) &&
        sbdp_check_user_stop_result) {
        sbdp_resume(srh);
        return (rc);
    }

#ifndef SKIP_SYNC
    /*
     * This sync swaps out all user pages
     */
    vfs_sync(SYNC_ALL);
#endif

    /*
     * special treatment for lock manager
     */
    lm_cprsuspend();

#ifndef SKIP_SYNC
    /*
     * sync the file system in case we never make it back
     */
    sync();
#endif

    /*
     * now suspend drivers
     */
    SBDP_DBG_QR("SBDP: suspending drivers...\n");
    SR_SET_STATE(srh, SBDP_SRSTATE_DRIVER);

    /*
     * Root node doesn't have to be held in any way.
     */
    if ((rc = sbdp_suspend_devices(ddi_root_node(), srh)) != DDI_SUCCESS) {
        sbdp_resume(srh);
        return (rc);
    }

    /*
     * finally, grab all cpus
     */
    SR_SET_STATE(srh, SBDP_SRSTATE_FULL);

    /*
     * if watchdog was activated, disable it
     */
    if (watchdog_activated) {
        mutex_enter(&tod_lock);
        saved_watchdog_seconds = tod_ops.tod_clear_watchdog_timer();
        mutex_exit(&tod_lock);
        SR_SET_FLAG(srh, SR_FLAG_WATCHDOG);
    } else {
        SR_CLEAR_FLAG(srh, SR_FLAG_WATCHDOG);
    }

    mutex_enter(&cpu_lock);
    pause_cpus(NULL);
    sbdp_stop_intr();

    /*
     * update the signature block
     */
    CPU_SIGNATURE(OS_SIG, SIGST_QUIESCED, SIGSUBST_NULL, CPU->cpu_id);

    return (rc);
}
/*
 * launch slave cpus into kernel text, pause them,
 * and restore the original prom pages
 */
void
i_cpr_mp_setup(void)
{
    extern void restart_other_cpu(int);
    cpu_t *cp;

    uint64_t kctx = kcontextreg;

    /*
     * Do not allow setting page size codes in MMU primary context
     * register while using cif wrapper. This is needed to work
     * around OBP incorrect handling of this MMU register.
     */
    kcontextreg = 0;

    /*
     * reset cpu_ready_set so x_calls work properly
     */
    CPUSET_ZERO(cpu_ready_set);
    CPUSET_ADD(cpu_ready_set, getprocessorid());

    /*
     * setup cif to use the cookie from the new/tmp prom
     * and setup tmp handling for calling prom services.
     */
    i_cpr_cif_setup(CIF_SPLICE);

    /*
     * at this point, only the nucleus and a few cpr pages are
     * mapped in.  once we switch to the kernel trap table,
     * we can access the rest of kernel space.
     */
    prom_set_traptable(&trap_table);

    if (ncpus > 1) {
        sfmmu_init_tsbs();

        mutex_enter(&cpu_lock);
        /*
         * All of the slave cpus are not ready at this time,
         * yet the cpu structures have various cpu_flags set;
         * clear cpu_flags and mutex_ready.
         * Since we are coming up from a CPU suspend, the slave cpus
         * are frozen.
         */
        for (cp = CPU->cpu_next; cp != CPU; cp = cp->cpu_next) {
            cp->cpu_flags = CPU_FROZEN;
            cp->cpu_m.mutex_ready = 0;
        }

        for (cp = CPU->cpu_next; cp != CPU; cp = cp->cpu_next)
            restart_other_cpu(cp->cpu_id);

        pause_cpus(NULL, NULL);
        mutex_exit(&cpu_lock);

        i_cpr_xcall(i_cpr_clear_entries);
    } else
        i_cpr_clear_entries(0, 0);

    /*
     * now unlink the cif wrapper;  WARNING: do not call any
     * prom_xxx() routines until after prom pages are restored.
     */
    i_cpr_cif_setup(CIF_UNLINK);

    (void) i_cpr_prom_pages(CPR_PROM_RESTORE);

    /* allow setting page size codes in MMU primary context register */
    kcontextreg = kctx;
}
/*ARGSUSED*/
int
suspend_start(char *error_reason, size_t max_reason_len)
{
    uint64_t source_tick;
    uint64_t source_stick;
    uint64_t rv;
    timestruc_t source_tod;
    int spl;

    ASSERT(suspend_supported());
    DBG("suspend: %s", __func__);

    sfmmu_ctxdoms_lock();

    mutex_enter(&cpu_lock);

    /* Suspend the watchdog */
    watchdog_suspend();

    /* Record the TOD */
    mutex_enter(&tod_lock);
    source_tod = tod_get();
    mutex_exit(&tod_lock);

    /* Pause all other CPUs */
    pause_cpus(NULL);
    DBG_PROM("suspend: CPUs paused\n");

    /* Suspend cyclics */
    cyclic_suspend();
    DBG_PROM("suspend: cyclics suspended\n");

    /* Disable interrupts */
    spl = spl8();
    DBG_PROM("suspend: spl8()\n");

    source_tick = gettick_counter();
    source_stick = gettick();
    DBG_PROM("suspend: source_tick: 0x%lx\n", source_tick);
    DBG_PROM("suspend: source_stick: 0x%lx\n", source_stick);

    /*
     * Call into the HV to initiate the suspend. hv_guest_suspend()
     * returns after the guest has been resumed or if the suspend
     * operation failed or was cancelled. After a successful suspend,
     * the %tick and %stick registers may have changed by an amount
     * that is not proportional to the amount of time that has passed.
     * They may have jumped forwards or backwards. Some variation is
     * allowed and accounted for using suspend_tick_stick_max_delta,
     * but otherwise this jump must be uniform across all CPUs and we
     * operate under the assumption that it is (maintaining two global
     * offset variables--one for %tick and one for %stick.)
     */
    DBG_PROM("suspend: suspending... \n");
    rv = hv_guest_suspend();
    if (rv != 0) {
        splx(spl);
        cyclic_resume();
        start_cpus();
        watchdog_resume();
        mutex_exit(&cpu_lock);
        sfmmu_ctxdoms_unlock();
        DBG("suspend: failed, rv: %ld\n", rv);
        return (rv);
    }

    suspend_count++;

    /* Update the global tick and stick offsets and the preserved TOD */
    set_tick_offsets(source_tick, source_stick, &source_tod);

    /* Ensure new offsets are globally visible before resuming CPUs */
    membar_sync();

    /* Enable interrupts */
    splx(spl);

    /* Set the {%tick,%stick}.NPT bits on all CPUs */
    if (enable_user_tick_stick_emulation) {
        xc_all((xcfunc_t *)enable_tick_stick_npt, NULL, NULL);
        xt_sync(cpu_ready_set);
        ASSERT(gettick_npt() != 0);
        ASSERT(getstick_npt() != 0);
    }

    /* If emulation is enabled, but not currently active, enable it */
    if (enable_user_tick_stick_emulation && !tick_stick_emulation_active) {
        tick_stick_emulation_active = B_TRUE;
    }

    sfmmu_ctxdoms_remove();

    /* Resume cyclics, unpause CPUs */
    cyclic_resume();
    start_cpus();

    /* Set the TOD */
    mutex_enter(&tod_lock);
    tod_set(source_tod);
    mutex_exit(&tod_lock);

    /* Re-enable the watchdog */
    watchdog_resume();

    mutex_exit(&cpu_lock);

    /* Download the latest MD */
    if ((rv = mach_descrip_update()) != 0)
        cmn_err(CE_PANIC, "suspend: mach_descrip_update failed: %ld",
            rv);

    sfmmu_ctxdoms_update();
    sfmmu_ctxdoms_unlock();

    /* Get new MD, update CPU mappings/relationships */
    if (suspend_update_cpu_mappings)
        update_cpu_mappings();

    DBG("suspend: target tick: 0x%lx", gettick_counter());
    DBG("suspend: target stick: 0x%llx", gettick());
    DBG("suspend: user %%tick/%%stick emulation is %d",
        tick_stick_emulation_active);
    DBG("suspend: finished");

    return (0);
}
/*
 * Obtain an updated MD from the hypervisor and update cpunodes, CPU HW
 * sharing data structures, and processor groups.
 */
static void
update_cpu_mappings(void)
{
    md_t *mdp;
    processorid_t id;
    cpu_t *cp;
    cpu_pg_t *pgps[NCPU];

    if ((mdp = md_get_handle()) == NULL) {
        DBG("suspend: md_get_handle failed");
        return;
    }

    DBG("suspend: updating CPU mappings");

    mutex_enter(&cpu_lock);

    setup_chip_mappings(mdp);
    setup_exec_unit_mappings(mdp);
    for (id = 0; id < NCPU; id++) {
        if ((cp = cpu_get(id)) == NULL)
            continue;
        cpu_map_exec_units(cp);
    }

    /*
     * Re-calculate processor groups.
     *
     * First tear down all PG information before adding any new PG
     * information derived from the MD we just downloaded. We must
     * call pg_cpu_inactive and pg_cpu_active with CPUs paused and
     * we want to minimize the number of times pause_cpus is called.
     * Inactivating all CPUs would leave PGs without any active CPUs,
     * so while CPUs are paused, call pg_cpu_inactive and swap in the
     * bootstrap PG structure saving the original PG structure to be
     * fini'd afterwards. This prevents the dispatcher from encountering
     * PGs in which all CPUs are inactive. Offline CPUs are already
     * inactive in their PGs and shouldn't be reactivated, so we must
     * not call pg_cpu_inactive or pg_cpu_active for those CPUs.
     */
    pause_cpus(NULL);
    for (id = 0; id < NCPU; id++) {
        if ((cp = cpu_get(id)) == NULL)
            continue;
        if ((cp->cpu_flags & CPU_OFFLINE) == 0)
            pg_cpu_inactive(cp);
        pgps[id] = cp->cpu_pg;
        pg_cpu_bootstrap(cp);
    }
    start_cpus();

    /*
     * pg_cpu_fini* and pg_cpu_init* must be called while CPUs are
     * not paused. Use two separate loops here so that we do not
     * initialize PG data for CPUs until all the old PG data structures
     * are torn down.
     */
    for (id = 0; id < NCPU; id++) {
        if ((cp = cpu_get(id)) == NULL)
            continue;
        pg_cpu_fini(cp, pgps[id]);
        mpo_cpu_remove(id);
    }

    /*
     * Initialize PG data for each CPU, but leave the bootstrapped
     * PG structure in place to avoid running with any PGs containing
     * nothing but inactive CPUs.
     */
    for (id = 0; id < NCPU; id++) {
        if ((cp = cpu_get(id)) == NULL)
            continue;
        mpo_cpu_add(mdp, id);
        pgps[id] = pg_cpu_init(cp, B_TRUE);
    }

    /*
     * Now that PG data has been initialized for all CPUs in the
     * system, replace the bootstrapped PG structure with the
     * initialized PG structure and call pg_cpu_active for each CPU.
     */
    pause_cpus(NULL);
    for (id = 0; id < NCPU; id++) {
        if ((cp = cpu_get(id)) == NULL)
            continue;
        cp->cpu_pg = pgps[id];
        if ((cp->cpu_flags & CPU_OFFLINE) == 0)
            pg_cpu_active(cp);
    }
    start_cpus();

    mutex_exit(&cpu_lock);

    (void) md_fini_handle(mdp);
}
/*
 * Destroy a partition.
 */
int
cpupart_destroy(psetid_t psid)
{
    cpu_t *cp, *first_cp;
    cpupart_t *pp, *newpp;
    int err = 0;

    ASSERT(pool_lock_held());
    mutex_enter(&cpu_lock);

    pp = cpupart_find(psid);
    if (pp == NULL || pp == &cp_default) {
        mutex_exit(&cpu_lock);
        return (EINVAL);
    }

    /*
     * Unbind all the threads currently bound to the partition.
     */
    err = cpupart_unbind_threads(pp, B_TRUE);
    if (err) {
        mutex_exit(&cpu_lock);
        return (err);
    }

    newpp = &cp_default;
    while ((cp = pp->cp_cpulist) != NULL) {
        if ((err = cpupart_move_cpu(cp, newpp, 0)) != 0) {
            mutex_exit(&cpu_lock);
            return (err);
        }
    }

    ASSERT(bitset_is_null(&pp->cp_cmt_pgs));
    ASSERT(bitset_is_null(&pp->cp_haltset));

    /*
     * Tear down the partition's group of active CMT PGs and halted
     * CPUs now that they have all left.
     */
    bitset_fini(&pp->cp_cmt_pgs);
    bitset_fini(&pp->cp_haltset);

    /*
     * Reset the pointers in any offline processors so they won't
     * try to rejoin the destroyed partition when they're turned
     * online.
     */
    first_cp = cp = CPU;
    do {
        if (cp->cpu_part == pp) {
            ASSERT(cp->cpu_flags & CPU_OFFLINE);
            cp->cpu_part = newpp;
        }
        cp = cp->cpu_next;
    } while (cp != first_cp);

    /*
     * Pause all CPUs while changing the partition list, to make sure
     * the clock thread (which traverses the list without holding
     * cpu_lock) isn't running.
     */
    pause_cpus(NULL);
    pp->cp_prev->cp_next = pp->cp_next;
    pp->cp_next->cp_prev = pp->cp_prev;
    if (cp_list_head == pp)
        cp_list_head = pp->cp_next;
    start_cpus();

    if (cp_id_next > pp->cp_id)
        cp_id_next = pp->cp_id;

    if (pp->cp_kstat)
        kstat_delete(pp->cp_kstat);

    cp_numparts--;

    disp_kp_free(&pp->cp_kp_queue);

    cpupart_lpl_teardown(pp);

    kmem_free(pp, sizeof (cpupart_t));
    mutex_exit(&cpu_lock);

    return (err);
}
/*
 * Create a new partition.  On MP systems, this also allocates a
 * kpreempt disp queue for that partition.
 */
int
cpupart_create(psetid_t *psid)
{
    cpupart_t *pp;

    ASSERT(pool_lock_held());

    pp = kmem_zalloc(sizeof (cpupart_t), KM_SLEEP);
    pp->cp_nlgrploads = lgrp_plat_max_lgrps();
    pp->cp_lgrploads = kmem_zalloc(sizeof (lpl_t) * pp->cp_nlgrploads,
        KM_SLEEP);

    mutex_enter(&cpu_lock);
    if (cp_numparts == cp_max_numparts) {
        mutex_exit(&cpu_lock);
        kmem_free(pp->cp_lgrploads,
            sizeof (lpl_t) * pp->cp_nlgrploads);
        pp->cp_lgrploads = NULL;
        kmem_free(pp, sizeof (cpupart_t));
        return (ENOMEM);
    }
    cp_numparts++;
    /* find the next free partition ID */
    while (cpupart_find(CPTOPS(cp_id_next)) != NULL)
        cp_id_next++;
    pp->cp_id = cp_id_next++;
    pp->cp_ncpus = 0;
    pp->cp_cpulist = NULL;
    pp->cp_attr = 0;
    klgrpset_clear(pp->cp_lgrpset);
    pp->cp_kp_queue.disp_maxrunpri = -1;
    pp->cp_kp_queue.disp_max_unbound_pri = -1;
    pp->cp_kp_queue.disp_cpu = NULL;
    pp->cp_gen = 0;
    DISP_LOCK_INIT(&pp->cp_kp_queue.disp_lock);
    *psid = CPTOPS(pp->cp_id);
    disp_kp_alloc(&pp->cp_kp_queue, v.v_nglobpris);
    cpupart_kstat_create(pp);
    cpupart_lpl_initialize(pp);

    bitset_init(&pp->cp_cmt_pgs);

    /*
     * Initialize and size the partition's bitset of halted CPUs.
     */
    bitset_init_fanout(&pp->cp_haltset, cp_haltset_fanout);
    bitset_resize(&pp->cp_haltset, max_ncpus);

    /*
     * Pause all CPUs while changing the partition list, to make sure
     * the clock thread (which traverses the list without holding
     * cpu_lock) isn't running.
     */
    pause_cpus(NULL);
    pp->cp_next = cp_list_head;
    pp->cp_prev = cp_list_head->cp_prev;
    cp_list_head->cp_prev->cp_next = pp;
    cp_list_head->cp_prev = pp;
    start_cpus();
    mutex_exit(&cpu_lock);

    return (0);
}
static int
cpupart_move_cpu(cpu_t *cp, cpupart_t *newpp, int forced)
{
    cpupart_t *oldpp;
    cpu_t *ncp, *newlist;
    kthread_t *t;
    int move_threads = 1;
    lgrp_id_t lgrpid;
    proc_t *p;
    int lgrp_diff_lpl;
    lpl_t *cpu_lpl;
    int ret;
    boolean_t unbind_all_threads = (forced != 0);

    ASSERT(MUTEX_HELD(&cpu_lock));
    ASSERT(newpp != NULL);

    oldpp = cp->cpu_part;
    ASSERT(oldpp != NULL);
    ASSERT(oldpp->cp_ncpus > 0);

    if (newpp == oldpp) {
        /*
         * Don't need to do anything.
         */
        return (0);
    }

    cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_OUT);

    if (!disp_bound_partition(cp, 0)) {
        /*
         * Don't need to move threads if there are no threads in
         * the partition.  Note that threads can't enter the
         * partition while we're holding cpu_lock.
         */
        move_threads = 0;
    } else if (oldpp->cp_ncpus == 1) {
        /*
         * The last CPU is removed from a partition which has threads
         * running in it.  Some of these threads may be bound to this
         * CPU.
         *
         * Attempt to unbind threads from the CPU and from the
         * processor set.  Note that no threads should be bound to
         * this CPU since cpupart_move_threads will refuse to move
         * bound threads to other CPUs.
         */
        (void) cpu_unbind(oldpp->cp_cpulist->cpu_id, B_FALSE);
        (void) cpupart_unbind_threads(oldpp, B_FALSE);

        if (!disp_bound_partition(cp, 0)) {
            /*
             * No bound threads in this partition any more
             */
            move_threads = 0;
        } else {
            /*
             * There are still threads bound to the partition
             */
            cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
            return (EBUSY);
        }
    }

    /*
     * If the forced flag is set, unbind any threads from this CPU.
     * Otherwise unbind soft-bound threads only.
     */
    if ((ret = cpu_unbind(cp->cpu_id, unbind_all_threads)) != 0) {
        cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
        return (ret);
    }

    /*
     * Stop further threads weak binding to this cpu.
     */
    cpu_inmotion = cp;
    membar_enter();

    /*
     * Notify the Processor Groups subsystem that the CPU
     * will be moving cpu partitions. This is done before
     * CPUs are paused to provide an opportunity for any
     * needed memory allocations.
     */
    pg_cpupart_out(cp, oldpp);
    pg_cpupart_in(cp, newpp);

again:
    if (move_threads) {
        int loop_count;
        /*
         * Check for threads strong or weak bound to this CPU.
         */
        for (loop_count = 0; disp_bound_threads(cp, 0); loop_count++) {
            if (loop_count >= 5) {
                cpu_state_change_notify(cp->cpu_id,
                    CPU_CPUPART_IN);
                pg_cpupart_out(cp, newpp);
                pg_cpupart_in(cp, oldpp);
                cpu_inmotion = NULL;
                return (EBUSY); /* some threads still bound */
            }
            delay(1);
        }
    }

    /*
     * Before we actually start changing data structures, notify
     * the cyclic subsystem that we want to move this CPU out of its
     * partition.
     */
    if (!cyclic_move_out(cp)) {
        /*
         * This CPU must be the last CPU in a processor set with
         * a bound cyclic.
         */
        cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
        pg_cpupart_out(cp, newpp);
        pg_cpupart_in(cp, oldpp);
        cpu_inmotion = NULL;
        return (EBUSY);
    }

    pause_cpus(cp);

    if (move_threads) {
        /*
         * The thread on cpu before the pause thread may have read
         * cpu_inmotion before we raised the barrier above.  Check
         * again.
         */
        if (disp_bound_threads(cp, 1)) {
            start_cpus();
            goto again;
        }
    }

    /*
     * Now that CPUs are paused, let the PG subsystem perform
     * any necessary data structure updates.
     */
    pg_cpupart_move(cp, oldpp, newpp);

    /* save this cpu's lgroup -- it'll be the same in the new partition */
    lgrpid = cp->cpu_lpl->lpl_lgrpid;
    cpu_lpl = cp->cpu_lpl;

    /*
     * let the lgroup framework know cp has left the partition
     */
    lgrp_config(LGRP_CONFIG_CPUPART_DEL, (uintptr_t)cp, lgrpid);

    /* move out of old partition */
    oldpp->cp_ncpus--;
    if (oldpp->cp_ncpus > 0) {

        ncp = cp->cpu_prev_part->cpu_next_part = cp->cpu_next_part;
        cp->cpu_next_part->cpu_prev_part = cp->cpu_prev_part;
        if (oldpp->cp_cpulist == cp) {
            oldpp->cp_cpulist = ncp;
        }
    } else {
        ncp = oldpp->cp_cpulist = NULL;
        cp_numparts_nonempty--;
        ASSERT(cp_numparts_nonempty != 0);
    }
    oldpp->cp_gen++;

    /* move into new partition */
    newlist = newpp->cp_cpulist;
    if (newlist == NULL) {
        newpp->cp_cpulist = cp->cpu_next_part = cp->cpu_prev_part = cp;
        cp_numparts_nonempty++;
        ASSERT(cp_numparts_nonempty != 0);
    } else {
        cp->cpu_next_part = newlist;
        cp->cpu_prev_part = newlist->cpu_prev_part;
        newlist->cpu_prev_part->cpu_next_part = cp;
        newlist->cpu_prev_part = cp;
    }
    cp->cpu_part = newpp;
    newpp->cp_ncpus++;
    newpp->cp_gen++;

    ASSERT(bitset_is_null(&newpp->cp_haltset));
    ASSERT(bitset_is_null(&oldpp->cp_haltset));

    /*
     * let the lgroup framework know cp has entered the partition
     */
    lgrp_config(LGRP_CONFIG_CPUPART_ADD, (uintptr_t)cp, lgrpid);

    /*
     * If necessary, move threads off processor.
     */
    if (move_threads) {
        ASSERT(ncp != NULL);

        /*
         * Walk through the active process list looking for
         * threads that need a new home lgroup, or whose last
         * CPU run on is the CPU being moved out of the
         * partition.
         */
        for (p = practive; p != NULL; p = p->p_next) {

            t = p->p_tlist;

            if (t == NULL)
                continue;

            lgrp_diff_lpl = 0;

            do {
                ASSERT(t->t_lpl != NULL);

                /*
                 * Update the count of how many threads are
                 * in this CPU's lgroup but have a different lpl
                 */
                if (t->t_lpl != cpu_lpl &&
                    t->t_lpl->lpl_lgrpid == lgrpid)
                    lgrp_diff_lpl++;
                /*
                 * If the lgroup that t is assigned to no
                 * longer has any CPUs in t's partition,
                 * we'll have to choose a new lgroup for t.
                 */
                if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
                    t->t_cpupart)) {
                    lgrp_move_thread(t,
                        lgrp_choose(t, t->t_cpupart), 0);
                }

                /*
                 * make sure lpl points to our own partition
                 */
                ASSERT(t->t_lpl >= t->t_cpupart->cp_lgrploads &&
                    (t->t_lpl < t->t_cpupart->cp_lgrploads +
                    t->t_cpupart->cp_nlgrploads));

                ASSERT(t->t_lpl->lpl_ncpu > 0);

                /* Update CPU last ran on if it was this CPU */
                if (t->t_cpu == cp && t->t_cpupart == oldpp &&
                    t->t_bound_cpu != cp) {
                    t->t_cpu = disp_lowpri_cpu(ncp,
                        t->t_lpl, t->t_pri, NULL);
                }
                t = t->t_forw;
            } while (t != p->p_tlist);

            /*
             * Didn't find any threads in the same lgroup as this
             * CPU with a different lpl, so remove the lgroup from
             * the process lgroup bitmask.
             */
            if (lgrp_diff_lpl)
                klgrpset_del(p->p_lgrpset, lgrpid);
        }

        /*
         * Walk the thread list looking for threads that need to be
         * rehomed, since there are some threads that are not in
         * their process's p_tlist.
         */
        t = curthread;

        do {
            ASSERT(t != NULL && t->t_lpl != NULL);

            /*
             * If the lgroup that t is assigned to no
             * longer has any CPUs in t's partition,
             * we'll have to choose a new lgroup for t.
             * Also, choose best lgroup for home when
             * thread has specified lgroup affinities,
             * since there may be an lgroup with more
             * affinity available after moving CPUs
             * around.
             */
            if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
                t->t_cpupart) || t->t_lgrp_affinity) {
                lgrp_move_thread(t,
                    lgrp_choose(t, t->t_cpupart), 1);
            }

            /* make sure lpl points to our own partition */
            ASSERT((t->t_lpl >= t->t_cpupart->cp_lgrploads) &&
                (t->t_lpl < t->t_cpupart->cp_lgrploads +
                t->t_cpupart->cp_nlgrploads));

            ASSERT(t->t_lpl->lpl_ncpu > 0);

            /* Update CPU last ran on if it was this CPU */
            if (t->t_cpu == cp && t->t_cpupart == oldpp &&
                t->t_bound_cpu != cp) {
                t->t_cpu = disp_lowpri_cpu(ncp, t->t_lpl,
                    t->t_pri, NULL);
            }

            t = t->t_next;
        } while (t != curthread);

        /*
         * Clear off the CPU's run queue, and the kp queue if the
         * partition is now empty.
         */
        disp_cpu_inactive(cp);

        /*
         * Make cp switch to a thread from the new partition.
         */
        cp->cpu_runrun = 1;
        cp->cpu_kprunrun = 1;
    }

    cpu_inmotion = NULL;
    start_cpus();

    /*
     * Let anyone interested know that cpu has been added to the set.
     */
    cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);

    /*
     * Now let the cyclic subsystem know that it can reshuffle cyclics
     * bound to the new processor set.
     */
    cyclic_move_in(cp);

    return (0);
}
/*
 * Top level routine to direct suspend/resume of a domain.
 */
void
xen_suspend_domain(void)
{
    extern void rtcsync(void);
    extern void ec_resume(void);
    extern kmutex_t ec_lock;
    struct xen_add_to_physmap xatp;
    ulong_t flags;
    int err;

    cmn_err(CE_NOTE, "Domain suspending for save/migrate");

    SUSPEND_DEBUG("xen_suspend_domain\n");

    /*
     * We only want to suspend the PV devices, since the emulated devices
     * are suspended by saving the emulated device state.  The PV devices
     * are all children of the xpvd nexus device.  So we search the
     * device tree for the xpvd node to use as the root of the tree to
     * be suspended.
     */
    if (xpvd_dip == NULL)
        ddi_walk_devs(ddi_root_node(), check_xpvd, NULL);

    /*
     * suspend interrupts and devices
     */
    if (xpvd_dip != NULL)
        (void) xen_suspend_devices(ddi_get_child(xpvd_dip));
    else
        cmn_err(CE_WARN, "No PV devices found to suspend");

    SUSPEND_DEBUG("xenbus_suspend\n");
    xenbus_suspend();

    mutex_enter(&cpu_lock);

    /*
     * Suspend on vcpu 0
     */
    thread_affinity_set(curthread, 0);
    kpreempt_disable();

    if (ncpus > 1)
        pause_cpus(NULL, NULL);

    /*
     * We can grab the ec_lock as it's a spinlock with a high SPL.  Hence
     * any holder would have dropped it to get through pause_cpus().
     */
    mutex_enter(&ec_lock);

    /*
     * From here on in, we can't take locks.
     */
    flags = intr_clear();

    SUSPEND_DEBUG("HYPERVISOR_suspend\n");

    /*
     * At this point we suspend and sometime later resume.
     * Note that this call may return with an indication of a cancelled
     * suspend; for now, no matter what the return value is, we do a
     * full resume of all suspended drivers, etc.
     */
    (void) HYPERVISOR_shutdown(SHUTDOWN_suspend);

    /*
     * Point HYPERVISOR_shared_info to the proper place.
     */
    xatp.domid = DOMID_SELF;
    xatp.idx = 0;
    xatp.space = XENMAPSPACE_shared_info;
    xatp.gpfn = xen_shared_info_frame;
    if ((err = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) != 0)
        panic("Could not set shared_info page.  error: %d", err);

    SUSPEND_DEBUG("gnttab_resume\n");
    gnttab_resume();

    SUSPEND_DEBUG("ec_resume\n");
    ec_resume();

    intr_restore(flags);

    if (ncpus > 1)
        start_cpus();

    mutex_exit(&ec_lock);
    mutex_exit(&cpu_lock);

    /*
     * Now we can take locks again.
     */
    rtcsync();

    SUSPEND_DEBUG("xenbus_resume\n");
    xenbus_resume();

    SUSPEND_DEBUG("xen_resume_devices\n");
    if (xpvd_dip != NULL)
        (void) xen_resume_devices(ddi_get_child(xpvd_dip), 0);

    thread_affinity_clear(curthread);
    kpreempt_enable();

    SUSPEND_DEBUG("finished xen_suspend_domain\n");

    cmn_err(CE_NOTE, "domain restore/migrate completed");
}