void cpu_idle(void) { int cpu = smp_processor_id(); boot_init_stack_canary(); current_thread_info()->status |= TS_POLLING; while (1) { tick_nohz_stop_sched_tick(1); while (!need_resched()) { check_pgt_cache(); rmb(); if (cpu_is_offline(cpu)) play_dead(); local_irq_disable(); stop_critical_timings(); pm_idle(); start_critical_timings(); } tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); schedule(); preempt_disable(); } }
/* * The idle thread. There's no useful work to be * done, so just try to conserve power and have a * low exit latency (ie sit in a loop waiting for * somebody to say that they'd like to reschedule) */ void cpu_idle(void) { int cpu = smp_processor_id(); current_thread_info()->status |= TS_POLLING; /* endless idle loop with no priority at all */ while (1) { tick_nohz_stop_sched_tick(); while (!need_resched()) { check_pgt_cache(); rmb(); if (rcu_pending(cpu)) rcu_check_callbacks(cpu, 0); if (cpu_is_offline(cpu)) play_dead(); local_irq_disable(); __get_cpu_var(irq_stat).idle_timestamp = jiffies; /* Don't trace irqs off for idle */ stop_critical_timings(); pm_idle(); start_critical_timings(); } tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); schedule(); preempt_disable(); } }
void cpu_idle(void) { set_thread_flag(TIF_POLLING_NRFLAG); /* endless idle loop with no priority at all */ while (1) { tick_nohz_idle_enter(); rcu_idle_enter(); while (!need_resched()) { check_pgt_cache(); rmb(); clear_thread_flag(TIF_POLLING_NRFLAG); local_irq_disable(); /* Don't trace irqs off for idle */ stop_critical_timings(); if (!need_resched() && powersave != NULL) powersave(); start_critical_timings(); local_irq_enable(); set_thread_flag(TIF_POLLING_NRFLAG); } rcu_idle_exit(); tick_nohz_idle_exit(); preempt_enable_no_resched(); schedule(); preempt_disable(); } }
/** * release_console_sem - unlock the console system * * Releases the semaphore which the caller holds on the console system * and the console driver list. * * While the semaphore was held, console output may have been buffered * by printk(). If this is the case, release_console_sem() emits * the output prior to releasing the semaphore. * * If there is output waiting for klogd, we wake it up. * * release_console_sem() may be called from any context. */ void release_console_sem(void) { unsigned long flags; unsigned _con_start, _log_end; unsigned wake_klogd = 0; if (console_suspended) { up(&console_sem); return; } console_may_schedule = 0; for ( ; ; ) { spin_lock_irqsave(&logbuf_lock, flags); wake_klogd |= log_start - log_end; if (con_start == log_end) break; /* Nothing to print */ _con_start = con_start; _log_end = log_end; con_start = log_end; /* Flush */ spin_unlock(&logbuf_lock); stop_critical_timings(); /* don't trace print latency */ call_console_drivers(_con_start, _log_end); start_critical_timings(); local_irq_restore(flags); } console_locked = 0; up(&console_sem); spin_unlock_irqrestore(&logbuf_lock, flags); if (wake_klogd) wake_up_klogd(); }
/* * The body of the idle task. */ void cpu_idle(void) { if (ppc_md.idle_loop) ppc_md.idle_loop(); /* doesn't return */ set_thread_flag(TIF_POLLING_NRFLAG); while (1) { tick_nohz_idle_enter(); rcu_idle_enter(); while (!need_resched() && !cpu_should_die()) { ppc64_runlatch_off(); if (ppc_md.power_save) { clear_thread_flag(TIF_POLLING_NRFLAG); /* * smp_mb is so clearing of TIF_POLLING_NRFLAG * is ordered w.r.t. need_resched() test. */ smp_mb(); local_irq_disable(); /* Don't trace irqs off for idle */ stop_critical_timings(); /* check again after disabling irqs */ if (!need_resched() && !cpu_should_die()) ppc_md.power_save(); start_critical_timings(); /* Some power_save functions return with * interrupts enabled, some don't. */ if (irqs_disabled()) local_irq_enable(); set_thread_flag(TIF_POLLING_NRFLAG); } else { /* * Go into low thread priority and possibly * low power mode. */ HMT_low(); HMT_very_low(); } } HMT_medium(); ppc64_runlatch_on(); rcu_idle_exit(); tick_nohz_idle_exit(); if (cpu_should_die()) { sched_preempt_enable_no_resched(); cpu_die(); } schedule_preempt_disabled(); } }
/** * default_idle_call - Default CPU idle routine. * * To use when the cpuidle framework cannot be used. */ void default_idle_call(void) { if (current_clr_polling_and_test()) { local_irq_enable(); } else { stop_critical_timings(); arch_cpu_idle(); start_critical_timings(); } }
/* * Generic idle loop implementation */ static void cpu_idle_loop(void) { while (1) { tick_nohz_idle_enter(); while (!need_resched()) { check_pgt_cache(); rmb(); if (cpu_is_offline(smp_processor_id())) arch_cpu_idle_dead(); local_irq_disable(); arch_cpu_idle_enter(); /* * In poll mode we reenable interrupts and spin. * * Also if we detected in the wakeup from idle * path that the tick broadcast device expired * for us, we don't want to go deep idle as we * know that the IPI is going to arrive right * away */ if (cpu_idle_force_poll || tick_check_broadcast_expired()) { cpu_idle_poll(); } else { if (!current_clr_polling_and_test()) { stop_critical_timings(); rcu_idle_enter(); arch_cpu_idle(); WARN_ON_ONCE(irqs_disabled()); rcu_idle_exit(); start_critical_timings(); } else { local_irq_enable(); } __current_set_polling(); } arch_cpu_idle_exit(); } /* * Since we fell out of the loop above, we know * TIF_NEED_RESCHED must be set, propagate it into * PREEMPT_NEED_RESCHED. * * This is required because for polling idle loops we will * not have had an IPI to fold the state for us. */ preempt_set_need_resched(); tick_nohz_idle_exit(); schedule_preempt_disabled(); } }
static inline int cpu_idle_poll(void) { rcu_idle_enter(); trace_cpu_idle_rcuidle(0, smp_processor_id()); local_irq_enable(); stop_critical_timings(); while (!tif_need_resched() && (cpu_idle_force_poll || tick_check_broadcast_expired())) cpu_relax(); start_critical_timings(); trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); rcu_idle_exit(); return 1; }
void cpu_idle(void) { if (ppc_md.idle_loop) ppc_md.idle_loop(); set_thread_flag(TIF_POLLING_NRFLAG); while (1) { tick_nohz_idle_enter(); rcu_idle_enter(); while (!need_resched() && !cpu_should_die()) { ppc64_runlatch_off(); if (ppc_md.power_save) { clear_thread_flag(TIF_POLLING_NRFLAG); smp_mb(); local_irq_disable(); stop_critical_timings(); if (!need_resched() && !cpu_should_die()) ppc_md.power_save(); start_critical_timings(); if (irqs_disabled()) local_irq_enable(); set_thread_flag(TIF_POLLING_NRFLAG); } else { HMT_low(); HMT_very_low(); } } HMT_medium(); ppc64_runlatch_on(); rcu_idle_exit(); tick_nohz_idle_exit(); if (cpu_should_die()) { sched_preempt_enable_no_resched(); cpu_die(); } schedule_preempt_disabled(); } }
static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx) { stop_critical_timings(); if (cx->entry_method == ACPI_CSTATE_FFH) { acpi_processor_ffh_cstate_enter(cx); } else if (cx->entry_method == ACPI_CSTATE_HALT) { acpi_safe_halt(); } else { inb(cx->address); inl(acpi_gbl_FADT.xpm_timer_block.address); } start_critical_timings(); }
/* * Generic idle loop implementation */ static void cpu_idle_loop(void) { while (1) { tick_nohz_idle_enter(); while (!need_resched()) { check_pgt_cache(); rmb(); local_irq_disable(); arch_cpu_idle_enter(); /* * In poll mode we reenable interrupts and spin. * * Also if we detected in the wakeup from idle * path that the tick broadcast device expired * for us, we don't want to go deep idle as we * know that the IPI is going to arrive right * away */ if (cpu_idle_force_poll || tick_check_broadcast_expired() || __get_cpu_var(idle_force_poll)) { cpu_idle_poll(); } else { if (!current_clr_polling_and_test()) { stop_critical_timings(); rcu_idle_enter(); arch_cpu_idle(); WARN_ON_ONCE(irqs_disabled()); rcu_idle_exit(); start_critical_timings(); } else { local_irq_enable(); } __current_set_polling(); } arch_cpu_idle_exit(); } tick_nohz_idle_exit(); schedule_preempt_disabled(); if (cpu_is_offline(smp_processor_id())) arch_cpu_idle_dead(); } }
/* * The idle thread. There's no useful work to be * done, so just try to conserve power and have a * low exit latency (ie sit in a loop waiting for * somebody to say that they'd like to reschedule) */ void cpu_idle(void) { int cpu = smp_processor_id(); /* * If we're the non-boot CPU, nothing set the stack canary up * for us. CPU0 already has it initialized but no harm in * doing it again. This is a good place for updating it, as * we wont ever return from this function (so the invalid * canaries already on the stack wont ever trigger). */ boot_init_stack_canary(); current_thread_info()->status |= TS_POLLING; /* endless idle loop with no priority at all */ while (1) { tick_nohz_stop_sched_tick(1); while (!need_resched()) { check_pgt_cache(); rmb(); if (cpu_is_offline(cpu)) play_dead(); local_irq_disable(); enter_idle(); /* Don't trace irqs off for idle */ stop_critical_timings(); pm_idle(); start_critical_timings(); /* * In many cases the interrupt that ended idle * has already called exit_idle. But some idle * loops can be woken up without interrupt. */ __exit_idle(); } tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); schedule(); preempt_disable(); } }
static void default_idle(void) { if (!hlt_counter) { clear_thread_flag(TIF_POLLING_NRFLAG); smp_mb__after_clear_bit(); set_bl_bit(); stop_critical_timings(); while (!need_resched()) cpu_sleep(); start_critical_timings(); clear_bl_bit(); set_thread_flag(TIF_POLLING_NRFLAG); } else while (!need_resched()) cpu_relax(); }
void default_idle(void) { if (likely(hlt_counter)) { local_irq_disable(); stop_critical_timings(); cpu_relax(); start_critical_timings(); local_irq_enable(); } else { clear_thread_flag(TIF_POLLING_NRFLAG); smp_mb__after_clear_bit(); local_irq_disable(); while (!need_resched()) cpu_sleep(); local_irq_enable(); set_thread_flag(TIF_POLLING_NRFLAG); } }
/* * The idle thread, has rather strange semantics for calling pm_idle, * but this is what x86 does and we need to do the same, so that * things like cpuidle get called in the same way. The only difference * is that we always respect 'hlt_counter' to prevent low power idle. */ void cpu_idle(void) { local_fiq_enable(); /* endless idle loop with no priority at all */ while (1) { idle_notifier_call_chain(IDLE_START); tick_nohz_idle_enter(); rcu_idle_enter(); while (!need_resched()) { /* * We need to disable interrupts here * to ensure we don't miss a wakeup call. */ local_irq_disable(); #ifdef CONFIG_PL310_ERRATA_769419 wmb(); #endif if (hlt_counter) { local_irq_enable(); cpu_relax(); } else if (!need_resched()) { stop_critical_timings(); if (cpuidle_idle_call()) pm_idle(); start_critical_timings(); /* * pm_idle functions must always * return with IRQs enabled. */ WARN_ON(irqs_disabled()); } else local_irq_enable(); } rcu_idle_exit(); tick_nohz_idle_exit(); idle_notifier_call_chain(IDLE_END); schedule_preempt_disabled(); #ifdef CONFIG_HOTPLUG_CPU if (cpu_is_offline(smp_processor_id())) cpu_die(); #endif } }
/** * acpi_idle_do_entry - a helper function that does C2 and C3 type entry * @cx: cstate data * * Caller disables interrupt before call and enables interrupt after return. */ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx) { /* Don't trace irqs off for idle */ stop_critical_timings(); if (cx->entry_method == ACPI_CSTATE_FFH) { /* Call into architectural FFH based C-state */ acpi_processor_ffh_cstate_enter(cx); } else if (cx->entry_method == ACPI_CSTATE_HALT) { acpi_safe_halt(); } else { /* IO port based C-state */ inb(cx->address); /* Dummy wait op - must do something useless after P_LVL2 read because chipsets cannot guarantee that STPCLK# signal gets asserted in time to freeze execution properly. */ inl(acpi_gbl_FADT.xpm_timer_block.address); } start_critical_timings(); }
/* * The idle thread. There's no useful work to be done, so just try to conserve * power and have a low exit latency (ie sit in a loop waiting for somebody to * say that they'd like to reschedule) */ void __noreturn cpu_idle(void) { int cpu; /* CPU is going idle. */ cpu = smp_processor_id(); /* endless idle loop with no priority at all */ while (1) { tick_nohz_idle_enter(); rcu_idle_enter(); while (!need_resched() && cpu_online(cpu)) { #ifdef CONFIG_MIPS_MT_SMTC extern void smtc_idle_loop_hook(void); smtc_idle_loop_hook(); #endif if (cpu_wait) { /* Don't trace irqs off for idle */ stop_critical_timings(); (*cpu_wait)(); start_critical_timings(); } } #ifdef CONFIG_HOTPLUG_CPU if (!cpu_online(cpu) && !cpu_isset(cpu, cpu_callin_map) && (system_state == SYSTEM_RUNNING || system_state == SYSTEM_BOOTING)) play_dead(); #endif rcu_idle_exit(); tick_nohz_idle_exit(); schedule_preempt_disabled(); } }
/* * The idle thread. There's no useful work to be done, so just try to conserve * power and have a low exit latency (ie sit in a loop waiting for somebody to * say that they'd like to reschedule) */ void cpu_idle(void) { unsigned int cpu = smp_processor_id(); set_thread_flag(TIF_POLLING_NRFLAG); /* endless idle loop with no priority at all */ while (1) { tick_nohz_idle_enter(); rcu_idle_enter(); while (!need_resched()) { check_pgt_cache(); rmb(); if (cpu_is_offline(cpu)) play_dead(); local_irq_disable(); /* Don't trace irqs off for idle */ stop_critical_timings(); if (cpuidle_idle_call()) sh_idle(); /* * Sanity check to ensure that sh_idle() returns * with IRQs enabled */ WARN_ON(irqs_disabled()); start_critical_timings(); } rcu_idle_exit(); tick_nohz_idle_exit(); schedule_preempt_disabled(); } }
/* avoid HT sibilings if possible */ if (cpumask_empty(tmp)) cpumask_andnot(tmp, cpu_online_mask, pad_busy_cpus); if (cpumask_empty(tmp)) { mutex_unlock(&round_robin_lock); return; } for_each_cpu(cpu, tmp) { if (cpu_weight[cpu] < min_weight) { min_weight = cpu_weight[cpu]; preferred_cpu = cpu; } } if (tsk_in_cpu[tsk_index] != -1) cpumask_clear_cpu(tsk_in_cpu[tsk_index], pad_busy_cpus); tsk_in_cpu[tsk_index] = preferred_cpu; cpumask_set_cpu(preferred_cpu, pad_busy_cpus); cpu_weight[preferred_cpu]++; mutex_unlock(&round_robin_lock); set_cpus_allowed_ptr(current, cpumask_of(preferred_cpu)); } static void exit_round_robin(unsigned int tsk_index) { struct cpumask *pad_busy_cpus = to_cpumask(pad_busy_cpus_bits); cpumask_clear_cpu(tsk_in_cpu[tsk_index], pad_busy_cpus); tsk_in_cpu[tsk_index] = -1; } static unsigned int idle_pct = 5; /* percentage */ static unsigned int round_robin_time = 1; /* second */ static int power_saving_thread(void *data) { struct sched_param param = {.sched_priority = 1}; int do_sleep; unsigned int tsk_index = (unsigned long)data; u64 last_jiffies = 0; sched_setscheduler(current, SCHED_RR, ¶m); set_freezable(); while (!kthread_should_stop()) { int cpu; u64 expire_time; try_to_freeze(); /* round robin to cpus */ if (last_jiffies + round_robin_time * HZ < jiffies) { last_jiffies = jiffies; round_robin_cpu(tsk_index); } do_sleep = 0; expire_time = jiffies + HZ * (100 - idle_pct) / 100; while (!need_resched()) { if (tsc_detected_unstable && !tsc_marked_unstable) { /* TSC could halt in idle, so notify users */ mark_tsc_unstable("TSC halts in idle"); tsc_marked_unstable = 1; } if (lapic_detected_unstable && !lapic_marked_unstable) { int i; /* LAPIC could halt in idle, so notify users */ for_each_online_cpu(i) clockevents_notify( CLOCK_EVT_NOTIFY_BROADCAST_ON, &i); lapic_marked_unstable = 1; } local_irq_disable(); cpu = smp_processor_id(); if (lapic_marked_unstable) clockevents_notify( CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); stop_critical_timings(); __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); if (!need_resched()) __mwait(power_saving_mwait_eax, 1); start_critical_timings(); if (lapic_marked_unstable) clockevents_notify( CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu); local_irq_enable(); if (jiffies > expire_time) { do_sleep = 1; break; } } /* * current sched_rt has threshold for rt task running time. * When a rt task uses 95% CPU time, the rt thread will be * scheduled out for 5% CPU time to not starve other tasks. But * the mechanism only works when all CPUs have RT task running, * as if one CPU hasn't RT task, RT task from other CPUs will * borrow CPU time from this CPU and cause RT task use > 95% * CPU time. To make 'avoid starvation' work, takes a nap here. */ if (do_sleep) schedule_timeout_killable(HZ * idle_pct / 100); } exit_round_robin(tsk_index); return 0; } static struct task_struct *ps_tsks[NR_CPUS]; static unsigned int ps_tsk_num; static int create_power_saving_task(void) { int rc = -ENOMEM; ps_tsks[ps_tsk_num] = kthread_run(power_saving_thread, (void *)(unsigned long)ps_tsk_num, "acpi_pad/%d", ps_tsk_num); rc = PTR_RET(ps_tsks[ps_tsk_num]); if (!rc) ps_tsk_num++; else ps_tsks[ps_tsk_num] = NULL; return rc; }
/** * cpuidle_idle_call - the main idle function * * NOTE: no locks or semaphores should be used here */ static void cpuidle_idle_call(void) { struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); int next_state, entered_state; bool broadcast; /* * Check if the idle task must be rescheduled. If it is the * case, exit the function after re-enabling the local irq. */ if (need_resched()) { local_irq_enable(); return; } /* * During the idle period, stop measuring the disabled irqs * critical sections latencies */ stop_critical_timings(); /* * Tell the RCU framework we are entering an idle section, * so no more rcu read side critical sections and one more * step to the grace period */ rcu_idle_enter(); /* * Ask the cpuidle framework to choose a convenient idle state. * Fall back to the default arch idle method on errors. */ next_state = cpuidle_select(drv, dev); if (next_state < 0) { use_default: /* * We can't use the cpuidle framework, let's use the default * idle routine. */ if (current_clr_polling_and_test()) local_irq_enable(); else arch_cpu_idle(); goto exit_idle; } /* * The idle task must be scheduled, it is pointless to * go to idle, just update no idle residency and get * out of this function */ if (current_clr_polling_and_test()) { dev->last_residency = 0; entered_state = next_state; local_irq_enable(); goto exit_idle; } broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP); /* * Tell the time framework to switch to a broadcast timer * because our local timer will be shutdown. If a local timer * is used from another cpu as a broadcast timer, this call may * fail if it is not available */ if (broadcast && clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) goto use_default; trace_cpu_idle_rcuidle(next_state, dev->cpu); /* * Enter the idle state previously returned by the governor decision. * This function will block until an interrupt occurs and will take * care of re-enabling the local interrupts */ entered_state = cpuidle_enter(drv, dev, next_state); trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu); if (broadcast) clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); /* * Give the governor an opportunity to reflect on the outcome */ cpuidle_reflect(dev, entered_state); exit_idle: __current_set_polling(); /* * It is up to the idle functions to reenable local interrupts */ if (WARN_ON_ONCE(irqs_disabled())) local_irq_enable(); rcu_idle_exit(); start_critical_timings(); }
/** * cpuidle_idle_call - the main idle function * * NOTE: no locks or semaphores should be used here * * On archs that support TIF_POLLING_NRFLAG, is called with polling * set, and it returns with polling set. If it ever stops polling, it * must clear the polling bit. */ static void cpuidle_idle_call(void) { struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); int next_state, entered_state; /* * Check if the idle task must be rescheduled. If it is the * case, exit the function after re-enabling the local irq. */ if (need_resched()) { local_irq_enable(); return; } /* * During the idle period, stop measuring the disabled irqs * critical sections latencies */ stop_critical_timings(); /* * Tell the RCU framework we are entering an idle section, * so no more rcu read side critical sections and one more * step to the grace period */ rcu_idle_enter(); /* * Check if the cpuidle framework is ready, otherwise fallback * to the default arch specific idle method */ next_state = cpuidle_select(drv, dev); if (next_state < 0) { default_idle_call(); goto exit_idle; } /* * The idle task must be scheduled, it is pointless to * go to idle, just update no idle residency and get * out of this function */ if (current_clr_polling_and_test()) { dev->last_residency = 0; entered_state = next_state; local_irq_enable(); goto exit_idle; } /* Take note of the planned idle state. */ idle_set_state(this_rq(), &drv->states[next_state]); /* * Enter the idle state previously returned by the governor decision. * This function will block until an interrupt occurs and will take * care of re-enabling the local interrupts */ entered_state = cpuidle_enter(drv, dev, next_state); /* The cpu is no longer idle or about to enter idle. */ idle_set_state(this_rq(), NULL); if (entered_state == -EBUSY) { default_idle_call(); goto exit_idle; } /* * Give the governor an opportunity to reflect on the outcome */ cpuidle_reflect(dev, entered_state); exit_idle: __current_set_polling(); /* * It is up to the idle functions to reenable local interrupts */ if (WARN_ON_ONCE(irqs_disabled())) local_irq_enable(); rcu_idle_exit(); start_critical_timings(); }
static int clamp_thread(void *arg) { int cpunr = (unsigned long)arg; DEFINE_TIMER(wakeup_timer, noop_timer, 0, 0); static const struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; unsigned int count = 0; unsigned int target_ratio; set_bit(cpunr, cpu_clamping_mask); set_freezable(); init_timer_on_stack(&wakeup_timer); sched_setscheduler(current, SCHED_FIFO, ¶m); while (true == clamping && !kthread_should_stop() && cpu_online(cpunr)) { int sleeptime; unsigned long target_jiffies; unsigned int guard; unsigned int compensated_ratio; int interval; /* jiffies to sleep for each attempt */ unsigned int duration_jiffies = msecs_to_jiffies(duration); unsigned int window_size_now; try_to_freeze(); /* * make sure user selected ratio does not take effect until * the next round. adjust target_ratio if user has changed * target such that we can converge quickly. */ target_ratio = set_target_ratio; guard = 1 + target_ratio/20; window_size_now = window_size; count++; /* * systems may have different ability to enter package level * c-states, thus we need to compensate the injected idle ratio * to achieve the actual target reported by the HW. */ compensated_ratio = target_ratio + get_compensation(target_ratio); if (compensated_ratio <= 0) compensated_ratio = 1; interval = duration_jiffies * 100 / compensated_ratio; /* align idle time */ target_jiffies = roundup(jiffies, interval); sleeptime = target_jiffies - jiffies; if (sleeptime <= 0) sleeptime = 1; schedule_timeout_interruptible(sleeptime); /* * only elected controlling cpu can collect stats and update * control parameters. */ if (cpunr == control_cpu && !(count%window_size_now)) { should_skip = powerclamp_adjust_controls(target_ratio, guard, window_size_now); smp_mb(); } if (should_skip) continue; target_jiffies = jiffies + duration_jiffies; mod_timer(&wakeup_timer, target_jiffies); if (unlikely(local_softirq_pending())) continue; /* * stop tick sched during idle time, interrupts are still * allowed. thus jiffies are updated properly. */ preempt_disable(); /* mwait until target jiffies is reached */ while (time_before(jiffies, target_jiffies)) { unsigned long ecx = 1; unsigned long eax = target_mwait; /* * REVISIT: may call enter_idle() to notify drivers who * can save power during cpu idle. same for exit_idle() */ local_touch_nmi(); stop_critical_timings(); mwait_idle_with_hints(eax, ecx); start_critical_timings(); atomic_inc(&idle_wakeup_counter); } preempt_enable(); } del_timer_sync(&wakeup_timer); clear_bit(cpunr, cpu_clamping_mask); return 0; } /* * 1 HZ polling while clamping is active, useful for userspace * to monitor actual idle ratio. */ static void poll_pkg_cstate(struct work_struct *dummy); static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate); static void poll_pkg_cstate(struct work_struct *dummy) { static u64 msr_last; static u64 tsc_last; static unsigned long jiffies_last; u64 msr_now; unsigned long jiffies_now; u64 tsc_now; u64 val64; msr_now = pkg_state_counter(); tsc_now = rdtsc(); jiffies_now = jiffies; /* calculate pkg cstate vs tsc ratio */ if (!msr_last || !tsc_last) pkg_cstate_ratio_cur = 1; else { if (tsc_now - tsc_last) { val64 = 100 * (msr_now - msr_last); do_div(val64, (tsc_now - tsc_last)); pkg_cstate_ratio_cur = val64; } } /* update record */ msr_last = msr_now; jiffies_last = jiffies_now; tsc_last = tsc_now; if (true == clamping) schedule_delayed_work(&poll_pkg_cstate_work, HZ); } static int start_power_clamp(void) { unsigned long cpu; struct task_struct *thread; set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1); /* prevent cpu hotplug */ get_online_cpus(); /* prefer BSP */ control_cpu = 0; if (!cpu_online(control_cpu)) control_cpu = smp_processor_id(); clamping = true; schedule_delayed_work(&poll_pkg_cstate_work, 0); /* start one thread per online cpu */ for_each_online_cpu(cpu) { struct task_struct **p = per_cpu_ptr(powerclamp_thread, cpu); thread = kthread_create_on_node(clamp_thread, (void *) cpu, cpu_to_node(cpu), "kidle_inject/%ld", cpu); /* bind to cpu here */ if (likely(!IS_ERR(thread))) { kthread_bind(thread, cpu); wake_up_process(thread); *p = thread; } } put_online_cpus(); return 0; }
/** * cpuidle_idle_call - the main idle function * * NOTE: no locks or semaphores should be used here * * On archs that support TIF_POLLING_NRFLAG, is called with polling * set, and it returns with polling set. If it ever stops polling, it * must clear the polling bit. */ static void cpuidle_idle_call(void) { struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); int next_state, entered_state; unsigned int broadcast; bool reflect; /* * Check if the idle task must be rescheduled. If it is the * case, exit the function after re-enabling the local irq. */ if (need_resched()) { local_irq_enable(); return; } /* * During the idle period, stop measuring the disabled irqs * critical sections latencies */ stop_critical_timings(); /* * Tell the RCU framework we are entering an idle section, * so no more rcu read side critical sections and one more * step to the grace period */ rcu_idle_enter(); if (cpuidle_not_available(drv, dev)) goto use_default; /* * Suspend-to-idle ("freeze") is a system state in which all user space * has been frozen, all I/O devices have been suspended and the only * activity happens here and in iterrupts (if any). In that case bypass * the cpuidle governor and go stratight for the deepest idle state * available. Possibly also suspend the local tick and the entire * timekeeping to prevent timer interrupts from kicking us out of idle * until a proper wakeup interrupt happens. */ if (idle_should_freeze()) { entered_state = cpuidle_enter_freeze(drv, dev); if (entered_state >= 0) { local_irq_enable(); goto exit_idle; } reflect = false; next_state = cpuidle_find_deepest_state(drv, dev); } else { reflect = true; /* * Ask the cpuidle framework to choose a convenient idle state. */ next_state = cpuidle_select(drv, dev); } /* Fall back to the default arch idle method on errors. */ if (next_state < 0) goto use_default; /* * The idle task must be scheduled, it is pointless to * go to idle, just update no idle residency and get * out of this function */ if (current_clr_polling_and_test()) { dev->last_residency = 0; entered_state = next_state; local_irq_enable(); goto exit_idle; } broadcast = drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP; /* * Tell the time framework to switch to a broadcast timer * because our local timer will be shutdown. If a local timer * is used from another cpu as a broadcast timer, this call may * fail if it is not available */ if (broadcast && tick_broadcast_enter()) goto use_default; /* Take note of the planned idle state. */ idle_set_state(this_rq(), &drv->states[next_state]); /* * Enter the idle state previously returned by the governor decision. * This function will block until an interrupt occurs and will take * care of re-enabling the local interrupts */ entered_state = cpuidle_enter(drv, dev, next_state); /* The cpu is no longer idle or about to enter idle. */ idle_set_state(this_rq(), NULL); if (broadcast) tick_broadcast_exit(); /* * Give the governor an opportunity to reflect on the outcome */ if (reflect) cpuidle_reflect(dev, entered_state); exit_idle: __current_set_polling(); /* * It is up to the idle functions to reenable local interrupts */ if (WARN_ON_ONCE(irqs_disabled())) local_irq_enable(); rcu_idle_exit(); start_critical_timings(); return; use_default: /* * We can't use the cpuidle framework, let's use the default * idle routine. */ if (current_clr_polling_and_test()) local_irq_enable(); else arch_cpu_idle(); goto exit_idle; }
/* avoid HT sibilings if possible */ if (cpumask_empty(tmp)) cpumask_andnot(tmp, cpu_online_mask, pad_busy_cpus); if (cpumask_empty(tmp)) { mutex_unlock(&round_robin_lock); return; } for_each_cpu(cpu, tmp) { if (cpu_weight[cpu] < min_weight) { min_weight = cpu_weight[cpu]; preferred_cpu = cpu; } } if (tsk_in_cpu[tsk_index] != -1) cpumask_clear_cpu(tsk_in_cpu[tsk_index], pad_busy_cpus); tsk_in_cpu[tsk_index] = preferred_cpu; cpumask_set_cpu(preferred_cpu, pad_busy_cpus); cpu_weight[preferred_cpu]++; mutex_unlock(&round_robin_lock); set_cpus_allowed_ptr(current, cpumask_of(preferred_cpu)); } static void exit_round_robin(unsigned int tsk_index) { struct cpumask *pad_busy_cpus = to_cpumask(pad_busy_cpus_bits); cpumask_clear_cpu(tsk_in_cpu[tsk_index], pad_busy_cpus); tsk_in_cpu[tsk_index] = -1; } static unsigned int idle_pct = 5; /* percentage */ static unsigned int round_robin_time = 1; /* second */ static int power_saving_thread(void *data) { struct sched_param param = {.sched_priority = 1}; int do_sleep; unsigned int tsk_index = (unsigned long)data; u64 last_jiffies = 0; sched_setscheduler(current, SCHED_RR, ¶m); while (!kthread_should_stop()) { unsigned long expire_time; try_to_freeze(); /* round robin to cpus */ expire_time = last_jiffies + round_robin_time * HZ; if (time_before(expire_time, jiffies)) { last_jiffies = jiffies; round_robin_cpu(tsk_index); } do_sleep = 0; expire_time = jiffies + HZ * (100 - idle_pct) / 100; while (!need_resched()) { if (tsc_detected_unstable && !tsc_marked_unstable) { /* TSC could halt in idle, so notify users */ mark_tsc_unstable("TSC halts in idle"); tsc_marked_unstable = 1; } local_irq_disable(); tick_broadcast_enable(); tick_broadcast_enter(); stop_critical_timings(); mwait_idle_with_hints(power_saving_mwait_eax, 1); start_critical_timings(); tick_broadcast_exit(); local_irq_enable(); if (time_before(expire_time, jiffies)) { do_sleep = 1; break; } } /* * current sched_rt has threshold for rt task running time. * When a rt task uses 95% CPU time, the rt thread will be * scheduled out for 5% CPU time to not starve other tasks. But * the mechanism only works when all CPUs have RT task running, * as if one CPU hasn't RT task, RT task from other CPUs will * borrow CPU time from this CPU and cause RT task use > 95% * CPU time. To make 'avoid starvation' work, takes a nap here. */ if (unlikely(do_sleep)) schedule_timeout_killable(HZ * idle_pct / 100); /* If an external event has set the need_resched flag, then * we need to deal with it, or this loop will continue to * spin without calling __mwait(). */ if (unlikely(need_resched())) schedule(); } exit_round_robin(tsk_index); return 0; } static struct task_struct *ps_tsks[NR_CPUS]; static unsigned int ps_tsk_num; static int create_power_saving_task(void) { int rc; ps_tsks[ps_tsk_num] = kthread_run(power_saving_thread, (void *)(unsigned long)ps_tsk_num, "acpi_pad/%d", ps_tsk_num); if (IS_ERR(ps_tsks[ps_tsk_num])) { rc = PTR_ERR(ps_tsks[ps_tsk_num]); ps_tsks[ps_tsk_num] = NULL; } else { rc = 0; ps_tsk_num++; } return rc; }
/* avoid HT sibilings if possible */ if (cpumask_empty(tmp)) cpumask_andnot(tmp, cpu_online_mask, pad_busy_cpus); if (cpumask_empty(tmp)) { mutex_unlock(&isolated_cpus_lock); return; } for_each_cpu(cpu, tmp) { if (cpu_weight[cpu] < min_weight) { min_weight = cpu_weight[cpu]; preferred_cpu = cpu; } } if (tsk_in_cpu[tsk_index] != -1) cpumask_clear_cpu(tsk_in_cpu[tsk_index], pad_busy_cpus); tsk_in_cpu[tsk_index] = preferred_cpu; cpumask_set_cpu(preferred_cpu, pad_busy_cpus); cpu_weight[preferred_cpu]++; mutex_unlock(&isolated_cpus_lock); set_cpus_allowed_ptr(current, cpumask_of(preferred_cpu)); } static void exit_round_robin(unsigned int tsk_index) { struct cpumask *pad_busy_cpus = to_cpumask(pad_busy_cpus_bits); cpumask_clear_cpu(tsk_in_cpu[tsk_index], pad_busy_cpus); tsk_in_cpu[tsk_index] = -1; } static unsigned int idle_pct = 5; /* percentage */ static unsigned int round_robin_time = 10; /* second */ static int power_saving_thread(void *data) { struct sched_param param = {.sched_priority = 1}; int do_sleep; unsigned int tsk_index = (unsigned long)data; u64 last_jiffies = 0; sched_setscheduler(current, SCHED_RR, ¶m); while (!kthread_should_stop()) { int cpu; u64 expire_time; try_to_freeze(); /* round robin to cpus */ if (last_jiffies + round_robin_time * HZ < jiffies) { last_jiffies = jiffies; round_robin_cpu(tsk_index); } do_sleep = 0; current_thread_info()->status &= ~TS_POLLING; /* * TS_POLLING-cleared state must be visible before we test * NEED_RESCHED: */ smp_mb(); expire_time = jiffies + HZ * (100 - idle_pct) / 100; while (!need_resched()) { local_irq_disable(); cpu = smp_processor_id(); clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); stop_critical_timings(); __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); if (!need_resched()) __mwait(power_saving_mwait_eax, 1); start_critical_timings(); clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu); local_irq_enable(); if (jiffies > expire_time) { do_sleep = 1; break; } } current_thread_info()->status |= TS_POLLING; /* * current sched_rt has threshold for rt task running time. * When a rt task uses 95% CPU time, the rt thread will be * scheduled out for 5% CPU time to not starve other tasks. But * the mechanism only works when all CPUs have RT task running, * as if one CPU hasn't RT task, RT task from other CPUs will * borrow CPU time from this CPU and cause RT task use > 95% * CPU time. To make 'avoid starvation' work, takes a nap here. */ if (do_sleep) schedule_timeout_killable(HZ * idle_pct / 100); } exit_round_robin(tsk_index); return 0; } static struct task_struct *ps_tsks[NR_CPUS]; static unsigned int ps_tsk_num; static int create_power_saving_task(void) { int rc = -ENOMEM; ps_tsks[ps_tsk_num] = kthread_run(power_saving_thread, (void *)(unsigned long)ps_tsk_num, "power_saving/%d", ps_tsk_num); rc = IS_ERR(ps_tsks[ps_tsk_num]) ? PTR_ERR(ps_tsks[ps_tsk_num]) : 0; if (!rc) ps_tsk_num++; else ps_tsks[ps_tsk_num] = NULL; return rc; }