static cputime64_t get_idle_time(int cpu)
{
	cputime64_t idle;

	idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE];
	if (cpu_online(cpu) && !nr_iowait_cpu(cpu))
		idle += arch_idle_time(cpu);
	return idle;
}
/*
 * Return a multiplier for the exit latency that is intended
 * to take performance requirements into account.
 * The more performance critical we estimate the system
 * to be, the higher this multiplier, and thus the higher
 * the barrier to go to an expensive C state.
 */
static inline int performance_multiplier(void)
{
	int mult = 1;

	/* for IO wait tasks (per cpu!) we add 10x each */
	mult += 10 * nr_iowait_cpu(smp_processor_id());

	return mult;
}
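As a worked example of the effect: with two tasks blocked in iowait on the local CPU the multiplier is 1 + 2 * 10 = 21, so a predicted idle period of 420 us only admits C states whose exit latency is at most 420 / 21 = 20 us. This is exactly the interactivity_req cap applied in menu_select() further below.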
static cputime64_t get_iowait_time(int cpu)
{
	cputime64_t iowait;

	iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
	if (cpu_online(cpu) && nr_iowait_cpu(cpu))
		iowait += arch_idle_time(cpu);
	return iowait;
}
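These two helpers are normally consumed by a /proc/stat-style reporter that sums the per-CPU values into system-wide totals. The lines below are a minimal sketch of such a caller, modelled loosely on fs/proc/stat.c; the surrounding show_stat() context is assumed and not part of this section:

	cputime64_t idle = 0, iowait = 0;
	int i;

	/* accumulate system-wide idle and iowait time over all CPUs */
	for_each_possible_cpu(i) {
		idle += get_idle_time(i);
		iowait += get_iowait_time(i);
	}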
static inline int performance_multiplier(void)
{
	int mult = 1;

	mult += 10 * nr_iowait_cpu(smp_processor_id());

	return mult;
}
/**
 * sched_get_nr_running_avg
 * @return: Average nr_running and iowait value since last poll.
 *          Returns the avg * 100 to return up to two decimal points
 *          of accuracy.
 *
 * Obtains the average nr_running value since the last poll.
 * This function may not be called concurrently with itself.
 */
void sched_get_nr_running_avg(int *avg, int *iowait_avg)
{
	int cpu;
	u64 curr_time;
	u64 diff_sgnra_last;
	u64 diff_last;
	u32 faultyclk_cpumask = 0;
	u64 tmp;

	*avg = 0;
	*iowait_avg = 0;

	/* read and reset nr_running counts */
	for_each_possible_cpu(cpu) {
		unsigned long flags;

		spin_lock_irqsave(&per_cpu(nr_lock, cpu), flags);
		curr_time = sched_clock();

		/* error handling for problematic clock violation */
		if (curr_time > per_cpu(sgnra_last_time, cpu) &&
		    curr_time >= per_cpu(last_time, cpu)) {
			diff_last = curr_time - per_cpu(last_time, cpu);
			diff_sgnra_last =
				curr_time - per_cpu(sgnra_last_time, cpu);

			tmp = per_cpu(nr, cpu) * diff_last;
			tmp += per_cpu(nr_prod_sum, cpu);
			*avg += (int)div64_u64(tmp * 100, diff_sgnra_last);

			tmp = nr_iowait_cpu(cpu) * diff_last;
			tmp += per_cpu(iowait_prod_sum, cpu);
			*iowait_avg +=
				(int)div64_u64(tmp * 100, diff_sgnra_last);
		} else {
			faultyclk_cpumask |= 1 << cpu;
			pr_warn("[%s]**** (curr_time %lld), (per_cpu(sgnra_last_time, %d), %lld), (per_cpu(last_time, %d), %lld)\n",
				__func__, curr_time,
				cpu, per_cpu(sgnra_last_time, cpu),
				cpu, per_cpu(last_time, cpu));
		}

		per_cpu(sgnra_last_time, cpu) = curr_time;
		per_cpu(last_time, cpu) = curr_time;
		per_cpu(nr_prod_sum, cpu) = 0;
		per_cpu(iowait_prod_sum, cpu) = 0;
		spin_unlock_irqrestore(&per_cpu(nr_lock, cpu), flags);
	}

	/* error handling for problematic clock violation */
	if (faultyclk_cpumask) {
		*avg = 0;
		*iowait_avg = 0;
		pr_warn("[%s]**** CPU (%d) clock may be unstable!!\n",
			__func__, faultyclk_cpumask);
		return;
	}

	WARN(*avg < 0, "[sched_get_nr_running_avg] avg:%d", *avg);
	WARN(*iowait_avg < 0,
	     "[sched_get_nr_running_avg] iowait_avg:%d", *iowait_avg);
}
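A typical consumer of sched_get_nr_running_avg() is a periodic policy (e.g. a core-control or hotplug driver) that polls the scaled averages. The sketch below only illustrates that pattern; the worker, the 20 ms period and the request_extra_cpu() hook are hypothetical names, not part of the code above:

static struct delayed_work nr_poll_work;

static void nr_poll_fn(struct work_struct *work)
{
	int avg, iowait_avg;

	/* both values are scaled by 100, e.g. 250 means 2.5 tasks on average */
	sched_get_nr_running_avg(&avg, &iowait_avg);

	/* hypothetical policy: ask for another core when runnable load is high */
	if (avg > 200 && iowait_avg < 50)
		request_extra_cpu();

	schedule_delayed_work(&nr_poll_work, msecs_to_jiffies(20));
}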
/*
 * Return a multiplier for the exit latency that is intended
 * to take performance requirements into account.
 * The more performance critical we estimate the system
 * to be, the higher this multiplier, and thus the higher
 * the barrier to go to an expensive C state.
 */
static inline int performance_multiplier(void)
{
	int mult = 1;

	if (tune_multiplier <= 1)
		return tune_multiplier;

	/* for higher loadavg, we are more reluctant */

	/* for IO wait tasks (per cpu!) we add 10x each */
	mult += 10 * nr_iowait_cpu(smp_processor_id());

	if (tune_multiplier != 1024)
		mult = (tune_multiplier * mult) / 1024;

	return mult;
}
/*
 * Updates the per cpu time idle statistics counters
 */
static void
update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now,
		     u64 *last_update_time)
{
	ktime_t delta;

	if (ts->idle_active) {
		delta = ktime_sub(now, ts->idle_entrytime);
		ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
		if (nr_iowait_cpu(cpu) > 0)
			ts->iowait_sleeptime =
				ktime_add(ts->iowait_sleeptime, delta);
		ts->idle_entrytime = now;
	}

	if (last_update_time)
		*last_update_time = ktime_to_us(now);
}
/*
 * Return a multiplier for the exit latency that is intended
 * to take performance requirements into account.
 * The more performance critical we estimate the system
 * to be, the higher this multiplier, and thus the higher
 * the barrier to go to an expensive C state.
 */
static inline int performance_multiplier(void)
{
	int mult = 1;

	/* for higher loadavg, we are more reluctant */
	/*
	 * This doesn't work as intended - it is almost always 0, but can
	 * sometimes, depending on workload, spike very high into the hundreds
	 * even when the average cpu load is under 10%.
	 */
	/* mult += 2 * get_loadavg(); */

	/* for IO wait tasks (per cpu!) we add 10x each */
	mult += 10 * nr_iowait_cpu(smp_processor_id());

	return mult;
}
static void
update_ts_time_stats_wo_cpuoffline(int cpu, struct tick_sched *ts,
				   ktime_t now, u64 *last_update_time)
{
	ktime_t delta;

	if (ts->idle_active && (!ts->cpu_plug_off_flag)) {
		delta = ktime_sub(now, ts->idle_entrytime_wo_cpuoffline);
		if (nr_iowait_cpu(cpu) > 0)
			ts->iowait_sleeptime_wo_cpuoffline =
				ktime_add(ts->iowait_sleeptime_wo_cpuoffline, delta);
		else
			ts->idle_sleeptime_wo_cpuoffline =
				ktime_add(ts->idle_sleeptime_wo_cpuoffline, delta);
		ts->idle_entrytime_wo_cpuoffline = now;
	}

	if (last_update_time)
		*last_update_time = ktime_to_us(now);
}
/**
 * sched_update_nr_prod
 * @cpu: The core id of the nr running driver.
 * @nr: Updated nr running value for cpu.
 * @inc: Whether we are increasing or decreasing the count
 * @return: N/A
 *
 * Update average with latest nr_running value for CPU
 */
void sched_update_nr_prod(int cpu, unsigned long nr_running, bool inc)
{
	int diff;
	s64 curr_time;
	unsigned long flags;

	spin_lock_irqsave(&per_cpu(nr_lock, cpu), flags);
	curr_time = sched_clock();
	diff = curr_time - per_cpu(last_time, cpu);
	per_cpu(last_time, cpu) = curr_time;
	per_cpu(nr, cpu) = nr_running + (inc ? 1 : -1);

	BUG_ON(per_cpu(nr, cpu) < 0);

	per_cpu(nr_prod_sum, cpu) += nr_running * diff;
	per_cpu(iowait_prod_sum, cpu) += nr_iowait_cpu(cpu) * diff;
	spin_unlock_irqrestore(&per_cpu(nr_lock, cpu), flags);
}
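On the producer side this helper is meant to be called from the scheduler's run-queue accounting, passing the pre-update nr_running together with the direction of the change. The hook points below are a sketch modelled on inc_nr_running()/dec_nr_running(); the exact placement is an assumption about the surrounding tree rather than code shown in this section:

static inline void inc_nr_running(struct rq *rq)
{
	/* record the old nr_running weighted by the time it was in effect */
	sched_update_nr_prod(cpu_of(rq), rq->nr_running, true);
	rq->nr_running++;
}

static inline void dec_nr_running(struct rq *rq)
{
	sched_update_nr_prod(cpu_of(rq), rq->nr_running, false);
	rq->nr_running--;
}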
static inline int which_bucket(unsigned int duration)
{
	int bucket = 0;

	if (nr_iowait_cpu(smp_processor_id()))
		bucket = BUCKETS/2;

	if (duration < 10)
		return bucket;
	if (duration < 100)
		return bucket + 1;
	if (duration < 1000)
		return bucket + 2;
	if (duration < 10000)
		return bucket + 3;
	if (duration < 100000)
		return bucket + 4;
	return bucket + 5;
}
/**
 * sched_get_nr_running_avg
 * @return: Average nr_running and iowait value since last poll.
 *          Returns the avg * 100 to return up to two decimal points
 *          of accuracy.
 *
 * Obtains the average nr_running value since the last poll.
 * This function may not be called concurrently with itself.
 */
void sched_get_nr_running_avg(int *avg, int *iowait_avg)
{
	int cpu;
	u64 curr_time = sched_clock();
	u64 diff = curr_time - last_get_time;
	u64 tmp_avg = 0, tmp_iowait = 0;

	*avg = 0;
	*iowait_avg = 0;

	if (!diff)
		return;

	last_get_time = curr_time;

	/* read and reset nr_running counts */
	for_each_possible_cpu(cpu) {
		unsigned long flags;

		spin_lock_irqsave(&per_cpu(nr_lock, cpu), flags);
		tmp_avg += per_cpu(nr_prod_sum, cpu);
		tmp_avg += per_cpu(nr, cpu) *
			(curr_time - per_cpu(last_time, cpu));
		tmp_iowait += per_cpu(iowait_prod_sum, cpu);
		tmp_iowait += nr_iowait_cpu(cpu) *
			(curr_time - per_cpu(last_time, cpu));
		per_cpu(last_time, cpu) = curr_time;
		per_cpu(nr_prod_sum, cpu) = 0;
		per_cpu(iowait_prod_sum, cpu) = 0;
		spin_unlock_irqrestore(&per_cpu(nr_lock, cpu), flags);
	}

	*avg = (int)div64_u64(tmp_avg * 100, diff);
	*iowait_avg = (int)div64_u64(tmp_iowait * 100, diff);

	BUG_ON(*avg < 0);
	pr_debug("%s - avg:%d\n", __func__, *avg);
	BUG_ON(*iowait_avg < 0);
	pr_debug("%s - iowait_avg:%d\n", __func__, *iowait_avg);
}
/**
 * sched_update_nr_prod
 * @cpu: The core id of the nr running driver.
 * @nr: Updated nr running value for cpu.
 * @inc: Whether we are increasing or decreasing the count
 * @return: N/A
 *
 * Update average with latest nr_running value for CPU
 */
void sched_update_nr_prod(int cpu, unsigned long nr_running, bool inc)
{
	u64 diff;
	u64 curr_time;
	unsigned long flags;

	spin_lock_irqsave(&per_cpu(nr_lock, cpu), flags);
	curr_time = sched_clock();
	diff = curr_time - per_cpu(last_time, cpu);

	/* skip this problematic clock violation */
	if (curr_time < per_cpu(last_time, cpu)) {
		pr_warn("[%s]**** CPU (%d) clock may be unstable!! (curr_time %lld) < (per_cpu(last_time, %d), %lld)\n",
			__func__, cpu, curr_time,
			cpu, per_cpu(last_time, cpu));
		spin_unlock_irqrestore(&per_cpu(nr_lock, cpu), flags);
		return;
	}

	BUG_ON(nr_running == 0 && inc == 0);

	per_cpu(last_time, cpu) = curr_time;
	per_cpu(nr, cpu) = nr_running + (inc ? 1 : -1);
	per_cpu(nr_prod_sum, cpu) += nr_running * diff;
	per_cpu(iowait_prod_sum, cpu) += nr_iowait_cpu(cpu) * diff;
	spin_unlock_irqrestore(&per_cpu(nr_lock, cpu), flags);
}
/**
 * get_cpu_iowait_time_us - get the total iowait time of a cpu
 * @cpu: CPU number to query
 * @last_update_time: variable to store update time in. Do not update
 * counters if NULL.
 *
 * Return the cumulative iowait time (since boot) for a given
 * CPU, in microseconds.
 *
 * This time is measured via accounting rather than sampling,
 * and is as accurate as ktime_get() is.
 *
 * This function returns -1 if NOHZ is not enabled.
 */
u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
{
	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
	ktime_t now, iowait;

	if (!tick_nohz_enabled)
		return -1;

	now = ktime_get();

	if (last_update_time) {
		update_ts_time_stats(cpu, ts, now, last_update_time);
		iowait = ts->iowait_sleeptime;
	} else {
		if (ts->idle_active && nr_iowait_cpu(cpu) > 0) {
			ktime_t delta = ktime_sub(now, ts->idle_entrytime);

			iowait = ktime_add(ts->iowait_sleeptime, delta);
		} else {
			iowait = ts->iowait_sleeptime;
		}
	}

	return ktime_to_us(iowait);
}
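Sampling cpufreq governors are typical callers: they read the idle and iowait counters twice and work with the deltas over a sampling window. The helper below is an illustrative sketch of that pattern; the function name and the io_is_busy flag are assumptions, and only get_cpu_idle_time_us()/get_cpu_iowait_time_us() come from the code in this section:

static unsigned int estimate_cpu_load(int cpu, u64 *prev_wall, u64 *prev_idle,
				      u64 *prev_iowait, bool io_is_busy)
{
	u64 wall, idle, iowait;
	u64 wall_d, idle_d, iowait_d;

	idle = get_cpu_idle_time_us(cpu, &wall);	/* also stores "now" in us */
	iowait = get_cpu_iowait_time_us(cpu, NULL);

	wall_d = wall - *prev_wall;
	idle_d = idle - *prev_idle;
	iowait_d = iowait - *prev_iowait;

	*prev_wall = wall;
	*prev_idle = idle;
	*prev_iowait = iowait;

	/* optionally count time spent waiting on IO as busy time */
	if (io_is_busy && idle_d >= iowait_d)
		idle_d -= iowait_d;

	if (!wall_d || wall_d < idle_d)
		return 0;

	return (unsigned int)div64_u64(100 * (wall_d - idle_d), wall_d);
}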
/* ic, extend hcube helper functionalities */
int __ref op_is_online_idle_core(unsigned int cpu)
{
	int ret = idle_cpu(cpu) && !nr_iowait_cpu(cpu);

	return ret;
}
/**
 * sched_get_nr_running_avg
 * @return: Average nr_running and iowait value since last poll.
 *          Returns the avg * 100 to return up to two decimal points
 *          of accuracy.
 *
 * Obtains the average nr_running value since the last poll.
 * This function may not be called concurrently with itself.
 */
void sched_get_nr_running_avg(int *avg, int *iowait_avg)
{
	int cpu;
	u64 curr_time = sched_clock();
	s64 diff = (s64)(curr_time - last_get_time);
	u64 tmp_avg = 0, tmp_iowait = 0;
	bool clk_faulty = 0;
	u32 cpumask = 0;

	*avg = 0;
	*iowait_avg = 0;

	if (!diff)
		return;

	WARN(diff < 0, "[sched_get_nr_running_avg] time last:%lld curr:%llu ",
	     last_get_time, curr_time);

	last_get_time = curr_time;

	/* read and reset nr_running counts */
	for_each_possible_cpu(cpu) {
		unsigned long flags;

		spin_lock_irqsave(&per_cpu(nr_lock, cpu), flags);

		/* error handling for problematic clock violation */
		if ((s64)(curr_time - per_cpu(last_time, cpu)) < 0) {
			clk_faulty = 1;
			cpumask |= 1 << cpu;
		}

		tmp_avg += per_cpu(nr_prod_sum, cpu);
		tmp_avg += per_cpu(nr, cpu) *
			(curr_time - per_cpu(last_time, cpu));
		tmp_iowait += per_cpu(iowait_prod_sum, cpu);
		tmp_iowait += nr_iowait_cpu(cpu) *
			(curr_time - per_cpu(last_time, cpu));
		per_cpu(last_time, cpu) = curr_time;
		per_cpu(nr_prod_sum, cpu) = 0;
		per_cpu(iowait_prod_sum, cpu) = 0;
		spin_unlock_irqrestore(&per_cpu(nr_lock, cpu), flags);
	}

	/* error handling for problematic clock violation */
	if (clk_faulty) {
		*avg = 0;
		*iowait_avg = 0;
		pr_warn("[%s] **** CPU (0x%08x) clock may be unstable!!\n",
			__func__, cpumask);
		return;
	}

	*avg = (int)div64_u64(tmp_avg * 100, (u64)diff);
	*iowait_avg = (int)div64_u64(tmp_iowait * 100, (u64)diff);

	WARN(*avg < 0,
	     "[sched_get_nr_running_avg] avg:%d time last:%llu curr:%llu ",
	     *avg, last_get_time, curr_time);
	pr_debug("[%s] avg:%d\n", __func__, *avg);
	WARN(*iowait_avg < 0,
	     "[sched_get_nr_running_avg] iowait_avg:%d time last:%llu curr:%llu ",
	     *iowait_avg, last_get_time, curr_time);
	pr_debug("[%s] iowait_avg:%d\n", __func__, *iowait_avg);
}
/**
 * menu_select - selects the next idle state to enter
 * @drv: cpuidle driver containing state data
 * @dev: the CPU
 */
static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
{
	struct menu_device *data = this_cpu_ptr(&menu_devices);
	int latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY);
	int i;
	unsigned int interactivity_req;
	int repeat = 0, low_predicted = 0;
	int cpu = smp_processor_id();
	struct hrtimer *hrtmr = &per_cpu(menu_hrtimer, cpu);
	unsigned long nr_iowaiters;

	if (data->needs_update) {
		menu_update(drv, dev);
		data->needs_update = 0;
	}

	data->last_state_idx = 0;

	/* Special case when user has set very strict latency requirement */
	if (unlikely(latency_req == 0))
		return 0;

	/* determine the expected residency time, round up */
	data->next_timer_us = ktime_to_us(tick_nohz_get_sleep_length());

	nr_iowaiters = nr_iowait_cpu(smp_processor_id());
	data->bucket = which_bucket(data->next_timer_us, nr_iowaiters);

	/*
	 * if the correction factor is 0 (eg first time init or cpu hotplug
	 * etc), we actually want to start out with a unity factor.
	 */
	if (data->correction_factor[data->bucket] == 0)
		data->correction_factor[data->bucket] = RESOLUTION * DECAY;

	/* Make sure to round up for half microseconds */
#ifdef CONFIG_SKIP_IDLE_CORRELATION
	if (dev->skip_idle_correlation)
		data->predicted_us = data->next_timer_us;
	else
#endif
	data->predicted_us = div_round64(data->next_timer_us *
					 data->correction_factor[data->bucket],
					 RESOLUTION * DECAY);

	/* This patch is not checked */
#ifndef CONFIG_CPU_THERMAL_IPA
	repeat = get_typical_interval(data);
#else
	/*
	 * HACK - Ignore repeating patterns when we're
	 * forecasting a very large idle period.
	 */
	if (data->predicted_us < MAX_INTERESTING)
		repeat = get_typical_interval(data);
#endif

	/*
	 * Performance multiplier defines a minimum predicted idle
	 * duration / latency ratio. Adjust the latency limit if
	 * necessary.
	 */
	interactivity_req = data->predicted_us /
		performance_multiplier(nr_iowaiters);
	if (latency_req > interactivity_req)
		latency_req = interactivity_req;

	/*
	 * We want to default to C1 (hlt), not to busy polling
	 * unless the timer is happening really really soon.
	 */
	if (data->next_timer_us > 5 &&
	    !drv->states[CPUIDLE_DRIVER_STATE_START].disabled &&
	    dev->states_usage[CPUIDLE_DRIVER_STATE_START].disable == 0)
		data->last_state_idx = CPUIDLE_DRIVER_STATE_START;

	/*
	 * Find the idle state with the lowest power while satisfying
	 * our constraints.
	 */
	for (i = CPUIDLE_DRIVER_STATE_START; i < drv->state_count; i++) {
		struct cpuidle_state *s = &drv->states[i];
		struct cpuidle_state_usage *su = &dev->states_usage[i];

		if (s->disabled || su->disable)
			continue;
		if (s->target_residency > data->predicted_us) {
			low_predicted = 1;
			continue;
		}
		if (s->exit_latency > latency_req)
			continue;

		data->last_state_idx = i;
	}

	/* not deepest C-state chosen for low predicted residency */
	if (low_predicted) {
		unsigned int timer_us = 0;
		unsigned int perfect_us = 0;

		/*
		 * Set a timer to detect whether this sleep is much
		 * longer than repeat mode predicted. If the timer
		 * triggers, the code will evaluate whether to put
		 * the CPU into a deeper C-state.
		 * The timer is cancelled on CPU wakeup.
		 */
		timer_us = 2 * (data->predicted_us + MAX_DEVIATION);
		perfect_us = perfect_cstate_ms * 1000;

		if (repeat && (4 * timer_us < data->next_timer_us)) {
			RCU_NONIDLE(hrtimer_start(hrtmr,
				ns_to_ktime(1000 * timer_us),
				HRTIMER_MODE_REL_PINNED));
			/* In repeat case, menu hrtimer is started */
			per_cpu(hrtimer_status, cpu) = MENU_HRTIMER_REPEAT;
		} else if (perfect_us < data->next_timer_us) {
			/*
			 * The next timer is long. This could be because
			 * we did not make a useful prediction.
			 * In that case, it makes sense to re-enter
			 * into a deeper C-state after some time.
			 */
			RCU_NONIDLE(hrtimer_start(hrtmr,
				ns_to_ktime(1000 * timer_us),
				HRTIMER_MODE_REL_PINNED));
			/* In general case, menu hrtimer is started */
			per_cpu(hrtimer_status, cpu) = MENU_HRTIMER_GENERAL;
		}
	}

	return data->last_state_idx;
}