/* * For per-cpu interrupts, we need to avoid unmasking any interrupts * that we disabled via disable_percpu_irq(). */ static void tile_irq_chip_eoi(struct irq_data *d) { if (!(__get_cpu_var(irq_disable_mask) & (1UL << d->irq))) unmask_irqs(1UL << d->irq); }
/* * High resolution timer interrupt * Called with interrupts disabled */ void hrtimer_interrupt(struct clock_event_device *dev) { struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); ktime_t expires_next, now, entry_time, delta; int i, retries = 0; BUG_ON(!cpu_base->hres_active); cpu_base->nr_events++; dev->next_event.tv64 = KTIME_MAX; entry_time = now = ktime_get(); retry: sec_debug_aux_log(SEC_DEBUG_AUXLOG_HRTIMER_CHANGE, "hrtimer_interrupt now:%lld, retry:%d", now.tv64, retries); expires_next.tv64 = KTIME_MAX; raw_spin_lock(&cpu_base->lock); /* * We set expires_next to KTIME_MAX here with cpu_base->lock * held to prevent that a timer is enqueued in our queue via * the migration code. This does not affect enqueueing of * timers which run their callback and need to be requeued on * this CPU. */ cpu_base->expires_next.tv64 = KTIME_MAX; for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { struct hrtimer_clock_base *base; struct timerqueue_node *node; ktime_t basenow; if (!(cpu_base->active_bases & (1 << i))) continue; base = cpu_base->clock_base + i; basenow = ktime_add(now, base->offset); while ((node = timerqueue_getnext(&base->active))) { struct hrtimer *timer; timer = container_of(node, struct hrtimer, node); /* * The immediate goal for using the softexpires is * minimizing wakeups, not running timers at the * earliest interrupt after their soft expiration. * This allows us to avoid using a Priority Search * Tree, which can answer a stabbing querry for * overlapping intervals and instead use the simple * BST we already have. * We don't add extra wakeups by delaying timers that * are right-of a not yet expired timer, because that * timer will have to trigger a wakeup anyway. */ sec_debug_hrtimer_log(timer, &basenow.tv64, timer->function, 0); sec_debug_aux_log(SEC_DEBUG_AUXLOG_HRTIMER_CHANGE, "i:%d, now:%lld, basenow:%lld, _softexpire:%lld", i, now.tv64, basenow.tv64, hrtimer_get_softexpires_tv64(timer)); if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) { ktime_t expires; expires = ktime_sub(hrtimer_get_expires(timer), base->offset); if (expires.tv64 < expires_next.tv64) expires_next = expires; break; } __run_hrtimer(timer, &basenow); } } /* * Store the new expiry value so the migration code can verify * against it. */ cpu_base->expires_next = expires_next; raw_spin_unlock(&cpu_base->lock); /* Reprogramming necessary ? */ if (expires_next.tv64 == KTIME_MAX || !tick_program_event(expires_next, 0)) { cpu_base->hang_detected = 0; sec_debug_aux_log(SEC_DEBUG_AUXLOG_HRTIMER_CHANGE, "hrtimer_interrupt exit now:%lld, retry:%d", now.tv64, retries); return; } /* * The next timer was already expired due to: * - tracing * - long lasting callbacks * - being scheduled away when running in a VM * * We need to prevent that we loop forever in the hrtimer * interrupt routine. We give it 3 attempts to avoid * overreacting on some spurious event. */ now = ktime_get(); cpu_base->nr_retries++; if (++retries < 3) goto retry; /* * Give the system a chance to do something else than looping * here. We stored the entry time, so we know exactly how long * we spent here. We schedule the next event this amount of * time away. */ cpu_base->nr_hangs++; cpu_base->hang_detected = 1; delta = ktime_sub(now, entry_time); if (delta.tv64 > cpu_base->max_hang_time.tv64) cpu_base->max_hang_time = delta; /* * Limit it to a sensible value as we enforce a longer * delay. Give the CPU at least 100ms to catch up. */ if (delta.tv64 > 100 * NSEC_PER_MSEC) expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC); else expires_next = ktime_add(now, delta); tick_program_event(expires_next, 1); printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta)); }
static void tick_nohz_stop_sched_tick(struct tick_sched *ts) { unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; unsigned long rcu_delta_jiffies; ktime_t last_update, expires, now; struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; u64 time_delta; int cpu; cpu = smp_processor_id(); ts = &per_cpu(tick_cpu_sched, cpu); now = tick_nohz_start_idle(cpu, ts); /* * If this cpu is offline and it is the one which updates * jiffies, then give up the assignment and let it be taken by * the cpu which runs the tick timer next. If we don't drop * this here the jiffies might be stale and do_timer() never * invoked. */ if (unlikely(!cpu_online(cpu))) { if (cpu == tick_do_timer_cpu) tick_do_timer_cpu = TICK_DO_TIMER_NONE; } if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) { return; } if (need_resched()) return; if (unlikely(local_softirq_pending() && cpu_online(cpu))) { static int ratelimit; if (ratelimit < 10) { printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", (unsigned int) local_softirq_pending()); ratelimit++; } return; } ts->idle_calls++; /* Read jiffies and the time when jiffies were updated last */ do { seq = read_seqbegin(&xtime_lock); last_update = last_jiffies_update; last_jiffies = jiffies; time_delta = timekeeping_max_deferment(); } while (read_seqretry(&xtime_lock, seq)); if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) || arch_needs_cpu(cpu)) { next_jiffies = last_jiffies + 1; delta_jiffies = 1; } else { /* Get the next timer wheel timer */ next_jiffies = get_next_timer_interrupt(last_jiffies); delta_jiffies = next_jiffies - last_jiffies; if (rcu_delta_jiffies < delta_jiffies) { next_jiffies = last_jiffies + rcu_delta_jiffies; delta_jiffies = rcu_delta_jiffies; } } /* * Do not stop the tick, if we are only one off * or if the cpu is required for rcu */ if (!ts->tick_stopped && delta_jiffies == 1) goto out; /* Schedule the tick, if we are at least one jiffie off */ if ((long)delta_jiffies >= 1) { /* * If this cpu is the one which updates jiffies, then * give up the assignment and let it be taken by the * cpu which runs the tick timer next, which might be * this cpu as well. If we don't drop this here the * jiffies might be stale and do_timer() never * invoked. Keep track of the fact that it was the one * which had the do_timer() duty last. If this cpu is * the one which had the do_timer() duty last, we * limit the sleep time to the timekeeping * max_deferement value which we retrieved * above. Otherwise we can sleep as long as we want. */ if (cpu == tick_do_timer_cpu) { tick_do_timer_cpu = TICK_DO_TIMER_NONE; ts->do_timer_last = 1; } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) { time_delta = KTIME_MAX; ts->do_timer_last = 0; } else if (!ts->do_timer_last) { time_delta = KTIME_MAX; } /* * calculate the expiry time for the next timer wheel * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals * that there is no timer pending or at least extremely * far into the future (12 days for HZ=1000). In this * case we set the expiry to the end of time. */ if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) { /* * Calculate the time delta for the next timer event. * If the time delta exceeds the maximum time delta * permitted by the current clocksource then adjust * the time delta accordingly to ensure the * clocksource does not wrap. */ time_delta = min_t(u64, time_delta, tick_period.tv64 * delta_jiffies); } if (time_delta < KTIME_MAX) expires = ktime_add_ns(last_update, time_delta); else expires.tv64 = KTIME_MAX; /* Skip reprogram of event if its not changed */ if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) goto out; /* * nohz_stop_sched_tick can be called several times before * the nohz_restart_sched_tick is called. This happens when * interrupts arrive which do not cause a reschedule. In the * first call we save the current tick time, so we can restart * the scheduler tick in nohz_restart_sched_tick. */ if (!ts->tick_stopped) { select_nohz_load_balancer(1); calc_load_enter_idle(); ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; ts->idle_jiffies = last_jiffies; } ts->idle_sleeps++; /* Mark expires */ ts->idle_expires = expires; /* * If the expiration time == KTIME_MAX, then * in this case we simply stop the tick timer. */ if (unlikely(expires.tv64 == KTIME_MAX)) { if (ts->nohz_mode == NOHZ_MODE_HIGHRES) hrtimer_cancel(&ts->sched_timer); goto out; } if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { hrtimer_start(&ts->sched_timer, expires, HRTIMER_MODE_ABS_PINNED); /* Check, if the timer was already in the past */ if (hrtimer_active(&ts->sched_timer)) goto out; } else if (!tick_program_event(expires, 0)) goto out; /* * We are past the event already. So we crossed a * jiffie boundary. Update jiffies and raise the * softirq. */ tick_do_update_jiffies64(ktime_get()); } raise_softirq_irqoff(TIMER_SOFTIRQ); out: ts->next_jiffies = next_jiffies; ts->last_jiffies = last_jiffies; }
enum paravirt_lazy_mode paravirt_get_lazy_mode(void) { return __get_cpu_var(paravirt_lazy_mode); }
/* * balance_dirty_pages() must be called by processes which are generating dirty * data. It looks at the number of dirty pages in the machine and will force * the caller to perform writeback if the system is over `vm_dirty_ratio'. * If we're over `background_thresh' then the writeback threads are woken to * perform some writeout. */ static void balance_dirty_pages(struct address_space *mapping, unsigned long write_chunk) { long nr_reclaimable, bdi_nr_reclaimable; long nr_writeback, bdi_nr_writeback; unsigned long background_thresh; unsigned long dirty_thresh; unsigned long bdi_thresh; unsigned long pages_written = 0; unsigned long pause = 1; bool dirty_exceeded = false; struct backing_dev_info *bdi = mapping->backing_dev_info; for (;;) { struct writeback_control wbc = { .sync_mode = WB_SYNC_NONE, .older_than_this = NULL, .nr_to_write = write_chunk, .range_cyclic = 1, }; nr_reclaimable = global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS); nr_writeback = global_page_state(NR_WRITEBACK); global_dirty_limits(&background_thresh, &dirty_thresh); /* * Throttle it only when the background writeback cannot * catch-up. This avoids (excessively) small writeouts * when the bdi limits are ramping up. */ if (nr_reclaimable + nr_writeback <= (background_thresh + dirty_thresh) / 2) break; bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); bdi_thresh = task_dirty_limit(current, bdi_thresh); /* * In order to avoid the stacked BDI deadlock we need * to ensure we accurately count the 'dirty' pages when * the threshold is low. * * Otherwise it would be possible to get thresh+n pages * reported dirty, even though there are thresh-m pages * actually dirty; with m+n sitting in the percpu * deltas. */ if (bdi_thresh < 2*bdi_stat_error(bdi)) { bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); } else { bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); } /* * The bdi thresh is somehow "soft" limit derived from the * global "hard" limit. The former helps to prevent heavy IO * bdi or process from holding back light ones; The latter is * the last resort safeguard. */ dirty_exceeded = (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh) || (nr_reclaimable + nr_writeback > dirty_thresh); if (!dirty_exceeded) break; if (!bdi->dirty_exceeded) bdi->dirty_exceeded = 1; /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. * Unstable writes are a feature of certain networked * filesystems (i.e. NFS) in which data may have been * written to the server's write cache, but has not yet * been flushed to permanent storage. * Only move pages to writeback if this bdi is over its * threshold otherwise wait until the disk writes catch * up. */ trace_wbc_balance_dirty_start(&wbc, bdi); if (bdi_nr_reclaimable > bdi_thresh) { writeback_inodes_wb(&bdi->wb, &wbc); pages_written += write_chunk - wbc.nr_to_write; trace_wbc_balance_dirty_written(&wbc, bdi); if (pages_written >= write_chunk) break; /* We've done our duty */ } trace_wbc_balance_dirty_wait(&wbc, bdi); __set_current_state(TASK_UNINTERRUPTIBLE); io_schedule_timeout(pause); /* * Increase the delay for each loop, up to our previous * default of taking a 100ms nap. */ pause <<= 1; if (pause > HZ / 10) pause = HZ / 10; } if (!dirty_exceeded && bdi->dirty_exceeded) bdi->dirty_exceeded = 0; if (writeback_in_progress(bdi)) return; /* * In laptop mode, we wait until hitting the higher threshold before * starting background writeout, and then write out all the way down * to the lower threshold. So slow writers cause minimal disk activity. * * In normal mode, we start background writeout at the lower * background_thresh, to keep the amount of dirty memory low. */ if ((laptop_mode && pages_written) || (!laptop_mode && (nr_reclaimable > background_thresh))) bdi_start_background_writeback(bdi); } void set_page_dirty_balance(struct page *page, int page_mkwrite) { if (set_page_dirty(page) || page_mkwrite) { struct address_space *mapping = page_mapping(page); if (mapping) balance_dirty_pages_ratelimited(mapping); } } static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0; /** * balance_dirty_pages_ratelimited_nr - balance dirty memory state * @mapping: address_space which was dirtied * @nr_pages_dirtied: number of pages which the caller has just dirtied * * Processes which are dirtying memory should call in here once for each page * which was newly dirtied. The function will periodically check the system's * dirty state and will initiate writeback if needed. * * On really big machines, get_writeback_state is expensive, so try to avoid * calling it too often (ratelimiting). But once we're over the dirty memory * limit we decrease the ratelimiting by a lot, to prevent individual processes * from overshooting the limit by (ratelimit_pages) each. */ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, unsigned long nr_pages_dirtied) { unsigned long ratelimit; unsigned long *p; ratelimit = ratelimit_pages; if (mapping->backing_dev_info->dirty_exceeded) ratelimit = 8; /* * Check the rate limiting. Also, we do not want to throttle real-time * tasks in balance_dirty_pages(). Period. */ preempt_disable(); p = &__get_cpu_var(bdp_ratelimits); *p += nr_pages_dirtied; if (unlikely(*p >= ratelimit)) { ratelimit = sync_writeback_pages(*p); *p = 0; preempt_enable(); balance_dirty_pages(mapping, ratelimit); return; } preempt_enable(); }
/** * tick_program_event */ int tick_program_event(ktime_t expires, int force) { struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; return __tick_program_event(dev, expires, force); }
bool kmemcheck_active(struct pt_regs *regs) { struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); return data->balance > 0; }
/* * Put CPU in low power mode. */ void arch_idle(void) { bool allow[MSM_PM_SLEEP_MODE_NR]; uint32_t sleep_limit = SLEEP_LIMIT_NONE; int64_t timer_expiration; int latency_qos; int ret; int i; unsigned int cpu; int64_t t1; static DEFINE_PER_CPU(int64_t, t2); int exit_stat; if (!atomic_read(&msm_pm_init_done)) return; cpu = smp_processor_id(); latency_qos = pm_qos_request(PM_QOS_CPU_DMA_LATENCY); /* get the next timer expiration */ timer_expiration = ktime_to_ns(tick_nohz_get_sleep_length()); t1 = ktime_to_ns(ktime_get()); msm_pm_add_stat(MSM_PM_STAT_NOT_IDLE, t1 - __get_cpu_var(t2)); msm_pm_add_stat(MSM_PM_STAT_REQUESTED_IDLE, timer_expiration); exit_stat = MSM_PM_STAT_IDLE_SPIN; for (i = 0; i < ARRAY_SIZE(allow); i++) allow[i] = true; if (num_online_cpus() > 1 || (timer_expiration < msm_pm_idle_sleep_min_time) || !msm_pm_irq_extns->idle_sleep_allowed()) { allow[MSM_PM_SLEEP_MODE_POWER_COLLAPSE] = false; allow[MSM_PM_SLEEP_MODE_POWER_COLLAPSE_NO_XO_SHUTDOWN] = false; } for (i = 0; i < ARRAY_SIZE(allow); i++) { struct msm_pm_platform_data *mode = &msm_pm_modes[MSM_PM_MODE(cpu, i)]; if (!mode->idle_supported || !mode->idle_enabled || mode->latency >= latency_qos || mode->residency * 1000ULL >= timer_expiration) allow[i] = false; } if (allow[MSM_PM_SLEEP_MODE_POWER_COLLAPSE] || allow[MSM_PM_SLEEP_MODE_POWER_COLLAPSE_NO_XO_SHUTDOWN]) { uint32_t wait_us = CONFIG_MSM_IDLE_WAIT_ON_MODEM; while (msm_pm_modem_busy() && wait_us) { if (wait_us > 100) { udelay(100); wait_us -= 100; } else { udelay(wait_us); wait_us = 0; } } if (msm_pm_modem_busy()) { allow[MSM_PM_SLEEP_MODE_POWER_COLLAPSE] = false; allow[MSM_PM_SLEEP_MODE_POWER_COLLAPSE_NO_XO_SHUTDOWN] = false; } } MSM_PM_DPRINTK(MSM_PM_DEBUG_IDLE, KERN_INFO, "%s(): latency qos %d, next timer %lld, sleep limit %u\n", __func__, latency_qos, timer_expiration, sleep_limit); for (i = 0; i < ARRAY_SIZE(allow); i++) MSM_PM_DPRINTK(MSM_PM_DEBUG_IDLE, KERN_INFO, "%s(): allow %s: %d\n", __func__, msm_pm_sleep_mode_labels[i], (int)allow[i]); if (allow[MSM_PM_SLEEP_MODE_POWER_COLLAPSE] || allow[MSM_PM_SLEEP_MODE_POWER_COLLAPSE_NO_XO_SHUTDOWN]) { /* Sync the timer with SCLK, it is needed only for modem * assissted pollapse case. */ int64_t next_timer_exp = msm_timer_enter_idle(); uint32_t sleep_delay; bool low_power = false; sleep_delay = (uint32_t) msm_pm_convert_and_cap_time( next_timer_exp, MSM_PM_SLEEP_TICK_LIMIT); if (sleep_delay == 0) /* 0 would mean infinite time */ sleep_delay = 1; if (!allow[MSM_PM_SLEEP_MODE_POWER_COLLAPSE]) sleep_limit = SLEEP_LIMIT_NO_TCXO_SHUTDOWN; #if defined(CONFIG_MSM_MEMORY_LOW_POWER_MODE_IDLE_ACTIVE) sleep_limit |= SLEEP_RESOURCE_MEMORY_BIT1; #elif defined(CONFIG_MSM_MEMORY_LOW_POWER_MODE_IDLE_RETENTION) sleep_limit |= SLEEP_RESOURCE_MEMORY_BIT0; #endif ret = msm_pm_power_collapse(true, sleep_delay, sleep_limit); low_power = (ret != -EBUSY && ret != -ETIMEDOUT); msm_timer_exit_idle(low_power); if (ret) exit_stat = MSM_PM_STAT_IDLE_FAILED_POWER_COLLAPSE; else { exit_stat = MSM_PM_STAT_IDLE_POWER_COLLAPSE; msm_pm_sleep_limit = sleep_limit; } } else if (allow[MSM_PM_SLEEP_MODE_POWER_COLLAPSE_STANDALONE]) { ret = msm_pm_power_collapse_standalone(true); exit_stat = ret ? MSM_PM_STAT_IDLE_FAILED_STANDALONE_POWER_COLLAPSE : MSM_PM_STAT_IDLE_STANDALONE_POWER_COLLAPSE; } else if (allow[MSM_PM_SLEEP_MODE_RAMP_DOWN_AND_WAIT_FOR_INTERRUPT]) { ret = msm_pm_swfi(true); if (ret) while (!msm_pm_irq_extns->irq_pending()) udelay(1); exit_stat = ret ? MSM_PM_STAT_IDLE_SPIN : MSM_PM_STAT_IDLE_WFI; } else if (allow[MSM_PM_SLEEP_MODE_WAIT_FOR_INTERRUPT]) { msm_pm_swfi(false); exit_stat = MSM_PM_STAT_IDLE_WFI; } else { while (!msm_pm_irq_extns->irq_pending()) udelay(1); exit_stat = MSM_PM_STAT_IDLE_SPIN; } __get_cpu_var(t2) = ktime_to_ns(ktime_get()); msm_pm_add_stat(exit_stat, __get_cpu_var(t2) - t1); }
int msm_spm_set_low_power_mode(unsigned int mode, bool notify_rpm) { struct msm_spm_device *dev = &__get_cpu_var(msm_cpu_spm_device); return msm_spm_dev_set_low_power_mode(dev, mode, notify_rpm); }
/** * tick_nohz_get_sleep_length - return the length of the current sleep * * Called from power state control code with interrupts disabled */ ktime_t tick_nohz_get_sleep_length(void) { struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); return ts->sleep_length; }
/* * Async notification about clock event changes */ void tick_oneshot_notify(void) { struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); set_bit(0, &ts->check_clocks); }
void xen_setup_cpu_clockevents(void) { BUG_ON(preemptible()); clockevents_register_device(&__get_cpu_var(xen_clock_events)); }
static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) { struct menu_device *data = &__get_cpu_var(menu_devices); int latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY); int i; int multiplier; struct timespec t; int repeat = 0, low_predicted = 0; int cpu = smp_processor_id(); struct hrtimer *hrtmr = &per_cpu(menu_hrtimer, cpu); if (data->needs_update) { menu_update(drv, dev); data->needs_update = 0; } data->last_state_idx = 0; data->exit_us = 0; if (unlikely(latency_req == 0)) return 0; t = ktime_to_timespec(tick_nohz_get_sleep_length()); data->expected_us = t.tv_sec * USEC_PER_SEC + t.tv_nsec / NSEC_PER_USEC; data->bucket = which_bucket(data->expected_us); multiplier = performance_multiplier(); if (data->correction_factor[data->bucket] == 0) data->correction_factor[data->bucket] = RESOLUTION * DECAY; data->predicted_us = div_round64(data->expected_us * data->correction_factor[data->bucket], RESOLUTION * DECAY); repeat = get_typical_interval(data); if (data->expected_us > 5 && dev->states_usage[CPUIDLE_DRIVER_STATE_START].disable == 0) data->last_state_idx = CPUIDLE_DRIVER_STATE_START; for (i = CPUIDLE_DRIVER_STATE_START; i < drv->state_count; i++) { struct cpuidle_state *s = &drv->states[i]; struct cpuidle_state_usage *su = &dev->states_usage[i]; if (su->disable) continue; if (s->target_residency > data->predicted_us) { low_predicted = 1; continue; } if (s->exit_latency > latency_req) continue; if (s->exit_latency * multiplier > data->predicted_us) continue; data->last_state_idx = i; data->exit_us = s->exit_latency; } /* not deepest C-state chosen for low predicted residency */ if (low_predicted) { unsigned int timer_us = 0; unsigned int perfect_us = 0; /* * Set a timer to detect whether this sleep is much * longer than repeat mode predicted. If the timer * triggers, the code will evaluate whether to put * the CPU into a deeper C-state. * The timer is cancelled on CPU wakeup. */ timer_us = 2 * (data->predicted_us + MAX_DEVIATION); perfect_us = perfect_cstate_ms * 1000; if (repeat && (4 * timer_us < data->expected_us)) { hrtimer_start(hrtmr, ns_to_ktime(1000 * timer_us), HRTIMER_MODE_REL_PINNED); /* In repeat case, menu hrtimer is started */ per_cpu(hrtimer_status, cpu) = MENU_HRTIMER_REPEAT; } else if (perfect_us < data->expected_us) { /* * The next timer is long. This could be because * we did not make a useful prediction. * In that case, it makes sense to re-enter * into a deeper C-state after some time. */ hrtimer_start(hrtmr, ns_to_ktime(1000 * timer_us), HRTIMER_MODE_REL_PINNED); /* In general case, menu hrtimer is started */ per_cpu(hrtimer_status, cpu) = MENU_HRTIMER_GENERAL; } } return data->last_state_idx; }
/* * The interrupt handling path, implemented in terms of HV interrupt * emulation on TILEPro, and IPI hardware on TILE-Gx. * Entered with interrupts disabled. */ void tile_dev_intr(struct pt_regs *regs, int intnum) { int depth = __get_cpu_var(irq_depth)++; unsigned long original_irqs; unsigned long remaining_irqs; struct pt_regs *old_regs; #if CHIP_HAS_IPI() /* * Pending interrupts are listed in an SPR. We might be * nested, so be sure to only handle irqs that weren't already * masked by a previous interrupt. Then, mask out the ones * we're going to handle. */ unsigned long masked = __insn_mfspr(SPR_IPI_MASK_K); original_irqs = __insn_mfspr(SPR_IPI_EVENT_K) & ~masked; __insn_mtspr(SPR_IPI_MASK_SET_K, original_irqs); #else /* * Hypervisor performs the equivalent of the Gx code above and * then puts the pending interrupt mask into a system save reg * for us to find. */ original_irqs = __insn_mfspr(SPR_SYSTEM_SAVE_K_3); #endif remaining_irqs = original_irqs; /* Track time spent here in an interrupt context. */ old_regs = set_irq_regs(regs); irq_enter(); #ifdef CONFIG_DEBUG_STACKOVERFLOW /* Debugging check for stack overflow: less than 1/8th stack free? */ { long sp = stack_pointer - (long) current_thread_info(); if (unlikely(sp < (sizeof(struct thread_info) + STACK_WARN))) { pr_emerg("tile_dev_intr: " "stack overflow: %ld\n", sp - sizeof(struct thread_info)); dump_stack(); } } #endif while (remaining_irqs) { unsigned long irq = __ffs(remaining_irqs); remaining_irqs &= ~(1UL << irq); /* Count device irqs; Linux IPIs are counted elsewhere. */ if (irq != IRQ_RESCHEDULE) __get_cpu_var(irq_stat).irq_dev_intr_count++; generic_handle_irq(irq); } /* * If we weren't nested, turn on all enabled interrupts, * including any that were reenabled during interrupt * handling. */ if (depth == 0) unmask_irqs(~__get_cpu_var(irq_disable_mask)); __get_cpu_var(irq_depth)--; /* * Track time spent against the current process again and * process any softirqs if they are waiting. */ irq_exit(); set_irq_regs(old_regs); }
static void vtss_dsa_on_each_cpu_fini(void* ctx) { if (hardcfg.family == 0x06 || hardcfg.family == 0x0f) { wrmsrl(DS_AREA_MSR, __get_cpu_var(vtss_dsa_cpu_msr)); } }
/* * Debug exception handlers. */ static int breakpoint_handler(unsigned long unused, unsigned int esr, struct pt_regs *regs) { int i, step = 0, *kernel_step; u32 ctrl_reg; u64 addr, val; struct perf_event *bp, **slots; struct debug_info *debug_info; struct arch_hw_breakpoint_ctrl ctrl; slots = (struct perf_event **)__get_cpu_var(bp_on_reg); addr = instruction_pointer(regs); debug_info = ¤t->thread.debug; for (i = 0; i < core_num_brps; ++i) { rcu_read_lock(); bp = slots[i]; if (bp == NULL) goto unlock; /* Check if the breakpoint value matches. */ val = read_wb_reg(AARCH64_DBG_REG_BVR, i); if (val != (addr & ~0x3)) goto unlock; /* Possible match, check the byte address select to confirm. */ ctrl_reg = read_wb_reg(AARCH64_DBG_REG_BCR, i); decode_ctrl_reg(ctrl_reg, &ctrl); if (!((1 << (addr & 0x3)) & ctrl.len)) goto unlock; counter_arch_bp(bp)->trigger = addr; perf_bp_event(bp, regs); /* Do we need to handle the stepping? */ if (!bp->overflow_handler) step = 1; unlock: rcu_read_unlock(); } if (!step) return 0; if (user_mode(regs)) { debug_info->bps_disabled = 1; toggle_bp_registers(AARCH64_DBG_REG_BCR, DBG_ACTIVE_EL0, 0); /* If we're already stepping a watchpoint, just return. */ if (debug_info->wps_disabled) return 0; if (test_thread_flag(TIF_SINGLESTEP)) debug_info->suspended_step = 1; else user_enable_single_step(current); } else { toggle_bp_registers(AARCH64_DBG_REG_BCR, DBG_ACTIVE_EL1, 0); kernel_step = &__get_cpu_var(stepping_kernel_bp); if (*kernel_step != ARM_KERNEL_STEP_NONE) return 0; if (kernel_active_single_step()) { *kernel_step = ARM_KERNEL_STEP_SUSPEND; } else { *kernel_step = ARM_KERNEL_STEP_ACTIVE; kernel_enable_single_step(regs); } } return 0; }
int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_ns, const enum hrtimer_mode mode, int wakeup) { struct hrtimer_clock_base *base, *new_base; unsigned long flags; int ret, leftmost; base = lock_hrtimer_base(timer, &flags); /* Remove an active timer from the queue: */ ret = remove_hrtimer(timer, base); /* Switch the timer base, if necessary: */ new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED); if (mode & HRTIMER_MODE_REL) { tim = ktime_add_safe(tim, new_base->get_time()); /* * CONFIG_TIME_LOW_RES is a temporary way for architectures * to signal that they simply return xtime in * do_gettimeoffset(). In this case we want to round up by * resolution when starting a relative timer, to avoid short * timeouts. This will go away with the GTOD framework. */ #ifdef CONFIG_TIME_LOW_RES tim = ktime_add_safe(tim, base->resolution); #endif } hrtimer_set_expires_range_ns(timer, tim, delta_ns); timer_stats_hrtimer_set_start_info(timer); leftmost = enqueue_hrtimer(timer, new_base); /* * Only allow reprogramming if the new base is on this CPU. * (it might still be on another CPU if the timer was pending) * * XXX send_remote_softirq() ? */ if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases) && hrtimer_enqueue_reprogram(timer, new_base)) { if (wakeup) { /* * We need to drop cpu_base->lock to avoid a * lock ordering issue vs. rq->lock. */ raw_spin_unlock(&new_base->cpu_base->lock); raise_softirq_irqoff(HRTIMER_SOFTIRQ); local_irq_restore(flags); return ret; } else { __raise_softirq_irqoff(HRTIMER_SOFTIRQ); } } unlock_hrtimer_base(timer, &flags); return ret; }
static int watchpoint_handler(unsigned long addr, unsigned int esr, struct pt_regs *regs) { int i, step = 0, *kernel_step, access; u32 ctrl_reg; u64 val, alignment_mask; struct perf_event *wp, **slots; struct debug_info *debug_info; struct arch_hw_breakpoint *info; struct arch_hw_breakpoint_ctrl ctrl; slots = (struct perf_event **)__get_cpu_var(wp_on_reg); debug_info = ¤t->thread.debug; for (i = 0; i < core_num_wrps; ++i) { rcu_read_lock(); wp = slots[i]; if (wp == NULL) goto unlock; info = counter_arch_bp(wp); /* AArch32 watchpoints are either 4 or 8 bytes aligned. */ if (is_compat_task()) { if (info->ctrl.len == ARM_BREAKPOINT_LEN_8) alignment_mask = 0x7; else alignment_mask = 0x3; } else { alignment_mask = 0x7; } /* Check if the watchpoint value matches. */ val = read_wb_reg(AARCH64_DBG_REG_WVR, i); if (val != (addr & ~alignment_mask)) goto unlock; /* Possible match, check the byte address select to confirm. */ ctrl_reg = read_wb_reg(AARCH64_DBG_REG_WCR, i); decode_ctrl_reg(ctrl_reg, &ctrl); if (!((1 << (addr & alignment_mask)) & ctrl.len)) goto unlock; /* * Check that the access type matches. * 0 => load, otherwise => store */ access = (esr & AARCH64_ESR_ACCESS_MASK) ? HW_BREAKPOINT_W : HW_BREAKPOINT_R; if (!(access & hw_breakpoint_type(wp))) goto unlock; info->trigger = addr; perf_bp_event(wp, regs); /* Do we need to handle the stepping? */ if (!wp->overflow_handler) step = 1; unlock: rcu_read_unlock(); } if (!step) return 0; /* * We always disable EL0 watchpoints because the kernel can * cause these to fire via an unprivileged access. */ toggle_bp_registers(AARCH64_DBG_REG_WCR, DBG_ACTIVE_EL0, 0); if (user_mode(regs)) { debug_info->wps_disabled = 1; /* If we're already stepping a breakpoint, just return. */ if (debug_info->bps_disabled) return 0; if (test_thread_flag(TIF_SINGLESTEP)) debug_info->suspended_step = 1; else user_enable_single_step(current); } else { toggle_bp_registers(AARCH64_DBG_REG_WCR, DBG_ACTIVE_EL1, 0); kernel_step = &__get_cpu_var(stepping_kernel_bp); if (*kernel_step != ARM_KERNEL_STEP_NONE) return 0; if (kernel_active_single_step()) { *kernel_step = ARM_KERNEL_STEP_SUSPEND; } else { *kernel_step = ARM_KERNEL_STEP_ACTIVE; kernel_enable_single_step(regs); } } return 0; }
static inline struct sched_clock_data *this_scd(void) { return &__get_cpu_var(sched_clock_data); }
inline int op_x86_phys_to_virt(int phys) { return __get_cpu_var(switch_index) + phys; }
static void kmemcheck_access(struct pt_regs *regs, unsigned long fallback_address, enum kmemcheck_method fallback_method) { const uint8_t *insn; const uint8_t *insn_primary; unsigned int size; struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); /* Recursive fault -- ouch. */ if (data->busy) { kmemcheck_show_addr(fallback_address); kmemcheck_error_save_bug(regs); return; } data->busy = true; insn = (const uint8_t *) regs->ip; insn_primary = kmemcheck_opcode_get_primary(insn); kmemcheck_opcode_decode(insn, &size); switch (insn_primary[0]) { #ifdef CONFIG_KMEMCHECK_BITOPS_OK /* AND, OR, XOR */ /* * Unfortunately, these instructions have to be excluded from * our regular checking since they access only some (and not * all) bits. This clears out "bogus" bitfield-access warnings. */ case 0x80: case 0x81: case 0x82: case 0x83: switch ((insn_primary[1] >> 3) & 7) { /* OR */ case 1: /* AND */ case 4: /* XOR */ case 6: kmemcheck_write(regs, fallback_address, size); goto out; /* ADD */ case 0: /* ADC */ case 2: /* SBB */ case 3: /* SUB */ case 5: /* CMP */ case 7: break; } break; #endif /* MOVS, MOVSB, MOVSW, MOVSD */ case 0xa4: case 0xa5: /* * These instructions are special because they take two * addresses, but we only get one page fault. */ kmemcheck_copy(regs, regs->si, regs->di, size); goto out; /* CMPS, CMPSB, CMPSW, CMPSD */ case 0xa6: case 0xa7: kmemcheck_read(regs, regs->si, size); kmemcheck_read(regs, regs->di, size); goto out; } /* * If the opcode isn't special in any way, we use the data from the * page fault handler to determine the address and type of memory * access. */ switch (fallback_method) { case KMEMCHECK_READ: kmemcheck_read(regs, fallback_address, size); goto out; case KMEMCHECK_WRITE: kmemcheck_write(regs, fallback_address, size); goto out; } out: data->busy = false; }
/* * High resolution timer interrupt * Called with interrupts disabled */ void hrtimer_interrupt(struct clock_event_device *dev) { struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); struct hrtimer_clock_base *base; ktime_t expires_next, now; int i, raise = 0; BUG_ON(!cpu_base->hres_active); cpu_base->nr_events++; dev->next_event.tv64 = KTIME_MAX; retry: now = ktime_get(); expires_next.tv64 = KTIME_MAX; base = cpu_base->clock_base; for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { ktime_t basenow; struct rb_node *node; spin_lock(&cpu_base->lock); basenow = ktime_add(now, base->offset); while ((node = base->first)) { struct hrtimer *timer; timer = rb_entry(node, struct hrtimer, node); if (basenow.tv64 < timer->expires.tv64) { ktime_t expires; expires = ktime_sub(timer->expires, base->offset); if (expires.tv64 < expires_next.tv64) expires_next = expires; break; } /* Move softirq callbacks to the pending list */ if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) { __remove_hrtimer(timer, base, HRTIMER_STATE_PENDING, 0); list_add_tail(&timer->cb_entry, &base->cpu_base->cb_pending); raise = 1; continue; } __run_hrtimer(timer); } spin_unlock(&cpu_base->lock); base++; } cpu_base->expires_next = expires_next; /* Reprogramming necessary ? */ if (expires_next.tv64 != KTIME_MAX) { if (tick_program_event(expires_next, 0)) goto retry; } /* Raise softirq ? */ if (raise) raise_softirq(HRTIMER_SOFTIRQ); }
static void nmi_cpu_stop(void *dummy) { struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs); model->stop(msrs); }
static void run_hrtimer_softirq(struct softirq_action *h) { run_hrtimer_pending(&__get_cpu_var(hrtimer_bases)); }
/* * This creates a new process as a copy of the old one, * but does not actually start it yet. * * It copies the registers, and all the appropriate * parts of the process environment (as per the clone * flags). The actual kick-off is left to the caller. */ static struct task_struct *copy_process(unsigned long clone_flags, unsigned long stack_start, struct pt_regs *regs, unsigned long stack_size, int __user *child_tidptr, struct pid *pid, int trace) { int retval; struct task_struct *p; int cgroup_callbacks_done = 0; if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) return ERR_PTR(-EINVAL); /* * Thread groups must share signals as well, and detached threads * can only be started up within the thread group. */ if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND)) return ERR_PTR(-EINVAL); /* * Shared signal handlers imply shared VM. By way of the above, * thread groups also imply shared VM. Blocking this case allows * for various simplifications in other code. */ if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM)) return ERR_PTR(-EINVAL); retval = security_task_create(clone_flags); if (retval) goto fork_out; retval = -ENOMEM; p = dup_task_struct(current); if (!p) goto fork_out; rt_mutex_init_task(p); #ifdef CONFIG_PROVE_LOCKING DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif retval = -EAGAIN; if (atomic_read(&p->user->processes) >= p->signal->rlim[RLIMIT_NPROC].rlim_cur) { if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && p->user != current->nsproxy->user_ns->root_user) goto bad_fork_free; } atomic_inc(&p->user->__count); atomic_inc(&p->user->processes); get_group_info(p->group_info); /* * If multiple threads are within copy_process(), then this check * triggers too late. This doesn't hurt, the check is only there * to stop root fork bombs. */ if (nr_threads >= max_threads) goto bad_fork_cleanup_count; if (!try_module_get(task_thread_info(p)->exec_domain->module)) goto bad_fork_cleanup_count; if (p->binfmt && !try_module_get(p->binfmt->module)) goto bad_fork_cleanup_put_domain; p->did_exec = 0; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ copy_flags(clone_flags, p); INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); #ifdef CONFIG_PREEMPT_RCU p->rcu_read_lock_nesting = 0; p->rcu_flipctr_idx = 0; #endif /* #ifdef CONFIG_PREEMPT_RCU */ p->vfork_done = NULL; spin_lock_init(&p->alloc_lock); clear_tsk_thread_flag(p, TIF_SIGPENDING); init_sigpending(&p->pending); p->utime = cputime_zero; p->stime = cputime_zero; p->gtime = cputime_zero; p->utimescaled = cputime_zero; p->stimescaled = cputime_zero; p->prev_utime = cputime_zero; p->prev_stime = cputime_zero; #ifdef CONFIG_DETECT_SOFTLOCKUP p->last_switch_count = 0; p->last_switch_timestamp = 0; #endif task_io_accounting_init(&p->ioac); acct_clear_integrals(p); p->it_virt_expires = cputime_zero; p->it_prof_expires = cputime_zero; p->it_sched_expires = 0; INIT_LIST_HEAD(&p->cpu_timers[0]); INIT_LIST_HEAD(&p->cpu_timers[1]); INIT_LIST_HEAD(&p->cpu_timers[2]); p->lock_depth = -1; /* -1 = no lock */ do_posix_clock_monotonic_gettime(&p->start_time); p->real_start_time = p->start_time; monotonic_to_bootbased(&p->real_start_time); #ifdef CONFIG_SECURITY p->security = NULL; #endif p->cap_bset = current->cap_bset; p->io_context = NULL; p->audit_context = NULL; cgroup_fork(p); #ifdef CONFIG_NUMA p->mempolicy = mpol_dup(p->mempolicy); if (IS_ERR(p->mempolicy)) { retval = PTR_ERR(p->mempolicy); p->mempolicy = NULL; goto bad_fork_cleanup_cgroup; } mpol_fix_fork_child_flag(p); #endif #ifdef CONFIG_TRACE_IRQFLAGS p->irq_events = 0; #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW p->hardirqs_enabled = 1; #else p->hardirqs_enabled = 0; #endif p->hardirq_enable_ip = 0; p->hardirq_enable_event = 0; p->hardirq_disable_ip = _THIS_IP_; p->hardirq_disable_event = 0; p->softirqs_enabled = 1; p->softirq_enable_ip = _THIS_IP_; p->softirq_enable_event = 0; p->softirq_disable_ip = 0; p->softirq_disable_event = 0; p->hardirq_context = 0; p->softirq_context = 0; #endif #ifdef CONFIG_LOCKDEP p->lockdep_depth = 0; /* no locks held yet */ p->curr_chain_key = 0; p->lockdep_recursion = 0; #endif #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif /* Perform scheduler related setup. Assign this task to a CPU. */ sched_fork(p, clone_flags); if ((retval = security_task_alloc(p))) goto bad_fork_cleanup_policy; if ((retval = audit_alloc(p))) goto bad_fork_cleanup_security; /* copy all the process information */ if ((retval = copy_semundo(clone_flags, p))) goto bad_fork_cleanup_audit; if ((retval = copy_files(clone_flags, p))) goto bad_fork_cleanup_semundo; if ((retval = copy_fs(clone_flags, p))) goto bad_fork_cleanup_files; if ((retval = copy_sighand(clone_flags, p))) goto bad_fork_cleanup_fs; if ((retval = copy_signal(clone_flags, p))) goto bad_fork_cleanup_sighand; if ((retval = copy_mm(clone_flags, p))) goto bad_fork_cleanup_signal; if ((retval = copy_keys(clone_flags, p))) goto bad_fork_cleanup_mm; if ((retval = copy_namespaces(clone_flags, p))) goto bad_fork_cleanup_keys; if ((retval = copy_io(clone_flags, p))) goto bad_fork_cleanup_namespaces; retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); if (retval) goto bad_fork_cleanup_io; if (pid != &init_struct_pid) { retval = -ENOMEM; pid = alloc_pid(task_active_pid_ns(p)); if (!pid) goto bad_fork_cleanup_io; if (clone_flags & CLONE_NEWPID) { retval = pid_ns_prepare_proc(task_active_pid_ns(p)); if (retval < 0) goto bad_fork_free_pid; } } p->pid = pid_nr(pid); p->tgid = p->pid; if (clone_flags & CLONE_THREAD) p->tgid = current->tgid; if (current->nsproxy != p->nsproxy) { retval = ns_cgroup_clone(p, pid); if (retval) goto bad_fork_free_pid; } p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; /* * Clear TID on mm_release()? */ p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; #ifdef CONFIG_FUTEX p->robust_list = NULL; #ifdef CONFIG_COMPAT p->compat_robust_list = NULL; #endif INIT_LIST_HEAD(&p->pi_state_list); p->pi_state_cache = NULL; #endif /* * sigaltstack should be cleared when sharing the same VM */ if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM) p->sas_ss_sp = p->sas_ss_size = 0; /* * Syscall tracing should be turned off in the child regardless * of CLONE_PTRACE. */ clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE); #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); #endif clear_all_latency_tracing(p); /* ok, now we should be set up.. */ p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL); p->pdeath_signal = 0; p->exit_state = 0; /* * Ok, make it visible to the rest of the system. * We dont wake it up yet. */ p->group_leader = p; INIT_LIST_HEAD(&p->thread_group); /* Now that the task is set up, run cgroup callbacks if * necessary. We need to run them before the task is visible * on the tasklist. */ cgroup_fork_callbacks(p); cgroup_callbacks_done = 1; /* Need tasklist lock for parent etc handling! */ write_lock_irq(&tasklist_lock); /* * The task hasn't been attached yet, so its cpus_allowed mask will * not be changed, nor will its assigned CPU. * * The cpus_allowed mask of the parent may have changed after it was * copied first time - so re-copy it here, then check the child's CPU * to ensure it is on a valid CPU (and if not, just force it back to * parent's CPU). This avoids alot of nasty races. */ p->cpus_allowed = current->cpus_allowed; p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed; if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) || !cpu_online(task_cpu(p)))) set_task_cpu(p, smp_processor_id()); /* CLONE_PARENT re-uses the old parent */ if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { p->real_parent = current->real_parent; p->parent_exec_id = current->parent_exec_id; } else { p->real_parent = current; p->parent_exec_id = current->self_exec_id; } spin_lock(¤t->sighand->siglock); /* * Process group and session signals need to be delivered to just the * parent before the fork or both the parent and the child after the * fork. Restart if a signal comes in before we add the new process to * it's process group. * A fatal signal pending means that current will exit, so the new * thread can't slip out of an OOM kill (or normal SIGKILL). */ recalc_sigpending(); if (signal_pending(current)) { spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); retval = -ERESTARTNOINTR; goto bad_fork_free_pid; } if (clone_flags & CLONE_THREAD) { atomic_inc(¤t->signal->count); atomic_inc(¤t->signal->live); p->group_leader = current->group_leader; list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); if (!cputime_eq(current->signal->it_virt_expires, cputime_zero) || !cputime_eq(current->signal->it_prof_expires, cputime_zero) || current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY || !list_empty(¤t->signal->cpu_timers[0]) || !list_empty(¤t->signal->cpu_timers[1]) || !list_empty(¤t->signal->cpu_timers[2])) { /* * Have child wake up on its first tick to check * for process CPU timers. */ p->it_prof_expires = jiffies_to_cputime(1); } } if (likely(p->pid)) { list_add_tail(&p->sibling, &p->real_parent->children); tracehook_finish_clone(p, clone_flags, trace); if (thread_group_leader(p)) { if (clone_flags & CLONE_NEWPID) p->nsproxy->pid_ns->child_reaper = p; p->signal->leader_pid = pid; p->signal->tty = current->signal->tty; set_task_pgrp(p, task_pgrp_nr(current)); set_task_session(p, task_session_nr(current)); attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); attach_pid(p, PIDTYPE_SID, task_session(current)); list_add_tail_rcu(&p->tasks, &init_task.tasks); __get_cpu_var(process_counts)++; } attach_pid(p, PIDTYPE_PID, pid); nr_threads++; } total_forks++; spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); proc_fork_connector(p); cgroup_post_fork(p); return p; bad_fork_free_pid: if (pid != &init_struct_pid) free_pid(pid); bad_fork_cleanup_io: put_io_context(p->io_context); bad_fork_cleanup_namespaces: exit_task_namespaces(p); bad_fork_cleanup_keys: exit_keys(p); bad_fork_cleanup_mm: if (p->mm) mmput(p->mm); bad_fork_cleanup_signal: if (!(clone_flags & CLONE_THREAD)) __cleanup_signal(p->signal); bad_fork_cleanup_sighand: __cleanup_sighand(p->sighand); bad_fork_cleanup_fs: exit_fs(p); /* blocking */ bad_fork_cleanup_files: exit_files(p); /* blocking */ bad_fork_cleanup_semundo: exit_sem(p); bad_fork_cleanup_audit: audit_free(p); bad_fork_cleanup_security: security_task_free(p); bad_fork_cleanup_policy: #ifdef CONFIG_NUMA mpol_put(p->mempolicy); bad_fork_cleanup_cgroup: #endif cgroup_exit(p, cgroup_callbacks_done); delayacct_tsk_free(p); if (p->binfmt) module_put(p->binfmt->module); bad_fork_cleanup_put_domain: module_put(task_thread_info(p)->exec_domain->module); bad_fork_cleanup_count: put_group_info(p->group_info); atomic_dec(&p->user->processes); free_uid(p->user); bad_fork_free: free_task(p); fork_out: return ERR_PTR(retval); }
/* * Is the high resolution mode active ? */ static inline int hrtimer_hres_active(void) { return __get_cpu_var(hrtimer_bases).hres_active; }
/* * This creates a new process as a copy of the old one, * but does not actually start it yet. * * It copies the registers, and all the appropriate * parts of the process environment (as per the clone * flags). The actual kick-off is left to the caller. */ static struct task_struct *copy_process(unsigned long clone_flags, unsigned long stack_start, struct pt_regs *regs, unsigned long stack_size, int __user *child_tidptr, struct pid *pid, int trace) { int retval; struct task_struct *p; int cgroup_callbacks_done = 0; if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) return ERR_PTR(-EINVAL); /* * Thread groups must share signals as well, and detached threads * can only be started up within the thread group. */ if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND)) return ERR_PTR(-EINVAL); /* * Shared signal handlers imply shared VM. By way of the above, * thread groups also imply shared VM. Blocking this case allows * for various simplifications in other code. */ if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM)) return ERR_PTR(-EINVAL); /* * Siblings of global init remain as zombies on exit since they are * not reaped by their parent (swapper). To solve this and to avoid * multi-rooted process trees, prevent global and container-inits * from creating siblings. */ if ((clone_flags & CLONE_PARENT) && current->signal->flags & SIGNAL_UNKILLABLE) return ERR_PTR(-EINVAL); retval = security_task_create(clone_flags); if (retval) goto fork_out; retval = -ENOMEM; p = dup_task_struct(current); if (!p) goto fork_out; ftrace_graph_init_task(p); rt_mutex_init_task(p); #ifdef CONFIG_PROVE_LOCKING DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif retval = -EAGAIN; if (atomic_read(&p->real_cred->user->processes) >= task_rlimit(p, RLIMIT_NPROC)) { if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && p->real_cred->user != INIT_USER) goto bad_fork_free; } retval = copy_creds(p, clone_flags); if (retval < 0) goto bad_fork_free; /* * If multiple threads are within copy_process(), then this check * triggers too late. This doesn't hurt, the check is only there * to stop root fork bombs. */ retval = -EAGAIN; if (nr_threads >= max_threads) goto bad_fork_cleanup_count; if (!try_module_get(task_thread_info(p)->exec_domain->module)) goto bad_fork_cleanup_count; p->did_exec = 0; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ copy_flags(clone_flags, p); INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); rcu_copy_process(p); p->vfork_done = NULL; spin_lock_init(&p->alloc_lock); init_sigpending(&p->pending); p->utime = cputime_zero; p->stime = cputime_zero; p->gtime = cputime_zero; p->utimescaled = cputime_zero; p->stimescaled = cputime_zero; #ifndef CONFIG_VIRT_CPU_ACCOUNTING p->prev_utime = cputime_zero; p->prev_stime = cputime_zero; #endif #if defined(SPLIT_RSS_COUNTING) memset(&p->rss_stat, 0, sizeof(p->rss_stat)); #endif p->default_timer_slack_ns = current->timer_slack_ns; task_io_accounting_init(&p->ioac); acct_clear_integrals(p); posix_cpu_timers_init(p); p->lock_depth = -1; /* -1 = no lock */ do_posix_clock_monotonic_gettime(&p->start_time); p->real_start_time = p->start_time; monotonic_to_bootbased(&p->real_start_time); p->io_context = NULL; p->audit_context = NULL; cgroup_fork(p); #ifdef CONFIG_NUMA p->mempolicy = mpol_dup(p->mempolicy); if (IS_ERR(p->mempolicy)) { retval = PTR_ERR(p->mempolicy); p->mempolicy = NULL; goto bad_fork_cleanup_cgroup; } mpol_fix_fork_child_flag(p); #endif #ifdef CONFIG_TRACE_IRQFLAGS p->irq_events = 0; #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW p->hardirqs_enabled = 1; #else p->hardirqs_enabled = 0; #endif p->hardirq_enable_ip = 0; p->hardirq_enable_event = 0; p->hardirq_disable_ip = _THIS_IP_; p->hardirq_disable_event = 0; p->softirqs_enabled = 1; p->softirq_enable_ip = _THIS_IP_; p->softirq_enable_event = 0; p->softirq_disable_ip = 0; p->softirq_disable_event = 0; p->hardirq_context = 0; p->softirq_context = 0; #endif #ifdef CONFIG_LOCKDEP p->lockdep_depth = 0; /* no locks held yet */ p->curr_chain_key = 0; p->lockdep_recursion = 0; #endif #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif #ifdef CONFIG_CGROUP_MEM_RES_CTLR p->memcg_batch.do_batch = 0; p->memcg_batch.memcg = NULL; #endif /* Perform scheduler related setup. Assign this task to a CPU. */ sched_fork(p, clone_flags); retval = perf_event_init_task(p); if (retval) goto bad_fork_cleanup_policy; if ((retval = audit_alloc(p))) goto bad_fork_cleanup_policy; /* copy all the process information */ if ((retval = copy_semundo(clone_flags, p))) goto bad_fork_cleanup_audit; if ((retval = copy_files(clone_flags, p))) goto bad_fork_cleanup_semundo; if ((retval = copy_fs(clone_flags, p))) goto bad_fork_cleanup_files; if ((retval = copy_sighand(clone_flags, p))) goto bad_fork_cleanup_fs; if ((retval = copy_signal(clone_flags, p))) goto bad_fork_cleanup_sighand; if ((retval = copy_mm(clone_flags, p))) goto bad_fork_cleanup_signal; if ((retval = copy_namespaces(clone_flags, p))) goto bad_fork_cleanup_mm; if ((retval = copy_io(clone_flags, p))) goto bad_fork_cleanup_namespaces; retval = copy_thread(clone_flags, stack_start, stack_size, p, regs); if (retval) goto bad_fork_cleanup_io; if (pid != &init_struct_pid) { retval = -ENOMEM; pid = alloc_pid(p->nsproxy->pid_ns); if (!pid) goto bad_fork_cleanup_io; if (clone_flags & CLONE_NEWPID) { retval = pid_ns_prepare_proc(p->nsproxy->pid_ns); if (retval < 0) goto bad_fork_free_pid; } } p->pid = pid_nr(pid); p->tgid = p->pid; if (clone_flags & CLONE_THREAD) p->tgid = current->tgid; if (current->nsproxy != p->nsproxy) { retval = ns_cgroup_clone(p, pid); if (retval) goto bad_fork_free_pid; } p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; /* * Clear TID on mm_release()? */ p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; #ifdef CONFIG_FUTEX p->robust_list = NULL; #ifdef CONFIG_COMPAT p->compat_robust_list = NULL; #endif INIT_LIST_HEAD(&p->pi_state_list); p->pi_state_cache = NULL; #endif /* * sigaltstack should be cleared when sharing the same VM */ if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM) p->sas_ss_sp = p->sas_ss_size = 0; /* * Syscall tracing and stepping should be turned off in the * child regardless of CLONE_PTRACE. */ user_disable_single_step(p); clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE); #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); #endif clear_all_latency_tracing(p); /* ok, now we should be set up.. */ p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL); p->pdeath_signal = 0; p->exit_state = 0; /* * Ok, make it visible to the rest of the system. * We dont wake it up yet. */ p->group_leader = p; INIT_LIST_HEAD(&p->thread_group); /* Now that the task is set up, run cgroup callbacks if * necessary. We need to run them before the task is visible * on the tasklist. */ cgroup_fork_callbacks(p); cgroup_callbacks_done = 1; /* Need tasklist lock for parent etc handling! */ write_lock_irq(&tasklist_lock); /* CLONE_PARENT re-uses the old parent */ if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { p->real_parent = current->real_parent; p->parent_exec_id = current->parent_exec_id; } else { p->real_parent = current; p->parent_exec_id = current->self_exec_id; } spin_lock(¤t->sighand->siglock); /* * Process group and session signals need to be delivered to just the * parent before the fork or both the parent and the child after the * fork. Restart if a signal comes in before we add the new process to * it's process group. * A fatal signal pending means that current will exit, so the new * thread can't slip out of an OOM kill (or normal SIGKILL). */ recalc_sigpending(); if (signal_pending(current)) { spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); retval = -ERESTARTNOINTR; goto bad_fork_free_pid; } if (clone_flags & CLONE_THREAD) { current->signal->nr_threads++; atomic_inc(¤t->signal->live); atomic_inc(¤t->signal->sigcnt); p->group_leader = current->group_leader; list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); } if (likely(p->pid)) { tracehook_finish_clone(p, clone_flags, trace); if (thread_group_leader(p)) { if (clone_flags & CLONE_NEWPID) p->nsproxy->pid_ns->child_reaper = p; p->signal->leader_pid = pid; p->signal->tty = tty_kref_get(current->signal->tty); attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); attach_pid(p, PIDTYPE_SID, task_session(current)); list_add_tail(&p->sibling, &p->real_parent->children); list_add_tail_rcu(&p->tasks, &init_task.tasks); __get_cpu_var(process_counts)++; } attach_pid(p, PIDTYPE_PID, pid); nr_threads++; } total_forks++; spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); proc_fork_connector(p); cgroup_post_fork(p); perf_event_fork(p); return p; bad_fork_free_pid: if (pid != &init_struct_pid) free_pid(pid); bad_fork_cleanup_io: if (p->io_context) exit_io_context(p); bad_fork_cleanup_namespaces: exit_task_namespaces(p); bad_fork_cleanup_mm: if (p->mm) { task_lock(p); if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) atomic_dec(&p->mm->oom_disable_count); task_unlock(p); mmput(p->mm); } bad_fork_cleanup_signal: if (!(clone_flags & CLONE_THREAD)) free_signal_struct(p->signal); bad_fork_cleanup_sighand: __cleanup_sighand(p->sighand); bad_fork_cleanup_fs: exit_fs(p); /* blocking */ bad_fork_cleanup_files: exit_files(p); /* blocking */ bad_fork_cleanup_semundo: exit_sem(p); bad_fork_cleanup_audit: audit_free(p); bad_fork_cleanup_policy: perf_event_free_task(p); #ifdef CONFIG_NUMA mpol_put(p->mempolicy); bad_fork_cleanup_cgroup: #endif cgroup_exit(p, cgroup_callbacks_done); delayacct_tsk_free(p); module_put(task_thread_info(p)->exec_domain->module); bad_fork_cleanup_count: atomic_dec(&p->cred->user->processes); exit_creds(p); bad_fork_free: free_task(p); fork_out: return ERR_PTR(retval); }
void xen_copy_trap_info(struct trap_info *traps) { const struct desc_ptr *desc = &__get_cpu_var(idt_desc); xen_convert_trap_info(desc, traps); }
/** * tick_nohz_get_sleep_length - return the length of the current sleep * * Called from power state control code with interrupts disabled */ ktime_t tick_nohz_get_sleep_length(void) { struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; return ktime_sub(dev->next_event, ts->idle_entrytime); }
/* Kick device. Returns: 0 - queue is empty or throttled. >0 - queue is not empty. NOTE: Called under dev->queue_lock with locally disabled BH. */ static inline int qdisc_restart(struct net_device *dev) { struct Qdisc *q = dev->qdisc; struct sk_buff *skb; /* Dequeue packet */ if (((skb = dev->gso_skb)) || ((skb = q->dequeue(q)))) { unsigned nolock = (dev->features & NETIF_F_LLTX); dev->gso_skb = NULL; /* * When the driver has LLTX set it does its own locking * in start_xmit. No need to add additional overhead by * locking again. These checks are worth it because * even uncongested locks can be quite expensive. * The driver can do trylock like here too, in case * of lock congestion it should return -1 and the packet * will be requeued. */ if (!nolock) { if (!netif_tx_trylock(dev)) { collision: /* So, someone grabbed the driver. */ /* It may be transient configuration error, when hard_start_xmit() recurses. We detect it by checking xmit owner and drop the packet when deadloop is detected. */ if (dev->xmit_lock_owner == smp_processor_id()) { kfree_skb(skb); if (net_ratelimit()) printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name); goto out; } __get_cpu_var(netdev_rx_stat).cpu_collision++; goto requeue; } } { /* And release queue */ spin_unlock(&dev->queue_lock); if (!netif_queue_stopped(dev)) { int ret; ret = dev_hard_start_xmit(skb, dev); if (ret == NETDEV_TX_OK) { if (!nolock) { netif_tx_unlock(dev); } spin_lock(&dev->queue_lock); q = dev->qdisc; goto out; } if (ret == NETDEV_TX_LOCKED && nolock) { spin_lock(&dev->queue_lock); q = dev->qdisc; goto collision; } } /* NETDEV_TX_BUSY - we need to requeue */ /* Release the driver */ if (!nolock) { netif_tx_unlock(dev); } spin_lock(&dev->queue_lock); q = dev->qdisc; } /* Device kicked us out :( This is possible in three cases: 0. driver is locked 1. fastroute is enabled 2. device cannot determine busy state before start of transmission (f.e. dialout) 3. device is buggy (ppp) */ requeue: if (unlikely(q == &noop_qdisc)) kfree_skb(skb); else if (skb->next) dev->gso_skb = skb; else q->ops->requeue(skb, q); netif_schedule(dev); } return 0; out: BUG_ON((int) q->q.qlen < 0); return q->q.qlen; }