/*
 * the idle thread
 * - there's no useful work to be done, so just try to conserve power and have
 *   a low exit latency (ie sit in a loop waiting for somebody to say that
 *   they'd like to reschedule)
 */
void cpu_idle(void)
{
	int cpu = smp_processor_id();

	/* endless idle loop with no priority at all */
	for (;;) {
		while (!need_resched()) {
			void (*idle)(void);

			smp_rmb();
			idle = pm_idle;

			if (!idle)
				idle = default_idle;

			irq_stat[cpu].idle_timestamp = jiffies;
			idle();
		}
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}
/*
 * The idle thread. We try to conserve power, while trying to keep
 * overall latency low. The architecture specific idle is passed
 * a value to indicate the level of "idleness" of the system.
 */
void cpu_idle(void)
{
	/* endless idle loop with no priority at all */
	while (1) {
		void (*idle)(void) = pm_idle;

#ifdef CONFIG_HOTPLUG_CPU
		if (cpu_is_offline(smp_processor_id()))
			cpu_die();
#endif
		if (!idle)
			idle = default_idle;

		tick_nohz_idle_enter();
		rcu_idle_enter();
		while (!need_resched())
			idle();
		rcu_idle_exit();
		tick_nohz_idle_exit();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}
int parport_poll_peripheral(struct parport *port,
			    unsigned char mask,
			    unsigned char result,
			    int usec)
{
	/* Zero return code is success, >0 is timeout. */
	int count = usec / 5 + 2;
	int i;
	unsigned char status;

	for (i = 0; i < count; i++) {
		status = parport_read_status(port);
		if ((status & mask) == result)
			return 0;
		if (signal_pending(current))
			return -EINTR;
		if (need_resched())
			break;
		if (i >= 2)
			udelay(5);
	}

	return 1;
}
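A minimal sketch of how a caller might use this helper; the wrapper name wait_for_status() and the choice of mask and timeout are illustrative only and not taken from the driver above. PARPORT_STATUS_BUSY is used here simply as an example status bit.

/*
 * Sketch only: busy-wait up to ~40us for a status bit pattern to appear,
 * then let the caller decide whether to sleep and retry.
 * Returns 0 on success, 1 on timeout, -EINTR if a signal arrived.
 */
static int wait_for_status(struct parport *port)
{
	return parport_poll_peripheral(port, PARPORT_STATUS_BUSY,
				       PARPORT_STATUS_BUSY, 40);
}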
static int snooze_loop(struct cpuidle_device *dev,
			struct cpuidle_driver *drv,
			int index)
{
	unsigned long in_purr;

	idle_loop_prolog(&in_purr);
	local_irq_enable();
	set_thread_flag(TIF_POLLING_NRFLAG);

	while (!need_resched()) {
		HMT_low();
		HMT_very_low();
	}

	HMT_medium();
	clear_thread_flag(TIF_POLLING_NRFLAG);
	smp_mb();

	idle_loop_epilog(in_purr);

	return index;
}
static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
{
	struct journal_head *last_jh;
	struct journal_head *next_jh = jh;
	int ret, freed = 0;

	*released = 0;
	if (!jh)
		return 0;

	last_jh = jh->b_cpprev;
	do {
		jh = next_jh;
		next_jh = jh->b_cpnext;
		/* Use trylock because of the ranking */
		if (jbd_trylock_bh_state(jh2bh(jh))) {
			ret = __try_to_free_cp_buf(jh);
			if (ret) {
				freed++;
				if (ret == 2) {
					*released = 1;
					return freed;
				}
			}
		}
		/*
		 * This function only frees up some memory
		 * if possible so we don't have an obligation
		 * to finish processing. Bail out if preemption
		 * is requested:
		 */
		if (need_resched())
			return freed;
	} while (jh != last_jh);

	return freed;
}
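The comment above describes a general pattern: a best-effort scan may simply return early when need_resched() is set and let the caller retry later. The sketch below illustrates that pattern with a hypothetical item type and a hypothetical item_try_free() helper; it is not part of the journalling code.

/*
 * Illustrative only: bail out of a best-effort scan when a reschedule
 * is pending.  struct item and item_try_free() are hypothetical.
 */
static int scan_items(struct list_head *head)
{
	struct item *it, *tmp;
	int freed = 0;

	list_for_each_entry_safe(it, tmp, head, list) {
		if (item_try_free(it))		/* hypothetical helper */
			freed++;
		if (need_resched())
			break;			/* no obligation to finish */
	}
	return freed;
}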
static unsigned long msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
				unsigned long addr, unsigned long end)
{
	pte_t *pte;
	spinlock_t *ptl;
	int progress = 0;
	unsigned long ret = 0;

again:
	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	do {
		struct page *page;

		if (progress >= 64) {
			progress = 0;
			if (need_resched() || need_lockbreak(ptl))
				break;
		}
		progress++;
		if (!pte_present(*pte))
			continue;
		if (!pte_maybe_dirty(*pte))
			continue;
		page = vm_normal_page(vma, addr, *pte);
		if (!page)
			continue;
		if (ptep_clear_flush_dirty(vma, addr, pte) ||
				page_test_and_clear_dirty(page))
			ret += set_page_dirty(page);
		progress += 3;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();
	if (addr != end)
		goto again;
	return ret;
}
static int snooze_loop(struct cpuidle_device *dev,
			struct cpuidle_driver *drv,
			int index)
{
	u64 snooze_exit_time;

	local_irq_enable();
	set_thread_flag(TIF_POLLING_NRFLAG);

	snooze_exit_time = get_tb() + snooze_timeout;
	ppc64_runlatch_off();

	while (!need_resched()) {
		HMT_low();
		HMT_very_low();
		if (snooze_timeout_en && get_tb() > snooze_exit_time)
			break;
	}

	HMT_medium();
	ppc64_runlatch_on();
	clear_thread_flag(TIF_POLLING_NRFLAG);
	smp_mb();

	return index;
}
void sc8825_idle(void)
{
	int this_cpu = smp_processor_id();
	int val;

	if (!need_resched()) {
		hw_local_irq_disable();
		if (!arch_local_irq_pending()) {
			val = os_ctx->idle(os_ctx);
			if (0 == val) {
#ifdef CONFIG_CACHE_L2X0
				/* l2cache power control, standby mode enable */
				/*
				 * L2X0_POWER_CTRL
				 * __raw_writel(1, SPRD_L2_BASE + 0xF80);
				 * l2x0_suspend();
				 */
#endif
#if defined(CONFIG_LOCAL_TIMERS)
				clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER,
						   &this_cpu);
#endif
				cpu_do_idle();
#if defined(CONFIG_LOCAL_TIMERS)
				clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT,
						   &this_cpu);
#endif
#ifdef CONFIG_CACHE_L2X0
				/* l2x0_resume(1); */
#endif
			}
		}
		hw_local_irq_enable();
	}
	local_irq_enable();
	return;
}
/*
 * The idle thread. There's no useful work to be done, so just try to conserve
 * power and have a low exit latency (ie sit in a loop waiting for somebody to
 * say that they'd like to reschedule)
 */
void cpu_idle(void)
{
	unsigned int cpu = smp_processor_id();

	set_thread_flag(TIF_POLLING_NRFLAG);

	/* endless idle loop with no priority at all */
	while (1) {
		tick_nohz_idle_enter();
		rcu_idle_enter();

		while (!need_resched()) {
			check_pgt_cache();
			rmb();

			if (cpu_is_offline(cpu))
				play_dead();

			local_irq_disable();
			/* Don't trace irqs off for idle */
			stop_critical_timings();
			if (cpuidle_idle_call())
				sh_idle();
			/*
			 * Sanity check to ensure that sh_idle() returns
			 * with IRQs enabled
			 */
			WARN_ON(irqs_disabled());
			start_critical_timings();
		}

		rcu_idle_exit();
		tick_nohz_idle_exit();
		schedule_preempt_disabled();
	}
}
void cpu_idle(void)
{
	set_thread_flag(TIF_POLLING_NRFLAG);

	/* endless idle loop with no priority at all */
	while (1) {
		void (*idle)(void) = pm_idle;

		if (!idle)
			idle = default_idle;

		tick_nohz_idle_enter();
		rcu_idle_enter();
		while (!need_resched())
			idle();
		rcu_idle_exit();
		tick_nohz_idle_exit();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
		check_pgt_cache();
	}
}
/*
 * the idle thread
 * - there's no useful work to be done, so just try to conserve power and have
 *   a low exit latency (ie sit in a loop waiting for somebody to say that
 *   they'd like to reschedule)
 */
void cpu_idle(void)
{
	/* endless idle loop with no priority at all */
	for (;;) {
		while (!need_resched()) {
			void (*idle)(void);

			smp_rmb();
			idle = pm_idle;
			if (!idle) {
#if defined(CONFIG_SMP) && !defined(CONFIG_HOTPLUG_CPU)
				idle = poll_idle;
#else  /* CONFIG_SMP && !CONFIG_HOTPLUG_CPU */
				idle = default_idle;
#endif /* CONFIG_SMP && !CONFIG_HOTPLUG_CPU */
			}
			idle();
		}

		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}
static int snooze_loop(struct cpuidle_device *dev,
			struct cpuidle_driver *drv,
			int index)
{
	unsigned long in_purr;
	ktime_t kt_before;
	unsigned long start_snooze;
	long snooze = drv->states[0].target_residency;

	idle_loop_prolog(&in_purr, &kt_before);

	if (snooze) {
		start_snooze = get_tb() + snooze * tb_ticks_per_usec;
		local_irq_enable();
		set_thread_flag(TIF_POLLING_NRFLAG);

		while ((snooze < 0) || (get_tb() < start_snooze)) {
			if (need_resched() || cpu_is_offline(dev->cpu))
				goto out;
			ppc64_runlatch_off();
			HMT_low();
			HMT_very_low();
		}

		HMT_medium();
		clear_thread_flag(TIF_POLLING_NRFLAG);
		smp_mb();
		local_irq_disable();
	}

out:
	HMT_medium();
	dev->last_residency =
		(int)idle_loop_epilog(in_purr, kt_before);
	return index;
}
/*
 * Let's power down on idle, but only if we are really
 * idle, because once we start down the path of
 * going idle we continue to do idle even if we get
 * a clock tick interrupt . .
 */
void omap_pm_idle(void)
{
	unsigned int mask32 = 0;

	/*
	 * If the DSP is being used let's just idle the CPU, the overhead
	 * to wake up from Big Sleep is big, milliseconds versus micro
	 * seconds for wait for interrupt.
	 */

	local_irq_disable();
	local_fiq_disable();
	if (need_resched()) {
		local_fiq_enable();
		local_irq_enable();
		return;
	}
	mask32 = omap_readl(ARM_SYSST);

	/*
	 * Prevent the ULPD from entering low power state by setting
	 * POWER_CTRL_REG:4 = 0
	 */
	omap_writew(omap_readw(ULPD_POWER_CTRL) &
		    ~ULPD_DEEP_SLEEP_TRANSITION_EN, ULPD_POWER_CTRL);

	/*
	 * Since an interrupt may set up a timer, we don't want to
	 * reprogram the hardware timer with interrupts enabled.
	 * Re-enable interrupts only after returning from idle.
	 */
	timer_dyn_reprogram();

	if ((mask32 & DSP_IDLE) == 0) {
		__asm__ volatile ("mcr	p15, 0, r0, c7, c0, 4");
	} else
void rk28_panic_reset(void)
{
	int at_debug = rk28_system_crash;

	printk("%s::crash=%d\n", __func__, rk28_system_crash);
	if (__system_crashed())
		return;
	rk28_system_crash |= (1 << 16); /* 20091206,HSL@RK,must set flag ! */
	if (at_debug >= RKDBG_LOADER) {
		kld_reboot(1, 1);
	} else if (at_debug >= RKDBG_CUSTOMER0) {
		//rk28_system_crash |= RKDBG_CUSTOMER1;
		__rkusb_bk_save(0);
		local_irq_enable();	/* for get log by usb */
		rk28_usb();
		while (1) {
			if (need_resched() && at_debug >= RKDBG_CUSTOMER1) {
				//debug_print("schedule after panic\n");
				schedule();
			}
		}
	}
	//rk28_write_file("misc");
	kld_reboot(1, 0);
}
void cpu_idle(void)
{
	for (;;) {
		rcu_idle_enter();
		while (!need_resched()) {
			void (*idle)(void);

			smp_rmb();
			idle = pm_idle;

			if (!idle) {
#if defined(CONFIG_SMP) && !defined(CONFIG_HOTPLUG_CPU)
				idle = poll_idle;
#else
				idle = default_idle;
#endif
			}

			idle();
		}
		rcu_idle_exit();
		schedule_preempt_disabled();
	}
}
/*
 * The idle thread. There's no useful work to be done, so just try to conserve
 * power and have a low exit latency (ie sit in a loop waiting for somebody to
 * say that they'd like to reschedule)
 */
void __noreturn cpu_idle(void)
{
	int cpu;

	/* CPU is going idle. */
	cpu = smp_processor_id();

	/* endless idle loop with no priority at all */
	while (1) {
		tick_nohz_idle_enter();
		rcu_idle_enter();
		while (!need_resched() && cpu_online(cpu)) {
#ifdef CONFIG_MIPS_MT_SMTC
			extern void smtc_idle_loop_hook(void);

			smtc_idle_loop_hook();
#endif
			if (cpu_wait) {
				/* Don't trace irqs off for idle */
				stop_critical_timings();
				(*cpu_wait)();
				start_critical_timings();
			}
		}
#ifdef CONFIG_HOTPLUG_CPU
		if (!cpu_online(cpu) && !cpu_isset(cpu, cpu_callin_map) &&
		    (system_state == SYSTEM_RUNNING ||
		     system_state == SYSTEM_BOOTING))
			play_dead();
#endif
		rcu_idle_exit();
		tick_nohz_idle_exit();
		schedule_preempt_disabled();
	}
}
/*
 * Invoke the RCU callbacks on the specified rcu_ctrlblk structure
 * whose grace period has elapsed.
 */
static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
{
	const char *rn = NULL;
	struct rcu_head *next, *list;
	unsigned long flags;
	RCU_TRACE(int cb_count = 0);

	/* Move the ready-to-invoke callbacks to a local list. */
	local_irq_save(flags);
	RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1));
	list = rcp->rcucblist;
	rcp->rcucblist = *rcp->donetail;
	*rcp->donetail = NULL;
	if (rcp->curtail == rcp->donetail)
		rcp->curtail = &rcp->rcucblist;
	rcp->donetail = &rcp->rcucblist;
	local_irq_restore(flags);

	/* Invoke the callbacks on the local list. */
	RCU_TRACE(rn = rcp->name);
	while (list) {
		next = list->next;
		prefetch(next);
		debug_rcu_head_unqueue(list);
		local_bh_disable();
		__rcu_reclaim(rn, list);
		local_bh_enable();
		list = next;
		RCU_TRACE(cb_count++);
	}
	RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
	RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(),
				      is_idle_task(current), false));
}
/*
 * Let's power down on idle, but only if we are really
 * idle, because once we start down the path of
 * going idle we continue to do idle even if we get
 * a clock tick interrupt . .
 */
void omap_pm_idle(void)
{
	extern __u32 arm_idlect1_mask;
	__u32 use_idlect1 = arm_idlect1_mask;
	int do_sleep = 0;

	local_irq_disable();
	local_fiq_disable();
	if (need_resched()) {
		local_fiq_enable();
		local_irq_enable();
		return;
	}

	/*
	 * Since an interrupt may set up a timer, we don't want to
	 * reprogram the hardware timer with interrupts enabled.
	 * Re-enable interrupts only after returning from idle.
	 */
	timer_dyn_reprogram();

#ifdef CONFIG_OMAP_MPU_TIMER
#warning Enable 32kHz OS timer in order to allow sleep states in idle
	use_idlect1 = use_idlect1 & ~(1 << 9);
#else
	while (enable_dyn_sleep) {
#ifdef CONFIG_CBUS_TAHVO_USB
		extern int vbus_active;

		/* Clock requirements? */
		if (vbus_active)
			break;
#endif
		do_sleep = 1;
		break;
	}
#endif

#ifdef CONFIG_OMAP_DM_TIMER
	use_idlect1 = omap_dm_timer_modify_idlect_mask(use_idlect1);
#endif

	if (omap_dma_running())
		use_idlect1 &= ~(1 << 6);

	/*
	 * We should be able to remove the do_sleep variable and multiple
	 * tests above as soon as drivers, timer and DMA code have been fixed.
	 * Even the sleep block count should become obsolete.
	 */
	if ((use_idlect1 != ~0) || !do_sleep) {
		__u32 saved_idlect1 = omap_readl(ARM_IDLECT1);

		if (cpu_is_omap15xx())
			use_idlect1 &= OMAP1510_BIG_SLEEP_REQUEST;
		else
			use_idlect1 &= OMAP1610_IDLECT1_SLEEP_VAL;
		omap_writel(use_idlect1, ARM_IDLECT1);
		__asm__ volatile ("mcr	p15, 0, r0, c7, c0, 4");
		omap_writel(saved_idlect1, ARM_IDLECT1);

		local_fiq_enable();
		local_irq_enable();
		return;
	}
	/* avoid HT siblings if possible */
	if (cpumask_empty(tmp))
		cpumask_andnot(tmp, cpu_online_mask, pad_busy_cpus);
	if (cpumask_empty(tmp)) {
		mutex_unlock(&round_robin_lock);
		return;
	}
	for_each_cpu(cpu, tmp) {
		if (cpu_weight[cpu] < min_weight) {
			min_weight = cpu_weight[cpu];
			preferred_cpu = cpu;
		}
	}
	if (tsk_in_cpu[tsk_index] != -1)
		cpumask_clear_cpu(tsk_in_cpu[tsk_index], pad_busy_cpus);
	tsk_in_cpu[tsk_index] = preferred_cpu;
	cpumask_set_cpu(preferred_cpu, pad_busy_cpus);
	cpu_weight[preferred_cpu]++;
	mutex_unlock(&round_robin_lock);

	set_cpus_allowed_ptr(current, cpumask_of(preferred_cpu));
}

static void exit_round_robin(unsigned int tsk_index)
{
	struct cpumask *pad_busy_cpus = to_cpumask(pad_busy_cpus_bits);

	cpumask_clear_cpu(tsk_in_cpu[tsk_index], pad_busy_cpus);
	tsk_in_cpu[tsk_index] = -1;
}

static unsigned int idle_pct = 5; /* percentage */
static unsigned int round_robin_time = 1; /* second */
static int power_saving_thread(void *data)
{
	struct sched_param param = {.sched_priority = 1};
	int do_sleep;
	unsigned int tsk_index = (unsigned long)data;
	u64 last_jiffies = 0;

	sched_setscheduler(current, SCHED_RR, &param);
	set_freezable();

	while (!kthread_should_stop()) {
		int cpu;
		u64 expire_time;

		try_to_freeze();

		/* round robin to cpus */
		if (last_jiffies + round_robin_time * HZ < jiffies) {
			last_jiffies = jiffies;
			round_robin_cpu(tsk_index);
		}

		do_sleep = 0;

		expire_time = jiffies + HZ * (100 - idle_pct) / 100;

		while (!need_resched()) {
			if (tsc_detected_unstable && !tsc_marked_unstable) {
				/* TSC could halt in idle, so notify users */
				mark_tsc_unstable("TSC halts in idle");
				tsc_marked_unstable = 1;
			}
			if (lapic_detected_unstable && !lapic_marked_unstable) {
				int i;
				/* LAPIC could halt in idle, so notify users */
				for_each_online_cpu(i)
					clockevents_notify(
						CLOCK_EVT_NOTIFY_BROADCAST_ON,
						&i);
				lapic_marked_unstable = 1;
			}
			local_irq_disable();
			cpu = smp_processor_id();
			if (lapic_marked_unstable)
				clockevents_notify(
					CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
			stop_critical_timings();

			__monitor((void *)&current_thread_info()->flags, 0, 0);
			smp_mb();
			if (!need_resched())
				__mwait(power_saving_mwait_eax, 1);

			start_critical_timings();
			if (lapic_marked_unstable)
				clockevents_notify(
					CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
			local_irq_enable();

			if (jiffies > expire_time) {
				do_sleep = 1;
				break;
			}
		}

		/*
		 * current sched_rt has threshold for rt task running time.
		 * When a rt task uses 95% CPU time, the rt thread will be
		 * scheduled out for 5% CPU time to not starve other tasks. But
		 * the mechanism only works when all CPUs have an RT task
		 * running, because if one CPU has no RT task, RT tasks from
		 * other CPUs will borrow CPU time from this CPU and cause RT
		 * task use > 95% CPU time. To make 'avoid starvation' work,
		 * take a nap here.
		 */
		if (do_sleep)
			schedule_timeout_killable(HZ * idle_pct / 100);
	}

	exit_round_robin(tsk_index);
	return 0;
}

static struct task_struct *ps_tsks[NR_CPUS];
static unsigned int ps_tsk_num;

static int create_power_saving_task(void)
{
	int rc = -ENOMEM;

	ps_tsks[ps_tsk_num] = kthread_run(power_saving_thread,
			(void *)(unsigned long)ps_tsk_num,
			"acpi_pad/%d", ps_tsk_num);
	rc = PTR_RET(ps_tsks[ps_tsk_num]);
	if (!rc)
		ps_tsk_num++;
	else
		ps_tsks[ps_tsk_num] = NULL;

	return rc;
}
/**
 * cpuidle_idle_call - the main idle function
 *
 * NOTE: no locks or semaphores should be used here
 */
static void cpuidle_idle_call(void)
{
	struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
	struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
	int next_state, entered_state;
	bool broadcast;

	/*
	 * Check if the idle task must be rescheduled. If it is the
	 * case, exit the function after re-enabling the local irq.
	 */
	if (need_resched()) {
		local_irq_enable();
		return;
	}

	/*
	 * During the idle period, stop measuring the disabled irqs
	 * critical sections latencies
	 */
	stop_critical_timings();

	/*
	 * Tell the RCU framework we are entering an idle section,
	 * so no more rcu read side critical sections and one more
	 * step to the grace period
	 */
	rcu_idle_enter();

	/*
	 * Ask the cpuidle framework to choose a convenient idle state.
	 * Fall back to the default arch idle method on errors.
	 */
	next_state = cpuidle_select(drv, dev);
	if (next_state < 0) {
use_default:
		/*
		 * We can't use the cpuidle framework, let's use the default
		 * idle routine.
		 */
		if (current_clr_polling_and_test())
			local_irq_enable();
		else
			arch_cpu_idle();

		goto exit_idle;
	}

	/*
	 * The idle task must be scheduled, it is pointless to
	 * go to idle, just update no idle residency and get
	 * out of this function
	 */
	if (current_clr_polling_and_test()) {
		dev->last_residency = 0;
		entered_state = next_state;
		local_irq_enable();
		goto exit_idle;
	}

	broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP);

	/*
	 * Tell the time framework to switch to a broadcast timer
	 * because our local timer will be shutdown. If a local timer
	 * is used from another cpu as a broadcast timer, this call may
	 * fail if it is not available
	 */
	if (broadcast &&
	    clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu))
		goto use_default;

	trace_cpu_idle_rcuidle(next_state, dev->cpu);

	/*
	 * Enter the idle state previously returned by the governor decision.
	 * This function will block until an interrupt occurs and will take
	 * care of re-enabling the local interrupts
	 */
	entered_state = cpuidle_enter(drv, dev, next_state);

	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);

	if (broadcast)
		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);

	/*
	 * Give the governor an opportunity to reflect on the outcome
	 */
	cpuidle_reflect(dev, entered_state);

exit_idle:
	__current_set_polling();

	/*
	 * It is up to the idle functions to reenable local interrupts
	 */
	if (WARN_ON_ONCE(irqs_disabled()))
		local_irq_enable();

	rcu_idle_exit();
	start_critical_timings();
}
/**
 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
 *
 * When the next event is more than a tick into the future, stop the idle tick
 * Called either from the idle loop or from irq_exit() when an idle period was
 * just interrupted by an interrupt which did not cause a reschedule.
 */
void tick_nohz_stop_sched_tick(int inidle)
{
	unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
	struct tick_sched *ts;
	ktime_t last_update, expires, now;
	struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
	u64 time_delta;
	int cpu;

	local_irq_save(flags);

	cpu = smp_processor_id();
	ts = &per_cpu(tick_cpu_sched, cpu);

	/*
	 * Call to tick_nohz_start_idle stops the last_update_time from being
	 * updated. Thus, it must not be called in the event we are called from
	 * irq_exit() with the prior state different than idle.
	 */
	if (!inidle && !ts->inidle)
		goto end;

	/*
	 * Set ts->inidle unconditionally. Even if the system did not
	 * switch to NOHZ mode the cpu frequency governors rely on the
	 * update of the idle time accounting in tick_nohz_start_idle().
	 */
	ts->inidle = 1;

	now = tick_nohz_start_idle(ts);

	/*
	 * If this cpu is offline and it is the one which updates
	 * jiffies, then give up the assignment and let it be taken by
	 * the cpu which runs the tick timer next. If we don't drop
	 * this here the jiffies might be stale and do_timer() never
	 * invoked.
	 */
	if (unlikely(!cpu_online(cpu))) {
		if (cpu == tick_do_timer_cpu)
			tick_do_timer_cpu = TICK_DO_TIMER_NONE;
	}

	if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
		goto end;

	if (need_resched())
		goto end;

	if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
		static int ratelimit;

		if (ratelimit < 10) {
			printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
			       local_softirq_pending());
			ratelimit++;
		}
		goto end;
	}

	ts->idle_calls++;
	/* Read jiffies and the time when jiffies were updated last */
	do {
		seq = read_seqbegin(&xtime_lock);
		last_update = last_jiffies_update;
		last_jiffies = jiffies;

		/*
		 * On SMP we really should only care for the CPU which
		 * has the do_timer duty assigned. All other CPUs can
		 * sleep as long as they want.
		 */
		if (cpu == tick_do_timer_cpu ||
		    tick_do_timer_cpu == TICK_DO_TIMER_NONE)
			time_delta = timekeeping_max_deferment();
		else
			time_delta = KTIME_MAX;
	} while (read_seqretry(&xtime_lock, seq));

	if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
	    arch_needs_cpu(cpu)) {
		next_jiffies = last_jiffies + 1;
		delta_jiffies = 1;
	} else {
		/* Get the next timer wheel timer */
		next_jiffies = get_next_timer_interrupt(last_jiffies);
		delta_jiffies = next_jiffies - last_jiffies;
	}

	/*
	 * Do not stop the tick, if we are only one off
	 * or if the cpu is required for rcu
	 */
	if (!ts->tick_stopped && delta_jiffies == 1)
		goto out;

	/* Schedule the tick, if we are at least one jiffie off */
	if ((long)delta_jiffies >= 1) {
		/*
		 * calculate the expiry time for the next timer wheel
		 * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals
		 * that there is no timer pending or at least extremely
		 * far into the future (12 days for HZ=1000). In this
		 * case we set the expiry to the end of time.
		 */
		if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) {
			/*
			 * Calculate the time delta for the next timer event.
			 * If the time delta exceeds the maximum time delta
			 * permitted by the current clocksource then adjust
			 * the time delta accordingly to ensure the
			 * clocksource does not wrap.
			 */
			time_delta = min_t(u64, time_delta,
					   tick_period.tv64 * delta_jiffies);
			expires = ktime_add_ns(last_update, time_delta);
		} else {
			expires.tv64 = KTIME_MAX;
		}

		/*
		 * If this cpu is the one which updates jiffies, then
		 * give up the assignment and let it be taken by the
		 * cpu which runs the tick timer next, which might be
		 * this cpu as well. If we don't drop this here the
		 * jiffies might be stale and do_timer() never
		 * invoked.
		 */
		if (cpu == tick_do_timer_cpu)
			tick_do_timer_cpu = TICK_DO_TIMER_NONE;

		if (delta_jiffies > 1)
			cpumask_set_cpu(cpu, nohz_cpu_mask);

		/* Skip reprogram of event if it's not changed */
		if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
			goto out;

		/*
		 * nohz_stop_sched_tick can be called several times before
		 * the nohz_restart_sched_tick is called. This happens when
		 * interrupts arrive which do not cause a reschedule. In the
		 * first call we save the current tick time, so we can restart
		 * the scheduler tick in nohz_restart_sched_tick.
		 */
		if (!ts->tick_stopped) {
			if (select_nohz_load_balancer(1)) {
				/*
				 * sched tick not stopped!
				 */
				cpumask_clear_cpu(cpu, nohz_cpu_mask);
				goto out;
			}

			ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
			ts->tick_stopped = 1;
			ts->idle_jiffies = last_jiffies;
			rcu_enter_nohz();
		}

		ts->idle_sleeps++;

		/* Mark expires */
		ts->idle_expires = expires;

		/*
		 * If the expiration time == KTIME_MAX, then
		 * in this case we simply stop the tick timer.
		 */
		if (unlikely(expires.tv64 == KTIME_MAX)) {
			if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
				hrtimer_cancel(&ts->sched_timer);
			goto out;
		}

		if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
			hrtimer_start(&ts->sched_timer, expires,
				      HRTIMER_MODE_ABS_PINNED);
			/* Check, if the timer was already in the past */
			if (hrtimer_active(&ts->sched_timer))
				goto out;
		} else if (!tick_program_event(expires, 0))
			goto out;
		/*
		 * We are past the event already. So we crossed a
		 * jiffie boundary. Update jiffies and raise the
		 * softirq.
		 */
		tick_do_update_jiffies64(ktime_get());
		cpumask_clear_cpu(cpu, nohz_cpu_mask);
	}
	raise_softirq_irqoff(TIMER_SOFTIRQ);
out:
	ts->next_jiffies = next_jiffies;
	ts->last_jiffies = last_jiffies;
	ts->sleep_length = ktime_sub(dev->next_event, now);
end:
	local_irq_restore(flags);
}
int jbd2_log_do_checkpoint(journal_t *journal)
{
	transaction_t *transaction;
	tid_t this_tid;
	int result;

	jbd_debug(1, "Start checkpoint\n");

	result = jbd2_cleanup_journal_tail(journal);
	trace_jbd2_checkpoint(journal, result);
	jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
	if (result <= 0)
		return result;

	result = 0;
	spin_lock(&journal->j_list_lock);
	if (!journal->j_checkpoint_transactions)
		goto out;
	transaction = journal->j_checkpoint_transactions;
	if (transaction->t_chp_stats.cs_chp_time == 0)
		transaction->t_chp_stats.cs_chp_time = jiffies;
	this_tid = transaction->t_tid;
restart:
	if (journal->j_checkpoint_transactions == transaction &&
			transaction->t_tid == this_tid) {
		int batch_count = 0;
		struct journal_head *jh;
		int retry = 0, err;

		while (!retry && transaction->t_checkpoint_list) {
			jh = transaction->t_checkpoint_list;
			retry = __process_buffer(journal, jh, &batch_count,
						 transaction);
			if (retry < 0 && !result)
				result = retry;
			if (!retry && (need_resched() ||
				spin_needbreak(&journal->j_list_lock))) {
				spin_unlock(&journal->j_list_lock);
				retry = 1;
				break;
			}
		}

		if (batch_count) {
			if (!retry) {
				spin_unlock(&journal->j_list_lock);
				retry = 1;
			}
			__flush_batch(journal, &batch_count);
		}

		if (retry) {
			spin_lock(&journal->j_list_lock);
			goto restart;
		}

		err = __wait_cp_io(journal, transaction);
		if (!result)
			result = err;
	}
out:
	spin_unlock(&journal->j_list_lock);

	if (result < 0)
		jbd2_journal_abort(journal, result);
	else
		result = jbd2_cleanup_journal_tail(journal);

	return (result < 0) ? result : 0;
}
/**
 * cpuidle_idle_call - the main idle function
 *
 * NOTE: no locks or semaphores should be used here
 *
 * On archs that support TIF_POLLING_NRFLAG, is called with polling
 * set, and it returns with polling set. If it ever stops polling, it
 * must clear the polling bit.
 */
static void cpuidle_idle_call(void)
{
	struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
	struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
	int next_state, entered_state;

	/*
	 * Check if the idle task must be rescheduled. If it is the
	 * case, exit the function after re-enabling the local irq.
	 */
	if (need_resched()) {
		local_irq_enable();
		return;
	}

	/*
	 * During the idle period, stop measuring the disabled irqs
	 * critical sections latencies
	 */
	stop_critical_timings();

	/*
	 * Tell the RCU framework we are entering an idle section,
	 * so no more rcu read side critical sections and one more
	 * step to the grace period
	 */
	rcu_idle_enter();

	/*
	 * Check if the cpuidle framework is ready, otherwise fallback
	 * to the default arch specific idle method
	 */
	next_state = cpuidle_select(drv, dev);
	if (next_state < 0) {
		default_idle_call();
		goto exit_idle;
	}

	/*
	 * The idle task must be scheduled, it is pointless to
	 * go to idle, just update no idle residency and get
	 * out of this function
	 */
	if (current_clr_polling_and_test()) {
		dev->last_residency = 0;
		entered_state = next_state;
		local_irq_enable();
		goto exit_idle;
	}

	/* Take note of the planned idle state. */
	idle_set_state(this_rq(), &drv->states[next_state]);

	/*
	 * Enter the idle state previously returned by the governor decision.
	 * This function will block until an interrupt occurs and will take
	 * care of re-enabling the local interrupts
	 */
	entered_state = cpuidle_enter(drv, dev, next_state);

	/* The cpu is no longer idle or about to enter idle. */
	idle_set_state(this_rq(), NULL);

	if (entered_state == -EBUSY) {
		default_idle_call();
		goto exit_idle;
	}

	/*
	 * Give the governor an opportunity to reflect on the outcome
	 */
	cpuidle_reflect(dev, entered_state);

exit_idle:
	__current_set_polling();

	/*
	 * It is up to the idle functions to reenable local interrupts
	 */
	if (WARN_ON_ONCE(irqs_disabled()))
		local_irq_enable();

	rcu_idle_exit();
	start_critical_timings();
}
/*
 * Garbage collector for unused keys.
 *
 * This is done in process context so that we don't have to disable interrupts
 * all over the place.  key_put() schedules this rather than trying to do the
 * cleanup itself, which means key_put() doesn't have to sleep.
 */
static void key_garbage_collector(struct work_struct *work)
{
	static LIST_HEAD(graveyard);
	static u8 gc_state;		/* Internal persistent state */
#define KEY_GC_REAP_AGAIN	0x01	/* - Need another cycle */
#define KEY_GC_REAPING_LINKS	0x02	/* - We need to reap links */
#define KEY_GC_SET_TIMER	0x04	/* - We need to restart the timer */
#define KEY_GC_REAPING_DEAD_1	0x10	/* - We need to mark dead keys */
#define KEY_GC_REAPING_DEAD_2	0x20	/* - We need to reap dead key links */
#define KEY_GC_REAPING_DEAD_3	0x40	/* - We need to reap dead keys */
#define KEY_GC_FOUND_DEAD_KEY	0x80	/* - We found at least one dead key */

	struct rb_node *cursor;
	struct key *key;
	time_t new_timer, limit;

	kenter("[%lx,%x]", key_gc_flags, gc_state);

	limit = current_kernel_time().tv_sec;
	if (limit > key_gc_delay)
		limit -= key_gc_delay;
	else
		limit = key_gc_delay;

	/* Work out what we're going to be doing in this pass */
	gc_state &= KEY_GC_REAPING_DEAD_1 | KEY_GC_REAPING_DEAD_2;
	gc_state <<= 1;
	if (test_and_clear_bit(KEY_GC_KEY_EXPIRED, &key_gc_flags))
		gc_state |= KEY_GC_REAPING_LINKS | KEY_GC_SET_TIMER;

	if (test_and_clear_bit(KEY_GC_REAP_KEYTYPE, &key_gc_flags))
		gc_state |= KEY_GC_REAPING_DEAD_1;
	kdebug("new pass %x", gc_state);

	new_timer = LONG_MAX;

	/* As only this function is permitted to remove things from the key
	 * serial tree, if cursor is non-NULL then it will always point to a
	 * valid node in the tree - even if lock got dropped.
	 */
	spin_lock(&key_serial_lock);
	cursor = rb_first(&key_serial_tree);

continue_scanning:
	while (cursor) {
		key = rb_entry(cursor, struct key, serial_node);
		cursor = rb_next(cursor);

		if (atomic_read(&key->usage) == 0)
			goto found_unreferenced_key;

		if (unlikely(gc_state & KEY_GC_REAPING_DEAD_1)) {
			if (key->type == key_gc_dead_keytype) {
				gc_state |= KEY_GC_FOUND_DEAD_KEY;
				set_bit(KEY_FLAG_DEAD, &key->flags);
				key->perm = 0;
				goto skip_dead_key;
			}
		}

		if (gc_state & KEY_GC_SET_TIMER) {
			if (key->expiry > limit && key->expiry < new_timer) {
				kdebug("will expire %x in %ld",
				       key_serial(key), key->expiry - limit);
				new_timer = key->expiry;
			}
		}

		if (unlikely(gc_state & KEY_GC_REAPING_DEAD_2))
			if (key->type == key_gc_dead_keytype)
				gc_state |= KEY_GC_FOUND_DEAD_KEY;

		if ((gc_state & KEY_GC_REAPING_LINKS) ||
		    unlikely(gc_state & KEY_GC_REAPING_DEAD_2)) {
			if (key->type == &key_type_keyring)
				goto found_keyring;
		}

		if (unlikely(gc_state & KEY_GC_REAPING_DEAD_3))
			if (key->type == key_gc_dead_keytype)
				goto destroy_dead_key;

	skip_dead_key:
		if (spin_is_contended(&key_serial_lock) || need_resched())
			goto contended;
	}

contended:
	spin_unlock(&key_serial_lock);

maybe_resched:
	if (cursor) {
		cond_resched();
		spin_lock(&key_serial_lock);
		goto continue_scanning;
	}

	/* We've completed the pass.  Set the timer if we need to and queue a
	 * new cycle if necessary.  We keep executing cycles until we find one
	 * where we didn't reap any keys.
	 */
	kdebug("pass complete");

	if (gc_state & KEY_GC_SET_TIMER && new_timer != (time_t)LONG_MAX) {
		new_timer += key_gc_delay;
		key_schedule_gc(new_timer);
	}

	if (unlikely(gc_state & KEY_GC_REAPING_DEAD_2) ||
	    !list_empty(&graveyard)) {
		/* Make sure that all pending keyring payload destructions are
		 * fulfilled and that people aren't now looking at dead or
		 * dying keys that they don't have a reference upon or a link
		 * to.
		 */
		kdebug("gc sync");
		synchronize_rcu();
	}

	if (!list_empty(&graveyard)) {
		kdebug("gc keys");
		key_gc_unused_keys(&graveyard);
	}

	if (unlikely(gc_state & (KEY_GC_REAPING_DEAD_1 |
				 KEY_GC_REAPING_DEAD_2))) {
		if (!(gc_state & KEY_GC_FOUND_DEAD_KEY)) {
			/* No remaining dead keys: short circuit the remaining
			 * keytype reap cycles.
			 */
			kdebug("dead short");
			gc_state &= ~(KEY_GC_REAPING_DEAD_1 | KEY_GC_REAPING_DEAD_2);
			gc_state |= KEY_GC_REAPING_DEAD_3;
		} else {
			gc_state |= KEY_GC_REAP_AGAIN;
		}
	}

	if (unlikely(gc_state & KEY_GC_REAPING_DEAD_3)) {
		kdebug("dead wake");
		smp_mb();
		clear_bit(KEY_GC_REAPING_KEYTYPE, &key_gc_flags);
		wake_up_bit(&key_gc_flags, KEY_GC_REAPING_KEYTYPE);
	}

	if (gc_state & KEY_GC_REAP_AGAIN)
		schedule_work(&key_gc_work);
	kleave(" [end %x]", gc_state);
	return;

	/* We found an unreferenced key - once we've removed it from the tree,
	 * we can safely drop the lock.
	 */
found_unreferenced_key:
	kdebug("unrefd key %d", key->serial);
	rb_erase(&key->serial_node, &key_serial_tree);
	spin_unlock(&key_serial_lock);

	list_add_tail(&key->graveyard_link, &graveyard);
	gc_state |= KEY_GC_REAP_AGAIN;
	goto maybe_resched;

	/* We found a keyring and we need to check the payload for links to
	 * dead or expired keys.  We don't flag another reap immediately as we
	 * have to wait for the old payload to be destroyed by RCU before we
	 * can reap the keys to which it refers.
	 */
found_keyring:
	spin_unlock(&key_serial_lock);
	kdebug("scan keyring %d", key->serial);
	key_gc_keyring(key, limit);
	goto maybe_resched;

	/* We found a dead key that is still referenced.  Reset its type and
	 * destroy its payload with its semaphore held.
	 */
destroy_dead_key:
	spin_unlock(&key_serial_lock);
	kdebug("destroy key %d", key->serial);
	down_write(&key->sem);
	key->type = &key_type_dead;
	if (key_gc_dead_keytype->destroy)
		key_gc_dead_keytype->destroy(key);
	memset(&key->payload, KEY_DESTROY, sizeof(key->payload));
	up_write(&key->sem);
	goto maybe_resched;
}
/*
 * Submit all the data buffers to disk
 */
static int journal_submit_data_buffers(journal_t *journal,
					transaction_t *commit_transaction,
					int write_op)
{
	struct journal_head *jh;
	struct buffer_head *bh;
	int locked;
	int bufs = 0;
	struct buffer_head **wbuf = journal->j_wbuf;
	int err = 0;

	/*
	 * Whenever we unlock the journal and sleep, things can get added
	 * onto ->t_sync_datalist, so we have to keep looping back to
	 * write_out_data until we *know* that the list is empty.
	 *
	 * Cleanup any flushed data buffers from the data list.  Even in
	 * abort mode, we want to flush this out as soon as possible.
	 */
write_out_data:
	cond_resched();
	spin_lock(&journal->j_list_lock);

	while (commit_transaction->t_sync_datalist) {
		jh = commit_transaction->t_sync_datalist;
		bh = jh2bh(jh);
		locked = 0;

		/* Get reference just to make sure buffer does not disappear
		 * when we are forced to drop various locks */
		get_bh(bh);
		/* If the buffer is dirty, we need to submit IO and hence
		 * we need the buffer lock. We try to lock the buffer without
		 * blocking. If we fail, we need to drop j_list_lock and do
		 * blocking lock_buffer().
		 */
		if (buffer_dirty(bh)) {
			if (!trylock_buffer(bh)) {
				BUFFER_TRACE(bh, "needs blocking lock");
				spin_unlock(&journal->j_list_lock);
				trace_jbd_do_submit_data(journal,
							 commit_transaction);
				/* Write out all data to prevent deadlocks */
				journal_do_submit_data(wbuf, bufs, write_op);
				bufs = 0;
				lock_buffer(bh);
				spin_lock(&journal->j_list_lock);
			}
			locked = 1;
		}
		/* We have to get bh_state lock. Again out of order, sigh. */
		if (!inverted_lock(journal, bh)) {
			jbd_lock_bh_state(bh);
			spin_lock(&journal->j_list_lock);
		}
		/* Someone already cleaned up the buffer? */
		if (!buffer_jbd(bh) || bh2jh(bh) != jh ||
			jh->b_transaction != commit_transaction ||
			jh->b_jlist != BJ_SyncData) {
			jbd_unlock_bh_state(bh);
			if (locked)
				unlock_buffer(bh);
			BUFFER_TRACE(bh, "already cleaned up");
			release_data_buffer(bh);
			continue;
		}
		if (locked && test_clear_buffer_dirty(bh)) {
			BUFFER_TRACE(bh, "needs writeout, adding to array");
			wbuf[bufs++] = bh;
			__journal_file_buffer(jh, commit_transaction,
						BJ_Locked);
			jbd_unlock_bh_state(bh);
			if (bufs == journal->j_wbufsize) {
				spin_unlock(&journal->j_list_lock);
				trace_jbd_do_submit_data(journal,
							 commit_transaction);
				journal_do_submit_data(wbuf, bufs, write_op);
				bufs = 0;
				goto write_out_data;
			}
		} else if (!locked && buffer_locked(bh)) {
			__journal_file_buffer(jh, commit_transaction,
						BJ_Locked);
			jbd_unlock_bh_state(bh);
			put_bh(bh);
		} else {
			BUFFER_TRACE(bh, "writeout complete: unfile");
			if (unlikely(!buffer_uptodate(bh)))
				err = -EIO;
			__journal_unfile_buffer(jh);
			jbd_unlock_bh_state(bh);
			if (locked)
				unlock_buffer(bh);
			release_data_buffer(bh);
		}

		if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
			spin_unlock(&journal->j_list_lock);
			goto write_out_data;
		}
	}
	spin_unlock(&journal->j_list_lock);
	trace_jbd_do_submit_data(journal, commit_transaction);
	journal_do_submit_data(wbuf, bufs, write_op);

	return err;
}
asmlinkage void __do_softirq(void)
{
	struct softirq_action *h;
	__u32 pending;
	unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
	int cpu;
	unsigned long old_flags = current->flags;
	int max_restart = MAX_SOFTIRQ_RESTART;

	/*
	 * Mask out PF_MEMALLOC as the current task context is borrowed for the
	 * softirq. A softirq handler such as network RX might set PF_MEMALLOC
	 * again if the socket is related to swap.
	 */
	current->flags &= ~PF_MEMALLOC;

	pending = local_softirq_pending();
	account_system_vtime(current);

	__local_bh_disable((unsigned long)__builtin_return_address(0),
				SOFTIRQ_OFFSET);
	lockdep_softirq_enter();

	cpu = smp_processor_id();
restart:
	/* Reset the pending bitmask before enabling irqs */
	set_softirq_pending(0);

	local_irq_enable();

	h = softirq_vec;

	do {
		if (pending & 1) {
			unsigned int vec_nr = h - softirq_vec;
			int prev_count = preempt_count();

			kstat_incr_softirqs_this_cpu(vec_nr);

			trace_softirq_entry(vec_nr);
			h->action(h);
			trace_softirq_exit(vec_nr);
			if (unlikely(prev_count != preempt_count())) {
				printk(KERN_ERR "huh, entered softirq %u %s %p "
				       "with preempt_count %08x, "
				       "exited with %08x?\n", vec_nr,
				       softirq_to_name[vec_nr], h->action,
				       prev_count, preempt_count());
				preempt_count_set(prev_count);
			}

			rcu_bh_qs(cpu);
		}
		h++;
		pending >>= 1;
	} while (pending);

	local_irq_disable();

	pending = local_softirq_pending();
	if (pending) {
		if (time_before(jiffies, end) && !need_resched() &&
		    --max_restart)
			goto restart;

		wakeup_softirqd();
	}

	lockdep_softirq_exit();

	account_system_vtime(current);
	__local_bh_enable(SOFTIRQ_OFFSET);
	tsk_restore_flags(current, old_flags, PF_MEMALLOC);
}
/*
 * Lock a mutex (possibly interruptible), slowpath:
 */
static inline int __sched
__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
		    struct lockdep_map *nest_lock, unsigned long ip)
{
	struct task_struct *task = current;
	struct mutex_waiter waiter;
	unsigned long flags;

	preempt_disable();
	mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);

#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
	/*
	 * Optimistic spinning.
	 *
	 * We try to spin for acquisition when we find that there are no
	 * pending waiters and the lock owner is currently running on a
	 * (different) CPU.
	 *
	 * The rationale is that if the lock owner is running, it is likely to
	 * release the lock soon.
	 *
	 * Since this needs the lock owner, and this mutex implementation
	 * doesn't track the owner atomically in the lock field, we need to
	 * track it non-atomically.
	 *
	 * We can't do this for DEBUG_MUTEXES because that relies on wait_lock
	 * to serialize everything.
	 *
	 * The mutex spinners are queued up using MCS lock so that only one
	 * spinner can compete for the mutex. However, if mutex spinning isn't
	 * going to happen, there is no point in going through the lock/unlock
	 * overhead.
	 */
	if (!mutex_can_spin_on_owner(lock))
		goto slowpath;

	for (;;) {
		struct task_struct *owner;
		struct mspin_node node;

		/*
		 * If there's an owner, wait for it to either
		 * release the lock or go to sleep.
		 */
		mspin_lock(MLOCK(lock), &node);
		owner = ACCESS_ONCE(lock->owner);
		if (owner && !mutex_spin_on_owner(lock, owner)) {
			mspin_unlock(MLOCK(lock), &node);
			break;
		}

		if ((atomic_read(&lock->count) == 1) &&
		    (atomic_cmpxchg(&lock->count, 1, 0) == 1)) {
			lock_acquired(&lock->dep_map, ip);
			mutex_set_owner(lock);
			mspin_unlock(MLOCK(lock), &node);
			preempt_enable();
			return 0;
		}
		mspin_unlock(MLOCK(lock), &node);

		/*
		 * When there's no owner, we might have preempted between the
		 * owner acquiring the lock and setting the owner field. If
		 * we're an RT task that will live-lock because we won't let
		 * the owner complete.
		 */
		if (!owner && (need_resched() || rt_task(task)))
			break;

		/*
		 * The cpu_relax() call is a compiler barrier which forces
		 * everything in this loop to be re-loaded. We don't need
		 * memory barriers as we'll eventually observe the right
		 * values at the cost of a few extra spins.
		 */
		arch_mutex_cpu_relax();
	}
slowpath:
#endif
	spin_lock_mutex(&lock->wait_lock, flags);

	debug_mutex_lock_common(lock, &waiter);
	debug_mutex_add_waiter(lock, &waiter, task_thread_info(task));

	/* add waiting tasks to the end of the waitqueue (FIFO): */
	list_add_tail(&waiter.list, &lock->wait_list);
	waiter.task = task;

	if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, -1) == 1))
		goto done;

	lock_contended(&lock->dep_map, ip);

	for (;;) {
		/*
		 * Let's try to take the lock again - this is needed even if
		 * we get here for the first time (shortly after failing to
		 * acquire the lock), to make sure that we get a wakeup once
		 * it's unlocked. Later on, if we sleep, this is the
		 * operation that gives us the lock. We xchg it to -1, so
		 * that when we release the lock, we properly wake up the
		 * other waiters:
		 */
		if (MUTEX_SHOW_NO_WAITER(lock) &&
		   (atomic_xchg(&lock->count, -1) == 1))
			break;

		/*
		 * got a signal? (This code gets eliminated in the
		 * TASK_UNINTERRUPTIBLE case.)
		 */
		if (unlikely(signal_pending_state(state, task))) {
			mutex_remove_waiter(lock, &waiter,
					    task_thread_info(task));
			mutex_release(&lock->dep_map, 1, ip);
			spin_unlock_mutex(&lock->wait_lock, flags);

			debug_mutex_free_waiter(&waiter);
			preempt_enable();
			return -EINTR;
		}
		__set_task_state(task, state);

		/* didn't get the lock, go to sleep: */
		spin_unlock_mutex(&lock->wait_lock, flags);
		schedule_preempt_disabled();
		spin_lock_mutex(&lock->wait_lock, flags);
	}

done:
	lock_acquired(&lock->dep_map, ip);
	/* got the lock - rejoice! */
	mutex_remove_waiter(lock, &waiter, current_thread_info());
	mutex_set_owner(lock);

	/* set it to 0 if there are no waiters left: */
	if (likely(list_empty(&lock->wait_list)))
		atomic_set(&lock->count, 0);

	spin_unlock_mutex(&lock->wait_lock, flags);

	debug_mutex_free_waiter(&waiter);
	preempt_enable();

	return 0;
}
/*
 * Perform an actual checkpoint. We take the first transaction on the
 * list of transactions to be checkpointed and send all its buffers
 * to disk. We submit larger chunks of data at once.
 *
 * The journal should be locked before calling this function.
 * Called with j_checkpoint_mutex held.
 */
int jbd2_log_do_checkpoint(journal_t *journal)
{
	struct journal_head	*jh;
	struct buffer_head	*bh;
	transaction_t		*transaction;
	tid_t			this_tid;
	int			result, batch_count = 0;

	jbd_debug(1, "Start checkpoint\n");

	/*
	 * First thing: if there are any transactions in the log which
	 * don't need checkpointing, just eliminate them from the
	 * journal straight away.
	 */
	result = jbd2_cleanup_journal_tail(journal);
	trace_jbd2_checkpoint(journal, result);
	jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
	if (result <= 0)
		return result;

	/*
	 * OK, we need to start writing disk blocks.  Take one transaction
	 * and write it.
	 */
	result = 0;
	spin_lock(&journal->j_list_lock);
	if (!journal->j_checkpoint_transactions)
		goto out;
	transaction = journal->j_checkpoint_transactions;
	if (transaction->t_chp_stats.cs_chp_time == 0)
		transaction->t_chp_stats.cs_chp_time = jiffies;
	this_tid = transaction->t_tid;
restart:
	/*
	 * If someone cleaned up this transaction while we slept, we're
	 * done (maybe it's a new transaction, but it fell at the same
	 * address).
	 */
	if (journal->j_checkpoint_transactions != transaction ||
	    transaction->t_tid != this_tid)
		goto out;

	/* checkpoint all of the transaction's buffers */
	while (transaction->t_checkpoint_list) {
		jh = transaction->t_checkpoint_list;
		bh = jh2bh(jh);

		if (buffer_locked(bh)) {
			spin_unlock(&journal->j_list_lock);
			get_bh(bh);
			wait_on_buffer(bh);
			/* the journal_head may have gone by now */
			BUFFER_TRACE(bh, "brelse");
			__brelse(bh);
			goto retry;
		}
		if (jh->b_transaction != NULL) {
			transaction_t *t = jh->b_transaction;
			tid_t tid = t->t_tid;

			transaction->t_chp_stats.cs_forced_to_close++;
			spin_unlock(&journal->j_list_lock);
			if (unlikely(journal->j_flags & JBD2_UNMOUNT))
				/*
				 * The journal thread is dead; so
				 * starting and waiting for a commit
				 * to finish will cause us to wait for
				 * a _very_ long time.
				 */
				printk(KERN_ERR
				       "JBD2: %s: Waiting for Godot: block %llu\n",
				       journal->j_devname,
				       (unsigned long long) bh->b_blocknr);

			jbd2_log_start_commit(journal, tid);
			jbd2_log_wait_commit(journal, tid);
			goto retry;
		}
		if (!buffer_dirty(bh)) {
			if (unlikely(buffer_write_io_error(bh)) && !result)
				result = -EIO;
			BUFFER_TRACE(bh, "remove from checkpoint");
			if (__jbd2_journal_remove_checkpoint(jh))
				/* The transaction was released; we're done */
				goto out;
			continue;
		}
		/*
		 * Important: we are about to write the buffer, and
		 * possibly block, while still holding the journal
		 * lock.  We cannot afford to let the transaction
		 * logic start messing around with this buffer before
		 * we write it to disk, as that would break
		 * recoverability.
		 */
		BUFFER_TRACE(bh, "queue");
		get_bh(bh);
		J_ASSERT_BH(bh, !buffer_jwrite(bh));
		journal->j_chkpt_bhs[batch_count++] = bh;
		__buffer_relink_io(jh);
		transaction->t_chp_stats.cs_written++;
		if ((batch_count == JBD2_NR_BATCH) ||
		    need_resched() ||
		    spin_needbreak(&journal->j_list_lock))
			goto unlock_and_flush;
	}

	if (batch_count) {
		unlock_and_flush:
			spin_unlock(&journal->j_list_lock);
		retry:
			if (batch_count)
				__flush_batch(journal, &batch_count);
			spin_lock(&journal->j_list_lock);
			goto restart;
	}

	/*
	 * Now we issued all of the transaction's buffers, let's deal
	 * with the buffers that are out for I/O.
	 */
restart2:
	/* Did somebody clean up the transaction in the meanwhile? */
	if (journal->j_checkpoint_transactions != transaction ||
	    transaction->t_tid != this_tid)
		goto out;

	while (transaction->t_checkpoint_io_list) {
		jh = transaction->t_checkpoint_io_list;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			spin_unlock(&journal->j_list_lock);
			get_bh(bh);
			wait_on_buffer(bh);
			/* the journal_head may have gone by now */
			BUFFER_TRACE(bh, "brelse");
			__brelse(bh);
			spin_lock(&journal->j_list_lock);
			goto restart2;
		}
		if (unlikely(buffer_write_io_error(bh)) && !result)
			result = -EIO;

		/*
		 * Now in whatever state the buffer currently is, we
		 * know that it has been written out and so we can
		 * drop it from the list
		 */
		if (__jbd2_journal_remove_checkpoint(jh))
			break;
	}
out:
	spin_unlock(&journal->j_list_lock);
	if (result < 0)
		jbd2_journal_abort(journal, result);
	else
		result = jbd2_cleanup_journal_tail(journal);

	return (result < 0) ? result : 0;
}
bool osq_lock(struct optimistic_spin_queue *lock)
{
	struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
	struct optimistic_spin_node *prev, *next;
	int curr = encode_cpu(smp_processor_id());
	int old;

	node->locked = 0;
	node->next = NULL;
	node->cpu = curr;

	/*
	 * We need both ACQUIRE (pairs with corresponding RELEASE in
	 * unlock() uncontended, or fastpath) and RELEASE (to publish
	 * the node fields we just initialised) semantics when updating
	 * the lock tail.
	 */
	old = atomic_xchg(&lock->tail, curr);
	if (old == OSQ_UNLOCKED_VAL)
		return true;

	prev = decode_cpu(old);
	node->prev = prev;
	WRITE_ONCE(prev->next, node);

	/*
	 * Normally @prev is untouchable after the above store; because at that
	 * moment unlock can proceed and wipe the node element from stack.
	 *
	 * However, since our nodes are static per-cpu storage, we're
	 * guaranteed their existence -- this allows us to apply
	 * cmpxchg in an attempt to undo our queueing.
	 */
	while (!READ_ONCE(node->locked)) {
		/*
		 * If we need to reschedule bail... so we can block.
		 */
		if (need_resched())
			goto unqueue;

		cpu_relax_lowlatency();
	}
	return true;

unqueue:
	/*
	 * Step - A  -- stabilize @prev
	 *
	 * Undo our @prev->next assignment; this will make @prev's
	 * unlock()/unqueue() wait for a next pointer since @lock points to us
	 * (or later).
	 */
	for (;;) {
		if (prev->next == node &&
		    cmpxchg(&prev->next, node, NULL) == node)
			break;

		/*
		 * We can only fail the cmpxchg() racing against an unlock(),
		 * in which case we should observe @node->locked becoming
		 * true.
		 */
		if (smp_load_acquire(&node->locked))
			return true;

		cpu_relax_lowlatency();

		/*
		 * Or we race against a concurrent unqueue()'s step-B, in which
		 * case its step-C will write us a new @node->prev pointer.
		 */
		prev = READ_ONCE(node->prev);
	}

	/*
	 * Step - B -- stabilize @next
	 *
	 * Similar to unlock(), wait for @node->next or move @lock from @node
	 * back to @prev.
	 */
	next = osq_wait_next(lock, node, prev);
	if (!next)
		return false;

	/*
	 * Step - C -- unlink
	 *
	 * @prev is stable because it's still waiting for a new @prev->next
	 * pointer, @next is stable because our @node->next pointer is NULL and
	 * it will wait in Step-A.
	 */
	WRITE_ONCE(next->prev, prev);
	WRITE_ONCE(prev->next, next);

	return false;
}
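A minimal sketch, under stated assumptions, of how osq_lock()/osq_unlock() might be wrapped around an optimistic spin: the wrapper try_optimistic_spin() and the bare atomic lock word are hypothetical, not the mutex or rwsem code that actually uses the OSQ.

/*
 * Sketch only: hold the OSQ so at most one CPU spins on the lock word,
 * and give up when osq_lock() refuses or a reschedule becomes pending,
 * falling back to the caller's sleeping slowpath.
 */
static bool try_optimistic_spin(struct optimistic_spin_queue *osq,
				atomic_t *lock_word)	/* hypothetical */
{
	bool taken = false;

	if (!osq_lock(osq))
		return false;		/* need_resched() or queueing race */

	while (!need_resched()) {
		if (atomic_cmpxchg(lock_word, 0, 1) == 0) {
			taken = true;
			break;
		}
		cpu_relax();
	}

	osq_unlock(osq);
	return taken;
}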
static void tick_nohz_stop_sched_tick(struct tick_sched *ts)
{
	unsigned long seq, last_jiffies, next_jiffies, delta_jiffies;
	unsigned long rcu_delta_jiffies;
	ktime_t last_update, expires, now;
	struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
	u64 time_delta;
	int cpu;

	cpu = smp_processor_id();
	ts = &per_cpu(tick_cpu_sched, cpu);

	now = tick_nohz_start_idle(cpu, ts);

	if (unlikely(!cpu_online(cpu))) {
		if (cpu == tick_do_timer_cpu)
			tick_do_timer_cpu = TICK_DO_TIMER_NONE;
	}

	if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
		return;

	if (need_resched())
		return;

	if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
		static int ratelimit;

		if (ratelimit < 10) {
			printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
			       (unsigned int) local_softirq_pending());
			ratelimit++;
		}
		return;
	}

	ts->idle_calls++;
	do {
		seq = read_seqbegin(&xtime_lock);
		last_update = last_jiffies_update;
		last_jiffies = jiffies;
		time_delta = timekeeping_max_deferment();
	} while (read_seqretry(&xtime_lock, seq));

	if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) ||
	    arch_needs_cpu(cpu)) {
		next_jiffies = last_jiffies + 1;
		delta_jiffies = 1;
	} else {
		next_jiffies = get_next_timer_interrupt(last_jiffies);
		delta_jiffies = next_jiffies - last_jiffies;
		if (rcu_delta_jiffies < delta_jiffies) {
			next_jiffies = last_jiffies + rcu_delta_jiffies;
			delta_jiffies = rcu_delta_jiffies;
		}
	}

	if (!ts->tick_stopped && delta_jiffies == 1)
		goto out;

	if ((long)delta_jiffies >= 1) {
		if (cpu == tick_do_timer_cpu) {
			tick_do_timer_cpu = TICK_DO_TIMER_NONE;
			ts->do_timer_last = 1;
		} else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
			time_delta = KTIME_MAX;
			ts->do_timer_last = 0;
		} else if (!ts->do_timer_last) {
			time_delta = KTIME_MAX;
		}

		if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) {
			time_delta = min_t(u64, time_delta,
					   tick_period.tv64 * delta_jiffies);
		}

		if (time_delta < KTIME_MAX)
			expires = ktime_add_ns(last_update, time_delta);
		else
			expires.tv64 = KTIME_MAX;

		if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
			goto out;

		if (!ts->tick_stopped) {
			select_nohz_load_balancer(1);

			ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
			ts->tick_stopped = 1;
			ts->idle_jiffies = last_jiffies;
		}

		ts->idle_sleeps++;
		ts->idle_expires = expires;

		if (unlikely(expires.tv64 == KTIME_MAX)) {
			if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
				hrtimer_cancel(&ts->sched_timer);
			goto out;
		}

		if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
			hrtimer_start(&ts->sched_timer, expires,
				      HRTIMER_MODE_ABS_PINNED);
			if (hrtimer_active(&ts->sched_timer))
				goto out;
		} else if (!tick_program_event(expires, 0))
			goto out;

		tick_do_update_jiffies64(ktime_get());
	}
	raise_softirq_irqoff(TIMER_SOFTIRQ);
out:
	ts->next_jiffies = next_jiffies;
	ts->last_jiffies = last_jiffies;
}