/** * create_image - Create a hibernation image. * @platform_mode: Whether or not to use the platform driver. * * Execute device drivers' "late" and "noirq" freeze callbacks, create a * hibernation image and run the drivers' "noirq" and "early" thaw callbacks. * * Control reappears in this routine after the subsequent restore. */ static int create_image(int platform_mode) { int error; error = dpm_suspend_end(PMSG_FREEZE); if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " "aborting hibernation\n"); return error; } error = platform_pre_snapshot(platform_mode); if (error || hibernation_test(TEST_PLATFORM)) goto Platform_finish; error = disable_nonboot_cpus(); if (error || hibernation_test(TEST_CPUS)) goto Enable_cpus; local_irq_disable(); error = syscore_suspend(); if (error) { printk(KERN_ERR "PM: Some system devices failed to power down, " "aborting hibernation\n"); goto Enable_irqs; } if (hibernation_test(TEST_CORE) || pm_wakeup_pending()) goto Power_up; in_suspend = 1; save_processor_state(); error = swsusp_arch_suspend(); if (error) printk(KERN_ERR "PM: Error %d creating hibernation image\n", error); /* Restore control flow magically appears here */ restore_processor_state(); if (!in_suspend) { events_check_enabled = false; platform_leave(platform_mode); } Power_up: syscore_resume(); Enable_irqs: local_irq_enable(); Enable_cpus: enable_nonboot_cpus(); Platform_finish: platform_finish(platform_mode); dpm_resume_start(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); return error; }
/** * acpi_idle_enter_simple - enters an ACPI state without BM handling * @dev: the target CPU * @drv: cpuidle driver with cpuidle state information * @index: the index of suggested state */ static int acpi_idle_enter_simple(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) { struct acpi_processor *pr; struct cpuidle_state_usage *state_usage = &dev->states_usage[index]; struct acpi_processor_cx *cx = cpuidle_get_statedata(state_usage); ktime_t kt1, kt2; s64 idle_time_ns; s64 idle_time; pr = __this_cpu_read(processors); dev->last_residency = 0; if (unlikely(!pr)) return -EINVAL; local_irq_disable(); if (cx->entry_method != ACPI_CSTATE_FFH) { current_thread_info()->status &= ~TS_POLLING; /* * TS_POLLING-cleared state must be visible before we test * NEED_RESCHED: */ smp_mb(); if (unlikely(need_resched())) { current_thread_info()->status |= TS_POLLING; local_irq_enable(); return -EINVAL; } } /* * Must be done before busmaster disable as we might need to * access HPET ! */ lapic_timer_state_broadcast(pr, cx, 1); if (cx->type == ACPI_STATE_C3) ACPI_FLUSH_CPU_CACHE(); kt1 = ktime_get_real(); /* Tell the scheduler that we are going deep-idle: */ sched_clock_idle_sleep_event(); acpi_idle_do_entry(cx); kt2 = ktime_get_real(); idle_time_ns = ktime_to_ns(ktime_sub(kt2, kt1)); idle_time = idle_time_ns; do_div(idle_time, NSEC_PER_USEC); /* Update device last_residency*/ dev->last_residency = (int)idle_time; /* Tell the scheduler how much we idled: */ sched_clock_idle_wakeup_event(idle_time_ns); local_irq_enable(); if (cx->entry_method != ACPI_CSTATE_FFH) current_thread_info()->status |= TS_POLLING; lapic_timer_state_broadcast(pr, cx, 0); return index; }
/** * cpuidle_idle_call - the main idle loop * * NOTE: no locks or semaphores should be used here * return non-zero on failure */ int cpuidle_idle_call(void) { struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); struct cpuidle_driver *drv = cpuidle_get_driver(); int next_state, entered_state; if (off) return -ENODEV; if (!initialized) return -ENODEV; /* check if the device is ready */ if (!dev || !dev->enabled) return -EBUSY; #if 0 /* shows regressions, re-enable for 2.6.29 */ /* * run any timers that can be run now, at this point * before calculating the idle duration etc. */ hrtimer_peek_ahead_timers(); #endif /* ask the governor for the next state */ next_state = cpuidle_curr_governor->select(drv, dev); if (need_resched()) { dev->last_residency = 0; /* give the governor an opportunity to reflect on the outcome */ if (cpuidle_curr_governor->reflect) cpuidle_curr_governor->reflect(dev, next_state); local_irq_enable(); return 0; } trace_power_start_rcuidle(POWER_CSTATE, next_state, dev->cpu); trace_cpu_idle_rcuidle(next_state, dev->cpu); entered_state = cpuidle_enter_ops(dev, drv, next_state); trace_power_end_rcuidle(dev->cpu); trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu); if (entered_state >= 0) { /* Update cpuidle counters */ /* This can be moved to within driver enter routine * but that results in multiple copies of same code. */ dev->states_usage[entered_state].time += (unsigned long long)dev->last_residency; dev->states_usage[entered_state].usage++; } else { dev->last_residency = 0; } /* give the governor an opportunity to reflect on the outcome */ if (cpuidle_curr_governor->reflect) cpuidle_curr_governor->reflect(dev, entered_state); return 0; }
static inline void preempt_conditional_sti(struct pt_regs *regs) { inc_preempt_count(); if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); }
int get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { struct mm_struct *mm = current->mm; unsigned long addr, len, end; unsigned long next; pgd_t *pgdp; int nr = 0; pr_devel("%s(%lx,%x,%s)\n", __func__, start, nr_pages, write ? "write" : "read"); start &= PAGE_MASK; addr = start; len = (unsigned long) nr_pages << PAGE_SHIFT; end = start + len; if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, start, len))) goto slow_irqon; pr_devel(" aligned: %lx .. %lx\n", start, end); /* * XXX: batch / limit 'nr', to avoid large irq off latency * needs some instrumenting to determine the common sizes used by * important workloads (eg. DB2), and whether limiting the batch size * will decrease performance. * * It seems like we're in the clear for the moment. Direct-IO is * the main guy that batches up lots of get_user_pages, and even * they are limited to 64-at-a-time which is not so many. */ /* * This doesn't prevent pagetable teardown, but does prevent * the pagetables from being freed on powerpc. * * So long as we atomically load page table pointers versus teardown, * we can follow the address down to the the page and take a ref on it. */ local_irq_disable(); pgdp = pgd_offset(mm, addr); do { pgd_t pgd = *pgdp; pr_devel(" %016lx: normal pgd %p\n", addr, (void *)pgd_val(pgd)); next = pgd_addr_end(addr, end); if (pgd_none(pgd)) goto slow; if (is_hugepd(pgdp)) { if (!gup_hugepd((hugepd_t *)pgdp, PGDIR_SHIFT, addr, next, write, pages, &nr)) goto slow; } else if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) goto slow; } while (pgdp++, addr = next, addr != end); local_irq_enable(); VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT); return nr; { int ret; slow: local_irq_enable(); slow_irqon: pr_devel(" slow path ! nr = %d\n", nr); /* Try to get the remaining pages with get_user_pages */ start += nr << PAGE_SHIFT; pages += nr; down_read(&mm->mmap_sem); ret = get_user_pages(current, mm, start, (end - start) >> PAGE_SHIFT, write, 0, pages, NULL); up_read(&mm->mmap_sem); /* Have to be a bit careful with return values */ if (nr > 0) { if (ret < 0) ret = nr; else ret += nr; } return ret; } }
/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */ __visible long do_fast_syscall_32(struct pt_regs *regs) { /* * Called using the internal vDSO SYSENTER/SYSCALL32 calling * convention. Adjust regs so it looks like we entered using int80. */ unsigned long landing_pad = (unsigned long)current->mm->context.vdso + vdso_image_32.sym_int80_landing_pad; /* * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward * so that 'regs->ip -= 2' lands back on an int $0x80 instruction. * Fix it up. */ regs->ip = landing_pad; enter_from_user_mode(); local_irq_enable(); /* Fetch EBP from where the vDSO stashed it. */ if ( #ifdef CONFIG_X86_64 /* * Micro-optimization: the pointer we're following is explicitly * 32 bits, so it can't be out of range. */ __get_user(*(u32 *)®s->bp, (u32 __user __force *)(unsigned long)(u32)regs->sp) #else get_user(*(u32 *)®s->bp, (u32 __user __force *)(unsigned long)(u32)regs->sp) #endif ) { /* User code screwed up. */ local_irq_disable(); regs->ax = -EFAULT; prepare_exit_to_usermode(regs); return 0; /* Keep it simple: use IRET. */ } /* Now this is just like a normal syscall. */ do_syscall_32_irqs_on(regs); #ifdef CONFIG_X86_64 /* * Opportunistic SYSRETL: if possible, try to return using SYSRETL. * SYSRETL is available on all 64-bit CPUs, so we don't need to * bother with SYSEXIT. * * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP, * because the ECX fixup above will ensure that this is essentially * never the case. */ return regs->cs == __USER32_CS && regs->ss == __USER_DS && regs->ip == landing_pad && (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0; #else /* * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT. * * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP, * because the ECX fixup above will ensure that this is essentially * never the case. * * We don't allow syscalls at all from VM86 mode, but we still * need to check VM, because we might be returning from sys_vm86. */ return static_cpu_has(X86_FEATURE_SEP) && regs->cs == __USER_CS && regs->ss == __USER_DS && regs->ip == landing_pad && (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0; #endif }
static inline void gpio_unlock(void) { local_irq_enable(); }
/* * This thread processes interrupts reported by the Primary Interrupt Handler. */ static int twl6030_irq_thread(void *data) { long irq = (long)data; static unsigned i2c_errors; static const unsigned max_i2c_errors = 100; int ret; while (!kthread_should_stop()) { int i; union { u8 bytes[4]; u32 int_sts; } sts; /* Wait for IRQ, then read PIH irq status (also blocking) */ wait_for_completion_interruptible(&irq_event); /* read INT_STS_A, B and C in one shot using a burst read */ ret = twl_i2c_read(TWL_MODULE_PIH, sts.bytes, REG_INT_STS_A, 3); if (ret) { pr_warning("twl6030: I2C error %d reading PIH ISR\n", ret); if (++i2c_errors >= max_i2c_errors) { printk(KERN_ERR "Maximum I2C error count" " exceeded. Terminating %s.\n", __func__); break; } complete(&irq_event); continue; } sts.bytes[3] = 0; /* Only 24 bits are valid*/ /* * Since VBUS status bit is not reliable for VBUS disconnect * use CHARGER VBUS detection status bit instead. */ if (sts.bytes[2] & 0x10) sts.bytes[2] |= 0x08; for (i = 0; sts.int_sts; sts.int_sts >>= 1, i++) { local_irq_disable(); if (sts.int_sts & 0x1) { int module_irq = twl6030_irq_base + twl6030_interrupt_mapping[i]; generic_handle_irq(module_irq); } local_irq_enable(); } /* * NOTE: * Simulation confirms that documentation is wrong w.r.t the * interrupt status clear operation. A single *byte* write to * any one of STS_A to STS_C register results in all three * STS registers being reset. Since it does not matter which * value is written, all three registers are cleared on a * single byte write, so we just use 0x0 to clear. */ ret = twl_i2c_write_u8(TWL_MODULE_PIH, 0x00, REG_INT_STS_A); if (ret) pr_warning("twl6030: I2C error in clearing PIH ISR\n"); enable_irq(irq); } return 0; }
/* default implementation */ void __attribute__ ((weak)) arch_suspend_enable_irqs(void) { local_irq_enable(); }
/* * This routine handles page faults. It determines the address, and the * problem, and then passes it handle_page_fault() for normal DTLB and * ITLB issues, and for DMA or SN processor faults when we are in user * space. For the latter, if we're in kernel mode, we just save the * interrupt away appropriately and return immediately. We can't do * page faults for user code while in kernel mode. */ void do_page_fault(struct pt_regs *regs, int fault_num, unsigned long address, unsigned long write) { int is_page_fault; /* This case should have been handled by do_page_fault_ics(). */ BUG_ON(write & ~1); #if CHIP_HAS_TILE_DMA() /* * If it's a DMA fault, suspend the transfer while we're * handling the miss; we'll restart after it's handled. If we * don't suspend, it's possible that this process could swap * out and back in, and restart the engine since the DMA is * still 'running'. */ if (fault_num == INT_DMATLB_MISS || fault_num == INT_DMATLB_ACCESS || fault_num == INT_DMATLB_MISS_DWNCL || fault_num == INT_DMATLB_ACCESS_DWNCL) { __insn_mtspr(SPR_DMA_CTR, SPR_DMA_CTR__SUSPEND_MASK); while (__insn_mfspr(SPR_DMA_USER_STATUS) & SPR_DMA_STATUS__BUSY_MASK) ; } #endif /* Validate fault num and decide if this is a first-time page fault. */ switch (fault_num) { case INT_ITLB_MISS: case INT_DTLB_MISS: #if CHIP_HAS_TILE_DMA() case INT_DMATLB_MISS: case INT_DMATLB_MISS_DWNCL: #endif #if CHIP_HAS_SN_PROC() case INT_SNITLB_MISS: case INT_SNITLB_MISS_DWNCL: #endif is_page_fault = 1; break; case INT_DTLB_ACCESS: #if CHIP_HAS_TILE_DMA() case INT_DMATLB_ACCESS: case INT_DMATLB_ACCESS_DWNCL: #endif is_page_fault = 0; break; default: panic("Bad fault number %d in do_page_fault", fault_num); } #if CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC() if (EX1_PL(regs->ex1) != USER_PL) { struct async_tlb *async; switch (fault_num) { #if CHIP_HAS_TILE_DMA() case INT_DMATLB_MISS: case INT_DMATLB_ACCESS: case INT_DMATLB_MISS_DWNCL: case INT_DMATLB_ACCESS_DWNCL: async = ¤t->thread.dma_async_tlb; break; #endif #if CHIP_HAS_SN_PROC() case INT_SNITLB_MISS: case INT_SNITLB_MISS_DWNCL: async = ¤t->thread.sn_async_tlb; break; #endif default: async = NULL; } if (async) { /* * No vmalloc check required, so we can allow * interrupts immediately at this point. */ local_irq_enable(); set_thread_flag(TIF_ASYNC_TLB); if (async->fault_num != 0) { panic("Second async fault %d;" " old fault was %d (%#lx/%ld)", fault_num, async->fault_num, address, write); } BUG_ON(fault_num == 0); async->fault_num = fault_num; async->is_fault = is_page_fault; async->is_write = write; async->address = address; return; } } #endif handle_page_fault(regs, fault_num, is_page_fault, address, write); }
/** * nohz_restart_sched_tick - restart the idle tick from the idle task * * Restart the idle tick when the CPU is woken up from idle */ void tick_nohz_restart_sched_tick(void) { int cpu = smp_processor_id(); struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); unsigned long ticks; ktime_t now, delta; if (!ts->tick_stopped) return; /* Update jiffies first */ now = ktime_get(); local_irq_disable(); tick_do_update_jiffies64(now); cpu_clear(cpu, nohz_cpu_mask); /* Account the idle time */ delta = ktime_sub(now, ts->idle_entrytime); ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); /* * We stopped the tick in idle. Update process times would miss the * time we slept as update_process_times does only a 1 tick * accounting. Enforce that this is accounted to idle ! */ ticks = jiffies - ts->idle_jiffies; /* * We might be one off. Do not randomly account a huge number of ticks! */ if (ticks && ticks < LONG_MAX) { add_preempt_count(HARDIRQ_OFFSET); account_system_time(current, HARDIRQ_OFFSET, jiffies_to_cputime(ticks)); sub_preempt_count(HARDIRQ_OFFSET); } /* * Cancel the scheduled timer and restore the tick */ ts->tick_stopped = 0; hrtimer_cancel(&ts->sched_timer); ts->sched_timer.expires = ts->idle_tick; while (1) { /* Forward the time to expire in the future */ hrtimer_forward(&ts->sched_timer, now, tick_period); if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { hrtimer_start(&ts->sched_timer, ts->sched_timer.expires, HRTIMER_MODE_ABS); /* Check, if the timer was already in the past */ if (hrtimer_active(&ts->sched_timer)) break; } else { if (!tick_program_event(ts->sched_timer.expires, 0)) break; } /* Update jiffies and reread time */ tick_do_update_jiffies64(now); now = ktime_get(); } local_irq_enable(); }
static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, struct pt_regs *regs) { struct task_struct *tsk; struct mm_struct *mm; int fault, sig, code; unsigned long vm_flags = VM_READ | VM_WRITE | VM_EXEC; unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; tsk = current; mm = tsk->mm; /* Enable interrupts if they were enabled in the parent context. */ if (interrupts_enabled(regs)) local_irq_enable(); /* * If we're in an interrupt or have no user context, we must not take * the fault. */ if (in_atomic() || !mm) goto no_context; if (user_mode(regs)) mm_flags |= FAULT_FLAG_USER; if (esr & ESR_LNX_EXEC) { vm_flags = VM_EXEC; } else if ((esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM)) { vm_flags = VM_WRITE; mm_flags |= FAULT_FLAG_WRITE; } /* * PAN bit set implies the fault happened in kernel space, but not * in the arch's user access functions. */ if (IS_ENABLED(CONFIG_ARM64_PAN) && (regs->pstate & PSR_PAN_BIT)) goto no_context; /* * As per x86, we may deadlock here. However, since the kernel only * validly references user space from well defined areas of the code, * we can bug out early if this is from code which shouldn't. */ if (!down_read_trylock(&mm->mmap_sem)) { if (!user_mode(regs) && !search_exception_tables(regs->pc)) goto no_context; retry: down_read(&mm->mmap_sem); } else { /* * The above down_read_trylock() might have succeeded in which * case, we'll have missed the might_sleep() from down_read(). */ might_sleep(); #ifdef CONFIG_DEBUG_VM if (!user_mode(regs) && !search_exception_tables(regs->pc)) goto no_context; #endif } fault = __do_page_fault(mm, addr, mm_flags, vm_flags, tsk); /* * If we need to retry but a fatal signal is pending, handle the * signal first. We do not need to release the mmap_sem because it * would already be released in __lock_page_or_retry in mm/filemap.c. */ if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) return 0; /* * Major/minor page fault accounting is only done on the initial * attempt. If we go through a retry, it is extremely likely that the * page will be found in page cache at that point. */ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); if (mm_flags & FAULT_FLAG_ALLOW_RETRY) { if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, addr); } else { tsk->min_flt++; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, addr); } if (fault & VM_FAULT_RETRY) { /* * Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk of * starvation. */ mm_flags &= ~FAULT_FLAG_ALLOW_RETRY; mm_flags |= FAULT_FLAG_TRIED; goto retry; } } up_read(&mm->mmap_sem); /* * Handle the "normal" case first - VM_FAULT_MAJOR / VM_FAULT_MINOR */ if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS)))) return 0; /* * If we are in kernel mode at this point, we have no context to * handle this fault with. */ if (!user_mode(regs)) goto no_context; if (fault & VM_FAULT_OOM) { /* * We ran out of memory, call the OOM killer, and return to * userspace (which will retry the fault, or kill us if we got * oom-killed). */ pagefault_out_of_memory(); return 0; } if (fault & VM_FAULT_SIGBUS) { /* * We had some memory, but were unable to successfully fix up * this page fault. */ sig = SIGBUS; code = BUS_ADRERR; } else { /* * Something tried to access memory that isn't in our memory * map. */ sig = SIGSEGV; code = fault == VM_FAULT_BADACCESS ? SEGV_ACCERR : SEGV_MAPERR; } __do_user_fault(tsk, addr, esr, sig, code, regs); return 0; no_context: __do_kernel_fault(mm, addr, esr, regs); return 0; }
/** * hibernation_platform_enter - Power off the system using the platform driver. */ int hibernation_platform_enter(void) { int error; if (!hibernation_ops) return -ENOSYS; /* * We have cancelled the power transition by running * hibernation_ops->finish() before saving the image, so we should let * the firmware know that we're going to enter the sleep state after all */ error = hibernation_ops->begin(); if (error) goto Close; entering_platform_hibernation = true; suspend_console(); ftrace_stop(); error = dpm_suspend_start(PMSG_HIBERNATE); if (error) { if (hibernation_ops->recover) hibernation_ops->recover(); goto Resume_devices; } error = dpm_suspend_end(PMSG_HIBERNATE); if (error) goto Resume_devices; error = hibernation_ops->prepare(); if (error) goto Platform_finish; error = disable_nonboot_cpus(); if (error) goto Platform_finish; local_irq_disable(); syscore_suspend(); if (pm_wakeup_pending()) { error = -EAGAIN; goto Power_up; } hibernation_ops->enter(); /* We should never get here */ while (1); Power_up: syscore_resume(); local_irq_enable(); enable_nonboot_cpus(); Platform_finish: hibernation_ops->finish(); dpm_resume_start(PMSG_RESTORE); Resume_devices: entering_platform_hibernation = false; dpm_resume_end(PMSG_RESTORE); ftrace_start(); resume_console(); Close: hibernation_ops->end(); return error; }
/** * resume_target_kernel - Restore system state from a hibernation image. * @platform_mode: Whether or not to use the platform driver. * * Execute device drivers' "noirq" and "late" freeze callbacks, restore the * contents of highmem that have not been restored yet from the image and run * the low-level code that will restore the remaining contents of memory and * switch to the just restored target kernel. */ static int resume_target_kernel(bool platform_mode) { int error; error = dpm_suspend_end(PMSG_QUIESCE); if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " "aborting resume\n"); return error; } error = platform_pre_restore(platform_mode); if (error) goto Cleanup; error = disable_nonboot_cpus(); if (error) goto Enable_cpus; local_irq_disable(); error = syscore_suspend(); if (error) goto Enable_irqs; save_processor_state(); error = restore_highmem(); if (!error) { error = swsusp_arch_resume(); /* * The code below is only ever reached in case of a failure. * Otherwise, execution continues at the place where * swsusp_arch_suspend() was called. */ BUG_ON(!error); /* * This call to restore_highmem() reverts the changes made by * the previous one. */ restore_highmem(); } /* * The only reason why swsusp_arch_resume() can fail is memory being * very tight, so we have to free it as soon as we can to avoid * subsequent failures. */ swsusp_free(); restore_processor_state(); touch_softlockup_watchdog(); syscore_resume(); Enable_irqs: local_irq_enable(); Enable_cpus: enable_nonboot_cpus(); Cleanup: platform_restore_cleanup(platform_mode); dpm_resume_start(PMSG_RECOVER); return error; }
void _spin_unlock_irq(spinlock_t *lock) { _spin_unlock(lock); local_irq_enable(); }
asmlinkage void __init start_kernel(void) { char * command_line; extern struct kernel_param __start___param[], __stop___param[]; #ifdef CONFIG_RTAI_RTSPMM unsigned int indice_part; /* Size of the needed memory block by the configuration */ unsigned long rt_mem_block_size = 0; #endif /* * Interrupts are still disabled. Do necessary setups, then * enable them */ lock_kernel(); page_address_init(); printk(linux_banner); setup_arch(&command_line); setup_per_cpu_areas(); /* * Mark the boot cpu "online" so that it can call console drivers in * printk() and can access its per-cpu storage. */ smp_prepare_boot_cpu(); /* * Set up the scheduler prior starting any interrupts (such as the * timer interrupt). Full topology setup happens at smp_init() * time - but meanwhile we still have a functioning scheduler. */ sched_init(); /* * Disable preemption - early bootup scheduling is extremely * fragile until we cpu_idle() for the first time. */ preempt_disable(); build_all_zonelists(); page_alloc_init(); early_init_hardirqs(); printk("Kernel command line: %s\n", saved_command_line); parse_early_param(); parse_args("Booting kernel", command_line, __start___param, __stop___param - __start___param, &unknown_bootoption); sort_main_extable(); trap_init(); rcu_init(); init_IRQ(); pidhash_init(); init_timers(); softirq_init(); time_init(); /* * HACK ALERT! This is early. We're enabling the console before * we've done PCI setups etc, and console_init() must be aware of * this. But we do want output early, in case something goes wrong. */ console_init(); if (panic_later) panic(panic_later, panic_param); #ifdef CONFIG_RTAI_RTSPMM /* Allocate a big and continuous memory block for the module SPMM included in the RTAI functionalities */ printk("--- Memory Allocation for the module rt_spmm ---\n"); /* WARNING We need to add some space for the structures vrtxptext and vrtxpt and the partitions bitmap that the module rt_spmm uses to handle the blocks in each partition */ /* for each defined partitions */ for(indice_part = 0; indice_part < RT_MAX_PART_NUM; indice_part ++) { if ((rt_partitions_table[indice_part].block_size != 0) && (rt_partitions_table[indice_part].num_of_blocks != 0)) { rt_partitions_table[indice_part].part_size = (rt_partitions_table[indice_part].block_size + XN_NBBY) *rt_partitions_table[indice_part].num_of_blocks + + sizeof(vrtxptext_t)+sizeof(vrtxpt_t); rt_mem_block_size += rt_partitions_table[indice_part].part_size; } } #ifdef CONFIG_RTAI_PART_DMA printk("Allocate memory in the low part of memory\n"); rt_mem_block_ptr=(void*)alloc_bootmem_low(rt_mem_block_size + PAGE_SIZE-1); #else printk("Allocate memory in the standard part of memory\n"); rt_mem_block_ptr=(void*)alloc_bootmem(rt_mem_block_size + PAGE_SIZE-1); #endif /* CONFIG_PART_DMA */ printk("Needed Memory Size : %lu\n", rt_mem_block_size); printk("Allocated Memory Size : %lu\n", rt_mem_block_size + PAGE_SIZE-1); printk("Memory block address : 0x%x\n", (unsigned int)rt_mem_block_ptr); printk("-----------------------------------------------\n"); #endif /* CONFIG_RTAI_RTSPMM */ profile_init(); local_irq_enable(); #ifdef CONFIG_BLK_DEV_INITRD if (initrd_start && !initrd_below_start_ok && initrd_start < min_low_pfn << PAGE_SHIFT) { printk(KERN_CRIT "initrd overwritten (0x%08lx < 0x%08lx) - " "disabling it.\n",initrd_start,min_low_pfn << PAGE_SHIFT); initrd_start = 0; } #endif vfs_caches_init_early(); mem_init(); kmem_cache_init(); numa_policy_init(); if (late_time_init) late_time_init(); calibrate_delay(); pidmap_init(); pgtable_cache_init(); prio_tree_init(); anon_vma_init(); #ifdef CONFIG_X86 if (efi_enabled) efi_enter_virtual_mode(); #endif fork_init(num_physpages); proc_caches_init(); buffer_init(); unnamed_dev_init(); security_init(); vfs_caches_init(num_physpages); #ifdef CONFIG_MOT_FEAT_DEVICE_TREE mothwcfg_init(); #endif /* CONFIG_MOT_FEAT_DEVICE_TREE */ radix_tree_init(); signals_init(); /* rootfs populating might need page-writeback */ page_writeback_init(); #ifdef CONFIG_PROC_FS proc_root_init(); #endif check_bugs(); acpi_early_init(); /* before LAPIC and SMP init */ /* Do the rest non-__init'ed, we're now alive */ rest_init(); }
/* Handles int $0x80 */ __visible void do_int80_syscall_32(struct pt_regs *regs) { enter_from_user_mode(); local_irq_enable(); do_syscall_32_irqs_on(regs); }
asmlinkage unsigned long asm_do_sig(unsigned long esr, struct pt_regs *regs) { struct siginfo info; struct task_struct *tsk; struct mm_struct *mm; int sig; int trapidx = 0; unsigned long addr; //register unsigned long sp asm("sp"); switch(esr & ESR_CODE) { case TCT_PRIVILEGED: addr = regs->elkr; printk("Privileged Instruction Exception @ %lx\n", addr); sig = SIGSEGV; trapidx = 1; break; case TCT_UNDEFINED: addr = regs->elkr; printk("Undefined Instruction Exception @ %lx\n", addr); sig = SIGSEGV; trapidx = 2; break; case TCT_ITLB: addr = regs->elkr; printk("Instruction TLB Exception @ %lx\n", addr); sig = SIGSEGV; trapidx = 3; break; case TCT_DTLB: addr = regs->elkr; printk("Data TLB Exception @ %lx\n", addr); sig = SIGSEGV; trapidx = 4; break; case TCT_FETCH_ABORT: addr = regs->elkr; printk("Fetch Abort Exception @ address %lx\n", addr); sig = SIGABRT; trapidx= 5; break; case TCT_DATA_ABORT:/*interrupt exception */ addr = regs->elkr; printk("Data Abort Exception @ %lx\n", addr); sig = SIGABRT; trapidx = 6; break; case TCT_ZERO_DIV:/*interrupt exception */ addr = regs->elkr; printk("Zero Division Exception @ %lx\n", addr); sig = SIGSEGV; trapidx = 7; break; default: addr = regs->elkr; printk("Unknown exception, defaulting to SIGSEGV @ %lx\n", addr); sig = SIGSEGV; trapidx = 8; break; } memset(&info, 0, sizeof(info)); info.si_signo = sig; info.si_addr = (void*)addr; tsk = current; mm = tsk->mm; /* * If we're on the kernel stack or during disabled interrupts * or in an atomic section or if we have no user context, the kernel gets the fault. */ if( !mm || in_atomic()) { /* the kernel gets the fault */ bust_spinlocks(1); printk(KERN_ALERT "%s: unhandled kernel space fault (%d) @ %lx\n", tsk->comm, sig, addr); bust_spinlocks(0); local_irq_enable(); do_exit(SIGKILL); return 0; } else { /* the user space gets the fault */ if( sig != SIGTRAP ) printk(KERN_ERR "%s: unhandled user space fault (%d) @ %lx\n", tsk->comm, sig, addr); force_sig_info(sig, &info, tsk); return regs->r1; } }
void kgdb_roundup_cpus(unsigned long flags) { local_irq_enable(); smp_call_function(kgdb_call_nmi_hook, NULL, 0); local_irq_disable(); }
irqreturn_t ipi_interrupt(int irq, void *dev_id) { int this_cpu = smp_processor_id(); struct cpuinfo_parisc *p = &cpu_data[this_cpu]; unsigned long ops; unsigned long flags; /* Count this now; we may make a call that never returns. */ p->ipi_count++; mb(); /* Order interrupt and bit testing. */ for (;;) { spinlock_t *lock = &per_cpu(ipi_lock, this_cpu); spin_lock_irqsave(lock, flags); ops = p->pending_ipi; p->pending_ipi = 0; spin_unlock_irqrestore(lock, flags); mb(); /* Order bit clearing and data access. */ if (!ops) break; while (ops) { unsigned long which = ffz(~ops); ops &= ~(1 << which); switch (which) { case IPI_NOP: smp_debug(100, KERN_DEBUG "CPU%d IPI_NOP\n", this_cpu); break; case IPI_RESCHEDULE: smp_debug(100, KERN_DEBUG "CPU%d IPI_RESCHEDULE\n", this_cpu); /* * Reschedule callback. Everything to be * done is done by the interrupt return path. */ break; case IPI_CALL_FUNC: smp_debug(100, KERN_DEBUG "CPU%d IPI_CALL_FUNC\n", this_cpu); generic_smp_call_function_interrupt(); break; case IPI_CALL_FUNC_SINGLE: smp_debug(100, KERN_DEBUG "CPU%d IPI_CALL_FUNC_SINGLE\n", this_cpu); generic_smp_call_function_single_interrupt(); break; case IPI_CPU_START: smp_debug(100, KERN_DEBUG "CPU%d IPI_CPU_START\n", this_cpu); break; case IPI_CPU_STOP: smp_debug(100, KERN_DEBUG "CPU%d IPI_CPU_STOP\n", this_cpu); halt_processor(); break; case IPI_CPU_TEST: smp_debug(100, KERN_DEBUG "CPU%d is alive!\n", this_cpu); break; default: printk(KERN_CRIT "Unknown IPI num on CPU%d: %lu\n", this_cpu, which); return IRQ_NONE; } /* Switch */ /* let in any pending interrupts */ local_irq_enable(); local_irq_disable(); } /* while (ops) */ } return IRQ_HANDLED; }
static inline void conditional_sti(struct pt_regs *regs) { if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); }
/*H:030 * Let's jump straight to the the main loop which runs the Guest. * Remember, this is called by the Launcher reading /dev/lguest, and we keep * going around and around until something interesting happens. */ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) { /* If the launcher asked for a register with LHREQ_GETREG */ if (cpu->reg_read) { if (put_user(*cpu->reg_read, user)) return -EFAULT; cpu->reg_read = NULL; return sizeof(*cpu->reg_read); } /* We stop running once the Guest is dead. */ while (!cpu->lg->dead) { unsigned int irq; bool more; /* First we run any hypercalls the Guest wants done. */ if (cpu->hcall) do_hypercalls(cpu); /* Do we have to tell the Launcher about a trap? */ if (cpu->pending.trap) { if (copy_to_user(user, &cpu->pending, sizeof(cpu->pending))) return -EFAULT; return sizeof(cpu->pending); } /* * All long-lived kernel loops need to check with this horrible * thing called the freezer. If the Host is trying to suspend, * it stops us. */ try_to_freeze(); /* Check for signals */ if (signal_pending(current)) return -ERESTARTSYS; /* * Check if there are any interrupts which can be delivered now: * if so, this sets up the hander to be executed when we next * run the Guest. */ irq = interrupt_pending(cpu, &more); if (irq < LGUEST_IRQS) try_deliver_interrupt(cpu, irq, more); /* * Just make absolutely sure the Guest is still alive. One of * those hypercalls could have been fatal, for example. */ if (cpu->lg->dead) break; /* * If the Guest asked to be stopped, we sleep. The Guest's * clock timer will wake us. */ if (cpu->halted) { set_current_state(TASK_INTERRUPTIBLE); /* * Just before we sleep, make sure no interrupt snuck in * which we should be doing. */ if (interrupt_pending(cpu, &more) < LGUEST_IRQS) set_current_state(TASK_RUNNING); else schedule(); continue; } /* * OK, now we're ready to jump into the Guest. First we put up * the "Do Not Disturb" sign: */ local_irq_disable(); /* Actually run the Guest until something happens. */ lguest_arch_run_guest(cpu); /* Now we're ready to be interrupted or moved to other CPUs */ local_irq_enable(); /* Now we deal with whatever happened to the Guest. */ lguest_arch_handle_trap(cpu); } /* Special case: Guest is 'dead' but wants a reboot. */ if (cpu->lg->dead == ERR_PTR(-ERESTART)) return -ERESTART; /* The Guest is dead => "No such file or directory" */ return -ENOENT; }
/* * Update the */ int ide_driveid_update (ide_drive_t *drive) { ide_hwif_t *hwif = HWIF(drive); struct hd_driveid *id; #if 0 id = kmalloc(SECTOR_WORDS*4, GFP_ATOMIC); if (!id) return 0; taskfile_lib_get_identify(drive, (char *)&id); ide_fix_driveid(id); if (id) { drive->id->dma_ultra = id->dma_ultra; drive->id->dma_mword = id->dma_mword; drive->id->dma_1word = id->dma_1word; /* anything more ? */ kfree(id); } return 1; #else /* * Re-read drive->id for possible DMA mode * change (copied from ide-probe.c) */ unsigned long timeout, flags; SELECT_MASK(drive, 1); if (IDE_CONTROL_REG) hwif->OUTB(drive->ctl,IDE_CONTROL_REG); msleep(50); hwif->OUTB(WIN_IDENTIFY, IDE_COMMAND_REG); timeout = jiffies + WAIT_WORSTCASE; do { if (time_after(jiffies, timeout)) { SELECT_MASK(drive, 0); return 0; /* drive timed-out */ } msleep(50); /* give drive a breather */ } while (hwif->INB(IDE_ALTSTATUS_REG) & BUSY_STAT); msleep(50); /* wait for IRQ and DRQ_STAT */ if (!OK_STAT(hwif->INB(IDE_STATUS_REG),DRQ_STAT,BAD_R_STAT)) { SELECT_MASK(drive, 0); printk("%s: CHECK for good STATUS\n", drive->name); return 0; } local_irq_save(flags); SELECT_MASK(drive, 0); id = kmalloc(SECTOR_WORDS*4, GFP_ATOMIC); if (!id) { local_irq_restore(flags); return 0; } ata_input_data(drive, id, SECTOR_WORDS); (void) hwif->INB(IDE_STATUS_REG); /* clear drive IRQ */ local_irq_enable(); local_irq_restore(flags); ide_fix_driveid(id); if (id) { drive->id->dma_ultra = id->dma_ultra; drive->id->dma_mword = id->dma_mword; drive->id->dma_1word = id->dma_1word; /* anything more ? */ kfree(id); } return 1; #endif }
asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long address, unsigned long vector, int write_acc) { struct task_struct *tsk; struct mm_struct *mm; struct vm_area_struct *vma; siginfo_t info; int fault; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; tsk = current; /* * We fault-in kernel-space virtual memory on-demand. The * 'reference' page table is init_mm.pgd. * * NOTE! We MUST NOT take any locks for this case. We may * be in an interrupt or a critical region, and should * only copy the information from the master page table, * nothing more. * * NOTE2: This is done so that, when updating the vmalloc * mappings we don't have to walk all processes pgdirs and * add the high mappings all at once. Instead we do it as they * are used. However vmalloc'ed page entries have the PAGE_GLOBAL * bit set so sometimes the TLB can use a lingering entry. * * This verifies that the fault happens in kernel space * and that the fault was not a protection error. */ if (address >= VMALLOC_START && (vector != 0x300 && vector != 0x400) && !user_mode(regs)) goto vmalloc_fault; /* If exceptions were enabled, we can reenable them here */ if (user_mode(regs)) { /* Exception was in userspace: reenable interrupts */ local_irq_enable(); flags |= FAULT_FLAG_USER; } else { /* If exception was in a syscall, then IRQ's may have * been enabled or disabled. If they were enabled, * reenable them. */ if (regs->sr && (SPR_SR_IEE | SPR_SR_TEE)) local_irq_enable(); } mm = tsk->mm; info.si_code = SEGV_MAPERR; /* * If we're in an interrupt or have no user * context, we must not take the fault.. */ if (in_interrupt() || !mm) goto no_context; retry: down_read(&mm->mmap_sem); vma = find_vma(mm, address); if (!vma) goto bad_area; if (vma->vm_start <= address) goto good_area; if (!(vma->vm_flags & VM_GROWSDOWN)) goto bad_area; if (user_mode(regs)) { /* * accessing the stack below usp is always a bug. * we get page-aligned addresses so we can only check * if we're within a page from usp, but that might be * enough to catch brutal errors at least. */ if (address + PAGE_SIZE < regs->sp) goto bad_area; } if (expand_stack(vma, address)) goto bad_area; /* * Ok, we have a good vm_area for this memory access, so * we can handle it.. */ good_area: info.si_code = SEGV_ACCERR; /* first do some preliminary protection checks */ if (write_acc) { if (!(vma->vm_flags & VM_WRITE)) goto bad_area; flags |= FAULT_FLAG_WRITE; } else { /* not present */ if (!(vma->vm_flags & (VM_READ | VM_EXEC))) goto bad_area; } /* are we trying to execute nonexecutable area */ if ((vector == 0x400) && !(vma->vm_page_prot.pgprot & _PAGE_EXEC)) goto bad_area; /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo * the fault. */ fault = handle_mm_fault(mm, vma, address, flags); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) return; if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; else if (fault & VM_FAULT_SIGSEGV) goto bad_area; else if (fault & VM_FAULT_SIGBUS) goto do_sigbus; BUG(); } if (flags & FAULT_FLAG_ALLOW_RETRY) { /*RGD modeled on Cris */ if (fault & VM_FAULT_MAJOR) tsk->maj_flt++; else tsk->min_flt++; if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; flags |= FAULT_FLAG_TRIED; /* No need to up_read(&mm->mmap_sem) as we would * have already released it in __lock_page_or_retry * in mm/filemap.c. */ goto retry; } } up_read(&mm->mmap_sem); return; /* * Something tried to access memory that isn't in our memory map.. * Fix it, but check if it's kernel or user first.. */ bad_area: up_read(&mm->mmap_sem); bad_area_nosemaphore: /* User mode accesses just cause a SIGSEGV */ if (user_mode(regs)) { info.si_signo = SIGSEGV; info.si_errno = 0; /* info.si_code has been set above */ info.si_addr = (void *)address; force_sig_info(SIGSEGV, &info, tsk); return; } no_context: /* Are we prepared to handle this kernel fault? * * (The kernel has valid exception-points in the source * when it acesses user-memory. When it fails in one * of those points, we find it in a table and do a jump * to some fixup code that loads an appropriate error * code) */ { const struct exception_table_entry *entry; __asm__ __volatile__("l.nop 42"); if ((entry = search_exception_tables(regs->pc)) != NULL) { /* Adjust the instruction pointer in the stackframe */ regs->pc = entry->fixup; return; } } /* * Oops. The kernel tried to access some bad page. We'll have to * terminate things with extreme prejudice. */ if ((unsigned long)(address) < PAGE_SIZE) printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference"); else printk(KERN_ALERT "Unable to handle kernel access"); printk(" at virtual address 0x%08lx\n", address); die("Oops", regs, write_acc); do_exit(SIGKILL); /* * We ran out of memory, or some other thing happened to us that made * us unable to handle the page fault gracefully. */ out_of_memory: __asm__ __volatile__("l.nop 42"); __asm__ __volatile__("l.nop 1"); up_read(&mm->mmap_sem); if (!user_mode(regs)) goto no_context; pagefault_out_of_memory(); return; do_sigbus: up_read(&mm->mmap_sem); /* * Send a sigbus, regardless of whether we were in kernel * or user mode. */ info.si_signo = SIGBUS; info.si_errno = 0; info.si_code = BUS_ADRERR; info.si_addr = (void *)address; force_sig_info(SIGBUS, &info, tsk); /* Kernel mode? Handle exceptions or die */ if (!user_mode(regs)) goto no_context; return; vmalloc_fault: { /* * Synchronize this task's top level page-table * with the 'reference' page table. * * Use current_pgd instead of tsk->active_mm->pgd * since the latter might be unavailable if this * code is executed in a misfortunately run irq * (like inside schedule() between switch_mm and * switch_to...). */ int offset = pgd_index(address); pgd_t *pgd, *pgd_k; pud_t *pud, *pud_k; pmd_t *pmd, *pmd_k; pte_t *pte_k; /* phx_warn("do_page_fault(): vmalloc_fault will not work, " "since current_pgd assign a proper value somewhere\n" "anyhow we don't need this at the moment\n"); phx_mmu("vmalloc_fault"); */ pgd = (pgd_t *)current_pgd[smp_processor_id()] + offset; pgd_k = init_mm.pgd + offset; /* Since we're two-level, we don't need to do both * set_pgd and set_pmd (they do the same thing). If * we go three-level at some point, do the right thing * with pgd_present and set_pgd here. * * Also, since the vmalloc area is global, we don't * need to copy individual PTE's, it is enough to * copy the pgd pointer into the pte page of the * root task. If that is there, we'll find our pte if * it exists. */ pud = pud_offset(pgd, address); pud_k = pud_offset(pgd_k, address); if (!pud_present(*pud_k)) goto no_context; pmd = pmd_offset(pud, address); pmd_k = pmd_offset(pud_k, address); if (!pmd_present(*pmd_k)) goto bad_area_nosemaphore; set_pmd(pmd, *pmd_k); /* Make sure the actual PTE exists as well to * catch kernel vmalloc-area accesses to non-mapped * addresses. If we don't do this, this will just * silently loop forever. */ pte_k = pte_offset_kernel(pmd_k, address); if (!pte_present(*pte_k)) goto no_context; return; } }
asmlinkage void __init start_kernel(void) { char * command_line; extern struct kernel_param __start___param[], __stop___param[]; smp_setup_processor_id(); /* * Need to run as early as possible, to initialize the * lockdep hash: */ lockdep_init(); debug_objects_early_init(); /* * Set up the the initial canary ASAP: */ boot_init_stack_canary(); cgroup_init_early(); local_irq_disable(); early_boot_irqs_off(); early_init_irq_lock_class(); /* * Interrupts are still disabled. Do necessary setups, then * enable them */ lock_kernel(); tick_init(); boot_cpu_init(); page_address_init(); printk(KERN_NOTICE "%s", linux_banner); setup_arch(&command_line); mm_init_owner(&init_mm, &init_task); setup_command_line(command_line); setup_nr_cpu_ids(); setup_per_cpu_areas(); smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ build_all_zonelists(NULL); page_alloc_init(); printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line); parse_early_param(); parse_args("Booting kernel", static_command_line, __start___param, __stop___param - __start___param, &unknown_bootoption); /* * These use large bootmem allocations and must precede * kmem_cache_init() */ pidhash_init(); vfs_caches_init_early(); sort_main_extable(); trap_init(); mm_init(); /* * Set up the scheduler prior starting any interrupts (such as the * timer interrupt). Full topology setup happens at smp_init() * time - but meanwhile we still have a functioning scheduler. */ sched_init(); /* * Disable preemption - early bootup scheduling is extremely * fragile until we cpu_idle() for the first time. */ preempt_disable(); if (!irqs_disabled()) { printk(KERN_WARNING "start_kernel(): bug: interrupts were " "enabled *very* early, fixing it\n"); local_irq_disable(); } rcu_init(); radix_tree_init(); /* init some links before init_ISA_irqs() */ early_irq_init(); init_IRQ(); prio_tree_init(); init_timers(); hrtimers_init(); softirq_init(); timekeeping_init(); time_init(); profile_init(); if (!irqs_disabled()) printk(KERN_CRIT "start_kernel(): bug: interrupts were " "enabled early\n"); early_boot_irqs_on(); local_irq_enable(); /* Interrupts are enabled now so all GFP allocations are safe. */ gfp_allowed_mask = __GFP_BITS_MASK; kmem_cache_init_late(); /* * HACK ALERT! This is early. We're enabling the console before * we've done PCI setups etc, and console_init() must be aware of * this. But we do want output early, in case something goes wrong. */ console_init(); if (panic_later) panic(panic_later, panic_param); lockdep_info(); /* * Need to run this when irqs are enabled, because it wants * to self-test [hard/soft]-irqs on/off lock inversion bugs * too: */ locking_selftest(); #ifdef CONFIG_BLK_DEV_INITRD if (initrd_start && !initrd_below_start_ok && page_to_pfn(virt_to_page((void *)initrd_start)) < min_low_pfn) { printk(KERN_CRIT "initrd overwritten (0x%08lx < 0x%08lx) - " "disabling it.\n", page_to_pfn(virt_to_page((void *)initrd_start)), min_low_pfn); initrd_start = 0; } #endif page_cgroup_init(); enable_debug_pagealloc(); kmemtrace_init(); kmemleak_init(); debug_objects_mem_init(); idr_init_cache(); setup_per_cpu_pageset(); numa_policy_init(); if (late_time_init) late_time_init(); sched_clock_init(); calibrate_delay(); pidmap_init(); anon_vma_init(); #ifdef CONFIG_X86 if (efi_enabled) efi_enter_virtual_mode(); #endif thread_info_cache_init(); cred_init(); fork_init(totalram_pages); proc_caches_init(); buffer_init(); key_init(); security_init(); dbg_late_init(); vfs_caches_init(totalram_pages); signals_init(); /* rootfs populating might need page-writeback */ page_writeback_init(); #ifdef CONFIG_PROC_FS proc_root_init(); #endif cgroup_init(); cpuset_init(); taskstats_init_early(); delayacct_init(); check_bugs(); acpi_early_init(); /* before LAPIC and SMP init */ sfi_init_late(); ftrace_init(); /* Do the rest non-__init'ed, we're now alive */ rest_init(); }
/* * This routine is responsible for faulting in user pages. * It passes the work off to one of the appropriate routines. * It returns true if the fault was successfully handled. */ static int handle_page_fault(struct pt_regs *regs, int fault_num, int is_page_fault, unsigned long address, int write) { struct task_struct *tsk; struct mm_struct *mm; struct vm_area_struct *vma; unsigned long stack_offset; int fault; int si_code; int is_kernel_mode; pgd_t *pgd; unsigned int flags; /* on TILE, protection faults are always writes */ if (!is_page_fault) write = 1; flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; is_kernel_mode = !user_mode(regs); tsk = validate_current(); /* * Check to see if we might be overwriting the stack, and bail * out if so. The page fault code is a relatively likely * place to get trapped in an infinite regress, and once we * overwrite the whole stack, it becomes very hard to recover. */ stack_offset = stack_pointer & (THREAD_SIZE-1); if (stack_offset < THREAD_SIZE / 8) { pr_alert("Potential stack overrun: sp %#lx\n", stack_pointer); show_regs(regs); pr_alert("Killing current process %d/%s\n", tsk->pid, tsk->comm); do_group_exit(SIGKILL); } /* * Early on, we need to check for migrating PTE entries; * see homecache.c. If we find a migrating PTE, we wait until * the backing page claims to be done migrating, then we proceed. * For kernel PTEs, we rewrite the PTE and return and retry. * Otherwise, we treat the fault like a normal "no PTE" fault, * rather than trying to patch up the existing PTE. */ pgd = get_current_pgd(); if (handle_migrating_pte(pgd, fault_num, address, regs->pc, is_kernel_mode, write)) return 1; si_code = SEGV_MAPERR; /* * We fault-in kernel-space virtual memory on-demand. The * 'reference' page table is init_mm.pgd. * * NOTE! We MUST NOT take any locks for this case. We may * be in an interrupt or a critical region, and should * only copy the information from the master page table, * nothing more. * * This verifies that the fault happens in kernel space * and that the fault was not a protection fault. */ if (unlikely(address >= TASK_SIZE && !is_arch_mappable_range(address, 0))) { if (is_kernel_mode && is_page_fault && vmalloc_fault(pgd, address) >= 0) return 1; /* * Don't take the mm semaphore here. If we fixup a prefetch * fault we could otherwise deadlock. */ mm = NULL; /* happy compiler */ vma = NULL; goto bad_area_nosemaphore; } /* * If we're trying to touch user-space addresses, we must * be either at PL0, or else with interrupts enabled in the * kernel, so either way we can re-enable interrupts here * unless we are doing atomic access to user space with * interrupts disabled. */ if (!(regs->flags & PT_FLAGS_DISABLE_IRQ)) local_irq_enable(); mm = tsk->mm; /* * If we're in an interrupt, have no user context or are running in an * atomic region then we must not take the fault. */ if (in_atomic() || !mm) { vma = NULL; /* happy compiler */ goto bad_area_nosemaphore; } if (!is_kernel_mode) flags |= FAULT_FLAG_USER; /* * When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the * kernel and should generate an OOPS. Unfortunately, in the case of an * erroneous fault occurring in a code path which already holds mmap_sem * we will deadlock attempting to validate the fault against the * address space. Luckily the kernel only validly references user * space from well defined areas of code, which are listed in the * exceptions table. * * As the vast majority of faults will be valid we will only perform * the source reference check when there is a possibility of a deadlock. * Attempt to lock the address space, if we cannot we then validate the * source. If this is invalid we can skip the address space check, * thus avoiding the deadlock. */ if (!down_read_trylock(&mm->mmap_sem)) { if (is_kernel_mode && !search_exception_tables(regs->pc)) { vma = NULL; /* happy compiler */ goto bad_area_nosemaphore; } retry: down_read(&mm->mmap_sem); } vma = find_vma(mm, address); if (!vma) goto bad_area; if (vma->vm_start <= address) goto good_area; if (!(vma->vm_flags & VM_GROWSDOWN)) goto bad_area; if (regs->sp < PAGE_OFFSET) { /* * accessing the stack below sp is always a bug. */ if (address < regs->sp) goto bad_area; } if (expand_stack(vma, address)) goto bad_area; /* * Ok, we have a good vm_area for this memory access, so * we can handle it.. */ good_area: si_code = SEGV_ACCERR; if (fault_num == INT_ITLB_MISS) { if (!(vma->vm_flags & VM_EXEC)) goto bad_area; } else if (write) { #ifdef TEST_VERIFY_AREA if (!is_page_fault && regs->cs == KERNEL_CS) pr_err("WP fault at "REGFMT"\n", regs->eip); #endif if (!(vma->vm_flags & VM_WRITE)) goto bad_area; flags |= FAULT_FLAG_WRITE; } else { if (!is_page_fault || !(vma->vm_flags & VM_READ)) goto bad_area; } /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo * the fault. */ fault = handle_mm_fault(mm, vma, address, flags); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) return 0; if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; else if (fault & VM_FAULT_SIGBUS) goto do_sigbus; BUG(); } if (flags & FAULT_FLAG_ALLOW_RETRY) { if (fault & VM_FAULT_MAJOR) tsk->maj_flt++; else tsk->min_flt++; if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; flags |= FAULT_FLAG_TRIED; /* * No need to up_read(&mm->mmap_sem) as we would * have already released it in __lock_page_or_retry * in mm/filemap.c. */ goto retry; } } #if CHIP_HAS_TILE_DMA() /* If this was a DMA TLB fault, restart the DMA engine. */ switch (fault_num) { case INT_DMATLB_MISS: case INT_DMATLB_MISS_DWNCL: case INT_DMATLB_ACCESS: case INT_DMATLB_ACCESS_DWNCL: __insn_mtspr(SPR_DMA_CTR, SPR_DMA_CTR__REQUEST_MASK); break; } #endif up_read(&mm->mmap_sem); return 1; /* * Something tried to access memory that isn't in our memory map.. * Fix it, but check if it's kernel or user first.. */ bad_area: up_read(&mm->mmap_sem); bad_area_nosemaphore: /* User mode accesses just cause a SIGSEGV */ if (!is_kernel_mode) { /* * It's possible to have interrupts off here. */ local_irq_enable(); force_sig_info_fault("segfault", SIGSEGV, si_code, address, fault_num, tsk, regs); return 0; } no_context: /* Are we prepared to handle this kernel fault? */ if (fixup_exception(regs)) return 0; /* * Oops. The kernel tried to access some bad page. We'll have to * terminate things with extreme prejudice. */ bust_spinlocks(1); /* FIXME: no lookup_address() yet */ #ifdef SUPPORT_LOOKUP_ADDRESS if (fault_num == INT_ITLB_MISS) { pte_t *pte = lookup_address(address); if (pte && pte_present(*pte) && !pte_exec_kernel(*pte)) pr_crit("kernel tried to execute" " non-executable page - exploit attempt?" " (uid: %d)\n", current->uid); } #endif if (address < PAGE_SIZE) pr_alert("Unable to handle kernel NULL pointer dereference\n"); else pr_alert("Unable to handle kernel paging request\n"); pr_alert(" at virtual address "REGFMT", pc "REGFMT"\n", address, regs->pc); show_regs(regs); if (unlikely(tsk->pid < 2)) { panic("Kernel page fault running %s!", is_idle_task(tsk) ? "the idle task" : "init"); } /* * More FIXME: we should probably copy the i386 here and * implement a generic die() routine. Not today. */ #ifdef SUPPORT_DIE die("Oops", regs); #endif bust_spinlocks(1); do_group_exit(SIGKILL); /* * We ran out of memory, or some other thing happened to us that made * us unable to handle the page fault gracefully. */ out_of_memory: up_read(&mm->mmap_sem); if (is_kernel_mode) goto no_context; pagefault_out_of_memory(); return 0; do_sigbus: up_read(&mm->mmap_sem); /* Kernel mode? Handle exceptions or die */ if (is_kernel_mode) goto no_context; force_sig_info_fault("bus error", SIGBUS, BUS_ADRERR, address, fault_num, tsk, regs); return 0; }
/** * acpi_idle_enter_bm - enters C3 with proper BM handling * @dev: the target CPU * @drv: cpuidle driver containing state data * @index: the index of suggested state * * If BM is detected, the deepest non-C3 idle state is entered instead. */ static int acpi_idle_enter_bm(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) { struct acpi_processor *pr; struct cpuidle_state_usage *state_usage = &dev->states_usage[index]; struct acpi_processor_cx *cx = cpuidle_get_statedata(state_usage); ktime_t kt1, kt2; s64 idle_time_ns; s64 idle_time; pr = __this_cpu_read(processors); dev->last_residency = 0; if (unlikely(!pr)) return -EINVAL; if (!cx->bm_sts_skip && acpi_idle_bm_check()) { if (drv->safe_state_index >= 0) { return drv->states[drv->safe_state_index].enter(dev, drv, drv->safe_state_index); } else { local_irq_disable(); acpi_safe_halt(); local_irq_enable(); return -EBUSY; } } local_irq_disable(); if (cx->entry_method != ACPI_CSTATE_FFH) { current_thread_info()->status &= ~TS_POLLING; /* * TS_POLLING-cleared state must be visible before we test * NEED_RESCHED: */ smp_mb(); if (unlikely(need_resched())) { current_thread_info()->status |= TS_POLLING; local_irq_enable(); return -EINVAL; } } acpi_unlazy_tlb(smp_processor_id()); /* Tell the scheduler that we are going deep-idle: */ sched_clock_idle_sleep_event(); /* * Must be done before busmaster disable as we might need to * access HPET ! */ lapic_timer_state_broadcast(pr, cx, 1); kt1 = ktime_get_real(); /* * disable bus master * bm_check implies we need ARB_DIS * !bm_check implies we need cache flush * bm_control implies whether we can do ARB_DIS * * That leaves a case where bm_check is set and bm_control is * not set. In that case we cannot do much, we enter C3 * without doing anything. */ if (pr->flags.bm_check && pr->flags.bm_control) { raw_spin_lock(&c3_lock); c3_cpu_count++; /* Disable bus master arbitration when all CPUs are in C3 */ if (c3_cpu_count == num_online_cpus()) acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 1); raw_spin_unlock(&c3_lock); } else if (!pr->flags.bm_check) { ACPI_FLUSH_CPU_CACHE(); } acpi_idle_do_entry(cx); /* Re-enable bus master arbitration */ if (pr->flags.bm_check && pr->flags.bm_control) { raw_spin_lock(&c3_lock); acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 0); c3_cpu_count--; raw_spin_unlock(&c3_lock); } kt2 = ktime_get_real(); idle_time_ns = ktime_to_ns(ktime_sub(kt2, kt1)); idle_time = idle_time_ns; do_div(idle_time, NSEC_PER_USEC); /* Update device last_residency*/ dev->last_residency = (int)idle_time; /* Tell the scheduler how much we idled: */ sched_clock_idle_wakeup_event(idle_time_ns); local_irq_enable(); if (cx->entry_method != ACPI_CSTATE_FFH) current_thread_info()->status |= TS_POLLING; lapic_timer_state_broadcast(pr, cx, 0); return index; }
/* * This routine handles page faults. It determines the address, and the * problem, and then passes it handle_page_fault() for normal DTLB and * ITLB issues, and for DMA or SN processor faults when we are in user * space. For the latter, if we're in kernel mode, we just save the * interrupt away appropriately and return immediately. We can't do * page faults for user code while in kernel mode. */ void do_page_fault(struct pt_regs *regs, int fault_num, unsigned long address, unsigned long write) { int is_page_fault; #ifdef CONFIG_KPROBES /* * This is to notify the fault handler of the kprobes. The * exception code is redundant as it is also carried in REGS, * but we pass it anyhow. */ if (notify_die(DIE_PAGE_FAULT, "page fault", regs, -1, regs->faultnum, SIGSEGV) == NOTIFY_STOP) return; #endif #ifdef __tilegx__ /* * We don't need early do_page_fault_ics() support, since unlike * Pro we don't need to worry about unlocking the atomic locks. * There is only one current case in GX where we touch any memory * under ICS other than our own kernel stack, and we handle that * here. (If we crash due to trying to touch our own stack, * we're in too much trouble for C code to help out anyway.) */ if (write & ~1) { unsigned long pc = write & ~1; if (pc >= (unsigned long) __start_unalign_asm_code && pc < (unsigned long) __end_unalign_asm_code) { struct thread_info *ti = current_thread_info(); /* * Our EX_CONTEXT is still what it was from the * initial unalign exception, but now we've faulted * on the JIT page. We would like to complete the * page fault however is appropriate, and then retry * the instruction that caused the unalign exception. * Our state has been "corrupted" by setting the low * bit in "sp", and stashing r0..r3 in the * thread_info area, so we revert all of that, then * continue as if this were a normal page fault. */ regs->sp &= ~1UL; regs->regs[0] = ti->unalign_jit_tmp[0]; regs->regs[1] = ti->unalign_jit_tmp[1]; regs->regs[2] = ti->unalign_jit_tmp[2]; regs->regs[3] = ti->unalign_jit_tmp[3]; write &= 1; } else { pr_alert("%s/%d: ICS set at page fault at %#lx: %#lx\n", current->comm, current->pid, pc, address); show_regs(regs); do_group_exit(SIGKILL); return; } } #else /* This case should have been handled by do_page_fault_ics(). */ BUG_ON(write & ~1); #endif #if CHIP_HAS_TILE_DMA() /* * If it's a DMA fault, suspend the transfer while we're * handling the miss; we'll restart after it's handled. If we * don't suspend, it's possible that this process could swap * out and back in, and restart the engine since the DMA is * still 'running'. */ if (fault_num == INT_DMATLB_MISS || fault_num == INT_DMATLB_ACCESS || fault_num == INT_DMATLB_MISS_DWNCL || fault_num == INT_DMATLB_ACCESS_DWNCL) { __insn_mtspr(SPR_DMA_CTR, SPR_DMA_CTR__SUSPEND_MASK); while (__insn_mfspr(SPR_DMA_USER_STATUS) & SPR_DMA_STATUS__BUSY_MASK) ; } #endif /* Validate fault num and decide if this is a first-time page fault. */ switch (fault_num) { case INT_ITLB_MISS: case INT_DTLB_MISS: #if CHIP_HAS_TILE_DMA() case INT_DMATLB_MISS: case INT_DMATLB_MISS_DWNCL: #endif is_page_fault = 1; break; case INT_DTLB_ACCESS: #if CHIP_HAS_TILE_DMA() case INT_DMATLB_ACCESS: case INT_DMATLB_ACCESS_DWNCL: #endif is_page_fault = 0; break; default: panic("Bad fault number %d in do_page_fault", fault_num); } #if CHIP_HAS_TILE_DMA() if (!user_mode(regs)) { struct async_tlb *async; switch (fault_num) { #if CHIP_HAS_TILE_DMA() case INT_DMATLB_MISS: case INT_DMATLB_ACCESS: case INT_DMATLB_MISS_DWNCL: case INT_DMATLB_ACCESS_DWNCL: async = ¤t->thread.dma_async_tlb; break; #endif default: async = NULL; } if (async) { /* * No vmalloc check required, so we can allow * interrupts immediately at this point. */ local_irq_enable(); set_thread_flag(TIF_ASYNC_TLB); if (async->fault_num != 0) { panic("Second async fault %d;" " old fault was %d (%#lx/%ld)", fault_num, async->fault_num, address, write); } BUG_ON(fault_num == 0); async->fault_num = fault_num; async->is_fault = is_page_fault; async->is_write = write; async->address = address; return; } } #endif handle_page_fault(regs, fault_num, is_page_fault, address, write); }
/* * This thread processes interrupts reported by the Primary Interrupt Handler. */ static int twl6030_irq_thread(void *data) { long irq = (long)data; static unsigned i2c_errors; static const unsigned max_i2c_errors = 100; int ret; current->flags |= PF_NOFREEZE; while (!kthread_should_stop()) { int i; union { u8 bytes[4]; u32 int_sts; } sts; /* Wait for IRQ, then read PIH irq status (also blocking) */ wait_for_completion_interruptible(&irq_event); /* read INT_STS_A, B and C in one shot using a burst read */ ret = twl_i2c_read(TWL_MODULE_PIH, sts.bytes, REG_INT_STS_A, 3); if (ret) { pr_warning("twl6030: I2C error %d reading PIH ISR\n", ret); if (++i2c_errors >= max_i2c_errors) { printk(KERN_ERR "Maximum I2C error count" " exceeded. Terminating %s.\n", __func__); break; } complete(&irq_event); continue; } sts.bytes[3] = 0; /* Only 24 bits are valid*/ for (i = 0; sts.int_sts; sts.int_sts >>= 1, i++) { local_irq_disable(); if (sts.int_sts & 0x1) { int module_irq = twl6030_irq_base + twl6030_interrupt_mapping[i]; struct irq_desc *d = irq_to_desc(module_irq); if (!d) { pr_err("twl6030: Invalid SIH IRQ: %d\n", module_irq); return -EINVAL; } /* These can't be masked ... always warn * if we get any surprises. */ if (d->status & IRQ_DISABLED) note_interrupt(module_irq, d, IRQ_NONE); else d->handle_irq(module_irq, d); } local_irq_enable(); } ret = twl_i2c_write(TWL_MODULE_PIH, sts.bytes, REG_INT_STS_A, 3); /* clear INT_STS_A */ if (ret) pr_warning("twl6030: I2C error in clearing PIH ISR\n"); enable_irq(irq); } return 0; }
/* * The driver enables interrupts as much as possible. In order to do this, * (a) the device-interrupt is disabled before entering hd_request(), * and (b) the timeout-interrupt is disabled before the sti(). * * Interrupts are still masked (by default) whenever we are exchanging * data/cmds with a drive, because some drives seem to have very poor * tolerance for latency during I/O. The IDE driver has support to unmask * interrupts for non-broken hardware, so use that driver if required. */ static void hd_request(void) { unsigned int block, nsect, sec, track, head, cyl; struct hd_i_struct *disk; struct request *req; if (do_hd) return; repeat: del_timer(&device_timer); local_irq_enable(); req = CURRENT; if (!req) { do_hd = NULL; return; } if (reset) { local_irq_disable(); reset_hd(); return; } disk = req->rq_disk->private_data; block = req->sector; nsect = req->nr_sectors; if (block >= get_capacity(req->rq_disk) || ((block+nsect) > get_capacity(req->rq_disk))) { printk("%s: bad access: block=%d, count=%d\n", req->rq_disk->disk_name, block, nsect); end_request(req, 0); goto repeat; } if (disk->special_op) { if (do_special_op(disk, req)) goto repeat; return; } sec = block % disk->sect + 1; track = block / disk->sect; head = track % disk->head; cyl = track / disk->head; #ifdef DEBUG printk("%s: %sing: CHS=%d/%d/%d, sectors=%d, buffer=%p\n", req->rq_disk->disk_name, (req->cmd == READ)?"read":"writ", cyl, head, sec, nsect, req->buffer); #endif if (req->flags & REQ_CMD) { switch (rq_data_dir(req)) { case READ: hd_out(disk,nsect,sec,head,cyl,WIN_READ,&read_intr); if (reset) goto repeat; break; case WRITE: hd_out(disk,nsect,sec,head,cyl,WIN_WRITE,&write_intr); if (reset) goto repeat; if (wait_DRQ()) { bad_rw_intr(); goto repeat; } outsw(HD_DATA,req->buffer,256); break; default: printk("unknown hd-command\n"); end_request(req, 0); break; } } }