/* * Call this when reinitializing a CPU. It fixes the following potential * problems: * * - The ASID changed from what cpu_tlbstate thinks it is (most likely * because the CPU was taken down and came back up with CR3's PCID * bits clear. CPU hotplug can do this. * * - The TLB contains junk in slots corresponding to inactive ASIDs. * * - The CPU went so far out to lunch that it may have missed a TLB * flush. */ void initialize_tlbstate_and_flush(void) { int i; struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm); u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen); unsigned long cr3 = __read_cr3(); /* Assert that CR3 already references the right mm. */ WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd)); /* * Assert that CR4.PCIDE is set if needed. (CR4.PCIDE initialization * doesn't work like other CR4 bits because it can only be set from * long mode.) */ WARN_ON(boot_cpu_has(X86_FEATURE_PCID) && !(cr4_read_shadow() & X86_CR4_PCIDE)); /* Force ASID 0 and force a TLB flush. */ write_cr3(build_cr3(mm->pgd, 0)); /* Reinitialize tlbstate. */ this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, LAST_USER_MM_IBPB); this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0); this_cpu_write(cpu_tlbstate.next_asid, 1); this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id); this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen); for (i = 1; i < TLB_NR_DYN_ASIDS; i++) this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0); }
efi_status_t efi_thunk_set_virtual_address_map( void *phys_set_virtual_address_map, unsigned long memory_map_size, unsigned long descriptor_size, u32 descriptor_version, efi_memory_desc_t *virtual_map) { efi_status_t status; unsigned long flags; u32 func; efi_sync_low_kernel_mappings(); local_irq_save(flags); efi_scratch.prev_cr3 = __read_cr3(); write_cr3((unsigned long)efi_scratch.efi_pgt); __flush_tlb_all(); func = (u32)(unsigned long)phys_set_virtual_address_map; status = efi64_thunk(func, memory_map_size, descriptor_size, descriptor_version, virtual_map); write_cr3(efi_scratch.prev_cr3); __flush_tlb_all(); local_irq_restore(flags); return status; }
void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy); unsigned cpu = smp_processor_id(); u64 next_tlb_gen; bool need_flush; u16 new_asid; /* * NB: The scheduler will call us with prev == next when switching * from lazy TLB mode to normal mode if active_mm isn't changing. * When this happens, we don't assume that CR3 (and hence * cpu_tlbstate.loaded_mm) matches next. * * NB: leave_mm() calls us with prev == NULL and tsk == NULL. */ /* We don't want flush_tlb_func_* to run concurrently with us. */ if (IS_ENABLED(CONFIG_PROVE_LOCKING)) WARN_ON_ONCE(!irqs_disabled()); /* * Verify that CR3 is what we think it is. This will catch * hypothetical buggy code that directly switches to swapper_pg_dir * without going through leave_mm() / switch_mm_irqs_off() or that * does something like write_cr3(read_cr3_pa()). * * Only do this check if CONFIG_DEBUG_VM=y because __read_cr3() * isn't free. */ #ifdef CONFIG_DEBUG_VM if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) { /* * If we were to BUG here, we'd be very likely to kill * the system so hard that we don't see the call trace. * Try to recover instead by ignoring the error and doing * a global flush to minimize the chance of corruption. * * (This is far from being a fully correct recovery. * Architecturally, the CPU could prefetch something * back into an incorrect ASID slot and leave it there * to cause trouble down the road. It's better than * nothing, though.) */ __flush_tlb_all(); } #endif this_cpu_write(cpu_tlbstate.is_lazy, false); /* * The membarrier system call requires a full memory barrier and * core serialization before returning to user-space, after * storing to rq->curr. Writing to CR3 provides that full * memory barrier and core serializing instruction. */ if (real_prev == next) { VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != next->context.ctx_id); /* * Even in lazy TLB mode, the CPU should stay set in the * mm_cpumask. The TLB shootdown code can figure out from * from cpu_tlbstate.is_lazy whether or not to send an IPI. */ if (WARN_ON_ONCE(real_prev != &init_mm && !cpumask_test_cpu(cpu, mm_cpumask(next)))) cpumask_set_cpu(cpu, mm_cpumask(next)); /* * If the CPU is not in lazy TLB mode, we are just switching * from one thread in a process to another thread in the same * process. No TLB flush required. */ if (!was_lazy) return; /* * Read the tlb_gen to check whether a flush is needed. * If the TLB is up to date, just use it. * The barrier synchronizes with the tlb_gen increment in * the TLB shootdown code. */ smp_mb(); next_tlb_gen = atomic64_read(&next->context.tlb_gen); if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) == next_tlb_gen) return; /* * TLB contents went out of date while we were in lazy * mode. Fall through to the TLB switching code below. */ new_asid = prev_asid; need_flush = true; } else { /* * Avoid user/user BTB poisoning by flushing the branch * predictor when switching between processes. This stops * one process from doing Spectre-v2 attacks on another. */ cond_ibpb(tsk); if (IS_ENABLED(CONFIG_VMAP_STACK)) { /* * If our current stack is in vmalloc space and isn't * mapped in the new pgd, we'll double-fault. Forcibly * map it. */ sync_current_stack_to_mm(next); } /* * Stop remote flushes for the previous mm. * Skip kernel threads; we never send init_mm TLB flushing IPIs, * but the bitmap manipulation can cause cache line contention. */ if (real_prev != &init_mm) { VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev))); cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); } /* * Start remote flushes and then read tlb_gen. */ if (next != &init_mm) cpumask_set_cpu(cpu, mm_cpumask(next)); next_tlb_gen = atomic64_read(&next->context.tlb_gen); choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); /* Let nmi_uaccess_okay() know that we're changing CR3. */ this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING); barrier(); } if (need_flush) { this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); load_new_mm_cr3(next->pgd, new_asid, true); /* * NB: This gets called via leave_mm() in the idle path * where RCU functions differently. Tracing normally * uses RCU, so we need to use the _rcuidle variant. * * (There is no good reason for this. The idle code should * be rearranged to call this before rcu_idle_enter().) */ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); } else { /* The new ASID is already up to date. */ load_new_mm_cr3(next->pgd, new_asid, false); /* See above wrt _rcuidle. */ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); } /* Make sure we write CR3 before loaded_mm. */ barrier(); this_cpu_write(cpu_tlbstate.loaded_mm, next); this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); if (next != real_prev) { load_mm_cr4(next); switch_ldt(real_prev, next); } }
void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); unsigned cpu = smp_processor_id(); u64 next_tlb_gen; /* * NB: The scheduler will call us with prev == next when switching * from lazy TLB mode to normal mode if active_mm isn't changing. * When this happens, we don't assume that CR3 (and hence * cpu_tlbstate.loaded_mm) matches next. * * NB: leave_mm() calls us with prev == NULL and tsk == NULL. */ /* We don't want flush_tlb_func_* to run concurrently with us. */ if (IS_ENABLED(CONFIG_PROVE_LOCKING)) WARN_ON_ONCE(!irqs_disabled()); /* * Verify that CR3 is what we think it is. This will catch * hypothetical buggy code that directly switches to swapper_pg_dir * without going through leave_mm() / switch_mm_irqs_off() or that * does something like write_cr3(read_cr3_pa()). * * Only do this check if CONFIG_DEBUG_VM=y because __read_cr3() * isn't free. */ #ifdef CONFIG_DEBUG_VM if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev, prev_asid))) { /* * If we were to BUG here, we'd be very likely to kill * the system so hard that we don't see the call trace. * Try to recover instead by ignoring the error and doing * a global flush to minimize the chance of corruption. * * (This is far from being a fully correct recovery. * Architecturally, the CPU could prefetch something * back into an incorrect ASID slot and leave it there * to cause trouble down the road. It's better than * nothing, though.) */ __flush_tlb_all(); } #endif this_cpu_write(cpu_tlbstate.is_lazy, false); if (real_prev == next) { VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != next->context.ctx_id); /* * We don't currently support having a real mm loaded without * our cpu set in mm_cpumask(). We have all the bookkeeping * in place to figure out whether we would need to flush * if our cpu were cleared in mm_cpumask(), but we don't * currently use it. */ if (WARN_ON_ONCE(real_prev != &init_mm && !cpumask_test_cpu(cpu, mm_cpumask(next)))) cpumask_set_cpu(cpu, mm_cpumask(next)); return; } else { u16 new_asid; bool need_flush; if (IS_ENABLED(CONFIG_VMAP_STACK)) { /* * If our current stack is in vmalloc space and isn't * mapped in the new pgd, we'll double-fault. Forcibly * map it. */ unsigned int index = pgd_index(current_stack_pointer); pgd_t *pgd = next->pgd + index; if (unlikely(pgd_none(*pgd))) set_pgd(pgd, init_mm.pgd[index]); } /* Stop remote flushes for the previous mm */ VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) && real_prev != &init_mm); cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); /* * Start remote flushes and then read tlb_gen. */ cpumask_set_cpu(cpu, mm_cpumask(next)); next_tlb_gen = atomic64_read(&next->context.tlb_gen); choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); if (need_flush) { this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); write_cr3(build_cr3(next, new_asid)); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); } else { /* The new ASID is already up to date. */ write_cr3(build_cr3_noflush(next, new_asid)); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0); } this_cpu_write(cpu_tlbstate.loaded_mm, next); this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); } load_mm_cr4(next); switch_ldt(real_prev, next); }
pgd_t * __init efi_call_phys_prolog(void) { unsigned long vaddr, addr_pgd, addr_p4d, addr_pud; pgd_t *save_pgd, *pgd_k, *pgd_efi; p4d_t *p4d, *p4d_k, *p4d_efi; pud_t *pud; int pgd; int n_pgds, i, j; if (!efi_enabled(EFI_OLD_MEMMAP)) { save_pgd = (pgd_t *)__read_cr3(); write_cr3((unsigned long)efi_scratch.efi_pgt); goto out; } early_code_mapping_set_exec(1); n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT), PGDIR_SIZE); save_pgd = kmalloc_array(n_pgds, sizeof(*save_pgd), GFP_KERNEL); /* * Build 1:1 identity mapping for efi=old_map usage. Note that * PAGE_OFFSET is PGDIR_SIZE aligned when KASLR is disabled, while * it is PUD_SIZE ALIGNED with KASLR enabled. So for a given physical * address X, the pud_index(X) != pud_index(__va(X)), we can only copy * PUD entry of __va(X) to fill in pud entry of X to build 1:1 mapping. * This means here we can only reuse the PMD tables of the direct mapping. */ for (pgd = 0; pgd < n_pgds; pgd++) { addr_pgd = (unsigned long)(pgd * PGDIR_SIZE); vaddr = (unsigned long)__va(pgd * PGDIR_SIZE); pgd_efi = pgd_offset_k(addr_pgd); save_pgd[pgd] = *pgd_efi; p4d = p4d_alloc(&init_mm, pgd_efi, addr_pgd); if (!p4d) { pr_err("Failed to allocate p4d table!\n"); goto out; } for (i = 0; i < PTRS_PER_P4D; i++) { addr_p4d = addr_pgd + i * P4D_SIZE; p4d_efi = p4d + p4d_index(addr_p4d); pud = pud_alloc(&init_mm, p4d_efi, addr_p4d); if (!pud) { pr_err("Failed to allocate pud table!\n"); goto out; } for (j = 0; j < PTRS_PER_PUD; j++) { addr_pud = addr_p4d + j * PUD_SIZE; if (addr_pud > (max_pfn << PAGE_SHIFT)) break; vaddr = (unsigned long)__va(addr_pud); pgd_k = pgd_offset_k(vaddr); p4d_k = p4d_offset(pgd_k, vaddr); pud[j] = *pud_offset(p4d_k, vaddr); } } } out: __flush_tlb_all(); return save_pgd; }
void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); unsigned cpu = smp_processor_id(); u64 next_tlb_gen; /* * NB: The scheduler will call us with prev == next when switching * from lazy TLB mode to normal mode if active_mm isn't changing. * When this happens, we don't assume that CR3 (and hence * cpu_tlbstate.loaded_mm) matches next. * * NB: leave_mm() calls us with prev == NULL and tsk == NULL. */ /* We don't want flush_tlb_func_* to run concurrently with us. */ if (IS_ENABLED(CONFIG_PROVE_LOCKING)) WARN_ON_ONCE(!irqs_disabled()); /* * Verify that CR3 is what we think it is. This will catch * hypothetical buggy code that directly switches to swapper_pg_dir * without going through leave_mm() / switch_mm_irqs_off() or that * does something like write_cr3(read_cr3_pa()). * * Only do this check if CONFIG_DEBUG_VM=y because __read_cr3() * isn't free. */ #ifdef CONFIG_DEBUG_VM if (WARN_ON_ONCE(__read_cr3() != (__sme_pa(real_prev->pgd) | prev_asid))) { /* * If we were to BUG here, we'd be very likely to kill * the system so hard that we don't see the call trace. * Try to recover instead by ignoring the error and doing * a global flush to minimize the chance of corruption. * * (This is far from being a fully correct recovery. * Architecturally, the CPU could prefetch something * back into an incorrect ASID slot and leave it there * to cause trouble down the road. It's better than * nothing, though.) */ __flush_tlb_all(); } #endif if (real_prev == next) { VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != next->context.ctx_id); if (cpumask_test_cpu(cpu, mm_cpumask(next))) { /* * There's nothing to do: we weren't lazy, and we * aren't changing our mm. We don't need to flush * anything, nor do we need to update CR3, CR4, or * LDTR. */ return; } /* Resume remote flushes and then read tlb_gen. */ cpumask_set_cpu(cpu, mm_cpumask(next)); next_tlb_gen = atomic64_read(&next->context.tlb_gen); if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) < next_tlb_gen) { /* * Ideally, we'd have a flush_tlb() variant that * takes the known CR3 value as input. This would * be faster on Xen PV and on hypothetical CPUs * on which INVPCID is fast. */ this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen, next_tlb_gen); write_cr3(__sme_pa(next->pgd) | prev_asid); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); } /* * We just exited lazy mode, which means that CR4 and/or LDTR * may be stale. (Changes to the required CR4 and LDTR states * are not reflected in tlb_gen.) */ } else { u16 new_asid; bool need_flush; if (IS_ENABLED(CONFIG_VMAP_STACK)) { /* * If our current stack is in vmalloc space and isn't * mapped in the new pgd, we'll double-fault. Forcibly * map it. */ unsigned int index = pgd_index(current_stack_pointer()); pgd_t *pgd = next->pgd + index; if (unlikely(pgd_none(*pgd))) set_pgd(pgd, init_mm.pgd[index]); } /* Stop remote flushes for the previous mm */ if (cpumask_test_cpu(cpu, mm_cpumask(real_prev))) cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); VM_WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next))); /* * Start remote flushes and then read tlb_gen. */ cpumask_set_cpu(cpu, mm_cpumask(next)); next_tlb_gen = atomic64_read(&next->context.tlb_gen); choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); if (need_flush) { this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); write_cr3(__sme_pa(next->pgd) | new_asid); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); } else { /* The new ASID is already up to date. */ write_cr3(__sme_pa(next->pgd) | new_asid | CR3_NOFLUSH); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0); } this_cpu_write(cpu_tlbstate.loaded_mm, next); this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); } load_mm_cr4(next); switch_ldt(real_prev, next); }