void flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr) { if (tlb_ops_need_broadcast()) { struct tlb_args ta; ta.ta_vma = vma; ta.ta_start = uaddr; on_each_cpu_mask(mm_cpumask(vma->vm_mm), ipi_flush_tlb_page, &ta, 1); } else local_flush_tlb_page(vma, uaddr); broadcast_tlb_mm_a15_erratum(vma->vm_mm); }
void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { if (tlb_ops_need_broadcast()) { struct tlb_args ta; ta.ta_vma = vma; ta.ta_start = start; ta.ta_end = end; on_each_cpu_mask(ipi_flush_tlb_range, &ta, 1, mm_cpumask(vma->vm_mm)); } else local_flush_tlb_range(vma, start, end); }
/*
 * smp_flush_tlb_page - flush one page of @vma from the TLB.
 *
 * Does nothing when the mm has no context.  Otherwise, if any CPU other
 * than the current one is set in the mm's cpumask, the flush is cross-called
 * via xc2(); the local flush is always performed afterwards.
 */
void smp_flush_tlb_page(struct vm_area_struct *vma, unsigned long page)
{
	struct mm_struct *mm = vma->vm_mm;
	cpumask_t others;

	if (mm->context == NO_CONTEXT)
		return;

	/* Private copy of the mask with ourselves removed. */
	others = *mm_cpumask(mm);
	cpu_clear(smp_processor_id(), others);

	if (!cpus_empty(others))
		xc2((smpfunc_t) BTFIXUP_CALL(local_flush_tlb_page),
		    (unsigned long) vma, page);

	local_flush_tlb_page(vma, page);
}
/*
 * flush_ptrace_access - make a kernel-side write to a user page coherent.
 *
 * VIPT non-aliasing case: only when the current CPU is in the mm's cpumask
 * and the mapping is executable, the kernel alias [@kaddr, @kaddr + @len)
 * is passed to __cpuc_coherent_kern_range().  @page, @uaddr and @write are
 * not used here.
 */
void flush_ptrace_access(struct vm_area_struct *vma, struct page *page,
			 unsigned long uaddr, void *kaddr, unsigned long len,
			 int write)
{
	unsigned long kstart;

	if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(vma->vm_mm)))
		return;

	if (!(vma->vm_flags & VM_EXEC))
		return;

	/* Only the kernel mapping needs flushing on non-aliasing VIPT. */
	kstart = (unsigned long)kaddr;
	__cpuc_coherent_kern_range(kstart, kstart + len);
}
/* * We cannot call mmdrop() because we are in interrupt context, * instead update mm->cpu_vm_mask. */ void leave_mm(int cpu) { if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) BUG(); cpumask_clear_cpu(cpu, mm_cpumask(percpu_read(cpu_tlbstate.active_mm))); #ifndef CONFIG_PAX_PER_CPU_PGD load_cr3(swapper_pg_dir); #endif }
/*
 * This is the secondary CPU boot entry.  We're using this CPUs
 * idle thread stack, but a set of temporary page tables.
 *
 * Attaches the CPU to init_mm, performs per-CPU bring-up, marks the CPU
 * online, signals cpu_running and finally enters the idle loop through
 * cpu_startup_entry().
 */
asmlinkage void __cpuinit secondary_start_kernel(void)
{
	struct mm_struct *mm = &init_mm;
	unsigned int cpu = smp_processor_id();

	/*
	 * All kernel threads share the same mm context; grab a
	 * reference and switch to it.
	 */
	atomic_inc(&mm->mm_count);
	current->active_mm = mm;
	cpumask_set_cpu(cpu, mm_cpumask(mm));

	set_my_cpu_offset(per_cpu_offset(smp_processor_id()));
	printk("CPU%u: Booted secondary processor\n", cpu);

	/*
	 * TTBR0 is only used for the identity mapping at this stage. Make it
	 * point to zero page to avoid speculatively fetching new entries.
	 */
	cpu_set_reserved_ttbr0();
	flush_tlb_all();

	preempt_disable();
	trace_hardirqs_off();

	/* Optional per-CPU post-boot hook supplied by the cpu_ops entry. */
	if (cpu_ops[cpu]->cpu_postboot)
		cpu_ops[cpu]->cpu_postboot();

	/*
	 * Enable GIC and timers.
	 */
	notify_cpu_starting(cpu);

	smp_store_cpu_info(cpu);

	/*
	 * OK, now it's safe to let the boot CPU continue.  Wait for
	 * the CPU migration code to notice that the CPU is online
	 * before we continue.
	 */
	set_cpu_online(cpu, true);
	complete(&cpu_running);

	/* Debug exceptions, IRQs and FIQs are enabled only after going online. */
	local_dbg_enable();
	local_irq_enable();
	local_fiq_enable();

	/*
	 * OK, it's off to the idle thread for us
	 */
	cpu_startup_entry(CPUHP_ONLINE);
}
/*
 * flush_tlb_mm_range - flush TLB entries of @mm for [@start, @end).
 * @mm:     address space whose entries are flushed
 * @start:  first address of the range
 * @end:    end of the range, or TLB_FLUSH_ALL
 * @vmflag: VMA flags; VM_HUGETLB forces a full flush
 *
 * The local CPU is flushed only when @mm is current->active_mm and the task
 * has a user mm; a lazy-TLB kernel thread calls leave_mm() instead.  The
 * local flush is either a full flush or a page-by-page flush, depending on
 * tlb_single_page_flush_ceiling.  Finally, if any other CPU is set in the
 * mm's cpumask, flush_tlb_others() is invoked for it.
 * Runs with preemption disabled throughout.
 */
void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
			unsigned long end, unsigned long vmflag)
{
	unsigned long addr;
	/* do a global flush by default */
	unsigned long base_pages_to_flush = TLB_FLUSH_ALL;

	preempt_disable();
	/* Not our active mm: nothing to flush locally. */
	if (current->active_mm != mm)
		goto out;

	/* Lazy TLB (no user mm): drop the mm instead of flushing. */
	if (!current->mm) {
		leave_mm(smp_processor_id());
		goto out;
	}

	if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB))
		base_pages_to_flush = (end - start) >> PAGE_SHIFT;

	if (base_pages_to_flush > tlb_single_page_flush_ceiling) {
		base_pages_to_flush = TLB_FLUSH_ALL;
		count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
		local_flush_tlb();
	} else {
		/* flush range by one by one 'invlpg' */
		for (addr = start; addr < end; addr += PAGE_SIZE) {
			count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
			__flush_tlb_single(addr);
		}
	}
	trace_tlb_flush(TLB_LOCAL_MM_SHOOTDOWN, base_pages_to_flush);
out:
	/*
	 * Remote CPUs get the same decision as the local one; on the early
	 * "goto out" paths base_pages_to_flush is still TLB_FLUSH_ALL.
	 */
	if (base_pages_to_flush == TLB_FLUSH_ALL) {
		start = 0UL;
		end = TLB_FLUSH_ALL;
	}
	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
		flush_tlb_others(mm_cpumask(mm), mm, start, end);
	preempt_enable();
}
static unsigned int steal_context_smp(unsigned int id) { struct mm_struct *mm; unsigned int cpu, max, i; max = last_context - first_context; /* Attempt to free next_context first and then loop until we manage */ while (max--) { /* Pick up the victim mm */ mm = context_mm[id]; /* We have a candidate victim, check if it's active, on SMP * we cannot steal active contexts */ if (mm->context.active) { id++; if (id > last_context) id = first_context; continue; } pr_hardcont(" | steal %d from 0x%p", id, mm); /* Mark this mm has having no context anymore */ mm->context.id = MMU_NO_CONTEXT; /* Mark it stale on all CPUs that used this mm. For threaded * implementations, we set it on all threads on each core * represented in the mask. A future implementation will use * a core map instead but this will do for now. */ for_each_cpu(cpu, mm_cpumask(mm)) { for (i = cpu_first_thread_sibling(cpu); i <= cpu_last_thread_sibling(cpu); i++) { if (stale_map[i]) __set_bit(id, stale_map[i]); } cpu = i - 1; } return id; } /* This will happen if you have more CPUs than available contexts, * all we can do here is wait a bit and try again */ raw_spin_unlock(&context_lock); cpu_relax(); raw_spin_lock(&context_lock); /* This will cause the caller to try again */ return MMU_NO_CONTEXT; }
/*
 * exit_flush_lazy_tlbs - run do_exit_flush_lazy_tlb() on the CPUs in @mm's
 * cpumask and then reset the mm to thread-local.
 *
 * The cross-call is synchronous (wait == 1).  An asynchronous variant that
 * overlaps with the local flush would be nicer, but the generic API does
 * not offer one and this path is not performance critical.
 */
static void exit_flush_lazy_tlbs(struct mm_struct *mm)
{
	const struct cpumask *targets = mm_cpumask(mm);

	smp_call_function_many(targets, do_exit_flush_lazy_tlb, mm, 1);
	mm_reset_thread_local(mm);
}
/*
 * flush_tlb_mm - flush all TLB entries belonging to @mm.
 *
 * With CONFIG_L4 enabled the work is handed to l4x_unmap_sync_mm() and
 * l4x_del_task() and nothing else is done.  Otherwise the flush is either
 * sent to every CPU in the mm's cpumask or done locally, followed by
 * broadcast_tlb_mm_a15_erratum().
 */
void flush_tlb_mm(struct mm_struct *mm)
{
	if (IS_ENABLED(CONFIG_L4)) {
		l4x_unmap_sync_mm(mm);
		l4x_del_task(mm);
		return;
	}

	if (!tlb_ops_need_broadcast())
		local_flush_tlb_mm(mm);
	else
		on_each_cpu_mask(mm_cpumask(mm), ipi_flush_tlb_mm, mm, 1);

	broadcast_tlb_mm_a15_erratum(mm);
}
/*
 * install_ldt - publish @ldt as the LDT of @current_mm and activate it.
 * @current_mm: mm whose context.ldt is replaced
 * @ldt:        new LDT structure to install
 *
 * context.lock is held by the caller.
 *
 * The pointer is stored first, then flush_ldt() is run on the other CPUs in
 * the mm's cpumask (waiting for completion) and finally on this CPU with
 * interrupts disabled.
 */
static void install_ldt(struct mm_struct *current_mm, struct ldt_struct *ldt)
{
	/* Synchronizes with smp_read_barrier_depends in load_mm_ldt. */
	barrier();
	ACCESS_ONCE(current_mm->context.ldt) = ldt;

	/* Activate the LDT for all CPUs using current_mm. */
	smp_call_function_many(mm_cpumask(current_mm), flush_ldt, current_mm,
			       true);
	/* The cross-call above is not run for this CPU; do it locally. */
	local_irq_disable();
	flush_ldt(current_mm);
	local_irq_enable();
}
/**
 * flush_tlb_mm - Invalidate TLB of specified VM context
 * @mm: The VM context to invalidate.
 *
 * The local TLB is always flushed; every other CPU present in the mm's
 * cpumask is asked to do a full flush via flush_tlb_others().  Preemption
 * is disabled for the duration.
 */
void flush_tlb_mm(struct mm_struct *mm)
{
	cpumask_t others;

	preempt_disable();

	/* Snapshot of the mm's CPUs, minus ourselves. */
	cpumask_copy(&others, mm_cpumask(mm));
	cpumask_clear_cpu(smp_processor_id(), &others);

	local_flush_tlb();

	if (cpumask_empty(&others))
		goto done;
	flush_tlb_others(others, mm, FLUSH_ALL);
done:
	preempt_enable();
}
void smp_flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { struct mm_struct *mm = vma->vm_mm; if (mm->context != NO_CONTEXT) { cpumask_t cpu_mask; cpumask_copy(&cpu_mask, mm_cpumask(mm)); cpumask_clear_cpu(smp_processor_id(), &cpu_mask); if (!cpumask_empty(&cpu_mask)) xc3((smpfunc_t) BTFIXUP_CALL(local_flush_tlb_range), (unsigned long) vma, start, end); local_flush_tlb_range(vma, start, end); } }
/**
 * flush_tlb_page - Invalidate TLB of page
 * @vma: The VM context to invalidate the page for.
 * @va: The virtual address of the page to invalidate.
 *
 * Flushes the page locally, then asks every other CPU in the mm's cpumask
 * to do the same through flush_tlb_others().  Preemption is disabled for
 * the duration.
 */
void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
{
	struct mm_struct *mm = vma->vm_mm;
	cpumask_t others;

	preempt_disable();

	/* Snapshot of the mm's CPUs, minus ourselves. */
	cpumask_copy(&others, mm_cpumask(mm));
	cpumask_clear_cpu(smp_processor_id(), &others);

	local_flush_tlb_page(mm, va);

	if (cpumask_empty(&others))
		goto done;
	flush_tlb_others(others, mm, va);
done:
	preempt_enable();
}
/*
 * secondary_start_kernel - C entry point for a secondary CPU.
 *
 * Initialises the MMU and trap state, attaches the CPU to init_mm, sets up
 * interrupts and the local timer, marks the CPU online, signals cpu_running
 * and enters the idle loop via cpu_startup_entry().
 */
void secondary_start_kernel(void)
{
	struct mm_struct *mm = &init_mm;
	unsigned int cpu = smp_processor_id();

	init_mmu();

#ifdef CONFIG_DEBUG_KERNEL
	/* Debug aid: with boot_secondary_processors == 0 the CPU parks here forever. */
	if (boot_secondary_processors == 0) {
		pr_debug("%s: boot_secondary_processors:%d; Hanging cpu:%d\n",
			__func__, boot_secondary_processors, cpu);
		for (;;)
			__asm__ __volatile__ ("waiti " __stringify(LOCKLEVEL));
	}

	pr_debug("%s: boot_secondary_processors:%d; Booting cpu:%d\n",
		__func__, boot_secondary_processors, cpu);
#endif
	/* Init EXCSAVE1 */

	secondary_trap_init();

	/* All kernel threads share the same mm context. */

	mmget(mm);
	mmgrab(mm);
	current->active_mm = mm;
	cpumask_set_cpu(cpu, mm_cpumask(mm));
	enter_lazy_tlb(mm, current);

	preempt_disable();
	trace_hardirqs_off();

	calibrate_delay();

	notify_cpu_starting(cpu);

	secondary_init_irq();
	local_timer_setup(cpu);

	set_cpu_online(cpu, true);

	local_irq_enable();

	/* Signal whoever is waiting on cpu_running. */
	complete(&cpu_running);

	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
}
/*
 * __flush_tlb_pending - flush everything queued in an mmu batch.
 * @batch: the batch to drain; its index is reset to 0 on return.
 *
 * Called when a batch is terminated or has filled up.  The flush is treated
 * as local only when the batch's mm cpumask equals exactly the current CPU.
 * A single queued entry goes through flush_hash_page(), anything else
 * through flush_hash_range().
 *
 * Must be called from within some kind of spinlock/non-preempt region...
 */
void __flush_tlb_pending(struct ppc64_tlb_batch *batch)
{
	const struct cpumask *self = cpumask_of(smp_processor_id());
	int pending = batch->index;
	int local = cpumask_equal(mm_cpumask(batch->mm), self) ? 1 : 0;

	if (pending != 1)
		flush_hash_range(pending, local);
	else
		flush_hash_page(batch->vpn[0], batch->pte[0],
				batch->psize, batch->ssize, local);

	batch->index = 0;
}
void flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr) { if (IS_ENABLED(CONFIG_L4)) { l4x_unmap_sync_mm(vma->vm_mm); l4x_unmap_page(vma->vm_mm, uaddr); return; } if (tlb_ops_need_broadcast()) { struct tlb_args ta; ta.ta_vma = vma; ta.ta_start = uaddr; on_each_cpu_mask(mm_cpumask(vma->vm_mm), ipi_flush_tlb_page, &ta, 1); } else local_flush_tlb_page(vma, uaddr); broadcast_tlb_mm_a15_erratum(vma->vm_mm); }
/*
 * mem_init - boot-time memory initialisation.
 *
 * Marks CPU 0 in init_mm's attach mask and cpumask, records the highest
 * low-memory pfn and address, initialises CMMA, releases bootmem pages to
 * the page allocator and sets up the zero pages.
 */
void __init mem_init(void)
{
	/* CPU 0 is attached to init_mm from the start. */
	cpumask_set_cpu(0, &init_mm.context.cpu_attach_mask);
	cpumask_set_cpu(0, mm_cpumask(&init_mm));

	set_max_mapnr(max_low_pfn);
	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);

	/* Setup guest page hinting */
	cmma_init();

	/* this will put all low memory onto the freelists */
	free_all_bootmem();
	setup_zero_pages();	/* Setup zeroed pages. */

	cmma_init_nodat();

	mem_init_print_info(NULL);
}
static unsigned int steal_context_smp(unsigned int id) { struct mm_struct *mm; unsigned int cpu, max; max = last_context - first_context; /* Attempt to free next_context first and then loop until we manage */ while (max--) { /* Pick up the victim mm */ mm = context_mm[id]; /* We have a candidate victim, check if it's active, on SMP * we cannot steal active contexts */ if (mm->context.active) { id++; if (id > last_context) id = first_context; continue; } pr_devel("[%d] steal context %d from mm @%p\n", smp_processor_id(), id, mm); /* Mark this mm has having no context anymore */ mm->context.id = MMU_NO_CONTEXT; /* Mark it stale on all CPUs that used this mm */ for_each_cpu(cpu, mm_cpumask(mm)) __set_bit(id, stale_map[cpu]); return id; } /* This will happen if you have more CPUs than available contexts, * all we can do here is wait a bit and try again */ atomic_spin_unlock(&context_lock); cpu_relax(); atomic_spin_lock(&context_lock); /* This will cause the caller to try again */ return MMU_NO_CONTEXT; }
void leave_mm(int cpu) { struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); /* * It's plausible that we're in lazy TLB mode while our mm is init_mm. * If so, our callers still expect us to flush the TLB, but there * aren't any user TLB entries in init_mm to worry about. * * This needs to happen before any other sanity checks due to * intel_idle's shenanigans. */ if (loaded_mm == &init_mm) return; /* Warn if we're not lazy. */ WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))); switch_mm(NULL, &init_mm, NULL); }
int __cpu_disable(void) { unsigned int cpu = smp_processor_id(); struct task_struct *p; int ret; ret = mp_ops->cpu_disable(cpu); if (ret) return ret; /* * Take this CPU offline. Once we clear this, we can't return, * and we must not schedule until we're ready to give up the cpu. */ set_cpu_online(cpu, false); /* * OK - migrate IRQs away from this CPU */ migrate_irqs(); /* * Stop the local timer for this CPU. */ local_timer_stop(cpu); /* * Flush user cache and TLB mappings, and then remove this CPU * from the vm mask set of all processes. */ flush_cache_all(); local_flush_tlb_all(); read_lock(&tasklist_lock); for_each_process(p) if (p->mm) cpumask_clear_cpu(cpu, mm_cpumask(p->mm)); read_unlock(&tasklist_lock); return 0; }
/*
 * mmu_context_cpu_notify - CPU hotplug notifier managing per-CPU stale maps.
 * @self:   notifier block (unused)
 * @action: CPU_* hotplug event
 * @hcpu:   CPU number, passed as a pointer-sized integer
 *
 * On CPU_UP_PREPARE a zeroed stale context map is allocated for the CPU.
 * On cancel/death the map is freed and the CPU is removed from the cpumask
 * of every process that has an mm.  The boot CPU's map is never touched.
 *
 * Returns NOTIFY_OK, or NOTIFY_BAD if the stale map cannot be allocated so
 * that the CPU is not brought up without one (the code marking contexts
 * stale writes into stale_map[] of the CPUs found in an mm's cpumask).
 */
static int __cpuinit mmu_context_cpu_notify(struct notifier_block *self,
					    unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned int)(long)hcpu;
#ifdef CONFIG_HOTPLUG_CPU
	struct task_struct *p;
#endif
	/* We don't touch CPU 0 map, it's allocated at boot and kept
	 * around forever
	 */
	if (cpu == boot_cpuid)
		return NOTIFY_OK;

	switch (action) {
	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		pr_devel("MMU: Allocating stale context map for CPU %d\n", cpu);
		stale_map[cpu] = kzalloc(CTX_MAP_SIZE, GFP_KERNEL);
		/* Refuse to bring the CPU up rather than run it with a NULL map. */
		if (!stale_map[cpu])
			return NOTIFY_BAD;
		break;
#ifdef CONFIG_HOTPLUG_CPU
	case CPU_UP_CANCELED:
	case CPU_UP_CANCELED_FROZEN:
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		pr_devel("MMU: Freeing stale context map for CPU %d\n", cpu);
		/* kfree(NULL) is fine, so a failed UP_PREPARE needs no special case. */
		kfree(stale_map[cpu]);
		stale_map[cpu] = NULL;

		/* We also clear the cpu_vm_mask bits of CPUs going away */
		read_lock(&tasklist_lock);
		for_each_process(p) {
			if (p->mm)
				cpumask_clear_cpu(cpu, mm_cpumask(p->mm));
		}
		read_unlock(&tasklist_lock);
		break;
#endif /* CONFIG_HOTPLUG_CPU */
	}
	return NOTIFY_OK;
}
/*
 * mem_init - boot-time memory initialisation.
 *
 * Marks CPU 0 as attached to init_mm, records the highest low-memory pfn
 * and address, initialises CMMA, releases bootmem pages to the page
 * allocator, sets up the zero pages and prints the memory summary together
 * with the range reported as write-protected kernel data.
 */
void __init mem_init(void)
{
	/* The attach mask is only maintained when MACHINE_HAS_TLB_LC is set. */
	if (MACHINE_HAS_TLB_LC)
		cpumask_set_cpu(0, &init_mm.context.cpu_attach_mask);
	cpumask_set_cpu(0, mm_cpumask(&init_mm));
	atomic_set(&init_mm.context.attach_count, 1);

	max_mapnr = max_low_pfn;
	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);

	/* Setup guest page hinting */
	cmma_init();

	/* this will put all low memory onto the freelists */
	free_all_bootmem();
	setup_zero_pages();	/* Setup zeroed pages. */

	mem_init_print_info(NULL);
	printk("Write protected kernel read-only data: %#lx - %#lx\n",
	       (unsigned long)&_stext,
	       PFN_ALIGN((unsigned long)&_eshared) - 1);
}
static unsigned int steal_context_smp(unsigned int id) { struct mm_struct *mm; unsigned int cpu, max, i; max = last_context - first_context; while (max--) { mm = context_mm[id]; if (mm->context.active) { id++; if (id > last_context) id = first_context; continue; } pr_hardcont(" | steal %d from 0x%p", id, mm); mm->context.id = MMU_NO_CONTEXT; for_each_cpu(cpu, mm_cpumask(mm)) { for (i = cpu_first_thread_sibling(cpu); i <= cpu_last_thread_sibling(cpu); i++) __set_bit(id, stale_map[i]); cpu = i - 1; } return id; } raw_spin_unlock(&context_lock); cpu_relax(); raw_spin_lock(&context_lock); return MMU_NO_CONTEXT; }
* so that when we are unmapping an executable page, we also flush it. * Combined with flushing the L1I at context switch time, this means * we don't have to do any other icache flushes. */ void flush_tlb_mm(struct mm_struct *mm) { HV_Remote_ASID asids[NR_CPUS]; int i = 0, cpu; for_each_cpu(cpu, mm_cpumask(mm)) { HV_Remote_ASID *asid = &asids[i++]; asid->y = cpu / smp_topology.width; asid->x = cpu % smp_topology.width; asid->asid = per_cpu(current_asid, cpu); } flush_remote(0, HV_FLUSH_EVICT_L1I, mm_cpumask(mm), 0, 0, 0, NULL, asids, i); } void flush_tlb_current_task(void) { flush_tlb_mm(current->mm); } void flush_tlb_page_mm(struct vm_area_struct *vma, struct mm_struct *mm, unsigned long va) { unsigned long size = vma_kernel_pagesize(vma); int cache = (vma->vm_flags & VM_EXEC) ? HV_FLUSH_EVICT_L1I : 0; flush_remote(0, cache, mm_cpumask(mm), va, size, size, mm_cpumask(mm), NULL, 0);
/*
 * switch_mm_irqs_off - switch this CPU's loaded mm to @next.
 * @prev: not read by this function; the mm actually loaded is taken from
 *        cpu_tlbstate.loaded_mm instead
 * @next: mm to load
 * @tsk:  not read by this function
 *
 * If @next is already loaded only sanity checks are done.  Otherwise the
 * CPU is moved from the old mm's cpumask to the new one, an ASID is chosen,
 * CR3 is written (with or without a flush), cpu_tlbstate is updated and
 * the per-mm CR4 and LDT state are switched.
 */
void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
			struct task_struct *tsk)
{
	struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
	u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
	unsigned cpu = smp_processor_id();
	u64 next_tlb_gen;

	/*
	 * NB: The scheduler will call us with prev == next when switching
	 * from lazy TLB mode to normal mode if active_mm isn't changing.
	 * When this happens, we don't assume that CR3 (and hence
	 * cpu_tlbstate.loaded_mm) matches next.
	 *
	 * NB: leave_mm() calls us with prev == NULL and tsk == NULL.
	 */

	/* We don't want flush_tlb_func_* to run concurrently with us. */
	if (IS_ENABLED(CONFIG_PROVE_LOCKING))
		WARN_ON_ONCE(!irqs_disabled());

	/*
	 * Verify that CR3 is what we think it is.  This will catch
	 * hypothetical buggy code that directly switches to swapper_pg_dir
	 * without going through leave_mm() / switch_mm_irqs_off() or that
	 * does something like write_cr3(read_cr3_pa()).
	 *
	 * Only do this check if CONFIG_DEBUG_VM=y because __read_cr3()
	 * isn't free.
	 */
#ifdef CONFIG_DEBUG_VM
	if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev, prev_asid))) {
		/*
		 * If we were to BUG here, we'd be very likely to kill
		 * the system so hard that we don't see the call trace.
		 * Try to recover instead by ignoring the error and doing
		 * a global flush to minimize the chance of corruption.
		 *
		 * (This is far from being a fully correct recovery.
		 *  Architecturally, the CPU could prefetch something
		 *  back into an incorrect ASID slot and leave it there
		 *  to cause trouble down the road.  It's better than
		 *  nothing, though.)
		 */
		__flush_tlb_all();
	}
#endif
	this_cpu_write(cpu_tlbstate.is_lazy, false);

	if (real_prev == next) {
		VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
			  next->context.ctx_id);

		/*
		 * We don't currently support having a real mm loaded without
		 * our cpu set in mm_cpumask().  We have all the bookkeeping
		 * in place to figure out whether we would need to flush
		 * if our cpu were cleared in mm_cpumask(), but we don't
		 * currently use it.
		 */
		if (WARN_ON_ONCE(real_prev != &init_mm &&
				 !cpumask_test_cpu(cpu, mm_cpumask(next))))
			cpumask_set_cpu(cpu, mm_cpumask(next));

		return;
	} else {
		u16 new_asid;
		bool need_flush;

		if (IS_ENABLED(CONFIG_VMAP_STACK)) {
			/*
			 * If our current stack is in vmalloc space and isn't
			 * mapped in the new pgd, we'll double-fault.  Forcibly
			 * map it.
			 */
			unsigned int index = pgd_index(current_stack_pointer);
			pgd_t *pgd = next->pgd + index;

			if (unlikely(pgd_none(*pgd)))
				set_pgd(pgd, init_mm.pgd[index]);
		}

		/* Stop remote flushes for the previous mm */
		VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
				real_prev != &init_mm);
		cpumask_clear_cpu(cpu, mm_cpumask(real_prev));

		/*
		 * Start remote flushes and then read tlb_gen.
		 */
		cpumask_set_cpu(cpu, mm_cpumask(next));
		next_tlb_gen = atomic64_read(&next->context.tlb_gen);

		choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);

		if (need_flush) {
			/* Record what this ASID slot will hold before loading it. */
			this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
			this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
			write_cr3(build_cr3(next, new_asid));
			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
					TLB_FLUSH_ALL);
		} else {
			/* The new ASID is already up to date. */
			write_cr3(build_cr3_noflush(next, new_asid));
			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
		}

		this_cpu_write(cpu_tlbstate.loaded_mm, next);
		this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
	}

	load_mm_cr4(next);
	switch_ldt(real_prev, next);
}
/*
 * dup_mmap - duplicate the VMAs of @oldmm into the new address space @mm.
 * @mm:    destination mm (not linked in yet)
 * @oldmm: source mm
 *
 * Takes both mmap_sems for writing, resets @mm's mapping state, then copies
 * every VMA of @oldmm that is not VM_DONTCOPY: the VMA is cloned, its
 * policy and anon_vma are forked, it is linked into file mappings, the VMA
 * list and the rbtree, and its page table entries are copied.
 *
 * Returns 0 on success, the error returned by ksm_fork(), khugepaged_fork()
 * or copy_page_range(), or -ENOMEM for any failure that reaches the
 * fail_nomem* labels.  The TLB of @oldmm is flushed and both semaphores are
 * released on every exit path.
 */
static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
{
	struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
	struct rb_node **rb_link, *rb_parent;
	int retval;
	unsigned long charge;

	uprobe_start_dup_mmap();
	down_write(&oldmm->mmap_sem);
	flush_cache_dup_mm(oldmm);
	uprobe_dup_mmap(oldmm, mm);
	/*
	 * Not linked in yet - no deadlock potential:
	 */
	down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);

	/* Start the new mm with an empty mapping state. */
	mm->locked_vm = 0;
	mm->mmap = NULL;
	mm->mmap_cache = NULL;
	mm->map_count = 0;
	cpumask_clear(mm_cpumask(mm));
	mm->mm_rb = RB_ROOT;
	rb_link = &mm->mm_rb.rb_node;
	rb_parent = NULL;
	pprev = &mm->mmap;
	retval = ksm_fork(mm, oldmm);
	if (retval)
		goto out;
	retval = khugepaged_fork(mm, oldmm);
	if (retval)
		goto out;

	prev = NULL;
	for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
		struct file *file;

		/* Not inherited: only adjust the new mm's statistics. */
		if (mpnt->vm_flags & VM_DONTCOPY) {
			vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
							-vma_pages(mpnt));
			continue;
		}
		/* charge is what fail_nomem must un-account for this VMA. */
		charge = 0;
		if (mpnt->vm_flags & VM_ACCOUNT) {
			unsigned long len = vma_pages(mpnt);

			if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
				goto fail_nomem;
			charge = len;
		}
		tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
		if (!tmp)
			goto fail_nomem;
		*tmp = *mpnt;
		INIT_LIST_HEAD(&tmp->anon_vma_chain);
		retval = vma_dup_policy(mpnt, tmp);
		if (retval)
			goto fail_nomem_policy;
		tmp->vm_mm = mm;
		if (anon_vma_fork(tmp, mpnt))
			goto fail_nomem_anon_vma_fork;
		/* The copy does not inherit VM_LOCKED. */
		tmp->vm_flags &= ~VM_LOCKED;
		tmp->vm_next = tmp->vm_prev = NULL;
		file = tmp->vm_file;
		if (file) {
			struct inode *inode = file_inode(file);
			struct address_space *mapping = file->f_mapping;

			get_file(file);
			if (tmp->vm_flags & VM_DENYWRITE)
				atomic_dec(&inode->i_writecount);
			mutex_lock(&mapping->i_mmap_mutex);
			if (tmp->vm_flags & VM_SHARED)
				mapping->i_mmap_writable++;
			flush_dcache_mmap_lock(mapping);
			/* insert tmp into the share list, just after mpnt */
			if (unlikely(tmp->vm_flags & VM_NONLINEAR))
				vma_nonlinear_insert(tmp,
						&mapping->i_mmap_nonlinear);
			else
				vma_interval_tree_insert_after(tmp, mpnt,
							&mapping->i_mmap);
			flush_dcache_mmap_unlock(mapping);
			mutex_unlock(&mapping->i_mmap_mutex);
		}

		/*
		 * Clear hugetlb-related page reserves for children. This only
		 * affects MAP_PRIVATE mappings. Faults generated by the child
		 * are not guaranteed to succeed, even if read-only
		 */
		if (is_vm_hugetlb_page(tmp))
			reset_vma_resv_huge_pages(tmp);

		/*
		 * Link in the new vma and copy the page table entries.
		 */
		*pprev = tmp;
		pprev = &tmp->vm_next;
		tmp->vm_prev = prev;
		prev = tmp;

		/* The VMA list is walked in order, so each node goes to the right. */
		__vma_link_rb(mm, tmp, rb_link, rb_parent);
		rb_link = &tmp->vm_rb.rb_right;
		rb_parent = &tmp->vm_rb;

		mm->map_count++;
		retval = copy_page_range(mm, oldmm, mpnt);

		/* ->open() is called even if copy_page_range() failed. */
		if (tmp->vm_ops && tmp->vm_ops->open)
			tmp->vm_ops->open(tmp);

		if (retval)
			goto out;
	}
	/* a new mm has just been created */
	arch_dup_mmap(oldmm, mm);
	retval = 0;
out:
	up_write(&mm->mmap_sem);
	flush_tlb_mm(oldmm);
	up_write(&oldmm->mmap_sem);
	uprobe_end_dup_mmap();
	return retval;
	/* Error unwinding: each label undoes one more step, then joins "out". */
fail_nomem_anon_vma_fork:
	mpol_put(vma_policy(tmp));
fail_nomem_policy:
	kmem_cache_free(vm_area_cachep, tmp);
fail_nomem:
	retval = -ENOMEM;
	vm_unacct_memory(charge);
	goto out;
}
/*
 * switch_mm_irqs_off - switch this CPU from @prev's address space to @next's.
 * @prev: mm being switched away from
 * @next: mm being switched to
 * @tsk:  not read by this function
 *
 * When the mms differ, the CPU is added to @next's cpumask, CR3 is
 * reloaded, the CPU is removed from @prev's cpumask and per-mm CR4/LDT
 * state is loaded.  When they are equal (SMP only), the TLB state is set
 * back to TLBSTATE_OK and CR3 is reloaded only if the CPU had been removed
 * from the cpumask in the meantime.
 */
void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
			struct task_struct *tsk)
{
	unsigned cpu = smp_processor_id();

	if (likely(prev != next)) {
#ifdef CONFIG_SMP
		this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
		this_cpu_write(cpu_tlbstate.active_mm, next);
#endif
		cpumask_set_cpu(cpu, mm_cpumask(next));

		/*
		 * Re-load page tables.
		 *
		 * This logic has an ordering constraint:
		 *
		 *  CPU 0: Write to a PTE for 'next'
		 *  CPU 0: load bit 1 in mm_cpumask.  if nonzero, send IPI.
		 *  CPU 1: set bit 1 in next's mm_cpumask
		 *  CPU 1: load from the PTE that CPU 0 writes (implicit)
		 *
		 * We need to prevent an outcome in which CPU 1 observes
		 * the new PTE value and CPU 0 observes bit 1 clear in
		 * mm_cpumask.  (If that occurs, then the IPI will never
		 * be sent, and CPU 0's TLB will contain a stale entry.)
		 *
		 * NOTE(review): the outcome to avoid appears to be CPU 1
		 * loading the *old* PTE value while CPU 0 sees the bit
		 * clear, which would leave the stale entry in CPU 1's TLB;
		 * the wording above looks inverted -- confirm.
		 *
		 * The bad outcome can occur if either CPU's load is
		 * reordered before that CPU's store, so both CPUs must
		 * execute full barriers to prevent this from happening.
		 *
		 * Thus, switch_mm needs a full barrier between the
		 * store to mm_cpumask and any operation that could load
		 * from next->pgd.  TLB fills are special and can happen
		 * due to instruction fetches or for no reason at all,
		 * and neither LOCK nor MFENCE orders them.
		 * Fortunately, load_cr3() is serializing and gives the
		 * ordering guarantee we need.
		 *
		 */
		load_cr3(next->pgd);

		trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);

		/* Stop flush ipis for the previous mm */
		cpumask_clear_cpu(cpu, mm_cpumask(prev));

		/* Load per-mm CR4 state */
		load_mm_cr4(next);

#ifdef CONFIG_MODIFY_LDT_SYSCALL
		/*
		 * Load the LDT, if the LDT is different.
		 *
		 * It's possible that prev->context.ldt doesn't match
		 * the LDT register.  This can happen if leave_mm(prev)
		 * was called and then modify_ldt changed
		 * prev->context.ldt but suppressed an IPI to this CPU.
		 * In this case, prev->context.ldt != NULL, because we
		 * never set context.ldt to NULL while the mm still
		 * exists.  That means that next->context.ldt !=
		 * prev->context.ldt, because mms never share an LDT.
		 */
		if (unlikely(prev->context.ldt != next->context.ldt))
			load_mm_ldt(next);
#endif
	}
#ifdef CONFIG_SMP
	else {
		/* Same mm: we are leaving lazy TLB mode. */
		this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
		BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next);

		if (!cpumask_test_cpu(cpu, mm_cpumask(next))) {
			/*
			 * On established mms, the mm_cpumask is only changed
			 * from irq context, from ptep_clear_flush() while in
			 * lazy tlb mode, and here. Irqs are blocked during
			 * schedule, protecting us from simultaneous changes.
			 */
			cpumask_set_cpu(cpu, mm_cpumask(next));

			/*
			 * We were in lazy tlb mode and leave_mm disabled
			 * tlb flush IPI delivery. We must reload CR3
			 * to make sure to use no freed page tables.
			 *
			 * As above, load_cr3() is serializing and orders TLB
			 * fills with respect to the mm_cpumask write.
			 */
			load_cr3(next->pgd);
			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
			load_mm_cr4(next);
			load_mm_ldt(next);
		}
	}
#endif
}
static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) { struct vm_area_struct *mpnt, *tmp, **pprev; struct rb_node **rb_link, *rb_parent; int retval; unsigned long charge; struct mempolicy *pol; down_write(&oldmm->mmap_sem); flush_cache_dup_mm(oldmm); /* * Not linked in yet - no deadlock potential: */ down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); mm->locked_vm = 0; mm->mmap = NULL; mm->mmap_cache = NULL; mm->free_area_cache = oldmm->mmap_base; mm->cached_hole_size = ~0UL; mm->map_count = 0; cpumask_clear(mm_cpumask(mm)); mm->mm_rb = RB_ROOT; rb_link = &mm->mm_rb.rb_node; rb_parent = NULL; pprev = &mm->mmap; retval = ksm_fork(mm, oldmm); if (retval) goto out; for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { struct file *file; if (mpnt->vm_flags & VM_DONTCOPY) { long pages = vma_pages(mpnt); mm->total_vm -= pages; vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, -pages); continue; } charge = 0; if (mpnt->vm_flags & VM_ACCOUNT) { unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; if (security_vm_enough_memory(len)) goto fail_nomem; charge = len; } tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (!tmp) goto fail_nomem; *tmp = *mpnt; pol = mpol_dup(vma_policy(mpnt)); retval = PTR_ERR(pol); if (IS_ERR(pol)) goto fail_nomem_policy; vma_set_policy(tmp, pol); tmp->vm_flags &= ~VM_LOCKED; tmp->vm_mm = mm; tmp->vm_next = NULL; anon_vma_link(tmp); file = tmp->vm_file; if (file) { struct inode *inode = file->f_path.dentry->d_inode; struct address_space *mapping = file->f_mapping; get_file(file); if (tmp->vm_flags & VM_DENYWRITE) atomic_dec(&inode->i_writecount); spin_lock(&mapping->i_mmap_lock); if (tmp->vm_flags & VM_SHARED) mapping->i_mmap_writable++; tmp->vm_truncate_count = mpnt->vm_truncate_count; flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ vma_prio_tree_add(tmp, mpnt); flush_dcache_mmap_unlock(mapping); spin_unlock(&mapping->i_mmap_lock); } /* * Clear hugetlb-related page 
reserves for children. This only * affects MAP_PRIVATE mappings. Faults generated by the child * are not guaranteed to succeed, even if read-only */ if (is_vm_hugetlb_page(tmp)) reset_vma_resv_huge_pages(tmp); /* * Link in the new vma and copy the page table entries. */ *pprev = tmp; pprev = &tmp->vm_next; __vma_link_rb(mm, tmp, rb_link, rb_parent); rb_link = &tmp->vm_rb.rb_right; rb_parent = &tmp->vm_rb; mm->map_count++; retval = copy_page_range(mm, oldmm, mpnt); if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); if (retval) goto out; }
/*
 * switch_mm_irqs_off - switch this CPU's loaded mm to @next.
 * @prev: not read by this function; the mm actually loaded is taken from
 *        cpu_tlbstate.loaded_mm instead
 * @next: mm to load
 * @tsk:  task being switched to; only passed on to cond_ibpb()
 *
 * If @next is already loaded, CR3 is rewritten only when the CPU was lazy
 * and the mm's tlb_gen has moved on since.  Otherwise the CPU is moved
 * between the mm cpumasks (init_mm excluded), an ASID is chosen and CR3 is
 * loaded with or without a flush.  cpu_tlbstate is updated afterwards, and
 * CR4/LDT state is switched when the mm really changed.
 */
void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
			struct task_struct *tsk)
{
	struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
	u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
	bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
	unsigned cpu = smp_processor_id();
	u64 next_tlb_gen;
	bool need_flush;
	u16 new_asid;

	/*
	 * NB: The scheduler will call us with prev == next when switching
	 * from lazy TLB mode to normal mode if active_mm isn't changing.
	 * When this happens, we don't assume that CR3 (and hence
	 * cpu_tlbstate.loaded_mm) matches next.
	 *
	 * NB: leave_mm() calls us with prev == NULL and tsk == NULL.
	 */

	/* We don't want flush_tlb_func_* to run concurrently with us. */
	if (IS_ENABLED(CONFIG_PROVE_LOCKING))
		WARN_ON_ONCE(!irqs_disabled());

	/*
	 * Verify that CR3 is what we think it is.  This will catch
	 * hypothetical buggy code that directly switches to swapper_pg_dir
	 * without going through leave_mm() / switch_mm_irqs_off() or that
	 * does something like write_cr3(read_cr3_pa()).
	 *
	 * Only do this check if CONFIG_DEBUG_VM=y because __read_cr3()
	 * isn't free.
	 */
#ifdef CONFIG_DEBUG_VM
	if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) {
		/*
		 * If we were to BUG here, we'd be very likely to kill
		 * the system so hard that we don't see the call trace.
		 * Try to recover instead by ignoring the error and doing
		 * a global flush to minimize the chance of corruption.
		 *
		 * (This is far from being a fully correct recovery.
		 *  Architecturally, the CPU could prefetch something
		 *  back into an incorrect ASID slot and leave it there
		 *  to cause trouble down the road.  It's better than
		 *  nothing, though.)
		 */
		__flush_tlb_all();
	}
#endif
	this_cpu_write(cpu_tlbstate.is_lazy, false);

	/*
	 * The membarrier system call requires a full memory barrier and
	 * core serialization before returning to user-space, after
	 * storing to rq->curr. Writing to CR3 provides that full
	 * memory barrier and core serializing instruction.
	 */
	if (real_prev == next) {
		VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
			   next->context.ctx_id);

		/*
		 * Even in lazy TLB mode, the CPU should stay set in the
		 * mm_cpumask. The TLB shootdown code can figure out
		 * from cpu_tlbstate.is_lazy whether or not to send an IPI.
		 */
		if (WARN_ON_ONCE(real_prev != &init_mm &&
				 !cpumask_test_cpu(cpu, mm_cpumask(next))))
			cpumask_set_cpu(cpu, mm_cpumask(next));

		/*
		 * If the CPU is not in lazy TLB mode, we are just switching
		 * from one thread in a process to another thread in the same
		 * process. No TLB flush required.
		 */
		if (!was_lazy)
			return;

		/*
		 * Read the tlb_gen to check whether a flush is needed.
		 * If the TLB is up to date, just use it.
		 * The barrier synchronizes with the tlb_gen increment in
		 * the TLB shootdown code.
		 */
		smp_mb();
		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
		if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
				next_tlb_gen)
			return;

		/*
		 * TLB contents went out of date while we were in lazy
		 * mode. Fall through to the TLB switching code below.
		 */
		new_asid = prev_asid;
		need_flush = true;
	} else {
		/*
		 * Avoid user/user BTB poisoning by flushing the branch
		 * predictor when switching between processes. This stops
		 * one process from doing Spectre-v2 attacks on another.
		 */
		cond_ibpb(tsk);

		if (IS_ENABLED(CONFIG_VMAP_STACK)) {
			/*
			 * If our current stack is in vmalloc space and isn't
			 * mapped in the new pgd, we'll double-fault.  Forcibly
			 * map it.
			 */
			sync_current_stack_to_mm(next);
		}

		/*
		 * Stop remote flushes for the previous mm.
		 * Skip kernel threads; we never send init_mm TLB flushing IPIs,
		 * but the bitmap manipulation can cause cache line contention.
		 */
		if (real_prev != &init_mm) {
			VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu,
						mm_cpumask(real_prev)));
			cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
		}

		/*
		 * Start remote flushes and then read tlb_gen.
		 */
		if (next != &init_mm)
			cpumask_set_cpu(cpu, mm_cpumask(next));
		next_tlb_gen = atomic64_read(&next->context.tlb_gen);

		choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);

		/* Let nmi_uaccess_okay() know that we're changing CR3. */
		this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
		barrier();
	}

	if (need_flush) {
		/* Record what this ASID slot will hold before loading it. */
		this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
		this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
		load_new_mm_cr3(next->pgd, new_asid, true);

		/*
		 * NB: This gets called via leave_mm() in the idle path
		 * where RCU functions differently.  Tracing normally
		 * uses RCU, so we need to use the _rcuidle variant.
		 *
		 * (There is no good reason for this.  The idle code should
		 *  be rearranged to call this before rcu_idle_enter().)
		 */
		trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
	} else {
		/* The new ASID is already up to date. */
		load_new_mm_cr3(next->pgd, new_asid, false);

		/* See above wrt _rcuidle. */
		trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
	}

	/* Make sure we write CR3 before loaded_mm. */
	barrier();

	this_cpu_write(cpu_tlbstate.loaded_mm, next);
	this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);

	/* CR4 and LDT only need switching when the mm actually changed. */
	if (next != real_prev) {
		load_mm_cr4(next);
		switch_ldt(real_prev, next);
	}
}