/** * __get_user_pages() - pin user pages in memory * @tsk: task_struct of target task * @mm: mm_struct of target mm * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. Or NULL, if caller * only intends to ensure the pages are faulted in. * @vmas: array of pointers to vmas corresponding to each page. * Or NULL if the caller does not require them. * @nonblocking: whether waiting for disk IO or mmap_sem contention * * Returns number of pages pinned. This may be fewer than the number * requested. If nr_pages is 0 or negative, returns 0. If no pages * were pinned, returns -errno. Each page returned must be released * with a put_page() call when it is finished with. vmas will only * remain valid while mmap_sem is held. * * Must be called with mmap_sem held. It may be released. See below. * * __get_user_pages walks a process's page tables and takes a reference to * each struct page that each user address corresponds to at a given * instant. That is, it takes the page that would be accessed if a user * thread accesses the given user virtual address at that instant. * * This does not guarantee that the page exists in the user mappings when * __get_user_pages returns, and there may even be a completely different * page there in some cases (eg. if mmapped pagecache has been invalidated * and subsequently re faulted). However it does guarantee that the page * won't be freed completely. And mostly callers simply care that the page * contains data that was valid *at some point in time*. Typically, an IO * or similar operation cannot guarantee anything stronger anyway because * locks can't be held over the syscall boundary. * * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If * the page is written to, set_page_dirty (or set_page_dirty_lock, as * appropriate) must be called after the page is finished with, and * before put_page is called. * * If @nonblocking != NULL, __get_user_pages will not wait for disk IO * or mmap_sem contention, and if waiting is needed to pin all pages, * *@nonblocking will be set to 0. Further, if @gup_flags does not * include FOLL_NOWAIT, the mmap_sem will be released via up_read() in * this case. * * A caller using such a combination of @nonblocking and @gup_flags * must therefore hold the mmap_sem for reading only, and recognize * when it's been released. Otherwise, it must be held for either * reading or writing and will not be released. * * In most cases, get_user_pages or get_user_pages_fast should be used * instead of __get_user_pages. __get_user_pages should be used only if * you need some special @gup_flags. */ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *nonblocking) { long i = 0; unsigned int page_mask; struct vm_area_struct *vma = NULL; if (!nr_pages) return 0; VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); /* * If FOLL_FORCE is set then do not force a full fault as the hinting * fault information is unrelated to the reference behaviour of a task * using the address space */ if (!(gup_flags & FOLL_FORCE)) gup_flags |= FOLL_NUMA; do { struct page *page; unsigned int foll_flags = gup_flags; unsigned int page_increm; /* first iteration or cross vma bound */ if (!vma || start >= vma->vm_end) { vma = find_extend_vma(mm, start); if (!vma && in_gate_area(mm, start)) { int ret; ret = get_gate_page(mm, start & PAGE_MASK, gup_flags, &vma, pages ? &pages[i] : NULL); if (ret) return i ? : ret; page_mask = 0; goto next_page; } if (!vma || check_vma_flags(vma, gup_flags)) return i ? : -EFAULT; if (is_vm_hugetlb_page(vma)) { i = follow_hugetlb_page(mm, vma, pages, vmas, &start, &nr_pages, i, gup_flags); continue; } }
static void pack_process_mm_stats(struct bproc_status_msg_t *msg, struct mm_struct *mm) { struct vm_area_struct *vma; memset(&msg->vm, 0, sizeof(msg->vm)); msg->vm.statm.resident = mm->rss; msg->vm.status.total_vm = mm->total_vm; msg->vm.status.locked_vm = mm->locked_vm; msg->vm.status.rss = mm->rss; /* This code is stolen from the Linux kernel (fs/proc/task_mmu.c) */ for (vma = mm->mmap; vma; vma = vma->vm_next) { { /* CODE FOR statm */ int pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; msg->vm.statm.size += pages; if (is_vm_hugetlb_page(vma)) { if (!(vma->vm_flags & VM_DONTCOPY)) msg->vm.statm.shared += pages; continue; } if (vma->vm_file) msg->vm.statm.shared += pages; if (vma->vm_flags & VM_EXECUTABLE) msg->vm.statm.text += pages; else msg->vm.statm.data += pages; } { /* CODE FOR status */ unsigned long len = (vma->vm_end - vma->vm_start) >> 10; if (!vma->vm_file) { msg->vm.status.data += len; if (vma->vm_flags & VM_GROWSDOWN) msg->vm.status.stack += len; continue; } if (vma->vm_flags & VM_WRITE) continue; if (vma->vm_flags & VM_EXEC) { msg->vm.status.exec += len; if (vma->vm_flags & VM_EXECUTABLE) continue; msg->vm.status.lib += len; } } } }
static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk) { struct vm_area_struct *vma; /* We don't need vma lookup at all. */ if (!walk->hugetlb_entry) return NULL; VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem)); vma = find_vma(walk->mm, addr); if (vma && vma->vm_start <= addr && is_vm_hugetlb_page(vma)) return vma; return NULL; }
void local_flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { struct mm_struct *mm = vma->vm_mm; int cpu = smp_processor_id(); if (cpu_context(cpu, mm) != 0) { unsigned long size, flags; int huge = is_vm_hugetlb_page(vma); ENTER_CRITICAL(flags); if (huge) { start = round_down(start, HPAGE_SIZE); end = round_up(end, HPAGE_SIZE); size = (end - start) >> HPAGE_SHIFT; } else {
static int __filemap_sync(struct vm_area_struct *vma, unsigned long address, size_t size, unsigned int flags) { pgd_t *pgd; unsigned long end = address + size; unsigned long next; int i; int error = 0; /* Aquire the lock early; it may be possible to avoid dropping * and reaquiring it repeatedly. */ spin_lock(&vma->vm_mm->page_table_lock); pgd = pgd_offset(vma->vm_mm, address); flush_cache_range(vma, address, end); /* For hugepages we can't go walking the page table normally, * but that's ok, hugetlbfs is memory based, so we don't need * to do anything more on an msync() */ if (is_vm_hugetlb_page(vma)) goto out; if (address >= end) BUG(); for (i = pgd_index(address); i <= pgd_index(end-1); i++) { next = (address + PGDIR_SIZE) & PGDIR_MASK; if (next <= address || next > end) next = end; error |= filemap_sync_pud_range(pgd, address, next, vma, flags); address = next; pgd++; } /* * Why flush ? filemap_sync_pte already flushed the tlbs with the * dirty bits. */ flush_tlb_range(vma, end - size, end); out: spin_unlock(&vma->vm_mm->page_table_lock); return error; }
static int filemap_sync(struct vm_area_struct * vma, unsigned long address, size_t size, unsigned int flags) { pgd_t * dir; unsigned long end = address + size; int error = 0; /* Aquire the lock early; it may be possible to avoid dropping * and reaquiring it repeatedly. */ spin_lock(&vma->vm_mm->page_table_lock); dir = pgd_offset(vma->vm_mm, address); flush_cache_range(vma, address, end); /* For hugepages we can't go walking the page table normally, * but that's ok, hugetlbfs is memory based, so we don't need * to do anything more on an msync() */ if (is_vm_hugetlb_page(vma)) goto out; if (address >= end) BUG(); do { error |= filemap_sync_pmd_range(dir, address, end, vma, flags); address = (address + PGDIR_SIZE) & PGDIR_MASK; dir++; } while (address && (address < end)); /* * Why flush ? filemap_sync_pte already flushed the tlbs with the * dirty bits. */ flush_tlb_range(vma, end - size, end); out: spin_unlock(&vma->vm_mm->page_table_lock); return error; }
static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_memory_slot *memslot, unsigned long hva, unsigned long fault_status) { int ret; bool write_fault, writable, hugetlb = false, force_pte = false; unsigned long mmu_seq; gfn_t gfn = fault_ipa >> PAGE_SHIFT; struct kvm *kvm = vcpu->kvm; struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; struct vm_area_struct *vma; pfn_t pfn; pgprot_t mem_type = PAGE_S2; bool fault_ipa_uncached; bool logging_active = memslot_is_logging(memslot); unsigned long flags = 0; write_fault = kvm_is_write_fault(vcpu); if (fault_status == FSC_PERM && !write_fault) { kvm_err("Unexpected L2 read permission error\n"); return -EFAULT; } /* Let's check if we will get back a huge page backed by hugetlbfs */ down_read(¤t->mm->mmap_sem); vma = find_vma_intersection(current->mm, hva, hva + 1); if (unlikely(!vma)) { kvm_err("Failed to find VMA for hva 0x%lx\n", hva); up_read(¤t->mm->mmap_sem); return -EFAULT; } if (is_vm_hugetlb_page(vma) && !logging_active) { hugetlb = true; gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT; } else {
/* * IA-32 Huge TLB Page Support for Kernel. * * Copyright (C) 2002, Rohit Seth <*****@*****.**> */ #include <linux/init.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/hugetlb.h> #include <linux/pagemap.h> #include <linux/err.h> #include <linux/sysctl.h> #include <asm/mman.h> #include <asm/tlb.h> #include <asm/tlbflush.h> #include <asm/pgalloc.h> #if 0 /* This is just for testing */ struct page * follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) { unsigned long start = address; int length = 1; int nr; struct page *page; struct vm_area_struct *vma; vma = find_vma(mm, addr); if (!vma || !is_vm_hugetlb_page(vma)) return ERR_PTR(-EINVAL); pte = huge_pte_offset(mm, address); /* hugetlb should be locked, and hence, prefaulted */ WARN_ON(!pte || pte_none(*pte)); page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; WARN_ON(!PageHead(page)); return page; }
static inline int do_exception(struct pt_regs *regs, int access) { struct task_struct *tsk; struct mm_struct *mm; struct vm_area_struct *vma; unsigned long trans_exc_code; unsigned long address; unsigned int flags; int fault; if (notify_page_fault(regs)) return 0; tsk = current; mm = tsk->mm; trans_exc_code = regs->int_parm_long; fault = VM_FAULT_BADCONTEXT; if (unlikely(!user_space_fault(trans_exc_code) || in_atomic() || !mm)) goto out; address = trans_exc_code & __FAIL_ADDR_MASK; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); flags = FAULT_FLAG_ALLOW_RETRY; if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400) flags |= FAULT_FLAG_WRITE; down_read(&mm->mmap_sem); #ifdef CONFIG_PGSTE if (test_tsk_thread_flag(current, TIF_SIE) && S390_lowcore.gmap) { address = __gmap_fault(address, (struct gmap *) S390_lowcore.gmap); if (address == -EFAULT) { fault = VM_FAULT_BADMAP; goto out_up; } if (address == -ENOMEM) { fault = VM_FAULT_OOM; goto out_up; } } #endif retry: fault = VM_FAULT_BADMAP; vma = find_vma(mm, address); if (!vma) goto out_up; if (unlikely(vma->vm_start > address)) { if (!(vma->vm_flags & VM_GROWSDOWN)) goto out_up; if (expand_stack(vma, address)) goto out_up; } fault = VM_FAULT_BADACCESS; if (unlikely(!(vma->vm_flags & access))) goto out_up; if (is_vm_hugetlb_page(vma)) address &= HPAGE_MASK; fault = handle_mm_fault(mm, vma, address, flags); if (unlikely(fault & VM_FAULT_ERROR)) goto out_up; if (flags & FAULT_FLAG_ALLOW_RETRY) { if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); } else { tsk->min_flt++; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); } if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; down_read(&mm->mmap_sem); goto retry; } } clear_tsk_thread_flag(tsk, TIF_PER_TRAP); fault = 0; out_up: up_read(&mm->mmap_sem); out: return fault; }
static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) { struct vm_area_struct *mpnt, *tmp, *prev, **pprev; struct rb_node **rb_link, *rb_parent; int retval; unsigned long charge; struct mempolicy *pol; down_write(&oldmm->mmap_sem); flush_cache_dup_mm(oldmm); uprobe_dup_mmap(oldmm, mm); /* * Not linked in yet - no deadlock potential: */ down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); mm->locked_vm = 0; mm->mmap = NULL; mm->mmap_cache = NULL; mm->free_area_cache = oldmm->mmap_base; mm->cached_hole_size = ~0UL; mm->map_count = 0; cpumask_clear(mm_cpumask(mm)); mm->mm_rb = RB_ROOT; rb_link = &mm->mm_rb.rb_node; rb_parent = NULL; pprev = &mm->mmap; retval = ksm_fork(mm, oldmm); if (retval) goto out; retval = khugepaged_fork(mm, oldmm); if (retval) goto out; prev = NULL; for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { struct file *file; if (mpnt->vm_flags & VM_DONTCOPY) { vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, -vma_pages(mpnt)); continue; } charge = 0; if (mpnt->vm_flags & VM_ACCOUNT) { unsigned long len = vma_pages(mpnt); if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ goto fail_nomem; charge = len; } tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (!tmp) goto fail_nomem; *tmp = *mpnt; INIT_LIST_HEAD(&tmp->anon_vma_chain); pol = mpol_dup(vma_policy(mpnt)); retval = PTR_ERR(pol); if (IS_ERR(pol)) goto fail_nomem_policy; vma_set_policy(tmp, pol); tmp->vm_mm = mm; if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; tmp->vm_flags &= ~VM_LOCKED; tmp->vm_next = tmp->vm_prev = NULL; file = tmp->vm_file; if (file) { struct inode *inode = file->f_path.dentry->d_inode; struct address_space *mapping = file->f_mapping; get_file(file); if (tmp->vm_prfile) get_file(tmp->vm_prfile); if (tmp->vm_flags & VM_DENYWRITE) atomic_dec(&inode->i_writecount); mutex_lock(&mapping->i_mmap_mutex); if (tmp->vm_flags & VM_SHARED) mapping->i_mmap_writable++; flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ if (unlikely(tmp->vm_flags & VM_NONLINEAR)) vma_nonlinear_insert(tmp, &mapping->i_mmap_nonlinear); else vma_interval_tree_insert_after(tmp, mpnt, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); mutex_unlock(&mapping->i_mmap_mutex); } /* * Clear hugetlb-related page reserves for children. This only * affects MAP_PRIVATE mappings. Faults generated by the child * are not guaranteed to succeed, even if read-only */ if (is_vm_hugetlb_page(tmp)) reset_vma_resv_huge_pages(tmp); /* * Link in the new vma and copy the page table entries. */ *pprev = tmp; pprev = &tmp->vm_next; tmp->vm_prev = prev; prev = tmp; __vma_link_rb(mm, tmp, rb_link, rb_parent); rb_link = &tmp->vm_rb.rb_right; rb_parent = &tmp->vm_rb; mm->map_count++; retval = copy_page_range(mm, oldmm, mpnt); if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); if (retval) goto out; } /* a new mm has just been created */ arch_dup_mmap(oldmm, mm); retval = 0; out: up_write(&mm->mmap_sem); flush_tlb_mm(oldmm); up_write(&oldmm->mmap_sem); return retval; fail_nomem_anon_vma_fork: mpol_put(pol); fail_nomem_policy: kmem_cache_free(vm_area_cachep, tmp); fail_nomem: retval = -ENOMEM; vm_unacct_memory(charge); goto out; }
/* * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate * routines. * * error_code: * 04 Protection -> Write-Protection (suprression) * 10 Segment translation -> Not present (nullification) * 11 Page translation -> Not present (nullification) * 3b Region third trans. -> Not present (nullification) */ static inline void __kprobes do_exception(struct pt_regs *regs, unsigned long error_code, int is_protection) { struct task_struct *tsk; struct mm_struct *mm; struct vm_area_struct * vma; unsigned long address; int user_address; const struct exception_table_entry *fixup; int si_code = SEGV_MAPERR; tsk = current; mm = tsk->mm; if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, SIGSEGV) == NOTIFY_STOP) return; /* * Check for low-address protection. This needs to be treated * as a special case because the translation exception code * field is not guaranteed to contain valid data in this case. */ if (is_protection && !(S390_lowcore.trans_exc_code & 4)) { /* Low-address protection hit in kernel mode means NULL pointer write access in kernel mode. */ if (!(regs->psw.mask & PSW_MASK_PSTATE)) { address = 0; user_address = 0; goto no_context; } /* Low-address protection hit in user mode 'cannot happen'. */ die ("Low-address protection", regs, error_code); do_exit(SIGKILL); } /* * get the failing address * more specific the segment and page table portion of * the address */ address = S390_lowcore.trans_exc_code & __FAIL_ADDR_MASK; user_address = check_user_space(regs, error_code); /* * Verify that the fault happened in user space, that * we are not in an interrupt and that there is a * user context. */ if (user_address == 0 || in_atomic() || !mm) goto no_context; /* * When we get here, the fault happened in the current * task's user address space, so we can switch on the * interrupts again and then search the VMAs */ local_irq_enable(); down_read(&mm->mmap_sem); vma = find_vma(mm, address); if (!vma) goto bad_area; if (vma->vm_start <= address) goto good_area; if (!(vma->vm_flags & VM_GROWSDOWN)) goto bad_area; if (expand_stack(vma, address)) goto bad_area; /* * Ok, we have a good vm_area for this memory access, so * we can handle it.. */ good_area: si_code = SEGV_ACCERR; if (!is_protection) { /* page not present, check vm flags */ if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) goto bad_area; } else { if (!(vma->vm_flags & VM_WRITE)) goto bad_area; } survive: if (is_vm_hugetlb_page(vma)) address &= HPAGE_MASK; /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo * the fault. */ switch (handle_mm_fault(mm, vma, address, is_protection)) { case VM_FAULT_MINOR: tsk->min_flt++; break; case VM_FAULT_MAJOR: tsk->maj_flt++; break; case VM_FAULT_SIGBUS: goto do_sigbus; case VM_FAULT_OOM: goto out_of_memory; default: BUG(); } up_read(&mm->mmap_sem); /* * The instruction that caused the program check will * be repeated. Don't signal single step via SIGTRAP. */ clear_tsk_thread_flag(current, TIF_SINGLE_STEP); return; /* * Something tried to access memory that isn't in our memory map.. * Fix it, but check if it's kernel or user first.. */ bad_area: up_read(&mm->mmap_sem); /* User mode accesses just cause a SIGSEGV */ if (regs->psw.mask & PSW_MASK_PSTATE) { tsk->thread.prot_addr = address; tsk->thread.trap_no = error_code; do_sigsegv(regs, error_code, si_code, address); return; } no_context: /* Are we prepared to handle this kernel fault? */ fixup = search_exception_tables(regs->psw.addr & __FIXUP_MASK); if (fixup) { regs->psw.addr = fixup->fixup | PSW_ADDR_AMODE; return; } /* * Oops. The kernel tried to access some bad page. We'll have to * terminate things with extreme prejudice. */ if (user_address == 0) printk(KERN_ALERT "Unable to handle kernel pointer dereference" " at virtual kernel address %p\n", (void *)address); else printk(KERN_ALERT "Unable to handle kernel paging request" " at virtual user address %p\n", (void *)address); die("Oops", regs, error_code); do_exit(SIGKILL); /* * We ran out of memory, or some other thing happened to us that made * us unable to handle the page fault gracefully. */ out_of_memory: up_read(&mm->mmap_sem); if (tsk->pid == 1) { yield(); goto survive; } printk("VM: killing process %s\n", tsk->comm); if (regs->psw.mask & PSW_MASK_PSTATE) do_exit(SIGKILL); goto no_context; do_sigbus: up_read(&mm->mmap_sem); /* * Send a sigbus, regardless of whether we were in kernel * or user mode. */ tsk->thread.prot_addr = address; tsk->thread.trap_no = error_code; force_sig(SIGBUS, tsk); /* Kernel mode? Handle exceptions or die */ if (!(regs->psw.mask & PSW_MASK_PSTATE)) goto no_context; }
static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, bool zeropage) { struct vm_area_struct *dst_vma; ssize_t err; pmd_t *dst_pmd; unsigned long src_addr, dst_addr; long copied; struct page *page; /* * Sanitize the command parameters: */ BUG_ON(dst_start & ~PAGE_MASK); BUG_ON(len & ~PAGE_MASK); /* Does the address range wrap, or is the span zero-sized? */ BUG_ON(src_start + len <= src_start); BUG_ON(dst_start + len <= dst_start); src_addr = src_start; dst_addr = dst_start; copied = 0; page = NULL; retry: down_read(&dst_mm->mmap_sem); /* * Make sure the vma is not shared, that the dst range is * both valid and fully within a single existing vma. */ err = -ENOENT; dst_vma = find_vma(dst_mm, dst_start); if (!dst_vma) goto out_unlock; /* * Be strict and only allow __mcopy_atomic on userfaultfd * registered ranges to prevent userland errors going * unnoticed. As far as the VM consistency is concerned, it * would be perfectly safe to remove this check, but there's * no useful usage for __mcopy_atomic ouside of userfaultfd * registered ranges. This is after all why these are ioctls * belonging to the userfaultfd and not syscalls. */ if (!dst_vma->vm_userfaultfd_ctx.ctx) goto out_unlock; if (dst_start < dst_vma->vm_start || dst_start + len > dst_vma->vm_end) goto out_unlock; err = -EINVAL; /* * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but * it will overwrite vm_ops, so vma_is_anonymous must return false. */ if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) && dst_vma->vm_flags & VM_SHARED)) goto out_unlock; /* * If this is a HUGETLB vma, pass off to appropriate routine */ if (is_vm_hugetlb_page(dst_vma)) return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start, src_start, len, zeropage); if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) goto out_unlock; /* * Ensure the dst_vma has a anon_vma or this page * would get a NULL anon_vma when moved in the * dst_vma. */ err = -ENOMEM; if (vma_is_anonymous(dst_vma) && unlikely(anon_vma_prepare(dst_vma))) goto out_unlock; while (src_addr < src_start + len) { pmd_t dst_pmdval; BUG_ON(dst_addr >= dst_start + len); dst_pmd = mm_alloc_pmd(dst_mm, dst_addr); if (unlikely(!dst_pmd)) { err = -ENOMEM; break; } dst_pmdval = pmd_read_atomic(dst_pmd); /* * If the dst_pmd is mapped as THP don't * override it and just be strict. */ if (unlikely(pmd_trans_huge(dst_pmdval))) { err = -EEXIST; break; } if (unlikely(pmd_none(dst_pmdval)) && unlikely(__pte_alloc(dst_mm, dst_pmd, dst_addr))) { err = -ENOMEM; break; } /* If an huge pmd materialized from under us fail */ if (unlikely(pmd_trans_huge(*dst_pmd))) { err = -EFAULT; break; } BUG_ON(pmd_none(*dst_pmd)); BUG_ON(pmd_trans_huge(*dst_pmd)); err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, src_addr, &page, zeropage); cond_resched(); if (unlikely(err == -EFAULT)) { void *page_kaddr; up_read(&dst_mm->mmap_sem); BUG_ON(!page); page_kaddr = kmap(page); err = copy_from_user(page_kaddr, (const void __user *) src_addr, PAGE_SIZE); kunmap(page); if (unlikely(err)) { err = -EFAULT; goto out; } goto retry; } else BUG_ON(page); if (!err) { dst_addr += PAGE_SIZE; src_addr += PAGE_SIZE; copied += PAGE_SIZE; if (fatal_signal_pending(current)) err = -EINTR; } if (err) break; } out_unlock: up_read(&dst_mm->mmap_sem); out: if (page) put_page(page); BUG_ON(copied < 0); BUG_ON(err > 0); BUG_ON(!copied && !err); return copied ? copied : err; }
/* * Hacked from kernel function __get_user_pages in mm/memory.c * * Handle buffers allocated by other kernel space driver and mmaped into user * space, function Ignore the VM_PFNMAP and VM_IO flag in VMA structure * * Get physical pages from user space virtual address and update into page list */ static int __get_pfnmap_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas) { int i, ret; unsigned long vm_flags; if (nr_pages <= 0) return 0; VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); /* * Require read or write permissions. * If FOLL_FORCE is set, we only require the "MAY" flags. */ vm_flags = (gup_flags & FOLL_WRITE) ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); vm_flags &= (gup_flags & FOLL_FORCE) ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); i = 0; do { struct vm_area_struct *vma; vma = find_vma(mm, start); if (!vma) { dev_err(atomisp_dev, "find_vma failed\n"); return i ? : -EFAULT; } if (is_vm_hugetlb_page(vma)) { /* i = follow_hugetlb_page(mm, vma, pages, vmas, &start, &nr_pages, i, gup_flags); */ continue; } do { struct page *page; unsigned long pfn; /* * If we have a pending SIGKILL, don't keep faulting * pages and potentially allocating memory. */ if (unlikely(fatal_signal_pending(current))) { dev_err(atomisp_dev, "fatal_signal_pending in %s\n", __func__); return i ? i : -ERESTARTSYS; } ret = follow_pfn(vma, start, &pfn); if (ret) { dev_err(atomisp_dev, "follow_pfn() failed\n"); return i ? : -EFAULT; } page = pfn_to_page(pfn); if (IS_ERR(page)) return i ? i : PTR_ERR(page); if (pages) { pages[i] = page; get_page(page); flush_anon_page(vma, page, start); flush_dcache_page(page); } if (vmas) vmas[i] = vma; i++; start += PAGE_SIZE; nr_pages--; } while (nr_pages && start < vma->vm_end); } while (nr_pages);
static bool __oom_reap_task(struct task_struct *tsk) { struct mmu_gather tlb; struct vm_area_struct *vma; struct mm_struct *mm = NULL; struct task_struct *p; struct zap_details details = {.check_swap_entries = true, .ignore_dirty = true}; bool ret = true; /* * We have to make sure to not race with the victim exit path * and cause premature new oom victim selection: * __oom_reap_task exit_mm * atomic_inc_not_zero * mmput * atomic_dec_and_test * exit_oom_victim * [...] * out_of_memory * select_bad_process * # no TIF_MEMDIE task selects new victim * unmap_page_range # frees some memory */ mutex_lock(&oom_lock); /* * Make sure we find the associated mm_struct even when the particular * thread has already terminated and cleared its mm. * We might have race with exit path so consider our work done if there * is no mm. */ p = find_lock_task_mm(tsk); if (!p) goto unlock_oom; mm = p->mm; atomic_inc(&mm->mm_users); task_unlock(p); if (!down_read_trylock(&mm->mmap_sem)) { ret = false; goto unlock_oom; } tlb_gather_mmu(&tlb, mm, 0, -1); for (vma = mm->mmap ; vma; vma = vma->vm_next) { if (is_vm_hugetlb_page(vma)) continue; /* * mlocked VMAs require explicit munlocking before unmap. * Let's keep it simple here and skip such VMAs. */ if (vma->vm_flags & VM_LOCKED) continue; /* * Only anonymous pages have a good chance to be dropped * without additional steps which we cannot afford as we * are OOM already. * * We do not even care about fs backed pages because all * which are reclaimable have already been reclaimed and * we do not want to block exit_mmap by keeping mm ref * count elevated without a good reason. */ if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end, &details); } tlb_finish_mmu(&tlb, 0, -1); pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", task_pid_nr(tsk), tsk->comm, K(get_mm_counter(mm, MM_ANONPAGES)), K(get_mm_counter(mm, MM_FILEPAGES)), K(get_mm_counter(mm, MM_SHMEMPAGES))); up_read(&mm->mmap_sem); /* * This task can be safely ignored because we cannot do much more * to release its memory. */ set_bit(MMF_OOM_REAPED, &mm->flags); unlock_oom: mutex_unlock(&oom_lock); /* * Drop our reference but make sure the mmput slow path is called from a * different context because we shouldn't risk we get stuck there and * put the oom_reaper out of the way. */ if (mm) mmput_async(mm); return ret; } #define MAX_OOM_REAP_RETRIES 10 static void oom_reap_task(struct task_struct *tsk) { int attempts = 0; /* Retry the down_read_trylock(mmap_sem) a few times */ while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task(tsk)) schedule_timeout_idle(HZ/10); if (attempts > MAX_OOM_REAP_RETRIES) { pr_info("oom_reaper: unable to reap pid:%d (%s)\n", task_pid_nr(tsk), tsk->comm); debug_show_all_locks(); } /* * Clear TIF_MEMDIE because the task shouldn't be sitting on a * reasonably reclaimable memory anymore or it is not a good candidate * for the oom victim right now because it cannot release its memory * itself nor by the oom reaper. */ tsk->oom_reaper_list = NULL; exit_oom_victim(tsk); /* Drop a reference taken by wake_oom_reaper */ put_task_struct(tsk); }
/* * __mcopy_atomic processing for HUGETLB vmas. Note that this routine is * called with mmap_sem held, it will release mmap_sem before returning. */ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma, unsigned long dst_start, unsigned long src_start, unsigned long len, bool zeropage) { int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED; int vm_shared = dst_vma->vm_flags & VM_SHARED; ssize_t err; pte_t *dst_pte; unsigned long src_addr, dst_addr; long copied; struct page *page; struct hstate *h; unsigned long vma_hpagesize; pgoff_t idx; u32 hash; struct address_space *mapping; /* * There is no default zero huge page for all huge page sizes as * supported by hugetlb. A PMD_SIZE huge pages may exist as used * by THP. Since we can not reliably insert a zero page, this * feature is not supported. */ if (zeropage) { up_read(&dst_mm->mmap_sem); return -EINVAL; } src_addr = src_start; dst_addr = dst_start; copied = 0; page = NULL; vma_hpagesize = vma_kernel_pagesize(dst_vma); /* * Validate alignment based on huge page size */ err = -EINVAL; if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1)) goto out_unlock; retry: /* * On routine entry dst_vma is set. If we had to drop mmap_sem and * retry, dst_vma will be set to NULL and we must lookup again. */ if (!dst_vma) { err = -ENOENT; dst_vma = find_vma(dst_mm, dst_start); if (!dst_vma || !is_vm_hugetlb_page(dst_vma)) goto out_unlock; /* * Check the vma is registered in uffd, this is * required to enforce the VM_MAYWRITE check done at * uffd registration time. */ if (!dst_vma->vm_userfaultfd_ctx.ctx) goto out_unlock; if (dst_start < dst_vma->vm_start || dst_start + len > dst_vma->vm_end) goto out_unlock; err = -EINVAL; if (vma_hpagesize != vma_kernel_pagesize(dst_vma)) goto out_unlock; vm_shared = dst_vma->vm_flags & VM_SHARED; } if (WARN_ON(dst_addr & (vma_hpagesize - 1) || (len - copied) & (vma_hpagesize - 1))) goto out_unlock; /* * If not shared, ensure the dst_vma has a anon_vma. */ err = -ENOMEM; if (!vm_shared) { if (unlikely(anon_vma_prepare(dst_vma))) goto out_unlock; } h = hstate_vma(dst_vma); while (src_addr < src_start + len) { pte_t dst_pteval; BUG_ON(dst_addr >= dst_start + len); VM_BUG_ON(dst_addr & ~huge_page_mask(h)); /* * Serialize via hugetlb_fault_mutex */ idx = linear_page_index(dst_vma, dst_addr); mapping = dst_vma->vm_file->f_mapping; hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping, idx, dst_addr); mutex_lock(&hugetlb_fault_mutex_table[hash]); err = -ENOMEM; dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h)); if (!dst_pte) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); goto out_unlock; } err = -EEXIST; dst_pteval = huge_ptep_get(dst_pte); if (!huge_pte_none(dst_pteval)) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); goto out_unlock; } err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, src_addr, &page); mutex_unlock(&hugetlb_fault_mutex_table[hash]); vm_alloc_shared = vm_shared; cond_resched(); if (unlikely(err == -ENOENT)) { up_read(&dst_mm->mmap_sem); BUG_ON(!page); err = copy_huge_page_from_user(page, (const void __user *)src_addr, pages_per_huge_page(h), true); if (unlikely(err)) { err = -EFAULT; goto out; } down_read(&dst_mm->mmap_sem); dst_vma = NULL; goto retry; } else BUG_ON(page); if (!err) { dst_addr += vma_hpagesize; src_addr += vma_hpagesize; copied += vma_hpagesize; if (fatal_signal_pending(current)) err = -EINTR; } if (err) break; } out_unlock: up_read(&dst_mm->mmap_sem); out: if (page) { /* * We encountered an error and are about to free a newly * allocated huge page. * * Reservation handling is very subtle, and is different for * private and shared mappings. See the routine * restore_reserve_on_error for details. Unfortunately, we * can not call restore_reserve_on_error now as it would * require holding mmap_sem. * * If a reservation for the page existed in the reservation * map of a private mapping, the map was modified to indicate * the reservation was consumed when the page was allocated. * We clear the PagePrivate flag now so that the global * reserve count will not be incremented in free_huge_page. * The reservation map will still indicate the reservation * was consumed and possibly prevent later page allocation. * This is better than leaking a global reservation. If no * reservation existed, it is still safe to clear PagePrivate * as no adjustments to reservation counts were made during * allocation. * * The reservation map for shared mappings indicates which * pages have reservations. When a huge page is allocated * for an address with a reservation, no change is made to * the reserve map. In this case PagePrivate will be set * to indicate that the global reservation count should be * incremented when the page is freed. This is the desired * behavior. However, when a huge page is allocated for an * address without a reservation a reservation entry is added * to the reservation map, and PagePrivate will not be set. * When the page is freed, the global reserve count will NOT * be incremented and it will appear as though we have leaked * reserved page. In this case, set PagePrivate so that the * global reserve count will be incremented to match the * reservation map entry which was created. * * Note that vm_alloc_shared is based on the flags of the vma * for which the page was originally allocated. dst_vma could * be different or NULL on error. */ if (vm_alloc_shared) SetPagePrivate(page); else ClearPagePrivate(page); put_page(page); } BUG_ON(copied < 0); BUG_ON(err > 0); BUG_ON(!copied && !err); return copied ? copied : err; }
/** * walk_page_range - walk a memory map's page tables with a callback * @mm: memory map to walk * @addr: starting address * @end: ending address * @walk: set of callbacks to invoke for each level of the tree * * Recursively walk the page table for the memory area in a VMA, * calling supplied callbacks. Callbacks are called in-order (first * PGD, first PUD, first PMD, first PTE, second PTE... second PMD, * etc.). If lower-level callbacks are omitted, walking depth is reduced. * * Each callback receives an entry pointer and the start and end of the * associated range, and a copy of the original mm_walk for access to * the ->private or ->mm fields. * * No locks are taken, but the bottom level iterator will map PTE * directories from highmem if necessary. * * If any callback returns a non-zero value, the walk is aborted and * the return value is propagated back to the caller. Otherwise 0 is returned. */ int walk_page_range(unsigned long addr, unsigned long end, struct mm_walk *walk) { pgd_t *pgd; unsigned long next; int err = 0; if (addr >= end) return err; VM_BUG_ON((addr & ~PAGE_MASK) || (end & ~PAGE_MASK)); if (!walk->mm) return -EINVAL; pgd = pgd_offset(walk->mm, addr); do { struct vm_area_struct *uninitialized_var(vma); next = pgd_addr_end(addr, end); #ifdef CONFIG_HUGETLB_PAGE /* * handle hugetlb vma individually because pagetable walk for * the hugetlb page is dependent on the architecture and * we can't handled it in the same manner as non-huge pages. */ vma = find_vma(walk->mm, addr); if (vma && is_vm_hugetlb_page(vma)) { if (vma->vm_end < next) next = vma->vm_end; /* * Hugepage is very tightly coupled with vma, so * walk through hugetlb entries within a given vma. */ err = walk_hugetlb_range(vma, addr, next, walk); if (err) break; pgd = pgd_offset(walk->mm, next); continue; } #endif if (pgd_none_or_clear_bad(pgd)) { if (walk->pte_hole) err = walk->pte_hole(addr, next, walk); if (err) break; pgd++; continue; } if (walk->pgd_entry) err = walk->pgd_entry(pgd, addr, next, walk); if (!err && (walk->pud_entry || walk->pmd_entry || walk->pte_entry)) err = walk_pud_range(pgd, addr, next, walk); if (err) break; pgd++; } while (addr = next, addr != end); return err; }
static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, bool zeropage, bool *mmap_changing) { struct vm_area_struct *dst_vma; ssize_t err; pmd_t *dst_pmd; unsigned long src_addr, dst_addr; long copied; struct page *page; /* * Sanitize the command parameters: */ BUG_ON(dst_start & ~PAGE_MASK); BUG_ON(len & ~PAGE_MASK); /* Does the address range wrap, or is the span zero-sized? */ BUG_ON(src_start + len <= src_start); BUG_ON(dst_start + len <= dst_start); src_addr = src_start; dst_addr = dst_start; copied = 0; page = NULL; retry: down_read(&dst_mm->mmap_sem); /* * If memory mappings are changing because of non-cooperative * operation (e.g. mremap) running in parallel, bail out and * request the user to retry later */ err = -EAGAIN; if (mmap_changing && READ_ONCE(*mmap_changing)) goto out_unlock; /* * Make sure the vma is not shared, that the dst range is * both valid and fully within a single existing vma. */ err = -ENOENT; dst_vma = find_vma(dst_mm, dst_start); if (!dst_vma) goto out_unlock; /* * Check the vma is registered in uffd, this is required to * enforce the VM_MAYWRITE check done at uffd registration * time. */ if (!dst_vma->vm_userfaultfd_ctx.ctx) goto out_unlock; if (dst_start < dst_vma->vm_start || dst_start + len > dst_vma->vm_end) goto out_unlock; err = -EINVAL; /* * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but * it will overwrite vm_ops, so vma_is_anonymous must return false. */ if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) && dst_vma->vm_flags & VM_SHARED)) goto out_unlock; /* * If this is a HUGETLB vma, pass off to appropriate routine */ if (is_vm_hugetlb_page(dst_vma)) return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start, src_start, len, zeropage); if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) goto out_unlock; /* * Ensure the dst_vma has a anon_vma or this page * would get a NULL anon_vma when moved in the * dst_vma. */ err = -ENOMEM; if (!(dst_vma->vm_flags & VM_SHARED) && unlikely(anon_vma_prepare(dst_vma))) goto out_unlock; while (src_addr < src_start + len) { pmd_t dst_pmdval; BUG_ON(dst_addr >= dst_start + len); dst_pmd = mm_alloc_pmd(dst_mm, dst_addr); if (unlikely(!dst_pmd)) { err = -ENOMEM; break; } dst_pmdval = pmd_read_atomic(dst_pmd); /* * If the dst_pmd is mapped as THP don't * override it and just be strict. */ if (unlikely(pmd_trans_huge(dst_pmdval))) { err = -EEXIST; break; } if (unlikely(pmd_none(dst_pmdval)) && unlikely(__pte_alloc(dst_mm, dst_pmd, dst_addr))) { err = -ENOMEM; break; } /* If an huge pmd materialized from under us fail */ if (unlikely(pmd_trans_huge(*dst_pmd))) { err = -EFAULT; break; } BUG_ON(pmd_none(*dst_pmd)); BUG_ON(pmd_trans_huge(*dst_pmd)); err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, src_addr, &page, zeropage); cond_resched(); if (unlikely(err == -ENOENT)) { void *page_kaddr; up_read(&dst_mm->mmap_sem); BUG_ON(!page); page_kaddr = kmap(page); err = copy_from_user(page_kaddr, (const void __user *) src_addr, PAGE_SIZE); kunmap(page); if (unlikely(err)) { err = -EFAULT; goto out; } goto retry; } else BUG_ON(page); if (!err) { dst_addr += PAGE_SIZE; src_addr += PAGE_SIZE; copied += PAGE_SIZE; if (fatal_signal_pending(current)) err = -EINTR; } if (err) break; } out_unlock: up_read(&dst_mm->mmap_sem); out: if (page) put_page(page); BUG_ON(copied < 0); BUG_ON(err > 0); BUG_ON(!copied && !err); return copied ? copied : err; }
void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { struct mm_struct *mm = vma->vm_mm; unsigned long pid; unsigned int page_shift = mmu_psize_defs[mmu_virtual_psize].shift; unsigned long page_size = 1UL << page_shift; unsigned long nr_pages = (end - start) >> page_shift; bool local, full; #ifdef CONFIG_HUGETLB_PAGE if (is_vm_hugetlb_page(vma)) return radix__flush_hugetlb_tlb_range(vma, start, end); #endif pid = mm->context.id; if (unlikely(pid == MMU_NO_CONTEXT)) return; preempt_disable(); if (mm_is_thread_local(mm)) { local = true; full = (end == TLB_FLUSH_ALL || nr_pages > tlb_local_single_page_flush_ceiling); } else { local = false; full = (end == TLB_FLUSH_ALL || nr_pages > tlb_single_page_flush_ceiling); } if (full) { if (local) { _tlbiel_pid(pid, RIC_FLUSH_TLB); } else { if (mm_needs_flush_escalation(mm)) _tlbie_pid(pid, RIC_FLUSH_ALL); else _tlbie_pid(pid, RIC_FLUSH_TLB); } } else { bool hflush = false; unsigned long hstart, hend; #ifdef CONFIG_TRANSPARENT_HUGEPAGE hstart = (start + HPAGE_PMD_SIZE - 1) >> HPAGE_PMD_SHIFT; hend = end >> HPAGE_PMD_SHIFT; if (hstart < hend) { hstart <<= HPAGE_PMD_SHIFT; hend <<= HPAGE_PMD_SHIFT; hflush = true; } #endif asm volatile("ptesync": : :"memory"); if (local) { __tlbiel_va_range(start, end, pid, page_size, mmu_virtual_psize); if (hflush) __tlbiel_va_range(hstart, hend, pid, HPAGE_PMD_SIZE, MMU_PAGE_2M); asm volatile("ptesync": : :"memory"); } else { __tlbie_va_range(start, end, pid, page_size, mmu_virtual_psize); if (hflush) __tlbie_va_range(hstart, hend, pid, HPAGE_PMD_SIZE, MMU_PAGE_2M); fixup_tlbie(); asm volatile("eieio; tlbsync; ptesync": : :"memory"); } }
static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) { struct vm_area_struct *mpnt, *tmp, *prev, **pprev; struct rb_node **rb_link, *rb_parent; int retval; unsigned long charge; uprobe_start_dup_mmap(); down_write(&oldmm->mmap_sem); flush_cache_dup_mm(oldmm); uprobe_dup_mmap(oldmm, mm); /* * Not linked in yet - no deadlock potential: */ down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); /* No ordering required: file already has been exposed. */ RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); mm->total_vm = oldmm->total_vm; mm->shared_vm = oldmm->shared_vm; mm->exec_vm = oldmm->exec_vm; mm->stack_vm = oldmm->stack_vm; rb_link = &mm->mm_rb.rb_node; rb_parent = NULL; pprev = &mm->mmap; retval = ksm_fork(mm, oldmm); if (retval) goto out; retval = khugepaged_fork(mm, oldmm); if (retval) goto out; prev = NULL; for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { struct file *file; if (mpnt->vm_flags & VM_DONTCOPY) { vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, -vma_pages(mpnt)); continue; } charge = 0; if (mpnt->vm_flags & VM_ACCOUNT) { unsigned long len = vma_pages(mpnt); if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ goto fail_nomem; charge = len; } tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (!tmp) goto fail_nomem; *tmp = *mpnt; INIT_LIST_HEAD(&tmp->anon_vma_chain); retval = vma_dup_policy(mpnt, tmp); if (retval) goto fail_nomem_policy; tmp->vm_mm = mm; if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; tmp->vm_flags &= ~VM_LOCKED; tmp->vm_next = tmp->vm_prev = NULL; file = tmp->vm_file; if (file) { struct inode *inode = file_inode(file); struct address_space *mapping = file->f_mapping; get_file(file); if (tmp->vm_flags & VM_DENYWRITE) atomic_dec(&inode->i_writecount); i_mmap_lock_write(mapping); if (tmp->vm_flags & VM_SHARED) atomic_inc(&mapping->i_mmap_writable); flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ vma_interval_tree_insert_after(tmp, mpnt, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); i_mmap_unlock_write(mapping); } /* * Clear hugetlb-related page reserves for children. This only * affects MAP_PRIVATE mappings. Faults generated by the child * are not guaranteed to succeed, even if read-only */ if (is_vm_hugetlb_page(tmp)) reset_vma_resv_huge_pages(tmp); /* * Link in the new vma and copy the page table entries. */ *pprev = tmp; pprev = &tmp->vm_next; tmp->vm_prev = prev; prev = tmp; __vma_link_rb(mm, tmp, rb_link, rb_parent); rb_link = &tmp->vm_rb.rb_right; rb_parent = &tmp->vm_rb; mm->map_count++; retval = copy_page_range(mm, oldmm, mpnt); if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); if (retval) goto out; } /* a new mm has just been created */ arch_dup_mmap(oldmm, mm); retval = 0; out: up_write(&mm->mmap_sem); flush_tlb_mm(oldmm); up_write(&oldmm->mmap_sem); uprobe_end_dup_mmap(); return retval; fail_nomem_anon_vma_fork: mpol_put(vma_policy(tmp)); fail_nomem_policy: kmem_cache_free(vm_area_cachep, tmp); fail_nomem: retval = -ENOMEM; vm_unacct_memory(charge); goto out; }
static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) { struct vm_area_struct *mpnt, *tmp, *prev, **pprev; struct rb_node **rb_link, *rb_parent; int retval; unsigned long charge; struct mempolicy *pol; down_write(&oldmm->mmap_sem); flush_cache_dup_mm(oldmm); down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); mm->locked_vm = 0; mm->mmap = NULL; mm->mmap_cache = NULL; mm->free_area_cache = oldmm->mmap_base; mm->cached_hole_size = ~0UL; mm->map_count = 0; cpumask_clear(mm_cpumask(mm)); mm->mm_rb = RB_ROOT; rb_link = &mm->mm_rb.rb_node; rb_parent = NULL; pprev = &mm->mmap; retval = ksm_fork(mm, oldmm); if (retval) goto out; retval = khugepaged_fork(mm, oldmm); if (retval) goto out; prev = NULL; for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { struct file *file; if (mpnt->vm_flags & VM_DONTCOPY) { long pages = vma_pages(mpnt); mm->total_vm -= pages; vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, -pages); continue; } charge = 0; if (mpnt->vm_flags & VM_ACCOUNT) { unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; if (security_vm_enough_memory_mm(oldmm, len)) goto fail_nomem; charge = len; } tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (!tmp) goto fail_nomem; *tmp = *mpnt; INIT_LIST_HEAD(&tmp->anon_vma_chain); pol = mpol_dup(vma_policy(mpnt)); retval = PTR_ERR(pol); if (IS_ERR(pol)) goto fail_nomem_policy; vma_set_policy(tmp, pol); tmp->vm_mm = mm; if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; tmp->vm_flags &= ~VM_LOCKED; tmp->vm_next = tmp->vm_prev = NULL; file = tmp->vm_file; if (file) { struct inode *inode = file->f_path.dentry->d_inode; struct address_space *mapping = file->f_mapping; get_file(file); if (tmp->vm_flags & VM_DENYWRITE) atomic_dec(&inode->i_writecount); mutex_lock(&mapping->i_mmap_mutex); if (tmp->vm_flags & VM_SHARED) mapping->i_mmap_writable++; flush_dcache_mmap_lock(mapping); vma_prio_tree_add(tmp, mpnt); flush_dcache_mmap_unlock(mapping); mutex_unlock(&mapping->i_mmap_mutex); } if (is_vm_hugetlb_page(tmp)) reset_vma_resv_huge_pages(tmp); *pprev = tmp; pprev = &tmp->vm_next; tmp->vm_prev = prev; prev = tmp; __vma_link_rb(mm, tmp, rb_link, rb_parent); rb_link = &tmp->vm_rb.rb_right; rb_parent = &tmp->vm_rb; mm->map_count++; retval = copy_page_range(mm, oldmm, mpnt); if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); if (retval) goto out; }
/* * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate * routines. * * interruption code (int_code): * 04 Protection -> Write-Protection (suprression) * 10 Segment translation -> Not present (nullification) * 11 Page translation -> Not present (nullification) * 3b Region third trans. -> Not present (nullification) */ static inline int do_exception(struct pt_regs *regs, int access) { struct task_struct *tsk; struct mm_struct *mm; struct vm_area_struct *vma; unsigned long trans_exc_code; unsigned long address; unsigned int flags; int fault; if (notify_page_fault(regs)) return 0; tsk = current; mm = tsk->mm; trans_exc_code = regs->int_parm_long; /* * Verify that the fault happened in user space, that * we are not in an interrupt and that there is a * user context. */ fault = VM_FAULT_BADCONTEXT; if (unlikely(!user_space_fault(trans_exc_code) || in_atomic() || !mm)) goto out; address = trans_exc_code & __FAIL_ADDR_MASK; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); flags = FAULT_FLAG_ALLOW_RETRY; if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400) flags |= FAULT_FLAG_WRITE; down_read(&mm->mmap_sem); #ifdef CONFIG_PGSTE if (test_tsk_thread_flag(current, TIF_SIE) && S390_lowcore.gmap) { address = __gmap_fault(address, (struct gmap *) S390_lowcore.gmap); if (address == -EFAULT) { fault = VM_FAULT_BADMAP; goto out_up; } if (address == -ENOMEM) { fault = VM_FAULT_OOM; goto out_up; } } #endif retry: fault = VM_FAULT_BADMAP; vma = find_vma(mm, address); if (!vma) goto out_up; if (unlikely(vma->vm_start > address)) { if (!(vma->vm_flags & VM_GROWSDOWN)) goto out_up; if (expand_stack(vma, address)) goto out_up; } /* * Ok, we have a good vm_area for this memory access, so * we can handle it.. */ fault = VM_FAULT_BADACCESS; if (unlikely(!(vma->vm_flags & access))) goto out_up; if (is_vm_hugetlb_page(vma)) address &= HPAGE_MASK; /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo * the fault. */ fault = handle_mm_fault(mm, vma, address, flags); if (unlikely(fault & VM_FAULT_ERROR)) goto out_up; /* * Major/minor page fault accounting is only done on the * initial attempt. If we go through a retry, it is extremely * likely that the page will be found in page cache at that point. */ if (flags & FAULT_FLAG_ALLOW_RETRY) { if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); } else { tsk->min_flt++; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); } if (fault & VM_FAULT_RETRY) { /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk * of starvation. */ flags &= ~FAULT_FLAG_ALLOW_RETRY; flags |= FAULT_FLAG_TRIED; down_read(&mm->mmap_sem); goto retry; } } /* * The instruction that caused the program check will * be repeated. Don't signal single step via SIGTRAP. */ clear_tsk_thread_flag(tsk, TIF_PER_TRAP); fault = 0; out_up: up_read(&mm->mmap_sem); out: return fault; }
/** * walk_page_range - walk a memory map's page tables with a callback * @mm: memory map to walk * @addr: starting address * @end: ending address * @walk: set of callbacks to invoke for each level of the tree * * Recursively walk the page table for the memory area in a VMA, * calling supplied callbacks. Callbacks are called in-order (first * PGD, first PUD, first PMD, first PTE, second PTE... second PMD, * etc.). If lower-level callbacks are omitted, walking depth is reduced. * * Each callback receives an entry pointer and the start and end of the * associated range, and a copy of the original mm_walk for access to * the ->private or ->mm fields. * * Usually no locks are taken, but splitting transparent huge page may * take page table lock. And the bottom level iterator will map PTE * directories from highmem if necessary. * * If any callback returns a non-zero value, the walk is aborted and * the return value is propagated back to the caller. Otherwise 0 is returned. * * walk->mm->mmap_sem must be held for at least read if walk->hugetlb_entry * is !NULL. */ int walk_page_range(unsigned long addr, unsigned long end, struct mm_walk *walk) { pgd_t *pgd; unsigned long next; int err = 0; if (addr >= end) return err; if (!walk->mm) return -EINVAL; VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem)); pgd = pgd_offset(walk->mm, addr); do { struct vm_area_struct *vma = NULL; next = pgd_addr_end(addr, end); /* * This function was not intended to be vma based. * But there are vma special cases to be handled: * - hugetlb vma's * - VM_PFNMAP vma's */ vma = find_vma(walk->mm, addr); if (vma) { /* * There are no page structures backing a VM_PFNMAP * range, so do not allow split_huge_page_pmd(). */ if ((vma->vm_start <= addr) && (vma->vm_flags & VM_PFNMAP)) { if (walk->pte_hole) err = walk->pte_hole(addr, next, walk); if (err) break; pgd = pgd_offset(walk->mm, next); continue; } /* * Handle hugetlb vma individually because pagetable * walk for the hugetlb page is dependent on the * architecture and we can't handled it in the same * manner as non-huge pages. */ if (walk->hugetlb_entry && (vma->vm_start <= addr) && is_vm_hugetlb_page(vma)) { if (vma->vm_end < next) next = vma->vm_end; /* * Hugepage is very tightly coupled with vma, * so walk through hugetlb entries within a * given vma. */ err = walk_hugetlb_range(vma, addr, next, walk); if (err) break; pgd = pgd_offset(walk->mm, next); continue; } } if (pgd_none_or_clear_bad(pgd)) { if (walk->pte_hole) err = walk->pte_hole(addr, next, walk); if (err) break; pgd++; continue; } if (walk->pgd_entry) err = walk->pgd_entry(pgd, addr, next, walk); if (!err && (walk->pud_entry || walk->pmd_entry || walk->pte_entry)) err = walk_pud_range(pgd, addr, next, walk); if (err) break; pgd++; } while (addr = next, addr != end); return err; }
static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) { struct vm_area_struct *mpnt, *tmp, **pprev; struct rb_node **rb_link, *rb_parent; int retval; unsigned long charge; struct mempolicy *pol; down_write(&oldmm->mmap_sem); flush_cache_dup_mm(oldmm); /* * Not linked in yet - no deadlock potential: */ down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); mm->locked_vm = 0; mm->mmap = NULL; mm->mmap_cache = NULL; mm->free_area_cache = oldmm->mmap_base; mm->cached_hole_size = ~0UL; mm->map_count = 0; cpumask_clear(mm_cpumask(mm)); mm->mm_rb = RB_ROOT; rb_link = &mm->mm_rb.rb_node; rb_parent = NULL; pprev = &mm->mmap; retval = ksm_fork(mm, oldmm); if (retval) goto out; for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { struct file *file; if (mpnt->vm_flags & VM_DONTCOPY) { long pages = vma_pages(mpnt); mm->total_vm -= pages; vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, -pages); continue; } charge = 0; if (mpnt->vm_flags & VM_ACCOUNT) { unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; if (security_vm_enough_memory(len)) goto fail_nomem; charge = len; } tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (!tmp) goto fail_nomem; *tmp = *mpnt; pol = mpol_dup(vma_policy(mpnt)); retval = PTR_ERR(pol); if (IS_ERR(pol)) goto fail_nomem_policy; vma_set_policy(tmp, pol); tmp->vm_flags &= ~VM_LOCKED; tmp->vm_mm = mm; tmp->vm_next = NULL; anon_vma_link(tmp); file = tmp->vm_file; if (file) { struct inode *inode = file->f_path.dentry->d_inode; struct address_space *mapping = file->f_mapping; get_file(file); if (tmp->vm_flags & VM_DENYWRITE) atomic_dec(&inode->i_writecount); spin_lock(&mapping->i_mmap_lock); if (tmp->vm_flags & VM_SHARED) mapping->i_mmap_writable++; tmp->vm_truncate_count = mpnt->vm_truncate_count; flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ vma_prio_tree_add(tmp, mpnt); flush_dcache_mmap_unlock(mapping); spin_unlock(&mapping->i_mmap_lock); } /* * Clear hugetlb-related page reserves for children. This only * affects MAP_PRIVATE mappings. Faults generated by the child * are not guaranteed to succeed, even if read-only */ if (is_vm_hugetlb_page(tmp)) reset_vma_resv_huge_pages(tmp); /* * Link in the new vma and copy the page table entries. */ *pprev = tmp; pprev = &tmp->vm_next; __vma_link_rb(mm, tmp, rb_link, rb_parent); rb_link = &tmp->vm_rb.rb_right; rb_parent = &tmp->vm_rb; mm->map_count++; retval = copy_page_range(mm, oldmm, mpnt); if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); if (retval) goto out; }
/* * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate * routines. * * interruption code (int_code): * 04 Protection -> Write-Protection (suprression) * 10 Segment translation -> Not present (nullification) * 11 Page translation -> Not present (nullification) * 3b Region third trans. -> Not present (nullification) */ static inline int do_exception(struct pt_regs *regs, int access) { struct gmap *gmap; struct task_struct *tsk; struct mm_struct *mm; struct vm_area_struct *vma; enum fault_type type; unsigned long trans_exc_code; unsigned long address; unsigned int flags; int fault; tsk = current; /* * The instruction that caused the program check has * been nullified. Don't signal single step via SIGTRAP. */ clear_pt_regs_flag(regs, PIF_PER_TRAP); if (notify_page_fault(regs)) return 0; mm = tsk->mm; trans_exc_code = regs->int_parm_long; /* * Verify that the fault happened in user space, that * we are not in an interrupt and that there is a * user context. */ fault = VM_FAULT_BADCONTEXT; type = get_fault_type(regs); switch (type) { case KERNEL_FAULT: goto out; case VDSO_FAULT: fault = VM_FAULT_BADMAP; goto out; case USER_FAULT: case GMAP_FAULT: if (faulthandler_disabled() || !mm) goto out; break; } address = trans_exc_code & __FAIL_ADDR_MASK; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; if (user_mode(regs)) flags |= FAULT_FLAG_USER; if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400) flags |= FAULT_FLAG_WRITE; down_read(&mm->mmap_sem); gmap = NULL; if (IS_ENABLED(CONFIG_PGSTE) && type == GMAP_FAULT) { gmap = (struct gmap *) S390_lowcore.gmap; current->thread.gmap_addr = address; current->thread.gmap_write_flag = !!(flags & FAULT_FLAG_WRITE); current->thread.gmap_int_code = regs->int_code & 0xffff; address = __gmap_translate(gmap, address); if (address == -EFAULT) { fault = VM_FAULT_BADMAP; goto out_up; } if (gmap->pfault_enabled) flags |= FAULT_FLAG_RETRY_NOWAIT; } retry: fault = VM_FAULT_BADMAP; vma = find_vma(mm, address); if (!vma) goto out_up; if (unlikely(vma->vm_start > address)) { if (!(vma->vm_flags & VM_GROWSDOWN)) goto out_up; if (expand_stack(vma, address)) goto out_up; } /* * Ok, we have a good vm_area for this memory access, so * we can handle it.. */ fault = VM_FAULT_BADACCESS; if (unlikely(!(vma->vm_flags & access))) goto out_up; if (is_vm_hugetlb_page(vma)) address &= HPAGE_MASK; /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo * the fault. */ fault = handle_mm_fault(vma, address, flags); /* No reason to continue if interrupted by SIGKILL. */ if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) { fault = VM_FAULT_SIGNAL; goto out; } if (unlikely(fault & VM_FAULT_ERROR)) goto out_up; /* * Major/minor page fault accounting is only done on the * initial attempt. If we go through a retry, it is extremely * likely that the page will be found in page cache at that point. */ if (flags & FAULT_FLAG_ALLOW_RETRY) { if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); } else { tsk->min_flt++; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); } if (fault & VM_FAULT_RETRY) { if (IS_ENABLED(CONFIG_PGSTE) && gmap && (flags & FAULT_FLAG_RETRY_NOWAIT)) { /* FAULT_FLAG_RETRY_NOWAIT has been set, * mmap_sem has not been released */ current->thread.gmap_pfault = 1; fault = VM_FAULT_PFAULT; goto out_up; } /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk * of starvation. */ flags &= ~(FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT); flags |= FAULT_FLAG_TRIED; down_read(&mm->mmap_sem); goto retry; } } if (IS_ENABLED(CONFIG_PGSTE) && gmap) { address = __gmap_link(gmap, current->thread.gmap_addr, address); if (address == -EFAULT) { fault = VM_FAULT_BADMAP; goto out_up; } if (address == -ENOMEM) { fault = VM_FAULT_OOM; goto out_up; } } fault = 0; out_up: up_read(&mm->mmap_sem); out: return fault; }