/*
 * Check that a process has enough memory to allocate a new virtual
 * mapping. 0 means there is enough memory for the allocation to
 * succeed and -ENOMEM implies there is not.
 *
 * We currently support three overcommit policies, which are set via the
 * vm.overcommit_memory sysctl.  See Documentation/vm/overcommit-accounting
 */
static int dummy_vm_enough_memory(long pages)
{
        unsigned long free, allowed;

        vm_acct_memory(pages);

        /*
         * Sometimes we want to use more memory than we have
         */
        if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
                return 0;

        if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
                free = get_page_cache_size();
                free += nr_free_pages();
                free += nr_swap_pages;

                /*
                 * Any slabs which are created with the
                 * SLAB_RECLAIM_ACCOUNT flag claim to have contents
                 * which are reclaimable, under pressure.  The dentry
                 * cache and most inode caches should fall into this
                 */
                free += atomic_read(&slab_reclaim_pages);

                /*
                 * Leave the last 3% for root
                 */
                if (current->euid)
                        free -= free / 32;

                if (free > pages)
                        return 0;
                vm_unacct_memory(pages);
                return -ENOMEM;
        }

        allowed = (totalram_pages - hugetlb_total_pages())
                * sysctl_overcommit_ratio / 100;
        allowed += total_swap_pages;

        if (atomic_read(&vm_committed_space) < allowed)
                return 0;

        vm_unacct_memory(pages);

        return -ENOMEM;
}
static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
                                        int *swapid)
{
        int ret = -EINVAL;
        struct swap_info_struct *si = NULL;
        int si_frontswap_pages;
        unsigned long total_pages_to_unuse = total;
        unsigned long pages = 0, pages_to_unuse = 0;
        int type;

        assert_spin_locked(&swap_lock);
        for (type = swap_list.head; type >= 0; type = si->next) {
                si = swap_info[type];
                si_frontswap_pages = atomic_read(&si->frontswap_pages);
                if (total_pages_to_unuse < si_frontswap_pages) {
                        pages = pages_to_unuse = total_pages_to_unuse;
                } else {
                        pages = si_frontswap_pages;
                        pages_to_unuse = 0; /* unuse all */
                }
                /* ensure there is enough RAM to fetch pages from frontswap */
                if (security_vm_enough_memory_mm(current->mm, pages)) {
                        ret = -ENOMEM;
                        continue;
                }
                vm_unacct_memory(pages);
                *unused = pages_to_unuse;
                *swapid = type;
                ret = 0;
                break;
        }

        return ret;
}
/*
 * Frontswap, like a true swap device, may unnecessarily retain pages
 * under certain circumstances; "shrink" frontswap is essentially a
 * "partial swapoff" and works by calling try_to_unuse to unuse enough
 * frontswap pages -- subject to memory constraints -- to reduce the
 * number of pages in frontswap
 */
void frontswap_shrink(unsigned long target_pages)
{
        int wrapped = 0;
        bool locked = false;

        for (wrapped = 0; wrapped <= 3; wrapped++) {

                struct swap_info_struct *si = NULL;
                unsigned long total_pages = 0, total_pages_to_unuse;
                unsigned long pages = 0, unuse_pages = 0;
                int type;

                /*
                 * we don't want to hold swap_lock while doing a very
                 * lengthy try_to_unuse, but swap_list may change
                 * so restart scan from swap_list.head each time
                 */
                spin_lock(&swap_lock);
                locked = true;
                total_pages = 0;
                for (type = swap_list.head; type >= 0; type = si->next) {
                        si = swap_info[type];
                        total_pages += si->frontswap_pages;
                }
                if (total_pages <= target_pages)
                        goto out;
                total_pages_to_unuse = total_pages - target_pages;
                for (type = swap_list.head; type >= 0; type = si->next) {
                        si = swap_info[type];
                        if (total_pages_to_unuse < si->frontswap_pages)
                                pages = unuse_pages = total_pages_to_unuse;
                        else {
                                pages = si->frontswap_pages;
                                unuse_pages = 0; /* unuse all */
                        }
                        if (security_vm_enough_memory_kern(pages))
                                continue;
                        vm_unacct_memory(pages);
                        break;
                }
                if (type < 0)
                        goto out;
                locked = false;
                spin_unlock(&swap_lock);
                try_to_unuse(type, true, unuse_pages);
        }

out:
        if (locked)
                spin_unlock(&swap_lock);
        return;
}
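The per-area decision made by the two routines above is easy to lose in the locking detail: the scan stops at the first swap area it can account memory for, takes only the remaining target from that area if the area holds more, and otherwise marks the whole area for unuse, using 0 as the convention for "unuse everything this area holds". A minimal userspace sketch of just that selection rule follows; pick_frontswap_victim, can_account and fs_pages are hypothetical names, and can_account merely stands in for the security_vm_enough_memory_mm() accounting check.

#include <stdio.h>

/* Stand-in for security_vm_enough_memory_mm(): pretend the requested
 * number of pages can always be accounted. Hypothetical helper. */
static int can_account(unsigned long pages)
{
        (void)pages;
        return 1;
}

/*
 * Illustrative only: mirrors the per-area decision of
 * __frontswap_unuse_pages() without locking or real accounting.
 * fs_pages[i] is the number of frontswap pages held by swap area i;
 * *to_unuse follows the kernel convention where 0 means "unuse all".
 * Returns the chosen area index, or -1 if no area can be charged.
 */
static int pick_frontswap_victim(unsigned long target,
                                 const unsigned long *fs_pages, int n,
                                 unsigned long *to_unuse)
{
        int i;

        for (i = 0; i < n; i++) {
                unsigned long pages, unuse;

                if (target < fs_pages[i]) {
                        pages = unuse = target; /* partial unuse of this area */
                } else {
                        pages = fs_pages[i];
                        unuse = 0;              /* unuse everything this area holds */
                }
                if (!can_account(pages))
                        continue;               /* not enough RAM; try the next area */
                *to_unuse = unuse;
                return i;
        }
        return -1;
}

int main(void)
{
        unsigned long fs_pages[] = { 100, 500, 50 };
        unsigned long to_unuse;
        int victim = pick_frontswap_victim(300, fs_pages, 3, &to_unuse);

        printf("area %d, pages_to_unuse %lu\n", victim, to_unuse);
        return 0;
}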
int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
        unsigned long free, allowed;

        vm_acct_memory(pages);

        if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
                return 0;

        if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
                free = global_page_state(NR_FREE_PAGES);
                free += global_page_state(NR_FILE_PAGES);
                free -= global_page_state(NR_SHMEM);
                free += get_nr_swap_pages();
                free += global_page_state(NR_SLAB_RECLAIMABLE);

                if (free <= totalreserve_pages)
                        goto error;
                else
                        free -= totalreserve_pages;

                if (!cap_sys_admin)
                        free -= free / 32;

                if (free > pages)
                        return 0;

                goto error;
        }

        allowed = (totalram_pages - hugetlb_total_pages())
                * sysctl_overcommit_ratio / 100;

        if (!cap_sys_admin)
                allowed -= allowed / 32;
        allowed += total_swap_pages;

        if (mm)
                allowed -= mm->total_vm / 32;

        if (percpu_counter_read_positive(&vm_committed_as) < allowed)
                return 0;
error:
        vm_unacct_memory(pages);

        return -ENOMEM;
}
static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
{
        struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
        struct rb_node **rb_link, *rb_parent;
        int retval;
        unsigned long charge;

        uprobe_start_dup_mmap();
        down_write(&oldmm->mmap_sem);
        flush_cache_dup_mm(oldmm);
        uprobe_dup_mmap(oldmm, mm);
        /*
         * Not linked in yet - no deadlock potential:
         */
        down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);

        /* No ordering required: file already has been exposed. */
        RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));

        mm->total_vm = oldmm->total_vm;
        mm->shared_vm = oldmm->shared_vm;
        mm->exec_vm = oldmm->exec_vm;
        mm->stack_vm = oldmm->stack_vm;

        rb_link = &mm->mm_rb.rb_node;
        rb_parent = NULL;
        pprev = &mm->mmap;
        retval = ksm_fork(mm, oldmm);
        if (retval)
                goto out;
        retval = khugepaged_fork(mm, oldmm);
        if (retval)
                goto out;

        prev = NULL;
        for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
                struct file *file;

                if (mpnt->vm_flags & VM_DONTCOPY) {
                        vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
                                        -vma_pages(mpnt));
                        continue;
                }
                charge = 0;
                if (mpnt->vm_flags & VM_ACCOUNT) {
                        unsigned long len = vma_pages(mpnt);

                        if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
                                goto fail_nomem;
                        charge = len;
                }
                tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
                if (!tmp)
                        goto fail_nomem;
                *tmp = *mpnt;
                INIT_LIST_HEAD(&tmp->anon_vma_chain);
                retval = vma_dup_policy(mpnt, tmp);
                if (retval)
                        goto fail_nomem_policy;
                tmp->vm_mm = mm;
                if (anon_vma_fork(tmp, mpnt))
                        goto fail_nomem_anon_vma_fork;
                tmp->vm_flags &= ~VM_LOCKED;
                tmp->vm_next = tmp->vm_prev = NULL;
                file = tmp->vm_file;
                if (file) {
                        struct inode *inode = file_inode(file);
                        struct address_space *mapping = file->f_mapping;

                        get_file(file);
                        if (tmp->vm_flags & VM_DENYWRITE)
                                atomic_dec(&inode->i_writecount);
                        i_mmap_lock_write(mapping);
                        if (tmp->vm_flags & VM_SHARED)
                                atomic_inc(&mapping->i_mmap_writable);
                        flush_dcache_mmap_lock(mapping);
                        /* insert tmp into the share list, just after mpnt */
                        vma_interval_tree_insert_after(tmp, mpnt,
                                        &mapping->i_mmap);
                        flush_dcache_mmap_unlock(mapping);
                        i_mmap_unlock_write(mapping);
                }

                /*
                 * Clear hugetlb-related page reserves for children. This only
                 * affects MAP_PRIVATE mappings. Faults generated by the child
                 * are not guaranteed to succeed, even if read-only
                 */
                if (is_vm_hugetlb_page(tmp))
                        reset_vma_resv_huge_pages(tmp);

                /*
                 * Link in the new vma and copy the page table entries.
                 */
                *pprev = tmp;
                pprev = &tmp->vm_next;
                tmp->vm_prev = prev;
                prev = tmp;

                __vma_link_rb(mm, tmp, rb_link, rb_parent);
                rb_link = &tmp->vm_rb.rb_right;
                rb_parent = &tmp->vm_rb;

                mm->map_count++;
                retval = copy_page_range(mm, oldmm, mpnt);

                if (tmp->vm_ops && tmp->vm_ops->open)
                        tmp->vm_ops->open(tmp);

                if (retval)
                        goto out;
        }
        /* a new mm has just been created */
        arch_dup_mmap(oldmm, mm);
        retval = 0;
out:
        up_write(&mm->mmap_sem);
        flush_tlb_mm(oldmm);
        up_write(&oldmm->mmap_sem);
        uprobe_end_dup_mmap();
        return retval;
fail_nomem_anon_vma_fork:
        mpol_put(vma_policy(tmp));
fail_nomem_policy:
        kmem_cache_free(vm_area_cachep, tmp);
fail_nomem:
        retval = -ENOMEM;
        vm_unacct_memory(charge);
        goto out;
}
/*
 * Check that a process has enough memory to allocate a new virtual
 * mapping. 0 means there is enough memory for the allocation to
 * succeed and -ENOMEM implies there is not.
 *
 * We currently support three overcommit policies, which are set via the
 * vm.overcommit_memory sysctl.  See Documentation/vm/overcommit-accounting
 *
 * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
 * Additional code 2002 Jul 20 by Robert Love.
 */
int cap_vm_enough_memory(long pages)
{
        unsigned long free, allowed;

        vm_acct_memory(pages);

        /*
         * Sometimes we want to use more memory than we have
         */
        if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
                return 0;

        if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
                unsigned long n;

                free = get_page_cache_size();
                free += nr_swap_pages;

                /*
                 * Any slabs which are created with the
                 * SLAB_RECLAIM_ACCOUNT flag claim to have contents
                 * which are reclaimable, under pressure.  The dentry
                 * cache and most inode caches should fall into this
                 */
                free += atomic_read(&slab_reclaim_pages);

                /*
                 * Leave the last 3% for root
                 */
                if (!capable(CAP_SYS_ADMIN))
                        free -= free / 32;

                if (free > pages)
                        return 0;

                /*
                 * nr_free_pages() is very expensive on large systems,
                 * only call if we're about to fail.
                 */
                n = nr_free_pages();
                if (!capable(CAP_SYS_ADMIN))
                        n -= n / 32;
                free += n;

                if (free > pages)
                        return 0;
                vm_unacct_memory(pages);
                return -ENOMEM;
        }

        allowed = (totalram_pages - hugetlb_total_pages())
                * sysctl_overcommit_ratio / 100;
        /*
         * Leave the last 3% for root
         */
        if (!capable(CAP_SYS_ADMIN))
                allowed -= allowed / 32;
        allowed += total_swap_pages;

        if (atomic_read(&vm_committed_space) < allowed)
                return 0;

        vm_unacct_memory(pages);

        return -ENOMEM;
}
static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
{
        struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
        struct rb_node **rb_link, *rb_parent;
        int retval;
        unsigned long charge;
        struct mempolicy *pol;

        down_write(&oldmm->mmap_sem);
        flush_cache_dup_mm(oldmm);
        uprobe_dup_mmap(oldmm, mm);
        /*
         * Not linked in yet - no deadlock potential:
         */
        down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);

        mm->locked_vm = 0;
        mm->mmap = NULL;
        mm->mmap_cache = NULL;
        mm->free_area_cache = oldmm->mmap_base;
        mm->cached_hole_size = ~0UL;
        mm->map_count = 0;
        cpumask_clear(mm_cpumask(mm));
        mm->mm_rb = RB_ROOT;
        rb_link = &mm->mm_rb.rb_node;
        rb_parent = NULL;
        pprev = &mm->mmap;
        retval = ksm_fork(mm, oldmm);
        if (retval)
                goto out;
        retval = khugepaged_fork(mm, oldmm);
        if (retval)
                goto out;

        prev = NULL;
        for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
                struct file *file;

                if (mpnt->vm_flags & VM_DONTCOPY) {
                        vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
                                        -vma_pages(mpnt));
                        continue;
                }
                charge = 0;
                if (mpnt->vm_flags & VM_ACCOUNT) {
                        unsigned long len = vma_pages(mpnt);

                        if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
                                goto fail_nomem;
                        charge = len;
                }
                tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
                if (!tmp)
                        goto fail_nomem;
                *tmp = *mpnt;
                INIT_LIST_HEAD(&tmp->anon_vma_chain);
                pol = mpol_dup(vma_policy(mpnt));
                retval = PTR_ERR(pol);
                if (IS_ERR(pol))
                        goto fail_nomem_policy;
                vma_set_policy(tmp, pol);
                tmp->vm_mm = mm;
                if (anon_vma_fork(tmp, mpnt))
                        goto fail_nomem_anon_vma_fork;
                tmp->vm_flags &= ~VM_LOCKED;
                tmp->vm_next = tmp->vm_prev = NULL;
                file = tmp->vm_file;
                if (file) {
                        struct inode *inode = file->f_path.dentry->d_inode;
                        struct address_space *mapping = file->f_mapping;

                        get_file(file);
                        if (tmp->vm_prfile)
                                get_file(tmp->vm_prfile);
                        if (tmp->vm_flags & VM_DENYWRITE)
                                atomic_dec(&inode->i_writecount);
                        mutex_lock(&mapping->i_mmap_mutex);
                        if (tmp->vm_flags & VM_SHARED)
                                mapping->i_mmap_writable++;
                        flush_dcache_mmap_lock(mapping);
                        /* insert tmp into the share list, just after mpnt */
                        if (unlikely(tmp->vm_flags & VM_NONLINEAR))
                                vma_nonlinear_insert(tmp,
                                                &mapping->i_mmap_nonlinear);
                        else
                                vma_interval_tree_insert_after(tmp, mpnt,
                                                &mapping->i_mmap);
                        flush_dcache_mmap_unlock(mapping);
                        mutex_unlock(&mapping->i_mmap_mutex);
                }

                /*
                 * Clear hugetlb-related page reserves for children. This only
                 * affects MAP_PRIVATE mappings. Faults generated by the child
                 * are not guaranteed to succeed, even if read-only
                 */
                if (is_vm_hugetlb_page(tmp))
                        reset_vma_resv_huge_pages(tmp);

                /*
                 * Link in the new vma and copy the page table entries.
                 */
                *pprev = tmp;
                pprev = &tmp->vm_next;
                tmp->vm_prev = prev;
                prev = tmp;

                __vma_link_rb(mm, tmp, rb_link, rb_parent);
                rb_link = &tmp->vm_rb.rb_right;
                rb_parent = &tmp->vm_rb;

                mm->map_count++;
                retval = copy_page_range(mm, oldmm, mpnt);

                if (tmp->vm_ops && tmp->vm_ops->open)
                        tmp->vm_ops->open(tmp);

                if (retval)
                        goto out;
        }
        /* a new mm has just been created */
        arch_dup_mmap(oldmm, mm);
        retval = 0;
out:
        up_write(&mm->mmap_sem);
        flush_tlb_mm(oldmm);
        up_write(&oldmm->mmap_sem);
        return retval;
fail_nomem_anon_vma_fork:
        mpol_put(pol);
fail_nomem_policy:
        kmem_cache_free(vm_area_cachep, tmp);
fail_nomem:
        retval = -ENOMEM;
        vm_unacct_memory(charge);
        goto out;
}
static int low_vm_enough_memory(struct mm_struct *mm, long pages)
{
        unsigned long free, allowed;
        int cap_sys_admin = 0, notify;

        if (cap_capable(current, CAP_SYS_ADMIN) == 0)
                cap_sys_admin = 1;

        allowed = totalram_pages - hugetlb_total_pages();
        allowed_pages = allowed;

        /* We activate ourselves only after both parameters have been
         * configured. */
        if (deny_pages == 0 || notify_low_pages == 0 || notify_high_pages == 0)
                return __vm_enough_memory(mm, pages, cap_sys_admin);

        vm_acct_memory(pages);

        /* Easily freed pages when under VM pressure or direct reclaim */
        free = global_page_state(NR_FILE_PAGES);
        free += nr_swap_pages;
        free += global_page_state(NR_SLAB_RECLAIMABLE);

        if (likely(free > notify_low_pages))
                goto enough_memory;

        /* No luck, lets make it more expensive and try again.. */
        free += nr_free_pages();

        if (free < deny_pages) {
                int i;

                lowmem_free_pages = free;
                low_watermark_state(1);
                high_watermark_state(1);
                /* Memory allocations by root are always allowed */
                if (cap_sys_admin)
                        return 0;

                /* OOM unkillable process is allowed to consume memory */
                if (current->oomkilladj == OOM_DISABLE)
                        return 0;

                /* uids from allowed_uids vector are also allowed no matter what */
                for (i = 0; i < LOWMEM_MAX_UIDS && allowed_uids[i]; i++)
                        if (current->uid == allowed_uids[i])
                                return 0;

                vm_unacct_memory(pages);
                if (printk_ratelimit()) {
                        printk(MY_NAME ": denying memory allocation to process %d (%s)\n",
                               current->pid, current->comm);
                }
                return -ENOMEM;
        }

enough_memory:
        /* See if we need to notify level 1 */
        low_watermark_state(free < notify_low_pages);

        /*
         * In the level 2 notification case things are more complicated,
         * as the level that we drop the state and send a notification
         * should be lower than when it is first triggered. Having this
         * on the same watermark level ends up bouncing back and forth
         * when applications are being stupid.
         */
        notify = free < notify_high_pages;
        if (notify || free - nr_decay_pages > notify_high_pages)
                high_watermark_state(notify);

        /* We have plenty of memory */
        lowmem_free_pages = free;

        return 0;
}
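The level-2 watermark handling above is a simple hysteresis: the state is raised as soon as free falls below notify_high_pages, but only dropped once free has recovered by more than nr_decay_pages, so the notification does not bounce when free memory hovers around the threshold. A minimal sketch of that rule follows; high_watermark_next is a hypothetical name, and plain integers replace the module's globals.

#include <stdio.h>

/* Hypothetical restatement of the level-2 watermark update in
 * low_vm_enough_memory(). Below 'threshold' the state is asserted; it
 * is cleared only once 'free' exceeds the threshold by more than
 * 'decay'; in between, the previous state is kept, which is what
 * prevents the bouncing the comment in the module warns about. */
static int high_watermark_next(int prev_state, unsigned long free,
                               unsigned long threshold, unsigned long decay)
{
        if (free < threshold)
                return 1;
        if (free - decay > threshold)
                return 0;
        return prev_state;
}

int main(void)
{
        /* threshold 1000 pages, decay 100 pages */
        printf("%d\n", high_watermark_next(0, 900, 1000, 100));  /* 1: dropped below */
        printf("%d\n", high_watermark_next(1, 1050, 1000, 100)); /* 1: still inside the band */
        printf("%d\n", high_watermark_next(1, 1200, 1000, 100)); /* 0: recovered past the band */
        return 0;
}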
/*
 * Check that a process has enough memory to allocate a new virtual
 * mapping. 0 means there is enough memory for the allocation to
 * succeed and -ENOMEM implies there is not.
 *
 * We currently support three overcommit policies, which are set via the
 * vm.overcommit_memory sysctl.  See Documentation/vm/overcommit-accounting
 *
 * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
 * Additional code 2002 Jul 20 by Robert Love.
 *
 * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
 *
 * Note this is a helper function intended to be used by LSMs which
 * wish to use this logic.
 */
int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
        unsigned long free, allowed;

        vm_acct_memory(pages);

        /*
         * Sometimes we want to use more memory than we have
         */
        if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
                return 0;

        if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
                free = global_page_state(NR_FREE_PAGES);
                free += global_page_state(NR_FILE_PAGES);

                /*
                 * shmem pages shouldn't be counted as free in this
                 * case, they can't be purged, only swapped out, and
                 * that won't affect the overall amount of available
                 * memory in the system.
                 */
                free -= global_page_state(NR_SHMEM);

                free += nr_swap_pages;

                /*
                 * Any slabs which are created with the
                 * SLAB_RECLAIM_ACCOUNT flag claim to have contents
                 * which are reclaimable, under pressure.  The dentry
                 * cache and most inode caches should fall into this
                 */
                free += global_page_state(NR_SLAB_RECLAIMABLE);

                /*
                 * Leave reserved pages. The pages are not for anonymous pages.
                 */
                if (free <= totalreserve_pages)
                        goto error;
                else
                        free -= totalreserve_pages;

                /*
                 * Leave the last 3% for root
                 */
                if (!cap_sys_admin)
                        free -= free / 32;

                if (free > pages)
                        return 0;

                goto error;
        }

        allowed = (totalram_pages - hugetlb_total_pages())
                * sysctl_overcommit_ratio / 100;
        /*
         * Leave the last 3% for root
         */
        if (!cap_sys_admin)
                allowed -= allowed / 32;
        allowed += total_swap_pages;

        /* Don't let a single process grow too big:
           leave 3% of the size of this process for other processes */
        if (mm)
                allowed -= mm->total_vm / 32;

        if (percpu_counter_read_positive(&vm_committed_as) < allowed)
                return 0;
error:
        vm_unacct_memory(pages);

        return -ENOMEM;
}
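In the strict (OVERCOMMIT_NEVER) branch above, the allowed figure is essentially what /proc/meminfo exposes as CommitLimit. A minimal userspace sketch, assuming only the MemTotal and SwapTotal lines of /proc/meminfo and the vm.overcommit_ratio sysctl (the hugetlb subtraction and the 3% root and per-mm reserves are ignored for brevity, so this slightly overestimates the limit on systems with huge pages reserved), reproduces the arithmetic; meminfo_kb is an illustrative helper, not a kernel interface.

#include <stdio.h>
#include <string.h>

/* Read a "Key:  value kB" line from /proc/meminfo; returns the value
 * in kB, or 0 if the key is not found. Illustrative helper only. */
static unsigned long meminfo_kb(const char *key)
{
        char line[256];
        unsigned long val = 0;
        FILE *f = fopen("/proc/meminfo", "r");

        if (!f)
                return 0;
        while (fgets(line, sizeof(line), f)) {
                if (!strncmp(line, key, strlen(key)) && line[strlen(key)] == ':') {
                        sscanf(line + strlen(key) + 1, "%lu", &val);
                        break;
                }
        }
        fclose(f);
        return val;
}

int main(void)
{
        unsigned long mem_kb = meminfo_kb("MemTotal");
        unsigned long swap_kb = meminfo_kb("SwapTotal");
        unsigned long ratio = 50;       /* default vm.overcommit_ratio */
        FILE *f = fopen("/proc/sys/vm/overcommit_ratio", "r");

        if (f) {
                if (fscanf(f, "%lu", &ratio) != 1)
                        ratio = 50;
                fclose(f);
        }

        /* allowed = (ram - hugetlb) * overcommit_ratio / 100 + swap */
        printf("approx CommitLimit: %lu kB\n", mem_kb * ratio / 100 + swap_kb);
        return 0;
}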