enum oom_scan_t oom_scan_process_thread(struct task_struct *task, unsigned long totalpages, const nodemask_t *nodemask, bool force_kill) { if (task->exit_state) return OOM_SCAN_CONTINUE; if (oom_unkillable_task(task, NULL, nodemask)) return OOM_SCAN_CONTINUE; /* * This task already has access to memory reserves and is being killed. * Don't allow any other task to have access to the reserves. */ if (test_tsk_thread_flag(task, TIF_MEMDIE)) { if (unlikely(frozen(task))) __thaw_task(task); if (!force_kill) return OOM_SCAN_ABORT; } if (!task->mm) return OOM_SCAN_CONTINUE; if (task->flags & PF_EXITING) { /* * If task is current and is in the process of releasing memory, * allow the "kill" to set TIF_MEMDIE, which will allow it to * access memory reserves. Otherwise, it may stall forever. * * The iteration isn't broken here, however, in case other * threads are found to have already been oom killed. */ if (task == current) return OOM_SCAN_SELECT; else if (!force_kill) { /* * If this task is not being ptraced on exit, then wait * for it to finish before killing some other task * unnecessarily. */ if (!(task->group_leader->ptrace & PT_TRACE_EXIT)) return OOM_SCAN_ABORT; } } return OOM_SCAN_OK; }
unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, unsigned long totalpages) { unsigned long points; if (oom_unkillable_task(p, memcg, nodemask)) return 0; p = find_lock_task_mm(p); if (!p) return 0; if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { task_unlock(p); return 0; } /* * The baseline for the badness score is the proportion of RAM that each * task's rss, pagetable and swap space use. */ points = get_mm_rss(p->mm) + p->mm->nr_ptes + get_mm_counter(p->mm, MM_SWAPENTS); task_unlock(p); /* * Root processes get 3% bonus, just like the __vm_enough_memory() * implementation used by LSMs. */ if (has_capability_noaudit(p, CAP_SYS_ADMIN)) points -= 30 * totalpages / 1000; /* * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may * either completely disable oom killing or always prefer a certain * task. */ points += p->signal->oom_score_adj * totalpages / 1000; /* * Never return 0 for an eligible task regardless of the root bonus and * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).. */ return points ? points : 1; }
/** * oom_badness - heuristic function to determine which candidate task to kill * @p: task struct of which task we should calculate * @totalpages: total present RAM allowed for page allocation * * The heuristic for determining which task to kill is made to be as simple and * predictable as possible. The goal is to return the highest value for the * task consuming the most memory to avoid subsequent oom failures. */ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, unsigned long totalpages) { long points; long adj; if (oom_unkillable_task(p, memcg, nodemask)) return 0; p = find_lock_task_mm(p); if (!p) return 0; adj = (long)p->signal->oom_score_adj; if (adj == OOM_SCORE_ADJ_MIN) { task_unlock(p); return 0; } /* * The baseline for the badness score is the proportion of RAM that each * task's rss, pagetable and swap space use. */ points = get_mm_rss(p->mm) + p->mm->nr_ptes + get_mm_counter(p->mm, MM_SWAPENTS); task_unlock(p); /* * Root processes get 3% bonus, just like the __vm_enough_memory() * implementation used by LSMs. */ if (has_capability_noaudit(p, CAP_SYS_ADMIN)) adj -= 30; /* Normalize to oom_score_adj units */ adj *= totalpages / 1000; points += adj; /* * Never return 0 for an eligible task regardless of the root bonus and * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here). */ return points > 0 ? points : 1; }
enum oom_scan_t oom_scan_process_thread(struct task_struct *task, unsigned long totalpages, const nodemask_t *nodemask, bool force_kill) { if (task->exit_state) { #ifdef CONFIG_OOM_SCAN_WA_PREVENT_WRONG_SEARCH if (task->pid == task->tgid) return OOM_SCAN_SKIP_SEARCH_THREAD; #endif return OOM_SCAN_CONTINUE; } if (oom_unkillable_task(task, NULL, nodemask)) return OOM_SCAN_CONTINUE; /* * This task already has access to memory reserves and is being killed. * Don't allow any other task to have access to the reserves. */ if (test_tsk_thread_flag(task, TIF_MEMDIE)) { if (unlikely(frozen(task))) __thaw_task(task); if (!force_kill) return OOM_SCAN_ABORT; } if (!task->mm) return OOM_SCAN_CONTINUE; /* * If task is allocating a lot of memory and has been marked to be * killed first if it triggers an oom, then select it. */ if (oom_task_origin(task)) return OOM_SCAN_SELECT; if (task->flags & PF_EXITING && !force_kill) { /* * If this task is not being ptraced on exit, then wait for it * to finish before killing some other task unnecessarily. */ if (!(task->group_leader->ptrace & PT_TRACE_EXIT)) return OOM_SCAN_ABORT; } return OOM_SCAN_OK; }
/** * oom_badness - heuristic function to determine which candidate task to kill * @p: task struct of which task we should calculate * @totalpages: total present RAM allowed for page allocation * @memcg: task's memory controller, if constrained * @nodemask: nodemask passed to page allocator for mempolicy ooms * * The heuristic for determining which task to kill is made to be as simple and * predictable as possible. The goal is to return the highest value for the * task consuming the most memory to avoid subsequent oom failures. */ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, unsigned long totalpages) { long points; long adj; if (oom_unkillable_task(p, memcg, nodemask)) return 0; p = find_lock_task_mm(p); if (!p) return 0; /* * Do not even consider tasks which are explicitly marked oom * unkillable or have been already oom reaped or the are in * the middle of vfork */ adj = (long)p->signal->oom_score_adj; if (adj == OOM_SCORE_ADJ_MIN || test_bit(MMF_OOM_SKIP, &p->mm->flags) || in_vfork(p)) { task_unlock(p); return 0; } /* * The baseline for the badness score is the proportion of RAM that each * task's rss, pagetable and swap space use. */ points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) + mm_pgtables_bytes(p->mm) / PAGE_SIZE; task_unlock(p); /* Normalize to oom_score_adj units */ adj *= totalpages / 1000; points += adj; /* * Never return 0 for an eligible task regardless of the root bonus and * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here). */ return points > 0 ? points : 1; }
enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, struct task_struct *task, unsigned long totalpages) { if (oom_unkillable_task(task, NULL, oc->nodemask)) return OOM_SCAN_CONTINUE; /* * This task already has access to memory reserves and is being killed. * Don't allow any other task to have access to the reserves. */ if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims)) return OOM_SCAN_ABORT; /* * If task is allocating a lot of memory and has been marked to be * killed first if it triggers an oom, then select it. */ if (oom_task_origin(task)) return OOM_SCAN_SELECT; return OOM_SCAN_OK; }
/* * Simple selection loop. We chose the process with the highest * number of 'points'. We expect the caller will lock the tasklist. * * (not docbooked, we don't want this one cluttering up the manual) */ static struct task_struct *select_bad_process(unsigned int *ppoints, unsigned long totalpages, struct mem_cgroup *memcg, const nodemask_t *nodemask, bool force_kill) { struct task_struct *g, *p; struct task_struct *chosen = NULL; *ppoints = 0; do_each_thread(g, p) { unsigned int points; if (p->exit_state) continue; if (oom_unkillable_task(p, memcg, nodemask)) continue; /* * This task already has access to memory reserves and is * being killed. Don't allow any other task access to the * memory reserve. * * Note: this may have a chance of deadlock if it gets * blocked waiting for another task which itself is waiting * for memory. Is there a better alternative? */ if (test_tsk_thread_flag(p, TIF_MEMDIE)) { if (unlikely(frozen(p))) __thaw_task(p); if (!force_kill) return ERR_PTR(-1UL); } if (!p->mm) continue; if (p->flags & PF_EXITING) { /* * If p is the current task and is in the process of * releasing memory, we allow the "kill" to set * TIF_MEMDIE, which will allow it to gain access to * memory reserves. Otherwise, it may stall forever. * * The loop isn't broken here, however, in case other * threads are found to have already been oom killed. */ if (p == current) { chosen = p; *ppoints = 1000; } else if (!force_kill) { /* * If this task is not being ptraced on exit, * then wait for it to finish before killing * some other task unnecessarily. */ if (!(p->group_leader->ptrace & PT_TRACE_EXIT)) return ERR_PTR(-1UL); } } points = oom_badness(p, memcg, nodemask, totalpages); if (points > *ppoints) { chosen = p; *ppoints = points; } } while_each_thread(g, p);
/** * oom_badness - heuristic function to determine which candidate task to kill * @p: task struct of which task we should calculate * @totalpages: total present RAM allowed for page allocation * * The heuristic for determining which task to kill is made to be as simple and * predictable as possible. The goal is to return the highest value for the * task consuming the most memory to avoid subsequent oom failures. */ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, const nodemask_t *nodemask, unsigned long totalpages) { int points; if (oom_unkillable_task(p, mem, nodemask)) return 0; p = find_lock_task_mm(p); if (!p) return 0; /* * Shortcut check for a thread sharing p->mm that is OOM_SCORE_ADJ_MIN * so the entire heuristic doesn't need to be executed for something * that cannot be killed. */ if (atomic_read(&p->mm->oom_disable_count)) { task_unlock(p); return 0; } /* * The memory controller may have a limit of 0 bytes, so avoid a divide * by zero, if necessary. */ if (!totalpages) totalpages = 1; /* * The baseline for the badness score is the proportion of RAM that each * task's rss, pagetable and swap space use. */ points = get_mm_rss(p->mm) + p->mm->nr_ptes; points += get_mm_counter(p->mm, MM_SWAPENTS); points *= 1000; points /= totalpages; task_unlock(p); /* * Root processes get 3% bonus, just like the __vm_enough_memory() * implementation used by LSMs. */ if (has_capability_noaudit(p, CAP_SYS_ADMIN)) points -= 30; /* * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may * either completely disable oom killing or always prefer a certain * task. */ points += p->signal->oom_score_adj; /* * Never return 0 for an eligible task that may be killed since it's * possible that no single user task uses more than 0.1% of memory and * no single admin tasks uses more than 3.0%. */ if (points <= 0) return 1; return (points < 1000) ? points : 1000; }
/* * If this is a system OOM (not a memcg OOM) and the task selected to be * killed is not already running at high (RT) priorities, speed up the * recovery by boosting the dying task to the lowest FIFO priority. * That helps with the recovery and avoids interfering with RT tasks. */ static void boost_dying_task_prio(struct task_struct *p, struct mem_cgroup *mem) { struct sched_param param = { .sched_priority = 1 }; if (mem) return; if (!rt_task(p)) sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); } /* * The process p may have detached its own ->mm while exiting or through * use_mm(), but one or more of its subthreads may still have a valid * pointer. Return p, or any of its subthreads with a valid ->mm, with * task_lock() held. */ struct task_struct *find_lock_task_mm(struct task_struct *p) { struct task_struct *t = p; do { task_lock(t); if (likely(t->mm)) return t; task_unlock(t); } while_each_thread(p, t); return NULL; } /* return true if the task is not adequate as candidate victim task. */ static bool oom_unkillable_task(struct task_struct *p, const struct mem_cgroup *mem, const nodemask_t *nodemask) { if (is_global_init(p)) return true; if (p->flags & PF_KTHREAD) return true; /* When mem_cgroup_out_of_memory() and p is not member of the group */ if (mem && !task_in_mem_cgroup(p, mem)) return true; /* p may not have freeable memory in nodemask */ if (!has_intersects_mems_allowed(p, nodemask)) return true; return false; } /** * oom_badness - heuristic function to determine which candidate task to kill * @p: task struct of which task we should calculate * @totalpages: total present RAM allowed for page allocation * * The heuristic for determining which task to kill is made to be as simple and * predictable as possible. The goal is to return the highest value for the * task consuming the most memory to avoid subsequent oom failures. */ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, const nodemask_t *nodemask, unsigned long totalpages) { long points; if (oom_unkillable_task(p, mem, nodemask)) return 0; p = find_lock_task_mm(p); if (!p) return 0; /* * Shortcut check for a thread sharing p->mm that is OOM_SCORE_ADJ_MIN * so the entire heuristic doesn't need to be executed for something * that cannot be killed. */ if (atomic_read(&p->mm->oom_disable_count)) { task_unlock(p); return 0; } /* * The memory controller may have a limit of 0 bytes, so avoid a divide * by zero, if necessary. */ if (!totalpages) totalpages = 1; /* * The baseline for the badness score is the proportion of RAM that each * task's rss, pagetable and swap space use. */ points = get_mm_rss(p->mm) + p->mm->nr_ptes; points += get_mm_counter(p->mm, swap_usage); points *= 1000; points /= totalpages; task_unlock(p); /* * Root processes get 3% bonus, just like the __vm_enough_memory() * implementation used by LSMs. */ if (has_capability_noaudit(p, CAP_SYS_ADMIN)) points -= 30; /* * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may * either completely disable oom killing or always prefer a certain * task. */ points += p->signal->oom_score_adj; /* * Never return 0 for an eligible task that may be killed since it's * possible that no single user task uses more than 0.1% of memory and * no single admin tasks uses more than 3.0%. */ if (points <= 0) return 1; return (points < 1000) ? points : 1000; }
/** * out_of_memory - kill the "best" process when we run out of memory * @oc: pointer to struct oom_control * * If we run out of memory, we have the choice between either * killing a random task (bad), letting the system crash (worse) * OR try to be smart about which process to kill. Note that we * don't have to be perfect here, we just have to be good. */ bool out_of_memory(struct oom_control *oc) { unsigned long freed = 0; enum oom_constraint constraint = CONSTRAINT_NONE; if (oom_killer_disabled) return false; if (!is_memcg_oom(oc)) { blocking_notifier_call_chain(&oom_notify_list, 0, &freed); if (freed > 0) /* Got some memory back in the last second. */ return true; } /* * If current has a pending SIGKILL or is exiting, then automatically * select it. The goal is to allow it to allocate so that it may * quickly exit and free its memory. */ if (task_will_free_mem(current)) { mark_oom_victim(current); wake_oom_reaper(current); return true; } /* * The OOM killer does not compensate for IO-less reclaim. * pagefault_out_of_memory lost its gfp context so we have to * make sure exclude 0 mask - all other users should have at least * ___GFP_DIRECT_RECLAIM to get here. */ if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS)) return true; /* * Check if there were limitations on the allocation (only relevant for * NUMA and memcg) that may require different handling. */ constraint = constrained_alloc(oc); if (constraint != CONSTRAINT_MEMORY_POLICY) oc->nodemask = NULL; check_panic_on_oom(oc, constraint); if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task && current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) && current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { get_task_struct(current); oc->chosen = current; oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)"); return true; } select_bad_process(oc); /* Found nothing?!?! */ if (!oc->chosen) { dump_header(oc, NULL); pr_warn("Out of memory and no killable processes...\n"); /* * If we got here due to an actual allocation at the * system level, we cannot survive this and will enter * an endless loop in the allocator. Bail out now. */ if (!is_sysrq_oom(oc) && !is_memcg_oom(oc)) panic("System is deadlocked on memory\n"); } if (oc->chosen && oc->chosen != (void *)-1UL) oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" : "Memory cgroup out of memory"); return !!oc->chosen; }