static inline long __sched
do_wait_for_common(struct completion *x,
                   long (*action)(long), long timeout, int state)
{
        if (!x->done) {
                DECLARE_WAITQUEUE(wait, current);

                __add_wait_queue_tail_exclusive(&x->wait, &wait);
                do {
                        if (signal_pending_state(state, current)) {
                                timeout = -ERESTARTSYS;
                                break;
                        }
                        __set_current_state(state);
                        spin_unlock_irq(&x->wait.lock);
                        timeout = action(timeout);
                        spin_lock_irq(&x->wait.lock);
                } while (!x->done && timeout);
                __remove_wait_queue(&x->wait, &wait);
                if (!x->done)
                        return timeout;
        }
        x->done--;
        return timeout ?: 1;
}
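/*
 * do_wait_for_common() is the slowpath shared by the wait_for_completion*()
 * family. A minimal, hedged sketch of how a caller typically reaches it --
 * demo_done/demo_worker/demo_wait are hypothetical names, only the
 * completion API calls are real:
 */
static DECLARE_COMPLETION(demo_done);

static int demo_worker(void *unused)
{
        /* ... do work ... */
        complete(&demo_done);           /* increments done, wakes one waiter */
        return 0;
}

static int demo_wait(void)
{
        /* Sleeps in do_wait_for_common() with state == TASK_INTERRUPTIBLE;
         * returns -ERESTARTSYS if a signal arrives first. */
        if (wait_for_completion_interruptible(&demo_done))
                return -ERESTARTSYS;
        return 0;
}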
/*
 * Because this function is inlined, the 'state' parameter will be
 * constant, and thus optimised away by the compiler.  Likewise the
 * 'timeout' parameter for the cases without timeouts.
 */
static inline int __sched __down_common(struct semaphore *sem, long state,
                                                                long timeout)
{
        struct task_struct *task = current;
        struct semaphore_waiter waiter;

        list_add_tail(&waiter.list, &sem->wait_list);
        waiter.task = task;
        waiter.up = 0;

        for (;;) {
                if (signal_pending_state(state, task))
                        goto interrupted;
                if (timeout <= 0)
                        goto timed_out;
                __set_task_state(task, state);
                spin_unlock_irq(&sem->lock);
                timeout = schedule_timeout(timeout);
                spin_lock_irq(&sem->lock);
                if (waiter.up)
                        return 0;
        }

 timed_out:
        list_del(&waiter.list);
        return -ETIME;

 interrupted:
        list_del(&waiter.list);
        return -EINTR;
}
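/*
 * For context, kernel/semaphore.c instantiates this common helper once per
 * task state, so the constant-folding the comment above describes can
 * happen. Approximately (paraphrased from the same era, not verbatim):
 */
static noinline void __sched __down(struct semaphore *sem)
{
        __down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}

static noinline int __sched __down_interruptible(struct semaphore *sem)
{
        return __down_common(sem, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}

static noinline int __sched __down_killable(struct semaphore *sem)
{
        return __down_common(sem, TASK_KILLABLE, MAX_SCHEDULE_TIMEOUT);
}

static noinline int __sched __down_timeout(struct semaphore *sem, long jiffies)
{
        return __down_common(sem, TASK_UNINTERRUPTIBLE, jiffies);
}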
/*
 * schedule() is the main scheduler function.
 */
asmlinkage void __sched schedule(void)
{
        struct task_struct *prev, *next;
        unsigned long *switch_count;
        struct rq *rq;
        int cpu;

need_resched:
        preempt_disable();
        cpu = smp_processor_id();
        rq = cpu_rq(cpu);
        rcu_sched_qs(cpu);
        prev = rq->curr;
        switch_count = &prev->nivcsw;

        release_kernel_lock(prev);
need_resched_nonpreemptible:

        schedule_debug(prev);

        if (sched_feat(HRTICK))
                hrtick_clear(rq);

        raw_spin_lock_irq(&rq->lock);
        update_rq_clock(rq);
        clear_tsk_need_resched(prev);

        if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
                if (unlikely(signal_pending_state(prev->state, prev)))
                        prev->state = TASK_RUNNING;
                else
                        deactivate_task(rq, prev, 1);
                switch_count = &prev->nvcsw;
        }

        pre_schedule(rq, prev);

        if (unlikely(!rq->nr_running))
                idle_balance(cpu, rq);

        put_prev_task(rq, prev);
        next = pick_next_task(rq);

        if (likely(prev != next)) {
                sched_info_switch(prev, next);
                perf_event_task_sched_out(prev, next);

                rq->nr_switches++;
                rq->curr = next;
                ++*switch_count;

                context_switch(rq, prev, next); /* unlocks the rq */
                /*
                 * The context switch might have flipped the stack from
                 * under us; refresh the local variables. (The excerpt was
                 * truncated here; the tail below is restored from the
                 * same 2.6.33-era mainline.)
                 */
                cpu = smp_processor_id();
                rq = cpu_rq(cpu);
        } else
                raw_spin_unlock_irq(&rq->lock);

        post_schedule(rq);

        if (unlikely(reacquire_kernel_lock(current) < 0))
                goto need_resched_nonpreemptible;

        preempt_enable_no_resched();
        if (need_resched())
                goto need_resched;
}
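/*
 * The prev->state check above is what turns the canonical sleep pattern
 * into an actual sleep: a task marks itself TASK_INTERRUPTIBLE and calls
 * schedule(), which deactivates it unless a signal is already pending.
 * A hedged, illustrative sketch of that pattern -- wq/condition/demo_sleep
 * are hypothetical names, the wait API calls are real:
 */
static DECLARE_WAIT_QUEUE_HEAD(wq);
static bool condition;

static void demo_sleep(void)
{
        DEFINE_WAIT(wq_wait);

        for (;;) {
                prepare_to_wait(&wq, &wq_wait, TASK_INTERRUPTIBLE);
                if (condition)
                        break;
                if (signal_pending(current))    /* caller-side mirror of
                                                 * signal_pending_state() */
                        break;
                schedule();     /* deactivate_task() runs inside schedule() */
        }
        finish_wait(&wq, &wq_wait);
}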
// ARM10C 20170830
// x: &kthreadd_done, action: schedule_timeout, timeout: 0x7FFFFFFF, state: 2
static inline long __sched
do_wait_for_common(struct completion *x,
                   long (*action)(long), long timeout, int state)
{
        // x->done: (&kthreadd_done)->done: 0
        if (!x->done) {
                // current: kmem_cache#15-oX (struct task_struct) (pid: 1)
                DECLARE_WAITQUEUE(wait, current);
                // DECLARE_WAITQUEUE(wait, kmem_cache#15-oX (struct task_struct) (pid: 1)):
                // wait_queue_t wait =
                // {
                //     .private = kmem_cache#15-oX (struct task_struct) (pid: 1),
                //     .func = default_wake_function,
                //     .task_list = { NULL, NULL }
                // }

                // &x->wait: &(&kthreadd_done)->wait
                __add_wait_queue_tail_exclusive(&x->wait, &wait);
                // What __add_wait_queue_tail_exclusive did:
                // (&wait)->flags: ? | 0x01
                // inserted &(&wait)->task_list between &(&(&kthreadd_done)->wait)->task_list
                // and (&(&(&kthreadd_done)->wait)->task_list)->prev
                // in short: &(&wait)->task_list was appended at the tail of the list
                // headed by &(&(&kthreadd_done)->wait)->task_list
                //
                // (&(&(&kthreadd_done)->wait)->task_list)->prev = &(&wait)->task_list;
                // (&(&wait)->task_list)->next = &(&(&kthreadd_done)->wait)->task_list;
                // (&(&wait)->task_list)->prev = &(&(&kthreadd_done)->wait)->task_list;
                // (&(&(&kthreadd_done)->wait)->task_list)->next = &(&wait)->task_list;

                // session ended 2017/08/30
                // session resumed 2017/09/06

                do {
                        // state: 2, current: kmem_cache#15-oX (struct task_struct) (pid: 1)
                        // signal_pending_state(2, kmem_cache#15-oX (struct task_struct) (pid: 1)): 0
                        if (signal_pending_state(state, current)) {
                                timeout = -ERESTARTSYS;
                                break;
                        }
                        // state: 2
                        __set_current_state(state);
                        // What __set_current_state did:
                        // (kmem_cache#15-oX (struct task_struct) (pid: 1))->state: 2

                        // &x->wait.lock: &(&kthreadd_done)->wait.lock
                        spin_unlock_irq(&x->wait.lock);
                        // What spin_unlock_irq did:
                        // performed a spin unlock using &(&kthreadd_done)->wait.lock

                        // action: schedule_timeout, timeout: 0x7FFFFFFF
                        timeout = action(timeout);
                        spin_lock_irq(&x->wait.lock);
                } while (!x->done && timeout);
                __remove_wait_queue(&x->wait, &wait);
                if (!x->done)
                        return timeout;
        }
        x->done--;
        return timeout ?: 1;
}
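/*
 * For reference, the wake-up side paired with the trace above: pid 1
 * (kernel_init) blocks on &kthreadd_done until rest_init() in init/main.c
 * calls complete(&kthreadd_done). A paraphrased sketch of complete() from
 * kernels of this vintage (not verbatim):
 */
void complete(struct completion *x)
{
        unsigned long flags;

        spin_lock_irqsave(&x->wait.lock, flags);
        x->done++;                              /* consumed by x->done-- above */
        __wake_up_locked(&x->wait, TASK_NORMAL, 1);
        spin_unlock_irqrestore(&x->wait.lock, flags);
}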
/*
 * Lock a mutex (possibly interruptible), slowpath:
 */
static inline int __sched
__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
                    struct lockdep_map *nest_lock, unsigned long ip)
{
        struct task_struct *task = current;
        struct mutex_waiter waiter;
        unsigned long flags;

        preempt_disable();
        mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);

#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
        /*
         * Optimistic spinning.
         *
         * We try to spin for acquisition when we find that there are no
         * pending waiters and the lock owner is currently running on a
         * (different) CPU.
         *
         * The rationale is that if the lock owner is running, it is likely to
         * release the lock soon.
         *
         * Since this needs the lock owner, and this mutex implementation
         * doesn't track the owner atomically in the lock field, we need to
         * track it non-atomically.
         *
         * We can't do this for DEBUG_MUTEXES because that relies on wait_lock
         * to serialize everything.
         *
         * The mutex spinners are queued up using MCS lock so that only one
         * spinner can compete for the mutex. However, if mutex spinning isn't
         * going to happen, there is no point in going through the lock/unlock
         * overhead.
         */
        if (!mutex_can_spin_on_owner(lock))
                goto slowpath;

        for (;;) {
                struct task_struct *owner;
                struct mspin_node node;

                /*
                 * If there's an owner, wait for it to either
                 * release the lock or go to sleep.
                 */
                mspin_lock(MLOCK(lock), &node);
                owner = ACCESS_ONCE(lock->owner);
                if (owner && !mutex_spin_on_owner(lock, owner)) {
                        mspin_unlock(MLOCK(lock), &node);
                        break;
                }

                if ((atomic_read(&lock->count) == 1) &&
                    (atomic_cmpxchg(&lock->count, 1, 0) == 1)) {
                        lock_acquired(&lock->dep_map, ip);
                        mutex_set_owner(lock);
                        mspin_unlock(MLOCK(lock), &node);
                        preempt_enable();
                        return 0;
                }
                mspin_unlock(MLOCK(lock), &node);

                /*
                 * When there's no owner, we might have preempted between the
                 * owner acquiring the lock and setting the owner field. If
                 * we're an RT task, this can live-lock because we won't let
                 * the owner complete.
                 */
                if (!owner && (need_resched() || rt_task(task)))
                        break;

                /*
                 * The cpu_relax() call is a compiler barrier which forces
                 * everything in this loop to be re-loaded. We don't need
                 * memory barriers as we'll eventually observe the right
                 * values at the cost of a few extra spins.
                 */
                arch_mutex_cpu_relax();
        }
slowpath:
#endif
        spin_lock_mutex(&lock->wait_lock, flags);

        debug_mutex_lock_common(lock, &waiter);
        debug_mutex_add_waiter(lock, &waiter, task_thread_info(task));

        /* add waiting tasks to the end of the waitqueue (FIFO): */
        list_add_tail(&waiter.list, &lock->wait_list);
        waiter.task = task;

        if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, -1) == 1))
                goto done;

        lock_contended(&lock->dep_map, ip);

        for (;;) {
                /*
                 * Let's try to take the lock again - this is needed even if
                 * we get here for the first time (shortly after failing to
                 * acquire the lock), to make sure that we get a wakeup once
                 * it's unlocked. Later on, if we sleep, this is the
                 * operation that gives us the lock. We xchg it to -1, so
                 * that when we release the lock, we properly wake up the
                 * other waiters:
                 */
                if (MUTEX_SHOW_NO_WAITER(lock) &&
                    (atomic_xchg(&lock->count, -1) == 1))
                        break;

                /*
                 * got a signal? (This code gets eliminated in the
                 * TASK_UNINTERRUPTIBLE case.)
                 */
                if (unlikely(signal_pending_state(state, task))) {
                        mutex_remove_waiter(lock, &waiter,
                                            task_thread_info(task));
                        mutex_release(&lock->dep_map, 1, ip);
                        spin_unlock_mutex(&lock->wait_lock, flags);

                        debug_mutex_free_waiter(&waiter);
                        preempt_enable();
                        return -EINTR;
                }
                __set_task_state(task, state);

                /* didn't get the lock, go to sleep: */
                spin_unlock_mutex(&lock->wait_lock, flags);
                schedule_preempt_disabled();
                spin_lock_mutex(&lock->wait_lock, flags);
        }

done:
        lock_acquired(&lock->dep_map, ip);
        /* got the lock - rejoice! */
        mutex_remove_waiter(lock, &waiter, current_thread_info());
        mutex_set_owner(lock);

        /* set it to 0 if there are no waiters left: */
        if (likely(list_empty(&lock->wait_list)))
                atomic_set(&lock->count, 0);

        spin_unlock_mutex(&lock->wait_lock, flags);

        debug_mutex_free_waiter(&waiter);
        preempt_enable();

        return 0;
}
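/*
 * A minimal usage sketch of the interruptible path through
 * __mutex_lock_common() -- demo_lock/demo_op are hypothetical names, only
 * the mutex_* calls are real API:
 */
static DEFINE_MUTEX(demo_lock);

static int demo_op(void)
{
        /* Enters __mutex_lock_common() with state == TASK_INTERRUPTIBLE
         * when contended; a pending signal makes it return -EINTR. */
        if (mutex_lock_interruptible(&demo_lock))
                return -EINTR;

        /* ... critical section ... */

        mutex_unlock(&demo_lock);
        return 0;
}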
/**
 * percpu_ida_alloc - allocate a tag
 * @pool: pool to allocate from
 * @state: task state for prepare_to_wait
 *
 * Returns a tag - an integer in the range [0..nr_tags) (passed to
 * percpu_ida_init()) - or a negative errno: -ENOSPC on allocation failure,
 * -ERESTARTSYS if interrupted by a signal.
 *
 * Safe to be called from interrupt context (assuming it isn't passed
 * TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, of course).
 *
 * @state indicates whether or not to wait until a free id is available; if
 * passed a sleeping state we may sleep however long it takes until another
 * thread frees an id (same semantics as a mempool).
 *
 * Will not fail if passed TASK_UNINTERRUPTIBLE.
 */
int percpu_ida_alloc(struct percpu_ida *pool, int state)
{
        DEFINE_WAIT(wait);
        struct percpu_ida_cpu *tags;
        unsigned long flags;
        int tag = -ENOSPC;

        tags = raw_cpu_ptr(pool->tag_cpu);
        spin_lock_irqsave(&tags->lock, flags);

        /* Fastpath */
        if (likely(tags->nr_free)) {
                tag = tags->freelist[--tags->nr_free];
                spin_unlock_irqrestore(&tags->lock, flags);
                return tag;
        }
        spin_unlock_irqrestore(&tags->lock, flags);

        while (1) {
                spin_lock_irqsave(&pool->lock, flags);
                tags = this_cpu_ptr(pool->tag_cpu);

                /*
                 * prepare_to_wait() must come before steal_tags(), in case
                 * percpu_ida_free() on another cpu flips a bit in
                 * cpus_have_tags
                 *
                 * global lock held and irqs disabled, don't need percpu lock
                 */
                if (state != TASK_RUNNING)
                        prepare_to_wait(&pool->wait, &wait, state);

                if (!tags->nr_free)
                        alloc_global_tags(pool, tags);
                if (!tags->nr_free)
                        steal_tags(pool, tags);

                if (tags->nr_free) {
                        tag = tags->freelist[--tags->nr_free];
                        if (tags->nr_free)
                                cpumask_set_cpu(smp_processor_id(),
                                                &pool->cpus_have_tags);
                }

                spin_unlock_irqrestore(&pool->lock, flags);

                if (tag >= 0 || state == TASK_RUNNING)
                        break;

                if (signal_pending_state(state, current)) {
                        tag = -ERESTARTSYS;
                        break;
                }

                schedule();
        }

        if (state != TASK_RUNNING)
                finish_wait(&pool->wait, &wait);

        return tag;
}
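/*
 * A hedged usage sketch of the percpu_ida API around the allocator above --
 * demo_pool/demo_use_tags are hypothetical names; percpu_ida_init/alloc/
 * free/destroy are the real entry points from <linux/percpu_ida.h> in
 * kernels that carry this library:
 */
static struct percpu_ida demo_pool;

static int demo_use_tags(void)
{
        int err, tag;

        err = percpu_ida_init(&demo_pool, 128); /* tags 0..127 */
        if (err)
                return err;

        /* May sleep uninterruptibly until a tag is free; cannot fail. */
        tag = percpu_ida_alloc(&demo_pool, TASK_UNINTERRUPTIBLE);

        /* ... use tag ... */

        percpu_ida_free(&demo_pool, tag);
        percpu_ida_destroy(&demo_pool);
        return 0;
}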
/*
 * Lock a mutex (possibly interruptible), slowpath:
 */
static inline int __sched
__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
                    unsigned long ip)
{
        struct task_struct *task = current;
        struct mutex_waiter waiter;
        unsigned int old_val;
        unsigned long flags;

        spin_lock_mutex(&lock->wait_lock, flags);

        debug_mutex_lock_common(lock, &waiter);
        mutex_acquire(&lock->dep_map, subclass, 0, ip);
        debug_mutex_add_waiter(lock, &waiter, task_thread_info(task));

        /* add waiting tasks to the end of the waitqueue (FIFO): */
        list_add_tail(&waiter.list, &lock->wait_list);
        waiter.task = task;

        old_val = atomic_xchg(&lock->count, -1);
        if (old_val == 1)
                goto done;

        lock_contended(&lock->dep_map, ip);

        for (;;) {
                /*
                 * Let's try to take the lock again - this is needed even if
                 * we get here for the first time (shortly after failing to
                 * acquire the lock), to make sure that we get a wakeup once
                 * it's unlocked. Later on, if we sleep, this is the
                 * operation that gives us the lock. We xchg it to -1, so
                 * that when we release the lock, we properly wake up the
                 * other waiters:
                 */
                old_val = atomic_xchg(&lock->count, -1);
                if (old_val == 1)
                        break;

                /*
                 * got a signal? (This code gets eliminated in the
                 * TASK_UNINTERRUPTIBLE case.)
                 */
                if (unlikely(signal_pending_state(state, task))) {
                        mutex_remove_waiter(lock, &waiter,
                                            task_thread_info(task));
                        mutex_release(&lock->dep_map, 1, ip);
                        spin_unlock_mutex(&lock->wait_lock, flags);

                        debug_mutex_free_waiter(&waiter);
                        return -EINTR;
                }
                __set_task_state(task, state);

                /* didn't get the lock, go to sleep: */
                spin_unlock_mutex(&lock->wait_lock, flags);
                schedule();
                spin_lock_mutex(&lock->wait_lock, flags);
        }

done:
        lock_acquired(&lock->dep_map);
        /* got the lock - rejoice! */
        mutex_remove_waiter(lock, &waiter, task_thread_info(task));
        debug_mutex_set_owner(lock, task_thread_info(task));

        /* set it to 0 if there are no waiters left: */
        if (likely(list_empty(&lock->wait_list)))
                atomic_set(&lock->count, 0);

        spin_unlock_mutex(&lock->wait_lock, flags);

        debug_mutex_free_waiter(&waiter);

        return 0;
}
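/*
 * For context on this older slowpath: the mutex ->count field encodes
 * 1 = unlocked, 0 = locked with no waiters, -1 = locked with (possible)
 * waiters, and the entry points instantiate __mutex_lock_common() with the
 * task state baked in. A paraphrased sketch of those wrappers (not
 * verbatim):
 */
static noinline void __sched
__mutex_lock_slowpath(atomic_t *lock_count)
{
        struct mutex *lock = container_of(lock_count, struct mutex, count);

        __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, _RET_IP_);
}

static noinline int __sched
__mutex_lock_interruptible_slowpath(atomic_t *lock_count)
{
        struct mutex *lock = container_of(lock_count, struct mutex, count);

        return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, _RET_IP_);
}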
static void __sched notrace __schedule(bool preempt)
{
        struct task_struct *prev, *next;
        unsigned long *switch_count;
        struct rq *rq;
        int cpu;

        /* ==1== Find the runqueue rq of the current CPU and save the
         * currently running task rq->curr into prev. */
        cpu = smp_processor_id();
        rq = cpu_rq(cpu);
        prev = rq->curr;

        /*
         * do_exit() calls schedule() with preemption disabled as an exception;
         * however we must fix that up, otherwise the next task will see an
         * inconsistent (higher) preempt count.
         *
         * It also avoids the below schedule_debug() test from complaining
         * about this.
         */
        if (unlikely(prev->state == TASK_DEAD))
                preempt_enable_no_resched_notrace();

        /* Calling cond_resched() while kernel preemption is disabled is an
         * error; schedule_debug() is where such errors are caught. */
        schedule_debug(prev);

        if (sched_feat(HRTICK))
                hrtick_clear(rq);

        /* Disable local interrupts. */
        local_irq_disable();

        /* Update global RCU state: note that a context switch is
         * happening on this CPU. */
        rcu_note_context_switch();

        /*
         * Make sure that signal_pending_state()->signal_pending() below
         * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
         * done by the caller to avoid the race with signal_wake_up().
         */
        smp_mb__before_spinlock();

        /* Lock the runqueue. */
        raw_spin_lock(&rq->lock);
        lockdep_pin_lock(&rq->lock);

        rq->clock_skip_update <<= 1; /* promote REQ to ACT */

        /* Switch-count bookkeeping: assume an involuntary switch
         * (preemption) by default. */
        switch_count = &prev->nivcsw;

        /*
         * The scheduler inspects prev's state together with the preemption
         * flag: the branch below is taken only when prev is not runnable
         * and was not preempted in kernel mode.
         *
         * We cannot look at the preempt count alone, because some task
         * (e.g. one doing NIC polling) may call schedule() directly;
         * without checking prev->state we could mistake a TASK_RUNNING
         * task for a sleeper. We get here either through a voluntary
         * schedule() or through preemption, and preemption itself happens
         * in two cases: the time slice has expired, or it has not. When
         * the slice expires, the tick sets the task's need_resched flag
         * and preempt_schedule_irq() reschedules once the timer interrupt
         * returns.
         *
         * Normally we would remove prev from the runqueue rq here. But if
         * prev has a pending, non-blocked signal and its state is
         * TASK_INTERRUPTIBLE, we must not dequeue it: instead we set its
         * state back to TASK_RUNNING and leave it on the rq.
         *
         * So: not preempted in kernel mode, i.e. both of the following
         * hold: 1) the task is not runnable (prev->state != TASK_RUNNING),
         * 2) it was not preempted in kernel mode.
         */
        if (!preempt && prev->state) {
                /* prev has a pending non-blocked signal and is in an
                 * interruptible sleep state */
                if (unlikely(signal_pending_state(prev->state, prev))) {
                        /* so set its state back to TASK_RUNNING */
                        prev->state = TASK_RUNNING;
                } else {
                        /* otherwise prev really sleeps: remove it from the
                         * runqueue */
                        deactivate_task(rq, prev, DEQUEUE_SLEEP);
                        /* and mark that prev is no longer on a runqueue */
                        prev->on_rq = 0;

                        /*
                         * If a worker went to sleep, notify and ask workqueue
                         * whether it wants to wake up a task to maintain
                         * concurrency.
                         */
                        if (prev->flags & PF_WQ_WORKER) {
                                struct task_struct *to_wakeup;

                                to_wakeup = wq_worker_sleeping(prev);
                                if (to_wakeup)
                                        try_to_wake_up_local(to_wakeup);
                        }
                }
                /* Not preempted, so count this as a voluntary switch. */
                switch_count = &prev->nvcsw;
        }

        /* If prev is still queued on the runqueue, update the runqueue
         * clock. */
        if (task_on_rq_queued(prev))
                update_rq_clock(rq);

        /* Pick the highest-priority task to run next. */
        next = pick_next_task(rq, prev);
        /* Clear prev's TIF_NEED_RESCHED flag. */
        clear_tsk_need_resched(prev);
        /* Clear the preempt-need-resched marker. */
        clear_preempt_need_resched();
        rq->clock_skip_update = 0;

        /* If prev and next are different tasks, switch to next. */
        if (likely(prev != next)) {
                rq->nr_switches++;      /* runqueue switch count */
                rq->curr = next;        /* next becomes the runqueue's curr */
                ++*switch_count;        /* per-task switch count */

                trace_sched_switch(preempt, prev, next);
                /* Context switch between the two tasks. */
                rq = context_switch(rq, prev, next); /* unlocks the rq */
        } else {
                /* prev == next: no context switch needed. */
                lockdep_unpin_lock(&rq->lock);
                raw_spin_unlock_irq(&rq->lock);
        }

        balance_callback(rq);
}
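/*
 * __schedule() is not called directly: in kernels of this vintage it is
 * driven by wrappers roughly like the following (paraphrased, not
 * verbatim). schedule() loops for the voluntary case with preempt == false;
 * the preempt_schedule*() family passes preempt == true:
 */
asmlinkage __visible void __sched schedule(void)
{
        struct task_struct *tsk = current;

        sched_submit_work(tsk);         /* flush plugged I/O before sleeping */
        do {
                preempt_disable();
                __schedule(false);      /* voluntary: preempt == false */
                sched_preempt_enable_no_resched();
        } while (need_resched());
}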
/*
 * schedule() is the main scheduler function.
 */
asmlinkage void __sched schedule(void)
{
        struct task_struct *prev, *next;
        unsigned long *switch_count;
        struct rq *rq;
        int cpu;

need_resched:
        preempt_disable();
        cpu = smp_processor_id();
        rq = cpu_rq(cpu);
        rcu_note_context_switch(cpu);
        prev = rq->curr;

        schedule_debug(prev);

        if (sched_feat(HRTICK))
                hrtick_clear(rq);

        raw_spin_lock_irq(&rq->lock);

        switch_count = &prev->nivcsw;
        if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
                if (unlikely(signal_pending_state(prev->state, prev))) {
                        prev->state = TASK_RUNNING;
                } else {
                        /*
                         * If a worker is going to sleep, notify and
                         * ask workqueue whether it wants to wake up a
                         * task to maintain concurrency.  If so, wake
                         * up the task.
                         */
                        if (prev->flags & PF_WQ_WORKER) {
                                struct task_struct *to_wakeup;

                                to_wakeup = wq_worker_sleeping(prev, cpu);
                                if (to_wakeup)
                                        try_to_wake_up_local(to_wakeup);
                        }
                        deactivate_task(rq, prev, DEQUEUE_SLEEP);

                        /*
                         * If we are going to sleep and we have plugged IO
                         * queued, make sure to submit it to avoid deadlocks.
                         */
                        if (blk_needs_flush_plug(prev)) {
                                raw_spin_unlock(&rq->lock);
                                blk_schedule_flush_plug(prev);
                                raw_spin_lock(&rq->lock);
                        }
                }
                switch_count = &prev->nvcsw;
        }

        pre_schedule(rq, prev);

        if (unlikely(!rq->nr_running))
                idle_balance(cpu, rq);

        put_prev_task(rq, prev);
        next = pick_next_task(rq);
        clear_tsk_need_resched(prev);
        rq->skip_clock_update = 0;

        if (likely(prev != next)) {
                rq->nr_switches++;
                rq->curr = next;
                ++*switch_count;

                context_switch(rq, prev, next); /* unlocks the rq */
                /*
                 * The context switch has flipped the stack from under us
                 * and restored the local variables which were saved when
                 * this task called schedule() in the past. prev == current
                 * is still correct, but it can be moved to another cpu/rq.
                 */
                cpu = smp_processor_id();
                rq = cpu_rq(cpu);
        } else
                raw_spin_unlock_irq(&rq->lock);

        post_schedule(rq);

        preempt_enable_no_resched();
        if (need_resched())
                goto need_resched;
}
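/*
 * The blk_needs_flush_plug() hunk above is why plugged I/O cannot deadlock
 * a sleeper. A hedged usage sketch of block plugging -- demo_submit is a
 * hypothetical name; blk_start_plug/blk_finish_plug are the real API, and
 * the two-argument submit_bio() form matches this kernel era:
 */
static void demo_submit(struct bio *bio)
{
        struct blk_plug plug;

        blk_start_plug(&plug);
        /* I/O submitted here is batched on the per-task plug list ... */
        submit_bio(READ, bio);
        /* ... and if we block before unplugging, schedule() flushes the
         * plug via blk_schedule_flush_plug(), so a waiter cannot deadlock
         * on I/O we queued but never issued. */
        blk_finish_plug(&plug);
}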