/*
 * Timer-expiry hook.
 *
 * Marks the context as expired, accounts one more tick and wakes the
 * sleepers on the context's wait queue. The timer is intentionally not
 * re-armed here: re-arming (only needed when tintv.tv64 != 0) is postponed
 * until the timer is accessed.
 */
static void timerfd_triggered(struct timerfd_ctx *ctx)
{
	unsigned long irqflags;

	spin_lock_irqsave(&ctx->wqh.lock, irqflags);

	ctx->expired = 1;
	ctx->ticks++;

	/* The wait-queue lock is already held, hence the _locked wake-up. */
	wake_up_locked(&ctx->wqh);

	spin_unlock_irqrestore(&ctx->wqh.lock, irqflags);
}
/*
 * pin_remove - unhook a pin and flag it as done.
 * @pin: the pin being removed.
 *
 * Step 1: under pin_lock, drop @pin from both lists it may sit on
 *         (m_list and s_list), leaving the nodes re-initialised.
 * Step 2: under the pin's own wait-queue lock (interrupts disabled),
 *         set pin->done and wake everybody sleeping on pin->wait.
 */
void pin_remove(struct fs_pin *pin)
{
	/* Step 1: list removal. */
	spin_lock(&pin_lock);
	hlist_del_init(&pin->m_list);
	hlist_del_init(&pin->s_list);
	spin_unlock(&pin_lock);

	/* Step 2: publish completion and wake the waiters. */
	spin_lock_irq(&pin->wait.lock);
	pin->done = 1;
	wake_up_locked(&pin->wait);
	spin_unlock_irq(&pin->wait.lock);
}
/*
 * Contended path of an interruptible semaphore acquire.
 *
 * The caller is queued as an exclusive waiter and sleeps in
 * TASK_INTERRUPTIBLE until the semaphore count allows it to proceed or a
 * signal is pending.
 *
 * Returns 0 when the loop exits through the count check, or -EINTR when
 * it exits because a signal was pending.
 */
fastcall int __sched __down_interruptible(struct semaphore * sem)
{
	struct task_struct *me = current;
	DECLARE_WAITQUEUE(waiter, me);
	unsigned long irqflags;
	int ret = 0;

	me->state = TASK_INTERRUPTIBLE;
	spin_lock_irqsave(&sem->wait.lock, irqflags);
	add_wait_queue_exclusive_locked(&sem->wait, &waiter);

	sem->sleepers++;
	for (;;) {
		int nsleepers = sem->sleepers;

		/*
		 * A pending signal degrades this into the "trylock failed"
		 * case: we are not going to sleep and, because the lock is
		 * contended, we cannot take it either. Fix up the count and
		 * leave.
		 */
		if (signal_pending(current)) {
			ret = -EINTR;
			sem->sleepers = 0;
			atomic_add(nsleepers, &sem->count);
			break;
		}

		/*
		 * Fold the other sleepers back into the count. They cannot
		 * interfere: we hold the spinlock embedded in the wait queue
		 * head. One is subtracted because we still hope to obtain
		 * the semaphore ourselves.
		 */
		if (!atomic_add_negative(nsleepers - 1, &sem->count)) {
			sem->sleepers = 0;
			break;
		}

		/* Only us left as a sleeper - matches the "- 1" above. */
		sem->sleepers = 1;
		spin_unlock_irqrestore(&sem->wait.lock, irqflags);

		schedule();

		spin_lock_irqsave(&sem->wait.lock, irqflags);
		me->state = TASK_INTERRUPTIBLE;
	}

	remove_wait_queue_locked(&sem->wait, &waiter);
	wake_up_locked(&sem->wait);
	spin_unlock_irqrestore(&sem->wait.lock, irqflags);

	me->state = TASK_RUNNING;
	return ret;
}
/*
 * read() handler for an eventfd.
 *
 * The user buffer must hold at least sizeof(__u64) bytes, otherwise
 * -EINVAL is returned. A non-zero counter is fetched and reset to zero.
 * With a zero counter the call returns -EAGAIN when the file is
 * O_NONBLOCK; otherwise it sleeps interruptibly until the counter turns
 * non-zero, or gives up with -ERESTARTSYS when a signal is pending.
 *
 * On success the fetched value is copied to @buf and sizeof(__u64) is
 * returned. If the copy fails -EFAULT is returned; by then the counter
 * has already been reset.
 */
static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
			    loff_t *ppos)
{
	struct eventfd_ctx *ctx = file->private_data;
	DECLARE_WAITQUEUE(waiter, current);
	__u64 value;
	ssize_t ret;

	if (count < sizeof(value))
		return -EINVAL;

	spin_lock_irq(&ctx->wqh.lock);

	ret = -EAGAIN;
	value = ctx->count;
	if (value > 0) {
		ret = sizeof(value);
	} else if (!(file->f_flags & O_NONBLOCK)) {
		/* Blocking read: wait for the counter to become non-zero. */
		__add_wait_queue(&ctx->wqh, &waiter);
		ret = 0;
		for (;;) {
			set_current_state(TASK_INTERRUPTIBLE);
			if (ctx->count > 0) {
				value = ctx->count;
				ret = sizeof(value);
				break;
			}
			if (signal_pending(current)) {
				ret = -ERESTARTSYS;
				break;
			}
			/* Drop the lock while sleeping, retake it after. */
			spin_unlock_irq(&ctx->wqh.lock);
			schedule();
			spin_lock_irq(&ctx->wqh.lock);
		}
		__remove_wait_queue(&ctx->wqh, &waiter);
		__set_current_state(TASK_RUNNING);
	}

	if (ret > 0) {
		/* Value consumed: reset the counter and wake any waiters. */
		ctx->count = 0;
		if (waitqueue_active(&ctx->wqh))
			wake_up_locked(&ctx->wqh);
	}

	spin_unlock_irq(&ctx->wqh.lock);

	/* User memory is only touched once the spinlock is released. */
	if (ret > 0 && put_user(value, (__u64 __user *) buf))
		return -EFAULT;

	return ret;
}
/*
 * Adds "n" to the eventfd counter "count".
 *
 * Returns "n" on success, or a value lower than "n" when the counter would
 * overflow; a negative "n" is rejected with -EINVAL. This function is meant
 * to be called by the kernel from paths that are not allowed to sleep.
 * Here the counter is allowed to reach ULLONG_MAX, and that state is
 * reported as an overflow condition by returning POLLERR to poll(2).
 */
int eventfd_signal(struct file *file, int n)
{
	struct eventfd_ctx *ctx = file->private_data;
	unsigned long irqflags;
	unsigned long long headroom;

	if (n < 0)
		return -EINVAL;

	spin_lock_irqsave(&ctx->wqh.lock, irqflags);

	/* Clamp the increment to what still fits below ULLONG_MAX. */
	headroom = ULLONG_MAX - ctx->count;
	if (headroom < n)
		n = (int) headroom;
	ctx->count += n;

	if (waitqueue_active(&ctx->wqh))
		wake_up_locked(&ctx->wqh);

	spin_unlock_irqrestore(&ctx->wqh.lock, irqflags);

	return n;
}
/*
 * write() handler for an eventfd.
 *
 * Reads a __u64 from @buf and adds it to the counter.
 *
 * Returns:
 *   sizeof(__u64)  the value was added to the counter;
 *   -EINVAL        @count is smaller than sizeof(__u64), or the value
 *                  written equals ULLONG_MAX;
 *   -EFAULT        the value could not be copied from user space;
 *   -EAGAIN        the addition does not fit and the file is O_NONBLOCK;
 *   -ERESTARTSYS   a signal was pending while waiting for room.
 *
 * The addition is accepted only while (ULLONG_MAX - count) is strictly
 * greater than the value written; otherwise a blocking writer sleeps
 * interruptibly until that becomes true.
 */
static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
			     loff_t *ppos)
{
	struct eventfd_ctx *ctx = file->private_data;
	DECLARE_WAITQUEUE(waiter, current);
	__u64 value;
	ssize_t ret;

	if (count < sizeof(value))
		return -EINVAL;
	if (copy_from_user(&value, buf, sizeof(value)))
		return -EFAULT;
	if (value == ULLONG_MAX)
		return -EINVAL;

	spin_lock_irq(&ctx->wqh.lock);

	ret = -EAGAIN;
	if (ULLONG_MAX - ctx->count > value) {
		ret = sizeof(value);
	} else if (!(tx_cache_get_file_ro(file)->f_flags & O_NONBLOCK)) {
		/* Blocking write: wait until the addition fits. */
		__add_wait_queue(&ctx->wqh, &waiter);
		ret = 0;
		for (;;) {
			set_current_state(TASK_INTERRUPTIBLE);
			if (ULLONG_MAX - ctx->count > value) {
				ret = sizeof(value);
				break;
			}
			if (signal_pending(current)) {
				ret = -ERESTARTSYS;
				break;
			}
			/* Drop the lock while sleeping, retake it after. */
			spin_unlock_irq(&ctx->wqh.lock);
			schedule();
			spin_lock_irq(&ctx->wqh.lock);
		}
		__remove_wait_queue(&ctx->wqh, &waiter);
		__set_current_state(TASK_RUNNING);
	}

	if (ret > 0) {
		/* Commit the addition and wake any waiters. */
		ctx->count += value;
		if (waitqueue_active(&ctx->wqh))
			wake_up_locked(&ctx->wqh);
	}

	spin_unlock_irq(&ctx->wqh.lock);

	return ret;
}
/*
 * The trylock did not succeed, so the decrement already applied to the
 * count has to be undone here.
 *
 * A single "cmpxchg" could have implemented trylock without any failure
 * path, but that would not work on a 386.
 *
 * Always returns 1.
 */
fastcall int __down_trylock(struct semaphore * sem)
{
	unsigned long irqflags;
	int adjust;

	spin_lock_irqsave(&sem->wait.lock, irqflags);

	/* Everybody recorded as sleeping, plus ourselves. */
	adjust = sem->sleepers + 1;
	sem->sleepers = 0;

	/*
	 * Give all of that back to the count in one go. Nobody else can
	 * interfere because we hold the spinlock inside the wait queue
	 * head. If the result is not negative, wake up the wait queue.
	 */
	if (!atomic_add_negative(adjust, &sem->count))
		wake_up_locked(&sem->wait);

	spin_unlock_irqrestore(&sem->wait.lock, irqflags);
	return 1;
}
/*
 * Contended path of an uninterruptible semaphore acquire.
 *
 * The caller is queued as an exclusive waiter and sleeps in
 * TASK_UNINTERRUPTIBLE until the semaphore count allows it to proceed.
 * On the way out it removes itself from the queue and issues a wake-up
 * on the wait queue.
 */
fastcall void __sched __down(struct semaphore * sem)
{
	struct task_struct *me = current;
	DECLARE_WAITQUEUE(waiter, me);
	unsigned long irqflags;

	me->state = TASK_UNINTERRUPTIBLE;
	spin_lock_irqsave(&sem->wait.lock, irqflags);
	add_wait_queue_exclusive_locked(&sem->wait, &waiter);

	sem->sleepers++;
	for (;;) {
		int nsleepers = sem->sleepers;

		/*
		 * Fold the other sleepers back into the count. They cannot
		 * interfere: we hold the spinlock embedded in the wait queue
		 * head. One is subtracted on our own behalf.
		 */
		if (!atomic_add_negative(nsleepers - 1, &sem->count)) {
			sem->sleepers = 0;
			break;
		}

		/* Only us left as a sleeper - matches the "- 1" above. */
		sem->sleepers = 1;
		spin_unlock_irqrestore(&sem->wait.lock, irqflags);

		schedule();

		spin_lock_irqsave(&sem->wait.lock, irqflags);
		me->state = TASK_UNINTERRUPTIBLE;
	}

	remove_wait_queue_locked(&sem->wait, &waiter);
	wake_up_locked(&sem->wait);
	spin_unlock_irqrestore(&sem->wait.lock, irqflags);

	me->state = TASK_RUNNING;
}
/** * ep_scan_ready_list - Scans the ready list in a way that makes possible for * the scan code, to call f_op->poll(). Also allows for * O(NumReady) performance. * * @ep: Pointer to the epoll private data structure. * @sproc: Pointer to the scan callback. * @priv: Private opaque data passed to the @sproc callback. * * Returns: The same integer error code returned by the @sproc callback. */ static int ep_scan_ready_list(struct eventpoll *ep, int (*sproc)(struct eventpoll *, struct list_head *, void *), void *priv) { int error, pwake = 0; unsigned long flags; struct epitem *epi, *nepi; LIST_HEAD(txlist); /* * We need to lock this because we could be hit by * eventpoll_release_file() and epoll_ctl(). */ mutex_lock(&ep->mtx); /* * Steal the ready list, and re-init the original one to the * empty list. Also, set ep->ovflist to NULL so that events * happening while looping w/out locks, are not lost. We cannot * have the poll callback to queue directly on ep->rdllist, * because we want the "sproc" callback to be able to do it * in a lockless way. */ spin_lock_irqsave(&ep->lock, flags); /* 这一步要注意, 首先, 所有监听到events的epitem都链到rdllist上了, * 但是这一步之后, 所有的epitem都转移到了txlist上, 而rdllist被清空了, * 要注意哦, rdllist已经被清空了! */ list_splice_init(&ep->rdllist, &txlist); /* ovflist, 在ep_poll_callback()里面我解释过, 此时此刻我们不希望 * 有新的event加入到ready list中了, 保存后下次再处理... */ ep->ovflist = NULL; spin_unlock_irqrestore(&ep->lock, flags); /* * Now call the callback function. */ /* 在这个回调函数里面处理每个epitem * sproc 就是 ep_send_events_proc, 下面会注释到. */ error = (*sproc)(ep, &txlist, priv); spin_lock_irqsave(&ep->lock, flags); /* * During the time we spent inside the "sproc" callback, some * other events might have been queued by the poll callback. * We re-insert them inside the main ready-list here. */ /* 现在我们来处理ovflist, 这些epitem都是我们在传递数据给用户空间时 * 监听到了事件. */ for (nepi = ep->ovflist; (epi = nepi) != NULL; nepi = epi->next, epi->next = EP_UNACTIVE_PTR) { /* * We need to check if the item is already in the list. 
* During the "sproc" callback execution time, items are * queued into ->ovflist but the "txlist" might already * contain them, and the list_splice() below takes care of them. */ /* 将这些直接放入readylist */ if (!ep_is_linked(&epi->rdllink)) list_add_tail(&epi->rdllink, &ep->rdllist); } /* * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after * releasing the lock, events will be queued in the normal way inside * ep->rdllist. */ ep->ovflist = EP_UNACTIVE_PTR; /* * Quickly re-inject items left on "txlist". */ /* 上一次没有处理完的epitem, 重新插入到ready list */ list_splice(&txlist, &ep->rdllist); /* ready list不为空, 直接唤醒... */ if (!list_empty(&ep->rdllist)) { /* * Wake up (if active) both the eventpoll wait list and * the ->poll() wait list (delayed after we release the lock). */ if (waitqueue_active(&ep->wq)) wake_up_locked(&ep->wq); if (waitqueue_active(&ep->poll_wait)) pwake++; } spin_unlock_irqrestore(&ep->lock, flags); mutex_unlock(&ep->mtx); /* We have to call this outside the lock */ if (pwake) ep_poll_safewake(&ep->poll_wait); return error; }
/*
 * The central wake-up callback: it is invoked when the state of a file
 * descriptor we are monitoring changes. The "key" argument is treated as
 * an unsigned long carrying the event bits.
 *
 * Always returns 1.
 */
static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
	struct epitem *item = ep_item_from_wait(wait);
	struct eventpoll *ep = item->ep;
	unsigned long irqflags;
	int wake_poll = 0;

	spin_lock_irqsave(&ep->lock, irqflags);

	/*
	 * An event mask without any poll(2) event means the descriptor is
	 * considered disabled. This is most likely the result of
	 * EPOLLONESHOT, which disables the descriptor once an event has been
	 * received until the next EPOLL_CTL_MOD is issued.
	 */
	if (!(item->event.events & ~EP_PRIVATE_BITS))
		goto unlock;

	/*
	 * Match the events delivered with the callback. Not every device
	 * reports them through "key" yet, so both cases must be handled:
	 * the mask is only tested when "key" is non-NULL. Nothing we are
	 * interested in means nothing to do.
	 */
	if (key && !((unsigned long) key & item->event.events))
		goto unlock;

	/*
	 * While events are being transferred to user space no locks can be
	 * held (user memory is being accessed, and f_op->poll() semantics
	 * forbid it). Events that fire during that window are neither
	 * delivered right away nor dropped: the item is chained onto
	 * ep->ovflist, once, and requeued later on.
	 */
	if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
		if (item->next == EP_UNACTIVE_PTR) {
			item->next = ep->ovflist;
			ep->ovflist = item;
		}
		goto unlock;
	}

	/* Queue the item on the ready list unless it is already there. */
	if (!ep_is_linked(&item->rdllink))
		list_add_tail(&item->rdllink, &ep->rdllist);

	/* Wake up tasks on the eventpoll wait list, if there are any. */
	if (waitqueue_active(&ep->wq))
		wake_up_locked(&ep->wq);

	/*
	 * If the epoll fd itself is being polled, its ->poll() wait list
	 * needs waking as well; that is deferred until the lock is dropped.
	 */
	if (waitqueue_active(&ep->poll_wait))
		wake_poll++;

unlock:
	spin_unlock_irqrestore(&ep->lock, irqflags);

	/* This wake-up has to happen outside the lock. */
	if (wake_poll)
		ep_poll_safewake(&ep->poll_wait);

	return 1;
}
/* * ep_insert()在epoll_ctl()中被调用, 完成往epollfd里面添加一个监听fd的工作 * tfile是fd在内核态的struct file结构 */ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, struct file *tfile, int fd) { int error, revents, pwake = 0; unsigned long flags; struct epitem *epi; struct ep_pqueue epq; /* 查看是否达到当前用户的最大监听数 */ if (unlikely(atomic_read(&ep->user->epoll_watches) >= max_user_watches)) return -ENOSPC; /* 从著名的slab中分配一个epitem */ if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL))) return -ENOMEM; /* Item initialization follow here ... */ /* 这些都是相关成员的初始化... */ INIT_LIST_HEAD(&epi->rdllink); INIT_LIST_HEAD(&epi->fllink); INIT_LIST_HEAD(&epi->pwqlist); epi->ep = ep; /* 这里保存了我们需要监听的文件fd和它的file结构 */ ep_set_ffd(&epi->ffd, tfile, fd); epi->event = *event; epi->nwait = 0; /* 这个指针的初值不是NULL哦... */ epi->next = EP_UNACTIVE_PTR; /* Initialize the poll table using the queue callback */ /* 好, 我们终于要进入到poll的正题了 */ epq.epi = epi; /* 初始化一个poll_table * 其实就是指定调用poll_wait(注意不是epoll_wait!!!)时的回调函数,和我们关心哪些events, * ep_ptable_queue_proc()就是我们的回调啦, 初值是所有event都关心 */ init_poll_funcptr(&epq.pt, ep_ptable_queue_proc); /* * Attach the item to the poll hooks and get current event bits. * We can safely use the file* here because its usage count has * been increased by the caller of this function. Note that after * this operation completes, the poll callback can start hitting * the new item. */ /* 这一部很关键, 也比较难懂, 完全是内核的poll机制导致的... * 首先, f_op->poll()一般来说只是个wrapper, 它会调用真正的poll实现, * 拿UDP的socket来举例, 这里就是这样的调用流程: f_op->poll(), sock_poll(), * udp_poll(), datagram_poll(), sock_poll_wait(), 最后调用到我们上面指定的 * ep_ptable_queue_proc()这个回调函数...(好深的调用路径...). * 完成这一步, 我们的epitem就跟这个socket关联起来了, 当它有状态变化时, * 会通过ep_poll_callback()来通知. * 最后, 这个函数还会查询当前的fd是不是已经有啥event已经ready了, 有的话 * 会将event返回. */ revents = tfile->f_op->poll(tfile, &epq.pt); /* * We have to check if something went wrong during the poll wait queue * install process. Namely an allocation for a wait queue failed due * high memory pressure. 
*/ error = -ENOMEM; if (epi->nwait < 0) goto error_unregister; /* Add the current item to the list of active epoll hook for this file */ /* 这个就是每个文件会将所有监听自己的epitem链起来 */ spin_lock(&tfile->f_lock); list_add_tail(&epi->fllink, &tfile->f_ep_links); spin_unlock(&tfile->f_lock); /* * Add the current item to the RB tree. All RB tree operations are * protected by "mtx", and ep_insert() is called with "mtx" held. */ /* 都搞定后, 将epitem插入到对应的eventpoll中去 */ ep_rbtree_insert(ep, epi); /* We have to drop the new item inside our item list to keep track of it */ spin_lock_irqsave(&ep->lock, flags); /* If the file is already "ready" we drop it inside the ready list */ /* 到达这里后, 如果我们监听的fd已经有事件发生, 那就要处理一下 */ if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) { /* 将当前的epitem加入到ready list中去 */ list_add_tail(&epi->rdllink, &ep->rdllist); /* Notify waiting tasks that events are available */ /* 谁在epoll_wait, 就唤醒它... */ if (waitqueue_active(&ep->wq)) wake_up_locked(&ep->wq); /* 谁在epoll当前的epollfd, 也唤醒它... */ if (waitqueue_active(&ep->poll_wait)) pwake++; } spin_unlock_irqrestore(&ep->lock, flags); atomic_inc(&ep->user->epoll_watches); /* We have to call this outside the lock */ if (pwake) ep_poll_safewake(&ep->poll_wait); return 0; error_unregister: ep_unregister_pollwait(ep, epi); /* * We need to do this because an event could have been arrived on some * allocated wait queue. Note that we don't care about the ep->ovflist * list, since that is used/cleaned only inside a section bound by "mtx". * And ep_insert() is called with "mtx" held. */ spin_lock_irqsave(&ep->lock, flags); if (ep_is_linked(&epi->rdllink)) list_del_init(&epi->rdllink); spin_unlock_irqrestore(&ep->lock, flags); kmem_cache_free(epi_cache, epi); return error; }