/* Init poll structure */
void vhost_poll_init(struct vhost_poll *poll, work_func_t func,
		     unsigned long mask)
{
	INIT_WORK(&poll->work, func);
	init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup);
	init_poll_funcptr(&poll->table, vhost_poll_func);
	poll->mask = mask;
}
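The wakeup callback installed above is what ties the polled file's wait queue back to the vhost work item. A minimal sketch of what vhost_poll_wakeup() looks like for this workqueue-based variant is shown below; the module-level vhost_workqueue name is an assumption here, and the real driver may queue the work differently.

/*
 * Sketch only: called by the waitqueue core when the polled file signals
 * events.  "vhost_workqueue" is an assumed module-level workqueue.
 */
static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync,
			     void *key)
{
	struct vhost_poll *poll = container_of(wait, struct vhost_poll, wait);

	/* Ignore wakeups for events this poll instance does not care about */
	if (!((unsigned long)key & poll->mask))
		return 0;

	queue_work(vhost_workqueue, &poll->work);
	return 0;
}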
static void scif_poll_initwait(struct poll_wqueues *pwq)
{
	init_poll_funcptr(&pwq->pt, __pollwait);
	pwq->polling_task = current;
	pwq->triggered = 0;
	pwq->error = 0;
	pwq->table = NULL;
	pwq->inline_index = 0;
}
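For completeness, here is a simplified sketch of the __pollwait() callback registered above. The real fs/select.c version differs across kernel releases (for example in how the event key is propagated and whether a pollwake() trampoline is used); poll_get_entry() is the select.c helper that hands out a poll_table_entry from pwq's inline array or an overflow page.

/*
 * Simplified sketch of __pollwait() (fs/select.c).  Every poll_wait()
 * call made by a driver's f_op->poll() lands here and queues an entry
 * on the driver's wait queue head on behalf of the polling task.
 */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
		       poll_table *p)
{
	struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
	struct poll_table_entry *entry = poll_get_entry(pwq);

	if (!entry)
		return;
	get_file(filp);			/* pin the file while we wait on it */
	entry->filp = filp;
	entry->wait_address = wait_address;
	init_waitqueue_entry(&entry->wait, pwq->polling_task);
	add_wait_queue(wait_address, &entry->wait);
}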
/* Init poll structure */
void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
		     unsigned long mask, struct vhost_dev *dev)
{
	init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup);
	init_poll_funcptr(&poll->table, vhost_poll_func);
	poll->mask = mask;
	poll->dev = dev;
	vhost_work_init(&poll->work, fn);
}
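Both vhost variants rely on the same pair of helpers: vhost_poll_func() is the poll_table callback that actually adds poll->wait to the file's wait queue, and a start routine hands the table to f_op->poll(). The sketch below shows how they fit together; it follows drivers/vhost/vhost.c of roughly this vintage and should be read as illustrative rather than the exact source.

/* poll_table callback: called back from f_op->poll() via poll_wait() */
static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
			    poll_table *pt)
{
	struct vhost_poll *poll = container_of(pt, struct vhost_poll, table);

	add_wait_queue(wqh, &poll->wait);
}

/* Start polling a file: registers the callback and flushes pending events */
void vhost_poll_start(struct vhost_poll *poll, struct file *file)
{
	unsigned long mask;

	mask = file->f_op->poll(file, &poll->table);
	if (mask)
		vhost_poll_wakeup(&poll->wait, 0, 0, (void *)mask);
}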
static int virqfd_enable(struct vfio_pci_device *vdev,
			 int (*handler)(struct vfio_pci_device *, void *),
			 void (*thread)(struct vfio_pci_device *, void *),
			 void *data, struct virqfd **pvirqfd, int fd)
{
	struct fd irqfd;
	struct eventfd_ctx *ctx;
	struct virqfd *virqfd;
	int ret = 0;
	unsigned int events;

	virqfd = kzalloc(sizeof(*virqfd), GFP_KERNEL);
	if (!virqfd)
		return -ENOMEM;

	virqfd->pvirqfd = pvirqfd;
	virqfd->vdev = vdev;
	virqfd->handler = handler;
	virqfd->thread = thread;
	virqfd->data = data;

	INIT_WORK(&virqfd->shutdown, virqfd_shutdown);
	INIT_WORK(&virqfd->inject, virqfd_inject);

	irqfd = fdget(fd);
	if (!irqfd.file) {
		ret = -EBADF;
		goto err_fd;
	}

	ctx = eventfd_ctx_fileget(irqfd.file);
	if (IS_ERR(ctx)) {
		ret = PTR_ERR(ctx);
		goto err_ctx;
	}

	virqfd->eventfd = ctx;

	/*
	 * virqfds can be released by closing the eventfd or directly
	 * through ioctl.  These are both done through a workqueue, so
	 * we update the pointer to the virqfd under lock to avoid
	 * pushing multiple jobs to release the same virqfd.
	 */
	spin_lock_irq(&vdev->irqlock);

	if (*pvirqfd) {
		spin_unlock_irq(&vdev->irqlock);
		ret = -EBUSY;
		goto err_busy;
	}
	*pvirqfd = virqfd;

	spin_unlock_irq(&vdev->irqlock);

	/*
	 * Install our own custom wake-up handling so we are notified via
	 * a callback whenever someone signals the underlying eventfd.
	 */
	init_waitqueue_func_entry(&virqfd->wait, virqfd_wakeup);
	init_poll_funcptr(&virqfd->pt, virqfd_ptable_queue_proc);

	events = irqfd.file->f_op->poll(irqfd.file, &virqfd->pt);

	/*
	 * Check if there was an event already pending on the eventfd
	 * before we registered and trigger it as if we didn't miss it.
	 */
	if (events & POLLIN) {
		if ((!handler || handler(vdev, data)) && thread)
			schedule_work(&virqfd->inject);
	}

	/*
	 * Do not drop the file until the irqfd is fully initialized,
	 * otherwise we might race against the POLLHUP.
	 */
	fdput(irqfd);

	return 0;

err_busy:
	eventfd_ctx_put(ctx);
err_ctx:
	fdput(irqfd);
err_fd:
	kfree(virqfd);

	return ret;
}
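The two callbacks registered just before the f_op->poll() call above follow the familiar container_of pattern. The sketch below is a simplified reading of drivers/vfio/pci/vfio_pci_intrs.c: virqfd_ptable_queue_proc() hooks virqfd->wait onto the eventfd's wait queue, and virqfd_wakeup() reacts to POLLIN by running the fast handler or deferring to the inject work item. The real wakeup callback also handles POLLHUP teardown, which is omitted here.

/* poll_table callback: hook virqfd->wait onto the eventfd's wait queue */
static void virqfd_ptable_queue_proc(struct file *file,
				     wait_queue_head_t *wqh, poll_table *pt)
{
	struct virqfd *virqfd = container_of(pt, struct virqfd, pt);

	add_wait_queue(wqh, &virqfd->wait);
}

/* Wakeup callback, simplified: the real one also tears down on POLLHUP */
static int virqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync,
			 void *key)
{
	struct virqfd *virqfd = container_of(wait, struct virqfd, wait);
	unsigned long flags = (unsigned long)key;

	if (flags & POLLIN) {
		/* Signalled: run the fast handler, defer to a work item if needed */
		if ((!virqfd->handler ||
		     virqfd->handler(virqfd->vdev, virqfd->data)) &&
		    virqfd->thread)
			schedule_work(&virqfd->inject);
	}

	return 0;
}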
void poll_initwait(struct poll_wqueues *pwq)
{
	init_poll_funcptr(&pwq->pt, __pollwait);
	pwq->error = 0;
	pwq->table = NULL;
}
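The counterpart on the driver side is an f_op->poll() implementation that calls poll_wait(); whichever callback was installed with init_poll_funcptr() (__pollwait here, ep_ptable_queue_proc() for epoll, vhost_poll_func() for vhost) is what poll_wait() ends up invoking. A hypothetical character-device example follows; struct mydev and its fields are invented purely for illustration.

/* Hypothetical device state: a wait queue plus a readiness flag */
struct mydev {
	wait_queue_head_t read_wq;	/* woken by the interrupt/producer path */
	bool data_ready;
};

static unsigned int mydev_poll(struct file *file, poll_table *wait)
{
	struct mydev *dev = file->private_data;
	unsigned int mask = 0;

	/* Registers read_wq with the poll_table callback; never blocks itself */
	poll_wait(file, &dev->read_wq, wait);

	if (dev->data_ready)
		mask |= POLLIN | POLLRDNORM;

	return mask;
}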
/*
 * ep_insert() is called from epoll_ctl(); it does the work of adding a
 * monitored fd to the epoll fd.
 * tfile is the fd's in-kernel struct file.
 */
static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
		     struct file *tfile, int fd)
{
	int error, revents, pwake = 0;
	unsigned long flags;
	struct epitem *epi;
	struct ep_pqueue epq;

	/* Check whether the current user has reached its watch limit */
	if (unlikely(atomic_read(&ep->user->epoll_watches) >=
		     max_user_watches))
		return -ENOSPC;
	/* Allocate an epitem from the slab cache */
	if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
		return -ENOMEM;

	/* Item initialization follows here ... */
	/* Initialize the relevant members... */
	INIT_LIST_HEAD(&epi->rdllink);
	INIT_LIST_HEAD(&epi->fllink);
	INIT_LIST_HEAD(&epi->pwqlist);
	epi->ep = ep;
	/* Record the fd we want to monitor together with its struct file */
	ep_set_ffd(&epi->ffd, tfile, fd);
	epi->event = *event;
	epi->nwait = 0;
	/* Note that this pointer does not start out as NULL... */
	epi->next = EP_UNACTIVE_PTR;

	/* Initialize the poll table using the queue callback */
	/* Now we finally get to the poll part */
	epq.epi = epi;
	/*
	 * Initialize a poll_table.
	 * This simply registers the callback to be invoked from poll_wait()
	 * (note: not epoll_wait()!) and the events we care about;
	 * ep_ptable_queue_proc() is that callback, and initially all events
	 * are of interest.
	 */
	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

	/*
	 * Attach the item to the poll hooks and get current event bits.
	 * We can safely use the file* here because its usage count has
	 * been increased by the caller of this function. Note that after
	 * this operation completes, the poll callback can start hitting
	 * the new item.
	 */
	/*
	 * This step is crucial and somewhat hard to follow; it is entirely a
	 * consequence of the kernel's poll mechanism.
	 * First, f_op->poll() is usually just a wrapper that calls the real
	 * poll implementation.  Taking a UDP socket as an example, the call
	 * chain is: f_op->poll() -> sock_poll() -> udp_poll() ->
	 * datagram_poll() -> sock_poll_wait(), which finally reaches the
	 * ep_ptable_queue_proc() callback we registered above (a rather
	 * deep call path...).
	 * Once this completes, our epitem is hooked up to the socket, and
	 * whenever its state changes we are notified via ep_poll_callback().
	 * Finally, this call also checks whether any event is already ready
	 * on the fd and, if so, returns it.
	 */
	revents = tfile->f_op->poll(tfile, &epq.pt);

	/*
	 * We have to check if something went wrong during the poll wait queue
	 * install process. Namely an allocation for a wait queue failed due
	 * to high memory pressure.
	 */
	error = -ENOMEM;
	if (epi->nwait < 0)
		goto error_unregister;

	/* Add the current item to the list of active epoll hooks for this file */
	/* Every file chains together all of the epitems that are watching it */
	spin_lock(&tfile->f_lock);
	list_add_tail(&epi->fllink, &tfile->f_ep_links);
	spin_unlock(&tfile->f_lock);

	/*
	 * Add the current item to the RB tree. All RB tree operations are
	 * protected by "mtx", and ep_insert() is called with "mtx" held.
	 */
	/* With everything set up, insert the epitem into its eventpoll */
	ep_rbtree_insert(ep, epi);

	/* We have to drop the new item inside our item list to keep track of it */
	spin_lock_irqsave(&ep->lock, flags);

	/* If the file is already "ready" we drop it inside the ready list */
	/* If the fd we are watching already has events pending, handle them now */
	if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
		/* Add the current epitem to the ready list */
		list_add_tail(&epi->rdllink, &ep->rdllist);

		/* Notify waiting tasks that events are available */
		/* Wake up whoever is blocked in epoll_wait()... */
		if (waitqueue_active(&ep->wq))
			wake_up_locked(&ep->wq);
		/* ...and whoever is polling this epoll fd itself */
		if (waitqueue_active(&ep->poll_wait))
			pwake++;
	}

	spin_unlock_irqrestore(&ep->lock, flags);

	atomic_inc(&ep->user->epoll_watches);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&ep->poll_wait);

	return 0;

error_unregister:
	ep_unregister_pollwait(ep, epi);

	/*
	 * We need to do this because an event could have arrived on some
	 * allocated wait queue. Note that we don't care about the ep->ovflist
	 * list, since that is used/cleaned only inside a section bound by "mtx".
	 * And ep_insert() is called with "mtx" held.
	 */
	spin_lock_irqsave(&ep->lock, flags);
	if (ep_is_linked(&epi->rdllink))
		list_del_init(&epi->rdllink);
	spin_unlock_irqrestore(&ep->lock, flags);

	kmem_cache_free(epi_cache, epi);

	return error;
}
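For reference, the callback registered via init_poll_funcptr() in ep_insert() looks roughly like the sketch below, following fs/eventpoll.c of the same vintage: it allocates an eppoll_entry, installs ep_poll_callback() as the wait queue wakeup function, adds the entry to the file's wait queue, and chains it onto the epitem so ep_unregister_pollwait() can undo it later. An allocation failure is signalled back to ep_insert() through epi->nwait, which is exactly the nwait < 0 check above.

static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
				 poll_table *pt)
{
	struct epitem *epi = ep_item_from_epqueue(pt);
	struct eppoll_entry *pwq;

	if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
		/* ep_poll_callback() will run when the watched fd signals events */
		init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
		pwq->whead = whead;
		pwq->base = epi;
		add_wait_queue(whead, &pwq->wait);
		list_add_tail(&pwq->llink, &epi->pwqlist);
		epi->nwait++;
	} else {
		/* Signal the allocation failure back to ep_insert() */
		epi->nwait = -1;
	}
}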