/**
 * Register \a link as an exclusive waiter on \a waitq.
 *
 * Rationale: on Linux older than 2.6.34 the wait queue is a FIFO list for
 * exclusive waiters.  That is not always desirable: every thread ends up
 * being woken in turn, even when the user only needs a few of them to be
 * active most of the time, and the rotation between different threads
 * pollutes the cache.  A LIFO list avoids this because the most recently
 * active thread is the one woken by default.
 *
 * The insertion is done while holding the wait queue's spinlock, with the
 * interrupt state saved before and restored after.
 *
 * NB: do not mix non-exclusive and exclusive waits on the same waitq when
 * add_wait_queue_exclusive_head is used.
 *
 * \param waitq  wait queue head to add the waiter to
 * \param link   wait queue entry to be added
 */
void
add_wait_queue_exclusive_head(wait_queue_head_t *waitq, wait_queue_t *link)
{
	unsigned long irqflags;

	spin_lock_irqsave(&LINUX_WAITQ_HEAD(waitq)->lock, irqflags);
	__add_wait_queue_exclusive(LINUX_WAITQ_HEAD(waitq), LINUX_WAITQ(link));
	spin_unlock_irqrestore(&LINUX_WAITQ_HEAD(waitq)->lock, irqflags);
}
/**
 * Register \a link as an exclusive waiter on \a waitq.
 *
 * Rationale: on Linux older than 2.6.34 the wait queue is a FIFO list for
 * exclusive waiters.  That is not always desirable: every thread ends up
 * being woken in turn, even when the user only needs a few of them to be
 * active most of the time, and the rotation between different threads
 * pollutes the cache.  A LIFO list avoids this because the most recently
 * active thread is the one woken by default.
 *
 * The insertion is done while holding waitq->lock, with the interrupt
 * state saved before and restored after.
 *
 * NB: do not mix non-exclusive and exclusive waits on the same waitq when
 * add_wait_queue_exclusive_head is used.
 *
 * \param waitq  wait queue head to add the waiter to
 * \param link   wait queue entry to be added
 */
void
add_wait_queue_exclusive_head(wait_queue_head_t *waitq, wait_queue_t *link)
{
	unsigned long irqflags;

	spin_lock_irqsave(&waitq->lock, irqflags);
	__add_wait_queue_exclusive(waitq, link);
	spin_unlock_irqrestore(&waitq->lock, irqflags);
}
/*
 * This is the function that actually puts the process executing
 * epoll_wait to sleep.
 *
 * @ep:        eventpoll instance; its lock, ready list (rdllist), wait
 *             queue (wq) and ovflist are used here.
 * @events:    user-space buffer, passed through to ep_send_events().
 * @maxevents: passed through to ep_send_events().
 * @timeout:   timeout in milliseconds; a negative value, or a value of
 *             EP_MAX_MSTIMEO or more, is mapped to MAX_SCHEDULE_TIMEOUT.
 *
 * Returns -EINTR if a signal was pending while waiting, otherwise 0 when
 * nothing was available, or whatever ep_send_events() returned.  When
 * ep_send_events() returns 0 and there is timeout left, the wait is retried.
 */
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
		   int maxevents, long timeout)
{
	int res, eavail;
	unsigned long flags;
	long jtimeout;
	wait_queue_t wait;

	/*
	 * Calculate the timeout by checking for the "infinite" value (-1)
	 * and the overflow condition. The passed timeout is in milliseconds,
	 * that's why (t * HZ) / 1000.
	 */
	/* Compute how long to sleep; milliseconds are converted using HZ. */
	jtimeout = (timeout < 0 || timeout >= EP_MAX_MSTIMEO) ?
		MAX_SCHEDULE_TIMEOUT : (timeout * HZ + 999) / 1000;

retry:
	spin_lock_irqsave(&ep->lock, flags);

	res = 0;
	/* If the ready list is not empty, skip sleeping and get straight to work. */
	if (list_empty(&ep->rdllist)) {
		/*
		 * We don't have any available event to return to the caller.
		 * We need to sleep here, and we will be woken up by
		 * ep_poll_callback() when events become available.
		 */
		/*
		 * Initialize a wait queue entry in preparation for suspending
		 * ourselves. Note that current is a macro denoting the
		 * current process.
		 */
		init_waitqueue_entry(&wait, current);
		__add_wait_queue_exclusive(&ep->wq, &wait);

		for (;;) {
			/*
			 * We don't want to sleep if the ep_poll_callback() sends us
			 * a wakeup in between. That's why we set the task state
			 * to TASK_INTERRUPTIBLE before doing the checks.
			 */
			/*
			 * Mark the current process as sleeping but wakeable by
			 * signals. Note this only sets the state for later:
			 * we are not actually asleep yet!
			 */
			set_current_state(TASK_INTERRUPTIBLE);
			/*
			 * If the ready list has entries by now, or the sleep
			 * time has already run out, do not sleep at all.
			 */
			if (!list_empty(&ep->rdllist) || !jtimeout)
				break;
			/* If a signal is pending, get up as well. */
			if (signal_pending(current)) {
				res = -EINTR;
				break;
			}

			/* Nothing is going on: unlock and go to sleep. */
			spin_unlock_irqrestore(&ep->lock, flags);
			/*
			 * We are woken up once jtimeout has elapsed. If
			 * ep_poll_callback() is called in the meantime, we are
			 * woken up right away without waiting out the timeout.
			 * To stress it once more: when ep_poll_callback() gets
			 * called is decided by the concrete implementation of
			 * the monitored fd, e.g. a socket or some device
			 * driver, because they are the ones holding the wait
			 * queue head; epoll and the current process are simply
			 * waiting.
			 */
			jtimeout = schedule_timeout(jtimeout);
			spin_lock_irqsave(&ep->lock, flags);
		}
		__remove_wait_queue(&ep->wq, &wait);

		/* OK, we are awake again. */
		set_current_state(TASK_RUNNING);
	}
	/*
	 * Is it worth trying to dig for events? Yes if the ready list has
	 * entries or ovflist is not EP_UNACTIVE_PTR.
	 */
	eavail = !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;

	spin_unlock_irqrestore(&ep->lock, flags);

	/*
	 * Try to transfer events to user space. In case we get 0 events and
	 * there's still timeout left over, we go trying again in search of
	 * more luck.
	 */
	/*
	 * If everything is fine and events have occurred, start preparing
	 * the data to copy to user space.
	 */
	if (!res && eavail &&
	    !(res = ep_send_events(ep, events, maxevents)) && jtimeout)
		goto retry;

	return res;
}