bool select_call::wait_os(bool zero_timeout)
{
	// Run the real OS select()/pselect() on the caller's fd sets.
	// zero_timeout: when true, poll only (zero timeout) regardless of the
	// user-supplied m_timeout. Always returns false - no CQ fd is part of
	// the sets in this path (see return comment at bottom).
	timeval to, *pto = NULL;
	timespec to_pselect, *pto_pselect = NULL;

	/* Avner: I put it in comment, because this logic is wrong

	// optimization: do not call os select if ALL fds are excluded
	// extend check to write/except fds
	if (m_rfd_count == m_n_exclude_fds)
		return;
	*/

	if (zero_timeout) {
		// Non-blocking poll: zeroed timeval
		to.tv_sec = to.tv_usec = 0;
		pto = &to;
	}
	else {
		pto = m_timeout;
	}

	// Restore original sets: copy the OS-side fd views saved earlier back
	// into the user's sets (only if prepare-to-poll previously ran and
	// modified them)
	if (m_b_run_prepare_to_poll) {
		if (m_readfds)	FD_COPY(m_readfds, &m_os_rfds, m_nfds);
		if (m_writefds)	FD_COPY(m_writefds, &m_os_wfds, m_nfds);
		if (m_exceptfds) FD_COPY(m_exceptfds, &m_orig_exceptfds, m_nfds);
	}

	__log_func("calling os select: %d", m_nfds);
	if (m_sigmask) {
		// pselect() takes a timespec - convert timeval (usec -> nsec)
		if (pto) {
			to_pselect.tv_sec = pto->tv_sec;
			to_pselect.tv_nsec = pto->tv_usec * 1000;
			pto_pselect = &to_pselect;
		}
		m_n_all_ready_fds = orig_os_api.pselect(m_nfds, m_readfds, m_writefds, m_exceptfds, pto_pselect, m_sigmask);
	} else {
		m_n_all_ready_fds = orig_os_api.select(m_nfds, m_readfds, m_writefds, m_exceptfds, pto);
	}

	if (m_n_all_ready_fds < 0) {
		// Propagate OS-level failure (errno set by select/pselect)
		vma_throw_object(io_mux_call::io_error);
	}
	if (m_n_all_ready_fds > 0) {
		__log_func("wait_os() returned with %d", m_n_all_ready_fds);
	}
	return false; // No cq_fd in select() event
}
void epoll_wait_call::init_offloaded_fds()
{
	// Cache the epfd-info's offloaded-fd array pointer and its count
	// pointer locally (copy offloaded_fds pointer and count).
	m_epfd_info->get_offloaded_fds_arr_and_size(&m_p_num_all_offloaded_fds, &m_p_all_offloaded_fds);
	// Snapshot the count value; the pointed-to counter may change later.
	m_num_all_offloaded_fds = *m_p_num_all_offloaded_fds; // TODO: fix orig ugly code, and then remove this

	__log_func("building: epfd=%d, m_epfd_info->get_fd_info().size()=%d, *m_p_num_all_offloaded_fds=%d", m_epfd, (int)m_epfd_info->get_fd_info().size(), (int)*m_p_num_all_offloaded_fds );
}
void select_call::set_wfd_ready(int fd) { // This function also checks that fd was in the original read set if (!FD_ISSET(fd, m_writefds) && FD_ISSET(fd, &m_orig_writefds)) { //TODO: why do we need the last 'if'?? FD_SET(fd, m_writefds); ++m_n_ready_wfds; // if (!FD_ISSET(fd, m_readfds)) ++m_n_all_ready_fds; __log_func("ready w fd: %d", fd); } }
void select_call::set_offloaded_wfd_ready(int fd_index) { if (m_p_offloaded_modes[fd_index] & OFF_WRITE) { //TODO: consider removing int fd = m_p_all_offloaded_fds[fd_index]; if (!FD_ISSET(fd, m_writefds)) { FD_SET(fd, m_writefds); ++m_n_ready_wfds; ++m_n_all_ready_fds; __log_func("ready offloaded w fd: %d", fd); } } }
bool epoll_wait_call::check_all_offloaded_sockets(uint64_t *p_poll_sn) { NOT_IN_USE(p_poll_sn); m_n_all_ready_fds = get_current_events(); if (!m_n_ready_rfds) { // check cq for acks ring_poll_and_process_element(&m_poll_sn, NULL); m_n_all_ready_fds = get_current_events(); } __log_func("m_n_all_ready_fds=%d, m_n_ready_rfds=%d, m_n_ready_wfds=%d", m_n_all_ready_fds, m_n_ready_rfds, m_n_ready_wfds); return m_n_all_ready_fds; }
void rfs_uc_tcp_gro::flush_gro_desc(void* pv_fd_ready_array)
{
	// Flush the currently aggregated GRO descriptor: when more than one
	// buffer was coalesced, patch the first segment's IP/TCP headers to
	// describe the merged payload, fix up the first buffer's pbuf to span
	// only its own payload, chain the trailing buffers' lengths backwards,
	// then dispatch the merged packet via the normal rfs_uc path.
	if (!m_b_active) return;

	if (m_gro_desc.buf_count > 1) {
		// Rewrite first-segment headers to reflect the aggregate:
		// total IP length, latest ACK and latest advertised window.
		// NOTE(review): ack/wnd appear to be stored already in network
		// byte order in m_gro_desc - only tot_len gets htons() here.
		m_gro_desc.p_ip_h->tot_len = htons(m_gro_desc.ip_tot_len);
		m_gro_desc.p_tcp_h->ack_seq = m_gro_desc.ack;
		m_gro_desc.p_tcp_h->window = m_gro_desc.wnd;

		if (m_gro_desc.ts_present) {
			// Update the TS echo-reply word inside the TCP timestamp option
			tcphdr_ts* p_tcp_ts_h = (tcphdr_ts*) m_gro_desc.p_tcp_h;
			p_tcp_ts_h->popts[2] = m_gro_desc.tsecr;
		}

		m_gro_desc.p_first->rx.tcp.gro = 1;

		// Re-seat the first buffer's lwip pbuf on its TCP payload only
		// (skip the transport header) and mark it as an external buffer.
		m_gro_desc.p_first->lwip_pbuf.pbuf.flags = PBUF_FLAG_IS_CUSTOM;
		m_gro_desc.p_first->lwip_pbuf.pbuf.tot_len = m_gro_desc.p_first->lwip_pbuf.pbuf.len = (m_gro_desc.p_first->sz_data - m_gro_desc.p_first->rx.tcp.n_transport_header_len);
		m_gro_desc.p_first->lwip_pbuf.pbuf.ref = 1;
		m_gro_desc.p_first->lwip_pbuf.pbuf.type = PBUF_REF;
		m_gro_desc.p_first->lwip_pbuf.pbuf.payload = (u8_t *)(m_gro_desc.p_first->p_buffer + m_gro_desc.p_first->rx.tcp.n_transport_header_len);

		m_gro_desc.p_first->rx.is_vma_thr = m_gro_desc.p_last->rx.is_vma_thr;

		// Accumulate tot_len back-to-front so each pbuf's tot_len covers
		// itself plus everything chained after it (p_first ends up with
		// the full aggregate length).
		for (mem_buf_desc_t* p_desc = m_gro_desc.p_last; p_desc != m_gro_desc.p_first; p_desc = p_desc->p_prev_desc) {
			p_desc->p_prev_desc->lwip_pbuf.pbuf.tot_len += p_desc->lwip_pbuf.pbuf.tot_len;
		}
	}

	// payload_sz below subtracts 40 - presumably IPv4 (20) + base TCP (20)
	// header bytes, i.e. no options counted - TODO confirm
	__log_func("Rx LRO TCP segment info: src_port=%d, dst_port=%d, flags='%s%s%s%s%s%s' seq=%u, ack=%u, win=%u, payload_sz=%u, num_bufs=%u", ntohs(m_gro_desc.p_tcp_h->source), ntohs(m_gro_desc.p_tcp_h->dest), m_gro_desc.p_tcp_h->urg?"U":"", m_gro_desc.p_tcp_h->ack?"A":"", m_gro_desc.p_tcp_h->psh?"P":"", m_gro_desc.p_tcp_h->rst?"R":"", m_gro_desc.p_tcp_h->syn?"S":"", m_gro_desc.p_tcp_h->fin?"F":"", ntohl(m_gro_desc.p_tcp_h->seq), ntohl(m_gro_desc.p_tcp_h->ack_seq), ntohs(m_gro_desc.p_tcp_h->window), m_gro_desc.ip_tot_len - 40, m_gro_desc.buf_count);

	// Hand the merged packet up; if no one consumed it, return the buffers
	// to the ring to avoid leaking them.
	if (!rfs_uc::rx_dispatch_packet(m_gro_desc.p_first, pv_fd_ready_array)) {
		m_p_ring->reclaim_recv_buffers_no_lock(m_gro_desc.p_first);
	}

	m_b_active = false;
}
// Build a select() mux call: validate nfds, set up stats, and scan the
// caller's read/write sets classifying every fd as either offloaded
// (VMA socket - recorded in m_p_all_offloaded_fds with its OFF_READ /
// OFF_WRITE mode) or OS-handled (mirrored into m_os_rfds / m_os_wfds).
select_call::select_call(int *off_fds_buffer, offloaded_mode_t *off_modes_buffer, int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, timeval *timeout, const sigset_t *__sigmask /* = NULL */) :
	io_mux_call(off_fds_buffer, off_modes_buffer, nfds, __sigmask),
	m_nfds(nfds), m_readfds(readfds), m_writefds(writefds), m_exceptfds(exceptfds),
	m_timeout(timeout), m_nfds_with_cq(0), m_b_run_prepare_to_poll(false)
{
	int fd;
	//socket_fd_api* temp_sock_fd_api = NULL;

	// Reject nfds values the fixed-size fd_set cannot represent
	if (m_nfds > FD_SETSIZE) {
		errno = ENOMEM;
		vma_throw_object(io_mux_call::io_error);
	}

	// create stats
	m_p_stats = &g_select_stats;
	vma_stats_instance_get_select_block(m_p_stats);

	// A NULL set means the caller isn't interested in that direction
	bool offloaded_read  = !!m_readfds;
	bool offloaded_write = !!m_writefds;

	if (offloaded_read || offloaded_write) {
		FD_ZERO(&m_os_rfds, m_nfds);
		FD_ZERO(&m_os_wfds, m_nfds);

		//covers the case of select(readfds = NULL) - use an internal
		// zeroed read set so the CQ fd can still be reported through it
		if(!m_readfds) {
			FD_ZERO(&m_cq_rfds, m_nfds);
			m_readfds = &m_cq_rfds;
		}

		// get offloaded fds in read set
		for (fd = 0; fd < m_nfds; ++fd) {
			bool check_read  = offloaded_read && FD_ISSET(fd, m_readfds);
			bool check_write = offloaded_write && FD_ISSET(fd, m_writefds);

			socket_fd_api* psock = fd_collection_get_sockfd(fd);

			if (psock && psock->get_type() == FD_TYPE_SOCKET) {
				// VMA-managed socket: record which directions to offload
				offloaded_mode_t off_mode = OFF_NONE;
				if (check_read)
					off_mode = (offloaded_mode_t)(off_mode | OFF_READ);
				if (check_write)
					off_mode = (offloaded_mode_t)(off_mode | OFF_WRITE);

				if (off_mode) {
					__log_func("---> fd=%d IS SET for read or write!", fd);

					m_p_all_offloaded_fds[m_num_all_offloaded_fds] = fd;
					m_p_offloaded_modes[m_num_all_offloaded_fds] = off_mode;
					m_num_all_offloaded_fds++;
					if (! psock->skip_os_select()) {
						if (check_read) {
							FD_SET(fd, &m_os_rfds);
							if (psock->is_readable(NULL)) {
								// Data already pending - report ready now
								io_mux_call::update_fd_array(&m_fd_ready_array, fd);
								m_n_ready_rfds++;
								m_n_all_ready_fds++;
							} else {
								// Instructing the socket to sample the OS immediately to prevent hitting EAGAIN on recvfrom(),
								// after iomux returned a shadow fd as ready (only for non-blocking sockets)
								psock->set_immediate_os_sample();
							}
						}
						if (check_write) {
							FD_SET(fd, &m_os_wfds);
						}
					}
					else
						__log_func("fd=%d must be skipped from os r select()", fd);
				}
			}
			else {
				// Non-VMA fd: handled purely by the OS select()
				if (check_read) {
					FD_SET(fd, &m_os_rfds);
				}
				if (check_write) {
					FD_SET(fd, &m_os_wfds);
				}
			}
		}
	}
	__log_func("num all offloaded_fds=%d", m_num_all_offloaded_fds);
}
bool select_call::wait(const timeval &elapsed) { timeval timeout, *pto = NULL; timespec to_pselect, *pto_pselect = NULL; BULLSEYE_EXCLUDE_BLOCK_START if (m_n_all_ready_fds > 0) { __log_panic("wait() called when there are ready fd's!!!"); // YossiE TODO make this and some more checks as debug assertions // In all functions } BULLSEYE_EXCLUDE_BLOCK_END // Restore original sets if (m_b_run_prepare_to_poll) { if (m_readfds) FD_COPY(m_readfds, &m_os_rfds, m_nfds); if (m_writefds) FD_COPY(m_writefds, &m_os_wfds, m_nfds); if (m_exceptfds)FD_COPY(m_exceptfds, &m_orig_exceptfds, m_nfds); } // Call OS select() on original sets + CQ epfd in read set if (m_readfds) FD_SET(m_cqepfd, m_readfds); if (m_timeout) { tv_sub(m_timeout, &elapsed, &timeout); if (timeout.tv_sec < 0 || timeout.tv_usec < 0) { // Already reached timeout return false; } pto = &timeout; } __log_func("going to wait on select CQ+OS nfds=%d cqfd=%d pto=%p!!!", m_nfds_with_cq, m_cqepfd, pto); // ACTUAL CALL TO SELECT if (m_sigmask) { if (pto) { to_pselect.tv_sec = pto->tv_sec; to_pselect.tv_nsec = pto->tv_usec * 1000; pto_pselect = &to_pselect; } m_n_all_ready_fds = orig_os_api.pselect(m_nfds, m_readfds, m_writefds, m_exceptfds, pto_pselect, m_sigmask); } else { m_n_all_ready_fds = orig_os_api.select(m_nfds_with_cq, m_readfds, m_writefds, m_exceptfds, pto); } __log_func("done select CQ+OS nfds=%d cqfd=%d pto=%p ready=%d!!!", m_nfds_with_cq, m_cqepfd, pto, m_n_all_ready_fds); if (m_n_all_ready_fds < 0) { vma_throw_object(io_mux_call::io_error); } // Clear CQ from the set and don't count it if (m_readfds) { if (FD_ISSET(m_cqepfd, m_readfds)) { FD_CLR(m_cqepfd, m_readfds); // Not needed if m_readfds is NULL --m_n_all_ready_fds; return true; } } return false; }
bool epoll_wait_call::_wait(int timeout)
{
	// Call the real OS epoll_(p)wait and translate the kernel events into
	// the user-facing m_events array. Returns true if the CQ fd was among
	// the ready fds (caller should then poll the CQ).
	int i, ready_fds, fd;
	bool cq_ready = false;
	__log_func("calling os epoll: %d", m_epfd);

	if (timeout) {
		// We might block: flag the epfd as sleeping, unless user-level
		// ready fds already exist - then downgrade to a non-blocking poll.
		lock();
		if (m_epfd_info->m_ready_fds.empty()) {
			m_epfd_info->going_to_sleep();
		} else {
			timeout = 0;
		}
		unlock();
	}

	if (m_sigmask) {
		ready_fds = orig_os_api.epoll_pwait(m_epfd, m_p_ready_events, m_maxevents, timeout, m_sigmask);
	} else {
		ready_fds = orig_os_api.epoll_wait(m_epfd, m_p_ready_events, m_maxevents, timeout);
	}

	if (timeout) {
		// Undo the going_to_sleep() bookkeeping done above
		lock();
		m_epfd_info->return_from_sleep();
		unlock();
	}

	if (ready_fds < 0) {
		// Propagate OS failure (errno set by epoll_(p)wait)
		vma_throw_object(io_mux_call::io_error);
	}

	// convert the returned events to user events and mark offloaded fds
	m_n_all_ready_fds = 0;
	for (i = 0; i < ready_fds; ++i) {
		fd = m_p_ready_events[i].data.fd;

		// wakeup event: internal fd used to interrupt the wait - consume
		// it without reporting to the user
		if(m_epfd_info->is_wakeup_fd(fd)) {
			lock();
			m_epfd_info->remove_wakeup_fd();
			unlock();
			continue;
		}

		// If it's CQ - remember it but don't surface it to the user
		if (m_epfd_info->is_cq_fd(m_p_ready_events[i].data.u64)) {
			cq_ready = true;
			continue;
		}

		if ((m_p_ready_events[i].events & EPOLLIN)) {
			socket_fd_api* temp_sock_fd_api = fd_collection_get_sockfd(fd);
			if (temp_sock_fd_api) {
				// Instructing the socket to sample the OS immediately to prevent hitting EAGAIN on recvfrom(),
				// after iomux returned a shadow fd as ready (only for non-blocking sockets)
				temp_sock_fd_api->set_immediate_os_sample();
			}
		}

		// Copy event bits and data; skip fds that have no registered
		// user data (e.g. removed from the epfd meanwhile)
		m_events[m_n_all_ready_fds].events = m_p_ready_events[i].events;
		if (!m_epfd_info->get_data_by_fd(fd, &m_events[m_n_all_ready_fds].data)) {
			continue;
		}
		++m_n_all_ready_fds;
	}

	return cq_ready;
}