/* Marks a packet lost, if some packet sent later has been (s)acked.
 * The underlying idea is similar to the traditional dupthresh and FACK
 * but they look at different metrics:
 *
 * dupthresh: 3 OOO packets delivered (packet count)
 * FACK: sequence delta to highest sacked sequence (sequence space)
 * RACK: sent time delta to the latest delivered packet (time domain)
 *
 * The advantage of RACK is that it applies to both original and retransmitted
 * packets and therefore is robust against tail losses. Another advantage
 * is being more resilient to reordering by simply allowing some
 * "settling delay", instead of tweaking the dupthresh.
 *
 * The current version is only used after recovery starts but can be
 * easily extended to detect the first loss.
 */
int tcp_rack_mark_lost(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	u32 reo_wnd, prior_retrans = tp->retrans_out;

	if (inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery || !tp->rack.advanced)
		return 0;

	/* Reset the advanced flag to avoid unnecessary queue scanning */
	tp->rack.advanced = 0;

	/* To be more reordering resilient, allow min_rtt/4 settling delay
	 * (lower-bounded to 1000uS). We use min_rtt instead of the smoothed
	 * RTT because reordering is often a path property and less related
	 * to queuing or delayed ACKs.
	 *
	 * TODO: measure and adapt to the observed reordering delay, and
	 * use a timer to retransmit like the delayed early retransmit.
	 */
	reo_wnd = 1000;
	if (tp->rack.reord && tcp_min_rtt(tp) != ~0U)
		reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd);

	tcp_for_write_queue(skb, sk) {
		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);

		if (skb == tcp_send_head(sk))
			break;

		/* Skip ones already (s)acked */
		if (!after(scb->end_seq, tp->snd_una) ||
		    scb->sacked & TCPCB_SACKED_ACKED)
			continue;

		if (skb_mstamp_after(&tp->rack.mstamp, &skb->skb_mstamp)) {
			if (skb_mstamp_us_delta(&tp->rack.mstamp,
						&skb->skb_mstamp) <= reo_wnd)
				continue;

			/* skb is lost if packet sent later is sacked */
			tcp_skb_mark_lost_uncond_verify(tp, skb);
			if (scb->sacked & TCPCB_SACKED_RETRANS) {
				scb->sacked &= ~TCPCB_SACKED_RETRANS;
				tp->retrans_out -= tcp_skb_pcount(skb);
				NET_INC_STATS_BH(sock_net(sk),
						 LINUX_MIB_TCPLOSTRETRANSMIT);
			}
		} else if (!(scb->sacked & TCPCB_RETRANS)) {
			/* Original data are sent sequentially so stop early
			 * b/c the rest are all sent after rack_sent
			 */
			break;
		}
	}
	return prior_retrans - tp->retrans_out;
}
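/*
 * The per-skb decision above reduces to a time-domain predicate: mark a
 * packet lost only if some packet sent sufficiently later has already been
 * delivered. A minimal standalone sketch of that predicate follows; the
 * function name and the plain microsecond timestamps are illustrative
 * assumptions, not the kernel's types (which use struct skb_mstamp).
 */
#include <stdbool.h>
#include <stdint.h>

/*
 * rack_sent_us - send time of the most recently delivered ((s)acked) packet,
 * skb_sent_us  - send time of the still-unacked packet under consideration,
 * reo_wnd_us   - reordering settling delay, max(min_rtt / 4, 1000) us above.
 */
static bool
rack_skb_is_lost(uint64_t rack_sent_us, uint64_t skb_sent_us,
		 uint64_t reo_wnd_us)
{
	/* Lost only if a packet sent more than reo_wnd later was delivered. */
	return rack_sent_us > skb_sent_us &&
	       rack_sent_us - skb_sent_us > reo_wnd_us;
}
/*
 * With reo_wnd_us == 0 this degenerates to "any later-sent delivery implies
 * loss"; the settling delay is what makes RACK tolerant of mild reordering
 * without tweaking a dupthresh.
 */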
/**
 * @skb_list can be invalid after the function call, don't try to use it.
 */
static void
ss_do_send(struct sock *sk, SsSkbList *skb_list)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	int size, mss = tcp_send_mss(sk, &size, MSG_DONTWAIT);

	SS_DBG("%s: cpu=%d sk=%p queue_empty=%d send_head=%p"
	       " sk_state=%d mss=%d size=%d\n",
	       __func__, smp_processor_id(), sk, tcp_write_queue_empty(sk),
	       tcp_send_head(sk), sk->sk_state, mss, size);

	if (unlikely(!ss_sock_active(sk)))
		return;

	ss_sock_cpu_check(sk);

	while ((skb = ss_skb_dequeue(skb_list))) {
		skb->ip_summed = CHECKSUM_PARTIAL;
		skb_shinfo(skb)->gso_segs = 0;

		/*
		 * TODO Mark all data with PUSH to force receiver to consume
		 * the data. Currently we do this for debugging purposes.
		 * We need to do this only for complete messages/skbs.
		 * Actually tcp_push() already does it for the last skb.
		 * MSG_MORE should be used, probably by connection layer.
		 */
		tcp_mark_push(tp, skb);

		SS_DBG("%s: entail skb=%p data_len=%u len=%u\n",
		       __func__, skb, skb->data_len, skb->len);

		ss_skb_entail(sk, skb);

		tp->write_seq += skb->len;
		TCP_SKB_CB(skb)->end_seq += skb->len;
	}

	SS_DBG("%s: sk=%p send_head=%p sk_state=%d\n",
	       __func__, sk, tcp_send_head(sk), sk->sk_state);

	tcp_push(sk, MSG_DONTWAIT, mss, TCP_NAGLE_OFF|TCP_NAGLE_PUSH, size);
}
/**
 * Cleans the meta-socket retransmission queue and the reinject-queue.
 * @meta_sk must be the metasocket.
 */
static void mptcp_clean_rtx_queue(struct sock *meta_sk, u32 prior_snd_una)
{
	struct sk_buff *skb, *tmp;
	struct tcp_sock *meta_tp = tcp_sk(meta_sk);
	struct mptcp_cb *mpcb = meta_tp->mpcb;
	int acked = 0;

	while ((skb = tcp_write_queue_head(meta_sk)) &&
	       skb != tcp_send_head(meta_sk)) {
		if (before(meta_tp->snd_una, TCP_SKB_CB(skb)->end_seq))
			break;

		tcp_unlink_write_queue(skb, meta_sk);

		if (mptcp_is_data_fin(skb)) {
			struct sock *sk_it;

			/* DATA_FIN has been acknowledged - now we can close
			 * the subflows */
			mptcp_for_each_sk(mpcb, sk_it) {
				unsigned long delay = 0;

				/* If we are the passive closer, don't trigger
				 * subflow-fin until the subflow has been finned
				 * by the peer - thus we add a delay. */
				if (mpcb->passive_close &&
				    sk_it->sk_state == TCP_ESTABLISHED)
					delay = inet_csk(sk_it)->icsk_rto << 3;

				mptcp_sub_close(sk_it, delay);
			}
		}
		meta_tp->packets_out -= tcp_skb_pcount(skb);
		sk_wmem_free_skb(meta_sk, skb);

		acked = 1;
	}
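/*
 * Both the cleanup loop above and tcp_rack_mark_lost() rely on wrap-safe
 * 32-bit sequence comparison (the kernel's before()/after() macros, which
 * test the signed difference of the two sequence numbers). Below is a
 * self-contained sketch of that comparison and of the "fully acknowledged"
 * condition the loop checks; the helper names are illustrative, not the
 * kernel's.
 */
#include <stdbool.h>
#include <stdint.h>

/* True if seq1 precedes seq2 in sequence space, modulo 2^32. */
static bool
seq_before(uint32_t seq1, uint32_t seq2)
{
	return (int32_t)(seq1 - seq2) < 0;
}

/*
 * A segment is fully acknowledged once snd_una has advanced to or past its
 * end_seq; only then may the queue head be unlinked and freed.
 */
static bool
seg_fully_acked(uint32_t snd_una, uint32_t end_seq)
{
	return !seq_before(snd_una, end_seq);
}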
/**
 * This is the main body of the socket close function in Sync Sockets.
 *
 * inet_release() can sleep (as well as tcp_close()), so we make our own
 * non-sleepable socket closing.
 *
 * This function must be used only for data sockets.
 * Use standard sock_release() for listening sockets.
 *
 * In most cases it is called in softirq context and from ksoftirqd which
 * processes data from the socket (RSS and RPS distribute packets that way).
 *
 * Note: it used to be called in process context as well, at the time when
 * Tempesta starts or stops. That's not the case right now, but it may change.
 *
 * TODO In some cases we need to close the socket aggressively, w/o the
 * FIN_WAIT_2 state, e.g. by sending RST. So we need to add a second
 * parameter to the function which says how to close the socket.
 * One of the examples is rcl_req_limit() (it should reset connections).
 * See tcp_sk(sk)->linger2 processing in standard tcp_close().
 *
 * Called with locked socket.
 */
static void
ss_do_close(struct sock *sk)
{
	struct sk_buff *skb;
	int data_was_unread = 0;
	int state;

	if (unlikely(!sk))
		return;
	SS_DBG("Close socket %p (%s): cpu=%d account=%d refcnt=%d\n",
	       sk, ss_statename[sk->sk_state], smp_processor_id(),
	       sk_has_account(sk), atomic_read(&sk->sk_refcnt));
	assert_spin_locked(&sk->sk_lock.slock);
	ss_sock_cpu_check(sk);
	BUG_ON(sk->sk_state == TCP_LISTEN);

	/* We must return immediately, so LINGER option is meaningless. */
	WARN_ON(sock_flag(sk, SOCK_LINGER));
	/* We don't support virtual containers, so TCP_REPAIR is prohibited. */
	WARN_ON(tcp_sk(sk)->repair);
	/* The socket must have atomic allocation mask. */
	WARN_ON(!(sk->sk_allocation & GFP_ATOMIC));

	/* The below is mostly copy-paste from tcp_close(). */
	sk->sk_shutdown = SHUTDOWN_MASK;

	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq
			  - tcp_hdr(skb)->fin;
		data_was_unread += len;

		SS_DBG("free rcv skb %p\n", skb);
		__kfree_skb(skb);
	}

	sk_mem_reclaim(sk);

	if (sk->sk_state == TCP_CLOSE)
		goto adjudge_to_death;

	if (data_was_unread) {
		NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
		tcp_set_state(sk, TCP_CLOSE);
		tcp_send_active_reset(sk, sk->sk_allocation);
	}
	else if (tcp_close_state(sk)) {
		/* The code below is taken from tcp_send_fin(). */
		struct tcp_sock *tp = tcp_sk(sk);
		int mss_now = tcp_current_mss(sk);

		skb = tcp_write_queue_tail(sk);

		if (tcp_send_head(sk) != NULL) {
			/* Send FIN with data if we have any. */
			TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
			TCP_SKB_CB(skb)->end_seq++;
			tp->write_seq++;
		}
		else {
			/* No data to send in the socket, allocate new skb. */
			skb = alloc_skb_fclone(MAX_TCP_HEADER,
					       sk->sk_allocation);
			if (!skb) {
				SS_WARN("can't send FIN due to bad alloc");
			} else {
				skb_reserve(skb, MAX_TCP_HEADER);
				tcp_init_nondata_skb(skb, tp->write_seq,
						     TCPHDR_ACK | TCPHDR_FIN);
				tcp_queue_skb(sk, skb);
			}
		}
		__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF);
	}

adjudge_to_death:
	state = sk->sk_state;
	sock_hold(sk);
	sock_orphan(sk);

	/*
	 * SS sockets are processed in softirq only,
	 * so backlog queue should be empty.
	 */
	WARN_ON(sk->sk_backlog.tail);

	percpu_counter_inc(sk->sk_prot->orphan_count);

	if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
		return;

	if (sk->sk_state == TCP_FIN_WAIT2) {
		const int tmo = tcp_fin_time(sk);
		if (tmo > TCP_TIMEWAIT_LEN) {
			inet_csk_reset_keepalive_timer(sk,
						       tmo - TCP_TIMEWAIT_LEN);
		} else {
			tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
			return;
		}
	}
	if (sk->sk_state != TCP_CLOSE) {
		sk_mem_reclaim(sk);
		if (tcp_check_oom(sk, 0)) {
			tcp_set_state(sk, TCP_CLOSE);
			tcp_send_active_reset(sk, GFP_ATOMIC);
			NET_INC_STATS_BH(sock_net(sk),
					 LINUX_MIB_TCPABORTONMEMORY);
		}
	}
	if (sk->sk_state == TCP_CLOSE) {
		struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
		if (req != NULL)
			reqsk_fastopen_remove(sk, req, false);
		inet_csk_destroy_sock(sk);
	}
}
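/*
 * Hedged sketch of a caller (the wrapper name and exact locking discipline
 * are assumptions, not Tempesta's actual ss_close()): the doc comment above
 * says ss_do_close() runs with the socket locked in softirq context, so the
 * bottom-half socket lock is taken around it, and the final sock_put() pairs
 * with the sock_hold() done inside ss_do_close() before sock_orphan().
 */
static void
ss_close_sketch(struct sock *sk)
{
	bh_lock_sock(sk);
	ss_do_close(sk);
	bh_unlock_sock(sk);
	sock_put(sk);
}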
struct sk_buff *rtl_tcp_send_head(const struct sock *sk)
{
	return tcp_send_head(sk);
}