static void mptcp_ccc_recalc_alpha(struct sock *sk)
{
	struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
	struct sock *sub_sk;
	int best_cwnd = 0, best_rtt = 0, can_send = 0;
	u64 max_numerator = 0, sum_denominator = 0, alpha = 1;

	if (!mpcb)
		return;

	/* Only one subflow left - fall back to normal reno-behavior
	 * (set alpha to 1)
	 */
	if (mpcb->cnt_established <= 1)
		goto exit;

	/* Do regular alpha-calculation for multiple subflows */

	/* Find the max numerator of the alpha-calculation */
	mptcp_for_each_sk(mpcb, sub_sk) {
		struct tcp_sock *sub_tp = tcp_sk(sub_sk);
		u64 tmp;

		if (!mptcp_ccc_sk_can_send(sub_sk))
			continue;

		can_send++;

		/* We need to look for the path that provides the max-value.
		 * Integer-overflow is not possible here, because
		 * tmp will be in u64.
		 */
		tmp = div64_u64(mptcp_ccc_scale(sub_tp->snd_cwnd,
						alpha_scale_num),
				(u64)sub_tp->srtt * sub_tp->srtt);

		if (tmp >= max_numerator) {
			max_numerator = tmp;
			best_cwnd = sub_tp->snd_cwnd;
			best_rtt = sub_tp->srtt;
		}
	}

	/* No subflow is able to send - we don't care anymore */
	if (unlikely(!can_send))
		goto exit;

	/* Calculate the denominator */
	mptcp_for_each_sk(mpcb, sub_sk) {
		struct tcp_sock *sub_tp = tcp_sk(sub_sk);

		if (!mptcp_ccc_sk_can_send(sub_sk))
			continue;

		sum_denominator += div_u64(
			mptcp_ccc_scale(sub_tp->snd_cwnd,
					alpha_scale_den) * best_rtt,
			sub_tp->srtt);
	}
	sum_denominator *= sum_denominator;

	if (unlikely(!sum_denominator)) {
		pr_err("%s: sum_denominator == 0, cnt_established:%d\n",
		       __func__, mpcb->cnt_established);
		mptcp_for_each_sk(mpcb, sub_sk) {
			struct tcp_sock *sub_tp = tcp_sk(sub_sk);

			pr_err("%s: pi:%d, state:%d, rtt:%u, cwnd:%u\n",
			       __func__, sub_tp->mptcp->path_index,
			       sub_sk->sk_state, sub_tp->srtt,
			       sub_tp->snd_cwnd);
		}
		/* Avoid the divide-by-zero below; keep alpha = 1 */
		goto exit;
	}

	alpha = div64_u64(mptcp_ccc_scale(best_cwnd, alpha_scale_num),
			  sum_denominator);
	if (unlikely(!alpha))
		alpha = 1;

exit:
	mptcp_set_alpha(mptcp_meta_sk(sk), alpha);
}
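For reference, the quantity computed above is the alpha of the LIA coupled congestion control. Modulo the fixed-point scaling (alpha_scale_num/alpha_scale_den), writing $w_i$, $r_i$ for the cwnd and smoothed RTT of subflow $i$ and $\hat{w}$, $\hat{r}$ for the best path found in the first loop, the code evaluates

\[
\alpha \;=\; \frac{\hat{w}/\hat{r}^{2}}{\bigl(\sum_{i} w_{i}/r_{i}\bigr)^{2}}
\;=\; \frac{\max_{i}\, w_{i}/r_{i}^{2}}{\bigl(\sum_{i} w_{i}/r_{i}\bigr)^{2}},
\]

which matches the increase factor of RFC 6356 (there additionally multiplied by the total cwnd at increase time).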
static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { const struct ipv6hdr *hdr = (const struct ipv6hdr*)skb->data; const struct tcphdr *th = (struct tcphdr *)(skb->data+offset); struct ipv6_pinfo *np; struct sock *sk; int err; struct tcp_sock *tp; __u32 seq; struct net *net = dev_net(skb->dev); sk = inet6_lookup(net, &tcp_hashinfo, &hdr->daddr, th->dest, &hdr->saddr, th->source, skb->dev->ifindex); if (sk == NULL) { ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); return; } if (sk->sk_state == TCP_TIME_WAIT) { inet_twsk_put(inet_twsk(sk)); return; } bh_lock_sock(sk); if (sock_owned_by_user(sk)) NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); if (sk->sk_state == TCP_CLOSE) goto out; if (ipv6_hdr(skb)->hop_limit < inet6_sk(sk)->min_hopcount) { NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); goto out; } tp = tcp_sk(sk); seq = ntohl(th->seq); if (sk->sk_state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) { NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); goto out; } np = inet6_sk(sk); if (type == ICMPV6_PKT_TOOBIG) { struct dst_entry *dst; if (sock_owned_by_user(sk)) goto out; if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) goto out; dst = __sk_dst_check(sk, np->dst_cookie); if (dst == NULL) { struct inet_sock *inet = inet_sk(sk); struct flowi6 fl6; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = IPPROTO_TCP; fl6.daddr = np->daddr; fl6.saddr = np->saddr; fl6.flowi6_oif = sk->sk_bound_dev_if; fl6.flowi6_mark = sk->sk_mark; fl6.fl6_dport = inet->inet_dport; fl6.fl6_sport = inet->inet_sport; security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); dst = ip6_dst_lookup_flow(sk, &fl6, NULL, false); if (IS_ERR(dst)) { sk->sk_err_soft = -PTR_ERR(dst); goto out; } } else dst_hold(dst); if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) { tcp_sync_mss(sk, dst_mtu(dst)); tcp_simple_retransmit(sk); } dst_release(dst); goto out; } icmpv6_err_convert(type, code, &err); switch (sk->sk_state) { struct request_sock *req, **prev; case TCP_LISTEN: if (sock_owned_by_user(sk)) goto out; req = inet6_csk_search_req(sk, &prev, th->dest, &hdr->daddr, &hdr->saddr, inet6_iif(skb)); if (!req) goto out; WARN_ON(req->sk != NULL); if (seq != tcp_rsk(req)->snt_isn) { NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); goto out; } inet_csk_reqsk_queue_drop(sk, req, prev); goto out; case TCP_SYN_SENT: case TCP_SYN_RECV: if (!sock_owned_by_user(sk)) { sk->sk_err = err; sk->sk_error_report(sk); tcp_done(sk); } else sk->sk_err_soft = err; goto out; } if (!sock_owned_by_user(sk) && np->recverr) { sk->sk_err = err; sk->sk_error_report(sk); } else sk->sk_err_soft = err; out: bh_unlock_sock(sk); sock_put(sk); }
/*
 * TCP Westwood
 * Here the limit is evaluated as Bw estimation * RTTmin (for obtaining it
 * in packets we use mss_cache). The result is clamped to >= 2, so this
 * avoids ever returning 0.
 */
static u32 tcp_westwood_bw_rttmin(const struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct westwood *w = inet_csk_ca(sk);

	return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2);
}
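In units, this is simply the estimated bandwidth-delay product expressed in whole segments, clamped below at 2:

\[
\mathrm{ssthresh} \;=\; \max\!\left(\left\lfloor \frac{BW_{\mathrm{est}} \cdot RTT_{\min}}{MSS} \right\rfloor,\; 2\right)
\]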
static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
				 u32 seq_rtt, u32 in_flight, int flag)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct vegas *vegas = inet_csk_ca(sk);

	if (!vegas->doing_vegas_now)
		return tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag);

	/* The key players are v_beg_snd_una and v_beg_snd_nxt.
	 *
	 * These are so named because they represent the approximate values
	 * of snd_una and snd_nxt at the beginning of the current RTT. More
	 * precisely, they represent the amount of data sent during the RTT.
	 * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt,
	 * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding
	 * bytes of data have been ACKed during the course of the RTT, giving
	 * an "actual" rate of:
	 *
	 *     (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration)
	 *
	 * Unfortunately, v_beg_snd_una is not exactly equal to snd_una,
	 * because delayed ACKs can cover more than one segment, so they
	 * don't line up nicely with the boundaries of RTTs.
	 *
	 * Another unfortunate fact of life is that delayed ACKs delay the
	 * advance of the left edge of our send window, so that the number
	 * of bytes we send in an RTT is often less than our cwnd will allow.
	 * So we keep track of our cwnd separately, in v_beg_snd_cwnd.
	 */
	if (after(ack, vegas->beg_snd_nxt)) {
		/* Do the Vegas once-per-RTT cwnd adjustment. */
		u32 old_wnd, old_snd_cwnd;

		/* Here old_wnd is essentially the window of data that was
		 * sent during the previous RTT, and has all
		 * been acknowledged in the course of the RTT that ended
		 * with the ACK we just received. Likewise, old_snd_cwnd
		 * is the cwnd during the previous RTT.
		 */
		old_wnd = (vegas->beg_snd_nxt - vegas->beg_snd_una) /
			tp->mss_cache;
		old_snd_cwnd = vegas->beg_snd_cwnd;

		/* Save the extent of the current window so we can use this
		 * at the end of the next RTT.
		 */
		vegas->beg_snd_una  = vegas->beg_snd_nxt;
		vegas->beg_snd_nxt  = tp->snd_nxt;
		vegas->beg_snd_cwnd = tp->snd_cwnd;

		/* We do the Vegas calculations only if we got enough RTT
		 * samples that we can be reasonably sure that we got
		 * at least one RTT sample that wasn't from a delayed ACK.
		 * If we only had 2 samples total,
		 * then that means we're getting only 1 ACK per RTT, which
		 * means they're almost certainly delayed ACKs.
		 * If we have 3 samples, we should be OK.
		 */
		if (vegas->cntRTT <= 2) {
			/* We don't have enough RTT samples to do the Vegas
			 * calculation, so we'll behave like Reno.
			 */
			tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag);
		} else {
			u32 rtt, target_cwnd, diff;

			/* We have enough RTT samples, so, using the Vegas
			 * algorithm, we determine if we should increase or
			 * decrease cwnd, and by how much.
			 */

			/* Pluck out the RTT we are using for the Vegas
			 * calculations. This is the min RTT seen during the
			 * last RTT. Taking the min filters out the effects
			 * of delayed ACKs, at the cost of noticing congestion
			 * a bit later.
			 */
			rtt = vegas->minRTT;

			/* Calculate the cwnd we should have, if we weren't
			 * going too fast.
			 *
			 * This is:
			 *     (actual rate in segments) * baseRTT
			 * We keep it as a fixed point number with
			 * V_PARAM_SHIFT bits to the right of the binary point.
			 */
			target_cwnd = ((old_wnd * vegas->baseRTT)
				       << V_PARAM_SHIFT) / rtt;

			/* Calculate the difference between the window we had,
			 * and the window we would like to have. This quantity
			 * is the "Diff" from the Arizona Vegas papers.
			 *
			 * Again, this is a fixed point number with
			 * V_PARAM_SHIFT bits to the right of the binary
			 * point.
			 */
			diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;

			if (tp->snd_cwnd <= tp->snd_ssthresh) {
				/* Slow start. */
				if (diff > (u32)gamma) {
					/* Going too fast. Time to slow down
					 * and switch to congestion avoidance.
					 */
					tp->snd_ssthresh = 2;

					/* Set cwnd to match the actual rate
					 * exactly:
					 *     cwnd = (actual rate) * baseRTT
					 * Then we add 1 because the integer
					 * truncation robs us of full link
					 * utilization.
					 */
					tp->snd_cwnd = min(tp->snd_cwnd,
							   (target_cwnd >>
							    V_PARAM_SHIFT) + 1);
				}
				tcp_slow_start(tp);
			} else {
				/* Congestion avoidance. */
				u32 next_snd_cwnd;

				/* Figure out where we would like cwnd
				 * to be.
				 */
				if (diff > (u32)beta) {
					/* The old window was too fast, so
					 * we slow down.
					 */
					next_snd_cwnd = old_snd_cwnd - 1;
				} else if (diff < (u32)alpha) {
					/* We don't have enough extra packets
					 * in the network, so speed up.
					 */
					next_snd_cwnd = old_snd_cwnd + 1;
				} else {
					/* Sending just as fast as we
					 * should be.
					 */
					next_snd_cwnd = old_snd_cwnd;
				}

				/* Adjust cwnd upward or downward, toward the
				 * desired value.
				 */
				if (next_snd_cwnd > tp->snd_cwnd)
					tp->snd_cwnd++;
				else if (next_snd_cwnd < tp->snd_cwnd)
					tp->snd_cwnd--;
			}

			if (tp->snd_cwnd < 2)
				tp->snd_cwnd = 2;
			else if (tp->snd_cwnd > tp->snd_cwnd_clamp)
				tp->snd_cwnd = tp->snd_cwnd_clamp;
		}

		/* Wipe the slate clean for the next RTT. */
		vegas->cntRTT = 0;
		vegas->minRTT = 0x7fffffff;
	}
	/* Use normal slow start */
	else if (tp->snd_cwnd <= tp->snd_ssthresh)
		tcp_slow_start(tp);
}
struct sock *mptcp_select_ack_sock(const struct mptcp_cb *mpcb, int copied) { struct sock *sk, *subsk = NULL; struct tcp_sock *meta_tp = mpcb_meta_tp(mpcb); u32 max_data_seq = 0; /* max_data_seq initialized to correct compiler-warning. * But the initialization is handled by max_data_seq_set */ short max_data_seq_set = 0; u32 min_time = 0xffffffff; /* How do we select the subflow to send the window-update on? * * 1. He has to be in a state where he can send an ack. * 2. He has to be one of those subflow who recently * contributed to the received stream * (this guarantees a working subflow) * a) its latest data_seq received is after the original * copied_seq. * We select the one with the lowest rtt, so that the * window-update reaches our peer the fastest. * b) if no subflow has this kind of data_seq (e.g., very * strange meta-level retransmissions going on), we take * the subflow who last sent the highest data_seq. */ mptcp_for_each_sk(mpcb, sk) { struct tcp_sock *tp = tcp_sk(sk); if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE | TCPF_LISTEN)) continue; /* Select among those who contributed to the * current receive-queue. */ if (copied && after(tp->mptcp->last_data_seq, meta_tp->copied_seq - copied)) { if (tp->srtt < min_time) { min_time = tp->srtt; subsk = sk; max_data_seq_set = 0; } continue; } if (!subsk && !max_data_seq_set) { max_data_seq = tp->mptcp->last_data_seq; max_data_seq_set = 1; subsk = sk; } /* Otherwise, take the one with the highest data_seq */ if ((!subsk || max_data_seq_set) && after(tp->mptcp->last_data_seq, max_data_seq)) { max_data_seq = tp->mptcp->last_data_seq; subsk = sk; } } if (!subsk) { mptcp_debug("%s subsk is null, copied %d, cseq %u\n", __func__, copied, meta_tp->copied_seq); mptcp_for_each_sk(mpcb, sk) { struct tcp_sock *tp = tcp_sk(sk); mptcp_debug("%s pi %d state %u last_dseq %u\n", __func__, tp->mptcp->path_index, sk->sk_state, tp->mptcp->last_data_seq); } }
	return subsk;
}
void tcp_v4_err(struct sk_buff *skb, u32 info) { struct iphdr *iph = (struct iphdr *)skb->data; struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2)); struct tcp_sock *tp; struct inet_sock *inet; int type = skb->h.icmph->type; int code = skb->h.icmph->code; struct sock *sk; __u32 seq; int err; if (skb->len < (iph->ihl << 2) + 8) { ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); return; } sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr, th->source, inet_iif(skb)); if (!sk) { ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); return; } if (sk->sk_state == TCP_TIME_WAIT) { inet_twsk_put((struct inet_timewait_sock *)sk); return; } bh_lock_sock(sk); /* If too many ICMPs get dropped on busy * servers this needs to be solved differently. */ if (sock_owned_by_user(sk)) NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS); if (sk->sk_state == TCP_CLOSE) goto out; tp = tcp_sk(sk); seq = ntohl(th->seq); if (sk->sk_state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) { NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS); goto out; } switch (type) { case ICMP_SOURCE_QUENCH: /* Just silently ignore these. */ goto out; case ICMP_PARAMETERPROB: err = EPROTO; break; case ICMP_DEST_UNREACH: if (code > NR_ICMP_UNREACH) goto out; if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ if (!sock_owned_by_user(sk)) do_pmtu_discovery(sk, iph, info); goto out; } err = icmp_err_convert[code].errno; break; case ICMP_TIME_EXCEEDED: err = EHOSTUNREACH; break; default: goto out; } switch (sk->sk_state) { struct request_sock *req, **prev; case TCP_LISTEN: if (sock_owned_by_user(sk)) goto out; req = inet_csk_search_req(sk, &prev, th->dest, iph->daddr, iph->saddr); if (!req) goto out; /* ICMPs are not backlogged, hence we cannot get an established socket here. */ BUG_TRAP(!req->sk); if (seq != tcp_rsk(req)->snt_isn) { NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS); goto out; } /* * Still in SYN_RECV, just remove it silently. * There is no good way to pass the error to the newly * created socket, and POSIX does not want network * errors returned from accept(). */ inet_csk_reqsk_queue_drop(sk, req, prev); goto out; case TCP_SYN_SENT: case TCP_SYN_RECV: /* Cannot happen. It can f.e. if SYNs crossed. */ if (!sock_owned_by_user(sk)) { TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); sk->sk_err = err; sk->sk_error_report(sk); tcp_done(sk); } else { sk->sk_err_soft = err; } goto out; } /* If we've already connected we will keep trying * until we time out, or the user gives up. * * rfc1122 4.2.3.9 allows to consider as hard errors * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, * but it is obsoleted by pmtu discovery). * * Note, that in modern internet, where routing is unreliable * and in each dark corner broken firewalls sit, sending random * errors ordered by their masters even this two messages finally lose * their original sense (even Linux sends invalid PORT_UNREACHs) * * Now we are in compliance with RFCs. * --ANK (980905) */ inet = inet_sk(sk); if (!sock_owned_by_user(sk) && inet->recverr) { sk->sk_err = err; sk->sk_error_report(sk); } else { /* Only an error on timeout */ sk->sk_err_soft = err; } out: bh_unlock_sock(sk); sock_put(sk); }
/* cwnd_event callback: the tcp_congestion_ops hook takes the event as a
 * second argument. 'instance' is assumed to be a module-level parameter
 * holding the fixed window this experimental module pins cwnd to.
 */
static void ctcp_cwnd_event(struct sock *sk, enum tcp_ca_event event)
{
	tcp_sk(sk)->snd_cwnd = instance;
}
static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
			  int addr_len)
{
	struct sockaddr_in6 *usin = (struct sockaddr_in6 *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct in6_addr *saddr = NULL, *final_p = NULL, final;
	struct flowi fl;
	struct dst_entry *dst;
	int addr_type;
	int err;

	if (addr_len < SIN6_LEN_RFC2133)
		return -EINVAL;

	if (usin->sin6_family != AF_INET6)
		return -EAFNOSUPPORT;

	memset(&fl, 0, sizeof(fl));

	if (np->sndflow) {
		fl.fl6_flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK;
		IP6_ECN_flow_init(fl.fl6_flowlabel);
		if (fl.fl6_flowlabel & IPV6_FLOWLABEL_MASK) {
			struct ip6_flowlabel *flowlabel;
			flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel);
			if (flowlabel == NULL)
				return -EINVAL;
			ipv6_addr_copy(&usin->sin6_addr, &flowlabel->dst);
			fl6_sock_release(flowlabel);
		}
	}

	/*
	 * connect() to INADDR_ANY means loopback (BSD'ism).
	 */
	if (ipv6_addr_any(&usin->sin6_addr))
		usin->sin6_addr.s6_addr[15] = 0x1;

	addr_type = ipv6_addr_type(&usin->sin6_addr);

	if (addr_type & IPV6_ADDR_MULTICAST)
		return -ENETUNREACH;

	if (addr_type & IPV6_ADDR_LINKLOCAL) {
		if (addr_len >= sizeof(struct sockaddr_in6) &&
		    usin->sin6_scope_id) {
			/* If interface is set while binding, indices
			 * must coincide.
			 */
			if (sk->sk_bound_dev_if &&
			    sk->sk_bound_dev_if != usin->sin6_scope_id)
				return -EINVAL;

			sk->sk_bound_dev_if = usin->sin6_scope_id;
		}

		/* Connect to link-local address requires an interface */
		if (!sk->sk_bound_dev_if)
			return -EINVAL;
	}

	if (tp->rx_opt.ts_recent_stamp &&
	    !ipv6_addr_equal(&np->daddr, &usin->sin6_addr)) {
		tp->rx_opt.ts_recent = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		tp->write_seq = 0;
	}

	ipv6_addr_copy(&np->daddr, &usin->sin6_addr);
	np->flow_label = fl.fl6_flowlabel;

	/*
	 * TCP over IPv4
	 */
	if (addr_type == IPV6_ADDR_MAPPED) {
		u32 exthdrlen = icsk->icsk_ext_hdr_len;
		struct sockaddr_in sin;

		SOCK_DEBUG(sk, "connect: ipv4 mapped\n");

		if (__ipv6_only_sock(sk))
			return -ENETUNREACH;

		sin.sin_family = AF_INET;
		sin.sin_port = usin->sin6_port;
		sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3];

		icsk->icsk_af_ops = &ipv6_mapped;
		sk->sk_backlog_rcv = tcp_v4_do_rcv;

		err = tcp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin));

		if (err) {
			icsk->icsk_ext_hdr_len = exthdrlen;
			icsk->icsk_af_ops = &ipv6_specific;
			sk->sk_backlog_rcv = tcp_v6_do_rcv;
			goto failure;
		} else {
			ipv6_addr_set(&np->saddr, 0, 0, htonl(0x0000FFFF),
				      inet->saddr);
			ipv6_addr_set(&np->rcv_saddr, 0, 0, htonl(0x0000FFFF),
				      inet->rcv_saddr);
		}

		return err;
	}

	if (!ipv6_addr_any(&np->rcv_saddr))
		saddr = &np->rcv_saddr;

	fl.proto = IPPROTO_TCP;
	ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
	ipv6_addr_copy(&fl.fl6_src, (saddr ? saddr : &np->saddr));
	fl.oif = sk->sk_bound_dev_if;
	fl.fl_ip_dport = usin->sin6_port;
	fl.fl_ip_sport = inet->sport;

	if (np->opt && np->opt->srcrt) {
		struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt;
		ipv6_addr_copy(&final, &fl.fl6_dst);
		ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
		final_p = &final;
	}

	err = ip6_dst_lookup(sk, &dst, &fl);
	if (err)
		goto failure;
	if (final_p) {
		ipv6_addr_copy(&fl.fl6_dst, final_p);
		fl.flags |= FLOWI_FLAG_NOTROUTE;
	}

	if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0)
		goto failure;

	if (saddr == NULL) {
		saddr = &fl.fl6_src;
		ipv6_addr_copy(&np->rcv_saddr, saddr);
	}

	/* set the source address */
	ipv6_addr_copy(&np->saddr, saddr);
	inet->rcv_saddr = LOOPBACK4_IPV6;

	ip6_dst_store(sk, dst, NULL);
	sk->sk_route_caps = dst->dev->features &
			    ~(NETIF_F_IP_CSUM | NETIF_F_TSO);

	icsk->icsk_ext_hdr_len = 0;
	if (np->opt)
		icsk->icsk_ext_hdr_len = (np->opt->opt_flen +
					  np->opt->opt_nflen);

	tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) -
			       sizeof(struct ipv6hdr);

	inet->dport = usin->sin6_port;

	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet6_hash_connect(&tcp_death_row, sk);
	if (err)
		goto late_failure;

	if (!tp->write_seq)
		tp->write_seq = secure_tcpv6_sequence_number(np->saddr.s6_addr32,
							     np->daddr.s6_addr32,
							     inet->sport,
							     inet->dport);

	err = tcp_connect(sk);
	if (err)
		goto late_failure;

	return 0;

late_failure:
	tcp_set_state(sk, TCP_CLOSE);
	__sk_dst_reset(sk);
failure:
	inet->dport = 0;
	sk->sk_route_caps = 0;
	return err;
}
/* * BST_TMR_DELAY_SOCK_SYNC timeout. * If sock is ready, get sock sync properties and post them to daemon */ static void delay_sock_bastet_timeout(struct sock *sk) { int err; struct bst_set_sock_sync_prop sock_p; struct bastet_sock *bsk = sk->bastet; /* Accurating time */ if (time_after(bsk->bastet_timeout, jiffies)) { sk_reset_timer(sk, &bsk->bastet_timer, bsk->bastet_timeout); return; } /* We must reset timer event, bastet_delay_sock_sync_notify depends on it * this code must be put after accurating time */ bsk->bastet_timer_event = BST_TMR_EVT_INVALID; /* In repair mode or userspace needs repair, do not sync sock */ if (unlikely(tcp_sk(sk)->repair || bsk->need_repair)) { BASTET_LOGE("sk: %p in repair mode", sk); return; } if (TCP_ESTABLISHED != sk->sk_state) { BASTET_LOGE("sk: %p sk_state is not TCP_ESTABLISHED", sk); return; } if (BST_SOCK_VALID != bsk->bastet_sock_state) { BASTET_LOGE("sk: %p state: %d not expected", sk, bsk->bastet_sock_state); return; } /* Sock owner has used since last setup */ if (time_after(bsk->last_sock_active_time_point + bsk->delay_sync_time_section, jiffies)) { setup_sock_sync_delay_timer(sk); return; } /* Sock owner has some data unacked, * Coming ack would trigger delay timer again */ if (!tcp_write_queue_empty(sk)) { BASTET_LOGI("sk: %p has sent data not acked", sk); post_indicate_packet(BST_IND_TRIGGER_THAW, &bsk->pid, sizeof(pid_t)); return; } /* Sock owner has some data to recv, do not sync. * If sock owner has none recv action, * delay timer should be stopped. */ if (!skb_queue_empty(&sk->sk_receive_queue)) { BASTET_LOGI("sk: %p has received data in queue", sk); bsk->last_sock_active_time_point = jiffies; setup_sock_sync_delay_timer(sk); post_indicate_packet(BST_IND_TRIGGER_THAW, &bsk->pid, sizeof(pid_t)); return; } memset(&sock_p, 0, sizeof(struct bst_set_sock_sync_prop)); bastet_get_comm_prop(sk, &sock_p.guide); bastet_get_sock_prop(sk, &sock_p.sync_prop); err = post_indicate_packet(BST_IND_SOCK_SYNC_PROP, &sock_p, sizeof(sock_p)); if (!err) { /* if post success */ bsk->bastet_sock_state = BST_SOCK_INVALID; } }
static bool tcp_fastopen_create_child(struct sock *sk, struct sk_buff *skb, struct dst_entry *dst, struct request_sock *req) { struct tcp_sock *tp = tcp_sk(sk); struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; struct sock *child; req->num_retrans = 0; req->num_timeout = 0; req->sk = NULL; child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); if (child == NULL) return false; spin_lock(&queue->fastopenq->lock); queue->fastopenq->qlen++; spin_unlock(&queue->fastopenq->lock); /* Initialize the child socket. Have to fix some values to take * into account the child is a Fast Open socket and is created * only out of the bits carried in the SYN packet. */ tp = tcp_sk(child); tp->fastopen_rsk = req; /* Do a hold on the listner sk so that if the listener is being * closed, the child that has been accepted can live on and still * access listen_lock. */ sock_hold(sk); tcp_rsk(req)->listener = sk; /* RFC1323: The window in SYN & SYN/ACK segments is never * scaled. So correct it appropriately. */ tp->snd_wnd = ntohs(tcp_hdr(skb)->window); /* Activate the retrans timer so that SYNACK can be retransmitted. * The request socket is not added to the SYN table of the parent * because it's been added to the accept queue directly. */ inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS, TCP_TIMEOUT_INIT, TCP_RTO_MAX); /* Add the child socket directly into the accept queue */ inet_csk_reqsk_queue_add(sk, req, child); /* Now finish processing the fastopen child socket. */ inet_csk(child)->icsk_af_ops->rebuild_header(child); tcp_init_congestion_control(child); tcp_mtup_init(child); tcp_init_metrics(child); tcp_init_buffer_space(child); /* Queue the data carried in the SYN packet. We need to first * bump skb's refcnt because the caller will attempt to free it. * * XXX (TFO) - we honor a zero-payload TFO request for now, * (any reason not to?) but no need to queue the skb since * there is no data. How about SYN+FIN? */ if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1) { skb = skb_get(skb); skb_dst_drop(skb); __skb_pull(skb, tcp_hdr(skb)->doff * 4); skb_set_owner_r(skb, child); __skb_queue_tail(&child->sk_receive_queue, skb); tp->syn_data_acked = 1; } tcp_rsk(req)->rcv_nxt = tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; sk->sk_data_ready(sk, 0); bh_unlock_sock(child); sock_put(child); WARN_ON(req->sk == NULL); return true; }
static inline int mptcp_olia_sk_can_send(const struct sock *sk)
{
	return mptcp_sk_can_send(sk) && tcp_sk(sk)->srtt;
}
static struct sock *tcp_fastopen_create_child(struct sock *sk, struct sk_buff *skb, struct dst_entry *dst, struct request_sock *req) { struct tcp_sock *tp; struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; struct sock *child; bool own_req; req->num_retrans = 0; req->num_timeout = 0; req->sk = NULL; child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL, NULL, &own_req); if (!child) return NULL; spin_lock(&queue->fastopenq.lock); queue->fastopenq.qlen++; spin_unlock(&queue->fastopenq.lock); /* Initialize the child socket. Have to fix some values to take * into account the child is a Fast Open socket and is created * only out of the bits carried in the SYN packet. */ tp = tcp_sk(child); tp->fastopen_rsk = req; tcp_rsk(req)->tfo_listener = true; /* RFC1323: The window in SYN & SYN/ACK segments is never * scaled. So correct it appropriately. */ tp->snd_wnd = ntohs(tcp_hdr(skb)->window); /* Activate the retrans timer so that SYNACK can be retransmitted. * The request socket is not added to the ehash * because it's been added to the accept queue directly. */ inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS, TCP_TIMEOUT_INIT, TCP_RTO_MAX); atomic_set(&req->rsk_refcnt, 2); /* Now finish processing the fastopen child socket. */ inet_csk(child)->icsk_af_ops->rebuild_header(child); tcp_init_congestion_control(child); tcp_mtup_init(child); tcp_init_metrics(child); tcp_init_buffer_space(child); tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; tcp_fastopen_add_skb(child, skb); tcp_rsk(req)->rcv_nxt = tp->rcv_nxt; tp->rcv_wup = tp->rcv_nxt; /* tcp_conn_request() is sending the SYNACK, * and queues the child into listener accept queue. */ return child; }
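The function above runs when a listener accepts a SYN carrying Fast Open data. A minimal userspace client sketch for driving that path, under stated assumptions: a TFO-enabled listener on 127.0.0.1:8080 (one that set the TCP_FASTOPEN socket option), net.ipv4.tcp_fastopen permitting client use, and a libc that defines MSG_FASTOPEN; the address, port, and payload are placeholders.

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	const char payload[] = "GET / HTTP/1.0\r\n\r\n";	/* placeholder */
	struct sockaddr_in dst;
	int fd;

	memset(&dst, 0, sizeof(dst));
	dst.sin_family = AF_INET;
	dst.sin_port = htons(8080);		/* hypothetical listener */
	dst.sin_addr.s_addr = htonl(INADDR_LOOPBACK);

	fd = socket(AF_INET, SOCK_STREAM, 0);
	if (fd < 0) {
		perror("socket");
		return 1;
	}

	/* MSG_FASTOPEN performs an implicit connect() and carries the
	 * payload in the SYN; without a cached cookie the kernel falls
	 * back to a regular three-way handshake.
	 */
	if (sendto(fd, payload, sizeof(payload) - 1, MSG_FASTOPEN,
		   (struct sockaddr *)&dst, sizeof(dst)) < 0)
		perror("sendto(MSG_FASTOPEN)");

	close(fd);
	return 0;
}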
int ipv6_setsockopt(struct sock *sk, int level, int optname,
		    char __user *optval, int optlen)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	int val, valbool;
	int retv = -ENOPROTOOPT;

	if (level == SOL_IP && sk->sk_type != SOCK_RAW)
		return udp_prot.setsockopt(sk, level, optname, optval, optlen);

	if (level != SOL_IPV6)
		goto out;

	if (optval == NULL)
		val = 0;
	else if (get_user(val, (int __user *)optval))
		return -EFAULT;

	valbool = (val != 0);

	lock_sock(sk);

	switch (optname) {
	case IPV6_ADDRFORM:
		if (val == PF_INET) {
			struct ipv6_txoptions *opt;
			struct sk_buff *pktopt;

			if (sk->sk_protocol != IPPROTO_UDP &&
			    sk->sk_protocol != IPPROTO_TCP)
				break;

			if (sk->sk_state != TCP_ESTABLISHED) {
				retv = -ENOTCONN;
				break;
			}

			if (ipv6_only_sock(sk) ||
			    !(ipv6_addr_type(&np->daddr) & IPV6_ADDR_MAPPED)) {
				retv = -EADDRNOTAVAIL;
				break;
			}

			fl6_free_socklist(sk);
			ipv6_sock_mc_close(sk);

			if (sk->sk_protocol == IPPROTO_TCP) {
				struct tcp_sock *tp = tcp_sk(sk);

				local_bh_disable();
				sock_prot_dec_use(sk->sk_prot);
				sock_prot_inc_use(&tcp_prot);
				local_bh_enable();
				sk->sk_prot = &tcp_prot;
				tp->af_specific = &ipv4_specific;
				sk->sk_socket->ops = &inet_stream_ops;
				sk->sk_family = PF_INET;
				tcp_sync_mss(sk, tp->pmtu_cookie);
			} else {
				local_bh_disable();
				sock_prot_dec_use(sk->sk_prot);
				sock_prot_inc_use(&udp_prot);
				local_bh_enable();
				sk->sk_prot = &udp_prot;
				sk->sk_socket->ops = &inet_dgram_ops;
				sk->sk_family = PF_INET;
			}
			opt = xchg(&np->opt, NULL);
			if (opt)
				sock_kfree_s(sk, opt, opt->tot_len);
			pktopt = xchg(&np->pktoptions, NULL);
			if (pktopt)
				kfree_skb(pktopt);

			sk->sk_destruct = inet_sock_destruct;
#ifdef INET_REFCNT_DEBUG
			atomic_dec(&inet6_sock_nr);
#endif
			module_put(THIS_MODULE);
			retv = 0;
			break;
		}
		goto e_inval;

	case IPV6_V6ONLY:
		if (inet_sk(sk)->num)
			goto e_inval;
		np->ipv6only = valbool;
		retv = 0;
		break;

	case IPV6_PKTINFO:
		np->rxopt.bits.rxinfo = valbool;
		retv = 0;
		break;

	case IPV6_HOPLIMIT:
		np->rxopt.bits.rxhlim = valbool;
		retv = 0;
		break;

	case IPV6_RTHDR:
		if (val < 0 || val > 2)
			goto e_inval;
		np->rxopt.bits.srcrt = val;
		retv = 0;
		break;

	case IPV6_HOPOPTS:
		np->rxopt.bits.hopopts = valbool;
		retv = 0;
		break;

	case IPV6_DSTOPTS:
		np->rxopt.bits.dstopts = valbool;
		retv = 0;
		break;

	case IPV6_FLOWINFO:
		np->rxopt.bits.rxflow = valbool;
		retv = 0;
		break;

	case IPV6_PKTOPTIONS:
	{
		struct ipv6_txoptions *opt = NULL;
		struct msghdr msg;
		struct flowi fl;
		int junk;

		fl.fl6_flowlabel = 0;
		fl.oif = sk->sk_bound_dev_if;

		if (optlen == 0)
			goto update;

		/* 1K is probably excessive
		 * 1K is surely not enough, 2K per standard header is 16K.
		 */
		retv = -EINVAL;
		if (optlen > 64 * 1024)
			break;

		opt = sock_kmalloc(sk, sizeof(*opt) + optlen, GFP_KERNEL);
		retv = -ENOBUFS;
		if (opt == NULL)
			break;

		memset(opt, 0, sizeof(*opt));
		opt->tot_len = sizeof(*opt) + optlen;
		retv = -EFAULT;
		if (copy_from_user(opt + 1, optval, optlen))
			goto done;

		msg.msg_controllen = optlen;
		msg.msg_control = (void *)(opt + 1);

		retv = datagram_send_ctl(&msg, &fl, opt, &junk);
		if (retv)
			goto done;
update:
		retv = 0;
		if (sk->sk_type == SOCK_STREAM) {
			if (opt) {
				struct tcp_sock *tp = tcp_sk(sk);
				if (!((1 << sk->sk_state) &
				      (TCPF_LISTEN | TCPF_CLOSE)) &&
				    inet_sk(sk)->daddr != LOOPBACK4_IPV6) {
					tp->ext_header_len = opt->opt_flen +
							     opt->opt_nflen;
					tcp_sync_mss(sk, tp->pmtu_cookie);
				}
			}
			opt = xchg(&np->opt, opt);
			sk_dst_reset(sk);
		} else {
			write_lock(&sk->sk_dst_lock);
			opt = xchg(&np->opt, opt);
			write_unlock(&sk->sk_dst_lock);
			sk_dst_reset(sk);
		}

done:
		if (opt)
			sock_kfree_s(sk, opt, opt->tot_len);
		break;
	}
	case IPV6_UNICAST_HOPS:
		if (val > 255 || val < -1)
			goto e_inval;
		np->hop_limit = val;
		retv = 0;
		break;

	case IPV6_MULTICAST_HOPS:
		if (sk->sk_type == SOCK_STREAM)
			goto e_inval;
		if (val > 255 || val < -1)
			goto e_inval;
		np->mcast_hops = val;
		retv = 0;
		break;

	case IPV6_MULTICAST_LOOP:
		np->mc_loop = valbool;
		retv = 0;
		break;

	case IPV6_MULTICAST_IF:
		if (sk->sk_type == SOCK_STREAM)
			goto e_inval;
		if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != val)
			goto e_inval;

		if (__dev_get_by_index(val) == NULL) {
			retv = -ENODEV;
			break;
		}
		np->mcast_oif = val;
		retv = 0;
		break;

	case IPV6_ADD_MEMBERSHIP:
	case IPV6_DROP_MEMBERSHIP:
	{
		struct ipv6_mreq mreq;

		retv = -EFAULT;
		if (copy_from_user(&mreq, optval, sizeof(struct ipv6_mreq)))
			break;

		if (optname == IPV6_ADD_MEMBERSHIP)
			retv = ipv6_sock_mc_join(sk, mreq.ipv6mr_ifindex,
						 &mreq.ipv6mr_multiaddr);
		else
			retv = ipv6_sock_mc_drop(sk, mreq.ipv6mr_ifindex,
						 &mreq.ipv6mr_multiaddr);
		break;
	}
	case IPV6_JOIN_ANYCAST:
	case IPV6_LEAVE_ANYCAST:
	{
		struct ipv6_mreq mreq;

		if (optlen != sizeof(struct ipv6_mreq))
			goto e_inval;

		retv = -EFAULT;
		if (copy_from_user(&mreq, optval, sizeof(struct ipv6_mreq)))
			break;

		if (optname == IPV6_JOIN_ANYCAST)
			retv = ipv6_sock_ac_join(sk, mreq.ipv6mr_ifindex,
						 &mreq.ipv6mr_acaddr);
		else
			retv = ipv6_sock_ac_drop(sk, mreq.ipv6mr_ifindex,
						 &mreq.ipv6mr_acaddr);
		break;
	}
	case MCAST_JOIN_GROUP:
	case MCAST_LEAVE_GROUP:
	{
		struct group_req greq;
		struct sockaddr_in6 *psin6;

		retv = -EFAULT;
		if (copy_from_user(&greq, optval, sizeof(struct group_req)))
			break;
		if (greq.gr_group.ss_family != AF_INET6) {
			retv = -EADDRNOTAVAIL;
			break;
		}
		psin6 = (struct sockaddr_in6 *)&greq.gr_group;
		if (optname == MCAST_JOIN_GROUP)
			retv = ipv6_sock_mc_join(sk, greq.gr_interface,
						 &psin6->sin6_addr);
		else
			retv = ipv6_sock_mc_drop(sk, greq.gr_interface,
						 &psin6->sin6_addr);
		break;
	}
	case MCAST_JOIN_SOURCE_GROUP:
	case MCAST_LEAVE_SOURCE_GROUP:
	case MCAST_BLOCK_SOURCE:
	case MCAST_UNBLOCK_SOURCE:
	{
		struct group_source_req greqs;
		int omode, add;

		if (optlen != sizeof(struct group_source_req))
			goto e_inval;
		if (copy_from_user(&greqs, optval, sizeof(greqs))) {
			retv = -EFAULT;
			break;
		}
		if (greqs.gsr_group.ss_family != AF_INET6 ||
		    greqs.gsr_source.ss_family != AF_INET6) {
			retv = -EADDRNOTAVAIL;
			break;
		}
		if (optname == MCAST_BLOCK_SOURCE) {
			omode = MCAST_EXCLUDE;
			add = 1;
		} else if (optname == MCAST_UNBLOCK_SOURCE) {
			omode = MCAST_EXCLUDE;
			add = 0;
		} else if (optname == MCAST_JOIN_SOURCE_GROUP) {
			struct sockaddr_in6 *psin6;

			psin6 = (struct sockaddr_in6 *)&greqs.gsr_group;
			retv = ipv6_sock_mc_join(sk, greqs.gsr_interface,
						 &psin6->sin6_addr);
			if (retv)
				break;
			omode = MCAST_INCLUDE;
			add = 1;
		} else /* MCAST_LEAVE_SOURCE_GROUP */ {
			omode = MCAST_INCLUDE;
			add = 0;
		}
		retv = ip6_mc_source(add, omode, sk, &greqs);
		break;
	}
	case MCAST_MSFILTER:
	{
		extern int sysctl_optmem_max;
		extern int sysctl_mld_max_msf;
		struct group_filter *gsf;

		if (optlen < GROUP_FILTER_SIZE(0))
			goto e_inval;
		if (optlen > sysctl_optmem_max) {
			retv = -ENOBUFS;
			break;
		}
		gsf = kmalloc(optlen, GFP_KERNEL);
		if (gsf == NULL) {
			retv = -ENOBUFS;
			break;
		}
		retv = -EFAULT;
		if (copy_from_user(gsf, optval, optlen)) {
			kfree(gsf);
			break;
		}
		/* numsrc >= (4G-140)/128 overflow in 32 bits */
		if (gsf->gf_numsrc >= 0x1ffffffU ||
		    gsf->gf_numsrc > sysctl_mld_max_msf) {
			kfree(gsf);
			retv = -ENOBUFS;
			break;
		}
		if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) {
			kfree(gsf);
			retv = -EINVAL;
			break;
		}
		retv = ip6_mc_msfilter(sk, gsf);
		kfree(gsf);
		break;
	}
	case IPV6_ROUTER_ALERT:
		retv = ip6_ra_control(sk, val, NULL);
		break;

	case IPV6_MTU_DISCOVER:
		if (val < 0 || val > 2)
			goto e_inval;
		np->pmtudisc = val;
		retv = 0;
		break;

	case IPV6_MTU:
		if (val && val < IPV6_MIN_MTU)
			goto e_inval;
		np->frag_size = val;
		retv = 0;
		break;

	case IPV6_RECVERR:
		np->recverr = valbool;
		if (!val)
			skb_queue_purge(&sk->sk_error_queue);
		retv = 0;
		break;

	case IPV6_FLOWINFO_SEND:
		np->sndflow = valbool;
		retv = 0;
		break;

	case IPV6_FLOWLABEL_MGR:
		retv = ipv6_flowlabel_opt(sk, optval, optlen);
		break;

	case IPV6_IPSEC_POLICY:
	case IPV6_XFRM_POLICY:
		retv = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			break;
		retv = xfrm_user_policy(sk, optname, optval, optlen);
		break;

#ifdef CONFIG_NETFILTER
	default:
		retv = nf_setsockopt(sk, PF_INET6, optname, optval, optlen);
		break;
#endif
	}
	release_sock(sk);

out:
	return retv;

e_inval:
	release_sock(sk);
	return -EINVAL;
}
/** * xs_tcp_connect_worker - connect a TCP socket to a remote endpoint * @args: RPC transport to connect * * Invoked by a work queue tasklet. */ static void xs_tcp_connect_worker(void *args) { struct rpc_xprt *xprt = (struct rpc_xprt *)args; struct socket *sock = xprt->sock; int err, status = -EIO; if (xprt->shutdown || xprt->addr.sin_port == 0) goto out; dprintk("RPC: xs_tcp_connect_worker for xprt %p\n", xprt); if (!xprt->sock) { /* start from scratch */ if ((err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) { dprintk("RPC: can't create TCP transport socket (%d).\n", -err); goto out; } if (xprt->resvport && xs_bindresvport(xprt, sock) < 0) { sock_release(sock); goto out; } } else /* "close" the socket, preserving the local port */ xs_tcp_reuse_connection(xprt); if (!xprt->inet) { struct sock *sk = sock->sk; write_lock_bh(&sk->sk_callback_lock); sk->sk_user_data = xprt; xprt->old_data_ready = sk->sk_data_ready; xprt->old_state_change = sk->sk_state_change; xprt->old_write_space = sk->sk_write_space; sk->sk_data_ready = xs_tcp_data_ready; sk->sk_state_change = xs_tcp_state_change; sk->sk_write_space = xs_tcp_write_space; sk->sk_allocation = GFP_ATOMIC; /* socket options */ sk->sk_userlocks |= SOCK_BINDPORT_LOCK; sock_reset_flag(sk, SOCK_LINGER); tcp_sk(sk)->linger2 = 0; tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF; xprt_clear_connected(xprt); /* Reset to new socket */ xprt->sock = sock; xprt->inet = sk; write_unlock_bh(&sk->sk_callback_lock); } /* Tell the socket layer to start connecting... */ status = sock->ops->connect(sock, (struct sockaddr *) &xprt->addr, sizeof(xprt->addr), O_NONBLOCK); dprintk("RPC: %p connect status %d connected %d sock state %d\n", xprt, -status, xprt_connected(xprt), sock->sk->sk_state); if (status < 0) { switch (status) { case -EINPROGRESS: case -EALREADY: goto out_clear; case -ECONNREFUSED: case -ECONNRESET: /* retry with existing socket, after a delay */ break; default: /* get rid of existing socket, and retry */ xs_close(xprt); break; } } out: xprt_wake_pending_tasks(xprt, status); out_clear: xprt_clear_connecting(xprt); }
static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 seq_rtt, u32 in_flight, int flag) { struct tcp_sock *tp = tcp_sk(sk); struct veno *veno = inet_csk_ca(sk); if (!veno->doing_veno_now) return tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag); /* limited by applications */ if (!tcp_is_cwnd_limited(sk, in_flight)) return; /* We do the Veno calculations only if we got enough rtt samples */ if (veno->cntrtt <= 2) { /* We don't have enough rtt samples to do the Veno * calculation, so we'll behave like Reno. */ tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag); } else { u32 rtt, target_cwnd; /* We have enough rtt samples, so, using the Veno * algorithm, we determine the state of the network. */ rtt = veno->minrtt; target_cwnd = ((tp->snd_cwnd * veno->basertt) << V_PARAM_SHIFT) / rtt; veno->diff = (tp->snd_cwnd << V_PARAM_SHIFT) - target_cwnd; if (tp->snd_cwnd <= tp->snd_ssthresh) { /* Slow start. */ tcp_slow_start(tp); } else { /* Congestion avoidance. */ if (veno->diff < (u32)beta) { /* In the "non-congestive state", increase cwnd * every rtt. */ if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { if (tp->snd_cwnd < tp->snd_cwnd_clamp) tp->snd_cwnd++; tp->snd_cwnd_cnt = 0; } else tp->snd_cwnd_cnt++; } else { /* In the "congestive state", increase cwnd * every other rtt. */ if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { if (veno->inc && tp->snd_cwnd < tp->snd_cwnd_clamp) { tp->snd_cwnd++; veno->inc = 0; } else veno->inc = 1; tp->snd_cwnd_cnt = 0; } else tp->snd_cwnd_cnt++; } } if (tp->snd_cwnd < 2) tp->snd_cwnd = 2; else if (tp->snd_cwnd > tp->snd_cwnd_clamp) tp->snd_cwnd = tp->snd_cwnd_clamp; } /* Wipe the slate clean for the next rtt. */ /* veno->cntrtt = 0; */ veno->minrtt = 0x7fffffff; }
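Veno's fixed-point "diff" estimates the backlog the flow keeps in the bottleneck queue. Dropping the V_PARAM_SHIFT fixed point, the code above computes

\[
\mathrm{diff} \;=\; \mathrm{cwnd}\cdot\frac{rtt - rtt_{\mathrm{base}}}{rtt},
\]

note the denominator is the measured rtt, unlike Vegas, which divides by baseRTT. With diff below beta the network is judged non-congestive and cwnd grows every RTT; otherwise it grows only every other RTT.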
/* Save metrics learned by this TCP session. This function is called * only, when TCP finishes successfully i.e. when it enters TIME-WAIT * or goes from LAST-ACK to CLOSE. */ void tcp_update_metrics(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); struct dst_entry *dst = __sk_dst_get(sk); struct tcp_sock *tp = tcp_sk(sk); struct tcp_metrics_block *tm; unsigned long rtt; u32 val; int m; if (sysctl_tcp_nometrics_save || !dst) return; if (dst->flags & DST_HOST) dst_confirm(dst); rcu_read_lock(); if (icsk->icsk_backoff || !tp->srtt_us) { /* This session failed to estimate rtt. Why? * Probably, no packets returned in time. Reset our * results. */ tm = tcp_get_metrics(sk, dst, false); if (tm && !tcp_metric_locked(tm, TCP_METRIC_RTT)) tcp_metric_set(tm, TCP_METRIC_RTT, 0); goto out_unlock; } else tm = tcp_get_metrics(sk, dst, true); if (!tm) goto out_unlock; rtt = tcp_metric_get(tm, TCP_METRIC_RTT); m = rtt - tp->srtt_us; /* If newly calculated rtt larger than stored one, store new * one. Otherwise, use EWMA. Remember, rtt overestimation is * always better than underestimation. */ if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) { if (m <= 0) rtt = tp->srtt_us; else rtt -= (m >> 3); tcp_metric_set(tm, TCP_METRIC_RTT, rtt); } if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) { unsigned long var; if (m < 0) m = -m; /* Scale deviation to rttvar fixed point */ m >>= 1; if (m < tp->mdev_us) m = tp->mdev_us; var = tcp_metric_get(tm, TCP_METRIC_RTTVAR); if (m >= var) var = m; else var -= (var - m) >> 2; tcp_metric_set(tm, TCP_METRIC_RTTVAR, var); }
	if (tcp_in_initial_slowstart(tp)) {
		/* Slow start still did not finish. */
		if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
			val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
			if (val && (tp->snd_cwnd >> 1) > val)
				tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
					       tp->snd_cwnd >> 1);
		}
		if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
			val = tcp_metric_get(tm, TCP_METRIC_CWND);
			if (tp->snd_cwnd > val)
				tcp_metric_set(tm, TCP_METRIC_CWND,
					       tp->snd_cwnd);
		}
	} else if (tp->snd_cwnd > tp->snd_ssthresh &&
		   icsk->icsk_ca_state == TCP_CA_Open) {
		/* Cong. avoidance phase, cwnd is reliable. */
		if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH))
			tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
				       max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
		if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
			val = tcp_metric_get(tm, TCP_METRIC_CWND);
			tcp_metric_set(tm, TCP_METRIC_CWND,
				       (val + tp->snd_cwnd) >> 1);
		}
	} else {
		/* Else slow start did not finish, cwnd is non-sense,
		 * ssthresh may be also invalid.
		 */
		if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
			val = tcp_metric_get(tm, TCP_METRIC_CWND);
			tcp_metric_set(tm, TCP_METRIC_CWND,
				       (val + tp->snd_ssthresh) >> 1);
		}
		if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
			val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
			if (val && tp->snd_ssthresh > val)
				tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
					       tp->snd_ssthresh);
		}
		if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) {
			val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
			if (val < tp->reordering &&
			    tp->reordering != sysctl_tcp_reordering)
				tcp_metric_set(tm, TCP_METRIC_REORDERING,
					       tp->reordering);
		}
	}
	tm->tcpm_stamp = jiffies;

out_unlock:
	rcu_read_unlock();
}
//start_listening
void moon_listen(void)
{
	/* For test purposes */
	moonraker_socket_t *listen;
	struct sockaddr_in sin;
	struct socket *sock = NULL;
	struct sock *sk;
	struct tcp_sock *tp;
	struct inet_connection_sock *icsk;
	moonraker_proto_t *proto;
	u16 port;
	u32 addr;
	int err = 0;

	listen = kmalloc(sizeof(moonraker_socket_t), GFP_KERNEL);
	listen->proto = kmalloc(sizeof(moonraker_proto_t), GFP_KERNEL);
	listen->proto->name = kmalloc(5, GFP_KERNEL);
	listen->ip = 2130706433;	/* 127.0.0.1 */
	listen->port = 80;
	listen->proto->defer_accept = 0;
	listen->keepalive_timeout = 0;
	listen->ack_pingpong = 1;
	listen->max_backlog = 2048;
	listen->defer_accept = 1;
	strcpy(listen->proto->name, "http");
	/* end for test purposes */

	proto = listen->proto;
	port = listen->port;
	addr = listen->ip;

	err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
	if (err) {
		printk(KERN_ERR "Moonraker: error %d creating socket.\n", err);
		goto error;
	}

	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = htonl(addr);
	sin.sin_port = htons(port);

	sk = sock->sk;
	icsk = inet_csk(sk);
	sk->sk_reuse = 1;
	sock_set_flag(sk, SOCK_URGINLINE);

	err = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin));
	if (err) {
		printk(KERN_ERR "Moonraker: error %d binding socket. This means that probably some other process is (or was a short time ago) using addr %d.\n",
		       err, sin.sin_addr.s_addr);
		goto error;
	}

	tp = tcp_sk(sk);
	printk(KERN_DEBUG "listen sk accept_queue: %d.\n",
	       !reqsk_queue_empty(&icsk->icsk_accept_queue));

	icsk->icsk_ack.pingpong = listen->ack_pingpong;

	sock_reset_flag(sk, SOCK_LINGER);
	sk->sk_lingertime = 0;
	tp->linger2 = listen->keepalive_timeout * HZ;

	if (proto->defer_accept && !listen->keepalive_timeout &&
	    listen->defer_accept)
		icsk->icsk_accept_queue.rskq_defer_accept = 1;

	err = sock->ops->listen(sock, listen->max_backlog);
	if (err) {
		printk(KERN_ERR "Moonraker: error %d listening on socket.\n",
		       err);
		goto error;
	}

	printk(KERN_NOTICE "Moonraker: thread %d listens on %s://%d.%d.%d.%d:%d.\n",
	       1, proto->name, HIPQUAD(addr), port);
	return;

error:
	if (sock)
		sock_release(sock);
}
static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, int type, int code, int offset, __u32 info) { struct ipv6hdr *hdr = (struct ipv6hdr*)skb->data; const struct tcphdr *th = (struct tcphdr *)(skb->data+offset); struct ipv6_pinfo *np; struct sock *sk; int err; struct tcp_sock *tp; __u32 seq; sk = inet6_lookup(&tcp_hashinfo, &hdr->daddr, th->dest, &hdr->saddr, th->source, skb->dev->ifindex); if (sk == NULL) { ICMP6_INC_STATS_BH(__in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); return; } if (sk->sk_state == TCP_TIME_WAIT) { inet_twsk_put((struct inet_timewait_sock *)sk); return; } bh_lock_sock(sk); if (sock_owned_by_user(sk)) NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS); if (sk->sk_state == TCP_CLOSE) goto out; tp = tcp_sk(sk); seq = ntohl(th->seq); if (sk->sk_state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) { NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS); goto out; } np = inet6_sk(sk); if (type == ICMPV6_PKT_TOOBIG) { struct dst_entry *dst = NULL; if (sock_owned_by_user(sk)) goto out; if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) goto out; /* icmp should have updated the destination cache entry */ dst = __sk_dst_check(sk, np->dst_cookie); if (dst == NULL) { struct inet_sock *inet = inet_sk(sk); struct flowi fl; /* BUGGG_FUTURE: Again, it is not clear how to handle rthdr case. Ignore this complexity for now. */ memset(&fl, 0, sizeof(fl)); fl.proto = IPPROTO_TCP; ipv6_addr_copy(&fl.fl6_dst, &np->daddr); ipv6_addr_copy(&fl.fl6_src, &np->saddr); fl.oif = sk->sk_bound_dev_if; fl.fl_ip_dport = inet->dport; fl.fl_ip_sport = inet->sport; if ((err = ip6_dst_lookup(sk, &dst, &fl))) { sk->sk_err_soft = -err; goto out; } if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) { sk->sk_err_soft = -err; goto out; } } else dst_hold(dst); if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) { tcp_sync_mss(sk, dst_mtu(dst)); tcp_simple_retransmit(sk); } /* else let the usual retransmit timer handle it */ dst_release(dst); goto out; } icmpv6_err_convert(type, code, &err); /* Might be for an request_sock */ switch (sk->sk_state) { struct request_sock *req, **prev; case TCP_LISTEN: if (sock_owned_by_user(sk)) goto out; req = inet6_csk_search_req(sk, &prev, th->dest, &hdr->daddr, &hdr->saddr, inet6_iif(skb)); if (!req) goto out; /* ICMPs are not backlogged, hence we cannot get * an established socket here. */ BUG_TRAP(req->sk == NULL); if (seq != tcp_rsk(req)->snt_isn) { NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS); goto out; } inet_csk_reqsk_queue_drop(sk, req, prev); goto out; case TCP_SYN_SENT: case TCP_SYN_RECV: /* Cannot happen. It can, it SYNs are crossed. --ANK */ if (!sock_owned_by_user(sk)) { TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); sk->sk_err = err; sk->sk_error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ tcp_done(sk); } else sk->sk_err_soft = err; goto out; } if (!sock_owned_by_user(sk) && np->recverr) { sk->sk_err = err; sk->sk_error_report(sk); } else sk->sk_err_soft = err; out: bh_unlock_sock(sk); sock_put(sk); }
static void bictcp_init(struct sock *sk)
{
	bictcp_reset(inet_csk_ca(sk));

	if (initial_ssthresh)
		tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
}
/* This will initiate an outgoing connection. */ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct inet_sock *inet = inet_sk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; struct rtable *rt; u32 daddr, nexthop; int tmp; int err; if (addr_len < sizeof(struct sockaddr_in)) return -EINVAL; if (usin->sin_family != AF_INET) return -EAFNOSUPPORT; nexthop = daddr = usin->sin_addr.s_addr; if (inet->opt && inet->opt->srr) { if (!daddr) return -EINVAL; nexthop = inet->opt->faddr; } tmp = ip_route_connect(&rt, nexthop, inet->saddr, RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, IPPROTO_TCP, inet->sport, usin->sin_port, sk); if (tmp < 0) return tmp; if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { ip_rt_put(rt); return -ENETUNREACH; } if (!inet->opt || !inet->opt->srr) daddr = rt->rt_dst; if (!inet->saddr) inet->saddr = rt->rt_src; inet->rcv_saddr = inet->saddr; if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) { /* Reset inherited state */ tp->rx_opt.ts_recent = 0; tp->rx_opt.ts_recent_stamp = 0; tp->write_seq = 0; } if (tcp_death_row.sysctl_tw_recycle && !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) { struct inet_peer *peer = rt_get_peer(rt); /* VJ's idea. We save last timestamp seen from * the destination in peer table, when entering state TIME-WAIT * and initialize rx_opt.ts_recent from it, when trying new connection. */ if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) { tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp; tp->rx_opt.ts_recent = peer->tcp_ts; } } inet->dport = usin->sin_port; inet->daddr = daddr; inet_csk(sk)->icsk_ext_hdr_len = 0; if (inet->opt) inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen; tp->rx_opt.mss_clamp = 536; /* Socket identity is still unknown (sport may be zero). * However we set state to SYN-SENT and not releasing socket * lock select source port, enter ourselves into the hash tables and * complete initialization after this. */ tcp_set_state(sk, TCP_SYN_SENT); err = inet_hash_connect(&tcp_death_row, sk); if (err) goto failure; err = ip_route_newports(&rt, IPPROTO_TCP, inet->sport, inet->dport, sk); if (err) goto failure; /* OK, now commit destination to socket. */ sk_setup_caps(sk, &rt->u.dst); if (!tp->write_seq) tp->write_seq = secure_tcp_sequence_number(inet->saddr, inet->daddr, inet->sport, usin->sin_port); inet->id = tp->write_seq ^ jiffies; err = tcp_connect(sk); rt = NULL; if (err) goto failure; return 0; failure: /* This unhashes the socket and releases the local port, if necessary. */ tcp_set_state(sk, TCP_CLOSE); ip_rt_put(rt); sk->sk_route_caps = 0; inet->dport = 0; return err; }
static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct vegas *vegas = inet_csk_ca(sk); if (!vegas->doing_vegas_now) { tcp_reno_cong_avoid(sk, ack, acked); return; } if (after(ack, vegas->beg_snd_nxt)) { /* Do the Vegas once-per-RTT cwnd adjustment. */ /* Save the extent of the current window so we can use this * at the end of the next RTT. */ vegas->beg_snd_nxt = tp->snd_nxt; /* We do the Vegas calculations only if we got enough RTT * samples that we can be reasonably sure that we got * at least one RTT sample that wasn't from a delayed ACK. * If we only had 2 samples total, * then that means we're getting only 1 ACK per RTT, which * means they're almost certainly delayed ACKs. * If we have 3 samples, we should be OK. */ if (vegas->cntRTT <= 2) { /* We don't have enough RTT samples to do the Vegas * calculation, so we'll behave like Reno. */ tcp_reno_cong_avoid(sk, ack, acked); } else { u32 rtt, diff; u64 target_cwnd; /* We have enough RTT samples, so, using the Vegas * algorithm, we determine if we should increase or * decrease cwnd, and by how much. */ /* Pluck out the RTT we are using for the Vegas * calculations. This is the min RTT seen during the * last RTT. Taking the min filters out the effects * of delayed ACKs, at the cost of noticing congestion * a bit later. */ rtt = vegas->minRTT; /* Calculate the cwnd we should have, if we weren't * going too fast. * * This is: * (actual rate in segments) * baseRTT */ target_cwnd = (u64)tp->snd_cwnd * vegas->baseRTT; do_div(target_cwnd, rtt); /* Calculate the difference between the window we had, * and the window we would like to have. This quantity * is the "Diff" from the Arizona Vegas papers. */ diff = tp->snd_cwnd * (rtt-vegas->baseRTT) / vegas->baseRTT; if (diff > gamma && tp->snd_cwnd <= tp->snd_ssthresh) { /* Going too fast. Time to slow down * and switch to congestion avoidance. */ /* Set cwnd to match the actual rate * exactly: * cwnd = (actual rate) * baseRTT * Then we add 1 because the integer * truncation robs us of full link * utilization. */ tp->snd_cwnd = min(tp->snd_cwnd, (u32)target_cwnd+1); tp->snd_ssthresh = tcp_vegas_ssthresh(tp); } else if (tp->snd_cwnd <= tp->snd_ssthresh) { /* Slow start. */ tcp_slow_start(tp, acked); } else { /* Congestion avoidance. */ /* Figure out where we would like cwnd * to be. */ if (diff > beta) { /* The old window was too fast, so * we slow down. */ tp->snd_cwnd--; tp->snd_ssthresh = tcp_vegas_ssthresh(tp); } else if (diff < alpha) { /* We don't have enough extra packets * in the network, so speed up. */ tp->snd_cwnd++; } else { /* Sending just as fast as we * should be. */ } } if (tp->snd_cwnd < 2) tp->snd_cwnd = 2; else if (tp->snd_cwnd > tp->snd_cwnd_clamp) tp->snd_cwnd = tp->snd_cwnd_clamp; tp->snd_ssthresh = tcp_current_ssthresh(sk); } /* Wipe the slate clean for the next RTT. */ vegas->cntRTT = 0; vegas->minRTT = 0x7fffffff; } /* Use normal slow start */ else if (tp->snd_cwnd <= tp->snd_ssthresh) tcp_slow_start(tp, acked); }
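In this version the per-RTT computation, dropping the fixed point used in the older code, is

\[
\mathrm{target} \;=\; \frac{\mathrm{cwnd}\cdot baseRTT}{rtt},
\qquad
\mathrm{diff} \;=\; \mathrm{cwnd}\cdot\frac{rtt - baseRTT}{baseRTT},
\]

where diff estimates the number of segments the flow keeps queued at the bottleneck; it is compared against gamma in slow start and against alpha/beta in congestion avoidance.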
static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct yeah *yeah = inet_csk_ca(sk); if (!tcp_is_cwnd_limited(sk)) return; if (tcp_in_slow_start(tp)) tcp_slow_start(tp, acked); else if (!yeah->doing_reno_now) { /* Scalable */ tp->snd_cwnd_cnt += yeah->pkts_acked; if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)) { if (tp->snd_cwnd < tp->snd_cwnd_clamp) tp->snd_cwnd++; tp->snd_cwnd_cnt = 0; } yeah->pkts_acked = 1; } else { /* Reno */ tcp_cong_avoid_ai(tp, tp->snd_cwnd, 1); } /* The key players are v_vegas.beg_snd_una and v_beg_snd_nxt. * * These are so named because they represent the approximate values * of snd_una and snd_nxt at the beginning of the current RTT. More * precisely, they represent the amount of data sent during the RTT. * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt, * we will calculate that (v_beg_snd_nxt - v_vegas.beg_snd_una) outstanding * bytes of data have been ACKed during the course of the RTT, giving * an "actual" rate of: * * (v_beg_snd_nxt - v_vegas.beg_snd_una) / (rtt duration) * * Unfortunately, v_vegas.beg_snd_una is not exactly equal to snd_una, * because delayed ACKs can cover more than one segment, so they * don't line up yeahly with the boundaries of RTTs. * * Another unfortunate fact of life is that delayed ACKs delay the * advance of the left edge of our send window, so that the number * of bytes we send in an RTT is often less than our cwnd will allow. * So we keep track of our cwnd separately, in v_beg_snd_cwnd. */ if (after(ack, yeah->vegas.beg_snd_nxt)) { /* We do the Vegas calculations only if we got enough RTT * samples that we can be reasonably sure that we got * at least one RTT sample that wasn't from a delayed ACK. * If we only had 2 samples total, * then that means we're getting only 1 ACK per RTT, which * means they're almost certainly delayed ACKs. * If we have 3 samples, we should be OK. */ if (yeah->vegas.cntRTT > 2) { u32 rtt, queue; u64 bw; /* We have enough RTT samples, so, using the Vegas * algorithm, we determine if we should increase or * decrease cwnd, and by how much. */ /* Pluck out the RTT we are using for the Vegas * calculations. This is the min RTT seen during the * last RTT. Taking the min filters out the effects * of delayed ACKs, at the cost of noticing congestion * a bit later. */ rtt = yeah->vegas.minRTT; /* Compute excess number of packets above bandwidth * Avoid doing full 64 bit divide. */ bw = tp->snd_cwnd; bw *= rtt - yeah->vegas.baseRTT; do_div(bw, rtt); queue = bw; if (queue > TCP_YEAH_ALPHA || rtt - yeah->vegas.baseRTT > (yeah->vegas.baseRTT / TCP_YEAH_PHY)) { if (queue > TCP_YEAH_ALPHA && tp->snd_cwnd > yeah->reno_count) { u32 reduction = min(queue / TCP_YEAH_GAMMA , tp->snd_cwnd >> TCP_YEAH_EPSILON); tp->snd_cwnd -= reduction; tp->snd_cwnd = max(tp->snd_cwnd, yeah->reno_count); tp->snd_ssthresh = tp->snd_cwnd; } if (yeah->reno_count <= 2) yeah->reno_count = max(tp->snd_cwnd>>1, 2U); else yeah->reno_count++; yeah->doing_reno_now = min(yeah->doing_reno_now + 1, 0xffffffU); } else {
				yeah->fast_count++;

				if (yeah->fast_count > TCP_YEAH_ZETA) {
					yeah->reno_count = 2;
					yeah->fast_count = 0;
				}

				yeah->doing_reno_now = 0;
			}

			yeah->lastQ = queue;
		}

		/* Save the extent of the current window so we can use this
		 * at the end of the next RTT.
		 */
		yeah->vegas.beg_snd_una  = yeah->vegas.beg_snd_nxt;
		yeah->vegas.beg_snd_nxt  = tp->snd_nxt;
		yeah->vegas.beg_snd_cwnd = tp->snd_cwnd;

		/* Wipe the slate clean for the next RTT. */
		yeah->vegas.cntRTT = 0;
		yeah->vegas.minRTT = 0x7fffffff;
	}
}
static u32 tcp_illinois_cwnd_undo(struct sock *sk)
{
	const struct illinois *ca = inet_csk_ca(sk);

	return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
}
static u32 tcp_scalable_ssthresh(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);

	return max(tp->snd_cwnd - (tp->snd_cwnd >> TCP_SCALABLE_MD_SCALE), 2U);
}
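With mainline's TCP_SCALABLE_MD_SCALE of 3, this is a fixed multiplicative decrease of 1/8 rather than Reno's 1/2:

\[
\mathrm{ssthresh} \;=\; \max\!\left(\mathrm{cwnd} - \left\lfloor \frac{\mathrm{cwnd}}{2^{3}} \right\rfloor,\; 2\right) \;\approx\; 0.875\,\mathrm{cwnd}.
\]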
static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
			  int addr_len)
{
	struct sockaddr_in6 *usin = (struct sockaddr_in6 *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct in6_addr *saddr = NULL, *final_p, final;
	struct rt6_info *rt;
	struct flowi6 fl6;
	struct dst_entry *dst;
	int addr_type;
	int err;

	if (addr_len < SIN6_LEN_RFC2133)
		return -EINVAL;

	if (usin->sin6_family != AF_INET6)
		return -EAFNOSUPPORT;

	memset(&fl6, 0, sizeof(fl6));

	if (np->sndflow) {
		fl6.flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK;
		IP6_ECN_flow_init(fl6.flowlabel);
		if (fl6.flowlabel & IPV6_FLOWLABEL_MASK) {
			struct ip6_flowlabel *flowlabel;
			flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
			if (flowlabel == NULL)
				return -EINVAL;
			usin->sin6_addr = flowlabel->dst;
			fl6_sock_release(flowlabel);
		}
	}

	if (ipv6_addr_any(&usin->sin6_addr))
		usin->sin6_addr.s6_addr[15] = 0x1;

	addr_type = ipv6_addr_type(&usin->sin6_addr);

	if (addr_type & IPV6_ADDR_MULTICAST)
		return -ENETUNREACH;

	if (addr_type & IPV6_ADDR_LINKLOCAL) {
		if (addr_len >= sizeof(struct sockaddr_in6) &&
		    usin->sin6_scope_id) {
			if (sk->sk_bound_dev_if &&
			    sk->sk_bound_dev_if != usin->sin6_scope_id)
				return -EINVAL;

			sk->sk_bound_dev_if = usin->sin6_scope_id;
		}

		if (!sk->sk_bound_dev_if)
			return -EINVAL;
	}

	if (tp->rx_opt.ts_recent_stamp &&
	    !ipv6_addr_equal(&np->daddr, &usin->sin6_addr)) {
		tp->rx_opt.ts_recent = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		tp->write_seq = 0;
	}

	np->daddr = usin->sin6_addr;
	np->flow_label = fl6.flowlabel;

	if (addr_type == IPV6_ADDR_MAPPED) {
		u32 exthdrlen = icsk->icsk_ext_hdr_len;
		struct sockaddr_in sin;

		SOCK_DEBUG(sk, "connect: ipv4 mapped\n");

		if (__ipv6_only_sock(sk))
			return -ENETUNREACH;

		sin.sin_family = AF_INET;
		sin.sin_port = usin->sin6_port;
		sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3];

		icsk->icsk_af_ops = &ipv6_mapped;
		sk->sk_backlog_rcv = tcp_v4_do_rcv;
#ifdef CONFIG_TCP_MD5SIG
		tp->af_specific = &tcp_sock_ipv6_mapped_specific;
#endif

		err = tcp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin));

		if (err) {
			icsk->icsk_ext_hdr_len = exthdrlen;
			icsk->icsk_af_ops = &ipv6_specific;
			sk->sk_backlog_rcv = tcp_v6_do_rcv;
#ifdef CONFIG_TCP_MD5SIG
			tp->af_specific = &tcp_sock_ipv6_specific;
#endif
			goto failure;
		} else {
			ipv6_addr_set_v4mapped(inet->inet_saddr, &np->saddr);
			ipv6_addr_set_v4mapped(inet->inet_rcv_saddr,
					       &np->rcv_saddr);
		}

		return err;
	}

	if (!ipv6_addr_any(&np->rcv_saddr))
		saddr = &np->rcv_saddr;

	fl6.flowi6_proto = IPPROTO_TCP;
	fl6.daddr = np->daddr;
	fl6.saddr = saddr ? *saddr : np->saddr;
	fl6.flowi6_oif = sk->sk_bound_dev_if;
	fl6.flowi6_mark = sk->sk_mark;
	fl6.fl6_dport = usin->sin6_port;
	fl6.fl6_sport = inet->inet_sport;

	final_p = fl6_update_dst(&fl6, np->opt, &final);

	security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));

	dst = ip6_dst_lookup_flow(sk, &fl6, final_p, true);
	if (IS_ERR(dst)) {
		err = PTR_ERR(dst);
		goto failure;
	}

	if (saddr == NULL) {
		saddr = &fl6.saddr;
		np->rcv_saddr = *saddr;
	}

	/* set the source address */
	np->saddr = *saddr;
	inet->inet_rcv_saddr = LOOPBACK4_IPV6;

	sk->sk_gso_type = SKB_GSO_TCPV6;
	__ip6_dst_store(sk, dst, NULL, NULL);

	rt = (struct rt6_info *)dst;
	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp &&
	    ipv6_addr_equal(&rt->rt6i_dst.addr, &np->daddr)) {
		struct inet_peer *peer = rt6_get_peer(rt);

		if (peer) {
			inet_peer_refcheck(peer);
			if ((u32)get_seconds() - peer->tcp_ts_stamp <=
			    TCP_PAWS_MSL) {
				tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
				tp->rx_opt.ts_recent = peer->tcp_ts;
			}
		}
	}

	icsk->icsk_ext_hdr_len = 0;
	if (np->opt)
		icsk->icsk_ext_hdr_len = (np->opt->opt_flen +
					  np->opt->opt_nflen);

	tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) -
			       sizeof(struct ipv6hdr);

	inet->inet_dport = usin->sin6_port;

	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet6_hash_connect(&tcp_death_row, sk);
	if (err)
		goto late_failure;

	if (!tp->write_seq)
		tp->write_seq = secure_tcpv6_sequence_number(np->saddr.s6_addr32,
							     np->daddr.s6_addr32,
							     inet->inet_sport,
							     inet->inet_dport);

	err = tcp_connect(sk);
	if (err)
		goto late_failure;

	return 0;

late_failure:
	tcp_set_state(sk, TCP_CLOSE);
	__sk_dst_reset(sk);
failure:
	inet->inet_dport = 0;
	sk->sk_route_caps = 0;
	return err;
}
/* Create a new IPv4 subflow. * * We are in user-context and meta-sock-lock is hold. */ int mptcp_init4_subsockets(struct sock *meta_sk, const struct mptcp_loc4 *loc, struct mptcp_rem4 *rem) { struct tcp_sock *tp; struct sock *sk; struct sockaddr_in loc_in, rem_in; struct socket_alloc sock_full; struct socket *sock = (struct socket *)&sock_full; int ret; /** First, create and prepare the new socket */ memcpy(&sock_full, meta_sk->sk_socket, sizeof(sock_full)); sock->state = SS_UNCONNECTED; sock->ops = NULL; ret = inet_create(sock_net(meta_sk), sock, IPPROTO_TCP, 1); if (unlikely(ret < 0)) { net_err_ratelimited("%s inet_create failed ret: %d\n", __func__, ret); return ret; } sk = sock->sk; tp = tcp_sk(sk); /* All subsockets need the MPTCP-lock-class */ lockdep_set_class_and_name(&(sk)->sk_lock.slock, &meta_slock_key, meta_slock_key_name); lockdep_init_map(&(sk)->sk_lock.dep_map, meta_key_name, &meta_key, 0); ret = mptcp_add_sock(meta_sk, sk, loc->loc4_id, rem->rem4_id, GFP_KERNEL); if (ret) { net_err_ratelimited("%s mptcp_add_sock failed ret: %d\n", __func__, ret); goto error; } tp->mptcp->slave_sk = 1; /* Initializing the timer for an MPTCP subflow */ timer_setup(&tp->mptcp->mptcp_ack_timer, mptcp_ack_handler, 0); /** Then, connect the socket to the peer */ loc_in.sin_family = AF_INET; rem_in.sin_family = AF_INET; loc_in.sin_port = 0; if (rem->port) rem_in.sin_port = rem->port; else rem_in.sin_port = inet_sk(meta_sk)->inet_dport; loc_in.sin_addr = loc->addr; rem_in.sin_addr = rem->addr; if (loc->if_idx) sk->sk_bound_dev_if = loc->if_idx; ret = kernel_bind(sock, (struct sockaddr *)&loc_in, sizeof(struct sockaddr_in)); if (ret < 0) { net_err_ratelimited("%s: token %#x bind() to %pI4 index %d failed, error %d\n", __func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token, &loc_in.sin_addr, loc->if_idx, ret); goto error; } mptcp_debug("%s: token %#x pi %d src_addr:%pI4:%d dst_addr:%pI4:%d ifidx: %d\n", __func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token, tp->mptcp->path_index, &loc_in.sin_addr, ntohs(loc_in.sin_port), &rem_in.sin_addr, ntohs(rem_in.sin_port), loc->if_idx); ret = kernel_connect(sock, (struct sockaddr *)&rem_in, sizeof(struct sockaddr_in), O_NONBLOCK); if (ret < 0 && ret != -EINPROGRESS) { net_err_ratelimited("%s: MPTCP subsocket connect() failed, error %d\n", __func__, ret); goto error; } MPTCP_INC_STATS(sock_net(meta_sk), MPTCP_MIB_JOINSYNTX); sk_set_socket(sk, meta_sk->sk_socket); sk->sk_wq = meta_sk->sk_wq; return 0; error: /* May happen if mptcp_add_sock fails first */ if (!mptcp(tp)) { tcp_close(sk, 0); } else { local_bh_disable(); mptcp_sub_force_close(sk); local_bh_enable(); } return ret; }
int generic_send_file (tux_req_t *req, struct socket *sock, int cachemiss) { sock_send_desc_t sock_desc; int len, want, nonblock = !cachemiss; struct tcp_opt *tp = tcp_sk(sock->sk); tp->nonagle = 2; sock_desc.sock = sock; sock_desc.req = req; repeat: Dprintk("generic_send_file(%p,%d,%p) called, f_pos: %Ld, output_len: %Ld.\n", req, nonblock, sock, req->in_file->f_pos, req->output_len); if (req->proto->check_req_err(req, cachemiss)) return -1; if (connection_too_fast(req) == 2) { len = -5; goto out; } if (req->total_file_len < req->in_file->f_pos) TUX_BUG(); req->desc.written = 0; /* * Careful, output_len can be 64-bit, while 'want' can be 32-bit. */ if (req->output_len > SEND_BLOCKSIZE) want = SEND_BLOCKSIZE; else want = req->output_len; req->desc.count = want; req->desc.arg.buf = (char *) &sock_desc; req->desc.error = 0; Dprintk("sendfile(), desc.count: %d.\n", req->desc.count); do_generic_file_read(req->in_file, &req->in_file->f_pos, &req->desc, sock_send_actor, nonblock); if (req->desc.written > 0) { req->bytes_sent += req->desc.written; req->output_len -= req->desc.written; } if (!nonblock && (req->desc.error == -EWOULDBLOCKIO)) TUX_BUG(); Dprintk("sendfile() wrote: %d bytes.\n", req->desc.written); if (req->output_len && !req->desc.written && !req->desc.error) { #if CONFIG_TUX_DEBUG req->bytes_expected = 0; #endif req->in_file->f_pos = 0; req->error = TUX_ERROR_CONN_CLOSE; zap_request(req, cachemiss); return -1; } switch (req->desc.error) { case -EWOULDBLOCKIO: len = -3; break; case -EAGAIN: no_write_space: Dprintk("sk->wmem_queued: %d, sk->sndbuf: %d.\n", sock->sk->sk_wmem_queued, sock->sk->sk_sndbuf); len = -4; break; default: len = req->desc.written; #if CONFIG_TUX_DEBUG if (req->desc.error) TDprintk("TUX: sendfile() returned error %d (signals pending: %08lx)!\n", req->desc.error, current->pending.signal.sig[0]); #endif if (!req->desc.error) { if (req->output_len < 0) BUG(); if (req->output_len) { if (test_bit(SOCK_NOSPACE, &sock->flags)) goto no_write_space; goto repeat; } } #if CONFIG_TUX_DEBUG if (req->desc.written != want) TDprintk("TUX: sendfile() wrote %d bytes, wanted %d! (pos %Ld) (signals pending: %08lx).\n", req->desc.written, want, req->in_file->f_pos, current->pending.signal.sig[0]); else Dprintk("TUX: sendfile() FINISHED for req %p, wrote %d bytes.\n", req, req->desc.written); req->bytes_expected = 0; #endif break; } out: Dprintk("sendfile() wrote %d bytes.\n", len); return len; }