/* * tcp_time_wait_processing() handles processing of incoming packets when * the tcp_t is in the TIME_WAIT state. * * A TIME_WAIT tcp_t that has an associated open TCP end point (not in * detached state) is never put on the time wait list. */ void tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq, uint32_t seg_ack, int seg_len, tcpha_t *tcpha, ip_recv_attr_t *ira) { int32_t bytes_acked; int32_t gap; int32_t rgap; tcp_opt_t tcpopt; uint_t flags; uint32_t new_swnd = 0; conn_t *nconnp; conn_t *connp = tcp->tcp_connp; tcp_stack_t *tcps = tcp->tcp_tcps; BUMP_LOCAL(tcp->tcp_ibsegs); DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp); flags = (unsigned int)tcpha->tha_flags & 0xFF; new_swnd = ntohs(tcpha->tha_win) << ((tcpha->tha_flags & TH_SYN) ? 0 : tcp->tcp_snd_ws); if (tcp->tcp_snd_ts_ok) { if (!tcp_paws_check(tcp, tcpha, &tcpopt)) { tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK); goto done; } } gap = seg_seq - tcp->tcp_rnxt; rgap = tcp->tcp_rwnd - (gap + seg_len); if (gap < 0) { TCPS_BUMP_MIB(tcps, tcpInDataDupSegs); TCPS_UPDATE_MIB(tcps, tcpInDataDupBytes, (seg_len > -gap ? -gap : seg_len)); seg_len += gap; if (seg_len < 0 || (seg_len == 0 && !(flags & TH_FIN))) { if (flags & TH_RST) { goto done; } if ((flags & TH_FIN) && seg_len == -1) { /* * When TCP receives a duplicate FIN in * TIME_WAIT state, restart the 2 MSL timer. * See page 73 in RFC 793. Make sure this TCP * is already on the TIME_WAIT list. If not, * just restart the timer. */ if (TCP_IS_DETACHED(tcp)) { if (tcp_time_wait_remove(tcp, NULL) == B_TRUE) { tcp_time_wait_append(tcp); TCP_DBGSTAT(tcps, tcp_rput_time_wait); } } else { ASSERT(tcp != NULL); TCP_TIMER_RESTART(tcp, tcps->tcps_time_wait_interval); } tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK); goto done; } flags |= TH_ACK_NEEDED; seg_len = 0; goto process_ack; } /* Fix seg_seq, and chew the gap off the front. 
*/ seg_seq = tcp->tcp_rnxt; } if ((flags & TH_SYN) && gap > 0 && rgap < 0) { /* * Make sure that when we accept the connection, pick * an ISS greater than (tcp_snxt + ISS_INCR/2) for the * old connection. * * The next ISS generated is equal to tcp_iss_incr_extra * + ISS_INCR/2 + other components depending on the * value of tcp_strong_iss. We pre-calculate the new * ISS here and compare with tcp_snxt to determine if * we need to make adjustment to tcp_iss_incr_extra. * * The above calculation is ugly and is a * waste of CPU cycles... */ uint32_t new_iss = tcps->tcps_iss_incr_extra; int32_t adj; ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; switch (tcps->tcps_strong_iss) { case 2: { /* Add time and MD5 components. */ uint32_t answer[4]; struct { uint32_t ports; in6_addr_t src; in6_addr_t dst; } arg; MD5_CTX context; mutex_enter(&tcps->tcps_iss_key_lock); context = tcps->tcps_iss_key; mutex_exit(&tcps->tcps_iss_key_lock); arg.ports = connp->conn_ports; /* We use MAPPED addresses in tcp_iss_init */ arg.src = connp->conn_laddr_v6; arg.dst = connp->conn_faddr_v6; MD5Update(&context, (uchar_t *)&arg, sizeof (arg)); MD5Final((uchar_t *)answer, &context); answer[0] ^= answer[1] ^ answer[2] ^ answer[3]; new_iss += (gethrtime() >> ISS_NSEC_SHT) + answer[0]; break; } case 1: /* Add time component and min random (i.e. 1). */ new_iss += (gethrtime() >> ISS_NSEC_SHT) + 1; break; default: /* Add only time component. */ new_iss += (uint32_t)gethrestime_sec() * ISS_INCR; break; } if ((adj = (int32_t)(tcp->tcp_snxt - new_iss)) > 0) { /* * New ISS not guaranteed to be ISS_INCR/2 * ahead of the current tcp_snxt, so add the * difference to tcp_iss_incr_extra. */ tcps->tcps_iss_incr_extra += adj; } /* * If tcp_clean_death() can not perform the task now, * drop the SYN packet and let the other side re-xmit. * Otherwise pass the SYN packet back in, since the * old tcp state has been cleaned up or freed. 
*/ if (tcp_clean_death(tcp, 0) == -1) goto done; nconnp = ipcl_classify(mp, ira, ipst); if (nconnp != NULL) { TCP_STAT(tcps, tcp_time_wait_syn_success); /* Drops ref on nconnp */ tcp_reinput(nconnp, mp, ira, ipst); return; } goto done; }
/*
 * Urgent-data leg of the fused output path.  tcp_fuse_output() calls this
 * for mblks that are not M_DATA, i.e. a T_EXDATA_REQ header (M_PROTO or
 * M_PCPROTO) with the urgent payload chained on b_cont.
 *
 * On return the header mblk has been rewritten in place as a T_EXDATA_IND
 * and every older T_EXDATA_IND header has been stripped from the peer's
 * receive list.  The message itself is not enqueued here.
 */
void
tcp_fuse_output_urg(tcp_t *tcp, mblk_t *mp)
{
	tcp_t *peer_tcp;
	struct T_exdata_ind *exind;
	mblk_t *cur;
	mblk_t *prev = NULL;

	peer_tcp = tcp->tcp_loopback_peer;

	ASSERT(tcp->tcp_fused);
	ASSERT(peer_tcp != NULL && peer_tcp->tcp_loopback_peer == tcp);
	ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
	ASSERT(mp->b_cont != NULL && DB_TYPE(mp->b_cont) == M_DATA);
	ASSERT(MBLKL(mp) >= sizeof (*exind) && MBLKL(mp->b_cont) > 0);

	/*
	 * Every T_EXDATA_REQ coming down from above stands for a fresh
	 * urgent pointer, and for each one the receiving application is
	 * signalled (SIGURG) so that it enters urgent mode, much like the
	 * regular tcp path does.  The position of the urgent pointer need
	 * not be tracked because each T_EXDATA_REQ implicitly "advances"
	 * it.
	 *
	 * The urgent payload is delivered behind a T_EXDATA_IND header and
	 * queued after whatever the receiver has not read yet.  A tcp has
	 * only one urgent pointer (out-of-band mark), so when new urgent
	 * data shows up before the previous urgent data was consumed, the
	 * earlier mark is lost.  The loop at the bottom reproduces that by
	 * dropping the old T_EXDATA_IND headers, which effectively turns
	 * the old urgent data into ordinary data.
	 */
	ASSERT(tcp->tcp_valid_bits & TCP_URG_VALID);

	/* The sending side is no longer in urgent mode. */
	tcp->tcp_valid_bits &= ~TCP_URG_VALID;

	/*
	 * Record that a signal is owed to the peer.  The flag is cleared
	 * only once SIGURG has actually been delivered and is independent
	 * of tcp_fused: delivery still takes place after an unfuse, which
	 * covers a sender that closes/unfuses right after sending urgent
	 * data while the accept has not completed.
	 */
	peer_tcp->tcp_fused_sigurg = B_TRUE;

	/* Turn the T_EXDATA_REQ header into a T_EXDATA_IND in place. */
	DB_TYPE(mp) = M_PROTO;
	exind = (struct T_exdata_ind *)mp->b_rptr;
	exind->PRIM_type = T_EXDATA_IND;
	exind->MORE_flag = 0;
	mp->b_wptr = (uchar_t *)(exind + 1);

	TCP_STAT(tcp_fusion_urg);
	BUMP_MIB(&tcp_mib, tcpOutUrg);

	/*
	 * Walk the peer's receive list along b_next.  Whenever an entry is
	 * headed by something other than M_DATA (an old T_EXDATA_IND), drop
	 * that header, let its data continuation take its place in the
	 * list, and patch up the list pointers.  tcp_rcv_last_tail is left
	 * alone since it never points at a T_EXDATA_IND.
	 */
	for (cur = peer_tcp->tcp_rcv_list; cur != NULL;
	    prev = cur, cur = cur->b_next) {
		mblk_t *stale;

		if (DB_TYPE(cur) == M_DATA)
			continue;

		stale = cur;
		ASSERT(DB_TYPE(stale->b_cont) == M_DATA);

		/* The payload inherits the header's position in the list. */
		cur = stale->b_cont;
		cur->b_next = stale->b_next;
		stale->b_cont = NULL;
		stale->b_next = NULL;

		if (peer_tcp->tcp_rcv_list == stale)
			peer_tcp->tcp_rcv_list = cur;
		if (peer_tcp->tcp_rcv_last_head == stale)
			peer_tcp->tcp_rcv_last_head = cur;
		if (prev != NULL)
			prev->b_next = cur;

		freeb(stale);
	}
}
/* * Fusion output routine, called by tcp_output() and tcp_wput_proto(). */ boolean_t tcp_fuse_output(tcp_t *tcp, mblk_t *mp, uint32_t send_size) { tcp_t *peer_tcp = tcp->tcp_loopback_peer; uint_t max_unread; boolean_t flow_stopped; boolean_t urgent = (DB_TYPE(mp) != M_DATA); ASSERT(tcp->tcp_fused); ASSERT(peer_tcp != NULL && peer_tcp->tcp_loopback_peer == tcp); ASSERT(tcp->tcp_connp->conn_sqp == peer_tcp->tcp_connp->conn_sqp); ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); max_unread = peer_tcp->tcp_fuse_rcv_unread_hiwater; /* If this connection requires IP, unfuse and use regular path */ if (TCP_LOOPBACK_IP(tcp) || TCP_LOOPBACK_IP(peer_tcp) || IPP_ENABLED(IPP_LOCAL_OUT|IPP_LOCAL_IN)) { TCP_STAT(tcp_fusion_aborted); tcp_unfuse(tcp); return (B_FALSE); } if (send_size == 0) { freemsg(mp); return (B_TRUE); } /* * Handle urgent data; we either send up SIGURG to the peer now * or do it later when we drain, in case the peer is detached * or if we're short of memory for M_PCSIG mblk. */ if (urgent) { /* * We stop synchronous streams when we have urgent data * queued to prevent tcp_fuse_rrw() from pulling it. If * for some reasons the urgent data can't be delivered * below, synchronous streams will remain stopped until * someone drains the tcp_rcv_list. */ TCP_FUSE_SYNCSTR_PLUG_DRAIN(peer_tcp); tcp_fuse_output_urg(tcp, mp); } mutex_enter(&peer_tcp->tcp_fuse_lock); /* * Wake up and signal the peer; it is okay to do this before * enqueueing because we are holding the lock. One of the * advantages of synchronous streams is the ability for us to * find out when the application performs a read on the socket, * by way of tcp_fuse_rrw() entry point being called. Every * data that gets enqueued onto the receiver is treated as if * it has arrived at the receiving endpoint, thus generating * SIGPOLL/SIGIO for asynchronous socket just as in the strrput() * case. However, we only wake up the application when necessary, * i.e. 
during the first enqueue. When tcp_fuse_rrw() is called * it will send everything upstream. */ if (peer_tcp->tcp_direct_sockfs && !urgent && !TCP_IS_DETACHED(peer_tcp)) { if (peer_tcp->tcp_rcv_list == NULL) STR_WAKEUP_SET(STREAM(peer_tcp->tcp_rq)); /* Update poll events and send SIGPOLL/SIGIO if necessary */ STR_SENDSIG(STREAM(peer_tcp->tcp_rq)); } /* * Enqueue data into the peer's receive list; we may or may not * drain the contents depending on the conditions below. */ tcp_rcv_enqueue(peer_tcp, mp, send_size); /* In case it wrapped around and also to keep it constant */ peer_tcp->tcp_rwnd += send_size; /* * Exercise flow-control when needed; we will get back-enabled * in either tcp_accept_finish(), tcp_unfuse(), or tcp_fuse_rrw(). * If tcp_direct_sockfs is on or if the peer endpoint is detached, * we emulate streams flow control by checking the peer's queue * size and high water mark; otherwise we simply use canputnext() * to decide if we need to stop our flow. * * The outstanding unread data block check does not apply for a * detached receiver; this is to avoid unnecessary blocking of the * sender while the accept is currently in progress and is quite * similar to the regular tcp. */ if (TCP_IS_DETACHED(peer_tcp) || max_unread == 0) max_unread = UINT_MAX; flow_stopped = tcp->tcp_flow_stopped; if (!flow_stopped && (((peer_tcp->tcp_direct_sockfs || TCP_IS_DETACHED(peer_tcp)) && (peer_tcp->tcp_rcv_cnt >= peer_tcp->tcp_fuse_rcv_hiwater || ++peer_tcp->tcp_fuse_rcv_unread_cnt >= max_unread)) || (!peer_tcp->tcp_direct_sockfs && !TCP_IS_DETACHED(peer_tcp) && !canputnext(peer_tcp->tcp_rq)))) { tcp_setqfull(tcp); flow_stopped = B_TRUE; TCP_STAT(tcp_fusion_flowctl); DTRACE_PROBE4(tcp__fuse__output__flowctl, tcp_t *, tcp, uint_t, send_size, uint_t, peer_tcp->tcp_rcv_cnt, uint_t, peer_tcp->tcp_fuse_rcv_unread_cnt); } else if (flow_stopped &&
/*
 * Invoked by the eager tcp when it moves from SYN_RCVD to ESTABLISHED.
 * Sets up a direct (fused) path between the eager and the active-connect
 * tcp so that regular tcp processing can be bypassed when circumstances
 * allow.
 *
 * Both endpoints have to live in the same squeue.  That rules out
 * simultaneous active connects, since a connection cannot easily be moved
 * to another squeue after it has been created; the eager, by contrast, is
 * given the same squeue as the active-connect tcp during open.
 *
 * If the peer cannot be found, is not eligible, or an mblk allocation
 * fails, the function returns with the endpoints left unfused.
 */
void
tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph)
{
	conn_t *connp = tcp->tcp_connp;
	conn_t *peer_connp;
	tcp_t *peer_tcp;
	queue_t *peer_rq;
	struct stroptions *sopts;
	mblk_t *mp;

	ASSERT(!tcp->tcp_fused);
	ASSERT(tcp->tcp_loopback);
	ASSERT(tcp->tcp_loopback_peer == NULL);

	/*
	 * The listener's q_hiwat is inherited further down.  tcp_listener
	 * cannot be relied upon for that: we run after T_CONN_IND has gone
	 * up, and tcp_wput_accept() may be called independently and clear
	 * it, hence tcp_saved_listener.  The listener itself stays around
	 * until tcp_accept_finish() is called on this eager, which cannot
	 * happen before we finish because we are inside the eager's
	 * perimeter.
	 */
	ASSERT(tcp->tcp_saved_listener != NULL);

	/*
	 * Find the peer: the ESTABLISHED endpoint whose address-port
	 * quadruplet is the reverse of ours, which is unique in the
	 * system.  The zone check applies to the loopback address but not
	 * to local addresses, so that fusion can happen across zones.
	 */
	peer_connp = (tcp->tcp_ipversion == IPV4_VERSION) ?
	    ipcl_conn_tcp_lookup_reversed_ipv4(connp, (ipha_t *)iphdr, tcph) :
	    ipcl_conn_tcp_lookup_reversed_ipv6(connp, (ip6_t *)iphdr, tcph);

	/*
	 * Continue only when a peer exists, shares our squeue and is not a
	 * raw socket.  The eager's squeue was assigned at SYN processing
	 * time in ip_fanout_tcp{_v6}.  Sharing a squeue is necessary but
	 * not sufficient, so more checks follow.
	 */
	ASSERT(peer_connp == NULL || peer_connp != connp);
	if (peer_connp == NULL)
		return;
	if (peer_connp->conn_sqp != connp->conn_sqp ||
	    !IPCL_IS_TCP(peer_connp)) {
		TCP_STAT(tcp_fusion_unqualified);
		CONN_DEC_REF(peer_connp);
		return;
	}

	peer_tcp = peer_connp->conn_tcp;	/* active connect tcp */

	ASSERT(peer_tcp != NULL && peer_tcp != tcp && !peer_tcp->tcp_fused);
	ASSERT(peer_tcp->tcp_loopback && peer_tcp->tcp_loopback_peer == NULL);
	ASSERT(peer_connp->conn_sqp == connp->conn_sqp);

	/*
	 * Final eligibility test on both endpoints: give up for non-simple
	 * TCP/IP, or when IPsec/IPQoS policy or kernel SSL is present.
	 */
	if (tcp->tcp_unfusable || peer_tcp->tcp_unfusable ||
	    TCP_LOOPBACK_IP(tcp) || TCP_LOOPBACK_IP(peer_tcp) ||
	    tcp->tcp_kssl_ent != NULL ||
	    IPP_ENABLED(IPP_LOCAL_OUT|IPP_LOCAL_IN)) {
		TCP_STAT(tcp_fusion_unqualified);
		CONN_DEC_REF(peer_connp);
		return;
	}

	peer_rq = peer_tcp->tcp_rq;

	ASSERT(!TCP_IS_DETACHED(peer_tcp) && peer_rq != NULL);
	ASSERT(tcp->tcp_fused_sigurg_mp == NULL);
	ASSERT(peer_tcp->tcp_fused_sigurg_mp == NULL);
	ASSERT(tcp->tcp_kssl_ctx == NULL);

	/*
	 * Both endpoints get drained during unfuse, and a SIGURG may have
	 * to be sent up at that point.  Pre-allocate one M_PCSIG mblk per
	 * endpoint now so one is guaranteed to be on hand; they are used
	 * only during/after unfuse.
	 */
	mp = allocb(1, BPRI_HI);
	if (mp == NULL)
		goto nomem;
	tcp->tcp_fused_sigurg_mp = mp;

	mp = allocb(1, BPRI_HI);
	if (mp == NULL)
		goto nomem;
	peer_tcp->tcp_fused_sigurg_mp = mp;

	/* mblk that will carry the M_SETOPTS sent to the peer below. */
	mp = allocb(sizeof (*sopts), BPRI_HI);
	if (mp == NULL)
		goto nomem;

	/* All allocations succeeded: link the two endpoints together. */
	tcp->tcp_loopback_peer = peer_tcp;
	peer_tcp->tcp_loopback_peer = tcp;
	tcp->tcp_fused = B_TRUE;
	peer_tcp->tcp_fused = B_TRUE;

	/*
	 * Fused endpoints never go through the regular tcp paths, so
	 * tcp_unsent is zeroed on both sides.  A non-zero value would be
	 * asking for trouble, particularly after an unfuse, when the
	 * regular paths may run again and expect xmit_list and friends to
	 * be set up correctly.
	 */
	tcp->tcp_unsent = 0;
	peer_tcp->tcp_unsent = 0;

	tcp_timers_stop(tcp);
	tcp_timers_stop(peer_tcp);

	/*
	 * We are still a detached eager with no queue of our own until the
	 * accept happens, yet the peer may start sending as soon as fusion
	 * completes.  To be able to flow-control it in the meantime, adopt
	 * the listener's q_hiwat; this is provisional and is redone in
	 * tcp_accept_finish().
	 */
	(void) tcp_fuse_set_rcv_hiwat(tcp,
	    tcp->tcp_saved_listener->tcp_rq->q_hiwat);

	/*
	 * Build the stream head options for the active-connect side: no
	 * write offset (no room is needed for TCP/IP headers), a max block
	 * size so that writes are not broken up, and the receive high
	 * water mark.  Only the active-connect tcp can be configured here;
	 * the detached eager is handled later in tcp_accept_finish().
	 */
	DB_TYPE(mp) = M_SETOPTS;
	mp->b_wptr += sizeof (*sopts);
	sopts = (struct stroptions *)mp->b_rptr;
	sopts->so_flags = SO_MAXBLK | SO_WROFF | SO_HIWAT;
	sopts->so_wroff = 0;
	sopts->so_maxblk = tcp_maxpsz_set(peer_tcp, B_FALSE);

	/*
	 * Remember the stream head's high water mark for the peer; it is
	 * used for flow control in tcp_fuse_output().
	 */
	sopts->so_hiwat = tcp_fuse_set_rcv_hiwat(peer_tcp, peer_rq->q_hiwat);

	/* Hand the options to the peer's stream head. */
	putnext(peer_rq, mp);

	CONN_DEC_REF(peer_connp);
	return;

nomem:
	/* Release whichever M_PCSIG mblks were obtained before the failure. */
	if (tcp->tcp_fused_sigurg_mp != NULL) {
		freeb(tcp->tcp_fused_sigurg_mp);
		tcp->tcp_fused_sigurg_mp = NULL;
	}
	if (peer_tcp->tcp_fused_sigurg_mp != NULL) {
		freeb(peer_tcp->tcp_fused_sigurg_mp);
		peer_tcp->tcp_fused_sigurg_mp = NULL;
	}
	CONN_DEC_REF(peer_connp);
}