/*
 * Look up the rds_sock bound to the given local address/port pair.
 *
 * The rx path can race with rds_release(). A socket that rds_release()
 * has already marked SOCK_DEAD is treated as not bound, so no reference
 * is handed back to the rx path.
 *
 * On success the returned socket carries an extra reference that the
 * caller must drop with rds_sock_put().
 */
struct rds_sock *rds_find_bound(__be32 addr, __be16 port)
{
	u64 lookup_key;
	struct rds_sock *rs;

	/* The hash key packs the address into the high 32 bits and the
	 * port into the low bits, matching how the entry was inserted.
	 */
	lookup_key = ((u64)addr << 32) | port;

	rs = rhashtable_lookup_fast(&bind_hash_table, &lookup_key, ht_parms);
	if (!rs || sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
		rs = NULL;	/* not bound, or already being torn down */
	else
		rds_sock_addref(rs);

	rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr, ntohs(port));
	return rs;
}
/*
 * Look up the rds_sock bound to the given local address/port pair.
 *
 * The rx path can race with rds_release(). A socket that rds_release()
 * has already marked SOCK_DEAD is treated as not bound, so no reference
 * is handed back to the rx path.
 *
 * On success the returned socket carries an extra reference that the
 * caller must drop with rds_sock_put().
 */
struct rds_sock *rds_find_bound(__be32 addr, __be16 port)
{
	struct rds_sock *rs;
	unsigned long flags;

	/* The bind tree is only consistent under rds_bind_lock; the
	 * SOCK_DEAD check must happen under the same lock so a racing
	 * rds_release() cannot slip in between lookup and addref.
	 */
	spin_lock_irqsave(&rds_bind_lock, flags);
	rs = rds_bind_tree_walk(addr, port, NULL);
	if (!rs || sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
		rs = NULL;	/* not bound, or already being torn down */
	else
		rds_sock_addref(rs);
	spin_unlock_irqrestore(&rds_bind_lock, flags);

	rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr, ntohs(port));
	return rs;
}
/*
 * The transport must make sure that this is serialized against other
 * rx and conn reset on this specific conn.
 *
 * We currently assert that only one fragmented message will be sent
 * down a connection at a time.  This lets us reassemble in the conn
 * instead of per-flow which means that we don't have to go digging through
 * flows to tear down partial reassembly progress on conn failure and
 * we save flow lookup and locking for each frag arrival.  It does mean
 * that small messages will wait behind large ones.  Fragmenting at all
 * is only to reduce the memory consumption of pre-posted buffers.
 *
 * The caller passes in saddr and daddr instead of us getting it from the
 * conn.  This lets loopback, who only has one conn for both directions,
 * tell us which roles the addrs in the conn are playing for this message.
 */
void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
		       struct rds_incoming *inc, gfp_t gfp)
{
	struct rds_sock *rs = NULL;
	struct sock *sk;
	unsigned long flags;
	struct rds_conn_path *cp;

	inc->i_conn = conn;
	inc->i_rx_jiffies = jiffies;
	/* multipath-capable transports stamp the receiving path on the
	 * incoming message; legacy transports always use path 0.
	 */
	if (conn->c_trans->t_mp_capable)
		cp = inc->i_conn_path;
	else
		cp = &conn->c_path[0];

	rdsdebug("conn %p next %llu inc %p seq %llu len %u sport %u dport %u "
		 "flags 0x%x rx_jiffies %lu\n", conn,
		 (unsigned long long)cp->cp_next_rx_seq,
		 inc,
		 (unsigned long long)be64_to_cpu(inc->i_hdr.h_sequence),
		 be32_to_cpu(inc->i_hdr.h_len),
		 be16_to_cpu(inc->i_hdr.h_sport),
		 be16_to_cpu(inc->i_hdr.h_dport),
		 inc->i_hdr.h_flags,
		 inc->i_rx_jiffies);

	/*
	 * Sequence numbers should only increase.  Messages get their
	 * sequence number as they're queued in a sending conn.  They
	 * can be dropped, though, if the sending socket is closed before
	 * they hit the wire.  So sequence numbers can skip forward
	 * under normal operation.  They can also drop back in the conn
	 * failover case as previously sent messages are resent down the
	 * new instance of a conn.  We drop those, otherwise we have
	 * to assume that the next valid seq does not come after a
	 * hole in the fragment stream.
	 *
	 * The headers don't give us a way to realize if fragments of
	 * a message have been dropped.  We assume that frags that arrive
	 * to a flow are part of the current message on the flow that is
	 * being reassembled.  This means that senders can't drop messages
	 * from the sending conn until all their frags are sent.
	 *
	 * XXX we could spend more on the wire to get more robust failure
	 * detection, arguably worth it to avoid data corruption.
	 */
	/* Only drop old sequence numbers when the header is explicitly
	 * marked as a retransmission; an un-flagged backwards jump is
	 * accepted and resets cp_next_rx_seq below.
	 */
	if (be64_to_cpu(inc->i_hdr.h_sequence) < cp->cp_next_rx_seq &&
	    (inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) {
		rds_stats_inc(s_recv_drop_old_seq);
		goto out;
	}
	cp->cp_next_rx_seq = be64_to_cpu(inc->i_hdr.h_sequence) + 1;

	/* dport 0 is a ping: answer with a pong and never deliver it to
	 * a socket.  A zero sport as well would leave the pong with no
	 * destination, so that combination is ignored.
	 */
	if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) {
		if (inc->i_hdr.h_sport == 0) {
			rdsdebug("ignore ping with 0 sport from 0x%x\n", saddr);
			goto out;
		}
		rds_stats_inc(s_recv_ping);
		rds_send_pong(cp, inc->i_hdr.h_sport);
		/* if this is a handshake ping, start multipath if necessary */
		if (RDS_HS_PROBE(inc->i_hdr.h_sport, inc->i_hdr.h_dport)) {
			rds_recv_hs_exthdrs(&inc->i_hdr, cp->cp_conn);
			rds_start_mprds(cp->cp_conn);
		}
		goto out;
	}

	/* a probe-port destination with sport 0 is the peer's handshake
	 * reply; it carries extension headers but no payload for a socket.
	 */
	if (inc->i_hdr.h_dport == RDS_FLAG_PROBE_PORT &&
	    inc->i_hdr.h_sport == 0) {
		rds_recv_hs_exthdrs(&inc->i_hdr, cp->cp_conn);
		/* if this is a handshake pong, start multipath if necessary */
		rds_start_mprds(cp->cp_conn);
		wake_up(&cp->cp_conn->c_hs_waitq);
		goto out;
	}

	/* takes a reference on rs that is dropped at "out" */
	rs = rds_find_bound(daddr, inc->i_hdr.h_dport);
	if (!rs) {
		rds_stats_inc(s_recv_drop_no_sock);
		goto out;
	}

	/* Process extension headers */
	rds_recv_incoming_exthdrs(inc, rs);

	/* We can be racing with rds_release() which marks the socket dead. */
	sk = rds_rs_to_sk(rs);

	/* serialize with rds_release -> sock_orphan */
	write_lock_irqsave(&rs->rs_recv_lock, flags);
	if (!sock_flag(sk, SOCK_DEAD)) {
		rdsdebug("adding inc %p to rs %p's recv queue\n", inc, rs);
		rds_stats_inc(s_recv_queued);
		rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
				      be32_to_cpu(inc->i_hdr.h_len),
				      inc->i_hdr.h_dport);
		if (sock_flag(sk, SOCK_RCVTSTAMP))
			do_gettimeofday(&inc->i_rx_tstamp);
		/* the recv queue owns a reference on inc until userspace
		 * consumes it
		 */
		rds_inc_addref(inc);
		list_add_tail(&inc->i_item, &rs->rs_recv_queue);
		__rds_wake_sk_sleep(sk);
	} else {
		rds_stats_inc(s_recv_drop_dead_sock);
	}
	write_unlock_irqrestore(&rs->rs_recv_lock, flags);

out:
	if (rs)
		rds_sock_put(rs);
}
/*
 * The transport must make sure that this is serialized against other
 * rx and conn reset on this specific conn.
 *
 * We currently assert that only one fragmented message will be sent
 * down a connection at a time.  This lets us reassemble in the conn
 * instead of per-flow which means that we don't have to go digging through
 * flows to tear down partial reassembly progress on conn failure and
 * we save flow lookup and locking for each frag arrival.  It does mean
 * that small messages will wait behind large ones.  Fragmenting at all
 * is only to reduce the memory consumption of pre-posted buffers.
 *
 * The caller passes in saddr and daddr instead of us getting it from the
 * conn.  This lets loopback, who only has one conn for both directions,
 * tell us which roles the addrs in the conn are playing for this message.
 *
 * This variant routes the message through the NF_RDS_PRE_ROUTING
 * netfilter hook when the bound socket has netfilter enabled; otherwise
 * it delivers directly via rds_recv_local().
 */
void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
		       struct rds_incoming *inc, gfp_t gfp)
{
	struct sk_buff *skb;
	struct rds_sock *rs;
	struct sock *sk;
	struct rds_nf_hdr *dst, *org;
	int ret;

	rdsdebug(KERN_ALERT "incoming: conn %p, inc %p, %u.%u.%u.%u : %d -> %u.%u.%u.%u : %d\n",
		 conn, inc, NIPQUAD(saddr), inc->i_hdr.h_sport,
		 NIPQUAD(daddr), inc->i_hdr.h_dport);

	/* initialize some globals */
	rs = NULL;
	sk = NULL;

	/* save off the original connection against which the request arrived */
	inc->i_oconn = conn;
	inc->i_skb = NULL;

	/* lets find a socket to which this request belongs; on success this
	 * takes a reference that we must drop before leaving this function
	 */
	rs = rds_find_bound(daddr, inc->i_hdr.h_dport);

	/* pass it on locally if there is no socket bound, or if netfilter is
	 * disabled for this socket */
	if (NULL == rs || !rs->rs_netfilter_enabled) {
		/* drop the reference if we had taken one */
		if (NULL != rs)
			rds_sock_put(rs);

		rds_recv_local(conn, saddr, daddr, inc, gfp);
		return;
	}

	/* otherwise pull out the socket */
	sk = rds_rs_to_sk(rs);

	/* create an skb with some additional space to store our rds_nf_hdr info */
	skb = alloc_skb(sizeof(struct rds_nf_hdr) * 2, gfp);
	if (NULL == skb) {
		/* if we have allocation problems, then we just need to depart */
		rdsdebug("failure to allocate space for inc %p, %u.%u.%u.%u -> %u.%u.%u.%u\n",
			 inc, NIPQUAD(saddr), NIPQUAD(daddr));
		/* drop the socket reference taken by rds_find_bound() above;
		 * returning without this leaked rs on every alloc failure
		 */
		rds_sock_put(rs);
		rds_recv_local(conn, saddr, daddr, inc, gfp);
		return;
	}

	/* once we've allocated an skb, also store it in our structures */
	inc->i_skb = skb;

	/* now pull out the rds headers */
	dst = rds_nf_hdr_dst(skb);
	org = rds_nf_hdr_org(skb);

	/* now update our rds_nf_hdr for tracking locations of the request */
	dst->saddr = saddr;
	dst->daddr = daddr;
	dst->sport = inc->i_hdr.h_sport;
	dst->dport = inc->i_hdr.h_dport;
	dst->flags = 0;

	/* assign the appropriate protocol if any */
	if (NULL != sk) {
		dst->protocol = sk->sk_protocol;
		/* NOTE(review): dst->sk is stored but the rs reference is
		 * dropped just below, so hook code must not assume this
		 * pointer pins the socket — confirm against hook users.
		 */
		dst->sk = sk;
	} else {
		dst->protocol = 0;
		dst->sk = NULL;
	}

	/* cleanup any references taken */
	if (NULL != rs)
		rds_sock_put(rs);

	/* the original info is just a copy */
	memcpy(org, dst, sizeof(struct rds_nf_hdr));

	/* convert our local data structures in the message to a generalized skb form */
	if (conn->c_trans->inc_to_skb(inc, skb)) {
		rdsdebug("handing off to PRE_ROUTING hook\n");
		/* call down through the hook layers */
		ret = NF_HOOK(PF_RDS_HOOK, NF_RDS_PRE_ROUTING, skb, NULL, NULL,
			      rds_recv_ok);
	}
	/* if we had a failure to convert, then just assuming to continue as local */
	else {
		rdsdebug("failed to create skb form, conn %p, inc %p, %u.%u.%u.%u -> %u.%u.%u.%u\n",
			 conn, inc, NIPQUAD(saddr), NIPQUAD(daddr));
		ret = 1;
	}

	/* pull back out the rds headers; the hook may have rewritten them */
	dst = rds_nf_hdr_dst(skb);
	org = rds_nf_hdr_org(skb);

	/* now depending upon we got back we can perform appropriate activities */
	if (dst->flags & RDS_NF_HDR_FLAG_DONE) {
		/* the hook consumed/terminated the message */
		rds_recv_drop(conn, saddr, daddr, inc, gfp);
	}
	/* this is the normal good processed state */
	else if (ret >= 0) {
		/* check the original header and if changed do the needful */
		if (dst->saddr == org->saddr && dst->daddr == org->daddr &&
		    conn->c_trans->skb_local(skb)) {
			rds_recv_local(conn, saddr, daddr, inc, gfp);
		}
		/* the send both case does both a local recv and a reroute */
		else if (dst->flags & RDS_NF_HDR_FLAG_BOTH) {
			/* we must be sure to take an extra reference on the inc
			 * to be sure it doesn't accidentally get freed in between */
			rds_inc_addref(inc);

			/* send it up the stream locally */
			rds_recv_local(conn, saddr, daddr, inc, gfp);

			/* and also reroute the request */
			rds_recv_route(conn, inc, gfp);

			/* since we are done with processing we can drop this additional reference */
			rds_inc_put(inc);
		}
		/* anything else is a change in possible destination so pass to route */
		else
			rds_recv_route(conn, inc, gfp);
	}
	/* we don't really expect an error state from this call that isn't the done above */
	else {
		/* we don't really know how to handle this yet - just ignore for now */
		printk(KERN_ERR "unacceptible state for skb ret %d, conn %p, inc %p, "
		       "%u.%u.%u.%u -> %u.%u.%u.%u\n",
		       ret, conn, inc, NIPQUAD(saddr), NIPQUAD(daddr));
	}
}