/* Example 1 */
/*
 * be very careful here.  This is being called as the condition in
 * wait_event_*() needs to cope with being called many times.
 */
static int rds_next_incoming(struct rds_sock *rs, struct rds_incoming **inc)
{
	unsigned long flags;

	/* Only go looking for a message if the caller does not already hold
	 * one; wait_event_*() re-evaluates this condition many times, so the
	 * function must be idempotent once *inc is set. */
	if (!*inc) {
		read_lock_irqsave(&rs->rs_recv_lock, flags);
		if (!list_empty(&rs->rs_recv_queue)) {
			/* Peek (do not dequeue) the head of the recv queue and
			 * take a reference so it stays valid after the lock is
			 * released. */
			*inc = list_entry(rs->rs_recv_queue.next,
					  struct rds_incoming,
					  i_item);
			rds_inc_addref(*inc);
		}
		/* NOTE(review): this snippet is truncated by extraction — the
		 * matching read_unlock_irqrestore() and the function's return
		 * statement are not visible here. */
/* Example 2 */
/*
 * The transport must make sure that this is serialized against other
 * rx and conn reset on this specific conn.
 *
 * We currently assert that only one fragmented message will be sent
 * down a connection at a time.  This lets us reassemble in the conn
 * instead of per-flow which means that we don't have to go digging through
 * flows to tear down partial reassembly progress on conn failure and
 * we save flow lookup and locking for each frag arrival.  It does mean
 * that small messages will wait behind large ones.  Fragmenting at all
 * is only to reduce the memory consumption of pre-posted buffers.
 *
 * The caller passes in saddr and daddr instead of us getting it from the
 * conn.  This lets loopback, who only has one conn for both directions,
 * tell us which roles the addrs in the conn are playing for this message.
 */
void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
		       struct rds_incoming *inc, gfp_t gfp)
{
	struct rds_sock *rs = NULL;
	struct sock *sk;
	unsigned long flags;
	struct rds_conn_path *cp;

	inc->i_conn = conn;
	inc->i_rx_jiffies = jiffies;
	/* Multipath-capable transports tag each incoming message with its
	 * path; single-path transports always use path 0. */
	if (conn->c_trans->t_mp_capable)
		cp = inc->i_conn_path;
	else
		cp = &conn->c_path[0];

	rdsdebug("conn %p next %llu inc %p seq %llu len %u sport %u dport %u "
		 "flags 0x%x rx_jiffies %lu\n", conn,
		 (unsigned long long)cp->cp_next_rx_seq,
		 inc,
		 (unsigned long long)be64_to_cpu(inc->i_hdr.h_sequence),
		 be32_to_cpu(inc->i_hdr.h_len),
		 be16_to_cpu(inc->i_hdr.h_sport),
		 be16_to_cpu(inc->i_hdr.h_dport),
		 inc->i_hdr.h_flags,
		 inc->i_rx_jiffies);

	/*
	 * Sequence numbers should only increase.  Messages get their
	 * sequence number as they're queued in a sending conn.  They
	 * can be dropped, though, if the sending socket is closed before
	 * they hit the wire.  So sequence numbers can skip forward
	 * under normal operation.  They can also drop back in the conn
	 * failover case as previously sent messages are resent down the
	 * new instance of a conn.  We drop those, otherwise we have
	 * to assume that the next valid seq does not come after a
	 * hole in the fragment stream.
	 *
	 * The headers don't give us a way to realize if fragments of
	 * a message have been dropped.  We assume that frags that arrive
	 * to a flow are part of the current message on the flow that is
	 * being reassembled.  This means that senders can't drop messages
	 * from the sending conn until all their frags are sent.
	 *
	 * XXX we could spend more on the wire to get more robust failure
	 * detection, arguably worth it to avoid data corruption.
	 */
	/* Drop only stale messages that are marked as retransmissions;
	 * a stale seq without the flag still advances cp_next_rx_seq below. */
	if (be64_to_cpu(inc->i_hdr.h_sequence) < cp->cp_next_rx_seq &&
	    (inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) {
		rds_stats_inc(s_recv_drop_old_seq);
		goto out;
	}
	cp->cp_next_rx_seq = be64_to_cpu(inc->i_hdr.h_sequence) + 1;

	/* Destination port 0 marks a ping; answer it with a pong instead of
	 * delivering it to a socket (unless pings are sysctl-disabled). */
	if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) {
		if (inc->i_hdr.h_sport == 0) {
			rdsdebug("ignore ping with 0 sport from 0x%x\n", saddr);
			goto out;
		}
		rds_stats_inc(s_recv_ping);
		rds_send_pong(cp, inc->i_hdr.h_sport);
		/* if this is a handshake ping, start multipath if necessary */
		if (RDS_HS_PROBE(inc->i_hdr.h_sport, inc->i_hdr.h_dport)) {
			rds_recv_hs_exthdrs(&inc->i_hdr, cp->cp_conn);
			rds_start_mprds(cp->cp_conn);
		}
		goto out;
	}

	/* Probe-port destination with sport 0 is the handshake reply
	 * (pong); consume its extension headers and never deliver it. */
	if (inc->i_hdr.h_dport ==  RDS_FLAG_PROBE_PORT &&
	    inc->i_hdr.h_sport == 0) {
		rds_recv_hs_exthdrs(&inc->i_hdr, cp->cp_conn);
		/* if this is a handshake pong, start multipath if necessary */
		rds_start_mprds(cp->cp_conn);
		wake_up(&cp->cp_conn->c_hs_waitq);
		goto out;
	}

	/* Look up the bound socket; rds_find_bound() takes a reference that
	 * is released at "out". */
	rs = rds_find_bound(daddr, inc->i_hdr.h_dport);
	if (!rs) {
		rds_stats_inc(s_recv_drop_no_sock);
		goto out;
	}

	/* Process extension headers */
	rds_recv_incoming_exthdrs(inc, rs);

	/* We can be racing with rds_release() which marks the socket dead. */
	sk = rds_rs_to_sk(rs);

	/* serialize with rds_release -> sock_orphan */
	write_lock_irqsave(&rs->rs_recv_lock, flags);
	if (!sock_flag(sk, SOCK_DEAD)) {
		rdsdebug("adding inc %p to rs %p's recv queue\n", inc, rs);
		rds_stats_inc(s_recv_queued);
		/* Account the message against the receive buffer and notify
		 * congestion state before queuing. */
		rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
				      be32_to_cpu(inc->i_hdr.h_len),
				      inc->i_hdr.h_dport);
		if (sock_flag(sk, SOCK_RCVTSTAMP))
			do_gettimeofday(&inc->i_rx_tstamp);
		/* The queue holds its own reference until rds_recvmsg (or
		 * socket teardown) removes the entry. */
		rds_inc_addref(inc);
		list_add_tail(&inc->i_item, &rs->rs_recv_queue);
		__rds_wake_sk_sleep(sk);
	} else {
		rds_stats_inc(s_recv_drop_dead_sock);
	}
	write_unlock_irqrestore(&rs->rs_recv_lock, flags);

out:
	if (rs)
		rds_sock_put(rs);
}
/* Example 3 */
/*
 * The transport must make sure that this is serialized against other
 * rx and conn reset on this specific conn.
 *
 * We currently assert that only one fragmented message will be sent
 * down a connection at a time.  This lets us reassemble in the conn
 * instead of per-flow which means that we don't have to go digging through
 * flows to tear down partial reassembly progress on conn failure and
 * we save flow lookup and locking for each frag arrival.  It does mean
 * that small messages will wait behind large ones.  Fragmenting at all
 * is only to reduce the memory consumption of pre-posted buffers.
 *
 * The caller passes in saddr and daddr instead of us getting it from the
 * conn.  This lets loopback, who only has one conn for both directions,
 * tell us which roles the addrs in the conn are playing for this message.
 */
void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
		       struct rds_incoming *inc, gfp_t gfp)
{
	struct sk_buff *skb;
	struct rds_sock *rs;
	struct sock *sk;
	struct rds_nf_hdr *dst, *org;
	int    ret;

	rdsdebug(KERN_ALERT "incoming:  conn %p, inc %p, %u.%u.%u.%u : %d -> %u.%u.%u.%u : %d\n",
		 conn, inc, NIPQUAD(saddr), inc->i_hdr.h_sport, NIPQUAD(daddr), inc->i_hdr.h_dport);

	/* initialize some globals */
	rs = NULL;
	sk = NULL;

	/* save off the original connection against which the request arrived */
	inc->i_oconn = conn;
	inc->i_skb   = NULL;

	/* lets find a socket to which this request belongs; takes a reference
	 * on success that must be dropped with rds_sock_put() */
	rs = rds_find_bound(daddr, inc->i_hdr.h_dport);

	/* pass it on locally if there is no socket bound, or if netfilter is
	 * disabled for this socket */
	if (NULL == rs || !rs->rs_netfilter_enabled) {
		/* drop the reference if we had taken one */
		if (NULL != rs)
			rds_sock_put(rs);

		rds_recv_local(conn, saddr, daddr, inc, gfp);
		return;
	}

	/* otherwise pull out the socket */
	sk = rds_rs_to_sk(rs);

	/* create an skb with some additional space to store our rds_nf_hdr info */
	skb = alloc_skb(sizeof(struct rds_nf_hdr) * 2, gfp);
	if (NULL == skb) {
		/* if we have allocation problems, fall back to plain local
		 * delivery rather than dropping the message.
		 * BUGFIX: the address format previously read "%u.%d.%u.%u" —
		 * one octet of the destination address used %d instead of %u. */
		rdsdebug("failure to allocate space for inc %p, %u.%u.%u.%u -> %u.%u.%u.%u\n",
			 inc, NIPQUAD(saddr), NIPQUAD(daddr));
		rds_recv_local(conn, saddr, daddr, inc, gfp);
		/* rds_find_bound()'s reference must be released on this path */
		rds_sock_put(rs);
		return;
	}

	/* once we've allocated an skb, also store it in our structures;
	 * ownership of the skb follows the inc from here on */
	inc->i_skb = skb;

	/* now pull out the rds headers */
	dst = rds_nf_hdr_dst(skb);
	org = rds_nf_hdr_org(skb);

	/* now update our rds_nf_hdr for tracking locations of the request */
	dst->saddr = saddr;
	dst->daddr = daddr;
	dst->sport = inc->i_hdr.h_sport;
	dst->dport = inc->i_hdr.h_dport;
	dst->flags = 0;

	/* assign the appropriate protocol if any.
	 * NOTE(review): dst->sk keeps a raw pointer to the socket while the
	 * rs reference is dropped just below — hook callbacks must not
	 * dereference it after the socket is freed; verify lifetime. */
	if (NULL != sk) {
		dst->protocol = sk->sk_protocol;
		dst->sk = sk;
	} else {
		dst->protocol = 0;
		dst->sk = NULL;
	}

	/* cleanup any references taken */
	if (NULL != rs)
		rds_sock_put(rs);

	/* the original info is just a copy */
	memcpy(org, dst, sizeof(struct rds_nf_hdr));

	/* convert our local data structures in the message to a generalized skb form */
	if (conn->c_trans->inc_to_skb(inc, skb)) {
		rdsdebug("handing off to PRE_ROUTING hook\n");
		/* call down through the hook layers */
		ret = NF_HOOK(PF_RDS_HOOK, NF_RDS_PRE_ROUTING, skb, NULL, NULL, rds_recv_ok);
	}
	/* if we had a failure to convert, then just assuming to continue as local */
	else {
		rdsdebug("failed to create skb form, conn %p, inc %p, %u.%u.%u.%u -> %u.%u.%u.%u\n",
			 conn, inc, NIPQUAD(saddr), NIPQUAD(daddr));
		ret = 1;
	}

	/* pull back out the rds headers — the hook may have rewritten them */
	dst = rds_nf_hdr_dst(skb);
	org = rds_nf_hdr_org(skb);

	/* now depending upon what we got back we can perform appropriate activities */
	if (dst->flags & RDS_NF_HDR_FLAG_DONE) {
		/* the hook already consumed (or rejected) the message */
		rds_recv_drop(conn, saddr, daddr, inc, gfp);
	}
	/* this is the normal good processed state */
	else if (ret >= 0) {
		/* check the original header and if changed do the needful */
		if (dst->saddr == org->saddr && dst->daddr == org->daddr &&
		    conn->c_trans->skb_local(skb)) {
			rds_recv_local(conn, saddr, daddr, inc, gfp);
		}
		/* the send both case does both a local recv and a reroute */
		else if (dst->flags & RDS_NF_HDR_FLAG_BOTH) {
			/* we must be sure to take an extra reference on the inc
			 * to be sure it doesn't accidentally get freed in between */
			rds_inc_addref(inc);

			/* send it up the stream locally */
			rds_recv_local(conn, saddr, daddr, inc, gfp);

			/* and also reroute the request */
			rds_recv_route(conn, inc, gfp);

			/* since we are done with processing we can drop this additional reference */
			rds_inc_put(inc);

		}
		/* anything else is a change in possible destination so pass to route */
		else
			rds_recv_route(conn, inc, gfp);
	}
	/* we don't really expect an error state from this call that isn't the done above */
	else {
		/* we don't really know how to handle this yet - just ignore for now */
		printk(KERN_ERR "unacceptible state for skb ret %d, conn %p, inc %p, "
				"%u.%u.%u.%u -> %u.%u.%u.%u\n",
		       ret, conn, inc, NIPQUAD(saddr), NIPQUAD(daddr));
	}
}