void rds_send_worker(struct work_struct *work)
{
	struct rds_conn_path *cp = container_of(work,
						struct rds_conn_path,
						cp_send_w.work);
	int ret;

	if (rds_conn_path_state(cp) == RDS_CONN_UP) {
		clear_bit(RDS_LL_SEND_FULL, &cp->cp_flags);
		ret = rds_send_xmit(cp);
		cond_resched();
		rdsdebug("conn %p ret %d\n", cp->cp_conn, ret);
		switch (ret) {
		case -EAGAIN:
			rds_stats_inc(s_send_immediate_retry);
			queue_delayed_work(rds_wq, &cp->cp_send_w, 0);
			break;
		case -ENOMEM:
			rds_stats_inc(s_send_delayed_retry);
			queue_delayed_work(rds_wq, &cp->cp_send_w, 2);
			/* fall through */
		default:
			break;
		}
	}
}
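/*
 * Illustrative sketch, not part of the original file: one way a transport
 * completion path might re-kick the send worker above once link-layer
 * space frees up.  It uses only symbols already visible here (rds_wq,
 * cp_send_w, RDS_LL_SEND_FULL); the function name itself is hypothetical.
 */
static void example_tx_space_available(struct rds_conn_path *cp)
{
	/* clear the "link layer full" marker so rds_send_xmit() will run */
	clear_bit(RDS_LL_SEND_FULL, &cp->cp_flags);
	/* delay of 0: rds_send_worker() runs as soon as a worker is free */
	queue_delayed_work(rds_wq, &cp->cp_send_w, 0);
}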
void rds_recv_worker(struct work_struct *work)
{
	struct rds_connection *conn = container_of(work,
						   struct rds_connection,
						   c_recv_w.work);
	int ret;

	if (rds_conn_state(conn) == RDS_CONN_UP) {
		ret = conn->c_trans->recv(conn);
		rdsdebug("conn %p ret %d\n", conn, ret);
		switch (ret) {
		case -EAGAIN:
			rds_stats_inc(s_recv_immediate_retry);
			queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
			break;
		case -ENOMEM:
			rds_stats_inc(s_recv_delayed_retry);
			queue_delayed_work(rds_wq, &conn->c_recv_w, 2);
			/* fall through */
		default:
			break;
		}
	}
}
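/*
 * Illustrative sketch, not taken from any real transport: the return-value
 * contract rds_recv_worker() depends on.  A transport ->recv() returns 0
 * when the receive path is drained, -EAGAIN to ask for an immediate
 * requeue, or -ENOMEM to request the 2-jiffy back-off seen above.  Both
 * helpers below are hypothetical.
 */
bool example_alloc_rx_buf(struct rds_connection *conn);	/* hypothetical */
bool example_frames_pending(struct rds_connection *conn);	/* hypothetical */

static int example_trans_recv(struct rds_connection *conn)
{
	if (!example_alloc_rx_buf(conn))
		return -ENOMEM;		/* worker retries after 2 jiffies */
	if (example_frames_pending(conn))
		return -EAGAIN;		/* worker requeues immediately */
	return 0;			/* drained; wait for the next event */
}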
static void rds_recv_forward(struct rds_connection *conn,
			     struct rds_incoming *inc, gfp_t gfp)
{
	int len, ret;
	struct rds_nf_hdr *dst, *org;
	struct rds_sock *rs = NULL;

	/* pull out the destination and original rds headers */
	dst = rds_nf_hdr_dst(inc->i_skb);
	org = rds_nf_hdr_org(inc->i_skb);

	/* find the proper output socket - it should be the local one on
	 * which we originated */
	rs = rds_find_bound(dst->saddr, dst->sport);
	if (!rs) {
		rdsdebug("failed to find output rds_socket src %u.%u.%u.%u : %u, inc %p, conn %p\n",
			 NIPQUAD(dst->saddr), dst->sport, inc, conn);
		rds_stats_inc(s_recv_drop_no_sock);
		goto out;
	}

	/* pull out the actual message len */
	len = be32_to_cpu(inc->i_hdr.h_len);

	/* now let's see if we can send it all */
	ret = rds_send_internal(conn, rs, inc->i_skb, gfp);
	if (len != ret) {
		rdsdebug("failed to send rds_data dst %u.%u.%u.%u : %u, inc %p, conn %p, len %d != ret %d\n",
			 NIPQUAD(dst->daddr), dst->dport, inc, conn, len, ret);
		goto out;
	}

	/* all good so we are done */
	rds_sock_put(rs);
	return;

out:
	/* cleanup any handles */
	if (rs)
		rds_sock_put(rs);

	/* on error, take a shot at hook cleanup ... */
	NF_HOOK(PF_RDS_HOOK, NF_RDS_FORWARD_ERROR, inc->i_skb, NULL, NULL,
		rds_recv_ok);

	/* ... then hand the request off to normal local processing on the
	 * old connection */
	rds_recv_local(inc->i_oconn, org->saddr, org->daddr, inc, gfp);
}
/*
 * The transport must make sure that this is serialized against other
 * rx and conn reset on this specific conn.
 *
 * We currently assert that only one fragmented message will be sent
 * down a connection at a time.  This lets us reassemble in the conn
 * instead of per-flow, which means that we don't have to go digging
 * through flows to tear down partial reassembly progress on conn
 * failure, and we save flow lookup and locking for each frag arrival.
 * It does mean that small messages will wait behind large ones.
 * Fragmenting at all is only to reduce the memory consumption of
 * pre-posted buffers.
 *
 * The caller passes in saddr and daddr instead of us getting them from
 * the conn.  This lets loopback, which only has one conn for both
 * directions, tell us which roles the addrs in the conn are playing
 * for this message.
 */
void rds_recv_incoming(struct rds_connection *conn, __be32 saddr,
		       __be32 daddr, struct rds_incoming *inc, gfp_t gfp)
{
	struct rds_sock *rs = NULL;
	struct sock *sk;
	unsigned long flags;
	struct rds_conn_path *cp;

	inc->i_conn = conn;
	inc->i_rx_jiffies = jiffies;
	if (conn->c_trans->t_mp_capable)
		cp = inc->i_conn_path;
	else
		cp = &conn->c_path[0];

	rdsdebug("conn %p next %llu inc %p seq %llu len %u sport %u dport %u "
		 "flags 0x%x rx_jiffies %lu\n", conn,
		 (unsigned long long)cp->cp_next_rx_seq,
		 inc,
		 (unsigned long long)be64_to_cpu(inc->i_hdr.h_sequence),
		 be32_to_cpu(inc->i_hdr.h_len),
		 be16_to_cpu(inc->i_hdr.h_sport),
		 be16_to_cpu(inc->i_hdr.h_dport),
		 inc->i_hdr.h_flags,
		 inc->i_rx_jiffies);

	/*
	 * Sequence numbers should only increase.  Messages get their
	 * sequence number as they're queued in a sending conn.  They
	 * can be dropped, though, if the sending socket is closed before
	 * they hit the wire.  So sequence numbers can skip forward
	 * under normal operation.  They can also drop back in the conn
	 * failover case as previously sent messages are resent down the
	 * new instance of a conn.  We drop those, otherwise we have
	 * to assume that the next valid seq does not come after a
	 * hole in the fragment stream.
	 *
	 * The headers don't give us a way to realize if fragments of
	 * a message have been dropped.  We assume that frags that arrive
	 * to a flow are part of the current message on the flow that is
	 * being reassembled.  This means that senders can't drop messages
	 * from the sending conn until all their frags are sent.
	 *
	 * XXX we could spend more on the wire to get more robust failure
	 * detection, arguably worth it to avoid data corruption.
	 */
	if (be64_to_cpu(inc->i_hdr.h_sequence) < cp->cp_next_rx_seq &&
	    (inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) {
		rds_stats_inc(s_recv_drop_old_seq);
		goto out;
	}
	cp->cp_next_rx_seq = be64_to_cpu(inc->i_hdr.h_sequence) + 1;

	if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) {
		if (inc->i_hdr.h_sport == 0) {
			rdsdebug("ignore ping with 0 sport from 0x%x\n",
				 saddr);
			goto out;
		}
		rds_stats_inc(s_recv_ping);
		rds_send_pong(cp, inc->i_hdr.h_sport);
		/* if this is a handshake ping, start multipath if necessary */
		if (RDS_HS_PROBE(inc->i_hdr.h_sport, inc->i_hdr.h_dport)) {
			rds_recv_hs_exthdrs(&inc->i_hdr, cp->cp_conn);
			rds_start_mprds(cp->cp_conn);
		}
		goto out;
	}

	if (inc->i_hdr.h_dport == RDS_FLAG_PROBE_PORT &&
	    inc->i_hdr.h_sport == 0) {
		rds_recv_hs_exthdrs(&inc->i_hdr, cp->cp_conn);
		/* if this is a handshake pong, start multipath if necessary */
		rds_start_mprds(cp->cp_conn);
		wake_up(&cp->cp_conn->c_hs_waitq);
		goto out;
	}

	rs = rds_find_bound(daddr, inc->i_hdr.h_dport);
	if (!rs) {
		rds_stats_inc(s_recv_drop_no_sock);
		goto out;
	}

	/* Process extension headers */
	rds_recv_incoming_exthdrs(inc, rs);

	/* We can be racing with rds_release() which marks the socket dead. */
	sk = rds_rs_to_sk(rs);

	/* serialize with rds_release -> sock_orphan */
	write_lock_irqsave(&rs->rs_recv_lock, flags);
	if (!sock_flag(sk, SOCK_DEAD)) {
		rdsdebug("adding inc %p to rs %p's recv queue\n", inc, rs);
		rds_stats_inc(s_recv_queued);
		rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
				      be32_to_cpu(inc->i_hdr.h_len),
				      inc->i_hdr.h_dport);
		if (sock_flag(sk, SOCK_RCVTSTAMP))
			do_gettimeofday(&inc->i_rx_tstamp);
		rds_inc_addref(inc);
		list_add_tail(&inc->i_item, &rs->rs_recv_queue);
		__rds_wake_sk_sleep(sk);
	} else {
		rds_stats_inc(s_recv_drop_dead_sock);
	}
	write_unlock_irqrestore(&rs->rs_recv_lock, flags);

out:
	if (rs)
		rds_sock_put(rs);
}
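/*
 * Worked example, not part of the original file: the retransmit drop rule
 * above in miniature.  A fragment is discarded only when it is both older
 * than the next expected sequence number and flagged as a retransmission;
 * a forward skip (a message dropped on the send side before hitting the
 * wire) is accepted and advances cp_next_rx_seq.  Function name is
 * hypothetical.
 */
static bool example_drop_retransmit(u64 h_sequence, u64 next_rx_seq,
				    u8 h_flags)
{
	return h_sequence < next_rx_seq &&
	       (h_flags & RDS_FLAG_RETRANSMITTED);
}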
/**
 * rds_page_remainder_alloc - build up regions of a message.
 *
 * @scat: Scatter list for message
 * @bytes: the number of bytes needed.
 * @gfp: the waiting behaviour of the allocation
 *
 * @gfp is always ORed with __GFP_HIGHMEM.  Callers must be prepared to
 * kmap the pages, etc.
 *
 * If @bytes is at least a full page then this just returns a page from
 * alloc_page().
 *
 * If @bytes is a partial page then this stores the unused region of the
 * page in a per-cpu structure.  Future partial-page allocations may be
 * satisfied from that cached region.  This lets us waste less memory on
 * small allocations with minimal complexity.  It works because the
 * transmit path passes read-only page regions down to devices.  They
 * hold a page reference until they are done with the region.
 */
int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
			     gfp_t gfp)
{
	struct rds_page_remainder *rem;
	unsigned long flags;
	struct page *page;
	int ret;

	gfp |= __GFP_HIGHMEM;

	/* jump straight to allocation if we're trying for a huge page */
	if (bytes >= PAGE_SIZE) {
		page = alloc_page(gfp);
		if (!page) {
			ret = -ENOMEM;
		} else {
			sg_set_page(scat, page, PAGE_SIZE, 0);
			ret = 0;
		}
		goto out;
	}

	rem = &per_cpu(rds_page_remainders, get_cpu());
	local_irq_save(flags);

	while (1) {
		/* avoid a tiny region getting stuck by tossing it */
		if (rem->r_page && bytes > (PAGE_SIZE - rem->r_offset)) {
			rds_stats_inc(s_page_remainder_miss);
			__free_page(rem->r_page);
			rem->r_page = NULL;
		}

		/* hand out a fragment from the cached page */
		if (rem->r_page && bytes <= (PAGE_SIZE - rem->r_offset)) {
			sg_set_page(scat, rem->r_page, bytes, rem->r_offset);
			get_page(sg_page(scat));

			if (rem->r_offset != 0)
				rds_stats_inc(s_page_remainder_hit);

			rem->r_offset += bytes;
			if (rem->r_offset == PAGE_SIZE) {
				__free_page(rem->r_page);
				rem->r_page = NULL;
			}
			ret = 0;
			break;
		}

		/* alloc if there is nothing for us to use */
		local_irq_restore(flags);
		put_cpu();

		page = alloc_page(gfp);

		rem = &per_cpu(rds_page_remainders, get_cpu());
		local_irq_save(flags);

		if (!page) {
			ret = -ENOMEM;
			break;
		}

		/* did someone race to fill the remainder before us? */
		if (rem->r_page) {
			__free_page(page);
			continue;
		}

		/* otherwise install our page and loop around to alloc */
		rem->r_page = page;
		rem->r_offset = 0;
	}

	local_irq_restore(flags);
	put_cpu();
out:
	rdsdebug("bytes %lu ret %d %p %u %u\n", bytes, ret,
		 ret ? NULL : sg_page(scat),
		 ret ? 0 : scat->offset,
		 ret ? 0 : scat->length);
	return ret;
}
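/*
 * Usage sketch, not part of the original file: a hypothetical caller
 * filling a single scatterlist entry for a small payload.  On success the
 * entry may point into the shared per-cpu remainder page, on which
 * rds_page_remainder_alloc() took a page reference for us; the caller
 * drops it with put_page() once the device is done with the region.
 */
static int example_fill_sg(struct scatterlist *sg, unsigned long len)
{
	int ret;

	sg_init_table(sg, 1);
	ret = rds_page_remainder_alloc(sg, len, GFP_KERNEL);
	if (ret)
		return ret;	/* -ENOMEM under memory pressure */

	/* sg now describes len bytes at sg->offset within sg_page(sg) */
	return 0;
}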