int
sdp_process_rx(struct sdp_sock *ssk)
{
	int wc_processed = 0;
	int credits_before;

	if (!rx_ring_trylock(&ssk->rx_ring)) {
		sdp_dbg(ssk->socket, "ring destroyed. not polling it\n");
		return 0;
	}

	credits_before = tx_credits(ssk);

	wc_processed = sdp_poll_rx_cq(ssk);
	sdp_prf(ssk->socket, NULL, "processed %d", wc_processed);

	if (wc_processed) {
		sdp_prf(ssk->socket, NULL, "credits: %d -> %d",
		    credits_before, tx_credits(ssk));
		queue_work(rx_comp_wq, &ssk->rx_comp_work);
	}
	sdp_arm_rx_cq(ssk);

	rx_ring_unlock(&ssk->rx_ring);

	return (wc_processed);
}
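
/*
 * Deferred RX completion handling, run from the rx_comp_wq workqueue.
 * Takes the socket write lock, re-validates the QP and RX CQ (either may
 * have been torn down since the work was queued), and then drives
 * sdp_do_posts().  If CQ polling has not been enabled yet, notify the CM
 * that communication has been established instead.
 */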
static void
sdp_rx_comp_work(struct work_struct *work)
{
	struct sdp_sock *ssk = container_of(work, struct sdp_sock,
	    rx_comp_work);

	sdp_prf(ssk->socket, NULL, "%s", __func__);

	SDP_WLOCK(ssk);
	if (unlikely(!ssk->qp)) {
		sdp_prf(ssk->socket, NULL, "qp was destroyed");
		goto out;
	}
	if (unlikely(!ssk->rx_ring.cq)) {
		sdp_prf(ssk->socket, NULL, "rx_ring.cq is NULL");
		goto out;
	}

	if (unlikely(!ssk->poll_cq)) {
		struct rdma_cm_id *id = ssk->id;

		if (id && id->qp)
			rdma_notify(id, IB_EVENT_COMM_EST);
		goto out;
	}

	sdp_do_posts(ssk);
out:
	SDP_WUNLOCK(ssk);
}
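
/*
 * Reap a single send completion: report non-flush errors as a connection
 * reset, then look up and free the mbuf that was posted under this wr_id.
 */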
static int
sdp_handle_send_comp(struct sdp_sock *ssk, struct ib_wc *wc)
{
	struct mbuf *mb = NULL;
	struct sdp_bsdh *h;

	if (unlikely(wc->status)) {
		if (wc->status != IB_WC_WR_FLUSH_ERR) {
			sdp_prf(ssk->socket, mb, "Send completion with error. "
			    "Status %d", wc->status);
			sdp_dbg_data(ssk->socket,
			    "Send completion with error. Status %d\n",
			    wc->status);
			sdp_notify(ssk, ECONNRESET);
		}
	}

	mb = sdp_send_completion(ssk, wc->wr_id);
	if (unlikely(!mb))
		return -1;

	h = mtod(mb, struct sdp_bsdh *);
	sdp_prf1(ssk->socket, mb, "tx completion. mseq:%d", ntohl(h->mseq));
	sdp_dbg(ssk->socket, "tx completion. %p %d mseq:%d",
	    mb, mb->m_pkthdr.len, ntohl(h->mseq));

	m_freem(mb);

	return 0;
}
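
/*
 * TX polling timer (Linux variant): reaps any send completions left behind
 * by the moderated polling in sdp_xmit_poll(), and re-arms itself for as
 * long as packets remain in flight.
 */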
static void
sdp_poll_tx_timeout(unsigned long data)
{
	struct sdp_sock *ssk = (struct sdp_sock *)data;
	struct sock *sk = sk_ssk(ssk);
	u32 inflight, wc_processed;

	sdp_prf1(sk_ssk(ssk), NULL, "TX timeout: inflight=%d, head=%d tail=%d",
	    (u32) tx_ring_posted(ssk),
	    ring_head(ssk->tx_ring), ring_tail(ssk->tx_ring));

	/* Only process if the socket is not in use. */
	bh_lock_sock(sk);
	if (sock_owned_by_user(sk)) {
		sdp_prf(sk_ssk(ssk), NULL, "TX comp: socket is busy");
		if (sdp_tx_handler_select(ssk) && sk->sk_state != TCP_CLOSE &&
		    likely(ssk->qp_active)) {
			sdp_prf1(sk, NULL, "schedule a timer");
			mod_timer(&ssk->tx_ring.timer,
			    jiffies + SDP_TX_POLL_TIMEOUT);
		}
		SDPSTATS_COUNTER_INC(tx_poll_busy);
		goto out;
	}

	if (unlikely(!ssk->qp || sk->sk_state == TCP_CLOSE)) {
		SDPSTATS_COUNTER_INC(tx_poll_no_op);
		goto out;
	}

	wc_processed = sdp_process_tx_cq(ssk);
	if (!wc_processed)
		SDPSTATS_COUNTER_INC(tx_poll_miss);
	else {
		sdp_post_sends(ssk, GFP_ATOMIC);
		SDPSTATS_COUNTER_INC(tx_poll_hit);
	}

	inflight = (u32) tx_ring_posted(ssk);
	sdp_prf1(sk_ssk(ssk), NULL, "finished tx processing. inflight = %d",
	    tx_ring_posted(ssk));

	/*
	 * If there are still packets in flight and the timer has not already
	 * been scheduled by the TX routine, then schedule it here to
	 * guarantee completion processing of these packets.
	 */
	if (inflight && likely(ssk->qp_active))
		mod_timer(&ssk->tx_ring.timer, jiffies + SDP_TX_POLL_TIMEOUT);

out:
	if (ssk->tx_ring.rdma_inflight && ssk->tx_ring.rdma_inflight->busy) {
		sdp_prf1(sk, NULL, "RDMA is inflight - arming irq");
		sdp_arm_tx_cq(sk);
	}

	bh_unlock_sock(sk);
}
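
/*
 * Flush pending work on the socket: drain the control-message queue,
 * repost receive buffers, reap send completions, and push out queued
 * sends and credit updates.  The socket write lock must be held.
 */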
void
sdp_do_posts(struct sdp_sock *ssk)
{
	struct socket *sk = ssk->socket;
	int xmit_poll_force;
	struct mbuf *mb;

	SDP_WLOCK_ASSERT(ssk);
	if (!ssk->qp_active) {
		sdp_dbg(sk, "QP is deactivated\n");
		return;
	}

	while ((mb = ssk->rx_ctl_q)) {
		ssk->rx_ctl_q = mb->m_nextpkt;
		mb->m_nextpkt = NULL;
		sdp_process_rx_ctl_mb(ssk, mb);
	}

	if (ssk->state == TCPS_TIME_WAIT)
		return;

	if (!ssk->rx_ring.cq || !ssk->tx_ring.cq)
		return;

	sdp_post_recvs(ssk);

	if (tx_ring_posted(ssk))
		sdp_xmit_poll(ssk, 1);

	sdp_post_sends(ssk, M_NOWAIT);

	xmit_poll_force = tx_credits(ssk) < SDP_MIN_TX_CREDITS;

	if (credit_update_needed(ssk) || xmit_poll_force) {
		/* A send may be pending because we ran out of TX credits;
		 * poll to free completed sends, then transmit it. */
		sdp_prf(sk, NULL, "Processing to free pending sends");
		sdp_xmit_poll(ssk, xmit_poll_force);
		sdp_prf(sk, NULL, "Sending credit update");
		sdp_post_sends(ssk, M_NOWAIT);
	}
}
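
/*
 * RX CQ completion event handler: verify the CQ still belongs to this
 * socket and kick receive processing.
 */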
static void
sdp_rx_irq(struct ib_cq *cq, void *cq_context)
{
	struct socket *sk = cq_context;
	struct sdp_sock *ssk = sdp_sk(sk);

	if (cq != ssk->rx_ring.cq) {
		sdp_dbg(sk, "cq = %p, ssk->cq = %p\n", cq, ssk->rx_ring.cq);
		return;
	}

	SDPSTATS_COUNTER_INC(rx_int_count);

	sdp_prf(sk, NULL, "rx irq");

	sdp_process_rx(ssk);
}
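
/*
 * Poll the TX CQ (FreeBSD variant): arm the polling callout if it is not
 * already pending, then poll either on demand (force) or once every
 * SDP_TX_POLL_MODER posts.
 */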
int
sdp_xmit_poll(struct sdp_sock *ssk, int force)
{
	int wc_processed = 0;

	SDP_WLOCK_ASSERT(ssk);
	sdp_prf(ssk->socket, NULL, "%s", __func__);

	/* If we don't have a pending timer, set one up to catch our recent
	   post in case the interface becomes idle */
	if (!callout_pending(&ssk->tx_ring.timer))
		callout_reset(&ssk->tx_ring.timer, SDP_TX_POLL_TIMEOUT,
		    sdp_poll_tx_timeout, ssk);

	/* Poll the CQ every SDP_TX_POLL_MODER packets */
	if (force || (++ssk->tx_ring.poll_cnt & (SDP_TX_POLL_MODER - 1)) == 0)
		wc_processed = sdp_process_tx_cq(ssk);

	return wc_processed;
}
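
/*
 * Drain the TX CQ and process each work completion.  When anything was
 * reaped, reclaim socket memory, wake blocked writers, and re-arm the CQ
 * if a writer is still waiting for space.
 */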
static int
sdp_process_tx_cq(struct sdp_sock *ssk)
{
	struct ib_wc ibwc[SDP_NUM_WC];
	int n, i;
	int wc_processed = 0;

	if (!ssk->tx_ring.cq) {
		sdp_dbg(sk_ssk(ssk), "tx irq on destroyed tx_cq\n");
		return 0;
	}

	do {
		n = ib_poll_cq(ssk->tx_ring.cq, SDP_NUM_WC, ibwc);
		for (i = 0; i < n; ++i) {
			sdp_process_tx_wc(ssk, ibwc + i);
			wc_processed++;
		}
	} while (n == SDP_NUM_WC);

	if (wc_processed) {
		struct sock *sk = sk_ssk(ssk);

		sdp_prf1(sk, NULL, "Waking sendmsg. inflight=%d",
		    (u32) tx_ring_posted(ssk));
		sk_mem_reclaim(sk);
		sk_stream_write_space(sk_ssk(ssk));
		if (sk->sk_write_pending &&
		    test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) &&
		    tx_ring_posted(ssk)) {
			/* A write is pending and there is still no room in
			 * the TX queue - arm the TX CQ. */
			sdp_prf(sk_ssk(ssk), NULL, "pending tx - rearming");
			sdp_arm_tx_cq(sk);
		}
	}

	return wc_processed;
}
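
/*
 * Poll the TX CQ (Linux variant of sdp_xmit_poll() above): same moderation
 * logic, but the idle-catch timer uses mod_timer() rather than a callout.
 */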
int
sdp_xmit_poll(struct sdp_sock *ssk, int force)
{
	int wc_processed = 0;

	sdp_prf(sk_ssk(ssk), NULL, "%s", __func__);

	/* If we don't have a pending timer, set one up to catch our recent
	   post in case the interface becomes idle */
	if (likely(ssk->qp_active && sk_ssk(ssk)->sk_state != TCP_CLOSE) &&
	    !timer_pending(&ssk->tx_ring.timer)) {
		mod_timer(&ssk->tx_ring.timer, jiffies + SDP_TX_POLL_TIMEOUT);
	}

	ssk->tx_compl_pending = 0;

	/* Poll the CQ every SDP_TX_POLL_MODER packets */
	if (force || (++ssk->tx_ring.poll_cnt & (SDP_TX_POLL_MODER - 1)) == 0)
		wc_processed = sdp_process_tx_cq(ssk);

	return wc_processed;
}
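
/*
 * Post one packet to the send queue: fill in the BSDH header (buffer
 * advertisement, sequence numbers, credits), map the skb for DMA (or send
 * it inline when it fits under inline_thresh), and hand it to the QP.
 * On failure the skb is freed and the connection is reset.
 */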
void
sdp_post_send(struct sdp_sock *ssk, struct sk_buff *skb)
{
	struct sdp_buf *tx_req;
	struct sdp_bsdh *h = (struct sdp_bsdh *)skb_transport_header(skb);
	unsigned long mseq = ring_head(ssk->tx_ring);
	int i, rc, frags;
	u64 addr;
	struct ib_device *dev;
	struct ib_send_wr *bad_wr;
	struct ib_sge ibsge[SDP_MAX_SEND_SGES];
	struct ib_sge *sge = ibsge;
	struct ib_send_wr tx_wr = { NULL };
	u32 send_flags = IB_SEND_SIGNALED;

	SDPSTATS_COUNTER_MID_INC(post_send, h->mid);
	SDPSTATS_HIST(send_size, skb->len);

	if (!ssk->qp_active)
		goto err;

	ssk->tx_packets++;

	if (h->mid != SDP_MID_SRCAVAIL &&
	    h->mid != SDP_MID_DATA &&
	    h->mid != SDP_MID_SRCAVAIL_CANCEL) {
		struct sock *sk = sk_ssk(ssk);

		sk->sk_wmem_queued += skb->truesize;
		sk_mem_charge(sk, skb->truesize);
	}

	if (unlikely(h->mid == SDP_MID_SRCAVAIL)) {
		struct tx_srcavail_state *tx_sa = TX_SRCAVAIL_STATE(skb);

		if (ssk->tx_sa != tx_sa) {
			sdp_dbg_data(sk_ssk(ssk), "SrcAvail cancelled "
			    "before being sent!\n");
			SDP_WARN_ON(1);
			sk_wmem_free_skb(sk_ssk(ssk), skb);
			return;
		}
		TX_SRCAVAIL_STATE(skb)->mseq = mseq;
	}

	if (unlikely(SDP_SKB_CB(skb)->flags & TCPHDR_URG))
		h->flags = SDP_OOB_PRES | SDP_OOB_PEND;
	else
		h->flags = 0;

	h->bufs = htons(rx_ring_posted(ssk));
	h->len = htonl(skb->len);
	h->mseq = htonl(mseq);
	h->mseq_ack = htonl(mseq_ack(ssk));

	sdp_prf(sk_ssk(ssk), skb, "TX: %s bufs: %d mseq:%ld ack:%d c: %d",
	    mid2str(h->mid), rx_ring_posted(ssk), mseq,
	    ntohl(h->mseq_ack), tx_credits(ssk));

	SDP_DUMP_PACKET(sk_ssk(ssk), "TX", skb, h);

	tx_req = &ssk->tx_ring.buffer[mseq & (SDP_TX_SIZE - 1)];
	tx_req->skb = skb;
	dev = ssk->ib_device;

	if (skb->len <= ssk->inline_thresh && !skb_shinfo(skb)->nr_frags) {
		SDPSTATS_COUNTER_INC(inline_sends);
		sge->addr = (u64) skb->data;
		sge->length = skb->len;
		sge->lkey = 0;
		frags = 0;
		tx_req->mapping[0] = 0; /* Nothing to be cleaned up by
					   sdp_cleanup_sdp_buf() */
		send_flags |= IB_SEND_INLINE;
	} else {
		addr = ib_dma_map_single(dev, skb->data,
		    skb->len - skb->data_len, DMA_TO_DEVICE);
		tx_req->mapping[0] = addr;

		/* TODO: proper error handling */
		BUG_ON(ib_dma_mapping_error(dev, addr));

		sge->addr = addr;
		sge->length = skb->len - skb->data_len;
		sge->lkey = ssk->sdp_dev->mr->lkey;
		frags = skb_shinfo(skb)->nr_frags;
		for (i = 0; i < frags; ++i) {
			++sge;
			addr = ib_dma_map_page(dev,
			    skb_shinfo(skb)->frags[i].page.p,
			    skb_shinfo(skb)->frags[i].page_offset,
			    skb_shinfo(skb)->frags[i].size,
			    DMA_TO_DEVICE);
			BUG_ON(ib_dma_mapping_error(dev, addr));
			tx_req->mapping[i + 1] = addr;
			sge->addr = addr;
			sge->length = skb_shinfo(skb)->frags[i].size;
			sge->lkey = ssk->sdp_dev->mr->lkey;
		}
	}

	tx_wr.next = NULL;
	tx_wr.wr_id = ring_head(ssk->tx_ring) | SDP_OP_SEND;
	tx_wr.sg_list = ibsge;
	tx_wr.num_sge = frags + 1;
	tx_wr.opcode = IB_WR_SEND;
	tx_wr.send_flags = send_flags;
	if (unlikely(SDP_SKB_CB(skb)->flags & TCPHDR_URG))
		tx_wr.send_flags |= IB_SEND_SOLICITED;

	rc = ib_post_send(ssk->qp, &tx_wr, &bad_wr);
	if (unlikely(rc)) {
		sdp_dbg(sk_ssk(ssk),
		    "ib_post_send failed with status %d.\n", rc);

		sdp_cleanup_sdp_buf(ssk, tx_req, skb->len - skb->data_len,
		    DMA_TO_DEVICE);

		sdp_set_error(sk_ssk(ssk), -ECONNRESET);

		goto err;
	}

	atomic_inc(&ssk->tx_ring.head);
	atomic_dec(&ssk->tx_ring.credits);
	atomic_set(&ssk->remote_credits, rx_ring_posted(ssk));

	return;

err:
	sk_wmem_free_skb(sk_ssk(ssk), skb);
}
static int
sdp_post_recv(struct sdp_sock *ssk)
{
	struct sdp_buf *rx_req;
	int i, rc;
	u64 addr;
	struct ib_device *dev;
	struct ib_recv_wr rx_wr = { NULL };
	struct ib_sge ibsge[SDP_MAX_RECV_SGES];
	struct ib_sge *sge = ibsge;
	struct ib_recv_wr *bad_wr;
	struct mbuf *mb, *m;
	struct sdp_bsdh *h;
	int id = ring_head(ssk->rx_ring);

	/* Now, allocate and repost recv */
	mb = m_getm2(NULL, ssk->recv_bytes, M_NOWAIT, MT_DATA, M_PKTHDR);
	if (mb == NULL) {
		/* Retry so we can't stall out with no memory. */
		if (!rx_ring_posted(ssk))
			queue_work(rx_comp_wq, &ssk->rx_comp_work);
		return -1;
	}
	/* Trace the mbuf only after it has been allocated. */
	sdp_prf(ssk->socket, mb, "Posting mb");
	for (m = mb; m != NULL; m = m->m_next) {
		m->m_len = (m->m_flags & M_EXT) ? m->m_ext.ext_size :
		    ((m->m_flags & M_PKTHDR) ? MHLEN : MLEN);
		mb->m_pkthdr.len += m->m_len;
	}
	h = mtod(mb, struct sdp_bsdh *);
	rx_req = ssk->rx_ring.buffer + (id & (SDP_RX_SIZE - 1));
	rx_req->mb = mb;
	dev = ssk->ib_device;
	for (i = 0; mb != NULL; i++, mb = mb->m_next, sge++) {
		/* Receive buffers are written by the device; map them
		 * DMA_FROM_DEVICE, matching the unmap direction used by
		 * the error path below. */
		addr = ib_dma_map_single(dev, mb->m_data, mb->m_len,
		    DMA_FROM_DEVICE);
		/* TODO: proper error handling */
		BUG_ON(ib_dma_mapping_error(dev, addr));
		BUG_ON(i >= SDP_MAX_RECV_SGES);
		rx_req->mapping[i] = addr;
		sge->addr = addr;
		sge->length = mb->m_len;
		sge->lkey = ssk->sdp_dev->mr->lkey;
	}

	rx_wr.next = NULL;
	rx_wr.wr_id = id | SDP_OP_RECV;
	rx_wr.sg_list = ibsge;
	rx_wr.num_sge = i;
	rc = ib_post_recv(ssk->qp, &rx_wr, &bad_wr);
	if (unlikely(rc)) {
		sdp_warn(ssk->socket, "ib_post_recv failed. status %d\n", rc);
		sdp_cleanup_sdp_buf(ssk, rx_req, DMA_FROM_DEVICE);
		/* The mapping loop advanced mb to NULL; free the chain
		 * through rx_req. */
		m_freem(rx_req->mb);
		sdp_notify(ssk, ECONNRESET);
		return -1;
	}

	atomic_inc(&ssk->rx_ring.head);
	SDPSTATS_COUNTER_INC(post_recv);

	return 0;
}
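
/*
 * First-stage RX processing, called from CQ polling context: apply the
 * credit update carried by every header, then either queue control
 * messages on the rx_ctl queue for sdp_do_posts() or queue payload
 * directly on the socket receive buffer.
 */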
static int
sdp_process_rx_mb(struct sdp_sock *ssk, struct mbuf *mb)
{
	struct socket *sk;
	struct sdp_bsdh *h;
	unsigned long mseq_ack;
	int credits_before;

	h = mtod(mb, struct sdp_bsdh *);
	sk = ssk->socket;
	/*
	 * If another thread is in so_pcbfree this may be partially torn
	 * down but no further synchronization is required as the destroying
	 * thread will wait for receive to shutdown before discarding the
	 * socket.
	 */
	if (sk == NULL) {
		m_freem(mb);
		return 0;
	}

	SDPSTATS_HIST_LINEAR(credits_before_update, tx_credits(ssk));
	mseq_ack = ntohl(h->mseq_ack);
	credits_before = tx_credits(ssk);
	atomic_set(&ssk->tx_ring.credits, mseq_ack - ring_head(ssk->tx_ring) +
	    1 + ntohs(h->bufs));
	if (mseq_ack >= ssk->nagle_last_unacked)
		ssk->nagle_last_unacked = 0;

	sdp_prf1(ssk->socket, mb, "RX %s +%d c:%d->%d mseq:%d ack:%d\n",
	    mid2str(h->mid), ntohs(h->bufs), credits_before,
	    tx_credits(ssk), ntohl(h->mseq), ntohl(h->mseq_ack));

	if (unlikely(h->mid == SDP_MID_DATA &&
	    mb->m_pkthdr.len == SDP_HEAD_SIZE)) {
		/* Credit update is valid even after RCV_SHUTDOWN */
		m_freem(mb);
		return 0;
	}

	if ((h->mid != SDP_MID_DATA && h->mid != SDP_MID_SRCAVAIL) ||
	    TCPS_HAVERCVDFIN(ssk->state)) {
		sdp_prf(sk, NULL, "Control mb - queueing to control queue");
#ifdef SDP_ZCOPY
		if (h->mid == SDP_MID_SRCAVAIL_CANCEL) {
			sdp_dbg_data(sk, "Got SrcAvailCancel. "
			    "seq: 0x%d seq_ack: 0x%d\n",
			    ntohl(h->mseq), ntohl(h->mseq_ack));
			ssk->srcavail_cancel_mseq = ntohl(h->mseq);
		}

		if (h->mid == SDP_MID_RDMARDCOMPL) {
			struct sdp_rrch *rrch = (struct sdp_rrch *)(h+1);

			sdp_dbg_data(sk, "RdmaRdCompl message arrived\n");
			sdp_handle_rdma_read_compl(ssk, ntohl(h->mseq_ack),
			    ntohl(rrch->len));
		}
#endif
		mb->m_nextpkt = NULL;
		if (ssk->rx_ctl_tail)
			ssk->rx_ctl_tail->m_nextpkt = mb;
		else
			ssk->rx_ctl_q = mb;
		ssk->rx_ctl_tail = mb;

		return 0;
	}

	sdp_prf1(sk, NULL, "queueing %s mb\n", mid2str(h->mid));
	mb = sdp_sock_queue_rcv_mb(sk, mb);

	return 0;
}

/* Socket lock should be taken before calling this. */
static int
sdp_process_rx_ctl_mb(struct sdp_sock *ssk, struct mbuf *mb)
{
	struct sdp_bsdh *h;
	struct socket *sk;

	SDP_WLOCK_ASSERT(ssk);

	sk = ssk->socket;
	h = mtod(mb, struct sdp_bsdh *);
	switch (h->mid) {
	case SDP_MID_DATA:
	case SDP_MID_SRCAVAIL:
		sdp_dbg(sk, "DATA after socket rcv was shutdown\n");

		/* got data in RCV_SHUTDOWN */
		if (ssk->state == TCPS_FIN_WAIT_1) {
			sdp_dbg(sk, "RX data when state = FIN_WAIT1\n");
			sdp_notify(ssk, ECONNRESET);
		}
		m_freem(mb);
		break;
#ifdef SDP_ZCOPY
	case SDP_MID_RDMARDCOMPL:
		m_freem(mb);
		break;
	case SDP_MID_SENDSM:
		sdp_handle_sendsm(ssk, ntohl(h->mseq_ack));
		m_freem(mb);
		break;
	case SDP_MID_SRCAVAIL_CANCEL:
		sdp_dbg_data(sk, "Handling SrcAvailCancel\n");
		sdp_prf(sk, NULL, "Handling SrcAvailCancel");
		if (ssk->rx_sa) {
			ssk->srcavail_cancel_mseq = ntohl(h->mseq);
			ssk->rx_sa->flags |= RX_SA_ABORTED;
			ssk->rx_sa = NULL; /* TODO: change it into SDP_MID_DATA
					      and get the dirty logic from
					      recvmsg */
		} else {
			sdp_dbg(sk, "Got SrcAvailCancel - "
			    "but no SrcAvail in process\n");
		}
		m_freem(mb);
		break;
	case SDP_MID_SINKAVAIL:
		sdp_dbg_data(sk, "Got SinkAvail - not supported: ignored\n");
		sdp_prf(sk, NULL, "Got SinkAvail - not supported: ignored");
		/* FALLTHROUGH */
#endif
	case SDP_MID_ABORT:
		sdp_dbg_data(sk, "Handling ABORT\n");
		sdp_prf(sk, NULL, "Handling ABORT");
		sdp_notify(ssk, ECONNRESET);
		m_freem(mb);
		break;
	case SDP_MID_DISCONN:
		sdp_dbg_data(sk, "Handling DISCONN\n");
		sdp_prf(sk, NULL, "Handling DISCONN");
		sdp_handle_disconn(ssk);
		break;
	case SDP_MID_CHRCVBUF:
		sdp_dbg_data(sk, "Handling RX CHRCVBUF\n");
		sdp_handle_resize_request(ssk, (struct sdp_chrecvbuf *)(h+1));
		m_freem(mb);
		break;
	case SDP_MID_CHRCVBUF_ACK:
		sdp_dbg_data(sk, "Handling RX CHRCVBUF_ACK\n");
		sdp_handle_resize_ack(ssk, (struct sdp_chrecvbuf *)(h+1));
		m_freem(mb);
		break;
	default:
		/* TODO: Handle other messages */
		sdp_warn(sk, "SDP: FIXME MID %d\n", h->mid);
		m_freem(mb);
	}

	return 0;
}