/* Receive data over TCP/IP. */
int usbip_recv(struct socket *sock, void *buf, int size)
{
	int result;
	struct kvec iov = {.iov_base = buf, .iov_len = size};
	struct msghdr msg = {.msg_flags = MSG_NOSIGNAL};
	int total = 0;

	iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &iov, 1, size);

	usbip_dbg_xmit("enter\n");

	if (!sock || !buf || !size) {
		pr_err("invalid arg, sock %p buff %p size %d\n", sock, buf,
		       size);
		return -EINVAL;
	}

	do {
		int sz = msg_data_left(&msg);

		sock->sk->sk_allocation = GFP_NOIO;

		result = sock_recvmsg(sock, &msg, MSG_WAITALL);
		if (result <= 0) {
			pr_debug("receive sock %p buf %p size %u ret %d total %d\n",
				 sock, buf + total, sz, result, total);
			goto err;
		}

		total += result;
	} while (msg_data_left(&msg));

	if (usbip_dbg_flag_xmit) {
		if (!in_interrupt())
			pr_debug("%-10s:", current->comm);
		else
			pr_debug("interrupt :");

		pr_debug("receiving....\n");
		usbip_dump_buffer(buf, size);
		pr_debug("received, osize %d ret %d size %zd total %d\n",
			 size, result, msg_data_left(&msg), total);
	}

	return total;

err:
	return result;
}
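/*
 * Hypothetical caller sketch (not part of the USB/IP sources): usbip_recv()
 * either fills the whole buffer or returns an error, so reading a fixed-size
 * header reduces to one call plus a short-read check. The struct and
 * function names below are illustrative assumptions, not kernel symbols.
 */
struct demo_pdu_header {
	__be32 command;
	__be32 seqnum;
};

static int demo_read_pdu_header(struct socket *sock,
				struct demo_pdu_header *hdr)
{
	int ret = usbip_recv(sock, hdr, sizeof(*hdr));

	if (ret < 0)
		return ret;		/* socket error from sock_recvmsg() */
	if (ret != sizeof(*hdr))
		return -ECONNRESET;	/* peer closed mid-header */
	return 0;
}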
int lnet_sock_write(struct socket *sock, void *buffer, int nob, int timeout)
{
	int rc;
	long jiffies_left = timeout * msecs_to_jiffies(MSEC_PER_SEC);
	unsigned long then;
	struct timeval tv;
	struct kvec iov = {
		.iov_base = buffer,
		.iov_len  = nob
	};
	struct msghdr msg = {NULL,};

	LASSERT(nob > 0);
	/*
	 * Caller may pass a zero timeout if she thinks the socket buffer is
	 * empty enough to take the whole message immediately
	 */
	iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, &iov, 1, nob);
	for (;;) {
		msg.msg_flags = !timeout ? MSG_DONTWAIT : 0;

		if (timeout) {
			/* Set send timeout to remaining time */
			jiffies_to_timeval(jiffies_left, &tv);
			rc = kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO,
					       (char *)&tv, sizeof(tv));
			if (rc) {
				CERROR("Can't set socket send timeout %ld.%06d: %d\n",
				       (long)tv.tv_sec, (int)tv.tv_usec, rc);
				return rc;
			}
		}

		then = jiffies;
		rc = kernel_sendmsg(sock, &msg, &iov, 1, nob);
		jiffies_left -= jiffies - then;

		if (rc < 0)
			return rc;

		if (!rc) {
			CERROR("Unexpected zero rc\n");
			return -ECONNABORTED;
		}

		if (!msg_data_left(&msg))
			break;

		if (jiffies_left <= 0)
			return -EAGAIN;
	}
	return 0;
}
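/*
 * Hypothetical usage sketch (not LNet code): send a small fixed-size
 * handshake blob with a 5 second overall deadline, mapping the -EAGAIN that
 * lnet_sock_write() returns when the deadline expires to -ETIMEDOUT. The
 * struct name, magic value, and function name are illustrative assumptions.
 */
struct demo_hello_msg {
	__be32 magic;
	__be32 version;
};

static int demo_send_hello(struct socket *sock)
{
	struct demo_hello_msg hello = {
		.magic	 = cpu_to_be32(0x4c4e4554),	/* arbitrary */
		.version = cpu_to_be32(1),
	};
	int rc = lnet_sock_write(sock, &hello, sizeof(hello), 5 /* seconds */);

	return rc == -EAGAIN ? -ETIMEDOUT : rc;
}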
/*
 * smb_send_kvec - send an array of kvecs to the server
 * @server: Server to send the data to
 * @smb_msg: Message to send
 * @sent: amount of data sent on socket is stored here
 *
 * Our basic "send data to server" function. Should be called with srv_mutex
 * held. The caller is responsible for handling the results.
 */
static int
smb_send_kvec(struct TCP_Server_Info *server, struct msghdr *smb_msg,
	      size_t *sent)
{
	int rc = 0;
	int retries = 0;
	struct socket *ssocket = server->ssocket;

	*sent = 0;

	smb_msg->msg_name = (struct sockaddr *) &server->dstaddr;
	smb_msg->msg_namelen = sizeof(struct sockaddr);
	smb_msg->msg_control = NULL;
	smb_msg->msg_controllen = 0;
	if (server->noblocksnd)
		smb_msg->msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL;
	else
		smb_msg->msg_flags = MSG_NOSIGNAL;

	while (msg_data_left(smb_msg)) {
		/*
		 * If blocking send, we try 3 times, since each can block
		 * for 5 seconds. For nonblocking we have to try more
		 * but wait increasing amounts of time allowing time for
		 * socket to clear. The overall time we wait in either
		 * case to send on the socket is about 15 seconds.
		 * Similarly we wait for 15 seconds for a response from
		 * the server in SendReceive[2] for the server to send
		 * a response back for most types of requests (except
		 * SMB Write past end of file which can be slow, and
		 * blocking lock operations). NFS waits slightly longer
		 * than CIFS, but this can make it take longer for
		 * nonresponsive servers to be detected and 15 seconds
		 * is more than enough time for modern networks to
		 * send a packet. In most cases if we fail to send
		 * after the retries we will kill the socket and
		 * reconnect which may clear the network problem.
		 */
		rc = sock_sendmsg(ssocket, smb_msg);
		if (rc == -EAGAIN) {
			retries++;
			if (retries >= 14 ||
			    (!server->noblocksnd && (retries > 2))) {
				cifs_dbg(VFS, "sends on sock %p stuck for 15 seconds\n",
					 ssocket);
				return -EAGAIN;
			}
			msleep(1 << retries);
			continue;
		}

		if (rc < 0)
			return rc;

		if (rc == 0) {
			/* should never happen, letting socket clear before
			   retrying is our only obvious option here */
			cifs_dbg(VFS, "tcp sent no data\n");
			msleep(500);
			continue;
		}

		/* send was at least partially successful */
		*sent += rc;
		retries = 0; /* in case we get ENOSPC on the next send */
	}
	return 0;
}
int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
{
	struct tls_context *tls_ctx = tls_get_ctx(sk);
	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
	int ret = 0;
	int required_size;
	long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
	bool eor = !(msg->msg_flags & MSG_MORE);
	size_t try_to_copy, copied = 0;
	unsigned char record_type = TLS_RECORD_TYPE_DATA;
	int record_room;
	bool full_record;
	int orig_size;

	if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL))
		return -ENOTSUPP;

	lock_sock(sk);

	if (tls_complete_pending_work(sk, tls_ctx, msg->msg_flags, &timeo))
		goto send_end;

	if (unlikely(msg->msg_controllen)) {
		ret = tls_proccess_cmsg(sk, msg, &record_type);
		if (ret)
			goto send_end;
	}

	while (msg_data_left(msg)) {
		if (sk->sk_err) {
			ret = sk->sk_err;
			goto send_end;
		}

		orig_size = ctx->sg_plaintext_size;
		full_record = false;
		try_to_copy = msg_data_left(msg);
		record_room = TLS_MAX_PAYLOAD_SIZE - ctx->sg_plaintext_size;
		if (try_to_copy >= record_room) {
			try_to_copy = record_room;
			full_record = true;
		}

		required_size = ctx->sg_plaintext_size + try_to_copy +
				tls_ctx->overhead_size;

		if (!sk_stream_memory_free(sk))
			goto wait_for_sndbuf;
alloc_encrypted:
		ret = alloc_encrypted_sg(sk, required_size);
		if (ret) {
			if (ret != -ENOSPC)
				goto wait_for_memory;

			/* Adjust try_to_copy according to the amount that was
			 * actually allocated. The difference is due
			 * to max sg elements limit
			 */
			try_to_copy -= required_size - ctx->sg_encrypted_size;
			full_record = true;
		}

		if (full_record || eor) {
			ret = zerocopy_from_iter(sk, &msg->msg_iter,
						 try_to_copy);
			if (ret)
				goto fallback_to_reg_send;

			copied += try_to_copy;
			ret = tls_push_record(sk, msg->msg_flags, record_type);
			if (!ret)
				continue;
			if (ret == -EAGAIN)
				goto send_end;

			copied -= try_to_copy;
fallback_to_reg_send:
			iov_iter_revert(&msg->msg_iter,
					ctx->sg_plaintext_size - orig_size);
			trim_sg(sk, ctx->sg_plaintext_data,
				&ctx->sg_plaintext_num_elem,
				&ctx->sg_plaintext_size,
				orig_size);
		}

		required_size = ctx->sg_plaintext_size + try_to_copy;
alloc_plaintext:
		ret = alloc_plaintext_sg(sk, required_size);
		if (ret) {
			if (ret != -ENOSPC)
				goto wait_for_memory;

			/* Adjust try_to_copy according to the amount that was
			 * actually allocated. The difference is due
			 * to max sg elements limit
			 */
			try_to_copy -= required_size - ctx->sg_plaintext_size;
			full_record = true;

			trim_sg(sk, ctx->sg_encrypted_data,
				&ctx->sg_encrypted_num_elem,
				&ctx->sg_encrypted_size,
				ctx->sg_plaintext_size +
				tls_ctx->overhead_size);
		}

		ret = memcopy_from_iter(sk, &msg->msg_iter, try_to_copy);
		if (ret)
			goto trim_sgl;

		copied += try_to_copy;
		if (full_record || eor) {
push_record:
			ret = tls_push_record(sk, msg->msg_flags, record_type);
			if (ret) {
				if (ret == -ENOMEM)
					goto wait_for_memory;

				goto send_end;
			}
		}

		continue;

wait_for_sndbuf:
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
		ret = sk_stream_wait_memory(sk, &timeo);
		if (ret) {
trim_sgl:
			trim_both_sgl(sk, orig_size);
			goto send_end;
		}

		if (tls_is_pending_closed_record(tls_ctx))
			goto push_record;

		if (ctx->sg_encrypted_size < required_size)
			goto alloc_encrypted;

		goto alloc_plaintext;
	}

send_end:
	ret = sk_stream_error(sk, msg->msg_flags, ret);

	release_sock(sk);
	return copied ? copied : ret;
}
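/*
 * Minimal sketch (not net/tls code) of the record sizing done at the top of
 * the main loop above: the bytes taken from the iterator this pass are
 * whatever still fits in the current TLS record. TLS_MAX_PAYLOAD_SIZE is the
 * real limit; the helper name and parameters are illustrative assumptions.
 */
static size_t demo_tls_copy_budget(size_t msg_left, size_t plaintext_size,
				   bool *full_record)
{
	size_t record_room = TLS_MAX_PAYLOAD_SIZE - plaintext_size;

	*full_record = msg_left >= record_room;
	return min(msg_left, record_room);	/* both operands are size_t */
}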
/* Expects to be always run from workqueue - which acts as
 * read-size critical section for our kind of RCU. */
static void handle_tx(struct vhost_net *net)
{
	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
	struct vhost_virtqueue *vq = &nvq->vq;
	unsigned out, in;
	int head;
	struct msghdr msg = {
		.msg_name = NULL,
		.msg_namelen = 0,
		.msg_control = NULL,
		.msg_controllen = 0,
		.msg_flags = MSG_DONTWAIT,
	};
	size_t len, total_len = 0;
	int err;
	size_t hdr_size;
	struct socket *sock;
	struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
	bool zcopy, zcopy_used;

	mutex_lock(&vq->mutex);
	sock = vq->private_data;
	if (!sock)
		goto out;

	vhost_disable_notify(&net->dev, vq);

	hdr_size = nvq->vhost_hlen;
	zcopy = nvq->ubufs;

	for (;;) {
		/* Release DMAs done buffers first */
		if (zcopy)
			vhost_zerocopy_signal_used(net, vq);

		/* If more outstanding DMAs, queue the work.
		 * Handle upend_idx wrap around
		 */
		if (unlikely((nvq->upend_idx + vq->num - VHOST_MAX_PEND)
			      % UIO_MAXIOV == nvq->done_idx))
			break;

		head = vhost_get_vq_desc(vq, vq->iov,
					 ARRAY_SIZE(vq->iov),
					 &out, &in,
					 NULL, NULL);
		/* On error, stop handling until the next kick. */
		if (unlikely(head < 0))
			break;
		/* Nothing new?  Wait for eventfd to tell us they refilled. */
		if (head == vq->num) {
			if (unlikely(vhost_enable_notify(&net->dev, vq))) {
				vhost_disable_notify(&net->dev, vq);
				continue;
			}
			break;
		}
		if (in) {
			vq_err(vq, "Unexpected descriptor format for TX: "
			       "out %d, in %d\n", out, in);
			break;
		}
		/* Skip header. TODO: support TSO. */
		len = iov_length(vq->iov, out);
		iov_iter_init(&msg.msg_iter, WRITE, vq->iov, out, len);
		iov_iter_advance(&msg.msg_iter, hdr_size);
		/* Sanity check */
		if (!msg_data_left(&msg)) {
			vq_err(vq, "Unexpected header len for TX: "
			       "%zd expected %zd\n",
			       len, hdr_size);
			break;
		}
		len = msg_data_left(&msg);

		zcopy_used = zcopy && len >= VHOST_GOODCOPY_LEN
				   && (nvq->upend_idx + 1) % UIO_MAXIOV !=
				      nvq->done_idx
				   && vhost_net_tx_select_zcopy(net);

		/* use msg_control to pass vhost zerocopy ubuf info to skb */
		if (zcopy_used) {
			struct ubuf_info *ubuf;
			ubuf = nvq->ubuf_info + nvq->upend_idx;

			vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head);
			vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS;
			ubuf->callback = vhost_zerocopy_callback;
			ubuf->ctx = nvq->ubufs;
			ubuf->desc = nvq->upend_idx;
			msg.msg_control = ubuf;
			msg.msg_controllen = sizeof(ubuf);
			ubufs = nvq->ubufs;
			atomic_inc(&ubufs->refcount);
			nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
		} else {
			msg.msg_control = NULL;
			ubufs = NULL;
		}

		/* TODO: Check specific error and bomb out unless ENOBUFS? */
		err = sock->ops->sendmsg(sock, &msg, len);
		if (unlikely(err < 0)) {
			if (zcopy_used) {
				vhost_net_ubuf_put(ubufs);
				nvq->upend_idx = ((unsigned)nvq->upend_idx - 1)
					% UIO_MAXIOV;
			}
			vhost_discard_vq_desc(vq, 1);
			break;
		}
		if (err != len)
			pr_debug("Truncated TX packet: "
				 " len %d != %zd\n", err, len);
		if (!zcopy_used)
			vhost_add_used_and_signal(&net->dev, vq, head, 0);
		else
			vhost_zerocopy_signal_used(net, vq);
		total_len += len;
		vhost_net_tx_packet(net);
		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
			vhost_poll_queue(&vq->poll);
			break;
		}
	}
out:
	mutex_unlock(&vq->mutex);
}

static int peek_head_len(struct sock *sk)
{
	struct sk_buff *head;
	int len = 0;
	unsigned long flags;

	spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
	head = skb_peek(&sk->sk_receive_queue);
	if (likely(head)) {
		len = head->len;
		if (skb_vlan_tag_present(head))
			len += VLAN_HLEN;
	}

	spin_unlock_irqrestore(&sk->sk_receive_queue.lock, flags);
	return len;
}

/* This is a multi-buffer version of vhost_get_desc, that works if
 *	vq has read descriptors only.
 * @vq		- the relevant virtqueue
 * @datalen	- data length we'll be reading
 * @iovcount	- returned count of io vectors we fill
 * @log		- vhost log
 * @log_num	- log offset
 * @quota	- headcount quota, 1 for big buffer
 *	returns number of buffer heads allocated, negative on error
 */
static int get_rx_bufs(struct vhost_virtqueue *vq,
		       struct vring_used_elem *heads,
		       int datalen,
		       unsigned *iovcount,
		       struct vhost_log *log,
		       unsigned *log_num,
		       unsigned int quota)
{
	unsigned int out, in;
	int seg = 0;
	int headcount = 0;
	unsigned d;
	int r, nlogs = 0;
	/* len is always initialized before use since we are always called with
	 * datalen > 0.
	 */
	u32 uninitialized_var(len);

	while (datalen > 0 && headcount < quota) {
		if (unlikely(seg >= UIO_MAXIOV)) {
			r = -ENOBUFS;
			goto err;
		}
		r = vhost_get_vq_desc(vq, vq->iov + seg,
				      ARRAY_SIZE(vq->iov) - seg, &out,
				      &in, log, log_num);
		if (unlikely(r < 0))
			goto err;

		d = r;
		if (d == vq->num) {
			r = 0;
			goto err;
		}
		if (unlikely(out || in <= 0)) {
			vq_err(vq, "unexpected descriptor format for RX: "
				"out %d, in %d\n", out, in);
			r = -EINVAL;
			goto err;
		}
		if (unlikely(log)) {
			nlogs += *log_num;
			log += *log_num;
		}
		heads[headcount].id = cpu_to_vhost32(vq, d);
		len = iov_length(vq->iov + seg, in);
		heads[headcount].len = cpu_to_vhost32(vq, len);
		datalen -= len;
		++headcount;
		seg += in;
	}
	heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen);
	*iovcount = seg;
	if (unlikely(log))
		*log_num = nlogs;

	/* Detect overrun */
	if (unlikely(datalen > 0)) {
		r = UIO_MAXIOV + 1;
		goto err;
	}
	return headcount;
err:
	vhost_discard_vq_desc(vq, headcount);
	return r;
}
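/*
 * Minimal sketch (not vhost code) of the iterator bookkeeping handle_tx()
 * relies on: after iov_iter_init() plus iov_iter_advance() over the virtio
 * header, msg_data_left() reports exactly the payload bytes that sendmsg()
 * will consume. The function name and parameters are illustrative.
 */
static size_t demo_tx_payload_len(struct msghdr *msg, struct iovec *iov,
				  unsigned int nr_segs, size_t hdr_size)
{
	size_t total = iov_length(iov, nr_segs);

	iov_iter_init(&msg->msg_iter, WRITE, iov, nr_segs, total);
	iov_iter_advance(&msg->msg_iter, hdr_size);

	/* 0 here means the descriptor held nothing beyond the header */
	return msg_data_left(msg);
}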
/* sndbuf producer: main API called by socket layer.
 * called under sock lock.
 */
int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
{
	size_t copylen, send_done = 0, send_remaining = len;
	size_t chunk_len, chunk_off, chunk_len_sum;
	struct smc_connection *conn = &smc->conn;
	union smc_host_cursor prep;
	struct sock *sk = &smc->sk;
	char *sndbuf_base;
	int tx_cnt_prep;
	int writespace;
	int rc, chunk;

	/* This should be in poll */
	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);

	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) {
		rc = -EPIPE;
		goto out_err;
	}

	while (msg_data_left(msg)) {
		if (sk->sk_state == SMC_INIT)
			return -ENOTCONN;
		if (smc->sk.sk_shutdown & SEND_SHUTDOWN ||
		    (smc->sk.sk_err == ECONNABORTED) ||
		    conn->local_tx_ctrl.conn_state_flags.peer_conn_abort)
			return -EPIPE;
		if (smc_cdc_rxed_any_close(conn))
			return send_done ?: -ECONNRESET;

		if (!atomic_read(&conn->sndbuf_space)) {
			rc = smc_tx_wait_memory(smc, msg->msg_flags);
			if (rc) {
				if (send_done)
					return send_done;
				goto out_err;
			}
			continue;
		}

		/* initialize variables for 1st iteration of subsequent loop */
		/* could be just 1 byte, even after smc_tx_wait_memory above */
		writespace = atomic_read(&conn->sndbuf_space);
		/* not more than what user space asked for */
		copylen = min_t(size_t, send_remaining, writespace);
		/* determine start of sndbuf */
		sndbuf_base = conn->sndbuf_desc->cpu_addr;
		smc_curs_write(&prep,
			       smc_curs_read(&conn->tx_curs_prep, conn),
			       conn);
		tx_cnt_prep = prep.count;
		/* determine chunks where to write into sndbuf */
		/* either unwrapped case, or 1st chunk of wrapped case */
		chunk_len = min_t(size_t,
				  copylen, conn->sndbuf_size - tx_cnt_prep);
		chunk_len_sum = chunk_len;
		chunk_off = tx_cnt_prep;
		smc_sndbuf_sync_sg_for_cpu(conn);
		for (chunk = 0; chunk < 2; chunk++) {
			rc = memcpy_from_msg(sndbuf_base + chunk_off,
					     msg, chunk_len);
			if (rc) {
				smc_sndbuf_sync_sg_for_device(conn);
				if (send_done)
					return send_done;
				goto out_err;
			}
			send_done += chunk_len;
			send_remaining -= chunk_len;

			if (chunk_len_sum == copylen)
				/* either on 1st or 2nd iteration */
				break;
			/* prepare next (== 2nd) iteration */
			chunk_len = copylen - chunk_len; /* remainder */
			chunk_len_sum += chunk_len;
			chunk_off = 0; /* modulo offset in send ring buffer */
		}
		smc_sndbuf_sync_sg_for_device(conn);
		/* update cursors */
		smc_curs_add(conn->sndbuf_size, &prep, copylen);
		smc_curs_write(&conn->tx_curs_prep,
			       smc_curs_read(&prep, conn),
			       conn);
		/* increased in send tasklet smc_cdc_tx_handler() */
		smp_mb__before_atomic();
		atomic_sub(copylen, &conn->sndbuf_space);
		/* guarantee 0 <= sndbuf_space <= sndbuf_size */
		smp_mb__after_atomic();
		/* since we just produced more new data into sndbuf,
		 * trigger sndbuf consumer: RDMA write into peer RMBE and CDC
		 */
		smc_tx_sndbuf_nonempty(conn);
	} /* while (msg_data_left(msg)) */

	return send_done;

out_err:
	rc = sk_stream_error(sk, msg->msg_flags, rc);
	/* make sure we wake any epoll edge trigger waiter */
	if (unlikely(rc == -EAGAIN))
		sk->sk_write_space(sk);
	return rc;
}
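/*
 * Minimal sketch (not SMC code) of the two-chunk copy in the loop above:
 * writing "len" bytes into a ring buffer of "ring_size" bytes at producer
 * offset "prod" takes at most two memcpy_from_msg() calls, the second one
 * only when the copy wraps past the end of the buffer. The function name
 * and parameters are illustrative assumptions.
 */
static int demo_ring_copy_from_msg(char *ring, unsigned int ring_size,
				   unsigned int prod, struct msghdr *msg,
				   size_t len)
{
	size_t first = min_t(size_t, len, ring_size - prod);
	int rc;

	rc = memcpy_from_msg(ring + prod, msg, first);
	if (rc)
		return rc;
	if (len > first)	/* wrapped: remainder starts at offset 0 */
		rc = memcpy_from_msg(ring, msg, len - first);
	return rc;
}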