size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
			 struct iov_iter *i)
{
	if (i->type & (ITER_BVEC|ITER_KVEC)) {
		void *kaddr = kmap_atomic(page);
		size_t wanted = copy_to_iter(kaddr + offset, bytes, i);
		kunmap_atomic(kaddr);
		return wanted;
	} else
		return copy_page_to_iter_iovec(page, offset, bytes, i);
}
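/*
 * A minimal sketch of the copy_to_iter() contract the functions in this
 * collection all rely on: the return value is the number of bytes actually
 * copied, which is short (possibly zero) when the destination iterator
 * faults or runs out of room. copy_buf_to_iter() is a hypothetical helper
 * for illustration, not a kernel symbol.
 */
#include <linux/uio.h>

static ssize_t copy_buf_to_iter(const void *buf, size_t bytes,
				struct iov_iter *to)
{
	size_t copied = copy_to_iter(buf, bytes, to);

	if (!copied && bytes)
		return -EFAULT;	/* nothing copied: the pages faulted */
	return copied;		/* possibly short; caller advances by this */
}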
static ssize_t hypfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	char *data = file->private_data;
	size_t available = strlen(data);
	loff_t pos = iocb->ki_pos;
	size_t count;

	if (pos < 0)
		return -EINVAL;
	if (pos >= available || !iov_iter_count(to))
		return 0;
	count = copy_to_iter(data + pos, available - pos, to);
	if (!count)
		return -EFAULT;
	iocb->ki_pos = pos + count;
	file_accessed(file);
	return count;
}
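/*
 * hypfs passes the full remaining length (available - pos) without clamping
 * it to the iterator: copy_to_iter() never writes past iov_iter_count(), so
 * the count it returns is the true advance for ki_pos. A hedged reduction of
 * the same pattern; blob_read_iter() is illustrative, not a kernel symbol.
 */
#include <linux/fs.h>
#include <linux/uio.h>

static ssize_t blob_read_iter(struct kiocb *iocb, struct iov_iter *to,
			      const char *blob, size_t blob_len)
{
	size_t copied;

	if (iocb->ki_pos < 0)
		return -EINVAL;
	if ((size_t)iocb->ki_pos >= blob_len || !iov_iter_count(to))
		return 0;

	/* Request everything left; copy_to_iter() clamps internally. */
	copied = copy_to_iter(blob + iocb->ki_pos,
			      blob_len - iocb->ki_pos, to);
	if (!copied)
		return -EFAULT;

	iocb->ki_pos += copied;
	return copied;
}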
static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
		      loff_t start, loff_t end, get_block_t get_block,
		      struct buffer_head *bh)
{
	loff_t pos = start, max = start, bh_max = start;
	bool hole = false, need_wmb = false;
	struct block_device *bdev = NULL;
	int rw = iov_iter_rw(iter), rc;
	long map_len = 0;
	struct blk_dax_ctl dax = {
		.addr = (void __pmem *) ERR_PTR(-EIO),
	};

	if (rw == READ)
		end = min(end, i_size_read(inode));

	while (pos < end) {
		size_t len;
		if (pos == max) {
			unsigned blkbits = inode->i_blkbits;
			long page = pos >> PAGE_SHIFT;
			sector_t block = page << (PAGE_SHIFT - blkbits);
			unsigned first = pos - (block << blkbits);
			long size;

			if (pos == bh_max) {
				bh->b_size = PAGE_ALIGN(end - pos);
				bh->b_state = 0;
				rc = get_block(inode, block, bh, rw == WRITE);
				if (rc)
					break;
				if (!buffer_size_valid(bh))
					bh->b_size = 1 << blkbits;
				bh_max = pos - first + bh->b_size;
				bdev = bh->b_bdev;
			} else {
				unsigned done = bh->b_size -
						(bh_max - (pos - first));
				bh->b_blocknr += done >> blkbits;
				bh->b_size -= done;
			}

			hole = rw == READ && !buffer_written(bh);
			if (hole) {
				size = bh->b_size - first;
			} else {
				dax_unmap_atomic(bdev, &dax);
				dax.sector = to_sector(bh, inode);
				dax.size = bh->b_size;
				map_len = dax_map_atomic(bdev, &dax);
				if (map_len < 0) {
					rc = map_len;
					break;
				}
				if (buffer_unwritten(bh) || buffer_new(bh)) {
					dax_new_buf(dax.addr, map_len, first,
							pos, end);
					need_wmb = true;
				}
				dax.addr += first;
				size = map_len - first;
			}
			max = min(pos + size, end);
		}

		if (iov_iter_rw(iter) == WRITE) {
			len = copy_from_iter_pmem(dax.addr, max - pos, iter);
			need_wmb = true;
		} else if (!hole)
			len = copy_to_iter((void __force *) dax.addr,
					max - pos, iter);
		else
			len = iov_iter_zero(max - pos, iter);

		if (!len) {
			rc = -EFAULT;
			break;
		}

		pos += len;
		if (!IS_ERR(dax.addr))
			dax.addr += len;
	}

	if (need_wmb)
		wmb_pmem();
	dax_unmap_atomic(bdev, &dax);

	return (pos == start) ? rc : pos - start;
}
static size_t dcssblk_dax_copy_to_iter(struct dax_device *dax_dev,
		pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i)
{
	return copy_to_iter(addr, bytes, i);
}
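/*
 * A ->copy_to_iter() method like the one above is not called directly; the
 * driver installs it in its dax_operations table and the core reaches it via
 * dax_copy_to_iter(). A sketch of the wiring, assuming the driver's other
 * dax methods (direct_access, copy_from_iter) exist as in dcssblk.
 */
static const struct dax_operations dcssblk_dax_ops = {
	.direct_access	= dcssblk_dax_direct_access,
	.copy_from_iter	= dcssblk_dax_copy_from_iter,
	.copy_to_iter	= dcssblk_dax_copy_to_iter,
};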
static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
		      loff_t start, loff_t end, get_block_t get_block,
		      struct buffer_head *bh)
{
	loff_t pos = start, max = start, bh_max = start;
	bool hole = false, need_wmb = false;
	struct block_device *bdev = NULL;
	int rw = iov_iter_rw(iter), rc;
	long map_len = 0;
	struct blk_dax_ctl dax = {
		.addr = (void __pmem *) ERR_PTR(-EIO),
	};
	unsigned blkbits = inode->i_blkbits;
	sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1)
								>> blkbits;

	if (rw == READ)
		end = min(end, i_size_read(inode));

	while (pos < end) {
		size_t len;
		if (pos == max) {
			long page = pos >> PAGE_SHIFT;
			sector_t block = page << (PAGE_SHIFT - blkbits);
			unsigned first = pos - (block << blkbits);
			long size;

			if (pos == bh_max) {
				bh->b_size = PAGE_ALIGN(end - pos);
				bh->b_state = 0;
				rc = get_block(inode, block, bh, rw == WRITE);
				if (rc)
					break;
				if (!buffer_size_valid(bh))
					bh->b_size = 1 << blkbits;
				bh_max = pos - first + bh->b_size;
				bdev = bh->b_bdev;
				/*
				 * We allow uninitialized buffers for writes
				 * beyond EOF as those cannot race with faults
				 */
				WARN_ON_ONCE(
					(buffer_new(bh) && block < file_blks) ||
					(rw == WRITE && buffer_unwritten(bh)));
			} else {
				unsigned done = bh->b_size -
						(bh_max - (pos - first));
				bh->b_blocknr += done >> blkbits;
				bh->b_size -= done;
			}

			hole = rw == READ && !buffer_written(bh);
			if (hole) {
				size = bh->b_size - first;
			} else {
				dax_unmap_atomic(bdev, &dax);
				dax.sector = to_sector(bh, inode);
				dax.size = bh->b_size;
				map_len = dax_map_atomic(bdev, &dax);
				if (map_len < 0) {
					rc = map_len;
					break;
				}
				dax.addr += first;
				size = map_len - first;
			}

			/*
			 * pos + size is one past the last offset for IO,
			 * so pos + size can overflow loff_t at extreme
			 * offsets. Cast to u64 to catch this and get the
			 * true minimum.
			 */
			max = min_t(u64, pos + size, end);
		}

		if (iov_iter_rw(iter) == WRITE) {
			len = copy_from_iter_pmem(dax.addr, max - pos, iter);
			need_wmb = true;
		} else if (!hole)
			len = copy_to_iter((void __force *) dax.addr,
					max - pos, iter);
		else
			len = iov_iter_zero(max - pos, iter);

		if (!len) {
			rc = -EFAULT;
			break;
		}

		pos += len;
		if (!IS_ERR(dax.addr))
			dax.addr += len;
	}

	if (need_wmb)
		wmb_pmem();
	dax_unmap_atomic(bdev, &dax);

	return (pos == start) ? rc : pos - start;
}
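/*
 * The loop-bound change between the two dax_io() versions is the
 * min_t(u64, ...) cast: pos + size is one past the last byte, so near the
 * top of the loff_t range the signed sum wraps negative and a plain min()
 * would pick the wrapped value. A standalone, hypothetical userspace
 * demonstration of the wrap (not kernel code):
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	int64_t pos = INT64_MAX - 10;	/* extreme but representable offset */
	int64_t size = 100;
	int64_t end = INT64_MAX;

	/* Do the sum in u64, as min_t(u64, ...) effectively does. */
	uint64_t sum = (uint64_t)pos + (uint64_t)size;

	/* Converting the out-of-range sum back to int64_t wraps negative
	 * (implementation-defined, but two's complement on Linux). */
	printf("as loff_t:  %lld\n", (long long)(int64_t)sum);
	printf("min_t(u64): %llu\n",
	       sum < (uint64_t)end ? (unsigned long long)sum
				   : (unsigned long long)end);
	return 0;
}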
#ifdef HAVE_KIOCB_MSG_PARAM
static int
kni_sock_rcvmsg(struct kiocb *iocb, struct socket *sock,
		struct msghdr *m, size_t len, int flags)
#else
static int
kni_sock_rcvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags)
#endif /* HAVE_KIOCB_MSG_PARAM */
{
	int vnet_hdr_len = 0;
	int pkt_len = 0;
	struct kni_vhost_queue *q =
		container_of(sock->sk, struct kni_vhost_queue, sk);
	static struct virtio_net_hdr
		__attribute__ ((unused)) vnet_hdr = {
		.flags = 0,
		.gso_type = VIRTIO_NET_HDR_GSO_NONE
	};

	if (unlikely(q == NULL || q->kni == NULL))
		return 0;

#ifdef RTE_KNI_VHOST_VNET_HDR_EN
	if (likely(q->flags & IFF_VNET_HDR)) {
		vnet_hdr_len = q->vnet_hdr_sz;
		/* len is size_t, so test before subtracting: the original
		 * (len -= vnet_hdr_len) < 0 could never be true. */
		if (len < (size_t)vnet_hdr_len)
			return -EINVAL;
		len -= vnet_hdr_len;
	}
#endif

	pkt_len = kni_vhost_net_rx(q->kni, m, vnet_hdr_len, len);
	if (unlikely(pkt_len == 0))
		return 0;

#ifdef RTE_KNI_VHOST_VNET_HDR_EN
	/* no need to copy hdr when no pkt received */
#ifdef HAVE_IOV_ITER_MSGHDR
	/* copy_to_iter() returns the byte count copied, so success is a
	 * full-length copy, not a zero return. */
	if (unlikely(copy_to_iter((void *)&vnet_hdr, vnet_hdr_len,
				  &m->msg_iter) != (size_t)vnet_hdr_len))
#else
	if (unlikely(memcpy_toiovecend(m->msg_iov, (void *)&vnet_hdr,
				       0, vnet_hdr_len)))
#endif /* HAVE_IOV_ITER_MSGHDR */
		return -EFAULT;
#endif /* RTE_KNI_VHOST_VNET_HDR_EN */

	KNI_DBG_RX("kni_rcvmsg expect_len %ld, flags 0x%08x, pkt_len %d\n",
		   (unsigned long)len, q->flags, pkt_len);

	return (pkt_len + vnet_hdr_len);
}

/* dummy tap like ioctl */
static int
kni_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	struct ifreq __user *ifr = argp;
	unsigned int __user *up = argp;
	struct kni_vhost_queue *q =
		container_of(sock->sk, struct kni_vhost_queue, sk);
	struct kni_dev *kni;
	unsigned int u;
	int __user *sp = argp;
	int s;
	int ret;

	KNI_DBG("tap ioctl cmd 0x%08x\n", cmd);

	switch (cmd) {
	case TUNSETIFF:
		KNI_DBG("TUNSETIFF\n");
		/* ignore the name, just look at flags */
		if (get_user(u, &ifr->ifr_flags))
			return -EFAULT;

		ret = 0;
		if ((u & ~IFF_VNET_HDR) != (IFF_NO_PI | IFF_TAP))
			ret = -EINVAL;
		else
			q->flags = u;

		return ret;

	case TUNGETIFF:
		KNI_DBG("TUNGETIFF\n");
		rcu_read_lock_bh();
		kni = rcu_dereference_bh(q->kni);
		if (kni)
			dev_hold(kni->net_dev);
		rcu_read_unlock_bh();

		if (!kni)
			return -ENOLINK;

		ret = 0;
		if (copy_to_user(&ifr->ifr_name, kni->net_dev->name,
				 IFNAMSIZ) ||
		    put_user(q->flags, &ifr->ifr_flags))
			ret = -EFAULT;
		dev_put(kni->net_dev);
		return ret;

	case TUNGETFEATURES:
		KNI_DBG("TUNGETFEATURES\n");
		u = IFF_TAP | IFF_NO_PI;
#ifdef RTE_KNI_VHOST_VNET_HDR_EN
		u |= IFF_VNET_HDR;
#endif
		if (put_user(u, up))
			return -EFAULT;
		return 0;

	case TUNSETSNDBUF:
		KNI_DBG("TUNSETSNDBUF\n");
		if (get_user(u, up))
			return -EFAULT;

		q->sk.sk_sndbuf = u;
		return 0;

	case TUNGETVNETHDRSZ:
		s = q->vnet_hdr_sz;
		if (put_user(s, sp))
			return -EFAULT;
		KNI_DBG("TUNGETVNETHDRSZ %d\n", s);
		return 0;

	case TUNSETVNETHDRSZ:
		if (get_user(s, sp))
			return -EFAULT;
		if (s < (int)sizeof(struct virtio_net_hdr))
			return -EINVAL;

		KNI_DBG("TUNSETVNETHDRSZ %d\n", s);
		q->vnet_hdr_sz = s;
		return 0;

	case TUNSETOFFLOAD:
		KNI_DBG("TUNSETOFFLOAD %lx\n", arg);
#ifdef RTE_KNI_VHOST_VNET_HDR_EN
		/* not support any offload yet */
		if (!(q->flags & IFF_VNET_HDR))
			return -EINVAL;

		return 0;
#else
		return -EINVAL;
#endif

	default:
		KNI_DBG("NOT SUPPORT\n");
		return -EINVAL;
	}
}
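/*
 * The ioctl cases above deliberately mirror the tap interface so existing
 * vhost userspace can drive a KNI queue unmodified. A hedged userspace-side
 * sketch of the vnet header size negotiation; set_vnet_hdr_sz() is
 * illustrative and fd is assumed to be an already-open queue descriptor.
 */
#include <sys/ioctl.h>
#include <linux/if_tun.h>

static int set_vnet_hdr_sz(int fd, int sz)
{
	/* The kernel side rejects sz < sizeof(struct virtio_net_hdr). */
	return ioctl(fd, TUNSETVNETHDRSZ, &sz);
}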
static inline int
kni_vhost_net_rx(struct kni_dev *kni, struct msghdr *m,
		 unsigned offset, unsigned len)
{
	uint32_t pkt_len;
	struct rte_kni_mbuf *kva;
	struct rte_kni_mbuf *va;
	void *data_kva;
	struct sk_buff *skb;
	struct kni_vhost_queue *q = kni->vhost_queue;

	if (unlikely(q == NULL))
		return 0;

	/* ensure at least one entry in free_q */
	if (unlikely(kni_fifo_free_count(kni->free_q) == 0))
		return 0;

	skb = skb_dequeue(&q->sk.sk_receive_queue);
	if (unlikely(skb == NULL))
		return 0;

	kva = (struct rte_kni_mbuf *)skb->data;

	/* free skb to cache */
	skb->data = NULL;
	if (unlikely(1 != kni_fifo_put(q->fifo, (void **)&skb, 1)))
		/* Failing should not happen */
		KNI_ERR("Fail to enqueue entries into rx cache fifo\n");

	pkt_len = kva->data_len;
	if (unlikely(pkt_len > len))
		goto drop;

	KNI_DBG_RX("rx offset=%d, len=%d, pkt_len=%d, iovlen=%d\n",
#ifdef HAVE_IOV_ITER_MSGHDR
		   offset, len, pkt_len, (int)m->msg_iter.iov->iov_len);
#else
		   offset, len, pkt_len, (int)m->msg_iov->iov_len);
#endif

	data_kva = kva->buf_addr + kva->data_off
		   - kni->mbuf_va + kni->mbuf_kva;
#ifdef HAVE_IOV_ITER_MSGHDR
	/* copy_to_iter() returns the byte count copied; anything short of
	 * pkt_len means the copy failed. */
	if (unlikely(copy_to_iter(data_kva, pkt_len, &m->msg_iter) !=
		     pkt_len))
#else
	if (unlikely(memcpy_toiovecend(m->msg_iov, data_kva,
				       offset, pkt_len)))
#endif
		goto drop;

	/* Update statistics */
	kni->stats.rx_bytes += pkt_len;
	kni->stats.rx_packets++;

	/* enqueue mbufs into free_q */
	va = (void *)kva - kni->mbuf_kva + kni->mbuf_va;
	if (unlikely(1 != kni_fifo_put(kni->free_q, (void **)&va, 1)))
		/* Failing should not happen */
		KNI_ERR("Fail to enqueue entries into free_q\n");

	KNI_DBG_RX("receive done %d\n", pkt_len);

	return pkt_len;

drop:
	/* Update drop statistics */
	kni->stats.rx_dropped++;

	return 0;
}
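/*
 * The kva/va arithmetic appears twice in kni_vhost_net_rx(): once to find
 * the payload and once to hand the mbuf back to the application. A hedged
 * helper capturing the translation; mbuf_va_to_kva() is illustrative, not
 * part of the KNI code.
 */
static inline void *mbuf_va_to_kva(struct kni_dev *kni, void *va)
{
	/* Pointers stored by the DPDK process are valid in its mapping
	 * (va); rebase them into the kernel's mapping (kva) of the same
	 * memzone using the offset between the two base addresses. */
	return (char *)kni->mbuf_kva + ((char *)va - (char *)kni->mbuf_va);
}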
/* Expects to be always run from workqueue - which acts as
 * read-size critical section for our kind of RCU. */
static void handle_rx(struct vhost_net *net)
{
	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_RX];
	struct vhost_virtqueue *vq = &nvq->vq;
	unsigned uninitialized_var(in), log;
	struct vhost_log *vq_log;
	struct msghdr msg = {
		.msg_name = NULL,
		.msg_namelen = 0,
		.msg_control = NULL, /* FIXME: get and handle RX aux data. */
		.msg_controllen = 0,
		.msg_flags = MSG_DONTWAIT,
	};
	struct virtio_net_hdr hdr = {
		.flags = 0,
		.gso_type = VIRTIO_NET_HDR_GSO_NONE
	};
	size_t total_len = 0;
	int err, mergeable;
	s16 headcount;
	size_t vhost_hlen, sock_hlen;
	size_t vhost_len, sock_len;
	struct socket *sock;
	struct iov_iter fixup;
	__virtio16 num_buffers;

	mutex_lock(&vq->mutex);
	sock = vq->private_data;
	if (!sock)
		goto out;
	vhost_disable_notify(&net->dev, vq);

	vhost_hlen = nvq->vhost_hlen;
	sock_hlen = nvq->sock_hlen;

	vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ?
		vq->log : NULL;
	mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF);

	while ((sock_len = peek_head_len(sock->sk))) {
		sock_len += sock_hlen;
		vhost_len = sock_len + vhost_hlen;
		headcount = get_rx_bufs(vq, vq->heads, vhost_len,
					&in, vq_log, &log,
					likely(mergeable) ? UIO_MAXIOV : 1);
		/* On error, stop handling until the next kick. */
		if (unlikely(headcount < 0))
			break;
		/* On overrun, truncate and discard */
		if (unlikely(headcount > UIO_MAXIOV)) {
			iov_iter_init(&msg.msg_iter, READ, vq->iov, 1, 1);
			err = sock->ops->recvmsg(sock, &msg,
						 1, MSG_DONTWAIT | MSG_TRUNC);
			pr_debug("Discarded rx packet: len %zd\n", sock_len);
			continue;
		}
		/* OK, now we need to know about added descriptors. */
		if (!headcount) {
			if (unlikely(vhost_enable_notify(&net->dev, vq))) {
				/* They have slipped one in as we were
				 * doing that: check again. */
				vhost_disable_notify(&net->dev, vq);
				continue;
			}
			/* Nothing new?  Wait for eventfd to tell us
			 * they refilled. */
			break;
		}
		/* We don't need to be notified again. */
		iov_iter_init(&msg.msg_iter, READ, vq->iov, in, vhost_len);
		fixup = msg.msg_iter;
		if (unlikely((vhost_hlen))) {
			/* We will supply the header ourselves
			 * TODO: support TSO.
			 */
			iov_iter_advance(&msg.msg_iter, vhost_hlen);
		}
		err = sock->ops->recvmsg(sock, &msg,
					 sock_len, MSG_DONTWAIT | MSG_TRUNC);
		/* Userspace might have consumed the packet meanwhile:
		 * it's not supposed to do this usually, but might be hard
		 * to prevent. Discard data we got (if any) and keep going. */
		if (unlikely(err != sock_len)) {
			pr_debug("Discarded rx packet: "
				 " len %d, expected %zd\n", err, sock_len);
			vhost_discard_vq_desc(vq, headcount);
			continue;
		}
		/* Supply virtio_net_hdr if VHOST_NET_F_VIRTIO_NET_HDR */
		if (unlikely(vhost_hlen)) {
			if (copy_to_iter(&hdr, sizeof(hdr),
					 &fixup) != sizeof(hdr)) {
				vq_err(vq, "Unable to write vnet_hdr "
				       "at addr %p\n", vq->iov->iov_base);
				break;
			}
		} else {
			/* Header came from socket; we'll need to patch
			 * ->num_buffers over if VIRTIO_NET_F_MRG_RXBUF
			 */
			iov_iter_advance(&fixup, sizeof(hdr));
		}
		/* TODO: Should check and handle checksum. */
		num_buffers = cpu_to_vhost16(vq, headcount);
		if (likely(mergeable) &&
		    copy_to_iter(&num_buffers, sizeof num_buffers,
				 &fixup) != sizeof num_buffers) {
			vq_err(vq, "Failed num_buffers write");
			vhost_discard_vq_desc(vq, headcount);
			break;
		}
		vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
					    headcount);
		if (unlikely(vq_log))
			vhost_log_write(vq, vq_log, log, vhost_len);
		total_len += vhost_len;
		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
			vhost_poll_queue(&vq->poll);
			break;
		}
	}
out:
	mutex_unlock(&vq->mutex);
}

static void handle_tx_kick(struct vhost_work *work)
{
	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
						  poll.work);
	struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);

	handle_tx(net);
}

static void handle_rx_kick(struct vhost_work *work)
{
	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
						  poll.work);
	struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);

	handle_rx(net);
}

static void handle_tx_net(struct vhost_work *work)
{
	struct vhost_net *net = container_of(work, struct vhost_net,
					     poll[VHOST_NET_VQ_TX].work);
	handle_tx(net);
}

static void handle_rx_net(struct vhost_work *work)
{
	struct vhost_net *net = container_of(work, struct vhost_net,
					     poll[VHOST_NET_VQ_RX].work);
	handle_rx(net);
}

static int vhost_net_open(struct inode *inode, struct file *f)
{
	struct vhost_net *n;
	struct vhost_dev *dev;
	struct vhost_virtqueue **vqs;
	int i;

	n = kmalloc(sizeof *n, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
	if (!n) {
		n = vmalloc(sizeof *n);
		if (!n)
			return -ENOMEM;
	}
	vqs = kmalloc(VHOST_NET_VQ_MAX * sizeof(*vqs), GFP_KERNEL);
	if (!vqs) {
		kvfree(n);
		return -ENOMEM;
	}

	dev = &n->dev;
	vqs[VHOST_NET_VQ_TX] = &n->vqs[VHOST_NET_VQ_TX].vq;
	vqs[VHOST_NET_VQ_RX] = &n->vqs[VHOST_NET_VQ_RX].vq;
	n->vqs[VHOST_NET_VQ_TX].vq.handle_kick = handle_tx_kick;
	n->vqs[VHOST_NET_VQ_RX].vq.handle_kick = handle_rx_kick;
	for (i = 0; i < VHOST_NET_VQ_MAX; i++) {
		n->vqs[i].ubufs = NULL;
		n->vqs[i].ubuf_info = NULL;
		n->vqs[i].upend_idx = 0;
		n->vqs[i].done_idx = 0;
		n->vqs[i].vhost_hlen = 0;
		n->vqs[i].sock_hlen = 0;
	}
	vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX);

	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev);
	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev);

	f->private_data = n;

	return 0;
}

static void vhost_net_disable_vq(struct vhost_net *n,
				 struct vhost_virtqueue *vq)
{
	struct vhost_net_virtqueue *nvq =
		container_of(vq, struct vhost_net_virtqueue, vq);
	struct vhost_poll *poll = n->poll + (nvq - n->vqs);
	if (!vq->private_data)
		return;
	vhost_poll_stop(poll);
}

static int vhost_net_enable_vq(struct vhost_net *n,
			       struct vhost_virtqueue *vq)
{
	struct vhost_net_virtqueue *nvq =
		container_of(vq, struct vhost_net_virtqueue, vq);
	struct vhost_poll *poll = n->poll + (nvq - n->vqs);
	struct socket *sock;

	sock = vq->private_data;
	if (!sock)
		return 0;

	return vhost_poll_start(poll, sock->file);
}

static struct socket *vhost_net_stop_vq(struct vhost_net *n,
					struct vhost_virtqueue *vq)
{
	struct socket *sock;

	mutex_lock(&vq->mutex);
	sock = vq->private_data;
	vhost_net_disable_vq(n, vq);
	vq->private_data = NULL;
	mutex_unlock(&vq->mutex);
	return sock;
}

static void vhost_net_stop(struct vhost_net *n, struct socket **tx_sock,
			   struct socket **rx_sock)
{
	*tx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_TX].vq);
	*rx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_RX].vq);
}

static void vhost_net_flush_vq(struct vhost_net *n, int index)
{
	vhost_poll_flush(n->poll + index);
	vhost_poll_flush(&n->vqs[index].vq.poll);
}
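/*
 * handle_rx() depends on struct iov_iter being a value type: assigning it
 * into "fixup" snapshots the current position, so header slots can be
 * skipped now and backfilled once recvmsg() has consumed the main iterator.
 * A hedged reduction of that pattern; write_record() and its framing are
 * hypothetical.
 */
#include <linux/types.h>
#include <linux/uio.h>

static int write_record(struct iov_iter *to, const void *payload,
			size_t payload_len)
{
	struct iov_iter hdr_slot = *to;	/* snapshot before the header */
	u32 hdr;

	/* Leave room for the header; it is written last. */
	iov_iter_advance(to, sizeof(hdr));

	if (copy_to_iter(payload, payload_len, to) != payload_len)
		return -EFAULT;

	/* Backfill the header through the snapshot. */
	hdr = payload_len;
	if (copy_to_iter(&hdr, sizeof(hdr), &hdr_slot) != sizeof(hdr))
		return -EFAULT;

	return 0;
}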
/**
 *	skb_copy_datagram_iter - Copy a datagram to an iovec iterator.
 *	@skb: buffer to copy
 *	@offset: offset in the buffer to start copying from
 *	@to: iovec iterator to copy to
 *	@len: amount of data to copy from buffer to iovec
 */
int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
			   struct iov_iter *to, int len)
{
	int start = skb_headlen(skb);
	int i, copy = start - offset, start_off = offset, n;
	struct sk_buff *frag_iter;

	trace_skb_copy_datagram_iovec(skb, len);

	/* Copy header. */
	if (copy > 0) {
		if (copy > len)
			copy = len;
		n = copy_to_iter(skb->data + offset, copy, to);
		offset += n;
		if (n != copy)
			goto short_copy;
		if ((len -= copy) == 0)
			return 0;
	}

	/* Copy paged appendix. Hmm... why does this look so complicated? */
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int end;
		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

		WARN_ON(start > offset + len);

		end = start + skb_frag_size(frag);
		if ((copy = end - offset) > 0) {
			if (copy > len)
				copy = len;
			n = copy_page_to_iter(skb_frag_page(frag),
					      frag->page_offset + offset -
					      start, copy, to);
			offset += n;
			if (n != copy)
				goto short_copy;
			if (!(len -= copy))
				return 0;
		}
		start = end;
	}

	skb_walk_frags(skb, frag_iter) {
		int end;

		WARN_ON(start > offset + len);

		end = start + frag_iter->len;
		if ((copy = end - offset) > 0) {
			if (copy > len)
				copy = len;
			if (skb_copy_datagram_iter(frag_iter, offset - start,
						   to, copy))
				goto fault;
			if ((len -= copy) == 0)
				return 0;
			offset += copy;
		}
		start = end;
	}
	if (!len)
		return 0;

	/* Nonzero len here means the skb ran out before the requested
	 * length was copied; fall through into the fault path. */
fault:
	/* Undo any partial copy so the caller sees an untouched iterator. */
	iov_iter_revert(to, offset - start_off);
	return -EFAULT;

short_copy:
	/* A short copy is only a fault if the iterator still had room. */
	if (iov_iter_count(to))
		goto fault;

	return 0;
}
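/*
 * Callers on the recvmsg() path reach skb_copy_datagram_iter() through a
 * thin wrapper, skb_copy_datagram_msg() in include/linux/skbuff.h, which
 * just forwards the msghdr's iterator:
 */
static inline int skb_copy_datagram_msg(const struct sk_buff *skb, int from,
					struct msghdr *msg, int size)
{
	return skb_copy_datagram_iter(skb, from, &msg->msg_iter, size);
}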