/* Do it by hand, with file-ops */ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, loff_t *ppos, io_fn_t fn, int flags) { ssize_t ret = 0; if (flags & ~RWF_HIPRI) return -EOPNOTSUPP; while (iov_iter_count(iter)) { struct iovec iovec = iov_iter_iovec(iter); ssize_t nr; nr = fn(filp, iovec.iov_base, iovec.iov_len, ppos); if (nr < 0) { if (!ret) ret = nr; break; } ret += nr; if (nr != iovec.iov_len) break; iov_iter_advance(iter, nr); } return ret; }
/* Do it by hand, with file-ops */ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, loff_t *ppos, int type, rwf_t flags) { ssize_t ret = 0; if (flags & ~RWF_HIPRI) return -EOPNOTSUPP; while (iov_iter_count(iter)) { struct iovec iovec = iov_iter_iovec(iter); ssize_t nr; if (type == READ) { nr = filp->f_op->read(filp, iovec.iov_base, iovec.iov_len, ppos); } else { nr = filp->f_op->write(filp, iovec.iov_base, iovec.iov_len, ppos); } if (nr < 0) { if (!ret) ret = nr; break; } ret += nr; if (nr != iovec.iov_len) break; iov_iter_advance(iter, nr); } return ret; }
/* simple helper to fault in pages and copy. This should go away * and be replaced with calls into generic code. */ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, int write_bytes, struct page **prepared_pages, struct iov_iter *i) { size_t copied = 0; int pg = 0; int offset = pos & (PAGE_CACHE_SIZE - 1); int total_copied = 0; while (write_bytes > 0) { size_t count = min_t(size_t, PAGE_CACHE_SIZE - offset, write_bytes); struct page *page = prepared_pages[pg]; /* * Copy data from userspace to the current page * * Disable pagefault to avoid recursive lock since * the pages are already locked */ pagefault_disable(); copied = iov_iter_copy_from_user_atomic(page, i, offset, count); pagefault_enable(); /* Flush processor's dcache for this page */ flush_dcache_page(page); /* * if we get a partial write, we can end up with * partially up to date pages. These add * a lot of complexity, so make sure they don't * happen by forcing this copy to be retried. * * The rest of the btrfs_file_write code will fall * back to page at a time copies after we return 0. */ if (!PageUptodate(page) && copied < count) copied = 0; iov_iter_advance(i, copied); write_bytes -= copied; total_copied += copied; /* Return to btrfs_file_aio_write to fault page */ if (unlikely(copied == 0)) { break; } if (unlikely(copied < PAGE_CACHE_SIZE - offset)) { offset += copied; } else { pg++; offset = 0; } } return total_copied; }
static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from, int length) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); struct page *pages[MAX_SKB_FRAGS]; size_t offset; ssize_t copied, use; int i = 0; unsigned int size = ctx->sg_plaintext_size; int num_elem = ctx->sg_plaintext_num_elem; int rc = 0; int maxpages; while (length > 0) { i = 0; maxpages = ARRAY_SIZE(ctx->sg_plaintext_data) - num_elem; if (maxpages == 0) { rc = -EFAULT; goto out; } copied = iov_iter_get_pages(from, pages, length, maxpages, &offset); if (copied <= 0) { rc = -EFAULT; goto out; } iov_iter_advance(from, copied); length -= copied; size += copied; while (copied) { use = min_t(int, copied, PAGE_SIZE - offset); sg_set_page(&ctx->sg_plaintext_data[num_elem], pages[i], use, offset); sg_unmark_end(&ctx->sg_plaintext_data[num_elem]); sk_mem_charge(sk, use); offset = 0; copied -= use; ++i; ++num_elem; } } out: ctx->sg_plaintext_size = size; ctx->sg_plaintext_num_elem = num_elem; return rc; }
static ssize_t zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from) { ssize_t ret; uio_seg_t seg = UIO_USERSPACE; if (from->type & ITER_KVEC) seg = UIO_SYSSPACE; if (from->type & ITER_BVEC) seg = UIO_BVEC; ret = zpl_iter_write_common(kiocb, from->iov, from->nr_segs, iov_iter_count(from), seg, from->iov_offset); if (ret > 0) iov_iter_advance(from, ret); return (ret); }
static ssize_t zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to) { ssize_t ret; uio_seg_t seg = UIO_USERSPACE; if (to->type & ITER_KVEC) seg = UIO_SYSSPACE; if (to->type & ITER_BVEC) seg = UIO_BVEC; ret = zpl_iter_read_common(kiocb, to->iov, to->nr_segs, iov_iter_count(to), seg, to->iov_offset); if (ret > 0) iov_iter_advance(to, ret); return (ret); }
/* simple helper to fault in pages and copy. This should go away * and be replaced with calls into generic code. */ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, int write_bytes, struct page **prepared_pages, struct iov_iter *i) { size_t copied = 0; int pg = 0; int offset = pos & (PAGE_CACHE_SIZE - 1); int total_copied = 0; while (write_bytes > 0) { size_t count = min_t(size_t, PAGE_CACHE_SIZE - offset, write_bytes); struct page *page = prepared_pages[pg]; /* * Copy data from userspace to the current page * * Disable pagefault to avoid recursive lock since * the pages are already locked */ pagefault_disable(); copied = iov_iter_copy_from_user_atomic(page, i, offset, count); pagefault_enable(); /* Flush processor's dcache for this page */ flush_dcache_page(page); iov_iter_advance(i, copied); write_bytes -= copied; total_copied += copied; /* Return to btrfs_file_aio_write to fault page */ if (unlikely(copied == 0)) { break; } if (unlikely(copied < PAGE_CACHE_SIZE - offset)) { offset += copied; } else { pg++; offset = 0; } } return total_copied; }
static int __blk_rq_map_user_iov(struct request *rq, struct rq_map_data *map_data, struct iov_iter *iter, gfp_t gfp_mask, bool copy) { struct request_queue *q = rq->q; struct bio *bio, *orig_bio; int ret; if (copy) bio = bio_copy_user_iov(q, map_data, iter, gfp_mask); else bio = bio_map_user_iov(q, iter, gfp_mask); if (IS_ERR(bio)) return PTR_ERR(bio); if (map_data && map_data->null_mapped) bio_set_flag(bio, BIO_NULL_MAPPED); iov_iter_advance(iter, bio->bi_iter.bi_size); if (map_data) map_data->offset += bio->bi_iter.bi_size; orig_bio = bio; blk_queue_bounce(q, &bio); /* * We link the bounce buffer in and could have to traverse it * later so we have to get a ref to prevent it from being freed */ bio_get(bio); ret = blk_rq_append_bio(q, rq, bio); if (ret) { bio_endio(bio); __blk_rq_unmap_user(orig_bio); bio_put(bio); return ret; } return 0; }
static ssize_t zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from) { size_t count; ssize_t ret; uio_seg_t seg = UIO_USERSPACE; #ifndef HAVE_GENERIC_WRITE_CHECKS_KIOCB struct file *file = kiocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *ip = mapping->host; int isblk = S_ISBLK(ip->i_mode); count = iov_iter_count(from); ret = generic_write_checks(file, &kiocb->ki_pos, &count, isblk); if (ret) return (ret); #else /* * XXX - ideally this check should be in the same lock region with * write operations, so that there's no TOCTTOU race when doing * append and someone else grow the file. */ ret = generic_write_checks(kiocb, from); if (ret <= 0) return (ret); count = ret; #endif if (from->type & ITER_KVEC) seg = UIO_SYSSPACE; if (from->type & ITER_BVEC) seg = UIO_BVEC; ret = zpl_iter_write_common(kiocb, from->iov, from->nr_segs, count, seg, from->iov_offset); if (ret > 0) iov_iter_advance(from, ret); return (ret); }
static ssize_t hypfs_write_iter(struct kiocb *iocb, struct iov_iter *from) { int rc; struct super_block *sb = file_inode(iocb->ki_filp)->i_sb; struct hypfs_sb_info *fs_info = sb->s_fs_info; size_t count = iov_iter_count(from); /* * Currently we only allow one update per second for two reasons: * 1. diag 204 is VERY expensive * 2. If several processes do updates in parallel and then read the * hypfs data, the likelihood of collisions is reduced, if we restrict * the minimum update interval. A collision occurs, if during the * data gathering of one process another process triggers an update * If the first process wants to ensure consistent data, it has * to restart data collection in this case. */ mutex_lock(&fs_info->lock); if (fs_info->last_update == get_seconds()) { rc = -EBUSY; goto out; } hypfs_delete_tree(sb->s_root); if (MACHINE_IS_VM) rc = hypfs_vm_create_files(sb->s_root); else rc = hypfs_diag_create_files(sb->s_root); if (rc) { pr_err("Updating the hypfs tree failed\n"); hypfs_delete_tree(sb->s_root); goto out; } hypfs_update_update(sb); rc = count; iov_iter_advance(from, count); out: mutex_unlock(&fs_info->lock); return rc; }
static ssize_t ll_direct_IO( # ifndef HAVE_IOV_ITER_RW int rw, # endif struct kiocb *iocb, struct iov_iter *iter, loff_t file_offset) { struct ll_cl_context *lcc; const struct lu_env *env; struct cl_io *io; struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; ssize_t count = iov_iter_count(iter); ssize_t tot_bytes = 0, result = 0; size_t size = MAX_DIO_SIZE; /* FIXME: io smaller than PAGE_SIZE is broken on ia64 ??? */ if ((file_offset & ~PAGE_MASK) || (count & ~PAGE_MASK)) return -EINVAL; CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), size=%zd (max %lu), " "offset=%lld=%llx, pages %zd (max %lu)\n", PFID(ll_inode2fid(inode)), inode, count, MAX_DIO_SIZE, file_offset, file_offset, count >> PAGE_SHIFT, MAX_DIO_SIZE >> PAGE_SHIFT); /* Check that all user buffers are aligned as well */ if (iov_iter_alignment(iter) & ~PAGE_MASK) return -EINVAL; lcc = ll_cl_find(file); if (lcc == NULL) RETURN(-EIO); env = lcc->lcc_env; LASSERT(!IS_ERR(env)); io = lcc->lcc_io; LASSERT(io != NULL); /* 0. Need locking between buffered and direct access. and race with * size changing by concurrent truncates and writes. * 1. Need inode mutex to operate transient pages. */ if (iov_iter_rw(iter) == READ) inode_lock(inode); while (iov_iter_count(iter)) { struct page **pages; size_t offs; count = min_t(size_t, iov_iter_count(iter), size); if (iov_iter_rw(iter) == READ) { if (file_offset >= i_size_read(inode)) break; if (file_offset + count > i_size_read(inode)) count = i_size_read(inode) - file_offset; } result = iov_iter_get_pages_alloc(iter, &pages, count, &offs); if (likely(result > 0)) { int n = DIV_ROUND_UP(result + offs, PAGE_SIZE); result = ll_direct_IO_seg(env, io, iov_iter_rw(iter), inode, result, file_offset, pages, n); ll_free_user_pages(pages, n, iov_iter_rw(iter) == READ); } if (unlikely(result <= 0)) { /* If we can't allocate a large enough buffer * for the request, shrink it to a smaller * PAGE_SIZE multiple and try again. * We should always be able to kmalloc for a * page worth of page pointers = 4MB on i386. */ if (result == -ENOMEM && size > (PAGE_SIZE / sizeof(*pages)) * PAGE_SIZE) { size = ((((size / 2) - 1) | ~PAGE_MASK) + 1) & PAGE_MASK; CDEBUG(D_VFSTRACE, "DIO size now %zu\n", size); continue; } GOTO(out, result); } iov_iter_advance(iter, result); tot_bytes += result; file_offset += result; } out: if (iov_iter_rw(iter) == READ) inode_unlock(inode); if (tot_bytes > 0) { struct vvp_io *vio = vvp_env_io(env); /* no commit async for direct IO */ vio->u.write.vui_written += tot_bytes; } return tot_bytes ? : result; }
/* Expects to be always run from workqueue - which acts as * read-size critical section for our kind of RCU. */ static void handle_rx(struct vhost_net *net) { struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_RX]; struct vhost_virtqueue *vq = &nvq->vq; unsigned uninitialized_var(in), log; struct vhost_log *vq_log; struct msghdr msg = { .msg_name = NULL, .msg_namelen = 0, .msg_control = NULL, /* FIXME: get and handle RX aux data. */ .msg_controllen = 0, .msg_flags = MSG_DONTWAIT, }; struct virtio_net_hdr hdr = { .flags = 0, .gso_type = VIRTIO_NET_HDR_GSO_NONE }; size_t total_len = 0; int err, mergeable; s16 headcount; size_t vhost_hlen, sock_hlen; size_t vhost_len, sock_len; struct socket *sock; struct iov_iter fixup; __virtio16 num_buffers; mutex_lock(&vq->mutex); sock = vq->private_data; if (!sock) goto out; vhost_disable_notify(&net->dev, vq); vhost_hlen = nvq->vhost_hlen; sock_hlen = nvq->sock_hlen; vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ? vq->log : NULL; mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF); while ((sock_len = peek_head_len(sock->sk))) { sock_len += sock_hlen; vhost_len = sock_len + vhost_hlen; headcount = get_rx_bufs(vq, vq->heads, vhost_len, &in, vq_log, &log, likely(mergeable) ? UIO_MAXIOV : 1); /* On error, stop handling until the next kick. */ if (unlikely(headcount < 0)) break; /* On overrun, truncate and discard */ if (unlikely(headcount > UIO_MAXIOV)) { iov_iter_init(&msg.msg_iter, READ, vq->iov, 1, 1); err = sock->ops->recvmsg(sock, &msg, 1, MSG_DONTWAIT | MSG_TRUNC); pr_debug("Discarded rx packet: len %zd\n", sock_len); continue; } /* OK, now we need to know about added descriptors. */ if (!headcount) { if (unlikely(vhost_enable_notify(&net->dev, vq))) { /* They have slipped one in as we were * doing that: check again. */ vhost_disable_notify(&net->dev, vq); continue; } /* Nothing new? Wait for eventfd to tell us * they refilled. */ break; } /* We don't need to be notified again. */ iov_iter_init(&msg.msg_iter, READ, vq->iov, in, vhost_len); fixup = msg.msg_iter; if (unlikely((vhost_hlen))) { /* We will supply the header ourselves * TODO: support TSO. */ iov_iter_advance(&msg.msg_iter, vhost_hlen); } err = sock->ops->recvmsg(sock, &msg, sock_len, MSG_DONTWAIT | MSG_TRUNC); /* Userspace might have consumed the packet meanwhile: * it's not supposed to do this usually, but might be hard * to prevent. Discard data we got (if any) and keep going. */ if (unlikely(err != sock_len)) { pr_debug("Discarded rx packet: " " len %d, expected %zd\n", err, sock_len); vhost_discard_vq_desc(vq, headcount); continue; } /* Supply virtio_net_hdr if VHOST_NET_F_VIRTIO_NET_HDR */ if (unlikely(vhost_hlen)) { if (copy_to_iter(&hdr, sizeof(hdr), &fixup) != sizeof(hdr)) { vq_err(vq, "Unable to write vnet_hdr " "at addr %p\n", vq->iov->iov_base); break; } } else { /* Header came from socket; we'll need to patch * ->num_buffers over if VIRTIO_NET_F_MRG_RXBUF */ iov_iter_advance(&fixup, sizeof(hdr)); } /* TODO: Should check and handle checksum. */ num_buffers = cpu_to_vhost16(vq, headcount); if (likely(mergeable) && copy_to_iter(&num_buffers, sizeof num_buffers, &fixup) != sizeof num_buffers) { vq_err(vq, "Failed num_buffers write"); vhost_discard_vq_desc(vq, headcount); break; } vhost_add_used_and_signal_n(&net->dev, vq, vq->heads, headcount); if (unlikely(vq_log)) vhost_log_write(vq, vq_log, log, vhost_len); total_len += vhost_len; if (unlikely(total_len >= VHOST_NET_WEIGHT)) { vhost_poll_queue(&vq->poll); break; } } out: mutex_unlock(&vq->mutex); } static void handle_tx_kick(struct vhost_work *work) { struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, poll.work); struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev); handle_tx(net); } static void handle_rx_kick(struct vhost_work *work) { struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, poll.work); struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev); handle_rx(net); } static void handle_tx_net(struct vhost_work *work) { struct vhost_net *net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_TX].work); handle_tx(net); } static void handle_rx_net(struct vhost_work *work) { struct vhost_net *net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_RX].work); handle_rx(net); } static int vhost_net_open(struct inode *inode, struct file *f) { struct vhost_net *n; struct vhost_dev *dev; struct vhost_virtqueue **vqs; int i; n = kmalloc(sizeof *n, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); if (!n) { n = vmalloc(sizeof *n); if (!n) return -ENOMEM; } vqs = kmalloc(VHOST_NET_VQ_MAX * sizeof(*vqs), GFP_KERNEL); if (!vqs) { kvfree(n); return -ENOMEM; } dev = &n->dev; vqs[VHOST_NET_VQ_TX] = &n->vqs[VHOST_NET_VQ_TX].vq; vqs[VHOST_NET_VQ_RX] = &n->vqs[VHOST_NET_VQ_RX].vq; n->vqs[VHOST_NET_VQ_TX].vq.handle_kick = handle_tx_kick; n->vqs[VHOST_NET_VQ_RX].vq.handle_kick = handle_rx_kick; for (i = 0; i < VHOST_NET_VQ_MAX; i++) { n->vqs[i].ubufs = NULL; n->vqs[i].ubuf_info = NULL; n->vqs[i].upend_idx = 0; n->vqs[i].done_idx = 0; n->vqs[i].vhost_hlen = 0; n->vqs[i].sock_hlen = 0; } vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX); vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev); vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev); f->private_data = n; return 0; } static void vhost_net_disable_vq(struct vhost_net *n, struct vhost_virtqueue *vq) { struct vhost_net_virtqueue *nvq = container_of(vq, struct vhost_net_virtqueue, vq); struct vhost_poll *poll = n->poll + (nvq - n->vqs); if (!vq->private_data) return; vhost_poll_stop(poll); } static int vhost_net_enable_vq(struct vhost_net *n, struct vhost_virtqueue *vq) { struct vhost_net_virtqueue *nvq = container_of(vq, struct vhost_net_virtqueue, vq); struct vhost_poll *poll = n->poll + (nvq - n->vqs); struct socket *sock; sock = vq->private_data; if (!sock) return 0; return vhost_poll_start(poll, sock->file); } static struct socket *vhost_net_stop_vq(struct vhost_net *n, struct vhost_virtqueue *vq) { struct socket *sock; mutex_lock(&vq->mutex); sock = vq->private_data; vhost_net_disable_vq(n, vq); vq->private_data = NULL; mutex_unlock(&vq->mutex); return sock; } static void vhost_net_stop(struct vhost_net *n, struct socket **tx_sock, struct socket **rx_sock) { *tx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_TX].vq); *rx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_RX].vq); } static void vhost_net_flush_vq(struct vhost_net *n, int index) { vhost_poll_flush(n->poll + index); vhost_poll_flush(&n->vqs[index].vq.poll); }
/* Expects to be always run from workqueue - which acts as * read-size critical section for our kind of RCU. */ static void handle_tx(struct vhost_net *net) { struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; struct vhost_virtqueue *vq = &nvq->vq; unsigned out, in; int head; struct msghdr msg = { .msg_name = NULL, .msg_namelen = 0, .msg_control = NULL, .msg_controllen = 0, .msg_flags = MSG_DONTWAIT, }; size_t len, total_len = 0; int err; size_t hdr_size; struct socket *sock; struct vhost_net_ubuf_ref *uninitialized_var(ubufs); bool zcopy, zcopy_used; mutex_lock(&vq->mutex); sock = vq->private_data; if (!sock) goto out; vhost_disable_notify(&net->dev, vq); hdr_size = nvq->vhost_hlen; zcopy = nvq->ubufs; for (;;) { /* Release DMAs done buffers first */ if (zcopy) vhost_zerocopy_signal_used(net, vq); /* If more outstanding DMAs, queue the work. * Handle upend_idx wrap around */ if (unlikely((nvq->upend_idx + vq->num - VHOST_MAX_PEND) % UIO_MAXIOV == nvq->done_idx)) break; head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov), &out, &in, NULL, NULL); /* On error, stop handling until the next kick. */ if (unlikely(head < 0)) break; /* Nothing new? Wait for eventfd to tell us they refilled. */ if (head == vq->num) { if (unlikely(vhost_enable_notify(&net->dev, vq))) { vhost_disable_notify(&net->dev, vq); continue; } break; } if (in) { vq_err(vq, "Unexpected descriptor format for TX: " "out %d, int %d\n", out, in); break; } /* Skip header. TODO: support TSO. */ len = iov_length(vq->iov, out); iov_iter_init(&msg.msg_iter, WRITE, vq->iov, out, len); iov_iter_advance(&msg.msg_iter, hdr_size); /* Sanity check */ if (!msg_data_left(&msg)) { vq_err(vq, "Unexpected header len for TX: " "%zd expected %zd\n", len, hdr_size); break; } len = msg_data_left(&msg); zcopy_used = zcopy && len >= VHOST_GOODCOPY_LEN && (nvq->upend_idx + 1) % UIO_MAXIOV != nvq->done_idx && vhost_net_tx_select_zcopy(net); /* use msg_control to pass vhost zerocopy ubuf info to skb */ if (zcopy_used) { struct ubuf_info *ubuf; ubuf = nvq->ubuf_info + nvq->upend_idx; vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head); vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS; ubuf->callback = vhost_zerocopy_callback; ubuf->ctx = nvq->ubufs; ubuf->desc = nvq->upend_idx; msg.msg_control = ubuf; msg.msg_controllen = sizeof(ubuf); ubufs = nvq->ubufs; atomic_inc(&ubufs->refcount); nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV; } else { msg.msg_control = NULL; ubufs = NULL; } /* TODO: Check specific error and bomb out unless ENOBUFS? */ err = sock->ops->sendmsg(sock, &msg, len); if (unlikely(err < 0)) { if (zcopy_used) { vhost_net_ubuf_put(ubufs); nvq->upend_idx = ((unsigned)nvq->upend_idx - 1) % UIO_MAXIOV; } vhost_discard_vq_desc(vq, 1); break; } if (err != len) pr_debug("Truncated TX packet: " " len %d != %zd\n", err, len); if (!zcopy_used) vhost_add_used_and_signal(&net->dev, vq, head, 0); else vhost_zerocopy_signal_used(net, vq); total_len += len; vhost_net_tx_packet(net); if (unlikely(total_len >= VHOST_NET_WEIGHT)) { vhost_poll_queue(&vq->poll); break; } } out: mutex_unlock(&vq->mutex); } static int peek_head_len(struct sock *sk) { struct sk_buff *head; int len = 0; unsigned long flags; spin_lock_irqsave(&sk->sk_receive_queue.lock, flags); head = skb_peek(&sk->sk_receive_queue); if (likely(head)) { len = head->len; if (skb_vlan_tag_present(head)) len += VLAN_HLEN; } spin_unlock_irqrestore(&sk->sk_receive_queue.lock, flags); return len; } /* This is a multi-buffer version of vhost_get_desc, that works if * vq has read descriptors only. * @vq - the relevant virtqueue * @datalen - data length we'll be reading * @iovcount - returned count of io vectors we fill * @log - vhost log * @log_num - log offset * @quota - headcount quota, 1 for big buffer * returns number of buffer heads allocated, negative on error */ static int get_rx_bufs(struct vhost_virtqueue *vq, struct vring_used_elem *heads, int datalen, unsigned *iovcount, struct vhost_log *log, unsigned *log_num, unsigned int quota) { unsigned int out, in; int seg = 0; int headcount = 0; unsigned d; int r, nlogs = 0; /* len is always initialized before use since we are always called with * datalen > 0. */ u32 uninitialized_var(len); while (datalen > 0 && headcount < quota) { if (unlikely(seg >= UIO_MAXIOV)) { r = -ENOBUFS; goto err; } r = vhost_get_vq_desc(vq, vq->iov + seg, ARRAY_SIZE(vq->iov) - seg, &out, &in, log, log_num); if (unlikely(r < 0)) goto err; d = r; if (d == vq->num) { r = 0; goto err; } if (unlikely(out || in <= 0)) { vq_err(vq, "unexpected descriptor format for RX: " "out %d, in %d\n", out, in); r = -EINVAL; goto err; } if (unlikely(log)) { nlogs += *log_num; log += *log_num; } heads[headcount].id = cpu_to_vhost32(vq, d); len = iov_length(vq->iov + seg, in); heads[headcount].len = cpu_to_vhost32(vq, len); datalen -= len; ++headcount; seg += in; } heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen); *iovcount = seg; if (unlikely(log)) *log_num = nlogs; /* Detect overrun */ if (unlikely(datalen > 0)) { r = UIO_MAXIOV + 1; goto err; } return headcount; err: vhost_discard_vq_desc(vq, headcount); return r; }
STATIC ssize_t xfs_file_dio_aio_read( struct kiocb *iocb, struct iov_iter *to) { struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = mapping->host; struct xfs_inode *ip = XFS_I(inode); loff_t isize = i_size_read(inode); size_t count = iov_iter_count(to); struct iov_iter data; struct xfs_buftarg *target; ssize_t ret = 0; trace_xfs_file_direct_read(ip, count, iocb->ki_pos); if (!count) return 0; /* skip atime */ if (XFS_IS_REALTIME_INODE(ip)) target = ip->i_mount->m_rtdev_targp; else target = ip->i_mount->m_ddev_targp; /* DIO must be aligned to device logical sector size */ if ((iocb->ki_pos | count) & target->bt_logical_sectormask) { if (iocb->ki_pos == isize) return 0; return -EINVAL; } file_accessed(iocb->ki_filp); /* * Locking is a bit tricky here. If we take an exclusive lock for direct * IO, we effectively serialise all new concurrent read IO to this file * and block it behind IO that is currently in progress because IO in * progress holds the IO lock shared. We only need to hold the lock * exclusive to blow away the page cache, so only take lock exclusively * if the page cache needs invalidation. This allows the normal direct * IO case of no page cache pages to proceeed concurrently without * serialisation. */ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); if (mapping->nrpages) { xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); /* * The generic dio code only flushes the range of the particular * I/O. Because we take an exclusive lock here, this whole * sequence is considerably more expensive for us. This has a * noticeable performance impact for any file with cached pages, * even when outside of the range of the particular I/O. * * Hence, amortize the cost of the lock against a full file * flush and reduce the chances of repeated iolock cycles going * forward. */ if (mapping->nrpages) { ret = filemap_write_and_wait(mapping); if (ret) { xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } /* * Invalidate whole pages. This can return an error if * we fail to invalidate a page, but this should never * happen on XFS. Warn if it does fail. */ ret = invalidate_inode_pages2(mapping); WARN_ON_ONCE(ret); ret = 0; } xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); } data = *to; ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, xfs_get_blocks_direct, NULL, NULL, 0); if (ret >= 0) { iocb->ki_pos += ret; iov_iter_advance(to, ret); } xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); return ret; }
/* * Completely synchronous read and write methods. Direct from __user * buffer to osd, or directly to user pages (if O_DIRECT). * * If the read spans object boundary, just do multiple reads. */ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, int *checkeof) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct page **pages; u64 off = iocb->ki_pos; int num_pages; ssize_t ret; size_t len = iov_iter_count(to); dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len, (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); if (!len) return 0; /* * flush any page cache pages in this range. this * will make concurrent normal and sync io slow, * but it will at least behave sensibly when they are * in sequence. */ ret = filemap_write_and_wait_range(inode->i_mapping, off, off + len); if (ret < 0) return ret; if (unlikely(to->type & ITER_PIPE)) { size_t page_off; ret = iov_iter_get_pages_alloc(to, &pages, len, &page_off); if (ret <= 0) return -ENOMEM; num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE); ret = striped_read(inode, off, ret, pages, num_pages, page_off, checkeof); if (ret > 0) { iov_iter_advance(to, ret); off += ret; } else { iov_iter_advance(to, 0); } ceph_put_page_vector(pages, num_pages, false); } else { num_pages = calc_pages_for(off, len); pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); if (IS_ERR(pages)) return PTR_ERR(pages); ret = striped_read(inode, off, len, pages, num_pages, (off & ~PAGE_MASK), checkeof); if (ret > 0) { int l, k = 0; size_t left = ret; while (left) { size_t page_off = off & ~PAGE_MASK; size_t copy = min_t(size_t, left, PAGE_SIZE - page_off); l = copy_page_to_iter(pages[k++], page_off, copy, to); off += l; left -= l; if (l < copy) break; } } ceph_release_page_vector(pages, num_pages); } if (off > iocb->ki_pos) { ret = off - iocb->ki_pos; iocb->ki_pos = off; } dout("sync_read result %zd\n", ret); return ret; }