Beispiel #1
0
/* Do it by hand, with file-ops */
static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
		loff_t *ppos, io_fn_t fn, int flags)
{
	ssize_t ret = 0;

	if (flags & ~RWF_HIPRI)
		return -EOPNOTSUPP;

	while (iov_iter_count(iter)) {
		struct iovec iovec = iov_iter_iovec(iter);
		ssize_t nr;

		nr = fn(filp, iovec.iov_base, iovec.iov_len, ppos);

		if (nr < 0) {
			if (!ret)
				ret = nr;
			break;
		}
		ret += nr;
		if (nr != iovec.iov_len)
			break;
		iov_iter_advance(iter, nr);
	}

	return ret;
}
Beispiel #2
0
/* Do it by hand, with file-ops */
static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
		loff_t *ppos, int type, rwf_t flags)
{
	ssize_t ret = 0;

	if (flags & ~RWF_HIPRI)
		return -EOPNOTSUPP;

	while (iov_iter_count(iter)) {
		struct iovec iovec = iov_iter_iovec(iter);
		ssize_t nr;

		if (type == READ) {
			nr = filp->f_op->read(filp, iovec.iov_base,
					      iovec.iov_len, ppos);
		} else {
			nr = filp->f_op->write(filp, iovec.iov_base,
					       iovec.iov_len, ppos);
		}

		if (nr < 0) {
			if (!ret)
				ret = nr;
			break;
		}
		ret += nr;
		if (nr != iovec.iov_len)
			break;
		iov_iter_advance(iter, nr);
	}

	return ret;
}
Beispiel #3
0
/* simple helper to fault in pages and copy.  This should go away
 * and be replaced with calls into generic code.
 */
static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
					 int write_bytes,
					 struct page **prepared_pages,
					 struct iov_iter *i)
{
	size_t copied = 0;
	int pg = 0;
	int offset = pos & (PAGE_CACHE_SIZE - 1);
	int total_copied = 0;

	while (write_bytes > 0) {
		size_t count = min_t(size_t,
				     PAGE_CACHE_SIZE - offset, write_bytes);
		struct page *page = prepared_pages[pg];
		/*
		 * Copy data from userspace to the current page
		 *
		 * Disable pagefault to avoid recursive lock since
		 * the pages are already locked
		 */
		pagefault_disable();
		copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
		pagefault_enable();

		/* Flush processor's dcache for this page */
		flush_dcache_page(page);

		/*
		 * if we get a partial write, we can end up with
		 * partially up to date pages.  These add
		 * a lot of complexity, so make sure they don't
		 * happen by forcing this copy to be retried.
		 *
		 * The rest of the btrfs_file_write code will fall
		 * back to page at a time copies after we return 0.
		 */
		if (!PageUptodate(page) && copied < count)
			copied = 0;

		iov_iter_advance(i, copied);
		write_bytes -= copied;
		total_copied += copied;

		/* Return to btrfs_file_aio_write to fault page */
		if (unlikely(copied == 0)) {
			break;
		}

		if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
			offset += copied;
		} else {
			pg++;
			offset = 0;
		}
	}
	return total_copied;
}
Beispiel #4
0
static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from,
			      int length)
{
	struct tls_context *tls_ctx = tls_get_ctx(sk);
	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
	struct page *pages[MAX_SKB_FRAGS];

	size_t offset;
	ssize_t copied, use;
	int i = 0;
	unsigned int size = ctx->sg_plaintext_size;
	int num_elem = ctx->sg_plaintext_num_elem;
	int rc = 0;
	int maxpages;

	while (length > 0) {
		i = 0;
		maxpages = ARRAY_SIZE(ctx->sg_plaintext_data) - num_elem;
		if (maxpages == 0) {
			rc = -EFAULT;
			goto out;
		}
		copied = iov_iter_get_pages(from, pages,
					    length,
					    maxpages, &offset);
		if (copied <= 0) {
			rc = -EFAULT;
			goto out;
		}

		iov_iter_advance(from, copied);

		length -= copied;
		size += copied;
		while (copied) {
			use = min_t(int, copied, PAGE_SIZE - offset);

			sg_set_page(&ctx->sg_plaintext_data[num_elem],
				    pages[i], use, offset);
			sg_unmark_end(&ctx->sg_plaintext_data[num_elem]);
			sk_mem_charge(sk, use);

			offset = 0;
			copied -= use;

			++i;
			++num_elem;
		}
	}

out:
	ctx->sg_plaintext_size = size;
	ctx->sg_plaintext_num_elem = num_elem;
	return rc;
}
Beispiel #5
0
static ssize_t
zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from)
{
	ssize_t ret;
	uio_seg_t seg = UIO_USERSPACE;
	if (from->type & ITER_KVEC)
		seg = UIO_SYSSPACE;
	if (from->type & ITER_BVEC)
		seg = UIO_BVEC;
	ret = zpl_iter_write_common(kiocb, from->iov, from->nr_segs,
	    iov_iter_count(from), seg, from->iov_offset);
	if (ret > 0)
		iov_iter_advance(from, ret);
	return (ret);
}
Beispiel #6
0
static ssize_t
zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to)
{
	ssize_t ret;
	uio_seg_t seg = UIO_USERSPACE;
	if (to->type & ITER_KVEC)
		seg = UIO_SYSSPACE;
	if (to->type & ITER_BVEC)
		seg = UIO_BVEC;
	ret = zpl_iter_read_common(kiocb, to->iov, to->nr_segs,
	    iov_iter_count(to), seg, to->iov_offset);
	if (ret > 0)
		iov_iter_advance(to, ret);
	return (ret);
}
Beispiel #7
0
/* simple helper to fault in pages and copy.  This should go away
 * and be replaced with calls into generic code.
 */
static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
        int write_bytes,
        struct page **prepared_pages,
        struct iov_iter *i)
{
    size_t copied = 0;
    int pg = 0;
    int offset = pos & (PAGE_CACHE_SIZE - 1);
    int total_copied = 0;

    while (write_bytes > 0) {
        size_t count = min_t(size_t,
                             PAGE_CACHE_SIZE - offset, write_bytes);
        struct page *page = prepared_pages[pg];
        /*
         * Copy data from userspace to the current page
         *
         * Disable pagefault to avoid recursive lock since
         * the pages are already locked
         */
        pagefault_disable();
        copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
        pagefault_enable();

        /* Flush processor's dcache for this page */
        flush_dcache_page(page);
        iov_iter_advance(i, copied);
        write_bytes -= copied;
        total_copied += copied;

        /* Return to btrfs_file_aio_write to fault page */
        if (unlikely(copied == 0)) {
            break;
        }

        if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
            offset += copied;
        } else {
            pg++;
            offset = 0;
        }
    }
    return total_copied;
}
static int __blk_rq_map_user_iov(struct request *rq,
		struct rq_map_data *map_data, struct iov_iter *iter,
		gfp_t gfp_mask, bool copy)
{
	struct request_queue *q = rq->q;
	struct bio *bio, *orig_bio;
	int ret;

	if (copy)
		bio = bio_copy_user_iov(q, map_data, iter, gfp_mask);
	else
		bio = bio_map_user_iov(q, iter, gfp_mask);

	if (IS_ERR(bio))
		return PTR_ERR(bio);

	if (map_data && map_data->null_mapped)
		bio_set_flag(bio, BIO_NULL_MAPPED);

	iov_iter_advance(iter, bio->bi_iter.bi_size);
	if (map_data)
		map_data->offset += bio->bi_iter.bi_size;

	orig_bio = bio;
	blk_queue_bounce(q, &bio);

	/*
	 * We link the bounce buffer in and could have to traverse it
	 * later so we have to get a ref to prevent it from being freed
	 */
	bio_get(bio);

	ret = blk_rq_append_bio(q, rq, bio);
	if (ret) {
		bio_endio(bio);
		__blk_rq_unmap_user(orig_bio);
		bio_put(bio);
		return ret;
	}

	return 0;
}
Beispiel #9
0
static ssize_t
zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from)
{
	size_t count;
	ssize_t ret;
	uio_seg_t seg = UIO_USERSPACE;

#ifndef HAVE_GENERIC_WRITE_CHECKS_KIOCB
	struct file *file = kiocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct inode *ip = mapping->host;
	int isblk = S_ISBLK(ip->i_mode);

	count = iov_iter_count(from);
	ret = generic_write_checks(file, &kiocb->ki_pos, &count, isblk);
	if (ret)
		return (ret);
#else
	/*
	 * XXX - ideally this check should be in the same lock region with
	 * write operations, so that there's no TOCTTOU race when doing
	 * append and someone else grow the file.
	 */
	ret = generic_write_checks(kiocb, from);
	if (ret <= 0)
		return (ret);
	count = ret;
#endif

	if (from->type & ITER_KVEC)
		seg = UIO_SYSSPACE;
	if (from->type & ITER_BVEC)
		seg = UIO_BVEC;

	ret = zpl_iter_write_common(kiocb, from->iov, from->nr_segs,
	    count, seg, from->iov_offset);
	if (ret > 0)
		iov_iter_advance(from, ret);

	return (ret);
}
Beispiel #10
0
static ssize_t hypfs_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	int rc;
	struct super_block *sb = file_inode(iocb->ki_filp)->i_sb;
	struct hypfs_sb_info *fs_info = sb->s_fs_info;
	size_t count = iov_iter_count(from);

	/*
	 * Currently we only allow one update per second for two reasons:
	 * 1. diag 204 is VERY expensive
	 * 2. If several processes do updates in parallel and then read the
	 *    hypfs data, the likelihood of collisions is reduced, if we restrict
	 *    the minimum update interval. A collision occurs, if during the
	 *    data gathering of one process another process triggers an update
	 *    If the first process wants to ensure consistent data, it has
	 *    to restart data collection in this case.
	 */
	mutex_lock(&fs_info->lock);
	if (fs_info->last_update == get_seconds()) {
		rc = -EBUSY;
		goto out;
	}
	hypfs_delete_tree(sb->s_root);
	if (MACHINE_IS_VM)
		rc = hypfs_vm_create_files(sb->s_root);
	else
		rc = hypfs_diag_create_files(sb->s_root);
	if (rc) {
		pr_err("Updating the hypfs tree failed\n");
		hypfs_delete_tree(sb->s_root);
		goto out;
	}
	hypfs_update_update(sb);
	rc = count;
	iov_iter_advance(from, count);
out:
	mutex_unlock(&fs_info->lock);
	return rc;
}
Beispiel #11
0
static ssize_t
ll_direct_IO(
# ifndef HAVE_IOV_ITER_RW
    int rw,
# endif
    struct kiocb *iocb, struct iov_iter *iter,
    loff_t file_offset)
{
    struct ll_cl_context *lcc;
    const struct lu_env *env;
    struct cl_io *io;
    struct file *file = iocb->ki_filp;
    struct inode *inode = file->f_mapping->host;
    ssize_t count = iov_iter_count(iter);
    ssize_t tot_bytes = 0, result = 0;
    size_t size = MAX_DIO_SIZE;

    /* FIXME: io smaller than PAGE_SIZE is broken on ia64 ??? */
    if ((file_offset & ~PAGE_MASK) || (count & ~PAGE_MASK))
        return -EINVAL;

    CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), size=%zd (max %lu), "
           "offset=%lld=%llx, pages %zd (max %lu)\n",
           PFID(ll_inode2fid(inode)), inode, count, MAX_DIO_SIZE,
           file_offset, file_offset, count >> PAGE_SHIFT,
           MAX_DIO_SIZE >> PAGE_SHIFT);

    /* Check that all user buffers are aligned as well */
    if (iov_iter_alignment(iter) & ~PAGE_MASK)
        return -EINVAL;

    lcc = ll_cl_find(file);
    if (lcc == NULL)
        RETURN(-EIO);

    env = lcc->lcc_env;
    LASSERT(!IS_ERR(env));
    io = lcc->lcc_io;
    LASSERT(io != NULL);

    /* 0. Need locking between buffered and direct access. and race with
     *    size changing by concurrent truncates and writes.
     * 1. Need inode mutex to operate transient pages.
     */
    if (iov_iter_rw(iter) == READ)
        inode_lock(inode);

    while (iov_iter_count(iter)) {
        struct page **pages;
        size_t offs;

        count = min_t(size_t, iov_iter_count(iter), size);
        if (iov_iter_rw(iter) == READ) {
            if (file_offset >= i_size_read(inode))
                break;

            if (file_offset + count > i_size_read(inode))
                count = i_size_read(inode) - file_offset;
        }

        result = iov_iter_get_pages_alloc(iter, &pages, count, &offs);
        if (likely(result > 0)) {
            int n = DIV_ROUND_UP(result + offs, PAGE_SIZE);

            result = ll_direct_IO_seg(env, io, iov_iter_rw(iter),
                                      inode, result, file_offset,
                                      pages, n);
            ll_free_user_pages(pages, n,
                               iov_iter_rw(iter) == READ);

        }
        if (unlikely(result <= 0)) {
            /* If we can't allocate a large enough buffer
             * for the request, shrink it to a smaller
             * PAGE_SIZE multiple and try again.
             * We should always be able to kmalloc for a
             * page worth of page pointers = 4MB on i386. */
            if (result == -ENOMEM &&
                    size > (PAGE_SIZE / sizeof(*pages)) *
                    PAGE_SIZE) {
                size = ((((size / 2) - 1) |
                         ~PAGE_MASK) + 1) & PAGE_MASK;
                CDEBUG(D_VFSTRACE, "DIO size now %zu\n",
                       size);
                continue;
            }

            GOTO(out, result);
        }

        iov_iter_advance(iter, result);
        tot_bytes += result;
        file_offset += result;
    }
out:
    if (iov_iter_rw(iter) == READ)
        inode_unlock(inode);

    if (tot_bytes > 0) {
        struct vvp_io *vio = vvp_env_io(env);

        /* no commit async for direct IO */
        vio->u.write.vui_written += tot_bytes;
    }

    return tot_bytes ? : result;
}
Beispiel #12
0
/* Expects to be always run from workqueue - which acts as
 * read-size critical section for our kind of RCU. */
static void handle_rx(struct vhost_net *net)
{
	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_RX];
	struct vhost_virtqueue *vq = &nvq->vq;
	unsigned uninitialized_var(in), log;
	struct vhost_log *vq_log;
	struct msghdr msg = {
		.msg_name = NULL,
		.msg_namelen = 0,
		.msg_control = NULL, /* FIXME: get and handle RX aux data. */
		.msg_controllen = 0,
		.msg_flags = MSG_DONTWAIT,
	};
	struct virtio_net_hdr hdr = {
		.flags = 0,
		.gso_type = VIRTIO_NET_HDR_GSO_NONE
	};
	size_t total_len = 0;
	int err, mergeable;
	s16 headcount;
	size_t vhost_hlen, sock_hlen;
	size_t vhost_len, sock_len;
	struct socket *sock;
	struct iov_iter fixup;
	__virtio16 num_buffers;

	mutex_lock(&vq->mutex);
	sock = vq->private_data;
	if (!sock)
		goto out;
	vhost_disable_notify(&net->dev, vq);

	vhost_hlen = nvq->vhost_hlen;
	sock_hlen = nvq->sock_hlen;

	vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ?
		vq->log : NULL;
	mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF);

	while ((sock_len = peek_head_len(sock->sk))) {
		sock_len += sock_hlen;
		vhost_len = sock_len + vhost_hlen;
		headcount = get_rx_bufs(vq, vq->heads, vhost_len,
					&in, vq_log, &log,
					likely(mergeable) ? UIO_MAXIOV : 1);
		/* On error, stop handling until the next kick. */
		if (unlikely(headcount < 0))
			break;
		/* On overrun, truncate and discard */
		if (unlikely(headcount > UIO_MAXIOV)) {
			iov_iter_init(&msg.msg_iter, READ, vq->iov, 1, 1);
			err = sock->ops->recvmsg(sock, &msg,
						 1, MSG_DONTWAIT | MSG_TRUNC);
			pr_debug("Discarded rx packet: len %zd\n", sock_len);
			continue;
		}
		/* OK, now we need to know about added descriptors. */
		if (!headcount) {
			if (unlikely(vhost_enable_notify(&net->dev, vq))) {
				/* They have slipped one in as we were
				 * doing that: check again. */
				vhost_disable_notify(&net->dev, vq);
				continue;
			}
			/* Nothing new?  Wait for eventfd to tell us
			 * they refilled. */
			break;
		}
		/* We don't need to be notified again. */
		iov_iter_init(&msg.msg_iter, READ, vq->iov, in, vhost_len);
		fixup = msg.msg_iter;
		if (unlikely((vhost_hlen))) {
			/* We will supply the header ourselves
			 * TODO: support TSO.
			 */
			iov_iter_advance(&msg.msg_iter, vhost_hlen);
		}
		err = sock->ops->recvmsg(sock, &msg,
					 sock_len, MSG_DONTWAIT | MSG_TRUNC);
		/* Userspace might have consumed the packet meanwhile:
		 * it's not supposed to do this usually, but might be hard
		 * to prevent. Discard data we got (if any) and keep going. */
		if (unlikely(err != sock_len)) {
			pr_debug("Discarded rx packet: "
				 " len %d, expected %zd\n", err, sock_len);
			vhost_discard_vq_desc(vq, headcount);
			continue;
		}
		/* Supply virtio_net_hdr if VHOST_NET_F_VIRTIO_NET_HDR */
		if (unlikely(vhost_hlen)) {
			if (copy_to_iter(&hdr, sizeof(hdr),
					 &fixup) != sizeof(hdr)) {
				vq_err(vq, "Unable to write vnet_hdr "
				       "at addr %p\n", vq->iov->iov_base);
				break;
			}
		} else {
			/* Header came from socket; we'll need to patch
			 * ->num_buffers over if VIRTIO_NET_F_MRG_RXBUF
			 */
			iov_iter_advance(&fixup, sizeof(hdr));
		}
		/* TODO: Should check and handle checksum. */

		num_buffers = cpu_to_vhost16(vq, headcount);
		if (likely(mergeable) &&
		    copy_to_iter(&num_buffers, sizeof num_buffers,
				 &fixup) != sizeof num_buffers) {
			vq_err(vq, "Failed num_buffers write");
			vhost_discard_vq_desc(vq, headcount);
			break;
		}
		vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
					    headcount);
		if (unlikely(vq_log))
			vhost_log_write(vq, vq_log, log, vhost_len);
		total_len += vhost_len;
		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
			vhost_poll_queue(&vq->poll);
			break;
		}
	}
out:
	mutex_unlock(&vq->mutex);
}

static void handle_tx_kick(struct vhost_work *work)
{
	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
						  poll.work);
	struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);

	handle_tx(net);
}

static void handle_rx_kick(struct vhost_work *work)
{
	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
						  poll.work);
	struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);

	handle_rx(net);
}

static void handle_tx_net(struct vhost_work *work)
{
	struct vhost_net *net = container_of(work, struct vhost_net,
					     poll[VHOST_NET_VQ_TX].work);
	handle_tx(net);
}

static void handle_rx_net(struct vhost_work *work)
{
	struct vhost_net *net = container_of(work, struct vhost_net,
					     poll[VHOST_NET_VQ_RX].work);
	handle_rx(net);
}

static int vhost_net_open(struct inode *inode, struct file *f)
{
	struct vhost_net *n;
	struct vhost_dev *dev;
	struct vhost_virtqueue **vqs;
	int i;

	n = kmalloc(sizeof *n, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
	if (!n) {
		n = vmalloc(sizeof *n);
		if (!n)
			return -ENOMEM;
	}
	vqs = kmalloc(VHOST_NET_VQ_MAX * sizeof(*vqs), GFP_KERNEL);
	if (!vqs) {
		kvfree(n);
		return -ENOMEM;
	}

	dev = &n->dev;
	vqs[VHOST_NET_VQ_TX] = &n->vqs[VHOST_NET_VQ_TX].vq;
	vqs[VHOST_NET_VQ_RX] = &n->vqs[VHOST_NET_VQ_RX].vq;
	n->vqs[VHOST_NET_VQ_TX].vq.handle_kick = handle_tx_kick;
	n->vqs[VHOST_NET_VQ_RX].vq.handle_kick = handle_rx_kick;
	for (i = 0; i < VHOST_NET_VQ_MAX; i++) {
		n->vqs[i].ubufs = NULL;
		n->vqs[i].ubuf_info = NULL;
		n->vqs[i].upend_idx = 0;
		n->vqs[i].done_idx = 0;
		n->vqs[i].vhost_hlen = 0;
		n->vqs[i].sock_hlen = 0;
	}
	vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX);

	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev);
	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev);

	f->private_data = n;

	return 0;
}

static void vhost_net_disable_vq(struct vhost_net *n,
				 struct vhost_virtqueue *vq)
{
	struct vhost_net_virtqueue *nvq =
		container_of(vq, struct vhost_net_virtqueue, vq);
	struct vhost_poll *poll = n->poll + (nvq - n->vqs);
	if (!vq->private_data)
		return;
	vhost_poll_stop(poll);
}

static int vhost_net_enable_vq(struct vhost_net *n,
				struct vhost_virtqueue *vq)
{
	struct vhost_net_virtqueue *nvq =
		container_of(vq, struct vhost_net_virtqueue, vq);
	struct vhost_poll *poll = n->poll + (nvq - n->vqs);
	struct socket *sock;

	sock = vq->private_data;
	if (!sock)
		return 0;

	return vhost_poll_start(poll, sock->file);
}

static struct socket *vhost_net_stop_vq(struct vhost_net *n,
					struct vhost_virtqueue *vq)
{
	struct socket *sock;

	mutex_lock(&vq->mutex);
	sock = vq->private_data;
	vhost_net_disable_vq(n, vq);
	vq->private_data = NULL;
	mutex_unlock(&vq->mutex);
	return sock;
}

static void vhost_net_stop(struct vhost_net *n, struct socket **tx_sock,
			   struct socket **rx_sock)
{
	*tx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_TX].vq);
	*rx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_RX].vq);
}

static void vhost_net_flush_vq(struct vhost_net *n, int index)
{
	vhost_poll_flush(n->poll + index);
	vhost_poll_flush(&n->vqs[index].vq.poll);
}
Beispiel #13
0
/* Expects to be always run from workqueue - which acts as
 * read-size critical section for our kind of RCU. */
static void handle_tx(struct vhost_net *net)
{
	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
	struct vhost_virtqueue *vq = &nvq->vq;
	unsigned out, in;
	int head;
	struct msghdr msg = {
		.msg_name = NULL,
		.msg_namelen = 0,
		.msg_control = NULL,
		.msg_controllen = 0,
		.msg_flags = MSG_DONTWAIT,
	};
	size_t len, total_len = 0;
	int err;
	size_t hdr_size;
	struct socket *sock;
	struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
	bool zcopy, zcopy_used;

	mutex_lock(&vq->mutex);
	sock = vq->private_data;
	if (!sock)
		goto out;

	vhost_disable_notify(&net->dev, vq);

	hdr_size = nvq->vhost_hlen;
	zcopy = nvq->ubufs;

	for (;;) {
		/* Release DMAs done buffers first */
		if (zcopy)
			vhost_zerocopy_signal_used(net, vq);

		/* If more outstanding DMAs, queue the work.
		 * Handle upend_idx wrap around
		 */
		if (unlikely((nvq->upend_idx + vq->num - VHOST_MAX_PEND)
			      % UIO_MAXIOV == nvq->done_idx))
			break;

		head = vhost_get_vq_desc(vq, vq->iov,
					 ARRAY_SIZE(vq->iov),
					 &out, &in,
					 NULL, NULL);
		/* On error, stop handling until the next kick. */
		if (unlikely(head < 0))
			break;
		/* Nothing new?  Wait for eventfd to tell us they refilled. */
		if (head == vq->num) {
			if (unlikely(vhost_enable_notify(&net->dev, vq))) {
				vhost_disable_notify(&net->dev, vq);
				continue;
			}
			break;
		}
		if (in) {
			vq_err(vq, "Unexpected descriptor format for TX: "
			       "out %d, int %d\n", out, in);
			break;
		}
		/* Skip header. TODO: support TSO. */
		len = iov_length(vq->iov, out);
		iov_iter_init(&msg.msg_iter, WRITE, vq->iov, out, len);
		iov_iter_advance(&msg.msg_iter, hdr_size);
		/* Sanity check */
		if (!msg_data_left(&msg)) {
			vq_err(vq, "Unexpected header len for TX: "
			       "%zd expected %zd\n",
			       len, hdr_size);
			break;
		}
		len = msg_data_left(&msg);

		zcopy_used = zcopy && len >= VHOST_GOODCOPY_LEN
				   && (nvq->upend_idx + 1) % UIO_MAXIOV !=
				      nvq->done_idx
				   && vhost_net_tx_select_zcopy(net);

		/* use msg_control to pass vhost zerocopy ubuf info to skb */
		if (zcopy_used) {
			struct ubuf_info *ubuf;
			ubuf = nvq->ubuf_info + nvq->upend_idx;

			vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head);
			vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS;
			ubuf->callback = vhost_zerocopy_callback;
			ubuf->ctx = nvq->ubufs;
			ubuf->desc = nvq->upend_idx;
			msg.msg_control = ubuf;
			msg.msg_controllen = sizeof(ubuf);
			ubufs = nvq->ubufs;
			atomic_inc(&ubufs->refcount);
			nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
		} else {
			msg.msg_control = NULL;
			ubufs = NULL;
		}
		/* TODO: Check specific error and bomb out unless ENOBUFS? */
		err = sock->ops->sendmsg(sock, &msg, len);
		if (unlikely(err < 0)) {
			if (zcopy_used) {
				vhost_net_ubuf_put(ubufs);
				nvq->upend_idx = ((unsigned)nvq->upend_idx - 1)
					% UIO_MAXIOV;
			}
			vhost_discard_vq_desc(vq, 1);
			break;
		}
		if (err != len)
			pr_debug("Truncated TX packet: "
				 " len %d != %zd\n", err, len);
		if (!zcopy_used)
			vhost_add_used_and_signal(&net->dev, vq, head, 0);
		else
			vhost_zerocopy_signal_used(net, vq);
		total_len += len;
		vhost_net_tx_packet(net);
		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
			vhost_poll_queue(&vq->poll);
			break;
		}
	}
out:
	mutex_unlock(&vq->mutex);
}

static int peek_head_len(struct sock *sk)
{
	struct sk_buff *head;
	int len = 0;
	unsigned long flags;

	spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
	head = skb_peek(&sk->sk_receive_queue);
	if (likely(head)) {
		len = head->len;
		if (skb_vlan_tag_present(head))
			len += VLAN_HLEN;
	}

	spin_unlock_irqrestore(&sk->sk_receive_queue.lock, flags);
	return len;
}

/* This is a multi-buffer version of vhost_get_desc, that works if
 *	vq has read descriptors only.
 * @vq		- the relevant virtqueue
 * @datalen	- data length we'll be reading
 * @iovcount	- returned count of io vectors we fill
 * @log		- vhost log
 * @log_num	- log offset
 * @quota       - headcount quota, 1 for big buffer
 *	returns number of buffer heads allocated, negative on error
 */
static int get_rx_bufs(struct vhost_virtqueue *vq,
		       struct vring_used_elem *heads,
		       int datalen,
		       unsigned *iovcount,
		       struct vhost_log *log,
		       unsigned *log_num,
		       unsigned int quota)
{
	unsigned int out, in;
	int seg = 0;
	int headcount = 0;
	unsigned d;
	int r, nlogs = 0;
	/* len is always initialized before use since we are always called with
	 * datalen > 0.
	 */
	u32 uninitialized_var(len);

	while (datalen > 0 && headcount < quota) {
		if (unlikely(seg >= UIO_MAXIOV)) {
			r = -ENOBUFS;
			goto err;
		}
		r = vhost_get_vq_desc(vq, vq->iov + seg,
				      ARRAY_SIZE(vq->iov) - seg, &out,
				      &in, log, log_num);
		if (unlikely(r < 0))
			goto err;

		d = r;
		if (d == vq->num) {
			r = 0;
			goto err;
		}
		if (unlikely(out || in <= 0)) {
			vq_err(vq, "unexpected descriptor format for RX: "
				"out %d, in %d\n", out, in);
			r = -EINVAL;
			goto err;
		}
		if (unlikely(log)) {
			nlogs += *log_num;
			log += *log_num;
		}
		heads[headcount].id = cpu_to_vhost32(vq, d);
		len = iov_length(vq->iov + seg, in);
		heads[headcount].len = cpu_to_vhost32(vq, len);
		datalen -= len;
		++headcount;
		seg += in;
	}
	heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen);
	*iovcount = seg;
	if (unlikely(log))
		*log_num = nlogs;

	/* Detect overrun */
	if (unlikely(datalen > 0)) {
		r = UIO_MAXIOV + 1;
		goto err;
	}
	return headcount;
err:
	vhost_discard_vq_desc(vq, headcount);
	return r;
}
Beispiel #14
0
STATIC ssize_t
xfs_file_dio_aio_read(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct address_space	*mapping = iocb->ki_filp->f_mapping;
	struct inode		*inode = mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	loff_t			isize = i_size_read(inode);
	size_t			count = iov_iter_count(to);
	struct iov_iter		data;
	struct xfs_buftarg	*target;
	ssize_t			ret = 0;

	trace_xfs_file_direct_read(ip, count, iocb->ki_pos);

	if (!count)
		return 0; /* skip atime */

	if (XFS_IS_REALTIME_INODE(ip))
		target = ip->i_mount->m_rtdev_targp;
	else
		target = ip->i_mount->m_ddev_targp;

	/* DIO must be aligned to device logical sector size */
	if ((iocb->ki_pos | count) & target->bt_logical_sectormask) {
		if (iocb->ki_pos == isize)
			return 0;
		return -EINVAL;
	}

	file_accessed(iocb->ki_filp);

	/*
	 * Locking is a bit tricky here. If we take an exclusive lock for direct
	 * IO, we effectively serialise all new concurrent read IO to this file
	 * and block it behind IO that is currently in progress because IO in
	 * progress holds the IO lock shared. We only need to hold the lock
	 * exclusive to blow away the page cache, so only take lock exclusively
	 * if the page cache needs invalidation. This allows the normal direct
	 * IO case of no page cache pages to proceeed concurrently without
	 * serialisation.
	 */
	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
	if (mapping->nrpages) {
		xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
		xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);

		/*
		 * The generic dio code only flushes the range of the particular
		 * I/O. Because we take an exclusive lock here, this whole
		 * sequence is considerably more expensive for us. This has a
		 * noticeable performance impact for any file with cached pages,
		 * even when outside of the range of the particular I/O.
		 *
		 * Hence, amortize the cost of the lock against a full file
		 * flush and reduce the chances of repeated iolock cycles going
		 * forward.
		 */
		if (mapping->nrpages) {
			ret = filemap_write_and_wait(mapping);
			if (ret) {
				xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
				return ret;
			}

			/*
			 * Invalidate whole pages. This can return an error if
			 * we fail to invalidate a page, but this should never
			 * happen on XFS. Warn if it does fail.
			 */
			ret = invalidate_inode_pages2(mapping);
			WARN_ON_ONCE(ret);
			ret = 0;
		}
		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
	}

	data = *to;
	ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
			xfs_get_blocks_direct, NULL, NULL, 0);
	if (ret >= 0) {
		iocb->ki_pos += ret;
		iov_iter_advance(to, ret);
	}
	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);

	return ret;
}
Beispiel #15
0
/*
 * Completely synchronous read and write methods.  Direct from __user
 * buffer to osd, or directly to user pages (if O_DIRECT).
 *
 * If the read spans object boundary, just do multiple reads.
 */
static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
			      int *checkeof)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct page **pages;
	u64 off = iocb->ki_pos;
	int num_pages;
	ssize_t ret;
	size_t len = iov_iter_count(to);

	dout("sync_read on file %p %llu~%u %s\n", file, off,
	     (unsigned)len,
	     (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");

	if (!len)
		return 0;
	/*
	 * flush any page cache pages in this range.  this
	 * will make concurrent normal and sync io slow,
	 * but it will at least behave sensibly when they are
	 * in sequence.
	 */
	ret = filemap_write_and_wait_range(inode->i_mapping, off,
						off + len);
	if (ret < 0)
		return ret;

	if (unlikely(to->type & ITER_PIPE)) {
		size_t page_off;
		ret = iov_iter_get_pages_alloc(to, &pages, len,
					       &page_off);
		if (ret <= 0)
			return -ENOMEM;
		num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE);

		ret = striped_read(inode, off, ret, pages, num_pages,
				   page_off, checkeof);
		if (ret > 0) {
			iov_iter_advance(to, ret);
			off += ret;
		} else {
			iov_iter_advance(to, 0);
		}
		ceph_put_page_vector(pages, num_pages, false);
	} else {
		num_pages = calc_pages_for(off, len);
		pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
		if (IS_ERR(pages))
			return PTR_ERR(pages);

		ret = striped_read(inode, off, len, pages, num_pages,
				   (off & ~PAGE_MASK), checkeof);
		if (ret > 0) {
			int l, k = 0;
			size_t left = ret;

			while (left) {
				size_t page_off = off & ~PAGE_MASK;
				size_t copy = min_t(size_t, left,
						    PAGE_SIZE - page_off);
				l = copy_page_to_iter(pages[k++], page_off,
						      copy, to);
				off += l;
				left -= l;
				if (l < copy)
					break;
			}
		}
		ceph_release_page_vector(pages, num_pages);
	}

	if (off > iocb->ki_pos) {
		ret = off - iocb->ki_pos;
		iocb->ki_pos = off;
	}

	dout("sync_read result %zd\n", ret);
	return ret;
}