/* Caller should have device mutex */
void vhost_dev_cleanup(struct vhost_dev *dev)
{
	int i;

	for (i = 0; i < dev->nvqs; ++i) {
		if (dev->vqs[i].kick && dev->vqs[i].handle_kick) {
			vhost_poll_stop(&dev->vqs[i].poll);
			vhost_poll_flush(&dev->vqs[i].poll);
		}
		if (dev->vqs[i].error_ctx)
			eventfd_ctx_put(dev->vqs[i].error_ctx);
		if (dev->vqs[i].error)
			fput(dev->vqs[i].error);
		if (dev->vqs[i].kick)
			fput(dev->vqs[i].kick);
		if (dev->vqs[i].call_ctx)
			eventfd_ctx_put(dev->vqs[i].call_ctx);
		if (dev->vqs[i].call)
			fput(dev->vqs[i].call);
		vhost_vq_reset(dev, dev->vqs + i);
	}
	if (dev->log_ctx)
		eventfd_ctx_put(dev->log_ctx);
	dev->log_ctx = NULL;
	if (dev->log_file)
		fput(dev->log_file);
	dev->log_file = NULL;
	/* No one will access memory at this point */
	kfree(dev->memory);
	dev->memory = NULL;
	if (dev->mm)
		mmput(dev->mm);
	dev->mm = NULL;
}
static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev,
				      int vector, int fd, bool msix)
{
	struct pci_dev *pdev = vdev->pdev;
	int irq = msix ? vdev->msix[vector].vector : pdev->irq + vector;
	char *name = msix ? "vfio-msix" : "vfio-msi";
	struct eventfd_ctx *trigger;
	int ret;

	if (vector >= vdev->num_ctx)
		return -EINVAL;

	if (vdev->ctx[vector].trigger) {
		free_irq(irq, vdev->ctx[vector].trigger);
		kfree(vdev->ctx[vector].name);
		eventfd_ctx_put(vdev->ctx[vector].trigger);
		vdev->ctx[vector].trigger = NULL;
	}

	if (fd < 0)
		return 0;

	vdev->ctx[vector].name = kasprintf(GFP_KERNEL, "%s[%d](%s)",
					   name, vector, pci_name(pdev));
	if (!vdev->ctx[vector].name)
		return -ENOMEM;

	trigger = eventfd_ctx_fdget(fd);
	if (IS_ERR(trigger)) {
		kfree(vdev->ctx[vector].name);
		return PTR_ERR(trigger);
	}

	/*
	 * The MSIx vector table resides in device memory which may be cleared
	 * via backdoor resets.  We don't allow direct access to the vector
	 * table so even if a userspace driver attempts to save/restore around
	 * such a reset it would be unsuccessful.  To avoid this, restore the
	 * cached value of the message prior to enabling.
	 */
	if (msix) {
		struct msi_msg msg;

		get_cached_msi_msg(irq, &msg);
		write_msi_msg(irq, &msg);
	}

	ret = request_irq(irq, vfio_msihandler, 0,
			  vdev->ctx[vector].name, trigger);
	if (ret) {
		kfree(vdev->ctx[vector].name);
		eventfd_ctx_put(trigger);
		return ret;
	}

	vdev->ctx[vector].trigger = trigger;

	return 0;
}
static int vfio_intx_set_signal(struct vfio_pci_device *vdev, int fd)
{
	struct pci_dev *pdev = vdev->pdev;
	unsigned long irqflags = IRQF_SHARED;
	struct eventfd_ctx *trigger;
	unsigned long flags;
	int ret;

	if (vdev->ctx[0].trigger) {
		free_irq(pdev->irq, vdev);
		kfree(vdev->ctx[0].name);
		eventfd_ctx_put(vdev->ctx[0].trigger);
		vdev->ctx[0].trigger = NULL;
	}

	if (fd < 0) /* Disable only */
		return 0;

	vdev->ctx[0].name = kasprintf(GFP_KERNEL, "vfio-intx(%s)",
				      pci_name(pdev));
	if (!vdev->ctx[0].name)
		return -ENOMEM;

	trigger = eventfd_ctx_fdget(fd);
	if (IS_ERR(trigger)) {
		kfree(vdev->ctx[0].name);
		return PTR_ERR(trigger);
	}

	vdev->ctx[0].trigger = trigger;

	if (!vdev->pci_2_3)
		irqflags = 0;

	ret = request_irq(pdev->irq, vfio_intx_handler,
			  irqflags, vdev->ctx[0].name, vdev);
	if (ret) {
		vdev->ctx[0].trigger = NULL;
		kfree(vdev->ctx[0].name);
		eventfd_ctx_put(trigger);
		return ret;
	}

	/*
	 * INTx disable will stick across the new irq setup,
	 * disable_irq won't.
	 */
	spin_lock_irqsave(&vdev->irqlock, flags);
	if (!vdev->pci_2_3 && vdev->ctx[0].masked)
		disable_irq_nosync(pdev->irq);
	spin_unlock_irqrestore(&vdev->irqlock, flags);

	return 0;
}
static int vfio_pci_set_ctx_trigger_single(struct eventfd_ctx **ctx,
					   unsigned int count, uint32_t flags,
					   void *data)
{
	/* DATA_NONE/DATA_BOOL enables loopback testing */
	if (flags & VFIO_IRQ_SET_DATA_NONE) {
		if (*ctx) {
			if (count) {
				eventfd_signal(*ctx, 1);
			} else {
				eventfd_ctx_put(*ctx);
				*ctx = NULL;
			}
			return 0;
		}
	} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
		uint8_t trigger;

		if (!count)
			return -EINVAL;

		trigger = *(uint8_t *)data;
		if (trigger && *ctx)
			eventfd_signal(*ctx, 1);

		return 0;
	} else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
		int32_t fd;

		if (!count)
			return -EINVAL;

		fd = *(int32_t *)data;
		if (fd == -1) {
			if (*ctx)
				eventfd_ctx_put(*ctx);
			*ctx = NULL;
		} else if (fd >= 0) {
			struct eventfd_ctx *efdctx;

			efdctx = eventfd_ctx_fdget(fd);
			if (IS_ERR(efdctx))
				return PTR_ERR(efdctx);

			if (*ctx)
				eventfd_ctx_put(*ctx);

			*ctx = efdctx;
		}
		return 0;
	}

	return -EINVAL;
}
static int eventfd_release(struct inode *inode, struct file *file)
{
	struct eventfd_ctx *ctx = file->private_data;

	wake_up_poll(&ctx->wqh, POLLHUP);
	eventfd_ctx_put(ctx);
	return 0;
}
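/*
 * Context only, not part of the kernel sources above: a minimal userspace
 * sketch of the eventfd counter semantics these release/signal paths service.
 * write() adds to the 64-bit counter, read() returns and clears it (without
 * EFD_SEMAPHORE), and closing the last descriptor is what ends up in
 * eventfd_release()/eventfd_ctx_put() above.
 */
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/eventfd.h>

int main(void)
{
	uint64_t val = 1, out;
	int efd = eventfd(0, EFD_CLOEXEC);	/* counter starts at 0 */

	if (efd < 0)
		return 1;
	if (write(efd, &val, sizeof(val)) != sizeof(val))	/* signal: counter += 1 */
		perror("write");
	if (read(efd, &out, sizeof(out)) == sizeof(out))	/* consume: returns and clears the counter */
		printf("counter was %llu\n", (unsigned long long)out);
	close(efd);	/* drops the file reference; the kernel releases the ctx */
	return 0;
}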
static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev,
				      int vector, int fd, bool msix)
{
	struct pci_dev *pdev = vdev->pdev;
	int irq = msix ? vdev->msix[vector].vector : pdev->irq + vector;
	char *name = msix ? "vfio-msix" : "vfio-msi";
	struct eventfd_ctx *trigger;
	int ret;

	if (vector >= vdev->num_ctx)
		return -EINVAL;

	if (vdev->ctx[vector].trigger) {
		free_irq(irq, vdev->ctx[vector].trigger);
		kfree(vdev->ctx[vector].name);
		eventfd_ctx_put(vdev->ctx[vector].trigger);
		vdev->ctx[vector].trigger = NULL;
	}

	if (fd < 0)
		return 0;

	vdev->ctx[vector].name = kasprintf(GFP_KERNEL, "%s[%d](%s)",
					   name, vector, pci_name(pdev));
	if (!vdev->ctx[vector].name)
		return -ENOMEM;

	trigger = eventfd_ctx_fdget(fd);
	if (IS_ERR(trigger)) {
		kfree(vdev->ctx[vector].name);
		return PTR_ERR(trigger);
	}

	ret = request_irq(irq, vfio_msihandler, 0,
			  vdev->ctx[vector].name, trigger);
	if (ret) {
		kfree(vdev->ctx[vector].name);
		eventfd_ctx_put(trigger);
		return ret;
	}

	vdev->ctx[vector].trigger = trigger;

	return 0;
}
static void virqfd_shutdown(struct work_struct *work)
{
	struct virqfd *virqfd = container_of(work, struct virqfd, shutdown);
	u64 cnt;

	eventfd_ctx_remove_wait_queue(virqfd->eventfd, &virqfd->wait, &cnt);
	flush_work(&virqfd->inject);
	eventfd_ctx_put(virqfd->eventfd);

	kfree(virqfd);
}
static int vfio_pci_set_err_trigger(struct vfio_pci_device *vdev,
				    unsigned index, unsigned start,
				    unsigned count, uint32_t flags, void *data)
{
	int32_t fd;

	if ((index != VFIO_PCI_ERR_IRQ_INDEX) ||
	    !(flags & VFIO_IRQ_SET_DATA_TYPE_MASK))
		return -EINVAL;

	/* DATA_NONE/DATA_BOOL enables loopback testing */
	if (flags & VFIO_IRQ_SET_DATA_NONE) {
		if (vdev->err_trigger)
			eventfd_signal(vdev->err_trigger, 1);
		return 0;
	} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
		uint8_t trigger = *(uint8_t *)data;

		if (trigger && vdev->err_trigger)
			eventfd_signal(vdev->err_trigger, 1);
		return 0;
	}

	/* Handle SET_DATA_EVENTFD; only read the fd here, since data is not
	 * supplied for DATA_NONE. */
	fd = *(int32_t *)data;
	if (fd == -1) {
		if (vdev->err_trigger)
			eventfd_ctx_put(vdev->err_trigger);
		vdev->err_trigger = NULL;
		return 0;
	} else if (fd >= 0) {
		struct eventfd_ctx *efdctx;

		efdctx = eventfd_ctx_fdget(fd);
		if (IS_ERR(efdctx))
			return PTR_ERR(efdctx);

		if (vdev->err_trigger)
			eventfd_ctx_put(vdev->err_trigger);

		vdev->err_trigger = efdctx;
		return 0;
	} else
		return -EINVAL;
}
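/*
 * Context only, a hedged userspace sketch (not from the sources above):
 * registering an eventfd for the error interrupt via VFIO_DEVICE_SET_IRQS,
 * which is the path that lands in vfio_pci_set_err_trigger(). Constants and
 * the vfio_irq_set layout come from <linux/vfio.h>; "device_fd" is assumed
 * to be an already-open VFIO device file descriptor.
 */
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int register_err_eventfd(int device_fd)
{
	size_t sz = sizeof(struct vfio_irq_set) + sizeof(int32_t);
	struct vfio_irq_set *set;
	int32_t efd;
	int ret;

	efd = eventfd(0, 0);
	if (efd < 0)
		return -1;

	set = calloc(1, sz);
	if (!set) {
		close(efd);
		return -1;
	}

	set->argsz = sz;
	set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
	set->index = VFIO_PCI_ERR_IRQ_INDEX;
	set->start = 0;
	set->count = 1;
	memcpy(set->data, &efd, sizeof(efd));	/* one eventfd in the payload */

	ret = ioctl(device_fd, VFIO_DEVICE_SET_IRQS, set);
	free(set);
	/* On success, efd stays open for the caller to poll/read. */
	return ret;
}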
static int virqfd_enable(struct vfio_pci_device *vdev,
			 int (*handler)(struct vfio_pci_device *, void *),
			 void (*thread)(struct vfio_pci_device *, void *),
			 void *data, struct virqfd **pvirqfd, int fd)
{
	struct fd irqfd;
	struct eventfd_ctx *ctx;
	struct virqfd *virqfd;
	int ret = 0;
	unsigned int events;

	virqfd = kzalloc(sizeof(*virqfd), GFP_KERNEL);
	if (!virqfd)
		return -ENOMEM;

	virqfd->pvirqfd = pvirqfd;
	virqfd->vdev = vdev;
	virqfd->handler = handler;
	virqfd->thread = thread;
	virqfd->data = data;

	INIT_WORK(&virqfd->shutdown, virqfd_shutdown);
	INIT_WORK(&virqfd->inject, virqfd_inject);

	irqfd = fdget(fd);
	if (!irqfd.file) {
		ret = -EBADF;
		goto err_fd;
	}

	ctx = eventfd_ctx_fileget(irqfd.file);
	if (IS_ERR(ctx)) {
		ret = PTR_ERR(ctx);
		goto err_ctx;
	}

	virqfd->eventfd = ctx;

	/*
	 * virqfds can be released by closing the eventfd or directly
	 * through ioctl.  These are both done through a workqueue, so
	 * we update the pointer to the virqfd under lock to avoid
	 * pushing multiple jobs to release the same virqfd.
	 */
	spin_lock_irq(&vdev->irqlock);

	if (*pvirqfd) {
		spin_unlock_irq(&vdev->irqlock);
		ret = -EBUSY;
		goto err_busy;
	}
	*pvirqfd = virqfd;

	spin_unlock_irq(&vdev->irqlock);

	/*
	 * Install our own custom wake-up handling so we are notified via
	 * a callback whenever someone signals the underlying eventfd.
	 */
	init_waitqueue_func_entry(&virqfd->wait, virqfd_wakeup);
	init_poll_funcptr(&virqfd->pt, virqfd_ptable_queue_proc);

	events = irqfd.file->f_op->poll(irqfd.file, &virqfd->pt);

	/*
	 * Check if there was an event already pending on the eventfd
	 * before we registered and trigger it as if we didn't miss it.
	 */
	if (events & POLLIN) {
		if ((!handler || handler(vdev, data)) && thread)
			schedule_work(&virqfd->inject);
	}

	/*
	 * Do not drop the file until the irqfd is fully initialized,
	 * otherwise we might race against the POLLHUP.
	 */
	fdput(irqfd);

	return 0;
err_busy:
	eventfd_ctx_put(ctx);
err_ctx:
	fdput(irqfd);
err_fd:
	kfree(virqfd);

	return ret;
}
/* Caller must have device mutex */
long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	struct file *eventfp, *filep = NULL;
	struct eventfd_ctx *ctx = NULL;
	u64 p;
	long r;
	int i, fd;

	/* If you are not the owner, you can become one */
	if (ioctl == VHOST_SET_OWNER) {
		r = vhost_dev_set_owner(d);
		goto done;
	}

	/* You must be the owner to do anything else */
	r = vhost_dev_check_owner(d);
	if (r)
		goto done;

	switch (ioctl) {
	case VHOST_SET_MEM_TABLE:
		r = vhost_set_memory(d, argp);
		break;
	case VHOST_SET_LOG_BASE:
		/* copy_from_user() returns the number of bytes left to copy,
		 * never a negative value, so any non-zero return is a fault. */
		if (copy_from_user(&p, argp, sizeof p)) {
			r = -EFAULT;
			break;
		}
		if ((u64)(unsigned long)p != p) {
			r = -EFAULT;
			break;
		}
		for (i = 0; i < d->nvqs; ++i) {
			struct vhost_virtqueue *vq;
			void __user *base = (void __user *)(unsigned long)p;

			vq = d->vqs + i;
			mutex_lock(&vq->mutex);
			/* If ring is inactive, will check when it's enabled. */
			if (vq->private_data && !vq_log_access_ok(vq, base))
				r = -EFAULT;
			else
				vq->log_base = base;
			mutex_unlock(&vq->mutex);
		}
		break;
	case VHOST_SET_LOG_FD:
		r = get_user(fd, (int __user *)argp);
		if (r < 0)
			break;
		eventfp = fd == -1 ? NULL : eventfd_fget(fd);
		if (IS_ERR(eventfp)) {
			r = PTR_ERR(eventfp);
			break;
		}
		if (eventfp != d->log_file) {
			filep = d->log_file;
			d->log_file = eventfp;
			ctx = d->log_ctx;
			d->log_ctx = eventfp ?
				eventfd_ctx_fileget(eventfp) : NULL;
		} else
			filep = eventfp;
		for (i = 0; i < d->nvqs; ++i) {
			mutex_lock(&d->vqs[i].mutex);
			d->vqs[i].log_ctx = d->log_ctx;
			mutex_unlock(&d->vqs[i].mutex);
		}
		if (ctx)
			eventfd_ctx_put(ctx);
		if (filep)
			fput(filep);
		break;
	default:
		r = vhost_set_vring(d, ioctl, argp);
		break;
	}
done:
	return r;
}
static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp)
{
	struct file *eventfp, *filep = NULL,
		    *pollstart = NULL, *pollstop = NULL;
	struct eventfd_ctx *ctx = NULL;
	u32 __user *idxp = argp;
	struct vhost_virtqueue *vq;
	struct vhost_vring_state s;
	struct vhost_vring_file f;
	struct vhost_vring_addr a;
	u32 idx;
	long r;

	r = get_user(idx, idxp);
	if (r < 0)
		return r;
	/* idx indexes d->vqs[], so idx == d->nvqs is already out of range. */
	if (idx >= d->nvqs)
		return -ENOBUFS;

	vq = d->vqs + idx;

	mutex_lock(&vq->mutex);

	switch (ioctl) {
	case VHOST_SET_VRING_NUM:
		/* Resizing ring with an active backend?
		 * You don't want to do that. */
		if (vq->private_data) {
			r = -EBUSY;
			break;
		}
		/* copy_from_user() returns bytes not copied, never a negative
		 * value, so any non-zero return is a fault. */
		if (copy_from_user(&s, argp, sizeof s)) {
			r = -EFAULT;
			break;
		}
		if (!s.num || s.num > 0xffff || (s.num & (s.num - 1))) {
			r = -EINVAL;
			break;
		}
		vq->num = s.num;
		break;
	case VHOST_SET_VRING_BASE:
		/* Moving base with an active backend?
		 * You don't want to do that. */
		if (vq->private_data) {
			r = -EBUSY;
			break;
		}
		if (copy_from_user(&s, argp, sizeof s)) {
			r = -EFAULT;
			break;
		}
		if (s.num > 0xffff) {
			r = -EINVAL;
			break;
		}
		vq->last_avail_idx = s.num;
		/* Forget the cached index value. */
		vq->avail_idx = vq->last_avail_idx;
		break;
	case VHOST_GET_VRING_BASE:
		s.index = idx;
		s.num = vq->last_avail_idx;
		if (copy_to_user(argp, &s, sizeof s))
			r = -EFAULT;
		break;
	case VHOST_SET_VRING_ADDR:
		if (copy_from_user(&a, argp, sizeof a)) {
			r = -EFAULT;
			break;
		}
		if (a.flags & ~(0x1 << VHOST_VRING_F_LOG)) {
			r = -EOPNOTSUPP;
			break;
		}
		/* For 32bit, verify that the top 32bits of the user
		 * data are set to zero. */
		if ((u64)(unsigned long)a.desc_user_addr != a.desc_user_addr ||
		    (u64)(unsigned long)a.used_user_addr != a.used_user_addr ||
		    (u64)(unsigned long)a.avail_user_addr != a.avail_user_addr) {
			r = -EFAULT;
			break;
		}
		if ((a.avail_user_addr & (sizeof *vq->avail->ring - 1)) ||
		    (a.used_user_addr & (sizeof *vq->used->ring - 1)) ||
		    (a.log_guest_addr & (sizeof *vq->used->ring - 1))) {
			r = -EINVAL;
			break;
		}

		/* We only verify access here if backend is configured.
		 * If it is not, we don't as size might not have been setup.
		 * We will verify when backend is configured. */
		if (vq->private_data) {
			if (!vq_access_ok(vq->num,
				(void __user *)(unsigned long)a.desc_user_addr,
				(void __user *)(unsigned long)a.avail_user_addr,
				(void __user *)(unsigned long)a.used_user_addr)) {
				r = -EINVAL;
				break;
			}

			/* Also validate log access for used ring if enabled. */
			if ((a.flags & (0x1 << VHOST_VRING_F_LOG)) &&
			    !log_access_ok(vq->log_base, a.log_guest_addr,
					   sizeof *vq->used +
					   vq->num * sizeof *vq->used->ring)) {
				r = -EINVAL;
				break;
			}
		}

		r = init_used(vq, (struct vring_used __user *)(unsigned long)
			      a.used_user_addr);
		if (r)
			break;
		vq->log_used = !!(a.flags & (0x1 << VHOST_VRING_F_LOG));
		vq->desc = (void __user *)(unsigned long)a.desc_user_addr;
		vq->avail = (void __user *)(unsigned long)a.avail_user_addr;
		vq->log_addr = a.log_guest_addr;
		vq->used = (void __user *)(unsigned long)a.used_user_addr;
		break;
	case VHOST_SET_VRING_KICK:
		if (copy_from_user(&f, argp, sizeof f)) {
			r = -EFAULT;
			break;
		}
		eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd);
		if (IS_ERR(eventfp)) {
			r = PTR_ERR(eventfp);
			break;
		}
		if (eventfp != vq->kick) {
			pollstop = filep = vq->kick;
			pollstart = vq->kick = eventfp;
		} else
			filep = eventfp;
		break;
	case VHOST_SET_VRING_CALL:
		if (copy_from_user(&f, argp, sizeof f)) {
			r = -EFAULT;
			break;
		}
		eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd);
		if (IS_ERR(eventfp)) {
			r = PTR_ERR(eventfp);
			break;
		}
		if (eventfp != vq->call) {
			filep = vq->call;
			ctx = vq->call_ctx;
			vq->call = eventfp;
			vq->call_ctx = eventfp ?
				eventfd_ctx_fileget(eventfp) : NULL;
		} else
			filep = eventfp;
		break;
	case VHOST_SET_VRING_ERR:
		if (copy_from_user(&f, argp, sizeof f)) {
			r = -EFAULT;
			break;
		}
		eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd);
		if (IS_ERR(eventfp)) {
			r = PTR_ERR(eventfp);
			break;
		}
		if (eventfp != vq->error) {
			filep = vq->error;
			vq->error = eventfp;
			ctx = vq->error_ctx;
			vq->error_ctx = eventfp ?
				eventfd_ctx_fileget(eventfp) : NULL;
		} else
			filep = eventfp;
		break;
	default:
		r = -ENOIOCTLCMD;
	}

	if (pollstop && vq->handle_kick)
		vhost_poll_stop(&vq->poll);

	if (ctx)
		eventfd_ctx_put(ctx);
	if (filep)
		fput(filep);

	if (pollstart && vq->handle_kick)
		vhost_poll_start(&vq->poll, vq->kick);

	mutex_unlock(&vq->mutex);

	if (pollstop && vq->handle_kick)
		vhost_poll_flush(&vq->poll);

	return r;
}
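/*
 * Context only, a hedged userspace sketch (not from the sources above):
 * handing a "call" eventfd to vhost with VHOST_SET_VRING_CALL, the ioctl
 * handled by the VHOST_SET_VRING_CALL case in vhost_set_vring() above.
 * struct vhost_vring_file and the ioctl number come from <linux/vhost.h>;
 * "vhost_fd" is assumed to be an open /dev/vhost-net descriptor that has
 * already completed VHOST_SET_OWNER.
 */
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>

static int set_vring_call(int vhost_fd, unsigned int queue_index)
{
	struct vhost_vring_file file = {
		.index = queue_index,
		.fd = eventfd(0, 0),	/* pass -1 instead to unbind the call eventfd */
	};

	if (file.fd < 0)
		return -1;

	/* The kernel takes its own reference via eventfd_ctx_fileget();
	 * userspace keeps file.fd open to poll for call notifications. */
	return ioctl(vhost_fd, VHOST_SET_VRING_CALL, &file);
}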