/*
 * Handle a vring-address request.
 *
 * Copies the ring flags and log address from the message into the selected
 * virtqueue and translates the descriptor, used and avail addresses with
 * qva_to_va().  Finally seeds vq->used_idx from the used ring.
 *
 * The device is panicked (vu_panic) when the queue index is out of range or
 * when any of the three ring addresses fails to translate.
 *
 * Always returns false.
 */
static bool
vu_set_vring_addr_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    struct vhost_vring_addr *vra = &vmsg->payload.addr;
    unsigned int index = vra->index;
    VuVirtq *vq;

    DPRINT("vhost_vring_addr:\n");
    DPRINT(" index: %d\n", vra->index);
    DPRINT(" flags: %d\n", vra->flags);
    DPRINT(" desc_user_addr: 0x%016llx\n", vra->desc_user_addr);
    DPRINT(" used_user_addr: 0x%016llx\n", vra->used_user_addr);
    DPRINT(" avail_user_addr: 0x%016llx\n", vra->avail_user_addr);
    DPRINT(" log_guest_addr: 0x%016llx\n", vra->log_guest_addr);

    /*
     * The index comes straight from the message payload; reject it before
     * it is used to address dev->vq, as vu_set_vring_enable_exec() does.
     */
    if (index >= VHOST_MAX_NR_VIRTQUEUE) {
        vu_panic(dev, "Invalid vring_addr index: %u", index);
        return false;
    }
    vq = &dev->vq[index];

    vq->vring.flags = vra->flags;
    vq->vring.desc = qva_to_va(dev, vra->desc_user_addr);
    vq->vring.used = qva_to_va(dev, vra->used_user_addr);
    vq->vring.avail = qva_to_va(dev, vra->avail_user_addr);
    vq->vring.log_guest_addr = vra->log_guest_addr;

    DPRINT("Setting virtq addresses:\n");
    DPRINT(" vring_desc at %p\n", vq->vring.desc);
    DPRINT(" vring_used at %p\n", vq->vring.used);
    DPRINT(" vring_avail at %p\n", vq->vring.avail);

    /* A NULL pointer here means the address did not translate. */
    if (!(vq->vring.desc && vq->vring.used && vq->vring.avail)) {
        vu_panic(dev, "Invalid vring_addr message");
        return false;
    }

    vq->used_idx = vq->vring.used->idx;

    return false;
}
/*
 * Handle a log-base request: map the dirty-log area described by the
 * message (one file descriptor plus mmap offset/size) and install it as
 * dev->log_table / dev->log_size.
 *
 * The received descriptor is closed once mmap() has been attempted, and a
 * previously installed log mapping is unmapped before being replaced.  When
 * mmap() fails the device is panicked and the current log table is left
 * untouched, so MAP_FAILED is never stored as a log pointer.
 *
 * Returns true on every path; on the mapping paths vmsg->size is first set
 * to sizeof(vmsg->payload.u64).
 */
static bool
vu_set_log_base_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    int fd;
    int mmap_errno;
    uint64_t log_mmap_size, log_mmap_offset;
    void *rc;

    if (vmsg->fd_num != 1 ||
        vmsg->size != sizeof(vmsg->payload.log)) {
        vu_panic(dev, "Invalid log_base message");
        return true;
    }

    fd = vmsg->fds[0];
    log_mmap_offset = vmsg->payload.log.mmap_offset;
    log_mmap_size = vmsg->payload.log.mmap_size;
    DPRINT("Log mmap_offset: %"PRId64"\n", log_mmap_offset);
    DPRINT("Log mmap_size: %"PRId64"\n", log_mmap_size);

    rc = mmap(0, log_mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd,
              log_mmap_offset);
    /* close() may clobber errno, so remember mmap()'s verdict first. */
    mmap_errno = errno;
    /* A mapping outlives its descriptor; keeping fd open would leak it. */
    close(fd);

    if (rc == MAP_FAILED) {
        vu_panic(dev, "log mmap error: %s", strerror(mmap_errno));
    } else {
        if (dev->log_table) {
            munmap(dev->log_table, dev->log_size);
        }
        dev->log_table = rc;
        dev->log_size = log_mmap_size;
    }

    vmsg->size = sizeof(vmsg->payload.u64);

    return true;
}
/*
 * Handle a memory-table request: drop the current guest memory mappings
 * and map every region described in the message.
 *
 * The message is validated before anything is torn down: the region count
 * must not exceed VHOST_MEMORY_MAX_NREGIONS and must match the number of
 * received file descriptors, because fds[i] is consumed for region i.
 * (NOTE(review): this assumes dev->regions and vmsg->fds hold
 * VHOST_MEMORY_MAX_NREGIONS entries - confirm against the struct
 * definitions.)
 *
 * Each region's descriptor is closed after its mmap() attempt.  A failed
 * mmap() panics the device and leaves that region's mmap_addr at 0 so a
 * later table update does not munmap() a stale address.
 *
 * Always returns false.
 */
static bool
vu_set_mem_table_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    int i;
    VhostUserMemory *memory = &vmsg->payload.memory;

    if (memory->nregions > VHOST_MEMORY_MAX_NREGIONS ||
        vmsg->fd_num != (int)memory->nregions) {
        vmsg_close_fds(vmsg);
        vu_panic(dev, "Invalid mem_table message: %u regions, %d fds",
                 memory->nregions, vmsg->fd_num);
        return false;
    }

    /* Release whatever the previous table had mapped. */
    for (i = 0; i < dev->nregions; i++) {
        VuDevRegion *r = &dev->regions[i];
        void *m = (void *) (uintptr_t) r->mmap_addr;

        if (m) {
            munmap(m, r->size + r->mmap_offset);
            r->mmap_addr = 0;
        }
    }

    dev->nregions = memory->nregions;
    DPRINT("Nregions: %d\n", memory->nregions);
    for (i = 0; i < dev->nregions; i++) {
        void *mmap_addr;
        VhostUserMemoryRegion *msg_region = &memory->regions[i];
        VuDevRegion *dev_region = &dev->regions[i];

        DPRINT("Region %d\n", i);
        DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n",
               msg_region->guest_phys_addr);
        DPRINT(" memory_size: 0x%016"PRIx64"\n",
               msg_region->memory_size);
        DPRINT(" userspace_addr 0x%016"PRIx64"\n",
               msg_region->userspace_addr);
        DPRINT(" mmap_offset 0x%016"PRIx64"\n",
               msg_region->mmap_offset);

        dev_region->gpa = msg_region->guest_phys_addr;
        dev_region->size = msg_region->memory_size;
        dev_region->qva = msg_region->userspace_addr;
        dev_region->mmap_offset = msg_region->mmap_offset;
        /* Stays 0 unless the mapping below succeeds. */
        dev_region->mmap_addr = 0;

        /* We don't use offset argument of mmap() since the
         * mapped address has to be page aligned, and we use huge
         * pages. */
        mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
                         PROT_READ | PROT_WRITE, MAP_SHARED,
                         vmsg->fds[i], 0);

        if (mmap_addr == MAP_FAILED) {
            vu_panic(dev, "region mmap error: %s", strerror(errno));
        } else {
            dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr;
            DPRINT(" mmap_addr: 0x%016"PRIx64"\n",
                   dev_region->mmap_addr);
        }

        close(vmsg->fds[i]);
    }

    return false;
}
/*
 * Validate a queue message that is expected to carry exactly one file
 * descriptor.
 *
 * The queue index is taken from the low bits of payload.u64
 * (VHOST_USER_VRING_IDX_MASK).  The message is rejected when that index is
 * not below VHOST_MAX_NR_VIRTQUEUE, when the VHOST_USER_VRING_NOFD_MASK bit
 * is set, or when the descriptor count is not 1.  On rejection the attached
 * descriptors are closed and the device is panicked.
 *
 * Returns true when the message is acceptable, false otherwise.
 */
static bool
vu_check_queue_msg_file(VuDev *dev, VhostUserMsg *vmsg)
{
    int qidx = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;

    if (qidx < VHOST_MAX_NR_VIRTQUEUE) {
        if (!(vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK) &&
            vmsg->fd_num == 1) {
            return true;
        }

        vmsg_close_fds(vmsg);
        vu_panic(dev, "Invalid fds in request: %d", vmsg->request);
        return false;
    }

    vmsg_close_fds(vmsg);
    vu_panic(dev, "Invalid queue index: %u", qidx);
    return false;
}
/*
 * Handle a vring-address request.
 *
 * Copies the ring flags and log address from the message into the selected
 * virtqueue and translates the descriptor, used and avail addresses with
 * qva_to_va().  vq->used_idx is then seeded from the used ring; if it
 * disagrees with vq->last_avail_idx and the interface's optional
 * queue_is_processed_in_order() hook returns true for this queue, both
 * avail indexes are reset to the used index.
 *
 * The device is panicked (vu_panic) when the queue index is out of range or
 * when any of the three ring addresses fails to translate.
 *
 * Always returns false.
 */
static bool
vu_set_vring_addr_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    struct vhost_vring_addr *vra = &vmsg->payload.addr;
    unsigned int index = vra->index;
    VuVirtq *vq;

    DPRINT("vhost_vring_addr:\n");
    DPRINT(" index: %d\n", vra->index);
    DPRINT(" flags: %d\n", vra->flags);
    DPRINT(" desc_user_addr: 0x%016llx\n", vra->desc_user_addr);
    DPRINT(" used_user_addr: 0x%016llx\n", vra->used_user_addr);
    DPRINT(" avail_user_addr: 0x%016llx\n", vra->avail_user_addr);
    DPRINT(" log_guest_addr: 0x%016llx\n", vra->log_guest_addr);

    /*
     * The index comes straight from the message payload; reject it before
     * it is used to address dev->vq, as vu_set_vring_enable_exec() does.
     */
    if (index >= VHOST_MAX_NR_VIRTQUEUE) {
        vu_panic(dev, "Invalid vring_addr index: %u", index);
        return false;
    }
    vq = &dev->vq[index];

    vq->vring.flags = vra->flags;
    vq->vring.desc = qva_to_va(dev, vra->desc_user_addr);
    vq->vring.used = qva_to_va(dev, vra->used_user_addr);
    vq->vring.avail = qva_to_va(dev, vra->avail_user_addr);
    vq->vring.log_guest_addr = vra->log_guest_addr;

    DPRINT("Setting virtq addresses:\n");
    DPRINT(" vring_desc at %p\n", vq->vring.desc);
    DPRINT(" vring_used at %p\n", vq->vring.used);
    DPRINT(" vring_avail at %p\n", vq->vring.avail);

    /* A NULL pointer here means the address did not translate. */
    if (!(vq->vring.desc && vq->vring.used && vq->vring.avail)) {
        vu_panic(dev, "Invalid vring_addr message");
        return false;
    }

    vq->used_idx = vq->vring.used->idx;

    if (vq->last_avail_idx != vq->used_idx) {
        /* The hook is optional; without it the indexes are left alone. */
        bool resume = dev->iface->queue_is_processed_in_order &&
            dev->iface->queue_is_processed_in_order(dev, index);

        DPRINT("Last avail index != used index: %u != %u%s\n",
               vq->last_avail_idx, vq->used_idx,
               resume ? ", resuming" : "");

        if (resume) {
            vq->shadow_avail_idx = vq->last_avail_idx = vq->used_idx;
        }
    }

    return false;
}
/*
 * Install the slave request channel descriptor carried by the message.
 *
 * Exactly one descriptor must be attached; otherwise the device is
 * panicked.  A previously stored dev->slave_fd (anything other than -1) is
 * closed before being replaced.
 *
 * Always returns false.
 */
static bool
vu_set_slave_req_fd(VuDev *dev, VhostUserMsg *vmsg)
{
    if (vmsg->fd_num == 1) {
        if (dev->slave_fd != -1) {
            close(dev->slave_fd);
        }
        dev->slave_fd = vmsg->fds[0];
        DPRINT("Got slave_fd: %d\n", vmsg->fds[0]);
    } else {
        vu_panic(dev, "Invalid slave_req_fd message (%d fd's)", vmsg->fd_num);
    }

    return false;
}
/*
 * Install the log notification descriptor carried by the message.
 *
 * Exactly one descriptor must be attached; otherwise the device is
 * panicked.  A previously stored dev->log_call_fd (anything other than -1)
 * is closed before being replaced.
 *
 * Always returns false.
 */
static bool
vu_set_log_fd_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    if (vmsg->fd_num == 1) {
        if (dev->log_call_fd != -1) {
            close(dev->log_call_fd);
        }
        dev->log_call_fd = vmsg->fds[0];
        DPRINT("Got log_call_fd: %d\n", vmsg->fds[0]);
    } else {
        vu_panic(dev, "Invalid log_fd message");
    }

    return false;
}
/*
 * Fetch the descriptor head stored in avail-ring slot (idx % vring.num).
 *
 * @head receives the descriptor number read from the ring.  A value that
 * is not below vq->vring.num is treated as fatal: the device is panicked
 * and false is returned.  Returns true otherwise.
 */
static bool
virtqueue_get_head(VuDev *dev, VuVirtq *vq,
                   unsigned int idx, unsigned int *head)
{
    /* Grab the next descriptor number they're advertising. */
    *head = vring_avail_ring(vq, idx % vq->vring.num);

    /* If their number is silly, that's a fatal mistake. */
    if (*head >= vq->vring.num) {
        /* Report the offending value, not the address of the out-param. */
        vu_panic(dev, "Guest says index %u is available", *head);
        return false;
    }

    return true;
}
/*
 * Forward a set-config request to the interface's set_config hook.
 *
 * The hook is optional; when it is absent the request is ignored.  A
 * non-zero result from the hook panics the device.
 *
 * Always returns false.
 */
static bool
vu_set_config(VuDev *dev, VhostUserMsg *vmsg)
{
    if (!dev->iface->set_config) {
        return false;
    }

    if (dev->iface->set_config(dev, vmsg->payload.config.region,
                               vmsg->payload.config.offset,
                               vmsg->payload.config.size,
                               vmsg->payload.config.flags) != 0) {
        vu_panic(dev, "Set virtio configuration space failed");
    }

    return false;
}
/*
 * Handle a vring-enable request: store payload.state.num as the enable
 * state of the queue selected by payload.state.index.
 *
 * An index that is not below VHOST_MAX_NR_VIRTQUEUE panics the device and
 * leaves every queue untouched.
 *
 * Always returns false.
 */
static bool
vu_set_vring_enable_exec(VuDev *dev, VhostUserMsg *vmsg)
{
    unsigned int qidx = vmsg->payload.state.index;
    unsigned int state = vmsg->payload.state.num;

    DPRINT("State.index: %d\n", qidx);
    DPRINT("State.enable: %d\n", state);

    if (qidx < VHOST_MAX_NR_VIRTQUEUE) {
        dev->vq[qidx].enable = state;
    } else {
        vu_panic(dev, "Invalid vring_enable index: %u", qidx);
    }

    return false;
}
/*
 * Number of avail-ring entries between idx and the current avail index,
 * computed in 16-bit wrap-around arithmetic.
 *
 * Returns that count, or -1 after panicking the device when the count
 * exceeds the ring size.  A read barrier is issued whenever the count is
 * non-zero so that the caller's descriptor reads cannot be reordered
 * before the avail-index read.
 */
static int
virtqueue_num_heads(VuDev *dev, VuVirtq *vq, unsigned int idx)
{
    uint16_t pending = vring_avail_idx(vq) - idx;

    /* Check it isn't doing very strange things with descriptor numbers. */
    if (pending <= vq->vring.num) {
        if (pending != 0) {
            /* On success, callers read a descriptor at vq->last_avail_idx.
             * Make sure descriptor read does not bypass avail index read. */
            smp_rmb();
        }
        return pending;
    }

    vu_panic(dev, "Guest moved used index from %u to %u",
             idx, vq->shadow_avail_idx);
    return -1;
}
/*
 * Watch callback for a virtqueue's kick eventfd.
 *
 * @data carries the queue index as an integer packed into the pointer.
 * The eventfd counter is consumed; on a read error the device is panicked
 * and the watch on the kick descriptor is removed.  Otherwise the queue's
 * handler, if one is set, is invoked with the queue index.
 */
static void
vu_kick_cb(VuDev *dev, int condition, void *data)
{
    int qidx = (intptr_t)data;
    VuVirtq *vq = &dev->vq[qidx];
    eventfd_t kick_data;

    if (eventfd_read(vq->kick_fd, &kick_data) == -1) {
        vu_panic(dev, "kick eventfd_read(): %s", strerror(errno));
        dev->remove_watch(dev, dev->vq[qidx].kick_fd);
        return;
    }

    DPRINT("Got kick_data: %016"PRIx64" handler:%p idx:%d\n",
           kick_data, vq->handler, qidx);
    if (vq->handler) {
        vq->handler(dev, qidx);
    }
}
/*
 * Follow the chain link of descriptor desc[i].
 *
 * Returns VIRTQUEUE_READ_DESC_DONE when the descriptor does not have
 * VRING_DESC_F_NEXT set (*next is left untouched), VIRTQUEUE_READ_DESC_MORE
 * with *next holding the following index when the link is below @max, and
 * VIRTQUEUE_READ_DESC_ERROR after panicking the device when the link points
 * at or past @max.
 */
static int
virtqueue_read_next_desc(VuDev *dev, struct vring_desc *desc,
                         int i, unsigned int max, unsigned int *next)
{
    /* If this descriptor says it doesn't chain, we're done. */
    if (!(desc[i].flags & VRING_DESC_F_NEXT)) {
        return VIRTQUEUE_READ_DESC_DONE;
    }

    /* Check they're not leading us off end of descriptors. */
    *next = desc[i].next;
    /* Make sure compiler knows to grab that: we don't want it changing! */
    smp_wmb();

    if (*next >= max) {
        /* Report the offending index, not the address of the out-param. */
        vu_panic(dev, "Desc next is %u", *next);
        return VIRTQUEUE_READ_DESC_ERROR;
    }

    return VIRTQUEUE_READ_DESC_MORE;
}
/*
 * Dispatch one received vhost-user message.
 *
 * After dumping the generic header fields (and any attached descriptors)
 * through DPRINT, the interface's optional process_msg hook gets first
 * refusal: when it returns non-zero, its do_reply output is returned and
 * the built-in handlers are skipped.  Otherwise the request code selects
 * one of the built-in handlers and that handler's result is returned.
 *
 * VHOST_USER_NONE is accepted and ignored.  Any other unknown request has
 * its descriptors closed and panics the device.  Both cases return false.
 */
static bool
vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
{
    int do_reply = 0;
    bool reply = false;

    /* Print out generic part of the request. */
    DPRINT("================ Vhost user message ================\n");
    DPRINT("Request: %s (%d)\n", vu_request_to_string(vmsg->request),
           vmsg->request);
    DPRINT("Flags: 0x%x\n", vmsg->flags);
    DPRINT("Size: %d\n", vmsg->size);

    if (vmsg->fd_num) {
        int n;

        DPRINT("Fds:");
        for (n = 0; n < vmsg->fd_num; n++) {
            DPRINT(" %d", vmsg->fds[n]);
        }
        DPRINT("\n");
    }

    /* Let the interface override the built-in handling. */
    if (dev->iface->process_msg &&
        dev->iface->process_msg(dev, vmsg, &do_reply)) {
        return do_reply;
    }

    switch (vmsg->request) {
    case VHOST_USER_GET_FEATURES:
        reply = vu_get_features_exec(dev, vmsg);
        break;
    case VHOST_USER_SET_FEATURES:
        reply = vu_set_features_exec(dev, vmsg);
        break;
    case VHOST_USER_GET_PROTOCOL_FEATURES:
        reply = vu_get_protocol_features_exec(dev, vmsg);
        break;
    case VHOST_USER_SET_PROTOCOL_FEATURES:
        reply = vu_set_protocol_features_exec(dev, vmsg);
        break;
    case VHOST_USER_SET_OWNER:
        reply = vu_set_owner_exec(dev, vmsg);
        break;
    case VHOST_USER_RESET_OWNER:
        reply = vu_reset_device_exec(dev, vmsg);
        break;
    case VHOST_USER_SET_MEM_TABLE:
        reply = vu_set_mem_table_exec(dev, vmsg);
        break;
    case VHOST_USER_SET_LOG_BASE:
        reply = vu_set_log_base_exec(dev, vmsg);
        break;
    case VHOST_USER_SET_LOG_FD:
        reply = vu_set_log_fd_exec(dev, vmsg);
        break;
    case VHOST_USER_SET_VRING_NUM:
        reply = vu_set_vring_num_exec(dev, vmsg);
        break;
    case VHOST_USER_SET_VRING_ADDR:
        reply = vu_set_vring_addr_exec(dev, vmsg);
        break;
    case VHOST_USER_SET_VRING_BASE:
        reply = vu_set_vring_base_exec(dev, vmsg);
        break;
    case VHOST_USER_GET_VRING_BASE:
        reply = vu_get_vring_base_exec(dev, vmsg);
        break;
    case VHOST_USER_SET_VRING_KICK:
        reply = vu_set_vring_kick_exec(dev, vmsg);
        break;
    case VHOST_USER_SET_VRING_CALL:
        reply = vu_set_vring_call_exec(dev, vmsg);
        break;
    case VHOST_USER_SET_VRING_ERR:
        reply = vu_set_vring_err_exec(dev, vmsg);
        break;
    case VHOST_USER_GET_QUEUE_NUM:
        reply = vu_get_queue_num_exec(dev, vmsg);
        break;
    case VHOST_USER_SET_VRING_ENABLE:
        reply = vu_set_vring_enable_exec(dev, vmsg);
        break;
    case VHOST_USER_SET_SLAVE_REQ_FD:
        reply = vu_set_slave_req_fd(dev, vmsg);
        break;
    case VHOST_USER_GET_CONFIG:
        reply = vu_get_config(dev, vmsg);
        break;
    case VHOST_USER_SET_CONFIG:
        reply = vu_set_config(dev, vmsg);
        break;
    case VHOST_USER_NONE:
        break;
    default:
        vmsg_close_fds(vmsg);
        vu_panic(dev, "Unhandled request: %d", vmsg->request);
        break;
    }

    return reply;
}
/*
 * Read one vhost-user message from conn_fd into vmsg.
 *
 * The fixed-size header is received with recvmsg() so that descriptors
 * passed as SCM_RIGHTS ancillary data can be collected into vmsg->fds /
 * vmsg->fd_num.  The payload, whose length is announced in the header, is
 * then read separately.
 *
 * Returns true on success.  On failure the device is panicked and false is
 * returned; when the failure happens after descriptors were collected they
 * are closed first.
 */
static bool
vu_message_read(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
{
    char control[CMSG_SPACE(VHOST_MEMORY_MAX_NREGIONS * sizeof(int))] = { };
    struct iovec iov = {
        .iov_base = (char *)vmsg,
        .iov_len = VHOST_USER_HDR_SIZE,
    };
    struct msghdr msg = {
        .msg_iov = &iov,
        .msg_iovlen = 1,
        .msg_control = control,
        .msg_controllen = sizeof(control),
    };
    size_t fd_size;
    struct cmsghdr *cmsg;
    int rc;

    do {
        rc = recvmsg(conn_fd, &msg, 0);
    } while (rc < 0 && (errno == EINTR || errno == EAGAIN));

    if (rc < 0) {
        vu_panic(dev, "Error while recvmsg: %s", strerror(errno));
        return false;
    }

    /* Only the first SCM_RIGHTS control message is honoured. */
    vmsg->fd_num = 0;
    for (cmsg = CMSG_FIRSTHDR(&msg);
         cmsg != NULL;
         cmsg = CMSG_NXTHDR(&msg, cmsg))
    {
        if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
            fd_size = cmsg->cmsg_len - CMSG_LEN(0);
            vmsg->fd_num = fd_size / sizeof(int);
            memcpy(vmsg->fds, CMSG_DATA(cmsg), fd_size);
            break;
        }
    }

    /* Never read more payload than the message structure can hold. */
    if (vmsg->size > sizeof(vmsg->payload)) {
        vu_panic(dev,
                 "Error: too big message request: %d, size: vmsg->size: %u, "
                 "while sizeof(vmsg->payload) = %zu\n",
                 vmsg->request, vmsg->size, sizeof(vmsg->payload));
        goto fail;
    }

    if (vmsg->size) {
        do {
            rc = read(conn_fd, &vmsg->payload, vmsg->size);
        } while (rc < 0 && (errno == EINTR || errno == EAGAIN));

        if (rc <= 0) {
            vu_panic(dev, "Error while reading: %s", strerror(errno));
            goto fail;
        }

        assert(rc == vmsg->size);
    }

    return true;

fail:
    vmsg_close_fds(vmsg);

    return false;
}

/*
 * Send vmsg on conn_fd as a reply.
 *
 * The version bits of vmsg->flags are rewritten to VHOST_USER_VERSION and
 * the reply bit is set before the header is written.  The payload comes
 * from vmsg->data when that pointer is set, otherwise from the bytes that
 * follow the header inside vmsg.
 *
 * The header and the payload are checked independently, and no payload
 * write is issued for a zero-size payload, so a header-only reply is not
 * mistaken for a write error.
 *
 * Returns true on success; on a write error the device is panicked and
 * false is returned.
 */
static bool
vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
{
    int rc;
    uint8_t *p = (uint8_t *)vmsg;

    /* Set the version in the flags when sending the reply */
    vmsg->flags &= ~VHOST_USER_VERSION_MASK;
    vmsg->flags |= VHOST_USER_VERSION;
    vmsg->flags |= VHOST_USER_REPLY_MASK;

    do {
        rc = write(conn_fd, p, VHOST_USER_HDR_SIZE);
    } while (rc < 0 && (errno == EINTR || errno == EAGAIN));

    if (rc <= 0) {
        vu_panic(dev, "Error while writing: %s", strerror(errno));
        return false;
    }

    if (vmsg->size) {
        do {
            if (vmsg->data) {
                rc = write(conn_fd, vmsg->data, vmsg->size);
            } else {
                rc = write(conn_fd, p + VHOST_USER_HDR_SIZE, vmsg->size);
            }
        } while (rc < 0 && (errno == EINTR || errno == EAGAIN));

        if (rc <= 0) {
            vu_panic(dev, "Error while writing: %s", strerror(errno));
            return false;
        }
    }

    return true;
}

/* Kick the log_call_fd if required. */
static void
vu_log_kick(VuDev *dev)
{
    if (dev->log_call_fd != -1) {
        DPRINT("Kicking the QEMU's log...\n");
        if (eventfd_write(dev->log_call_fd, 1) < 0) {
            vu_panic(dev, "Error writing eventfd: %s", strerror(errno));
        }
    }
}

/*
 * Mark @page dirty in the log bitmap: one bit per page, eight pages per
 * byte, set with an atomic OR.
 */
static void
vu_log_page(uint8_t *log_table, uint64_t page)
{
    DPRINT("Logged dirty guest page: %"PRId64"\n", page);
    atomic_or(&log_table[page / 8], 1 << (page % 8));
}
/*
 * Sum the buffer bytes currently available on a virtqueue without
 * consuming anything: the walk starts from a local copy of
 * vq->last_avail_idx and only that copy is advanced.
 *
 * @in_bytes:      receives the total length of descriptors flagged
 *                 VRING_DESC_F_WRITE; may be NULL.
 * @out_bytes:     receives the total length of all other descriptors;
 *                 may be NULL.
 * @max_in_bytes,
 * @max_out_bytes: the walk stops early once both running totals have
 *                 reached these thresholds.
 *
 * Both totals are reported as 0 when the device is broken, when the queue
 * has no avail ring, or when any error is hit during the walk.
 */
void vu_queue_get_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int *in_bytes, unsigned int *out_bytes, unsigned max_in_bytes, unsigned max_out_bytes) {
    unsigned int idx;
    unsigned int total_bufs, in_total, out_total;
    int rc;

    idx = vq->last_avail_idx;
    total_bufs = in_total = out_total = 0;

    if (unlikely(dev->broken) || unlikely(!vq->vring.avail)) {
        goto done;
    }

    /* One iteration per available descriptor chain. */
    while ((rc = virtqueue_num_heads(dev, vq, idx)) > 0) {
        unsigned int max, desc_len, num_bufs, indirect = 0;
        uint64_t desc_addr, read_len;
        struct vring_desc *desc;
        /* Bounce buffer for an indirect table that cannot be used in place. */
        struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
        unsigned int i;

        max = vq->vring.num;
        /* Continue the loop-detection count from the chains seen so far. */
        num_bufs = total_bufs;
        if (!virtqueue_get_head(dev, vq, idx++, &i)) {
            goto err;
        }
        desc = vq->vring.desc;

        if (desc[i].flags & VRING_DESC_F_INDIRECT) {
            /* The table must hold a whole number of descriptors. */
            if (desc[i].len % sizeof(struct vring_desc)) {
                vu_panic(dev, "Invalid size for indirect buffer table");
                goto err;
            }

            /* If we've got too many, that implies a descriptor loop. */
            if (num_bufs >= max) {
                vu_panic(dev, "Looped descriptor");
                goto err;
            }

            /* loop over the indirect descriptor table */
            indirect = 1;
            desc_addr = desc[i].addr;
            desc_len = desc[i].len;
            max = desc_len / sizeof(struct vring_desc);
            read_len = desc_len;
            desc = vu_gpa_to_va(dev, &read_len, desc_addr);
            /*
             * read_len coming back different from desc_len means the table
             * cannot be used through this single pointer; fall back to
             * reading it into desc_buf.  A zero return from
             * virtqueue_read_indirect_desc() is taken as success.
             */
            if (unlikely(desc && read_len != desc_len)) {
                /* Failed to use zero copy */
                desc = NULL;
                if (!virtqueue_read_indirect_desc(dev, desc_buf, desc_addr, desc_len)) {
                    desc = desc_buf;
                }
            }
            if (!desc) {
                vu_panic(dev, "Invalid indirect buffer table");
                goto err;
            }
            /* Restart both the count and the index inside the table. */
            num_bufs = i = 0;
        }

        /* Walk the chain, accumulating lengths by direction. */
        do {
            /* If we've got too many, that implies a descriptor loop. */
            if (++num_bufs > max) {
                vu_panic(dev, "Looped descriptor");
                goto err;
            }

            if (desc[i].flags & VRING_DESC_F_WRITE) {
                in_total += desc[i].len;
            } else {
                out_total += desc[i].len;
            }
            /* Both thresholds reached: no need to look any further. */
            if (in_total >= max_in_bytes && out_total >= max_out_bytes) {
                goto done;
            }
            rc = virtqueue_read_next_desc(dev, desc, i, max, &i);
        } while (rc == VIRTQUEUE_READ_DESC_MORE);

        if (rc == VIRTQUEUE_READ_DESC_ERROR) {
            goto err;
        }

        /*
         * A direct chain contributes every descriptor it visited; an
         * indirect chain counts as a single buffer.
         */
        if (!indirect) {
            total_bufs = num_bufs;
        } else {
            total_bufs++;
        }
    }
    /* A negative count from virtqueue_num_heads() ended the loop. */
    if (rc < 0) {
        goto err;
    }
done:
    if (in_bytes) {
        *in_bytes = in_total;
    }
    if (out_bytes) {
        *out_bytes = out_total;
    }
    return;
err:
    /* Report nothing available, then leave through the common exit. */
    in_total = out_total = 0;
    goto done;
}