/*
 * Per-block callback used by the postcopy_ram_supported_by_host block
 * iterator.
 *
 * Looks the RAMBlock up by name and checks that its length is a whole
 * multiple of that block's page size.  host_addr, offset and opaque are
 * not used.
 *
 * Returns 0 if the block is acceptable, 1 (after reporting an error)
 * if it is not.
 */
static int test_ramblock_postcopiable(const char *block_name, void *host_addr,
                                      ram_addr_t offset, ram_addr_t length,
                                      void *opaque)
{
    RAMBlock *block = qemu_ram_block_by_name(block_name);
    size_t page_bytes = qemu_ram_pagesize(block);

    if (length % page_bytes == 0) {
        return 0;
    }

    error_report("Postcopy requires RAM blocks to be a page size multiple,"
                 " block %s is 0x" RAM_ADDR_FMT " bytes with a "
                 "page size of 0x%zx", block_name, length, page_bytes);
    return 1;
}
/*
 * Handle the guest taking one BALLOON_PAGE_SIZE page back out of the balloon.
 *
 * @balloon: balloon device state; balloon->pbp (if set) tracks a host page
 *           that has only been partially ballooned so far
 * @mr:      memory region containing the page
 * @offset:  offset of the page within @mr
 *
 * If the page belongs to the host page currently tracked in balloon->pbp,
 * its bit is removed from the tracking bitmap, and the tracking structure
 * is freed once no sub-page remains marked.  In all cases the host page
 * containing the address is then hinted with QEMU_MADV_WILLNEED; a failure
 * of that hint is only reported as a warning.
 */
static void balloon_deflate_page(VirtIOBalloon *balloon,
                                 MemoryRegion *mr, hwaddr offset)
{
    void *addr = memory_region_get_ram_ptr(mr) + offset;
    RAMBlock *rb;
    size_t rb_page_size;
    ram_addr_t ram_offset, host_page_base;
    void *host_addr;
    int ret;

    /* XXX is there a better way to get to the RAMBlock than via a
     * host address? */
    rb = qemu_ram_block_from_host(addr, false, &ram_offset);
    rb_page_size = qemu_ram_pagesize(rb);
    host_page_base = ram_offset & ~(rb_page_size - 1);

    if (balloon->pbp
        && rb == balloon->pbp->rb
        && host_page_base == balloon->pbp->base) {
        int subpages = rb_page_size / BALLOON_PAGE_SIZE;

        /*
         * This means the guest has asked to discard some of the 4kiB
         * subpages of a host page, but then changed its mind and
         * asked to keep them after all.  It's exceedingly unlikely
         * for a guest to do this in practice, but handle it anyway,
         * since getting it wrong could mean discarding memory the
         * guest is still using.
         *
         * Only the single sub-page being deflated is cleared: the third
         * argument of bitmap_clear() is a bit count, and the bitmap only
         * holds 'subpages' bits, so clearing 'subpages' bits from a
         * non-zero start index would run past its end and would also
         * forget sub-pages that are still ballooned.
         */
        bitmap_clear(balloon->pbp->bitmap,
                     (ram_offset - balloon->pbp->base) / BALLOON_PAGE_SIZE,
                     1);

        if (bitmap_empty(balloon->pbp->bitmap, subpages)) {
            /* Nothing of this host page is ballooned any more */
            g_free(balloon->pbp);
            balloon->pbp = NULL;
        }
    }

    host_addr = (void *)((uintptr_t)addr & ~(rb_page_size - 1));

    /* When a page is deflated, we hint the whole host page it lives
     * on, since we can't do anything smaller */
    ret = qemu_madvise(host_addr, rb_page_size, QEMU_MADV_WILLNEED);
    if (ret != 0) {
        warn_report("Couldn't MADV_WILLNEED on balloon deflate: %s",
                    strerror(errno));
        /* Otherwise ignore, failing to page hint shouldn't be fatal */
    }
}
/*
 * Issue a UFFDIO_WAKE on the shared client's userfault fd for the page
 * containing client_addr.
 *
 * The address is rounded down to the page size of rb and exactly one
 * page is woken.  Returns the ioctl() result; a non-zero result is also
 * reported as an error.
 */
int postcopy_wake_shared(struct PostCopyFD *pcfd,
                         uint64_t client_addr,
                         RAMBlock *rb)
{
    size_t page_bytes = qemu_ram_pagesize(rb);
    struct uffdio_range wake_range;
    int rc;

    trace_postcopy_wake_shared(client_addr, qemu_ram_get_idstr(rb));

    wake_range.start = client_addr & ~(page_bytes - 1);
    wake_range.len = page_bytes;

    rc = ioctl(pcfd->fd, UFFDIO_WAKE, &wake_range);
    if (rc != 0) {
        error_report("%s: Failed to wake: %zx in %s (%s)",
                     __func__, (size_t)client_addr, qemu_ram_get_idstr(rb),
                     strerror(errno));
    }

    return rc;
}
/*
 * Atomically place a page of zeroes at (host).
 *
 * RAMBlocks for which qemu_ram_is_uf_zeroable() is true are zeroed through
 * qemu_ufd_copy_ioctl() with a NULL source.  For the others (e.g.
 * hugetlbpages, where the kernel can't use UFFDIO_ZEROPAGE) a zero-filled
 * anonymous mapping of mis->largest_page_size bytes is created on first
 * use, cached in mis->postcopy_tmp_zero_page, and copied into place with
 * postcopy_place_page().
 *
 * Returns 0 on success, or a negative errno value if the zeroing ioctl or
 * the mmap() fails.
 */
int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
                             RAMBlock *rb)
{
    size_t page_bytes = qemu_ram_pagesize(rb);

    trace_postcopy_place_page_zero(host);

    if (qemu_ram_is_uf_zeroable(rb)) {
        /* Normal RAMBlocks: let the kernel supply the zero page */
        if (qemu_ufd_copy_ioctl(mis->userfault_fd, host, NULL, page_bytes,
                                rb)) {
            int saved_errno = errno;

            error_report("%s: %s zero host: %p",
                         __func__, strerror(saved_errno), host);
            return -saved_errno;
        }

        return postcopy_notify_shared_wake(
                   rb, qemu_ram_block_host_offset(rb, host));
    }

    /* Not zeroable by the kernel: copy from our own page of zeroes */
    if (mis->postcopy_tmp_zero_page == NULL) {
        mis->postcopy_tmp_zero_page = mmap(NULL, mis->largest_page_size,
                                           PROT_READ | PROT_WRITE,
                                           MAP_PRIVATE | MAP_ANONYMOUS,
                                           -1, 0);
        if (mis->postcopy_tmp_zero_page == MAP_FAILED) {
            int saved_errno = errno;

            /* Don't leave MAP_FAILED cached as if it were a mapping */
            mis->postcopy_tmp_zero_page = NULL;
            error_report("%s: %s mapping large zero page",
                         __func__, strerror(saved_errno));
            return -saved_errno;
        }
        memset(mis->postcopy_tmp_zero_page, '\0', mis->largest_page_size);
    }

    return postcopy_place_page(mis, host, mis->postcopy_tmp_zero_page, rb);
}
/*
 * Atomically place the host page (from) at (host).
 *
 * The copy is done with qemu_ufd_copy_ioctl() using the page size of rb.
 * On success, postcopy_notify_shared_wake() is called for the page's
 * offset within rb and its result is returned (0 on success).  If the
 * copy fails, the error is reported and a negative errno value is
 * returned.
 */
int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
                        RAMBlock *rb)
{
    size_t page_bytes = qemu_ram_pagesize(rb);

    /* copy also acks to the kernel waking the stalled thread up
     * TODO: We can inhibit that ack and only do it if it was requested
     * which would be slightly cheaper, but we'd have to be careful
     * of the order of updating our page state.
     */
    if (!qemu_ufd_copy_ioctl(mis->userfault_fd, host, from, page_bytes, rb)) {
        trace_postcopy_place_page(host);
        return postcopy_notify_shared_wake(
                   rb, qemu_ram_block_host_offset(rb, host));
    }

    /* Grab errno before anything else can overwrite it */
    int saved_errno = errno;

    error_report("%s: %s copy host: %p from: %p (size: %zd)",
                 __func__, strerror(saved_errno), host, from, page_bytes);
    return -saved_errno;
}
/*
 * Callback from shared fault handlers to ask for a page.
 *
 * The page is identified by a RAMBlock and an offset within it; the
 * offset is rounded down to the RAMBlock's page size.  If the receive
 * bitmap says the page has already arrived, the client is simply woken
 * with postcopy_wake_shared() and that result is returned.  Otherwise a
 * page request is sent to the source and 0 is returned.
 *
 * Note: Only for use by shared fault handlers (in fault thread)
 */
int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
                                 uint64_t client_addr, uint64_t rb_offset)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    size_t page_bytes = qemu_ram_pagesize(rb);
    uint64_t page_start = rb_offset & ~(page_bytes - 1);
    const char *block_name;

    trace_postcopy_request_shared_page(pcfd->idstr, qemu_ram_get_idstr(rb),
                                       rb_offset);

    if (ramblock_recv_bitmap_test_byte_offset(rb, page_start)) {
        trace_postcopy_request_shared_page_present(pcfd->idstr,
                                                   qemu_ram_get_idstr(rb),
                                                   rb_offset);
        return postcopy_wake_shared(pcfd, client_addr, rb);
    }

    if (rb == mis->last_rb) {
        /* Save some space: same block as the last request, omit the name */
        block_name = NULL;
    } else {
        mis->last_rb = rb;
        block_name = qemu_ram_get_idstr(rb);
    }
    migrate_send_rp_req_pages(mis, block_name, page_start, page_bytes);

    return 0;
}
/* * Handle faults detected by the USERFAULT markings */ static void *postcopy_ram_fault_thread(void *opaque) { MigrationIncomingState *mis = opaque; struct uffd_msg msg; int ret; size_t index; RAMBlock *rb = NULL; trace_postcopy_ram_fault_thread_entry(); mis->last_rb = NULL; /* last RAMBlock we sent part of */ qemu_sem_post(&mis->fault_thread_sem); struct pollfd *pfd; size_t pfd_len = 2 + mis->postcopy_remote_fds->len; pfd = g_new0(struct pollfd, pfd_len); pfd[0].fd = mis->userfault_fd; pfd[0].events = POLLIN; pfd[1].fd = mis->userfault_event_fd; pfd[1].events = POLLIN; /* Waiting for eventfd to go positive */ trace_postcopy_ram_fault_thread_fds_core(pfd[0].fd, pfd[1].fd); for (index = 0; index < mis->postcopy_remote_fds->len; index++) { struct PostCopyFD *pcfd = &g_array_index(mis->postcopy_remote_fds, struct PostCopyFD, index); pfd[2 + index].fd = pcfd->fd; pfd[2 + index].events = POLLIN; trace_postcopy_ram_fault_thread_fds_extra(2 + index, pcfd->idstr, pcfd->fd); } while (true) { ram_addr_t rb_offset; int poll_result; /* * We're mainly waiting for the kernel to give us a faulting HVA, * however we can be told to quit via userfault_quit_fd which is * an eventfd */ poll_result = poll(pfd, pfd_len, -1 /* Wait forever */); if (poll_result == -1) { error_report("%s: userfault poll: %s", __func__, strerror(errno)); break; } if (pfd[1].revents) { uint64_t tmp64 = 0; /* Consume the signal */ if (read(mis->userfault_event_fd, &tmp64, 8) != 8) { /* Nothing obviously nicer than posting this error. */ error_report("%s: read() failed", __func__); } if (atomic_read(&mis->fault_thread_quit)) { trace_postcopy_ram_fault_thread_quit(); break; } } if (pfd[0].revents) { poll_result--; ret = read(mis->userfault_fd, &msg, sizeof(msg)); if (ret != sizeof(msg)) { if (errno == EAGAIN) { /* * if a wake up happens on the other thread just after * the poll, there is nothing to read. 
*/ continue; } if (ret < 0) { error_report("%s: Failed to read full userfault " "message: %s", __func__, strerror(errno)); break; } else { error_report("%s: Read %d bytes from userfaultfd " "expected %zd", __func__, ret, sizeof(msg)); break; /* Lost alignment, don't know what we'd read next */ } } if (msg.event != UFFD_EVENT_PAGEFAULT) { error_report("%s: Read unexpected event %ud from userfaultfd", __func__, msg.event); continue; /* It's not a page fault, shouldn't happen */ } rb = qemu_ram_block_from_host( (void *)(uintptr_t)msg.arg.pagefault.address, true, &rb_offset); if (!rb) { error_report("postcopy_ram_fault_thread: Fault outside guest: %" PRIx64, (uint64_t)msg.arg.pagefault.address); break; } rb_offset &= ~(qemu_ram_pagesize(rb) - 1); trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address, qemu_ram_get_idstr(rb), rb_offset, msg.arg.pagefault.feat.ptid); mark_postcopy_blocktime_begin( (uintptr_t)(msg.arg.pagefault.address), msg.arg.pagefault.feat.ptid, rb); /* * Send the request to the source - we want to request one * of our host page sizes (which is >= TPS) */ if (rb != mis->last_rb) { mis->last_rb = rb; migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb), rb_offset, qemu_ram_pagesize(rb)); } else { /* Save some space */ migrate_send_rp_req_pages(mis, NULL, rb_offset, qemu_ram_pagesize(rb)); } } /* Now handle any requests from external processes on shared memory */ /* TODO: May need to handle devices deregistering during postcopy */ for (index = 2; index < pfd_len && poll_result; index++) { if (pfd[index].revents) { struct PostCopyFD *pcfd = &g_array_index(mis->postcopy_remote_fds, struct PostCopyFD, index - 2); poll_result--; if (pfd[index].revents & POLLERR) { error_report("%s: POLLERR on poll %zd fd=%d", __func__, index, pcfd->fd); pfd[index].events = 0; continue; } ret = read(pcfd->fd, &msg, sizeof(msg)); if (ret != sizeof(msg)) { if (errno == EAGAIN) { /* * if a wake up happens on the other thread just after * the poll, there is nothing 
to read. */ continue; } if (ret < 0) { error_report("%s: Failed to read full userfault " "message: %s (shared) revents=%d", __func__, strerror(errno), pfd[index].revents); /*TODO: Could just disable this sharer */ break; } else { error_report("%s: Read %d bytes from userfaultfd " "expected %zd (shared)", __func__, ret, sizeof(msg)); /*TODO: Could just disable this sharer */ break; /*Lost alignment,don't know what we'd read next*/ } } if (msg.event != UFFD_EVENT_PAGEFAULT) { error_report("%s: Read unexpected event %ud " "from userfaultfd (shared)", __func__, msg.event); continue; /* It's not a page fault, shouldn't happen */ } /* Call the device handler registered with us */ ret = pcfd->handler(pcfd, &msg); if (ret) { error_report("%s: Failed to resolve shared fault on %zd/%s", __func__, index, pcfd->idstr); /* TODO: Fail? Disable this sharer? */ } } } } trace_postcopy_ram_fault_thread_exit(); g_free(pfd); return NULL; }
/*
 * Handle the guest putting one BALLOON_PAGE_SIZE page into the balloon.
 *
 * @balloon: balloon device state; balloon->pbp tracks at most one host
 *           page that has only been partially ballooned so far
 * @mr:      memory region containing the page
 * @offset:  offset of the page within @mr
 *
 * When the backing page size equals BALLOON_PAGE_SIZE the page is
 * discarded straight away.  Otherwise the sub-page is recorded in a
 * bitmap, and the host page is only discarded once every one of its
 * sub-pages has been ballooned.
 */
static void balloon_inflate_page(VirtIOBalloon *balloon,
                                 MemoryRegion *mr, hwaddr offset)
{
    void *addr = memory_region_get_ram_ptr(mr) + offset;
    RAMBlock *rb;
    size_t rb_page_size;
    int subpages;
    ram_addr_t ram_offset, host_page_base;

    /* XXX is there a better way to get to the RAMBlock than via a
     * host address? */
    rb = qemu_ram_block_from_host(addr, false, &ram_offset);
    rb_page_size = qemu_ram_pagesize(rb);
    host_page_base = ram_offset & ~(rb_page_size - 1);

    if (rb_page_size == BALLOON_PAGE_SIZE) {
        /* Easy case */

        ram_block_discard_range(rb, ram_offset, rb_page_size);
        /* We ignore errors from ram_block_discard_range(), because it
         * has already reported them, and failing to discard a balloon
         * page is not fatal */
        return;
    }

    /* Hard case
     *
     * We've put a piece of a larger host page into the balloon - we
     * need to keep track until we have a whole host page to
     * discard
     */
    warn_report_once(
"Balloon used with backing page size > 4kiB, this may not be reliable");

    subpages = rb_page_size / BALLOON_PAGE_SIZE;

    if (balloon->pbp
        && (rb != balloon->pbp->rb
            || host_page_base != balloon->pbp->base)) {
        /* We've partially ballooned part of a host page, but now
         * we're trying to balloon part of a different one.  Too hard,
         * give up on the old partial page */
        g_free(balloon->pbp);
        balloon->pbp = NULL;
    }

    if (!balloon->pbp) {
        /* Starting on a new host page */
        size_t bitlen = BITS_TO_LONGS(subpages) * sizeof(unsigned long);

        balloon->pbp = g_malloc0(sizeof(PartiallyBalloonedPage) + bitlen);
        balloon->pbp->rb = rb;
        balloon->pbp->base = host_page_base;
    }

    /*
     * Mark only the single sub-page being inflated.  The third argument
     * of bitmap_set() is a bit count; the bitmap was allocated for
     * 'subpages' bits, so setting 'subpages' bits from a non-zero start
     * index would write past the allocation and would also mark
     * sub-pages the guest never gave up.
     */
    bitmap_set(balloon->pbp->bitmap,
               (ram_offset - balloon->pbp->base) / BALLOON_PAGE_SIZE,
               1);

    if (bitmap_full(balloon->pbp->bitmap, subpages)) {
        /* We've accumulated a full host page, we can actually discard
         * it now */

        ram_block_discard_range(rb, balloon->pbp->base, rb_page_size);
        /* We ignore errors from ram_block_discard_range(), because it
         * has already reported them, and failing to discard a balloon
         * page is not fatal */

        g_free(balloon->pbp);
        balloon->pbp = NULL;
    }
}