示例#1
0
/* Callback from postcopy_ram_supported_by_host block iterator.
 */
static int test_ramblock_postcopiable(const char *block_name, void *host_addr,
                             ram_addr_t offset, ram_addr_t length, void *opaque)
{
    RAMBlock *rb = qemu_ram_block_by_name(block_name);
    size_t pagesize = qemu_ram_pagesize(rb);

    if (length % pagesize) {
        error_report("Postcopy requires RAM blocks to be a page size multiple,"
                     " block %s is 0x" RAM_ADDR_FMT " bytes with a "
                     "page size of 0x%zx", block_name, length, pagesize);
        return 1;
    }
    return 0;
}
示例#2
0
static void balloon_deflate_page(VirtIOBalloon *balloon,
                                 MemoryRegion *mr, hwaddr offset)
{
    void *addr = memory_region_get_ram_ptr(mr) + offset;
    RAMBlock *rb;
    size_t rb_page_size;
    ram_addr_t ram_offset, host_page_base;
    void *host_addr;
    int ret;

    /* XXX is there a better way to get to the RAMBlock than via a
     * host address? */
    rb = qemu_ram_block_from_host(addr, false, &ram_offset);
    rb_page_size = qemu_ram_pagesize(rb);
    host_page_base = ram_offset & ~(rb_page_size - 1);

    if (balloon->pbp
        && rb == balloon->pbp->rb
        && host_page_base == balloon->pbp->base) {
        int subpages = rb_page_size / BALLOON_PAGE_SIZE;

        /*
         * This means the guest has asked to discard some of the 4kiB
         * subpages of a host page, but then changed its mind and
         * asked to keep them after all.  It's exceedingly unlikely
         * for a guest to do this in practice, but handle it anyway,
         * since getting it wrong could mean discarding memory the
         * guest is still using. */
        bitmap_clear(balloon->pbp->bitmap,
                     (ram_offset - balloon->pbp->base) / BALLOON_PAGE_SIZE,
                     subpages);

        if (bitmap_empty(balloon->pbp->bitmap, subpages)) {
            g_free(balloon->pbp);
            balloon->pbp = NULL;
        }
    }

    host_addr = (void *)((uintptr_t)addr & ~(rb_page_size - 1));

    /* When a page is deflated, we hint the whole host page it lives
     * on, since we can't do anything smaller */
    ret = qemu_madvise(host_addr, rb_page_size, QEMU_MADV_WILLNEED);
    if (ret != 0) {
        warn_report("Couldn't MADV_WILLNEED on balloon deflate: %s",
                    strerror(errno));
        /* Otherwise ignore, failing to page hint shouldn't be fatal */
    }
}
示例#3
0
int postcopy_wake_shared(struct PostCopyFD *pcfd,
                         uint64_t client_addr,
                         RAMBlock *rb)
{
    size_t pagesize = qemu_ram_pagesize(rb);
    struct uffdio_range range;
    int ret;
    trace_postcopy_wake_shared(client_addr, qemu_ram_get_idstr(rb));
    range.start = client_addr & ~(pagesize - 1);
    range.len = pagesize;
    ret = ioctl(pcfd->fd, UFFDIO_WAKE, &range);
    if (ret) {
        error_report("%s: Failed to wake: %zx in %s (%s)",
                     __func__, (size_t)client_addr, qemu_ram_get_idstr(rb),
                     strerror(errno));
    }
    return ret;
}
示例#4
0
/*
 * Place a zero page at (host) atomically
 * returns 0 on success
 */
int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
                             RAMBlock *rb)
{
    size_t pagesize = qemu_ram_pagesize(rb);
    trace_postcopy_place_page_zero(host);

    /* Normal RAMBlocks can zero a page using UFFDIO_ZEROPAGE
     * but it's not available for everything (e.g. hugetlbpages)
     */
    if (qemu_ram_is_uf_zeroable(rb)) {
        if (qemu_ufd_copy_ioctl(mis->userfault_fd, host, NULL, pagesize, rb)) {
            int e = errno;
            error_report("%s: %s zero host: %p",
                         __func__, strerror(e), host);

            return -e;
        }
        return postcopy_notify_shared_wake(rb,
                                           qemu_ram_block_host_offset(rb,
                                                                      host));
    } else {
        /* The kernel can't use UFFDIO_ZEROPAGE for hugepages */
        if (!mis->postcopy_tmp_zero_page) {
            mis->postcopy_tmp_zero_page = mmap(NULL, mis->largest_page_size,
                                               PROT_READ | PROT_WRITE,
                                               MAP_PRIVATE | MAP_ANONYMOUS,
                                               -1, 0);
            if (mis->postcopy_tmp_zero_page == MAP_FAILED) {
                int e = errno;
                mis->postcopy_tmp_zero_page = NULL;
                error_report("%s: %s mapping large zero page",
                             __func__, strerror(e));
                return -e;
            }
            memset(mis->postcopy_tmp_zero_page, '\0', mis->largest_page_size);
        }
        return postcopy_place_page(mis, host, mis->postcopy_tmp_zero_page,
                                   rb);
    }
}
示例#5
0
/*
 * Place a host page (from) at (host) atomically
 * returns 0 on success
 */
int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
                        RAMBlock *rb)
{
    size_t pagesize = qemu_ram_pagesize(rb);

    /* copy also acks to the kernel waking the stalled thread up
     * TODO: We can inhibit that ack and only do it if it was requested
     * which would be slightly cheaper, but we'd have to be careful
     * of the order of updating our page state.
     */
    if (qemu_ufd_copy_ioctl(mis->userfault_fd, host, from, pagesize, rb)) {
        int e = errno;
        error_report("%s: %s copy host: %p from: %p (size: %zd)",
                     __func__, strerror(e), host, from, pagesize);

        return -e;
    }

    trace_postcopy_place_page(host);
    return postcopy_notify_shared_wake(rb,
                                       qemu_ram_block_host_offset(rb, host));
}
示例#6
0
/*
 * Callback from shared fault handlers to ask for a page,
 * the page must be specified by a RAMBlock and an offset in that rb
 * Note: Only for use by shared fault handlers (in fault thread)
 */
int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
                                 uint64_t client_addr, uint64_t rb_offset)
{
    size_t pagesize = qemu_ram_pagesize(rb);
    uint64_t aligned_rbo = rb_offset & ~(pagesize - 1);
    MigrationIncomingState *mis = migration_incoming_get_current();

    trace_postcopy_request_shared_page(pcfd->idstr, qemu_ram_get_idstr(rb),
                                       rb_offset);
    if (ramblock_recv_bitmap_test_byte_offset(rb, aligned_rbo)) {
        trace_postcopy_request_shared_page_present(pcfd->idstr,
                                        qemu_ram_get_idstr(rb), rb_offset);
        return postcopy_wake_shared(pcfd, client_addr, rb);
    }
    if (rb != mis->last_rb) {
        mis->last_rb = rb;
        migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb),
                                  aligned_rbo, pagesize);
    } else {
        /* Save some space */
        migrate_send_rp_req_pages(mis, NULL, aligned_rbo, pagesize);
    }
    return 0;
}
示例#7
0
/*
 * Handle faults detected by the USERFAULT markings
 */
static void *postcopy_ram_fault_thread(void *opaque)
{
    MigrationIncomingState *mis = opaque;
    struct uffd_msg msg;
    int ret;
    size_t index;
    RAMBlock *rb = NULL;

    trace_postcopy_ram_fault_thread_entry();
    mis->last_rb = NULL; /* last RAMBlock we sent part of */
    qemu_sem_post(&mis->fault_thread_sem);

    struct pollfd *pfd;
    size_t pfd_len = 2 + mis->postcopy_remote_fds->len;

    pfd = g_new0(struct pollfd, pfd_len);

    pfd[0].fd = mis->userfault_fd;
    pfd[0].events = POLLIN;
    pfd[1].fd = mis->userfault_event_fd;
    pfd[1].events = POLLIN; /* Waiting for eventfd to go positive */
    trace_postcopy_ram_fault_thread_fds_core(pfd[0].fd, pfd[1].fd);
    for (index = 0; index < mis->postcopy_remote_fds->len; index++) {
        struct PostCopyFD *pcfd = &g_array_index(mis->postcopy_remote_fds,
                                                 struct PostCopyFD, index);
        pfd[2 + index].fd = pcfd->fd;
        pfd[2 + index].events = POLLIN;
        trace_postcopy_ram_fault_thread_fds_extra(2 + index, pcfd->idstr,
                                                  pcfd->fd);
    }

    while (true) {
        ram_addr_t rb_offset;
        int poll_result;

        /*
         * We're mainly waiting for the kernel to give us a faulting HVA,
         * however we can be told to quit via userfault_quit_fd which is
         * an eventfd
         */

        poll_result = poll(pfd, pfd_len, -1 /* Wait forever */);
        if (poll_result == -1) {
            error_report("%s: userfault poll: %s", __func__, strerror(errno));
            break;
        }

        if (pfd[1].revents) {
            uint64_t tmp64 = 0;

            /* Consume the signal */
            if (read(mis->userfault_event_fd, &tmp64, 8) != 8) {
                /* Nothing obviously nicer than posting this error. */
                error_report("%s: read() failed", __func__);
            }

            if (atomic_read(&mis->fault_thread_quit)) {
                trace_postcopy_ram_fault_thread_quit();
                break;
            }
        }

        if (pfd[0].revents) {
            poll_result--;
            ret = read(mis->userfault_fd, &msg, sizeof(msg));
            if (ret != sizeof(msg)) {
                if (errno == EAGAIN) {
                    /*
                     * if a wake up happens on the other thread just after
                     * the poll, there is nothing to read.
                     */
                    continue;
                }
                if (ret < 0) {
                    error_report("%s: Failed to read full userfault "
                                 "message: %s",
                                 __func__, strerror(errno));
                    break;
                } else {
                    error_report("%s: Read %d bytes from userfaultfd "
                                 "expected %zd",
                                 __func__, ret, sizeof(msg));
                    break; /* Lost alignment, don't know what we'd read next */
                }
            }
            if (msg.event != UFFD_EVENT_PAGEFAULT) {
                error_report("%s: Read unexpected event %ud from userfaultfd",
                             __func__, msg.event);
                continue; /* It's not a page fault, shouldn't happen */
            }

            rb = qemu_ram_block_from_host(
                     (void *)(uintptr_t)msg.arg.pagefault.address,
                     true, &rb_offset);
            if (!rb) {
                error_report("postcopy_ram_fault_thread: Fault outside guest: %"
                             PRIx64, (uint64_t)msg.arg.pagefault.address);
                break;
            }

            rb_offset &= ~(qemu_ram_pagesize(rb) - 1);
            trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
                                                qemu_ram_get_idstr(rb),
                                                rb_offset,
                                                msg.arg.pagefault.feat.ptid);
            mark_postcopy_blocktime_begin(
                    (uintptr_t)(msg.arg.pagefault.address),
                                msg.arg.pagefault.feat.ptid, rb);

            /*
             * Send the request to the source - we want to request one
             * of our host page sizes (which is >= TPS)
             */
            if (rb != mis->last_rb) {
                mis->last_rb = rb;
                migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb),
                                         rb_offset, qemu_ram_pagesize(rb));
            } else {
                /* Save some space */
                migrate_send_rp_req_pages(mis, NULL,
                                         rb_offset, qemu_ram_pagesize(rb));
            }
        }

        /* Now handle any requests from external processes on shared memory */
        /* TODO: May need to handle devices deregistering during postcopy */
        for (index = 2; index < pfd_len && poll_result; index++) {
            if (pfd[index].revents) {
                struct PostCopyFD *pcfd =
                    &g_array_index(mis->postcopy_remote_fds,
                                   struct PostCopyFD, index - 2);

                poll_result--;
                if (pfd[index].revents & POLLERR) {
                    error_report("%s: POLLERR on poll %zd fd=%d",
                                 __func__, index, pcfd->fd);
                    pfd[index].events = 0;
                    continue;
                }

                ret = read(pcfd->fd, &msg, sizeof(msg));
                if (ret != sizeof(msg)) {
                    if (errno == EAGAIN) {
                        /*
                         * if a wake up happens on the other thread just after
                         * the poll, there is nothing to read.
                         */
                        continue;
                    }
                    if (ret < 0) {
                        error_report("%s: Failed to read full userfault "
                                     "message: %s (shared) revents=%d",
                                     __func__, strerror(errno),
                                     pfd[index].revents);
                        /*TODO: Could just disable this sharer */
                        break;
                    } else {
                        error_report("%s: Read %d bytes from userfaultfd "
                                     "expected %zd (shared)",
                                     __func__, ret, sizeof(msg));
                        /*TODO: Could just disable this sharer */
                        break; /*Lost alignment,don't know what we'd read next*/
                    }
                }
                if (msg.event != UFFD_EVENT_PAGEFAULT) {
                    error_report("%s: Read unexpected event %ud "
                                 "from userfaultfd (shared)",
                                 __func__, msg.event);
                    continue; /* It's not a page fault, shouldn't happen */
                }
                /* Call the device handler registered with us */
                ret = pcfd->handler(pcfd, &msg);
                if (ret) {
                    error_report("%s: Failed to resolve shared fault on %zd/%s",
                                 __func__, index, pcfd->idstr);
                    /* TODO: Fail? Disable this sharer? */
                }
            }
        }
    }
    trace_postcopy_ram_fault_thread_exit();
    g_free(pfd);
    return NULL;
}
示例#8
0
static void balloon_inflate_page(VirtIOBalloon *balloon,
                                 MemoryRegion *mr, hwaddr offset)
{
    void *addr = memory_region_get_ram_ptr(mr) + offset;
    RAMBlock *rb;
    size_t rb_page_size;
    int subpages;
    ram_addr_t ram_offset, host_page_base;

    /* XXX is there a better way to get to the RAMBlock than via a
     * host address? */
    rb = qemu_ram_block_from_host(addr, false, &ram_offset);
    rb_page_size = qemu_ram_pagesize(rb);
    host_page_base = ram_offset & ~(rb_page_size - 1);

    if (rb_page_size == BALLOON_PAGE_SIZE) {
        /* Easy case */

        ram_block_discard_range(rb, ram_offset, rb_page_size);
        /* We ignore errors from ram_block_discard_range(), because it
         * has already reported them, and failing to discard a balloon
         * page is not fatal */
        return;
    }

    /* Hard case
     *
     * We've put a piece of a larger host page into the balloon - we
     * need to keep track until we have a whole host page to
     * discard
     */
    warn_report_once(
"Balloon used with backing page size > 4kiB, this may not be reliable");

    subpages = rb_page_size / BALLOON_PAGE_SIZE;

    if (balloon->pbp
        && (rb != balloon->pbp->rb
            || host_page_base != balloon->pbp->base)) {
        /* We've partially ballooned part of a host page, but now
         * we're trying to balloon part of a different one.  Too hard,
         * give up on the old partial page */
        g_free(balloon->pbp);
        balloon->pbp = NULL;
    }

    if (!balloon->pbp) {
        /* Starting on a new host page */
        size_t bitlen = BITS_TO_LONGS(subpages) * sizeof(unsigned long);
        balloon->pbp = g_malloc0(sizeof(PartiallyBalloonedPage) + bitlen);
        balloon->pbp->rb = rb;
        balloon->pbp->base = host_page_base;
    }

    bitmap_set(balloon->pbp->bitmap,
               (ram_offset - balloon->pbp->base) / BALLOON_PAGE_SIZE,
               subpages);

    if (bitmap_full(balloon->pbp->bitmap, subpages)) {
        /* We've accumulated a full host page, we can actually discard
         * it now */

        ram_block_discard_range(rb, balloon->pbp->base, rb_page_size);
        /* We ignore errors from ram_block_discard_range(), because it
         * has already reported them, and failing to discard a balloon
         * page is not fatal */

        g_free(balloon->pbp);
        balloon->pbp = NULL;
    }
}