Example #1
File: msix.c  Project: Icenowy/qemu
int msix_init_exclusive_bar(PCIDevice *dev, unsigned short nentries,
                            uint8_t bar_nr)
{
    int ret;
    char *name;
    uint32_t bar_size = 4096;
    uint32_t bar_pba_offset = bar_size / 2;
    uint32_t bar_pba_size = (nentries / 8 + 1) * 8;

    /*
     * Migration compatibility dictates that this remains a 4k
     * BAR with the vector table in the lower half and PBA in
     * the upper half for nentries less than or equal to 128.
     * No need to care about using more than 65 entries for legacy
     * machine types, which have at most 64 queues.
     */
    if (nentries * PCI_MSIX_ENTRY_SIZE > bar_pba_offset) {
        bar_pba_offset = nentries * PCI_MSIX_ENTRY_SIZE;
    }

    if (bar_pba_offset + bar_pba_size > 4096) {
        bar_size = bar_pba_offset + bar_pba_size;
    }

    bar_size = pow2ceil(bar_size);

    name = g_strdup_printf("%s-msix", dev->name);
    memory_region_init(&dev->msix_exclusive_bar, OBJECT(dev), name, bar_size);
    g_free(name);

    ret = msix_init(dev, nentries, &dev->msix_exclusive_bar, bar_nr,
                    0, &dev->msix_exclusive_bar,
                    bar_nr, bar_pba_offset, 0);
    if (ret) {
        return ret;
    }

    pci_register_bar(dev, bar_nr, PCI_BASE_ADDRESS_SPACE_MEMORY,
                     &dev->msix_exclusive_bar);

    return 0;
}
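
To see what the sizing logic above actually produces, the standalone sketch below runs the same arithmetic for an arbitrary vector count. It is an illustration only: pow2ceil() is re-implemented locally and PCI_MSIX_ENTRY_SIZE is hard-coded to the 16-byte MSI-X table entry size.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define PCI_MSIX_ENTRY_SIZE 16          /* one MSI-X table entry is 16 bytes */

/* Local stand-in for QEMU's pow2ceil(): smallest power of two >= value. */
static uint32_t pow2ceil(uint32_t value)
{
    uint32_t n = 1;

    while (n < value) {
        n <<= 1;
    }
    return n;
}

int main(void)
{
    unsigned short nentries = 300;          /* arbitrary vector count */
    uint32_t bar_size = 4096;
    uint32_t bar_pba_offset = bar_size / 2; /* PBA starts in the upper half */
    uint32_t bar_pba_size = (nentries / 8 + 1) * 8;

    /* Same steps as msix_init_exclusive_bar() above. */
    if (nentries * PCI_MSIX_ENTRY_SIZE > bar_pba_offset) {
        bar_pba_offset = nentries * PCI_MSIX_ENTRY_SIZE;
    }
    if (bar_pba_offset + bar_pba_size > 4096) {
        bar_size = bar_pba_offset + bar_pba_size;
    }
    bar_size = pow2ceil(bar_size);

    /* 300 vectors: table 4800 bytes, PBA at 4800 (304 bytes), BAR 8192 */
    printf("pba_offset=%" PRIu32 " pba_size=%" PRIu32 " bar_size=%" PRIu32 "\n",
           bar_pba_offset, bar_pba_size, bar_size);
    return 0;
}

For nentries up to 128 the vector table fits in the lower 2 KiB, so the computed layout stays at the migration-compatible 4 KiB BAR described in the comment.
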
Example #2
static void vfio_prereg_listener_region_add(MemoryListener *listener,
                                            MemoryRegionSection *section)
{
    VFIOContainer *container = container_of(listener, VFIOContainer,
                                            prereg_listener);
    const hwaddr gpa = section->offset_within_address_space;
    hwaddr end;
    int ret;
    hwaddr page_mask = qemu_real_host_page_mask;
    struct vfio_iommu_spapr_register_memory reg = {
        .argsz = sizeof(reg),
        .flags = 0,
    };

    if (vfio_prereg_listener_skipped_section(section)) {
        trace_vfio_prereg_listener_region_add_skip(
                section->offset_within_address_space,
                section->offset_within_address_space +
                int128_get64(int128_sub(section->size, int128_one())));
        return;
    }

    if (unlikely((section->offset_within_address_space & ~page_mask) ||
                 (section->offset_within_region & ~page_mask) ||
                 (int128_get64(section->size) & ~page_mask))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    end = section->offset_within_address_space + int128_get64(section->size);
    if (gpa >= end) {
        return;
    }

    memory_region_ref(section->mr);

    reg.vaddr = (uintptr_t) vfio_prereg_gpa_to_vaddr(section, gpa);
    reg.size = end - gpa;

    ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
    trace_vfio_prereg_register(reg.vaddr, reg.size, ret ? -errno : 0);
    if (ret) {
        /*
         * On the initfn path, store the first error in the container so we
         * can gracefully fail.  Runtime, there's not much we can do other
         * than throw a hardware error.
         */
        if (!container->initialized) {
            if (!container->error) {
                container->error = ret;
            }
        } else {
            hw_error("vfio: Memory registering failed, unable to continue");
        }
    }
}

static void vfio_prereg_listener_region_del(MemoryListener *listener,
                                            MemoryRegionSection *section)
{
    VFIOContainer *container = container_of(listener, VFIOContainer,
                                            prereg_listener);
    const hwaddr gpa = section->offset_within_address_space;
    hwaddr end;
    int ret;
    hwaddr page_mask = qemu_real_host_page_mask;
    struct vfio_iommu_spapr_register_memory reg = {
        .argsz = sizeof(reg),
        .flags = 0,
    };

    if (vfio_prereg_listener_skipped_section(section)) {
        trace_vfio_prereg_listener_region_del_skip(
                section->offset_within_address_space,
                section->offset_within_address_space +
                int128_get64(int128_sub(section->size, int128_one())));
        return;
    }

    if (unlikely((section->offset_within_address_space & ~page_mask) ||
                 (section->offset_within_region & ~page_mask) ||
                 (int128_get64(section->size) & ~page_mask))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    end = section->offset_within_address_space + int128_get64(section->size);
    if (gpa >= end) {
        return;
    }

    reg.vaddr = (uintptr_t) vfio_prereg_gpa_to_vaddr(section, gpa);
    reg.size = end - gpa;

    ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
    trace_vfio_prereg_unregister(reg.vaddr, reg.size, ret ? -errno : 0);
}

const MemoryListener vfio_prereg_listener = {
    .region_add = vfio_prereg_listener_region_add,
    .region_del = vfio_prereg_listener_region_del,
};
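
Both listener callbacks above reject sections that are not host-page aligned using the addr & ~page_mask test. Below is a minimal sketch of that check, assuming the usual convention that the page mask is ~(page_size - 1) (as qemu_real_host_page_mask is), so ANDing with the inverted mask isolates the sub-page bits:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t page_size = 4096;              /* assumed host page size */
    uint64_t page_mask = ~(page_size - 1);  /* 0x...fffff000 */
    uint64_t addrs[] = { 0x200000, 0x200800 };

    for (int i = 0; i < 2; i++) {
        /* Non-zero sub-page bits mean the address is not page aligned. */
        printf("0x%llx -> %s\n", (unsigned long long)addrs[i],
               (addrs[i] & ~page_mask) ? "unaligned" : "page aligned");
    }
    return 0;
}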

int vfio_spapr_create_window(VFIOContainer *container,
                             MemoryRegionSection *section,
                             hwaddr *pgsize)
{
    int ret;
    unsigned pagesize = memory_region_iommu_get_min_page_size(section->mr);
    unsigned entries, pages;
    struct vfio_iommu_spapr_tce_create create = { .argsz = sizeof(create) };

    /*
     * FIXME: For VFIO IOMMU types that have KVM acceleration to
     * avoid bouncing all map/unmaps through QEMU this way, this
     * would be the right place to wire that up (tell the KVM
     * device emulation which VFIO IOMMU handles to use).
     */
    create.window_size = int128_get64(section->size);
    create.page_shift = ctz64(pagesize);
    /*
     * The SPAPR host supports multilevel TCE tables; there is a
     * heuristic to decide how many levels we want for our table:
     * 0..64 = 1; 65..4096 = 2; 4097..262144 = 3; 262145.. = 4
     */
    entries = create.window_size >> create.page_shift;
    pages = MAX((entries * sizeof(uint64_t)) / getpagesize(), 1);
    pages = MAX(pow2ceil(pages) - 1, 1); /* Round up */
    create.levels = ctz64(pages) / 6 + 1;

    ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
    if (ret) {
        error_report("Failed to create a window, ret = %d (%m)", ret);
        return -errno;
    }

    if (create.start_addr != section->offset_within_address_space) {
        vfio_spapr_remove_window(container, create.start_addr);

        error_report("Host doesn't support DMA window at %"HWADDR_PRIx", must be %"PRIx64,
                     section->offset_within_address_space,
                     (uint64_t)create.start_addr);
        return -EINVAL;
    }
    trace_vfio_spapr_create_window(create.page_shift,
                                   create.window_size,
                                   create.start_addr);
    *pgsize = pagesize;

    return 0;
}
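
To make the window sizing in vfio_spapr_create_window() concrete, here is a worked example under assumed values: a 2 GiB DMA window, 64 KiB IOMMU pages and 64 KiB host pages. Each TCE is 8 bytes, and the resulting table page count feeds the level heuristic quoted in the comment above.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t window_size = 2ULL << 30;   /* 2 GiB DMA window (assumed) */
    unsigned page_shift = 16;            /* 64 KiB IOMMU pages (assumed) */
    unsigned host_page_size = 64 * 1024; /* 64 KiB host pages (assumed) */

    uint64_t entries = window_size >> page_shift;       /* 32768 TCEs */
    uint64_t table_bytes = entries * sizeof(uint64_t);  /* 256 KiB of TCEs */
    uint64_t pages = table_bytes / host_page_size;      /* 4 table pages */

    /* 4 pages falls in the 0..64 bucket of the heuristic, i.e. one level. */
    printf("entries=%" PRIu64 " table_bytes=%" PRIu64 " pages=%" PRIu64 "\n",
           entries, table_bytes, pages);
    return 0;
}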

int vfio_spapr_remove_window(VFIOContainer *container,
                             hwaddr offset_within_address_space)
{
    struct vfio_iommu_spapr_tce_remove remove = {
        .argsz = sizeof(remove),
        .start_addr = offset_within_address_space,
    };
    int ret;

    ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
    if (ret) {
        error_report("Failed to remove window at %"PRIx64,
                     (uint64_t)remove.start_addr);
        return -errno;
    }

    trace_vfio_spapr_remove_window(offset_within_address_space);

    return 0;
}
Example #3
static int create_qp_rings(PCIDevice *pci_dev, uint64_t pdir_dma,
                           PvrdmaRing **rings, uint32_t scqe, uint32_t smax_sge,
                           uint32_t spages, uint32_t rcqe, uint32_t rmax_sge,
                           uint32_t rpages)
{
    uint64_t *dir = NULL, *tbl = NULL;
    PvrdmaRing *sr, *rr;
    int rc = -EINVAL;
    char ring_name[MAX_RING_NAME_SZ];
    uint32_t wqe_sz;

    pr_dbg("pdir_dma=0x%llx\n", (long long unsigned int)pdir_dma);
    dir = rdma_pci_dma_map(pci_dev, pdir_dma, TARGET_PAGE_SIZE);
    if (!dir) {
        pr_dbg("Failed to map to CQ page directory\n");
        goto out;
    }

    tbl = rdma_pci_dma_map(pci_dev, dir[0], TARGET_PAGE_SIZE);
    if (!tbl) {
        pr_dbg("Failed to map to CQ page table\n");
        goto out;
    }

    sr = g_malloc(2 * sizeof(*rr));
    rr = &sr[1];
    pr_dbg("sring=%p\n", sr);
    pr_dbg("rring=%p\n", rr);

    *rings = sr;

    pr_dbg("scqe=%d\n", scqe);
    pr_dbg("smax_sge=%d\n", smax_sge);
    pr_dbg("spages=%d\n", spages);
    pr_dbg("rcqe=%d\n", rcqe);
    pr_dbg("rmax_sge=%d\n", rmax_sge);
    pr_dbg("rpages=%d\n", rpages);

    /* Create send ring */
    sr->ring_state = (struct pvrdma_ring *)
        rdma_pci_dma_map(pci_dev, tbl[0], TARGET_PAGE_SIZE);
    if (!sr->ring_state) {
        pr_dbg("Failed to map to CQ ring state\n");
        goto out_free_sr_mem;
    }

    wqe_sz = pow2ceil(sizeof(struct pvrdma_sq_wqe_hdr) +
                      sizeof(struct pvrdma_sge) * smax_sge - 1);

    sprintf(ring_name, "qp_sring_%" PRIx64, pdir_dma);
    rc = pvrdma_ring_init(sr, ring_name, pci_dev, sr->ring_state,
                          scqe, wqe_sz, (dma_addr_t *)&tbl[1], spages);
    if (rc) {
        goto out_unmap_ring_state;
    }

    /* Create recv ring */
    rr->ring_state = &sr->ring_state[1];
    wqe_sz = pow2ceil(sizeof(struct pvrdma_rq_wqe_hdr) +
                      sizeof(struct pvrdma_sge) * rmax_sge - 1);
    sprintf(ring_name, "qp_rring_%" PRIx64, pdir_dma);
    rc = pvrdma_ring_init(rr, ring_name, pci_dev, rr->ring_state,
                          rcqe, wqe_sz, (dma_addr_t *)&tbl[1 + spages], rpages);
    if (rc) {
        goto out_free_sr;
    }

    goto out;

out_free_sr:
    pvrdma_ring_free(sr);

out_unmap_ring_state:
    rdma_pci_dma_unmap(pci_dev, sr->ring_state, TARGET_PAGE_SIZE);

out_free_sr_mem:
    g_free(sr);

out:
    rdma_pci_dma_unmap(pci_dev, tbl, TARGET_PAGE_SIZE);
    rdma_pci_dma_unmap(pci_dev, dir, TARGET_PAGE_SIZE);

    return rc;
}
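
The WQE size handed to pvrdma_ring_init() is rounded up to a power of two so that every ring slot has the same size. Below is a hedged illustration with made-up header and SGE sizes; the real values come from sizeof(struct pvrdma_sq_wqe_hdr) and sizeof(struct pvrdma_sge) in the pvrdma ABI headers.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Local stand-in for QEMU's pow2ceil(): smallest power of two >= value. */
static uint64_t pow2ceil(uint64_t value)
{
    uint64_t n = 1;

    while (n < value) {
        n <<= 1;
    }
    return n;
}

int main(void)
{
    /* Hypothetical sizes, for illustration only. */
    size_t wqe_hdr_sz = 16;
    size_t sge_sz = 16;
    uint32_t max_sge = 4;

    uint32_t wqe_sz = pow2ceil(wqe_hdr_sz + sge_sz * max_sge - 1);

    printf("raw=%zu rounded wqe_sz=%u\n",
           wqe_hdr_sz + sge_sz * max_sge, wqe_sz);   /* 80 -> 128 */
    return 0;
}
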
Example #4
static size_t buffer_req_size(Buffer *buffer, size_t len)
{
    return MAX(BUFFER_MIN_INIT_SIZE,
               pow2ceil(buffer->offset + len));
}
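
buffer_req_size() is the growth policy used when appending to a Buffer: the requested capacity is the next power of two covering the current offset plus the incoming length, but never less than BUFFER_MIN_INIT_SIZE. A minimal sketch, assuming a 4096-byte minimum for illustration:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define BUFFER_MIN_INIT_SIZE 4096   /* assumed minimum, for illustration */
#define MAX(a, b) ((a) > (b) ? (a) : (b))

/* Local stand-in for QEMU's pow2ceil(): smallest power of two >= value. */
static uint64_t pow2ceil(uint64_t value)
{
    uint64_t n = 1;

    while (n < value) {
        n <<= 1;
    }
    return n;
}

static size_t buffer_req_size_demo(size_t offset, size_t len)
{
    return MAX(BUFFER_MIN_INIT_SIZE, pow2ceil(offset + len));
}

int main(void)
{
    /* 1000 bytes used + 5000 incoming -> pow2ceil(6000) = 8192 */
    printf("%zu\n", buffer_req_size_demo(1000, 5000));
    /* 0 used + 100 incoming -> clamped up to the 4096 minimum */
    printf("%zu\n", buffer_req_size_demo(0, 100));
    return 0;
}
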
Example #5
/**
 * tlb_mmu_resize_locked() - perform TLB resize bookkeeping; resize if necessary
 * @env: CPU that owns the TLB
 * @mmu_idx: MMU index of the TLB
 *
 * Called with tlb_lock held.
 *
 * We have two main constraints when resizing a TLB: (1) we only resize it
 * on a TLB flush (otherwise we'd have to take a perf hit by either rehashing
 * the array or unnecessarily flushing it), which means we do not control how
 * frequently the resizing can occur; (2) we don't have access to the guest's
 * future scheduling decisions, and therefore have to decide the magnitude of
 * the resize based on past observations.
 *
 * In general, a memory-hungry process can benefit greatly from an appropriately
 * sized TLB, since a guest TLB miss is very expensive. This doesn't mean that
 * we just have to make the TLB as large as possible; while an oversized TLB
 * results in minimal TLB miss rates, it also takes longer to be flushed
 * (flushes can be _very_ frequent), and the reduced locality can also hurt
 * performance.
 *
 * To achieve near-optimal performance for all kinds of workloads, we:
 *
 * 1. Aggressively increase the size of the TLB when the use rate of the
 * TLB being flushed is high, since it is likely that in the near future this
 * memory-hungry process will execute again, and its memory hungriness will
 * probably be similar.
 *
 * 2. Slowly reduce the size of the TLB as the use rate declines over a
 * reasonably large time window. The rationale is that if in such a time window
 * we have not observed a high TLB use rate, it is likely that we won't observe
 * it in the near future. In that case, once a time window expires we downsize
 * the TLB to match the maximum use rate observed in the window.
 *
 * 3. Try to keep the maximum use rate in a time window in the 30-70% range,
 * since in that range performance is likely near-optimal. Recall that the TLB
 * is direct mapped, so we want the use rate to be low (or at least not too
 * high), since otherwise we are likely to have a significant amount of
 * conflict misses.
 */
static void tlb_mmu_resize_locked(CPUArchState *env, int mmu_idx)
{
    CPUTLBDesc *desc = &env->tlb_d[mmu_idx];
    size_t old_size = tlb_n_entries(env, mmu_idx);
    size_t rate;
    size_t new_size = old_size;
    int64_t now = get_clock_realtime();
    int64_t window_len_ms = 100;
    int64_t window_len_ns = window_len_ms * 1000 * 1000;
    bool window_expired = now > desc->window.begin_ns + window_len_ns;

    if (desc->n_used_entries > desc->window.max_entries) {
        desc->window.max_entries = desc->n_used_entries;
    }
    rate = desc->window.max_entries * 100 / old_size;

    if (rate > 70) {
        new_size = MIN(old_size << 1, 1 << CPU_TLB_DYN_MAX_BITS);
    } else if (rate < 30 && window_expired) {
        size_t ceil = pow2ceil(desc->window.max_entries);
        size_t expected_rate = desc->window.max_entries * 100 / ceil;

        /*
         * Avoid undersizing when the max number of entries seen is just below
         * a pow2. For instance, if max_entries == 1025, the expected use rate
         * would be 1025/2048==50%. However, if max_entries == 1023, we'd get
         * 1023/1024==99.9% use rate, so we'd likely end up doubling the size
         * later. Thus, make sure that the expected use rate remains below 70%.
         * (and since we double the size, that means the lowest rate we'd
         * expect to get is 35%, which is still in the 30-70% range where
         * we consider that the size is appropriate.)
         */
        if (expected_rate > 70) {
            ceil *= 2;
        }
        new_size = MAX(ceil, 1 << CPU_TLB_DYN_MIN_BITS);
    }

    if (new_size == old_size) {
        if (window_expired) {
            tlb_window_reset(&desc->window, now, desc->n_used_entries);
        }
        return;
    }

    g_free(env->tlb_table[mmu_idx]);
    g_free(env->iotlb[mmu_idx]);

    tlb_window_reset(&desc->window, now, 0);
    /* desc->n_used_entries is cleared by the caller */
    env->tlb_mask[mmu_idx] = (new_size - 1) << CPU_TLB_ENTRY_BITS;
    env->tlb_table[mmu_idx] = g_try_new(CPUTLBEntry, new_size);
    env->iotlb[mmu_idx] = g_try_new(CPUIOTLBEntry, new_size);
    /*
     * If the allocations fail, try smaller sizes. We just freed some
     * memory, so going back to half of new_size has a good chance of working.
     * Increased memory pressure elsewhere in the system might cause the
     * allocations to fail though, so we progressively reduce the allocation
     * size, aborting if we cannot even allocate the smallest TLB we support.
     */
    while (env->tlb_table[mmu_idx] == NULL || env->iotlb[mmu_idx] == NULL) {
        if (new_size == (1 << CPU_TLB_DYN_MIN_BITS)) {
            error_report("%s: %s", __func__, strerror(errno));
            abort();
        }
        new_size = MAX(new_size >> 1, 1 << CPU_TLB_DYN_MIN_BITS);
        env->tlb_mask[mmu_idx] = (new_size - 1) << CPU_TLB_ENTRY_BITS;

        g_free(env->tlb_table[mmu_idx]);
        g_free(env->iotlb[mmu_idx]);
        env->tlb_table[mmu_idx] = g_try_new(CPUTLBEntry, new_size);
        env->iotlb[mmu_idx] = g_try_new(CPUIOTLBEntry, new_size);
    }
}
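
The resize decision itself reduces to a few integer comparisons on the observed use rate. The sketch below isolates just that policy, with the window bookkeeping and the actual reallocation stripped out; MIN_BITS and MAX_BITS are placeholders standing in for CPU_TLB_DYN_MIN_BITS and CPU_TLB_DYN_MAX_BITS.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define MIN_BITS 8      /* placeholder for CPU_TLB_DYN_MIN_BITS */
#define MAX_BITS 22     /* placeholder for CPU_TLB_DYN_MAX_BITS */
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Local stand-in for QEMU's pow2ceil(): smallest power of two >= value. */
static size_t pow2ceil(size_t value)
{
    size_t n = 1;

    while (n < value) {
        n <<= 1;
    }
    return n;
}

/* Policy only: pick the new TLB size from the window's peak usage. */
static size_t pick_new_size(size_t old_size, size_t max_entries,
                            bool window_expired)
{
    size_t rate = max_entries * 100 / old_size;
    size_t new_size = old_size;

    if (rate > 70) {
        new_size = MIN(old_size << 1, (size_t)1 << MAX_BITS);
    } else if (rate < 30 && window_expired) {
        size_t ceil = pow2ceil(max_entries);

        /* Keep the expected post-resize use rate below 70% (see comment). */
        if (max_entries * 100 / ceil > 70) {
            ceil *= 2;
        }
        new_size = MAX(ceil, (size_t)1 << MIN_BITS);
    }
    return new_size;
}

int main(void)
{
    /* 800 of 1024 entries used (78%) -> grow to 2048. */
    printf("%zu\n", pick_new_size(1024, 800, false));
    /* 1023 of 4096 used (24%), window expired -> shrink to 2048, not 1024. */
    printf("%zu\n", pick_new_size(4096, 1023, true));
    return 0;
}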