/*
 * Set up MSI-X for @dev inside a BAR used for nothing else.
 *
 * A memory region named "<dev->name>-msix" is initialised in
 * dev->msix_exclusive_bar, msix_init() is called with both the vector
 * table (at offset 0) and the PBA placed in that region, and on success
 * the region is registered as memory BAR @bar_nr.
 *
 * @dev:      device owning the BAR
 * @nentries: number of MSI-X vectors
 * @bar_nr:   BAR index used for both the table and the PBA
 *
 * Returns 0 on success, otherwise the non-zero value msix_init() returned
 * (in which case the BAR is not registered).
 */
int msix_init_exclusive_bar(PCIDevice *dev, unsigned short nentries,
                            uint8_t bar_nr)
{
    const uint32_t default_bar_size = 4096;
    const uint32_t table_bytes = nentries * PCI_MSIX_ENTRY_SIZE;
    const uint32_t pba_bytes = (nentries / 8 + 1) * 8;
    uint32_t pba_offset;
    uint32_t region_size;
    char *region_name;
    int rc;

    /*
     * For migration compatibility the layout must stay a 4k BAR with the
     * vector table in the lower half and the PBA in the upper half as
     * long as the table fits in that lower half (nentries <= 128).
     * Legacy machine types have at most 64 queues and therefore never
     * need more than 65 entries, so growing the BAR for larger counts
     * does not affect them.
     */
    pba_offset = table_bytes > default_bar_size / 2 ? table_bytes
                                                    : default_bar_size / 2;

    /* Grow beyond 4k only when table + PBA no longer fit. */
    region_size = pba_offset + pba_bytes > default_bar_size
                      ? pba_offset + pba_bytes
                      : default_bar_size;
    region_size = pow2ceil(region_size);

    region_name = g_strdup_printf("%s-msix", dev->name);
    memory_region_init(&dev->msix_exclusive_bar, OBJECT(dev), region_name,
                       region_size);
    g_free(region_name);

    rc = msix_init(dev, nentries, &dev->msix_exclusive_bar, bar_nr, 0,
                   &dev->msix_exclusive_bar, bar_nr, pba_offset, 0);
    if (rc != 0) {
        return rc;
    }

    pci_register_bar(dev, bar_nr, PCI_BASE_ADDRESS_SPACE_MEMORY,
                     &dev->msix_exclusive_bar);
    return 0;
}
static void vfio_prereg_listener_region_add(MemoryListener *listener, MemoryRegionSection *section) { VFIOContainer *container = container_of(listener, VFIOContainer, prereg_listener); const hwaddr gpa = section->offset_within_address_space; hwaddr end; int ret; hwaddr page_mask = qemu_real_host_page_mask; struct vfio_iommu_spapr_register_memory reg = { .argsz = sizeof(reg), .flags = 0, }; if (vfio_prereg_listener_skipped_section(section)) { trace_vfio_prereg_listener_region_add_skip( section->offset_within_address_space, section->offset_within_address_space + int128_get64(int128_sub(section->size, int128_one()))); return; } if (unlikely((section->offset_within_address_space & ~page_mask) || (section->offset_within_region & ~page_mask) || (int128_get64(section->size) & ~page_mask))) { error_report("%s received unaligned region", __func__); return; } end = section->offset_within_address_space + int128_get64(section->size); if (gpa >= end) { return; } memory_region_ref(section->mr); reg.vaddr = (uintptr_t) vfio_prereg_gpa_to_vaddr(section, gpa); reg.size = end - gpa; ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, ®); trace_vfio_prereg_register(reg.vaddr, reg.size, ret ? -errno : 0); if (ret) { /* * On the initfn path, store the first error in the container so we * can gracefully fail. Runtime, there's not much we can do other * than throw a hardware error. 
*/ if (!container->initialized) { if (!container->error) { container->error = ret; } } else { hw_error("vfio: Memory registering failed, unable to continue"); } } } static void vfio_prereg_listener_region_del(MemoryListener *listener, MemoryRegionSection *section) { VFIOContainer *container = container_of(listener, VFIOContainer, prereg_listener); const hwaddr gpa = section->offset_within_address_space; hwaddr end; int ret; hwaddr page_mask = qemu_real_host_page_mask; struct vfio_iommu_spapr_register_memory reg = { .argsz = sizeof(reg), .flags = 0, }; if (vfio_prereg_listener_skipped_section(section)) { trace_vfio_prereg_listener_region_del_skip( section->offset_within_address_space, section->offset_within_address_space + int128_get64(int128_sub(section->size, int128_one()))); return; } if (unlikely((section->offset_within_address_space & ~page_mask) || (section->offset_within_region & ~page_mask) || (int128_get64(section->size) & ~page_mask))) { error_report("%s received unaligned region", __func__); return; } end = section->offset_within_address_space + int128_get64(section->size); if (gpa >= end) { return; } reg.vaddr = (uintptr_t) vfio_prereg_gpa_to_vaddr(section, gpa); reg.size = end - gpa; ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, ®); trace_vfio_prereg_unregister(reg.vaddr, reg.size, ret ? 
-errno : 0); } const MemoryListener vfio_prereg_listener = { .region_add = vfio_prereg_listener_region_add, .region_del = vfio_prereg_listener_region_del, }; int vfio_spapr_create_window(VFIOContainer *container, MemoryRegionSection *section, hwaddr *pgsize) { int ret; unsigned pagesize = memory_region_iommu_get_min_page_size(section->mr); unsigned entries, pages; struct vfio_iommu_spapr_tce_create create = { .argsz = sizeof(create) }; /* * FIXME: For VFIO iommu types which have KVM acceleration to * avoid bouncing all map/unmaps through qemu this way, this * would be the right place to wire that up (tell the KVM * device emulation the VFIO iommu handles to use). */ create.window_size = int128_get64(section->size); create.page_shift = ctz64(pagesize); /* * SPAPR host supports multilevel TCE tables, there is some * heuristic to decide how many levels we want for our table: * 0..64 = 1; 65..4096 = 2; 4097..262144 = 3; 262145.. = 4 */ entries = create.window_size >> create.page_shift; pages = MAX((entries * sizeof(uint64_t)) / getpagesize(), 1); pages = MAX(pow2ceil(pages) - 1, 1); /* Round up */ create.levels = ctz64(pages) / 6 + 1; ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create); if (ret) { error_report("Failed to create a window, ret = %d (%m)", ret); return -errno; } if (create.start_addr != section->offset_within_address_space) { vfio_spapr_remove_window(container, create.start_addr); error_report("Host doesn't support DMA window at %"HWADDR_PRIx", must be %"PRIx64, section->offset_within_address_space, (uint64_t)create.start_addr); return -EINVAL; } trace_vfio_spapr_create_window(create.page_shift, create.window_size, create.start_addr); *pgsize = pagesize; return 0; } int vfio_spapr_remove_window(VFIOContainer *container, hwaddr offset_within_address_space) { struct vfio_iommu_spapr_tce_remove remove = { .argsz = sizeof(remove), .start_addr = offset_within_address_space, }; int ret; ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, 
&remove); if (ret) { error_report("Failed to remove window at %"PRIx64, (uint64_t)remove.start_addr); return -errno; } trace_vfio_spapr_remove_window(offset_within_address_space); return 0; }
/*
 * Create the send and receive rings of a queue pair.
 *
 * The page directory at @pdir_dma is mapped, then the page table its
 * first entry points to.  Table entry 0 is mapped as the ring state page
 * (send ring state first, receive ring state right after it); entries
 * from index 1 are handed to pvrdma_ring_init() for the send ring and
 * entries from index 1 + @spages for the receive ring.
 *
 * @pci_dev:  device used for DMA mapping
 * @pdir_dma: guest DMA address of the page directory
 * @rings:    on success, set to a newly allocated array of two rings
 *            (send ring at index 0, receive ring at index 1); left
 *            untouched on failure
 * @scqe, @smax_sge, @spages: send ring element count, max SGEs per WQE
 *            and page count
 * @rcqe, @rmax_sge, @rpages: same for the receive ring
 *
 * Returns 0 on success; -EINVAL if a mapping fails, or the non-zero
 * value returned by pvrdma_ring_init().  The directory and table mappings
 * are released on every path.
 *
 * NOTE(review): @spages and @rpages are not range-checked here although
 * they index into a table mapped with TARGET_PAGE_SIZE bytes - confirm
 * that callers validate them.
 */
static int create_qp_rings(PCIDevice *pci_dev, uint64_t pdir_dma,
                           PvrdmaRing **rings, uint32_t scqe,
                           uint32_t smax_sge, uint32_t spages,
                           uint32_t rcqe, uint32_t rmax_sge,
                           uint32_t rpages)
{
    uint64_t *dir = NULL, *tbl = NULL;
    PvrdmaRing *sr, *rr;
    int rc = -EINVAL;
    char ring_name[MAX_RING_NAME_SZ];
    uint32_t wqe_sz;

    pr_dbg("pdir_dma=0x%llx\n", (long long unsigned int)pdir_dma);
    dir = rdma_pci_dma_map(pci_dev, pdir_dma, TARGET_PAGE_SIZE);
    if (!dir) {
        pr_dbg("Failed to map to QP page directory\n");
        goto out;
    }

    tbl = rdma_pci_dma_map(pci_dev, dir[0], TARGET_PAGE_SIZE);
    if (!tbl) {
        pr_dbg("Failed to map to QP page table\n");
        goto out;
    }

    /* One allocation holds both rings: [0] = send, [1] = receive. */
    sr = g_malloc(2 * sizeof(*sr));
    rr = &sr[1];
    pr_dbg("sring=%p\n", sr);
    pr_dbg("rring=%p\n", rr);

    pr_dbg("scqe=%d\n", scqe);
    pr_dbg("smax_sge=%d\n", smax_sge);
    pr_dbg("spages=%d\n", spages);
    pr_dbg("rcqe=%d\n", rcqe);
    pr_dbg("rmax_sge=%d\n", rmax_sge);
    pr_dbg("rpages=%d\n", rpages);

    /* Create send ring */
    sr->ring_state = (struct pvrdma_ring *)
        rdma_pci_dma_map(pci_dev, tbl[0], TARGET_PAGE_SIZE);
    if (!sr->ring_state) {
        pr_dbg("Failed to map to QP ring state\n");
        goto out_free_sr_mem;
    }

    wqe_sz = pow2ceil(sizeof(struct pvrdma_sq_wqe_hdr) +
                      sizeof(struct pvrdma_sge) * smax_sge - 1);

    /* Bounded formatting: never write past ring_name. */
    snprintf(ring_name, sizeof(ring_name), "qp_sring_%" PRIx64, pdir_dma);
    rc = pvrdma_ring_init(sr, ring_name, pci_dev, sr->ring_state,
                          scqe, wqe_sz, (dma_addr_t *)&tbl[1], spages);
    if (rc) {
        goto out_unmap_ring_state;
    }

    /* Create recv ring */
    rr->ring_state = &sr->ring_state[1];
    wqe_sz = pow2ceil(sizeof(struct pvrdma_rq_wqe_hdr) +
                      sizeof(struct pvrdma_sge) * rmax_sge - 1);
    snprintf(ring_name, sizeof(ring_name), "qp_rring_%" PRIx64, pdir_dma);
    rc = pvrdma_ring_init(rr, ring_name, pci_dev, rr->ring_state,
                          rcqe, wqe_sz, (dma_addr_t *)&tbl[1 + spages],
                          rpages);
    if (rc) {
        goto out_free_sr;
    }

    /*
     * Publish the rings only once both exist, so a failure never leaves
     * the caller holding a pointer to freed memory.
     */
    *rings = sr;
    goto out;

out_free_sr:
    pvrdma_ring_free(sr);

out_unmap_ring_state:
    rdma_pci_dma_unmap(pci_dev, sr->ring_state, TARGET_PAGE_SIZE);

out_free_sr_mem:
    g_free(sr);

out:
    /* The directory and table are only needed while building the rings. */
    rdma_pci_dma_unmap(pci_dev, tbl, TARGET_PAGE_SIZE);
    rdma_pci_dma_unmap(pci_dev, dir, TARGET_PAGE_SIZE);

    return rc;
}
/*
 * Compute the capacity to request for @buffer so that @len more bytes fit
 * after its current offset: the next power of two at or above
 * (buffer->offset + len), but never less than BUFFER_MIN_INIT_SIZE.
 */
static size_t buffer_req_size(Buffer *buffer, size_t len)
{
    uint64_t rounded = pow2ceil(buffer->offset + len);

    if (rounded < BUFFER_MIN_INIT_SIZE) {
        return BUFFER_MIN_INIT_SIZE;
    }
    return rounded;
}
/**
 * tlb_mmu_resize_locked() - perform TLB resize bookkeeping; resize if necessary
 * @env: CPU that owns the TLB
 * @mmu_idx: MMU index of the TLB
 *
 * Called with tlb_lock_held.
 *
 * Two constraints shape the resizing policy:
 *
 * (1) Resizing only happens on a TLB flush. Doing it at any other time
 *     would cost performance, either by rehashing the array or by flushing
 *     it unnecessarily. As a consequence we do not control how often a
 *     resize can occur.
 *
 * (2) The guest's future scheduling decisions are unknown, so the magnitude
 *     of a resize has to be derived from past observations.
 *
 * A guest TLB miss is very expensive, so a memory-hungry process benefits
 * greatly from an appropriately sized TLB. That does not mean the TLB should
 * simply be as large as possible: an oversized TLB minimises the miss rate,
 * but it also takes longer to flush (and flushes can be _very_ frequent) and
 * its reduced locality can hurt performance as well.
 *
 * To get near-optimal performance across workloads, we:
 *
 * 1. Grow the TLB aggressively when the use rate of the TLB being flushed is
 *    high. The memory-hungry process will likely run again soon, and will
 *    probably be just as memory hungry.
 *
 * 2. Shrink the TLB slowly, as the use rate declines over a reasonably large
 *    time window. If no high use rate was observed during such a window, it
 *    is unlikely to be observed in the near future either. So when a window
 *    expires, the TLB is downsized to match the maximum use rate seen in
 *    that window.
 *
 * 3. Try to keep the maximum use rate within a window in the 30-70% range,
 *    where performance is likely near-optimal. The TLB is direct mapped, so
 *    the use rate should be low (or at least not too high); otherwise a
 *    significant number of conflict misses is likely.
 */
static void tlb_mmu_resize_locked(CPUArchState *env, int mmu_idx)
{
    const size_t min_entries = 1 << CPU_TLB_DYN_MIN_BITS;
    const size_t max_entries = 1 << CPU_TLB_DYN_MAX_BITS;
    const int64_t window_ms = 100;
    const int64_t window_ns = window_ms * 1000 * 1000;

    CPUTLBDesc *desc = &env->tlb_d[mmu_idx];
    const size_t cur_entries = tlb_n_entries(env, mmu_idx);
    const int64_t now = get_clock_realtime();
    const bool window_expired = now > desc->window.begin_ns + window_ns;
    size_t next_entries = cur_entries;
    size_t use_rate;

    /* Track the peak number of used entries within the current window. */
    if (desc->n_used_entries > desc->window.max_entries) {
        desc->window.max_entries = desc->n_used_entries;
    }
    use_rate = desc->window.max_entries * 100 / cur_entries;

    if (use_rate > 70) {
        /* Heavily used: double, capped at the largest supported size. */
        next_entries = cur_entries << 1;
        if (next_entries > max_entries) {
            next_entries = max_entries;
        }
    } else if (use_rate < 30 && window_expired) {
        size_t target = pow2ceil(desc->window.max_entries);
        size_t expected_rate = desc->window.max_entries * 100 / target;

        /*
         * Guard against undersizing when the peak sits just below a power
         * of two. With a peak of 1025 the expected use rate would be
         * 1025/2048 == 50%, but with a peak of 1023 it would be
         * 1023/1024 == 99.9%, and we would probably have to double again
         * soon. Keep the expected rate below 70% instead. Since this
         * doubles the size, the lowest resulting rate is 35%, which is
         * still within the 30-70% range considered appropriate.
         */
        if (expected_rate > 70) {
            target *= 2;
        }
        next_entries = target < min_entries ? min_entries : target;
    }

    if (next_entries == cur_entries) {
        /* Size unchanged: only start a new window if this one is over. */
        if (window_expired) {
            tlb_window_reset(&desc->window, now, desc->n_used_entries);
        }
        return;
    }

    g_free(env->tlb_table[mmu_idx]);
    g_free(env->iotlb[mmu_idx]);

    /* desc->n_used_entries is cleared by the caller */
    tlb_window_reset(&desc->window, now, 0);

    /*
     * Allocate both tables at the chosen size. If either allocation fails,
     * retry with smaller sizes. We just freed some memory, so half of the
     * chosen size has a good chance of working; memory pressure elsewhere
     * in the system may still make it fail, so the size keeps being
     * halved. We abort if not even the smallest supported TLB can be
     * allocated.
     */
    for (;;) {
        env->tlb_mask[mmu_idx] = (next_entries - 1) << CPU_TLB_ENTRY_BITS;
        env->tlb_table[mmu_idx] = g_try_new(CPUTLBEntry, next_entries);
        env->iotlb[mmu_idx] = g_try_new(CPUIOTLBEntry, next_entries);

        if (env->tlb_table[mmu_idx] != NULL && env->iotlb[mmu_idx] != NULL) {
            break;
        }

        if (next_entries == min_entries) {
            error_report("%s: %s", __func__, strerror(errno));
            abort();
        }

        next_entries >>= 1;
        if (next_entries < min_entries) {
            next_entries = min_entries;
        }

        /* Drop whichever of the two allocations did succeed. */
        g_free(env->tlb_table[mmu_idx]);
        g_free(env->iotlb[mmu_idx]);
    }
}