Example #1
0
int qemu_memfd_create(const char *name, size_t size, bool hugetlb,
                      uint64_t hugetlbsize, unsigned int seals, Error **errp)
{
    int htsize = hugetlbsize ? ctz64(hugetlbsize) : 0;

    if (htsize && 1ULL << htsize != hugetlbsize) {
        error_setg(errp, "Hugepage size must be a power of 2");
        return -1;
    }

    htsize = htsize << MFD_HUGE_SHIFT;

#ifdef CONFIG_LINUX
    int mfd = -1;
    unsigned int flags = MFD_CLOEXEC;

    if (seals) {
        flags |= MFD_ALLOW_SEALING;
    }
    if (hugetlb) {
        flags |= MFD_HUGETLB;
        flags |= htsize;
    }
    mfd = memfd_create(name, flags);
    if (mfd < 0) {
        error_setg_errno(errp, errno,
                         "failed to create memfd with flags 0x%x", flags);
        goto err;
    }

    if (ftruncate(mfd, size) == -1) {
        error_setg_errno(errp, errno, "failed to resize memfd to %zu", size);
        goto err;
    }

    if (seals && fcntl(mfd, F_ADD_SEALS, seals) == -1) {
        error_setg_errno(errp, errno, "failed to add seals 0x%x", seals);
        goto err;
    }

    return mfd;

err:
    if (mfd >= 0) {
        close(mfd);
    }
#else
    error_setg_errno(errp, ENOSYS, "failed to create memfd");
#endif
    return -1;
}
Example #2
0
static void spapr_cap_set_pagesize(Object *obj, Visitor *v, const char *name,
                                   void *opaque, Error **errp)
{
    sPAPRCapabilityInfo *cap = opaque;
    sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
    uint64_t pagesize;
    uint8_t val;
    Error *local_err = NULL;

    visit_type_size(v, name, &pagesize, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return;
    }

    if (!is_power_of_2(pagesize)) {
        error_setg(errp, "cap-%s must be a power of 2", cap->name);
        return;
    }

    val = ctz64(pagesize);
    spapr->cmd_line_caps[cap->index] = true;
    spapr->eff.caps[cap->index] = val;
}
Example #3
0
static void vfio_prereg_listener_region_add(MemoryListener *listener,
                                            MemoryRegionSection *section)
{
    VFIOContainer *container = container_of(listener, VFIOContainer,
                                            prereg_listener);
    const hwaddr gpa = section->offset_within_address_space;
    hwaddr end;
    int ret;
    hwaddr page_mask = qemu_real_host_page_mask;
    struct vfio_iommu_spapr_register_memory reg = {
        .argsz = sizeof(reg),
        .flags = 0,
    };

    if (vfio_prereg_listener_skipped_section(section)) {
        trace_vfio_prereg_listener_region_add_skip(
                section->offset_within_address_space,
                section->offset_within_address_space +
                int128_get64(int128_sub(section->size, int128_one())));
        return;
    }

    if (unlikely((section->offset_within_address_space & ~page_mask) ||
                 (section->offset_within_region & ~page_mask) ||
                 (int128_get64(section->size) & ~page_mask))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    end = section->offset_within_address_space + int128_get64(section->size);
    if (gpa >= end) {
        return;
    }

    memory_region_ref(section->mr);

    reg.vaddr = (uintptr_t) vfio_prereg_gpa_to_vaddr(section, gpa);
    reg.size = end - gpa;

    ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
    trace_vfio_prereg_register(reg.vaddr, reg.size, ret ? -errno : 0);
    if (ret) {
        /*
         * On the initfn path, store the first error in the container so we
         * can gracefully fail.  Runtime, there's not much we can do other
         * than throw a hardware error.
         */
        if (!container->initialized) {
            if (!container->error) {
                container->error = ret;
            }
        } else {
            hw_error("vfio: Memory registering failed, unable to continue");
        }
    }
}

static void vfio_prereg_listener_region_del(MemoryListener *listener,
                                            MemoryRegionSection *section)
{
    VFIOContainer *container = container_of(listener, VFIOContainer,
                                            prereg_listener);
    const hwaddr gpa = section->offset_within_address_space;
    hwaddr end;
    int ret;
    hwaddr page_mask = qemu_real_host_page_mask;
    struct vfio_iommu_spapr_register_memory reg = {
        .argsz = sizeof(reg),
        .flags = 0,
    };

    if (vfio_prereg_listener_skipped_section(section)) {
        trace_vfio_prereg_listener_region_del_skip(
                section->offset_within_address_space,
                section->offset_within_address_space +
                int128_get64(int128_sub(section->size, int128_one())));
        return;
    }

    if (unlikely((section->offset_within_address_space & ~page_mask) ||
                 (section->offset_within_region & ~page_mask) ||
                 (int128_get64(section->size) & ~page_mask))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    end = section->offset_within_address_space + int128_get64(section->size);
    if (gpa >= end) {
        return;
    }

    reg.vaddr = (uintptr_t) vfio_prereg_gpa_to_vaddr(section, gpa);
    reg.size = end - gpa;

    ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
    trace_vfio_prereg_unregister(reg.vaddr, reg.size, ret ? -errno : 0);
}

const MemoryListener vfio_prereg_listener = {
    .region_add = vfio_prereg_listener_region_add,
    .region_del = vfio_prereg_listener_region_del,
};

int vfio_spapr_create_window(VFIOContainer *container,
                             MemoryRegionSection *section,
                             hwaddr *pgsize)
{
    int ret;
    unsigned pagesize = memory_region_iommu_get_min_page_size(section->mr);
    unsigned entries, pages;
    struct vfio_iommu_spapr_tce_create create = { .argsz = sizeof(create) };

    /*
     * FIXME: For VFIO iommu types which have KVM acceleration to
     * avoid bouncing all map/unmaps through qemu this way, this
     * would be the right place to wire that up (tell the KVM
     * device emulation the VFIO iommu handles to use).
     */
    create.window_size = int128_get64(section->size);
    create.page_shift = ctz64(pagesize);
    /*
     * SPAPR host supports multilevel TCE tables, there is some
     * heuristic to decide how many levels we want for our table:
     * 0..64 = 1; 65..4096 = 2; 4097..262144 = 3; 262145.. = 4
     */
    entries = create.window_size >> create.page_shift;
    pages = MAX((entries * sizeof(uint64_t)) / getpagesize(), 1);
    pages = MAX(pow2ceil(pages) - 1, 1); /* Round up */
    create.levels = ctz64(pages) / 6 + 1;

    ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
    if (ret) {
        error_report("Failed to create a window, ret = %d (%m)", ret);
        return -errno;
    }

    if (create.start_addr != section->offset_within_address_space) {
        vfio_spapr_remove_window(container, create.start_addr);

        error_report("Host doesn't support DMA window at %"HWADDR_PRIx", must be %"PRIx64,
                     section->offset_within_address_space,
                     (uint64_t)create.start_addr);
        return -EINVAL;
    }
    trace_vfio_spapr_create_window(create.page_shift,
                                   create.window_size,
                                   create.start_addr);
    *pgsize = pagesize;

    return 0;
}

int vfio_spapr_remove_window(VFIOContainer *container,
                             hwaddr offset_within_address_space)
{
    struct vfio_iommu_spapr_tce_remove remove = {
        .argsz = sizeof(remove),
        .start_addr = offset_within_address_space,
    };
    int ret;

    ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
    if (ret) {
        error_report("Failed to remove window at %"PRIx64,
                     (uint64_t)remove.start_addr);
        return -errno;
    }

    trace_vfio_spapr_remove_window(offset_within_address_space);

    return 0;
}
Example #4
0
void cpu_save(QEMUFile *f, void *opaque)
{
    CPUState *env = opaque;
    uint16_t fptag, fpus, fpuc, fpregs_format;
    uint32_t hflags;
    int32_t a20_mask;
    int32_t pending_irq;
    int i, bit;

    if (kvm_enabled()) {
        kvm_save_registers(env);
        kvm_arch_save_mpstate(env);
    }

    for(i = 0; i < CPU_NB_REGS; i++)
        qemu_put_betls(f, &env->regs[i]);
    qemu_put_betls(f, &env->eip);
    qemu_put_betls(f, &env->eflags);
    hflags = env->hflags; /* XXX: suppress most of the redundant hflags */
    qemu_put_be32s(f, &hflags);

    /* FPU */
    fpuc = env->fpuc;
    fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
    fptag = 0;
    for(i = 0; i < 8; i++) {
        fptag |= ((!env->fptags[i]) << i);
    }

    qemu_put_be16s(f, &fpuc);
    qemu_put_be16s(f, &fpus);
    qemu_put_be16s(f, &fptag);

#ifdef USE_X86LDOUBLE
    fpregs_format = 0;
#else
    fpregs_format = 1;
#endif
    qemu_put_be16s(f, &fpregs_format);

    for(i = 0; i < 8; i++) {
#ifdef USE_X86LDOUBLE
        {
            uint64_t mant;
            uint16_t exp;
            /* we save the real CPU data (in case of MMX usage only 'mant'
               contains the MMX register */
            cpu_get_fp80(&mant, &exp, env->fpregs[i].d);
            qemu_put_be64(f, mant);
            qemu_put_be16(f, exp);
        }
#else
        /* if we use doubles for float emulation, we save the doubles to
           avoid losing information in case of MMX usage. It can give
           problems if the image is restored on a CPU where long
           doubles are used instead. */
        qemu_put_be64(f, env->fpregs[i].mmx.MMX_Q(0));
#endif
    }

    for(i = 0; i < 6; i++)
        cpu_put_seg(f, &env->segs[i]);
    cpu_put_seg(f, &env->ldt);
    cpu_put_seg(f, &env->tr);
    cpu_put_seg(f, &env->gdt);
    cpu_put_seg(f, &env->idt);

    qemu_put_be32s(f, &env->sysenter_cs);
    qemu_put_betls(f, &env->sysenter_esp);
    qemu_put_betls(f, &env->sysenter_eip);

    qemu_put_betls(f, &env->cr[0]);
    qemu_put_betls(f, &env->cr[2]);
    qemu_put_betls(f, &env->cr[3]);
    qemu_put_betls(f, &env->cr[4]);

    for(i = 0; i < 8; i++)
        qemu_put_betls(f, &env->dr[i]);

    /* MMU */
    a20_mask = (int32_t) env->a20_mask;
    qemu_put_sbe32s(f, &a20_mask);

    /* XMM */
    qemu_put_be32s(f, &env->mxcsr);
    for(i = 0; i < CPU_NB_REGS; i++) {
        qemu_put_be64s(f, &env->xmm_regs[i].XMM_Q(0));
        qemu_put_be64s(f, &env->xmm_regs[i].XMM_Q(1));
    }

#ifdef TARGET_X86_64
    qemu_put_be64s(f, &env->efer);
    qemu_put_be64s(f, &env->star);
    qemu_put_be64s(f, &env->lstar);
    qemu_put_be64s(f, &env->cstar);
    qemu_put_be64s(f, &env->fmask);
    qemu_put_be64s(f, &env->kernelgsbase);
#endif
    qemu_put_be32s(f, &env->smbase);
    qemu_put_be64s(f, &env->pat);
    qemu_put_be32s(f, &env->hflags2);
    
    qemu_put_be64s(f, &env->vm_hsave);
    qemu_put_be64s(f, &env->vm_vmcb);
    qemu_put_be64s(f, &env->tsc_offset);
    qemu_put_be64s(f, &env->intercept);
    qemu_put_be16s(f, &env->intercept_cr_read);
    qemu_put_be16s(f, &env->intercept_cr_write);
    qemu_put_be16s(f, &env->intercept_dr_read);
    qemu_put_be16s(f, &env->intercept_dr_write);
    qemu_put_be32s(f, &env->intercept_exceptions);
    qemu_put_8s(f, &env->v_tpr);

    /* MTRRs */
    for(i = 0; i < 11; i++)
        qemu_put_be64s(f, &env->mtrr_fixed[i]);
    qemu_put_be64s(f, &env->mtrr_deftype);
    for(i = 0; i < 8; i++) {
        qemu_put_be64s(f, &env->mtrr_var[i].base);
        qemu_put_be64s(f, &env->mtrr_var[i].mask);
    }

    /* KVM-related states */

    /* There can only be one pending IRQ set in the bitmap at a time, so try
       to find it and save its number instead (-1 for none). */
    pending_irq = -1;
    for (i = 0; i < ARRAY_SIZE(env->interrupt_bitmap); i++) {
        if (env->interrupt_bitmap[i]) {
            bit = ctz64(env->interrupt_bitmap[i]);
            pending_irq = i * 64 + bit;
            break;
        }
    }
    qemu_put_sbe32s(f, &pending_irq);
    qemu_put_be32s(f, &env->mp_state);
    qemu_put_be64s(f, &env->tsc);

    /* MCE */
    qemu_put_be64s(f, &env->mcg_cap);
    if (env->mcg_cap && !kvm_enabled()) {
        qemu_put_be64s(f, &env->mcg_status);
        qemu_put_be64s(f, &env->mcg_ctl);
        for (i = 0; i < (env->mcg_cap & 0xff); i++) {
            qemu_put_be64s(f, &env->mce_banks[4*i]);
            qemu_put_be64s(f, &env->mce_banks[4*i + 1]);
            qemu_put_be64s(f, &env->mce_banks[4*i + 2]);
            qemu_put_be64s(f, &env->mce_banks[4*i + 3]);
        }
    }
 }
Example #5
0
uint64_t HELPER(ctz_i64)(uint64_t arg, uint64_t zero_val)
{
    return arg ? ctz64(arg) : zero_val;
}