/*
 * Install a new huge pmd at @pmdp.
 *
 * This is not the path for modifying an entry that is already in place;
 * such updates are expected to go through pmd_hugepage_update.
 *
 * With CONFIG_DEBUG_VM the function warns if the slot currently holds a
 * present (and not protnone) entry, asserts that mm->page_table_lock is
 * held, and warns if @pmd is not a transparent huge pmd.
 */
void set_pmd_at(struct mm_struct *mm, unsigned long addr,
		pmd_t *pmdp, pmd_t pmd)
{
	pte_t *ptep = pmdp_ptep(pmdp);
	pte_t new_pte = pmd_pte(pmd);
#ifdef CONFIG_DEBUG_VM
	pte_t cur = pmd_pte(*pmdp);

	WARN_ON(pte_present(cur) && !pte_protnone(cur));
	assert_spin_locked(&mm->page_table_lock);
	WARN_ON(!pmd_trans_huge(pmd));
#endif
	trace_hugepage_set_pmd(addr, pmd_val(pmd));

	/* The pmd is stored through the ordinary pte setter. */
	set_pte_at(mm, addr, ptep, new_pte);
}
/*
 * Install a new huge pmd at @pmdp.
 *
 * This is not the path for modifying an entry that is already in place;
 * such updates are expected to go through pmd_hugepage_update.
 */
void set_pmd_at(struct mm_struct *mm, unsigned long addr,
		pmd_t *pmdp, pmd_t pmd)
{
#ifdef CONFIG_DEBUG_VM
	pte_t cur = pmd_pte(*pmdp);

	/*
	 * No TLB flush is done for this update, so the entry being
	 * replaced must not have the hardware valid bit set (protnone
	 * entries are tolerated).
	 */
	WARN_ON(pte_hw_valid(cur) && !pte_protnone(cur));
	assert_spin_locked(pmd_lockptr(mm, pmdp));
	/* The new value has to be either a large pmd or a devmap pmd. */
	WARN_ON(!pmd_large(pmd) && !pmd_devmap(pmd));
#endif
	trace_hugepage_set_pmd(addr, pmd_val(pmd));

	/* The pmd is stored through the ordinary pte setter. */
	set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
}
/*
 * set_pte_at - store a Linux PTE into the Linux page table.
 *
 * The value written is @pte with _PAGE_PTE OR-ed in and then passed
 * through set_pte_filter(); the store itself is done by __set_pte_at().
 */
void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
		pte_t pte)
{
	/*
	 * NUMA fault handling may call us while the existing pte is still
	 * marked _PAGE_PRESENT; that is acceptable because such an entry
	 * is known not to be in the hpte. Anything else that is present
	 * triggers the warning.
	 */
	VM_WARN_ON(pte_present(*ptep) && !pte_protnone(*ptep));

	/*
	 * Tag the value as a pte, then run it through the filter.
	 *
	 * Note: mm->context.id might not have been assigned yet, since
	 * this context may not have been activated when we are called.
	 */
	pte = set_pte_filter(__pte(pte_val(pte) | _PAGE_PTE));

	/* Perform the actual store of the PTE. */
	__set_pte_at(mm, addr, ptep, pte, 0);
}
/*
 * follow_page_pte - look up the page mapped by the pte for @address.
 *
 * @vma:     vma the address belongs to; supplies the mm and vm_flags
 * @address: user virtual address to look up
 * @pmd:     pmd entry under which the pte is located
 * @flags:   FOLL_* flags selecting what is done with the page
 *
 * The pte is mapped and locked with pte_offset_map_lock() and is always
 * unmapped/unlocked again before returning. The lookup is restarted
 * from the top ("retry") after waiting on a migration entry and after a
 * successful split_huge_page().
 *
 * Returns one of:
 *   - the struct page (with a reference taken if FOLL_GET is set);
 *   - NULL when the pte is not none but cannot be followed (or a write
 *     was requested and can_follow_write_pte() refused);
 *   - an ERR_PTR(): -EFAULT for special pages under FOLL_DUMP, the
 *     result of follow_pfn_pte(), or a split_huge_page() failure;
 *   - whatever no_page_table() returns when the pmd is bad or the pte
 *     is none.
 */
static struct page *follow_page_pte(struct vm_area_struct *vma,
		unsigned long address, pmd_t *pmd, unsigned int flags)
{
	struct mm_struct *mm = vma->vm_mm;
	struct dev_pagemap *pgmap = NULL;
	struct page *page;
	spinlock_t *ptl;
	pte_t *ptep, pte;

retry:
	if (unlikely(pmd_bad(*pmd)))
		return no_page_table(vma, flags);

	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
	pte = *ptep;
	if (!pte_present(pte)) {
		swp_entry_t entry;
		/*
		 * KSM's break_ksm() relies upon recognizing a ksm page
		 * even while it is being migrated, so for that case we
		 * need migration_entry_wait().
		 */
		if (likely(!(flags & FOLL_MIGRATION)))
			goto no_page;
		if (pte_none(pte))
			goto no_page;
		entry = pte_to_swp_entry(pte);
		if (!is_migration_entry(entry))
			goto no_page;
		/* Drop the pte lock before waiting, then start over. */
		pte_unmap_unlock(ptep, ptl);
		migration_entry_wait(mm, pmd, address);
		goto retry;
	}
	if ((flags & FOLL_NUMA) && pte_protnone(pte))
		goto no_page;
	if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) {
		pte_unmap_unlock(ptep, ptl);
		return NULL;
	}

	page = vm_normal_page(vma, address, pte);
	if (!page && pte_devmap(pte) && (flags & FOLL_GET)) {
		/*
		 * Only return device mapping pages in the FOLL_GET case since
		 * they are only valid while holding the pgmap reference.
		 */
		pgmap = get_dev_pagemap(pte_pfn(pte), NULL);
		if (pgmap)
			page = pte_page(pte);
		else
			goto no_page;
	} else if (unlikely(!page)) {
		if (flags & FOLL_DUMP) {
			/* Avoid special (like zero) pages in core dumps */
			page = ERR_PTR(-EFAULT);
			goto out;
		}

		if (is_zero_pfn(pte_pfn(pte))) {
			page = pte_page(pte);
		} else {
			int ret;

			/* Hand the outcome back to the caller as an ERR_PTR. */
			ret = follow_pfn_pte(vma, address, ptep, flags);
			page = ERR_PTR(ret);
			goto out;
		}
	}

	if (flags & FOLL_SPLIT && PageTransCompound(page)) {
		int ret;
		/*
		 * Hold a page reference across the split: the pte lock is
		 * released before lock_page()/split_huge_page() are called.
		 */
		get_page(page);
		pte_unmap_unlock(ptep, ptl);
		lock_page(page);
		ret = split_huge_page(page);
		unlock_page(page);
		put_page(page);
		if (ret)
			return ERR_PTR(ret);
		goto retry;
	}

	if (flags & FOLL_GET) {
		get_page(page);

		/* drop the pgmap reference now that we hold the page */
		if (pgmap) {
			put_dev_pagemap(pgmap);
			pgmap = NULL;
		}
	}
	if (flags & FOLL_TOUCH) {
		if ((flags & FOLL_WRITE) &&
		    !pte_dirty(pte) && !PageDirty(page))
			set_page_dirty(page);
		/*
		 * pte_mkyoung() would be more correct here, but atomic care
		 * is needed to avoid losing the dirty bit: it is easier to use
		 * mark_page_accessed().
		 */
		mark_page_accessed(page);
	}
	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
		/* Do not mlock pte-mapped THP */
		if (PageTransCompound(page))
			goto out;

		/*
		 * The preliminary mapping check is mainly to avoid the
		 * pointless overhead of lock_page on the ZERO_PAGE
		 * which might bounce very badly if there is contention.
		 *
		 * If the page is already locked, we don't need to
		 * handle it now - vmscan will handle it later if and
		 * when it attempts to reclaim the page.
		 */
		if (page->mapping && trylock_page(page)) {
			lru_add_drain();  /* push cached pages to LRU */
			/*
			 * Because we lock page here, and migration is
			 * blocked by the pte's page reference, and we
			 * know the page is still mapped, we don't even
			 * need to check for file-cache page truncation.
			 */
			mlock_vma_page(page);
			unlock_page(page);
		}
	}
out:
	/* Common exit: release the pte lock and return whatever is in page. */
	pte_unmap_unlock(ptep, ptl);
	return page;
no_page:
	pte_unmap_unlock(ptep, ptl);
	/* A non-empty pte we could not follow yields NULL. */
	if (!pte_none(pte))
		return NULL;
	return no_page_table(vma, flags);
}
/*
 * kvmppc_do_h_enter - insert a guest HPTE into the hashed page table.
 *
 * @kvm:         the VM whose HPT is being modified
 * @flags:       hypercall flags; only H_EXACT is examined here
 * @pte_index:   requested HPTE index (exact slot if H_EXACT is set,
 *               otherwise any free slot in the group of 8 containing it)
 * @pteh:        first doubleword of the HPTE supplied by the guest
 * @ptel:        second doubleword of the HPTE supplied by the guest
 * @pgdir:       host page directory used to look up the Linux pte
 * @realmode:    true when called in real mode; in that case vmalloc
 *               addresses are translated with real_vmalloc_addr() and
 *               interrupts are not touched (MSR_EE is already 0)
 * @pte_idx_ret: on success, receives the index of the slot actually used
 *
 * Returns H_SUCCESS on success, H_FUNCTION for a radix guest,
 * H_PARAMETER for an invalid page size / misaligned slot / host page
 * smaller than the guest page / bad cache attributes / out-of-range
 * index, and H_PTEG_FULL when no free slot could be found.
 *
 * Note: @flags holds the hypercall flags and must never be handed to
 * local_irq_restore(); the saved interrupt state lives in irq_flags.
 */
long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
		       long pte_index, unsigned long pteh, unsigned long ptel,
		       pgd_t *pgdir, bool realmode, unsigned long *pte_idx_ret)
{
	unsigned long i, pa, gpa, gfn, psize;
	unsigned long slot_fn, hva;
	__be64 *hpte;
	struct revmap_entry *rev;
	unsigned long g_ptel;
	struct kvm_memory_slot *memslot;
	unsigned hpage_shift;
	bool is_ci;
	unsigned long *rmap;
	pte_t *ptep;
	unsigned int writing;
	unsigned long mmu_seq;
	unsigned long rcbits, irq_flags = 0;

	if (kvm_is_radix(kvm))
		return H_FUNCTION;
	psize = kvmppc_actual_pgsz(pteh, ptel);
	if (!psize)
		return H_PARAMETER;
	writing = hpte_is_writable(ptel);
	pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID);
	ptel &= ~HPTE_GR_RESERVED;
	/* Remember the guest's view of the second dword for the revmap. */
	g_ptel = ptel;

	/* used later to detect if we might have been invalidated */
	mmu_seq = kvm->mmu_notifier_seq;
	smp_rmb();

	/* Find the memslot (if any) for this address */
	gpa = (ptel & HPTE_R_RPN) & ~(psize - 1);
	gfn = gpa >> PAGE_SHIFT;
	memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
	pa = 0;
	is_ci = false;
	rmap = NULL;
	if (!(memslot && !(memslot->flags & KVM_MEMSLOT_INVALID))) {
		/* Emulated MMIO - mark this with key=31 */
		pteh |= HPTE_V_ABSENT;
		ptel |= HPTE_R_KEY_HI | HPTE_R_KEY_LO;
		goto do_insert;
	}

	/* Check if the requested page fits entirely in the memslot. */
	if (!slot_is_aligned(memslot, psize))
		return H_PARAMETER;
	slot_fn = gfn - memslot->base_gfn;
	rmap = &memslot->arch.rmap[slot_fn];

	/* Translate to host virtual address */
	hva = __gfn_to_hva_memslot(memslot, gfn);
	/*
	 * If we had a page table change after lookup, we would
	 * retry via mmu_notifier_retry.
	 */
	if (!realmode)
		local_irq_save(irq_flags);
	/*
	 * If called in real mode we have MSR_EE = 0. Otherwise
	 * we disable irq above.
	 */
	ptep = __find_linux_pte(pgdir, hva, NULL, &hpage_shift);
	if (ptep) {
		pte_t pte;
		unsigned int host_pte_size;

		if (hpage_shift)
			host_pte_size = 1ul << hpage_shift;
		else
			host_pte_size = PAGE_SIZE;
		/*
		 * We should always find the guest page size
		 * to <= host page size, if host is using hugepage
		 */
		if (host_pte_size < psize) {
			/*
			 * Restore the state saved by local_irq_save()
			 * above -- that is irq_flags, not the hypercall
			 * "flags" argument.
			 */
			if (!realmode)
				local_irq_restore(irq_flags);
			return H_PARAMETER;
		}
		pte = kvmppc_read_update_linux_pte(ptep, writing);
		if (pte_present(pte) && !pte_protnone(pte)) {
			if (writing && !__pte_write(pte))
				/* make the actual HPTE be read-only */
				ptel = hpte_make_readonly(ptel);
			is_ci = pte_ci(pte);
			pa = pte_pfn(pte) << PAGE_SHIFT;
			pa |= hva & (host_pte_size - 1);
			pa |= gpa & ~PAGE_MASK;
		}
	}
	if (!realmode)
		local_irq_restore(irq_flags);

	ptel &= HPTE_R_KEY | HPTE_R_PP0 | (psize-1);
	ptel |= pa;

	/* A zero pa means no usable host mapping: insert an absent HPTE. */
	if (pa)
		pteh |= HPTE_V_VALID;
	else {
		pteh |= HPTE_V_ABSENT;
		ptel &= ~(HPTE_R_KEY_HI | HPTE_R_KEY_LO);
	}

	/* If we had a host pte mapping then check WIMG */
	if (ptep && !hpte_cache_flags_ok(ptel, is_ci)) {
		if (is_ci)
			return H_PARAMETER;
		/*
		 * Allow guest to map emulated device memory as
		 * uncacheable, but actually make it cacheable.
		 */
		ptel &= ~(HPTE_R_W|HPTE_R_I|HPTE_R_G);
		ptel |= HPTE_R_M;
	}

	/* Find and lock the HPTEG slot to use */
 do_insert:
	if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt))
		return H_PARAMETER;
	if (likely((flags & H_EXACT) == 0)) {
		/* Scan the group of 8 entries containing pte_index. */
		pte_index &= ~7UL;
		hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4));
		for (i = 0; i < 8; ++i) {
			if ((be64_to_cpu(*hpte) & HPTE_V_VALID) == 0 &&
			    try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID |
					  HPTE_V_ABSENT))
				break;
			hpte += 2;
		}
		if (i == 8) {
			/*
			 * Since try_lock_hpte doesn't retry (not even stdcx.
			 * failures), it could be that there is a free slot
			 * but we transiently failed to lock it.  Try again,
			 * actually locking each slot and checking it.
			 */
			hpte -= 16;
			for (i = 0; i < 8; ++i) {
				u64 pte;
				while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
					cpu_relax();
				pte = be64_to_cpu(hpte[0]);
				if (!(pte & (HPTE_V_VALID | HPTE_V_ABSENT)))
					break;
				__unlock_hpte(hpte, pte);
				hpte += 2;
			}
			if (i == 8)
				return H_PTEG_FULL;
		}
		pte_index += i;
	} else {
		hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4));
		if (!try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID |
				   HPTE_V_ABSENT)) {
			/* Lock the slot and check again */
			u64 pte;

			while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
				cpu_relax();
			pte = be64_to_cpu(hpte[0]);
			if (pte & (HPTE_V_VALID | HPTE_V_ABSENT)) {
				__unlock_hpte(hpte, pte);
				return H_PTEG_FULL;
			}
		}
	}

	/* Save away the guest's idea of the second HPTE dword */
	rev = &kvm->arch.hpt.rev[pte_index];
	if (realmode)
		rev = real_vmalloc_addr(rev);
	if (rev) {
		rev->guest_rpte = g_ptel;
		note_hpte_modification(kvm, rev);
	}

	/* Link HPTE into reverse-map chain */
	if (pteh & HPTE_V_VALID) {
		if (realmode)
			rmap = real_vmalloc_addr(rmap);
		lock_rmap(rmap);
		/* Check for pending invalidations under the rmap chain lock */
		if (mmu_notifier_retry(kvm, mmu_seq)) {
			/* inval in progress, write a non-present HPTE */
			pteh |= HPTE_V_ABSENT;
			pteh &= ~HPTE_V_VALID;
			ptel &= ~(HPTE_R_KEY_HI | HPTE_R_KEY_LO);
			unlock_rmap(rmap);
		} else {
			kvmppc_add_revmap_chain(kvm, rev, rmap, pte_index,
						realmode);
			/* Only set R/C in real HPTE if already set in *rmap */
			rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT;
			ptel &= rcbits | ~(HPTE_R_R | HPTE_R_C);
		}
	}

	/* Convert to new format on P9 */
	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
		ptel = hpte_old_to_new_r(pteh, ptel);
		pteh = hpte_old_to_new_v(pteh);
	}
	/* The second dword is written first; eieio orders it before the unlock. */
	hpte[1] = cpu_to_be64(ptel);

	/* Write the first HPTE dword, unlocking the HPTE and making it valid */
	eieio();
	__unlock_hpte(hpte, pteh);
	asm volatile("ptesync" : : : "memory");

	*pte_idx_ret = pte_index;
	return H_SUCCESS;
}