/*
 * Install a special, zero-page backed PTE at @dst_addr in @dst_vma.
 *
 * Returns 0 on success, -EFAULT when the vma is file backed and @dst_addr
 * maps at or beyond the end of the file, and -EEXIST when the PTE slot is
 * already populated.  The PTE lock is taken and dropped inside this
 * function on every path.
 */
static int mfill_zeropage_pte(struct mm_struct *dst_mm,
			      pmd_t *dst_pmd,
			      struct vm_area_struct *dst_vma,
			      unsigned long dst_addr)
{
	spinlock_t *ptl;
	pte_t *ptep;
	pte_t zero_pte;
	int ret;

	/* Build the entry up front so nothing but the checks run locked. */
	zero_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
					 dst_vma->vm_page_prot));
	ptep = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);

	if (dst_vma->vm_file) {
		/* the shmem MAP_PRIVATE case requires checking the i_size */
		struct inode *inode = dst_vma->vm_file->f_inode;
		pgoff_t index = linear_page_index(dst_vma, dst_addr);
		pgoff_t nr_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);

		if (unlikely(index >= nr_pages)) {
			ret = -EFAULT;
			goto unlock;
		}
	}

	/* Never overwrite an entry that is already there. */
	if (!pte_none(*ptep)) {
		ret = -EEXIST;
		goto unlock;
	}

	set_pte_at(dst_mm, dst_addr, ptep, zero_pte);
	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, ptep);
	ret = 0;

unlock:
	pte_unmap_unlock(ptep, ptl);
	return ret;
}
/*
 * Fill @vec for the range [@addr, @end) of @vma, one byte per page.
 *
 * For a file-backed vma each byte is whatever mincore_page() reports for
 * the corresponding page offset in the file's mapping; for a vma without a
 * file every byte is set to 0.
 *
 * @vec must have room for (@end - @addr) >> PAGE_SHIFT entries.
 */
static void mincore_unmapped_range(struct vm_area_struct *vma,
				unsigned long addr, unsigned long end,
				unsigned char *vec)
{
	unsigned long nr = (end - addr) >> PAGE_SHIFT;
	/*
	 * Same type as nr: avoids a signed/unsigned comparison and cannot
	 * overflow before reaching nr.
	 */
	unsigned long i;

	if (vma->vm_file) {
		pgoff_t pgoff = linear_page_index(vma, addr);

		for (i = 0; i < nr; i++, pgoff++)
			vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff);
	} else {
		for (i = 0; i < nr; i++)
			vec[i] = 0;
	}
}
/**
 * __dax_pmd_fault - try to service a fault on a DAX file with a PMD mapping
 * @vma: The virtual memory area where the fault occurred
 * @address: The faulting virtual address
 * @pmd: The PMD entry covering @address
 * @flags: Fault flags; FAULT_FLAG_WRITE selects the write path
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @complete_unwritten: Called on the buffer_head when it is still marked
 *	unwritten at the common exit; not called on the early-return paths
 *
 * Returns VM_FAULT_FALLBACK when a PMD mapping cannot be used,
 * VM_FAULT_SIGBUS when the offset is beyond EOF or block lookup/mapping
 * fails, VM_FAULT_NOPAGE after installing the huge zero page, or the result
 * of vmf_insert_pfn_pmd() (possibly ORed with VM_FAULT_MAJOR).
 */
int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
		pmd_t *pmd, unsigned int flags, get_block_t get_block,
		dax_iodone_t complete_unwritten)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct buffer_head bh;
	unsigned blkbits = inode->i_blkbits;
	unsigned long pmd_addr = address & PMD_MASK;
	bool write = flags & FAULT_FLAG_WRITE;
	struct block_device *bdev;
	pgoff_t size, pgoff;
	sector_t block;
	int error, result = 0;
	bool alloc = false;

	/* dax pmd mappings require pfn_t_devmap() */
	if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
		return VM_FAULT_FALLBACK;

	/* Fall back to PTEs if we're going to COW */
	if (write && !(vma->vm_flags & VM_SHARED)) {
		split_huge_pmd(vma, pmd, address);
		dax_pmd_dbg(NULL, address, "cow write");
		return VM_FAULT_FALLBACK;
	}
	/* If the PMD would extend outside the VMA */
	if (pmd_addr < vma->vm_start) {
		dax_pmd_dbg(NULL, address, "vma start unaligned");
		return VM_FAULT_FALLBACK;
	}
	if ((pmd_addr + PMD_SIZE) > vma->vm_end) {
		dax_pmd_dbg(NULL, address, "vma end unaligned");
		return VM_FAULT_FALLBACK;
	}

	pgoff = linear_page_index(vma, pmd_addr);
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (pgoff >= size)
		return VM_FAULT_SIGBUS;
	/* If the PMD would cover blocks out of the file */
	if ((pgoff | PG_PMD_COLOUR) >= size) {
		dax_pmd_dbg(NULL, address,
				"offset + huge page size > file size");
		return VM_FAULT_FALLBACK;
	}

	memset(&bh, 0, sizeof(bh));
	bh.b_bdev = inode->i_sb->s_bdev;
	block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);

	bh.b_size = PMD_SIZE;

	/* Look the extent up first; only ask for allocation on a write. */
	if (get_block(inode, block, &bh, 0) != 0)
		return VM_FAULT_SIGBUS;

	if (!buffer_mapped(&bh) && write) {
		if (get_block(inode, block, &bh, 1) != 0)
			return VM_FAULT_SIGBUS;
		alloc = true;
	}

	bdev = bh.b_bdev;

	/*
	 * If the filesystem isn't willing to tell us the length of a hole,
	 * just fall back to PTEs.  Calling get_block 512 times in a loop
	 * would be silly.
	 */
	if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) {
		dax_pmd_dbg(&bh, address, "allocated block too small");
		return VM_FAULT_FALLBACK;
	}

	/*
	 * If we allocated new storage, make sure no process has any
	 * zero pages covering this hole
	 */
	if (alloc) {
		/*
		 * Widen before shifting: pgoff_t is unsigned long, so the
		 * shift would otherwise be truncated where that type is
		 * narrower than loff_t.
		 */
		loff_t lstart = (loff_t)pgoff << PAGE_SHIFT;
		loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */

		truncate_pagecache_range(inode, lstart, lend);
	}

	i_mmap_lock_read(mapping);

	/*
	 * If a truncate happened while we were allocating blocks, we may
	 * leave blocks allocated to the file that are beyond EOF.  We can't
	 * take i_mutex here, so just leave them hanging; they'll be freed
	 * when the file is deleted.
	 */
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (pgoff >= size) {
		result = VM_FAULT_SIGBUS;
		goto out;
	}
	if ((pgoff | PG_PMD_COLOUR) >= size) {
		dax_pmd_dbg(&bh, address,
				"offset + huge page size > file size");
		goto fallback;
	}

	if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
		/* Read of an unmapped, up-to-date extent: map the huge zero page. */
		spinlock_t *ptl;
		pmd_t entry;
		struct page *zero_page = get_huge_zero_page();

		if (unlikely(!zero_page)) {
			dax_pmd_dbg(&bh, address, "no zero page");
			goto fallback;
		}

		ptl = pmd_lock(vma->vm_mm, pmd);
		if (!pmd_none(*pmd)) {
			spin_unlock(ptl);
			dax_pmd_dbg(&bh, address, "pmd already present");
			goto fallback;
		}

		dev_dbg(part_to_dev(bdev->bd_part),
				"%s: %s addr: %lx pfn: <zero> sect: %llx\n",
				__func__, current->comm, address,
				(unsigned long long) to_sector(&bh, inode));

		entry = mk_pmd(zero_page, vma->vm_page_prot);
		entry = pmd_mkhuge(entry);
		set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
		result = VM_FAULT_NOPAGE;
		spin_unlock(ptl);
	} else {
		struct blk_dax_ctl dax = {
			.sector = to_sector(&bh, inode),
			.size = PMD_SIZE,
		};
		long length = dax_map_atomic(bdev, &dax);

		/* A failed map has nothing to unmap; later exits must unmap. */
		if (length < 0) {
			result = VM_FAULT_SIGBUS;
			goto out;
		}
		if (length < PMD_SIZE) {
			dax_pmd_dbg(&bh, address, "dax-length too small");
			dax_unmap_atomic(bdev, &dax);
			goto fallback;
		}
		if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) {
			dax_pmd_dbg(&bh, address, "pfn unaligned");
			dax_unmap_atomic(bdev, &dax);
			goto fallback;
		}

		if (!pfn_t_devmap(dax.pfn)) {
			dax_unmap_atomic(bdev, &dax);
			dax_pmd_dbg(&bh, address, "pfn not in memmap");
			goto fallback;
		}

		/* Zero new or unwritten extents before they become visible. */
		if (buffer_unwritten(&bh) || buffer_new(&bh)) {
			clear_pmem(dax.addr, PMD_SIZE);
			wmb_pmem();
			count_vm_event(PGMAJFAULT);
			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
			result |= VM_FAULT_MAJOR;
		}
		dax_unmap_atomic(bdev, &dax);

		/*
		 * For PTE faults we insert a radix tree entry for reads, and
		 * leave it clean.  Then on the first write we dirty the radix
		 * tree entry via the dax_pfn_mkwrite() path.  This sequence
		 * allows the dax_pfn_mkwrite() call to be simpler and avoid a
		 * call into get_block() to translate the pgoff to a sector in
		 * order to be able to create a new radix tree entry.
		 *
		 * The PMD path doesn't have an equivalent to
		 * dax_pfn_mkwrite(), though, so for a read followed by a
		 * write we traverse all the way through __dax_pmd_fault()
		 * twice.  This means we can just skip inserting a radix tree
		 * entry completely on the initial read and just wait until
		 * the write to insert a dirty entry.
		 */
		if (write) {
			error = dax_radix_entry(mapping, pgoff, dax.sector,
					true, true);
			if (error) {
				dax_pmd_dbg(&bh, address,
						"PMD radix insertion failed");
				goto fallback;
			}
		}

		dev_dbg(part_to_dev(bdev->bd_part),
				"%s: %s addr: %lx pfn: %lx sect: %llx\n",
				__func__, current->comm, address,
				pfn_t_to_pfn(dax.pfn),
				(unsigned long long) dax.sector);
		result |= vmf_insert_pfn_pmd(vma, address, pmd,
				dax.pfn, write);
	}

 out:
	i_mmap_unlock_read(mapping);

	/* Report success to the filesystem only if no error bit is set. */
	if (buffer_unwritten(&bh))
		complete_unwritten(&bh, !(result & VM_FAULT_ERROR));

	return result;

 fallback:
	count_vm_event(THP_FAULT_FALLBACK);
	result = VM_FAULT_FALLBACK;
	goto out;
}
EXPORT_SYMBOL_GPL(__dax_pmd_fault);

/**
 * dax_pmd_fault - handle a PMD fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @address: The faulting virtual address
 * @pmd: The PMD entry covering @address
 * @flags: Fault flags; FAULT_FLAG_WRITE marks a write fault
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @complete_unwritten: Passed through unchanged to __dax_pmd_fault()
 *
 * When a page fault occurs, filesystems may call this helper in their
 * pmd_fault handler for DAX files.  For write faults the call to
 * __dax_pmd_fault() is bracketed by sb_start_pagefault() and
 * sb_end_pagefault(), and the file times are updated first.
 *
 * Returns the value produced by __dax_pmd_fault().
 */
int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
			pmd_t *pmd, unsigned int flags, get_block_t get_block,
			dax_iodone_t complete_unwritten)
{
	int result;
	struct super_block *sb = file_inode(vma->vm_file)->i_sb;

	if (flags & FAULT_FLAG_WRITE) {
		sb_start_pagefault(sb);
		file_update_time(vma->vm_file);
	}
	result = __dax_pmd_fault(vma, address, pmd, flags, get_block,
				complete_unwritten);
	if (flags & FAULT_FLAG_WRITE)
		sb_end_pagefault(sb);

	return result;
}
/*
 * Map one page holding a copy of the user data at @src_addr into
 * @dst_vma at @dst_addr.
 *
 * If *@pagep is NULL a page is allocated and filled here.  Should that
 * copy come up short, the page is handed back through *@pagep and -ENOENT
 * is returned so the copy can be redone by the caller.  If *@pagep is
 * non-NULL it is taken as the already prepared page and *@pagep is
 * cleared.
 *
 * Returns 0 on success, -ENOMEM on allocation or memcg charge failure,
 * -ENOENT as described above, -EFAULT when the vma is file backed and
 * @dst_addr lies at or beyond the end of the file, and -EEXIST when the
 * PTE slot is already populated.
 */
static int mcopy_atomic_pte(struct mm_struct *dst_mm,
			    pmd_t *dst_pmd,
			    struct vm_area_struct *dst_vma,
			    unsigned long dst_addr,
			    unsigned long src_addr,
			    struct page **pagep)
{
	struct page *page = *pagep;
	struct mem_cgroup *memcg;
	spinlock_t *ptl;
	pte_t *ptep;
	pte_t new_pte;
	int ret;

	if (page) {
		/* Take over the page prepared by an earlier attempt. */
		*pagep = NULL;
	} else {
		unsigned long uncopied;
		void *kaddr;

		page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, dst_vma, dst_addr);
		if (!page)
			return -ENOMEM;

		kaddr = kmap_atomic(page);
		uncopied = copy_from_user(kaddr,
					  (const void __user *) src_addr,
					  PAGE_SIZE);
		kunmap_atomic(kaddr);

		/* fallback to copy_from_user outside mmap_sem */
		if (unlikely(uncopied)) {
			/* don't free the page */
			*pagep = page;
			return -ENOENT;
		}
	}

	/*
	 * The memory barrier inside __SetPageUptodate makes sure that
	 * preceding stores to the page contents become visible before
	 * the set_pte_at() write.
	 */
	__SetPageUptodate(page);

	if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg, false)) {
		ret = -ENOMEM;
		goto release;
	}

	new_pte = mk_pte(page, dst_vma->vm_page_prot);
	if (dst_vma->vm_flags & VM_WRITE)
		new_pte = pte_mkwrite(pte_mkdirty(new_pte));

	ptep = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);

	if (dst_vma->vm_file) {
		/* the shmem MAP_PRIVATE case requires checking the i_size */
		struct inode *inode = dst_vma->vm_file->f_inode;
		pgoff_t index = linear_page_index(dst_vma, dst_addr);
		pgoff_t nr_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);

		if (unlikely(index >= nr_pages)) {
			ret = -EFAULT;
			goto unlock_uncharge;
		}
	}

	/* Never overwrite an entry that is already there. */
	if (!pte_none(*ptep)) {
		ret = -EEXIST;
		goto unlock_uncharge;
	}

	inc_mm_counter(dst_mm, MM_ANONPAGES);
	page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
	mem_cgroup_commit_charge(page, memcg, false, false);
	lru_cache_add_active_or_unevictable(page, dst_vma);

	set_pte_at(dst_mm, dst_addr, ptep, new_pte);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, ptep);

	pte_unmap_unlock(ptep, ptl);
	return 0;

unlock_uncharge:
	pte_unmap_unlock(ptep, ptl);
	mem_cgroup_cancel_charge(page, memcg, false);
release:
	put_page(page);
	return ret;
}
/*
 * __mcopy_atomic processing for HUGETLB vmas.  Note that this routine is
 * called with mmap_sem held, it will release mmap_sem before returning.
 *
 * @dst_mm:    mm whose mmap_sem is held for read on entry
 * @dst_vma:   destination vma on entry; looked up again after each retry
 * @dst_start: first destination address, must be huge page aligned
 * @src_start: first source (user) address
 * @len:       number of bytes to copy, must be a multiple of the huge page
 *             size of @dst_vma
 * @zeropage:  zero-page requests are rejected with -EINVAL
 *
 * Returns the number of bytes copied if that is non-zero, otherwise a
 * negative errno.
 */
static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
					      struct vm_area_struct *dst_vma,
					      unsigned long dst_start,
					      unsigned long src_start,
					      unsigned long len,
					      bool zeropage)
{
	/*
	 * vm_alloc_shared tracks the sharing mode of the vma used for the
	 * most recent page allocation attempt; vm_shared tracks the vma
	 * currently in use.  They can differ after a retry.
	 */
	int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED;
	int vm_shared = dst_vma->vm_flags & VM_SHARED;
	ssize_t err;
	pte_t *dst_pte;
	unsigned long src_addr, dst_addr;
	long copied;
	struct page *page;
	struct hstate *h;
	unsigned long vma_hpagesize;
	pgoff_t idx;
	u32 hash;
	struct address_space *mapping;

	/*
	 * There is no default zero huge page for all huge page sizes as
	 * supported by hugetlb.  A PMD_SIZE huge pages may exist as used
	 * by THP.  Since we can not reliably insert a zero page, this
	 * feature is not supported.
	 */
	if (zeropage) {
		up_read(&dst_mm->mmap_sem);
		return -EINVAL;
	}

	src_addr = src_start;
	dst_addr = dst_start;
	copied = 0;
	page = NULL;
	vma_hpagesize = vma_kernel_pagesize(dst_vma);

	/*
	 * Validate alignment based on huge page size
	 */
	err = -EINVAL;
	if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1))
		goto out_unlock;

retry:
	/*
	 * On routine entry dst_vma is set.  If we had to drop mmap_sem and
	 * retry, dst_vma will be set to NULL and we must lookup again.
	 */
	if (!dst_vma) {
		err = -ENOENT;
		dst_vma = find_vma(dst_mm, dst_start);
		if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
			goto out_unlock;
		/*
		 * Check the vma is registered in uffd, this is
		 * required to enforce the VM_MAYWRITE check done at
		 * uffd registration time.
		 */
		if (!dst_vma->vm_userfaultfd_ctx.ctx)
			goto out_unlock;

		/* The whole destination range must still fit in this vma. */
		if (dst_start < dst_vma->vm_start ||
		    dst_start + len > dst_vma->vm_end)
			goto out_unlock;

		/* The huge page size must match the one validated on entry. */
		err = -EINVAL;
		if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
			goto out_unlock;

		vm_shared = dst_vma->vm_flags & VM_SHARED;
	}

	/* Progress so far must have kept everything huge page aligned. */
	if (WARN_ON(dst_addr & (vma_hpagesize - 1) ||
		    (len - copied) & (vma_hpagesize - 1)))
		goto out_unlock;

	/*
	 * If not shared, ensure the dst_vma has an anon_vma.
	 */
	err = -ENOMEM;
	if (!vm_shared) {
		if (unlikely(anon_vma_prepare(dst_vma)))
			goto out_unlock;
	}

	h = hstate_vma(dst_vma);

	while (src_addr < src_start + len) {
		pte_t dst_pteval;

		BUG_ON(dst_addr >= dst_start + len);
		VM_BUG_ON(dst_addr & ~huge_page_mask(h));

		/*
		 * Serialize via hugetlb_fault_mutex
		 */
		idx = linear_page_index(dst_vma, dst_addr);
		mapping = dst_vma->vm_file->f_mapping;
		hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping,
								idx, dst_addr);
		mutex_lock(&hugetlb_fault_mutex_table[hash]);

		err = -ENOMEM;
		dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h));
		if (!dst_pte) {
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			goto out_unlock;
		}

		/* Refuse to overwrite an entry that is already populated. */
		err = -EEXIST;
		dst_pteval = huge_ptep_get(dst_pte);
		if (!huge_pte_none(dst_pteval)) {
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			goto out_unlock;
		}

		err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma,
						dst_addr, src_addr, &page);

		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
		/* Remember which kind of vma this attempt was made against. */
		vm_alloc_shared = vm_shared;

		cond_resched();

		if (unlikely(err == -ENOENT)) {
			/*
			 * -ENOENT comes back with a page in hand (see the
			 * BUG_ON below): fill it from user memory with
			 * mmap_sem dropped, then re-take the lock and retry
			 * with a fresh vma lookup.
			 */
			up_read(&dst_mm->mmap_sem);
			BUG_ON(!page);

			err = copy_huge_page_from_user(page,
						(const void __user *)src_addr,
						pages_per_huge_page(h), true);
			if (unlikely(err)) {
				err = -EFAULT;
				/* mmap_sem is already released here. */
				goto out;
			}
			down_read(&dst_mm->mmap_sem);

			dst_vma = NULL;
			goto retry;
		} else
			BUG_ON(page);

		if (!err) {
			dst_addr += vma_hpagesize;
			src_addr += vma_hpagesize;
			copied += vma_hpagesize;

			if (fatal_signal_pending(current))
				err = -EINTR;
		}
		if (err)
			break;
	}

out_unlock:
	up_read(&dst_mm->mmap_sem);
out:
	if (page) {
		/*
		 * We encountered an error and are about to free a newly
		 * allocated huge page.
		 *
		 * Reservation handling is very subtle, and is different for
		 * private and shared mappings.  See the routine
		 * restore_reserve_on_error for details.  Unfortunately, we
		 * can not call restore_reserve_on_error now as it would
		 * require holding mmap_sem.
		 *
		 * If a reservation for the page existed in the reservation
		 * map of a private mapping, the map was modified to indicate
		 * the reservation was consumed when the page was allocated.
		 * We clear the PagePrivate flag now so that the global
		 * reserve count will not be incremented in free_huge_page.
		 * The reservation map will still indicate the reservation
		 * was consumed and possibly prevent later page allocation.
		 * This is better than leaking a global reservation.  If no
		 * reservation existed, it is still safe to clear PagePrivate
		 * as no adjustments to reservation counts were made during
		 * allocation.
		 *
		 * The reservation map for shared mappings indicates which
		 * pages have reservations.  When a huge page is allocated
		 * for an address with a reservation, no change is made to
		 * the reserve map.  In this case PagePrivate will be set
		 * to indicate that the global reservation count should be
		 * incremented when the page is freed.  This is the desired
		 * behavior.  However, when a huge page is allocated for an
		 * address without a reservation a reservation entry is added
		 * to the reservation map, and PagePrivate will not be set.
		 * When the page is freed, the global reserve count will NOT
		 * be incremented and it will appear as though we have leaked
		 * a reserved page.  In this case, set PagePrivate so that the
		 * global reserve count will be incremented to match the
		 * reservation map entry which was created.
		 *
		 * Note that vm_alloc_shared is based on the flags of the vma
		 * for which the page was originally allocated.  dst_vma could
		 * be different or NULL on error.
		 */
		if (vm_alloc_shared)
			SetPagePrivate(page);
		else
			ClearPagePrivate(page);
		put_page(page);
	}
	/* Sanity: a byte count or a negative errno, never neither. */
	BUG_ON(copied < 0);
	BUG_ON(err > 0);
	BUG_ON(!copied && !err);
	return copied ? copied : err;
}