int msm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct drm_gem_object *obj = vma->vm_private_data;
	struct drm_device *dev = obj->dev;
	struct page **pages;
	unsigned long pfn;
	pgoff_t pgoff;
	int ret;

	/*
	 * Make sure we don't parallel update on a fault, nor move or remove
	 * something from beneath our feet.
	 */
	ret = mutex_lock_interruptible(&dev->struct_mutex);
	if (ret)
		goto out;

	/* make sure we have pages attached now */
	pages = get_pages(obj);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		goto out_unlock;
	}

	/* We don't use vmf->pgoff since that has the fake offset: */
	pgoff = ((unsigned long)vmf->virtual_address -
			vma->vm_start) >> PAGE_SHIFT;

	pfn = page_to_pfn(pages[pgoff]);

	VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address,
			pfn, pfn << PAGE_SHIFT);

	ret = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address,
			__pfn_to_pfn_t(pfn, PFN_DEV));

out_unlock:
	mutex_unlock(&dev->struct_mutex);
out:
	switch (ret) {
	case -EAGAIN:
	case 0:
	case -ERESTARTSYS:
	case -EINTR:
	case -EBUSY:
		/*
		 * EBUSY is ok: this just means that another thread
		 * already did the job.
		 */
		return VM_FAULT_NOPAGE;
	case -ENOMEM:
		return VM_FAULT_OOM;
	default:
		return VM_FAULT_SIGBUS;
	}
}
int simple_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	size_t size = vma->vm_end - vma->vm_start;
	int i;
	int ret;
	struct page *map_page;
	char *ptr;

	printk(KERN_NOTICE "vmf:fault=%p flag=%x offset=%lx\n",
	       vmf->virtual_address, vmf->flags, vmf->pgoff);
	printk(KERN_NOTICE "vma:start=0x%lx size=%x pgoff=%lx\n",
	       vma->vm_start, (int)size, vma->vm_pgoff);

	vma->vm_flags |= VM_MIXEDMAP;

	map_page = data_pages[vmf->pgoff];
	ret = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address,
			      page_to_pfn(map_page));

	printk(KERN_NOTICE "ret=%d pfn=%x vaddr=0x%lx cnt=%d\n",
	       ret, (int)page_to_pfn(map_page),
	       (unsigned long)vmf->virtual_address, page_count(map_page));

	/*
	 * Example of "direct I/O": write into the page through a kernel
	 * mapping, so the change is also visible through the user mapping.
	 */
	ptr = kmap_atomic(map_page);
	for (i = 0; i < 100; i++)
		ptr[i] = (unsigned char)vmf->pgoff;
	kunmap_atomic(ptr);
	SetPageDirty(map_page);

	return VM_FAULT_NOPAGE;
}
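For context, here is a minimal sketch of how a handler like simple_vma_fault() is typically registered; this is not part of the original example. The names simple_vma_ops, simple_mmap and simple_fops are assumptions for illustration, and the pre-4.11 fault-handler signature used above is assumed throughout.

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/module.h>

/* Resolve faults one page at a time with simple_vma_fault() above. */
static const struct vm_operations_struct simple_vma_ops = {
	.fault = simple_vma_fault,
};

/* The driver's mmap() method only records vm_ops and the VMA flags;
 * no pages are inserted until the first fault. */
static int simple_mmap(struct file *filp, struct vm_area_struct *vma)
{
	vma->vm_flags |= VM_MIXEDMAP | VM_DONTEXPAND | VM_DONTDUMP;
	vma->vm_ops = &simple_vma_ops;
	return 0;
}

static const struct file_operations simple_fops = {
	.owner = THIS_MODULE,
	.mmap  = simple_mmap,
};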
int MMapPMR(struct file *pFile, struct vm_area_struct *ps_vma)
{
	PVRSRV_ERROR eError;
	IMG_HANDLE hSecurePMRHandle;
	IMG_SIZE_T uiLength;
	IMG_DEVMEM_OFFSET_T uiOffset;
	unsigned long uiPFN;
	IMG_HANDLE hPMRResmanHandle;
	PMR *psPMR;
	PMR_FLAGS_T ulPMRFlags;
	IMG_UINT32 ui32CPUCacheFlags;
	unsigned long ulNewFlags = 0;
	pgprot_t sPageProt;
#if defined(SUPPORT_DRM)
	CONNECTION_DATA *psConnection = LinuxConnectionFromFile(PVR_DRM_FILE_FROM_FILE(pFile));
#else
	CONNECTION_DATA *psConnection = LinuxConnectionFromFile(pFile);
#endif
#if defined(PVR_MMAP_USE_VM_INSERT)
	IMG_BOOL bMixedMap = IMG_FALSE;
#endif

	/*
	 * The PMR lock is used here to protect both handle-related operations
	 * and PMR operations. This was introduced to fix a lockdep issue.
	 */
	mutex_lock(&g_sMMapMutex);
	PMRLock();

#if defined(SUPPORT_DRM_DC_MODULE)
	psPMR = PVRSRVGEMMMapLookupPMR(pFile, ps_vma);
	if (!psPMR)
#endif
	{
		hSecurePMRHandle = (IMG_HANDLE)((IMG_UINTPTR_T)ps_vma->vm_pgoff);

		eError = PVRSRVLookupHandle(psConnection->psHandleBase,
					    (IMG_HANDLE *)&hPMRResmanHandle,
					    hSecurePMRHandle,
					    PVRSRV_HANDLE_TYPE_PHYSMEM_PMR);
		if (eError != PVRSRV_OK)
		{
			goto e0;
		}

		eError = ResManFindPrivateDataByPtr(hPMRResmanHandle,
						    (void **)&psPMR);
		if (eError != PVRSRV_OK)
		{
			goto e0;
		}
	}

	/*
	 * Take a reference on the PMR; this makes sure that it can't be freed
	 * while it's mapped into the user process.
	 */
	PMRRefPMR(psPMR);

	PMRUnlock();

	eError = PMRLockSysPhysAddresses(psPMR, PAGE_SHIFT);
	if (eError != PVRSRV_OK)
	{
		goto e1;
	}

	if (((ps_vma->vm_flags & VM_WRITE) != 0) &&
	    ((ps_vma->vm_flags & VM_SHARED) == 0))
	{
		eError = PVRSRV_ERROR_INVALID_PARAMS;
		goto e1;
	}

	/*
	 * We ought to call PMR_Flags() here to check the permissions against
	 * the requested mode, and possibly to set up the cache control
	 * protflags.
	 */
	eError = PMR_Flags(psPMR, &ulPMRFlags);
	if (eError != PVRSRV_OK)
	{
		goto e1;
	}

	ulNewFlags = ps_vma->vm_flags;
#if 0
	/* Discard user read/write requests; we will pull these flags from the PMR */
	ulNewFlags &= ~(VM_READ | VM_WRITE);

	if (ulPMRFlags & PVRSRV_MEMALLOCFLAG_CPU_READABLE)
	{
		ulNewFlags |= VM_READ;
	}
	if (ulPMRFlags & PVRSRV_MEMALLOCFLAG_CPU_WRITEABLE)
	{
		ulNewFlags |= VM_WRITE;
	}
#endif

	ps_vma->vm_flags = ulNewFlags;

#if defined(CONFIG_ARM64)
	sPageProt = __pgprot_modify(ps_vma->vm_page_prot, 0,
				    vm_get_page_prot(ulNewFlags));
#elif defined(CONFIG_ARM)
	sPageProt = __pgprot_modify(ps_vma->vm_page_prot, L_PTE_MT_MASK,
				    vm_get_page_prot(ulNewFlags));
#elif defined(CONFIG_X86)
	sPageProt = pgprot_modify(ps_vma->vm_page_prot,
				  vm_get_page_prot(ulNewFlags));
#elif defined(CONFIG_METAG) || defined(CONFIG_MIPS)
	sPageProt = vm_get_page_prot(ulNewFlags);
#else
#error Please add pgprot_modify equivalent for your system
#endif

	ui32CPUCacheFlags = DevmemCPUCacheMode(ulPMRFlags);
	switch (ui32CPUCacheFlags)
	{
	case PVRSRV_MEMALLOCFLAG_CPU_UNCACHED:
		sPageProt = pgprot_noncached(sPageProt);
		break;

	case PVRSRV_MEMALLOCFLAG_CPU_WRITE_COMBINE:
		sPageProt = pgprot_writecombine(sPageProt);
		break;

	case PVRSRV_MEMALLOCFLAG_CPU_CACHED:
		break;

	default:
		eError = PVRSRV_ERROR_INVALID_PARAMS;
		goto e1;
	}

	ps_vma->vm_page_prot = sPageProt;

	uiLength = ps_vma->vm_end - ps_vma->vm_start;

	ps_vma->vm_flags |= VM_IO;

	/* Don't include the mapping in core dumps */
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 7, 0))
	ps_vma->vm_flags |= VM_DONTDUMP;
#else
	ps_vma->vm_flags |= VM_RESERVED;
#endif

	/*
	 * Disable mremap because our nopage handler assumes all
	 * page requests have already been validated.
	 */
	ps_vma->vm_flags |= VM_DONTEXPAND;

	/* Don't allow the mapping to be inherited across a process fork */
	ps_vma->vm_flags |= VM_DONTCOPY;

#if defined(PVR_MMAP_USE_VM_INSERT)
	{
		/*
		 * Scan the map range for PFNs that have no struct page
		 * handling. If we find one, this is a mixed map and we
		 * can't use vm_insert_page().
		 */
		for (uiOffset = 0; uiOffset < uiLength; uiOffset += 1ULL << PAGE_SHIFT)
		{
			IMG_CPU_PHYADDR sCpuPAddr;
			IMG_BOOL bValid;

			eError = PMR_CpuPhysAddr(psPMR, uiOffset, &sCpuPAddr, &bValid);
			PVR_ASSERT(eError == PVRSRV_OK);
			if (eError)
			{
				goto e2;
			}

			if (bValid)
			{
				uiPFN = sCpuPAddr.uiAddr >> PAGE_SHIFT;
				PVR_ASSERT(((IMG_UINT64)uiPFN << PAGE_SHIFT) == sCpuPAddr.uiAddr);

				if (!pfn_valid(uiPFN) ||
				    page_count(pfn_to_page(uiPFN)) == 0)
				{
					bMixedMap = IMG_TRUE;
				}
			}
		}

		if (bMixedMap)
		{
			ps_vma->vm_flags |= VM_MIXEDMAP;
		}
	}
#endif /* defined(PVR_MMAP_USE_VM_INSERT) */

	for (uiOffset = 0; uiOffset < uiLength; uiOffset += 1ULL << PAGE_SHIFT)
	{
		IMG_SIZE_T uiNumContiguousBytes;
		IMG_INT32 iStatus;
		IMG_CPU_PHYADDR sCpuPAddr;
		IMG_BOOL bValid;

		uiNumContiguousBytes = 1ULL << PAGE_SHIFT;
		eError = PMR_CpuPhysAddr(psPMR, uiOffset, &sCpuPAddr, &bValid);
		PVR_ASSERT(eError == PVRSRV_OK);
		if (eError)
		{
			goto e2;
		}

		/*
		 * Only map in pages that are valid; any that aren't will be
		 * picked up by the nopage handler, which will return a zeroed
		 * page for us.
		 */
		if (bValid)
		{
			uiPFN = sCpuPAddr.uiAddr >> PAGE_SHIFT;
			PVR_ASSERT(((IMG_UINT64)uiPFN << PAGE_SHIFT) == sCpuPAddr.uiAddr);

#if defined(PVR_MMAP_USE_VM_INSERT)
			if (bMixedMap)
			{
				/*
				 * This path is just for debugging. It should
				 * be equivalent to the remap_pfn_range() path.
				 */
				iStatus = vm_insert_mixed(ps_vma,
							  ps_vma->vm_start + uiOffset,
							  uiPFN);
			}
			else
			{
				iStatus = vm_insert_page(ps_vma,
							 ps_vma->vm_start + uiOffset,
							 pfn_to_page(uiPFN));
			}
#else /* defined(PVR_MMAP_USE_VM_INSERT) */
			iStatus = remap_pfn_range(ps_vma,
						  ps_vma->vm_start + uiOffset,
						  uiPFN,
						  uiNumContiguousBytes,
						  ps_vma->vm_page_prot);
#endif /* defined(PVR_MMAP_USE_VM_INSERT) */

			PVR_ASSERT(iStatus == 0);
			if (iStatus)
			{
				/* N.B. not the right error code, but it doesn't get propagated anyway... :( */
				eError = PVRSRV_ERROR_OUT_OF_MEMORY;
				goto e2;
			}

#if defined(PVRSRV_ENABLE_PROCESS_STATS)
			/* USER MAPPING */
#if !defined(PVRSRV_ENABLE_MEMORY_STATS)
			PVRSRVStatsIncrMemAllocStat(PVRSRV_MEM_ALLOC_TYPE_MAP_UMA_LMA_PAGES, PAGE_SIZE);
#else
			PVRSRVStatsAddMemAllocRecord(PVRSRV_MEM_ALLOC_TYPE_MAP_UMA_LMA_PAGES,
						     (IMG_VOID *)(IMG_UINTPTR_T)(ps_vma->vm_start + uiOffset),
						     sCpuPAddr,
						     PAGE_SIZE,
						     IMG_NULL);
#endif
#endif
		}
		(void)pFile;
	}

	/* Remember the PMR so we can unlock it later */
	ps_vma->vm_private_data = psPMR;

	/* Install open and close handlers for ref-counting */
	ps_vma->vm_ops = &gsMMapOps;

	mutex_unlock(&g_sMMapMutex);

	return 0;

	/* error exit paths follow */
e2:
	PVR_DPF((PVR_DBG_ERROR, "don't know how to handle this error. Abort!"));
	PMRUnlockSysPhysAddresses(psPMR);
e1:
	PMRUnrefPMR(psPMR);
	goto em1;
e0:
	PVR_DPF((PVR_DBG_ERROR, "Error in MMapPMR critical section"));
	PMRUnlock();
em1:
	PVR_ASSERT(eError != PVRSRV_OK);
	PVR_DPF((PVR_DBG_ERROR, "unable to translate error %d", eError));
	mutex_unlock(&g_sMMapMutex);

	return -ENOENT; /* or -EAGAIN? or what? */
}
static int copy_user_bh(struct page *to, struct inode *inode,
		struct buffer_head *bh, unsigned long vaddr)
{
	struct blk_dax_ctl dax = {
		.sector = to_sector(bh, inode),
		.size = bh->b_size,
	};
	struct block_device *bdev = bh->b_bdev;
	void *vto;

	if (dax_map_atomic(bdev, &dax) < 0)
		return PTR_ERR(dax.addr);
	vto = kmap_atomic(to);
	copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
	kunmap_atomic(vto);
	dax_unmap_atomic(bdev, &dax);
	return 0;
}

#define NO_SECTOR -1
#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT))

static int dax_radix_entry(struct address_space *mapping, pgoff_t index,
		sector_t sector, bool pmd_entry, bool dirty)
{
	struct radix_tree_root *page_tree = &mapping->page_tree;
	pgoff_t pmd_index = DAX_PMD_INDEX(index);
	int type, error = 0;
	void *entry;

	WARN_ON_ONCE(pmd_entry && !dirty);

	if (dirty)
		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

	spin_lock_irq(&mapping->tree_lock);

	entry = radix_tree_lookup(page_tree, pmd_index);
	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) {
		index = pmd_index;
		goto dirty;
	}

	entry = radix_tree_lookup(page_tree, index);
	if (entry) {
		type = RADIX_DAX_TYPE(entry);
		if (WARN_ON_ONCE(type != RADIX_DAX_PTE &&
					type != RADIX_DAX_PMD)) {
			error = -EIO;
			goto unlock;
		}

		if (!pmd_entry || type == RADIX_DAX_PMD)
			goto dirty;

		/*
		 * We only insert dirty PMD entries into the radix tree.  This
		 * means we don't need to worry about removing a dirty PTE
		 * entry and inserting a clean PMD entry, thus reducing the
		 * range we would flush with a follow-up fsync/msync call.
		 */
		radix_tree_delete(&mapping->page_tree, index);
		mapping->nrexceptional--;
	}

	if (sector == NO_SECTOR) {
		/*
		 * This can happen during correct operation if our pfn_mkwrite
		 * fault raced against a hole punch operation.  If this
		 * happens the pte that was hole punched will have been
		 * unmapped and the radix tree entry will have been removed by
		 * the time we are called, but the call will still happen.  We
		 * will return all the way up to wp_pfn_shared(), where the
		 * pte_same() check will fail, eventually causing the page
		 * fault to be retried by the CPU.
		 */
		goto unlock;
	}

	error = radix_tree_insert(page_tree, index,
			RADIX_DAX_ENTRY(sector, pmd_entry));
	if (error)
		goto unlock;

	mapping->nrexceptional++;
 dirty:
	if (dirty)
		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
 unlock:
	spin_unlock_irq(&mapping->tree_lock);
	return error;
}

static int dax_writeback_one(struct block_device *bdev,
		struct address_space *mapping, pgoff_t index, void *entry)
{
	struct radix_tree_root *page_tree = &mapping->page_tree;
	int type = RADIX_DAX_TYPE(entry);
	struct radix_tree_node *node;
	struct blk_dax_ctl dax;
	void **slot;
	int ret = 0;

	spin_lock_irq(&mapping->tree_lock);
	/*
	 * Regular page slots are stabilized by the page lock even
	 * without the tree itself locked.  These unlocked entries
	 * need verification under the tree lock.
	 */
	if (!__radix_tree_lookup(page_tree, index, &node, &slot))
		goto unlock;
	if (*slot != entry)
		goto unlock;

	/* another fsync thread may have already written back this entry */
	if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
		goto unlock;

	if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) {
		ret = -EIO;
		goto unlock;
	}

	dax.sector = RADIX_DAX_SECTOR(entry);
	dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE);
	spin_unlock_irq(&mapping->tree_lock);

	/*
	 * We cannot hold tree_lock while calling dax_map_atomic() because it
	 * eventually calls cond_resched().
	 */
	ret = dax_map_atomic(bdev, &dax);
	if (ret < 0)
		return ret;

	if (WARN_ON_ONCE(ret < dax.size)) {
		ret = -EIO;
		goto unmap;
	}

	wb_cache_pmem(dax.addr, dax.size);

	spin_lock_irq(&mapping->tree_lock);
	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
	spin_unlock_irq(&mapping->tree_lock);
 unmap:
	dax_unmap_atomic(bdev, &dax);
	return ret;

 unlock:
	spin_unlock_irq(&mapping->tree_lock);
	return ret;
}

/*
 * Flush the mapping to the persistent domain within the byte range of [start,
 * end]. This is required by data integrity operations to ensure file data is
 * on persistent storage prior to completion of the operation.
 */
int dax_writeback_mapping_range(struct address_space *mapping,
		struct block_device *bdev, struct writeback_control *wbc)
{
	struct inode *inode = mapping->host;
	pgoff_t start_index, end_index, pmd_index;
	pgoff_t indices[PAGEVEC_SIZE];
	struct pagevec pvec;
	bool done = false;
	int i, ret = 0;
	void *entry;

	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
		return -EIO;

	if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
		return 0;

	start_index = wbc->range_start >> PAGE_SHIFT;
	end_index = wbc->range_end >> PAGE_SHIFT;
	pmd_index = DAX_PMD_INDEX(start_index);

	rcu_read_lock();
	entry = radix_tree_lookup(&mapping->page_tree, pmd_index);
	rcu_read_unlock();

	/* see if the start of our range is covered by a PMD entry */
	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
		start_index = pmd_index;

	tag_pages_for_writeback(mapping, start_index, end_index);

	pagevec_init(&pvec, 0);
	while (!done) {
		pvec.nr = find_get_entries_tag(mapping, start_index,
				PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
				pvec.pages, indices);

		if (pvec.nr == 0)
			break;

		for (i = 0; i < pvec.nr; i++) {
			if (indices[i] > end_index) {
				done = true;
				break;
			}

			ret = dax_writeback_one(bdev, mapping, indices[i],
					pvec.pages[i]);
			if (ret < 0)
				return ret;
		}
	}
	wmb_pmem();
	return 0;
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);

static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
			struct vm_area_struct *vma, struct vm_fault *vmf)
{
	unsigned long vaddr = (unsigned long)vmf->virtual_address;
	struct address_space *mapping = inode->i_mapping;
	struct block_device *bdev = bh->b_bdev;
	struct blk_dax_ctl dax = {
		.sector = to_sector(bh, inode),
		.size = bh->b_size,
	};
	pgoff_t size;
	int error;

	i_mmap_lock_read(mapping);

	/*
	 * Check truncate didn't happen while we were allocating a block.
	 * If it did, this block may or may not be still allocated to the
	 * file.  We can't tell the filesystem to free it because we can't
	 * take i_mutex here.  In the worst case, the file still has blocks
	 * allocated past the end of the file.
	 */
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (unlikely(vmf->pgoff >= size)) {
		error = -EIO;
		goto out;
	}

	if (dax_map_atomic(bdev, &dax) < 0) {
		error = PTR_ERR(dax.addr);
		goto out;
	}

	if (buffer_unwritten(bh) || buffer_new(bh)) {
		clear_pmem(dax.addr, PAGE_SIZE);
		wmb_pmem();
	}
	dax_unmap_atomic(bdev, &dax);

	error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false,
			vmf->flags & FAULT_FLAG_WRITE);
	if (error)
		goto out;

	error = vm_insert_mixed(vma, vaddr, dax.pfn);

 out:
	i_mmap_unlock_read(mapping);

	return error;
}

/**
 * __dax_fault - handle a page fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @complete_unwritten: The filesystem method used to convert unwritten blocks
 *	to written so the data written to them is exposed.  This is required
 *	by write faults for filesystems that will return unwritten extent
 *	mappings from @get_block, but it is optional for reads as
 *	dax_insert_mapping() will always zero unwritten blocks.  If the fs
 *	does not support unwritten extents, then it should pass NULL.
 *
 * When a page fault occurs, filesystems may call this helper in their
 * fault handler for DAX files. __dax_fault() assumes the caller has done all
 * the necessary locking for the page fault to proceed successfully.
 */
int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
			get_block_t get_block, dax_iodone_t complete_unwritten)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct page *page;
	struct buffer_head bh;
	unsigned long vaddr = (unsigned long)vmf->virtual_address;
	unsigned blkbits = inode->i_blkbits;
	sector_t block;
	pgoff_t size;
	int error;
	int major = 0;

	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (vmf->pgoff >= size)
		return VM_FAULT_SIGBUS;

	memset(&bh, 0, sizeof(bh));
	block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
	bh.b_bdev = inode->i_sb->s_bdev;
	bh.b_size = PAGE_SIZE;

 repeat:
	page = find_get_page(mapping, vmf->pgoff);
	if (page) {
		if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
			put_page(page);
			return VM_FAULT_RETRY;
		}
		if (unlikely(page->mapping != mapping)) {
			unlock_page(page);
			put_page(page);
			goto repeat;
		}
		size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
		if (unlikely(vmf->pgoff >= size)) {
			/*
			 * We have a struct page covering a hole in the file
			 * from a read fault and we've raced with a truncate.
			 */
			error = -EIO;
			goto unlock_page;
		}
	}

	error = get_block(inode, block, &bh, 0);
	if (!error && (bh.b_size < PAGE_SIZE))
		error = -EIO;		/* fs corruption? */
	if (error)
		goto unlock_page;

	if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
		if (vmf->flags & FAULT_FLAG_WRITE) {
			error = get_block(inode, block, &bh, 1);
			count_vm_event(PGMAJFAULT);
			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
			major = VM_FAULT_MAJOR;
			if (!error && (bh.b_size < PAGE_SIZE))
				error = -EIO;
			if (error)
				goto unlock_page;
		} else {
			return dax_load_hole(mapping, page, vmf);
		}
	}

	if (vmf->cow_page) {
		struct page *new_page = vmf->cow_page;

		if (buffer_written(&bh))
			error = copy_user_bh(new_page, inode, &bh, vaddr);
		else
			clear_user_highpage(new_page, vaddr);
		if (error)
			goto unlock_page;
		vmf->page = page;
		if (!page) {
			i_mmap_lock_read(mapping);
			/* Check we didn't race with truncate */
			size = (i_size_read(inode) + PAGE_SIZE - 1) >>
								PAGE_SHIFT;
			if (vmf->pgoff >= size) {
				i_mmap_unlock_read(mapping);
				error = -EIO;
				goto out;
			}
		}
		return VM_FAULT_LOCKED;
	}

	/* Check we didn't race with a read fault installing a new page */
	if (!page && major)
		page = find_lock_page(mapping, vmf->pgoff);

	if (page) {
		unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
							PAGE_SIZE, 0);
		delete_from_page_cache(page);
		unlock_page(page);
		put_page(page);
		page = NULL;
	}

	/*
	 * If we successfully insert the new mapping over an unwritten extent,
	 * we need to ensure we convert the unwritten extent.  If there is an
	 * error inserting the mapping, the filesystem needs to leave it as
	 * unwritten to prevent exposure of the stale underlying data to
	 * userspace, but we still need to call the completion function so
	 * the private resources on the mapping buffer can be released.  We
	 * indicate what the callback should do via the uptodate variable,
	 * same as for normal BH based IO completions.
	 */
	error = dax_insert_mapping(inode, &bh, vma, vmf);
	if (buffer_unwritten(&bh)) {
		if (complete_unwritten)
			complete_unwritten(&bh, !error);
		else
			WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
	}

 out:
	if (error == -ENOMEM)
		return VM_FAULT_OOM | major;
	/* -EBUSY is fine, somebody else faulted on the same PTE */
	if ((error < 0) && (error != -EBUSY))
		return VM_FAULT_SIGBUS | major;
	return VM_FAULT_NOPAGE | major;

 unlock_page:
	if (page) {
		unlock_page(page);
		put_page(page);
	}
	goto out;
}
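As the kernel-doc above notes, filesystems call __dax_fault() from their own .fault handler after taking the locking they need. A minimal sketch of such a caller follows; it is not taken from any particular filesystem, and myfs_get_block, myfs_dax_fault and myfs_dax_vm_ops are hypothetical names.

#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/mm.h>

/* Hypothetical block-mapping callback with the get_block_t signature. */
static int myfs_get_block(struct inode *inode, sector_t iblock,
			  struct buffer_head *bh_result, int create);

static int myfs_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct inode *inode = file_inode(vma->vm_file);
	int ret;

	/* Account write faults and update timestamps before touching data. */
	if (vmf->flags & FAULT_FLAG_WRITE) {
		sb_start_pagefault(inode->i_sb);
		file_update_time(vma->vm_file);
	}

	/* Filesystem-specific locking against truncate/hole punch goes here. */
	ret = __dax_fault(vma, vmf, myfs_get_block, NULL);

	if (vmf->flags & FAULT_FLAG_WRITE)
		sb_end_pagefault(inode->i_sb);

	return ret;
}

static const struct vm_operations_struct myfs_dax_vm_ops = {
	.fault = myfs_dax_fault,
};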