static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, void *arg) { enum lru_list lru; int was_unevictable = TestClearPageUnevictable(page); VM_BUG_ON_PAGE(PageLRU(page), page); SetPageLRU(page); /* * Page becomes evictable in two ways: * 1) Within LRU lock [munlock_vma_pages() and __munlock_pagevec()]. * 2) Before acquiring LRU lock to put the page to correct LRU and then * a) do PageLRU check with lock [check_move_unevictable_pages] * b) do PageLRU check before lock [clear_page_mlock] * * (1) & (2a) are ok as LRU lock will serialize them. For (2b), we need * following strict ordering: * * #0: __pagevec_lru_add_fn #1: clear_page_mlock * * SetPageLRU() TestClearPageMlocked() * smp_mb() // explicit ordering // above provides strict * // ordering * PageMlocked() PageLRU() * * * if '#1' does not observe setting of PG_lru by '#0' and fails * isolation, the explicit barrier will make sure that page_evictable * check will put the page in correct LRU. Without smp_mb(), SetPageLRU * can be reordered after PageMlocked check and can make '#1' to fail * the isolation of the page whose Mlocked bit is cleared (#0 is also * looking at the same page) and the evictable page will be stranded * in an unevictable LRU. */ smp_mb(); if (page_evictable(page)) { lru = page_lru(page); update_page_reclaim_stat(lruvec, page_is_file_cache(page), PageActive(page)); if (was_unevictable) count_vm_event(UNEVICTABLE_PGRESCUED); } else { lru = LRU_UNEVICTABLE; ClearPageActive(page); SetPageUnevictable(page); if (!was_unevictable) count_vm_event(UNEVICTABLE_PGCULLED); } add_page_to_lru_list(page, lruvec, lru); trace_mm_lru_insertion(page, lru); }
/* * We may have stale swap cache pages in memory: notice * them here and get rid of the unnecessary final write. */ int swap_writepage(struct page *page, struct writeback_control *wbc) { struct bio *bio; int ret = 0, rw = WRITE; if (try_to_free_swap(page)) { unlock_page(page); goto out; } if (frontswap_put_page(page) == 0) { set_page_writeback(page); unlock_page(page); end_page_writeback(page); goto out; } bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); if (bio == NULL) { set_page_dirty(page); unlock_page(page); ret = -ENOMEM; goto out; } if (wbc->sync_mode == WB_SYNC_ALL) rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); count_vm_event(PSWPOUT); set_page_writeback(page); unlock_page(page); submit_bio(rw, bio); out: return ret; }
int swap_readpage(struct page *page) { struct bio *bio; int ret = 0; VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(PageUptodate(page)); if (frontswap_get_page(page) == 0) { SetPageUptodate(page); unlock_page(page); goto out; } bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); if (bio == NULL) { unlock_page(page); ret = -ENOMEM; goto out; } count_vm_event(PSWPIN); submit_bio(READ, bio); out: return ret; }
/* * We may have stale swap cache pages in memory: notice * them here and get rid of the unnecessary final write. */ int swap_writepage(struct page *page, struct writeback_control *wbc) { struct bio *bio; int ret = 0, rw = WRITE; if (try_to_free_swap(page)) { set_swapped_bit(page_private(page), 0); unlock_page(page); goto out; } bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); if (bio == NULL) { set_page_dirty(page); unlock_page(page); ret = -ENOMEM; goto out; } if (wbc->sync_mode == WB_SYNC_ALL) rw |= REQ_SYNC; bio->bi_io_vec[0].bv_page = priv_encrypt_page(page); count_vm_event(PSWPOUT); set_page_writeback(page); unlock_page(page); submit_bio(rw, bio); out: return ret; }
/* * We may have stale swap cache pages in memory: notice * them here and get rid of the unnecessary final write. */ int swap_writepage(struct page *page, struct writeback_control *wbc) { struct bio *bio; int ret = 0, rw = WRITE; if (try_to_free_swap(page)) { unlock_page(page); goto out; } #ifdef CONFIG_FRONTSWAP if (frontswap_store(page) == 0) { set_page_writeback(page); unlock_page(page); end_page_writeback(page); goto out; } #endif bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); if (bio == NULL) { set_page_dirty(page); unlock_page(page); ret = -ENOMEM; goto out; } if (wbc->sync_mode == WB_SYNC_ALL) rw |= REQ_SYNC; count_vm_event(PSWPOUT); set_page_writeback(page); unlock_page(page); submit_bio(rw, bio); out: return ret; }
/** * lru_cache_add_active_or_unevictable * @page: the page to be added to LRU * @vma: vma in which page is mapped for determining reclaimability * * Place @page on the active or unevictable LRU list, depending on its * evictability. Note that if the page is not evictable, it goes * directly back onto it's zone's unevictable list, it does NOT use a * per cpu pagevec. */ void lru_cache_add_active_or_unevictable(struct page *page, struct vm_area_struct *vma) { VM_BUG_ON_PAGE(PageLRU(page), page); if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) SetPageActive(page); else if (!TestSetPageMlocked(page)) { /* * We use the irq-unsafe __mod_zone_page_stat because this * counter is not modified from interrupt context, and the pte * lock is held(spinlock), which implies preemption disabled. */ __mod_zone_page_state(page_zone(page), NR_MLOCK, hpage_nr_pages(page)); count_vm_event(UNEVICTABLE_PGMLOCKED); } lru_cache_add(page); }
int __swap_writepage(struct page *page, struct writeback_control *wbc, void (*end_write_func)(struct bio *, int)) { struct bio *bio; int ret = 0, rw = WRITE; struct swap_info_struct *sis = page_swap_info(page); bio = get_swap_bio(GFP_NOIO, page, end_write_func); if (bio == NULL) { set_page_dirty(page); unlock_page(page); ret = -ENOMEM; goto out; } if (wbc->sync_mode == WB_SYNC_ALL) rw |= REQ_SYNC; count_vm_event(PSWPOUT); set_page_writeback(page); unlock_page(page); submit_bio(rw, bio); out: return ret; }
static inline void count_compact_event(enum vm_event_item item) { count_vm_event(item); }
int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, unsigned int flags, get_block_t get_block, dax_iodone_t complete_unwritten) { struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; struct buffer_head bh; unsigned blkbits = inode->i_blkbits; unsigned long pmd_addr = address & PMD_MASK; bool write = flags & FAULT_FLAG_WRITE; struct block_device *bdev; pgoff_t size, pgoff; sector_t block; int error, result = 0; bool alloc = false; /* dax pmd mappings require pfn_t_devmap() */ if (!IS_ENABLED(CONFIG_FS_DAX_PMD)) return VM_FAULT_FALLBACK; /* Fall back to PTEs if we're going to COW */ if (write && !(vma->vm_flags & VM_SHARED)) { split_huge_pmd(vma, pmd, address); dax_pmd_dbg(NULL, address, "cow write"); return VM_FAULT_FALLBACK; } /* If the PMD would extend outside the VMA */ if (pmd_addr < vma->vm_start) { dax_pmd_dbg(NULL, address, "vma start unaligned"); return VM_FAULT_FALLBACK; } if ((pmd_addr + PMD_SIZE) > vma->vm_end) { dax_pmd_dbg(NULL, address, "vma end unaligned"); return VM_FAULT_FALLBACK; } pgoff = linear_page_index(vma, pmd_addr); size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (pgoff >= size) return VM_FAULT_SIGBUS; /* If the PMD would cover blocks out of the file */ if ((pgoff | PG_PMD_COLOUR) >= size) { dax_pmd_dbg(NULL, address, "offset + huge page size > file size"); return VM_FAULT_FALLBACK; } memset(&bh, 0, sizeof(bh)); bh.b_bdev = inode->i_sb->s_bdev; block = (sector_t)pgoff << (PAGE_SHIFT - blkbits); bh.b_size = PMD_SIZE; if (get_block(inode, block, &bh, 0) != 0) return VM_FAULT_SIGBUS; if (!buffer_mapped(&bh) && write) { if (get_block(inode, block, &bh, 1) != 0) return VM_FAULT_SIGBUS; alloc = true; } bdev = bh.b_bdev; /* * If the filesystem isn't willing to tell us the length of a hole, * just fall back to PTEs. Calling get_block 512 times in a loop * would be silly. */ if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) { dax_pmd_dbg(&bh, address, "allocated block too small"); return VM_FAULT_FALLBACK; } /* * If we allocated new storage, make sure no process has any * zero pages covering this hole */ if (alloc) { loff_t lstart = pgoff << PAGE_SHIFT; loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */ truncate_pagecache_range(inode, lstart, lend); } i_mmap_lock_read(mapping); /* * If a truncate happened while we were allocating blocks, we may * leave blocks allocated to the file that are beyond EOF. We can't * take i_mutex here, so just leave them hanging; they'll be freed * when the file is deleted. */ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (pgoff >= size) { result = VM_FAULT_SIGBUS; goto out; } if ((pgoff | PG_PMD_COLOUR) >= size) { dax_pmd_dbg(&bh, address, "offset + huge page size > file size"); goto fallback; } if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) { spinlock_t *ptl; pmd_t entry; struct page *zero_page = get_huge_zero_page(); if (unlikely(!zero_page)) { dax_pmd_dbg(&bh, address, "no zero page"); goto fallback; } ptl = pmd_lock(vma->vm_mm, pmd); if (!pmd_none(*pmd)) { spin_unlock(ptl); dax_pmd_dbg(&bh, address, "pmd already present"); goto fallback; } dev_dbg(part_to_dev(bdev->bd_part), "%s: %s addr: %lx pfn: <zero> sect: %llx\n", __func__, current->comm, address, (unsigned long long) to_sector(&bh, inode)); entry = mk_pmd(zero_page, vma->vm_page_prot); entry = pmd_mkhuge(entry); set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry); result = VM_FAULT_NOPAGE; spin_unlock(ptl); } else { struct blk_dax_ctl dax = { .sector = to_sector(&bh, inode), .size = PMD_SIZE, }; long length = dax_map_atomic(bdev, &dax); if (length < 0) { result = VM_FAULT_SIGBUS; goto out; } if (length < PMD_SIZE) { dax_pmd_dbg(&bh, address, "dax-length too small"); dax_unmap_atomic(bdev, &dax); goto fallback; } if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) { dax_pmd_dbg(&bh, address, "pfn unaligned"); dax_unmap_atomic(bdev, &dax); goto fallback; } if (!pfn_t_devmap(dax.pfn)) { dax_unmap_atomic(bdev, &dax); dax_pmd_dbg(&bh, address, "pfn not in memmap"); goto fallback; } if (buffer_unwritten(&bh) || buffer_new(&bh)) { clear_pmem(dax.addr, PMD_SIZE); wmb_pmem(); count_vm_event(PGMAJFAULT); mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); result |= VM_FAULT_MAJOR; } dax_unmap_atomic(bdev, &dax); /* * For PTE faults we insert a radix tree entry for reads, and * leave it clean. Then on the first write we dirty the radix * tree entry via the dax_pfn_mkwrite() path. This sequence * allows the dax_pfn_mkwrite() call to be simpler and avoid a * call into get_block() to translate the pgoff to a sector in * order to be able to create a new radix tree entry. * * The PMD path doesn't have an equivalent to * dax_pfn_mkwrite(), though, so for a read followed by a * write we traverse all the way through __dax_pmd_fault() * twice. This means we can just skip inserting a radix tree * entry completely on the initial read and just wait until * the write to insert a dirty entry. */ if (write) { error = dax_radix_entry(mapping, pgoff, dax.sector, true, true); if (error) { dax_pmd_dbg(&bh, address, "PMD radix insertion failed"); goto fallback; } } dev_dbg(part_to_dev(bdev->bd_part), "%s: %s addr: %lx pfn: %lx sect: %llx\n", __func__, current->comm, address, pfn_t_to_pfn(dax.pfn), (unsigned long long) dax.sector); result |= vmf_insert_pfn_pmd(vma, address, pmd, dax.pfn, write); } out: i_mmap_unlock_read(mapping); if (buffer_unwritten(&bh)) complete_unwritten(&bh, !(result & VM_FAULT_ERROR)); return result; fallback: count_vm_event(THP_FAULT_FALLBACK); result = VM_FAULT_FALLBACK; goto out; } EXPORT_SYMBOL_GPL(__dax_pmd_fault); /** * dax_pmd_fault - handle a PMD fault on a DAX file * @vma: The virtual memory area where the fault occurred * @vmf: The description of the fault * @get_block: The filesystem method used to translate file offsets to blocks * * When a page fault occurs, filesystems may call this helper in their * pmd_fault handler for DAX files. */ int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, unsigned int flags, get_block_t get_block, dax_iodone_t complete_unwritten) { int result; struct super_block *sb = file_inode(vma->vm_file)->i_sb; if (flags & FAULT_FLAG_WRITE) { sb_start_pagefault(sb); file_update_time(vma->vm_file); } result = __dax_pmd_fault(vma, address, pmd, flags, get_block, complete_unwritten); if (flags & FAULT_FLAG_WRITE) sb_end_pagefault(sb); return result; }
static int copy_user_bh(struct page *to, struct inode *inode, struct buffer_head *bh, unsigned long vaddr) { struct blk_dax_ctl dax = { .sector = to_sector(bh, inode), .size = bh->b_size, }; struct block_device *bdev = bh->b_bdev; void *vto; if (dax_map_atomic(bdev, &dax) < 0) return PTR_ERR(dax.addr); vto = kmap_atomic(to); copy_user_page(vto, (void __force *)dax.addr, vaddr, to); kunmap_atomic(vto); dax_unmap_atomic(bdev, &dax); return 0; } #define NO_SECTOR -1 #define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT)) static int dax_radix_entry(struct address_space *mapping, pgoff_t index, sector_t sector, bool pmd_entry, bool dirty) { struct radix_tree_root *page_tree = &mapping->page_tree; pgoff_t pmd_index = DAX_PMD_INDEX(index); int type, error = 0; void *entry; WARN_ON_ONCE(pmd_entry && !dirty); if (dirty) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); spin_lock_irq(&mapping->tree_lock); entry = radix_tree_lookup(page_tree, pmd_index); if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) { index = pmd_index; goto dirty; } entry = radix_tree_lookup(page_tree, index); if (entry) { type = RADIX_DAX_TYPE(entry); if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) { error = -EIO; goto unlock; } if (!pmd_entry || type == RADIX_DAX_PMD) goto dirty; /* * We only insert dirty PMD entries into the radix tree. This * means we don't need to worry about removing a dirty PTE * entry and inserting a clean PMD entry, thus reducing the * range we would flush with a follow-up fsync/msync call. */ radix_tree_delete(&mapping->page_tree, index); mapping->nrexceptional--; } if (sector == NO_SECTOR) { /* * This can happen during correct operation if our pfn_mkwrite * fault raced against a hole punch operation. If this * happens the pte that was hole punched will have been * unmapped and the radix tree entry will have been removed by * the time we are called, but the call will still happen. We * will return all the way up to wp_pfn_shared(), where the * pte_same() check will fail, eventually causing page fault * to be retried by the CPU. */ goto unlock; } error = radix_tree_insert(page_tree, index, RADIX_DAX_ENTRY(sector, pmd_entry)); if (error) goto unlock; mapping->nrexceptional++; dirty: if (dirty) radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); unlock: spin_unlock_irq(&mapping->tree_lock); return error; } static int dax_writeback_one(struct block_device *bdev, struct address_space *mapping, pgoff_t index, void *entry) { struct radix_tree_root *page_tree = &mapping->page_tree; int type = RADIX_DAX_TYPE(entry); struct radix_tree_node *node; struct blk_dax_ctl dax; void **slot; int ret = 0; spin_lock_irq(&mapping->tree_lock); /* * Regular page slots are stabilized by the page lock even * without the tree itself locked. These unlocked entries * need verification under the tree lock. */ if (!__radix_tree_lookup(page_tree, index, &node, &slot)) goto unlock; if (*slot != entry) goto unlock; /* another fsync thread may have already written back this entry */ if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) goto unlock; if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) { ret = -EIO; goto unlock; } dax.sector = RADIX_DAX_SECTOR(entry); dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE); spin_unlock_irq(&mapping->tree_lock); /* * We cannot hold tree_lock while calling dax_map_atomic() because it * eventually calls cond_resched(). */ ret = dax_map_atomic(bdev, &dax); if (ret < 0) return ret; if (WARN_ON_ONCE(ret < dax.size)) { ret = -EIO; goto unmap; } wb_cache_pmem(dax.addr, dax.size); spin_lock_irq(&mapping->tree_lock); radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE); spin_unlock_irq(&mapping->tree_lock); unmap: dax_unmap_atomic(bdev, &dax); return ret; unlock: spin_unlock_irq(&mapping->tree_lock); return ret; } /* * Flush the mapping to the persistent domain within the byte range of [start, * end]. This is required by data integrity operations to ensure file data is * on persistent storage prior to completion of the operation. */ int dax_writeback_mapping_range(struct address_space *mapping, struct block_device *bdev, struct writeback_control *wbc) { struct inode *inode = mapping->host; pgoff_t start_index, end_index, pmd_index; pgoff_t indices[PAGEVEC_SIZE]; struct pagevec pvec; bool done = false; int i, ret = 0; void *entry; if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT)) return -EIO; if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL) return 0; start_index = wbc->range_start >> PAGE_SHIFT; end_index = wbc->range_end >> PAGE_SHIFT; pmd_index = DAX_PMD_INDEX(start_index); rcu_read_lock(); entry = radix_tree_lookup(&mapping->page_tree, pmd_index); rcu_read_unlock(); /* see if the start of our range is covered by a PMD entry */ if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) start_index = pmd_index; tag_pages_for_writeback(mapping, start_index, end_index); pagevec_init(&pvec, 0); while (!done) { pvec.nr = find_get_entries_tag(mapping, start_index, PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE, pvec.pages, indices); if (pvec.nr == 0) break; for (i = 0; i < pvec.nr; i++) { if (indices[i] > end_index) { done = true; break; } ret = dax_writeback_one(bdev, mapping, indices[i], pvec.pages[i]); if (ret < 0) return ret; } } wmb_pmem(); return 0; } EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, struct vm_area_struct *vma, struct vm_fault *vmf) { unsigned long vaddr = (unsigned long)vmf->virtual_address; struct address_space *mapping = inode->i_mapping; struct block_device *bdev = bh->b_bdev; struct blk_dax_ctl dax = { .sector = to_sector(bh, inode), .size = bh->b_size, }; pgoff_t size; int error; i_mmap_lock_read(mapping); /* * Check truncate didn't happen while we were allocating a block. * If it did, this block may or may not be still allocated to the * file. We can't tell the filesystem to free it because we can't * take i_mutex here. In the worst case, the file still has blocks * allocated past the end of the file. */ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (unlikely(vmf->pgoff >= size)) { error = -EIO; goto out; } if (dax_map_atomic(bdev, &dax) < 0) { error = PTR_ERR(dax.addr); goto out; } if (buffer_unwritten(bh) || buffer_new(bh)) { clear_pmem(dax.addr, PAGE_SIZE); wmb_pmem(); } dax_unmap_atomic(bdev, &dax); error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false, vmf->flags & FAULT_FLAG_WRITE); if (error) goto out; error = vm_insert_mixed(vma, vaddr, dax.pfn); out: i_mmap_unlock_read(mapping); return error; } /** * __dax_fault - handle a page fault on a DAX file * @vma: The virtual memory area where the fault occurred * @vmf: The description of the fault * @get_block: The filesystem method used to translate file offsets to blocks * @complete_unwritten: The filesystem method used to convert unwritten blocks * to written so the data written to them is exposed. This is required for * required by write faults for filesystems that will return unwritten * extent mappings from @get_block, but it is optional for reads as * dax_insert_mapping() will always zero unwritten blocks. If the fs does * not support unwritten extents, the it should pass NULL. * * When a page fault occurs, filesystems may call this helper in their * fault handler for DAX files. __dax_fault() assumes the caller has done all * the necessary locking for the page fault to proceed successfully. */ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block, dax_iodone_t complete_unwritten) { struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; struct page *page; struct buffer_head bh; unsigned long vaddr = (unsigned long)vmf->virtual_address; unsigned blkbits = inode->i_blkbits; sector_t block; pgoff_t size; int error; int major = 0; size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (vmf->pgoff >= size) return VM_FAULT_SIGBUS; memset(&bh, 0, sizeof(bh)); block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits); bh.b_bdev = inode->i_sb->s_bdev; bh.b_size = PAGE_SIZE; repeat: page = find_get_page(mapping, vmf->pgoff); if (page) { if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) { put_page(page); return VM_FAULT_RETRY; } if (unlikely(page->mapping != mapping)) { unlock_page(page); put_page(page); goto repeat; } size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (unlikely(vmf->pgoff >= size)) { /* * We have a struct page covering a hole in the file * from a read fault and we've raced with a truncate */ error = -EIO; goto unlock_page; } } error = get_block(inode, block, &bh, 0); if (!error && (bh.b_size < PAGE_SIZE)) error = -EIO; /* fs corruption? */ if (error) goto unlock_page; if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) { if (vmf->flags & FAULT_FLAG_WRITE) { error = get_block(inode, block, &bh, 1); count_vm_event(PGMAJFAULT); mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); major = VM_FAULT_MAJOR; if (!error && (bh.b_size < PAGE_SIZE)) error = -EIO; if (error) goto unlock_page; } else { return dax_load_hole(mapping, page, vmf); } } if (vmf->cow_page) { struct page *new_page = vmf->cow_page; if (buffer_written(&bh)) error = copy_user_bh(new_page, inode, &bh, vaddr); else clear_user_highpage(new_page, vaddr); if (error) goto unlock_page; vmf->page = page; if (!page) { i_mmap_lock_read(mapping); /* Check we didn't race with truncate */ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (vmf->pgoff >= size) { i_mmap_unlock_read(mapping); error = -EIO; goto out; } } return VM_FAULT_LOCKED; } /* Check we didn't race with a read fault installing a new page */ if (!page && major) page = find_lock_page(mapping, vmf->pgoff); if (page) { unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, PAGE_SIZE, 0); delete_from_page_cache(page); unlock_page(page); put_page(page); page = NULL; } /* * If we successfully insert the new mapping over an unwritten extent, * we need to ensure we convert the unwritten extent. If there is an * error inserting the mapping, the filesystem needs to leave it as * unwritten to prevent exposure of the stale underlying data to * userspace, but we still need to call the completion function so * the private resources on the mapping buffer can be released. We * indicate what the callback should do via the uptodate variable, same * as for normal BH based IO completions. */ error = dax_insert_mapping(inode, &bh, vma, vmf); if (buffer_unwritten(&bh)) { if (complete_unwritten) complete_unwritten(&bh, !error); else WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE)); } out: if (error == -ENOMEM) return VM_FAULT_OOM | major; /* -EBUSY is fine, somebody else faulted on the same PTE */ if ((error < 0) && (error != -EBUSY)) return VM_FAULT_SIGBUS | major; return VM_FAULT_NOPAGE | major; unlock_page: if (page) { unlock_page(page); put_page(page); } goto out; }
/* * Fill in the supplied page for mmap * XXX: how are we excluding truncate/invalidate here? Maybe need to lock * page? */ static int ncp_file_mmap_fault(struct vm_area_struct *area, struct vm_fault *vmf) { struct file *file = area->vm_file; struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; char *pg_addr; unsigned int already_read; unsigned int count; int bufsize; int pos; /* XXX: loff_t ? */ /* * ncpfs has nothing against high pages as long * as recvmsg and memset works on it */ vmf->page = alloc_page(GFP_HIGHUSER); if (!vmf->page) return VM_FAULT_OOM; pg_addr = kmap(vmf->page); pos = vmf->pgoff << PAGE_SHIFT; count = PAGE_SIZE; /* what we can read in one go */ bufsize = NCP_SERVER(inode)->buffer_size; already_read = 0; if (ncp_make_open(inode, O_RDONLY) >= 0) { while (already_read < count) { int read_this_time; int to_read; to_read = bufsize - (pos % bufsize); to_read = min_t(unsigned int, to_read, count - already_read); if (ncp_read_kernel(NCP_SERVER(inode), NCP_FINFO(inode)->file_handle, pos, to_read, pg_addr + already_read, &read_this_time) != 0) { read_this_time = 0; } pos += read_this_time; already_read += read_this_time; if (read_this_time < to_read) { break; } } ncp_inode_close(inode); } if (already_read < PAGE_SIZE) memset(pg_addr + already_read, 0, PAGE_SIZE - already_read); flush_dcache_page(vmf->page); kunmap(vmf->page); /* * If I understand ncp_read_kernel() properly, the above always * fetches from the network, here the analogue of disk. * -- nyc */ count_vm_event(PGMAJFAULT); mem_cgroup_count_vm_event(area->vm_mm, PGMAJFAULT); return VM_FAULT_MAJOR; }
static void __oom_kill_process(struct task_struct *victim, const char *message) { struct task_struct *p; struct mm_struct *mm; bool can_oom_reap = true; p = find_lock_task_mm(victim); if (!p) { put_task_struct(victim); return; } else if (victim != p) { get_task_struct(p); put_task_struct(victim); victim = p; } /* Get a reference to safely compare mm after task_unlock(victim) */ mm = victim->mm; mmgrab(mm); /* Raise event before sending signal: task reaper must see this */ count_vm_event(OOM_KILL); memcg_memory_event_mm(mm, MEMCG_OOM_KILL); /* * We should send SIGKILL before granting access to memory reserves * in order to prevent the OOM victim from depleting the memory * reserves from the user space under its control. */ do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID); mark_oom_victim(victim); pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", message, task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), K(get_mm_counter(victim->mm, MM_ANONPAGES)), K(get_mm_counter(victim->mm, MM_FILEPAGES)), K(get_mm_counter(victim->mm, MM_SHMEMPAGES))); task_unlock(victim); /* * Kill all user processes sharing victim->mm in other thread groups, if * any. They don't get access to memory reserves, though, to avoid * depletion of all memory. This prevents mm->mmap_sem livelock when an * oom killed thread cannot exit because it requires the semaphore and * its contended by another thread trying to allocate memory itself. * That thread will now get access to memory reserves since it has a * pending fatal signal. */ rcu_read_lock(); for_each_process(p) { if (!process_shares_mm(p, mm)) continue; if (same_thread_group(p, victim)) continue; if (is_global_init(p)) { can_oom_reap = false; set_bit(MMF_OOM_SKIP, &mm->flags); pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n", task_pid_nr(victim), victim->comm, task_pid_nr(p), p->comm); continue; } /* * No use_mm() user needs to read from the userspace so we are * ok to reap it. */ if (unlikely(p->flags & PF_KTHREAD)) continue; do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID); } rcu_read_unlock(); if (can_oom_reap) wake_oom_reaper(victim); mmdrop(mm); put_task_struct(victim); }