/*
 * We completely avoid races by reading each swap page in advance,
 * and then search for the process using it. All the necessary
 * page table adjustments can then be made atomically.
 */
static int try_to_unuse(unsigned int type)
{
	struct swap_info_struct *si = &swap_info[type];
	struct mm_struct *start_mm;
	unsigned short *swap_map;
	unsigned short swcount;
	struct page *page;
	swp_entry_t entry;
	unsigned int i = 0;
	int retval = 0;
	int reset_overflow = 0;
	int shmem;

	/*
	 * When searching mms for an entry, a good strategy is to
	 * start at the first mm we freed the previous entry from
	 * (though actually we don't notice whether we or coincidence
	 * freed the entry). Initialize this start_mm with a hold.
	 *
	 * A simpler strategy would be to start at the last mm we
	 * freed the previous entry from; but that would take less
	 * advantage of mmlist ordering, which clusters forked mms
	 * together, child after parent. If we race with dup_mmap(), we
	 * prefer to resolve parent before child, lest we miss entries
	 * duplicated after we scanned child: using last mm would invert
	 * that. Though it's only a serious concern when an overflowed
	 * swap count is reset from SWAP_MAP_MAX, preventing a rescan.
	 */
	start_mm = &init_mm;
	atomic_inc(&init_mm.mm_users);

	/*
	 * Keep on scanning until all entries have gone. Usually,
	 * one pass through swap_map is enough, but not necessarily:
	 * there are races when an instance of an entry might be missed.
	 */
	while ((i = find_next_to_unuse(si, i)) != 0) {
		if (signal_pending(current)) {
			retval = -EINTR;
			break;
		}

		/*
		 * Get a page for the entry, using the existing swap
		 * cache page if there is one. Otherwise, get a clean
		 * page and read the swap into it.
		 */
		swap_map = &si->swap_map[i];
		entry = swp_entry(type, i);
		page = read_swap_cache_async(entry, NULL, 0);
		if (!page) {
			/*
			 * Either swap_duplicate() failed because entry
			 * has been freed independently, and will not be
			 * reused since sys_swapoff() already disabled
			 * allocation from here, or alloc_page() failed.
			 */
			if (!*swap_map)
				continue;
			retval = -ENOMEM;
			break;
		}

		/*
		 * Don't hold on to start_mm if it looks like exiting.
		 */
		if (atomic_read(&start_mm->mm_users) == 1) {
			mmput(start_mm);
			start_mm = &init_mm;
			atomic_inc(&init_mm.mm_users);
		}

		/*
		 * Wait for and lock page. When do_swap_page races with
		 * try_to_unuse, do_swap_page can handle the fault much
		 * faster than try_to_unuse can locate the entry. This
		 * apparently redundant "wait_on_page_locked" lets try_to_unuse
		 * defer to do_swap_page in such a case - in some tests,
		 * do_swap_page and try_to_unuse repeatedly compete.
		 */
		wait_on_page_locked(page);
		wait_on_page_writeback(page);
		lock_page(page);
		wait_on_page_writeback(page);

		/*
		 * Remove all references to entry.
		 * Whenever we reach init_mm, there's no address space
		 * to search, but use it as a reminder to search shmem.
		 */
		shmem = 0;
		swcount = *swap_map;
		if (swcount > 1) {
			if (start_mm == &init_mm)
				shmem = shmem_unuse(entry, page);
			else
				retval = unuse_mm(start_mm, entry, page);
		}
		if (*swap_map > 1) {
			int set_start_mm = (*swap_map >= swcount);
			struct list_head *p = &start_mm->mmlist;
			struct mm_struct *new_start_mm = start_mm;
			struct mm_struct *prev_mm = start_mm;
			struct mm_struct *mm;

			atomic_inc(&new_start_mm->mm_users);
			atomic_inc(&prev_mm->mm_users);
			spin_lock(&mmlist_lock);
			while (*swap_map > 1 && !retval &&
					(p = p->next) != &start_mm->mmlist) {
				mm = list_entry(p, struct mm_struct, mmlist);
				if (!atomic_inc_not_zero(&mm->mm_users))
					continue;
				spin_unlock(&mmlist_lock);
				mmput(prev_mm);
				prev_mm = mm;

				cond_resched();

				swcount = *swap_map;
				if (swcount <= 1)
					;
				else if (mm == &init_mm) {
					set_start_mm = 1;
					shmem = shmem_unuse(entry, page);
				} else
					retval = unuse_mm(mm, entry, page);
				if (set_start_mm && *swap_map < swcount) {
					mmput(new_start_mm);
					atomic_inc(&mm->mm_users);
					new_start_mm = mm;
					set_start_mm = 0;
				}
				spin_lock(&mmlist_lock);
			}
			spin_unlock(&mmlist_lock);
			mmput(prev_mm);
			mmput(start_mm);
			start_mm = new_start_mm;
		}
		if (retval) {
			unlock_page(page);
			page_cache_release(page);
			break;
		}

		/*
		 * How could swap count reach 0x7fff when the maximum
		 * pid is 0x7fff, and there's no way to repeat a swap
		 * page within an mm (except in shmem, where it's the
		 * shared object which takes the reference count)?
		 * We believe SWAP_MAP_MAX cannot occur in Linux 2.4.
		 *
		 * If that's wrong, then we should worry more about
		 * exit_mmap() and do_munmap() cases described above:
		 * we might be resetting SWAP_MAP_MAX too early here.
		 * We know "Undead"s can happen, they're okay, so don't
		 * report them; but do report if we reset SWAP_MAP_MAX.
		 */
		if (*swap_map == SWAP_MAP_MAX) {
			spin_lock(&swap_lock);
			*swap_map = 1;
			spin_unlock(&swap_lock);
			reset_overflow = 1;
		}

		/*
		 * If a reference remains (rare), we would like to leave
		 * the page in the swap cache; but try_to_unmap could
		 * then re-duplicate the entry once we drop page lock,
		 * so we might loop indefinitely; also, that page could
		 * not be swapped out to other storage meanwhile. So:
		 * delete from cache even if there's another reference,
		 * after ensuring that the data has been saved to disk -
		 * since if the reference remains (rarer), it will be
		 * read from disk into another page. Splitting into two
		 * pages would be incorrect if swap supported "shared
		 * private" pages, but they are handled by tmpfs files.
		 *
		 * Note shmem_unuse already deleted a swappage from
		 * the swap cache, unless the move to filepage failed:
		 * in which case it left swappage in cache, lowered its
		 * swap count to pass quickly through the loops above,
		 * and now we must reincrement count to try again later.
		 */
		if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) {
			struct writeback_control wbc = {
				.sync_mode = WB_SYNC_NONE,
			};

			swap_writepage(page, &wbc);
			lock_page(page);
			wait_on_page_writeback(page);
		}
		if (PageSwapCache(page)) {
			if (shmem)
				swap_duplicate(entry);
			else
				delete_from_swap_cache(page);
		}

		/*
		 * So we could skip searching mms once swap count went
		 * to 1, we did not mark any present ptes as dirty: must
		 * mark page dirty so shrink_list will preserve it.
		 */
		SetPageDirty(page);
		unlock_page(page);
		page_cache_release(page);

		/*
		 * Make sure that we aren't completely killing
		 * interactive performance.
		 */
		cond_resched();
	}
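
	/*
	 * The excerpt ends at the scan loop; a minimal completion sketch,
	 * using only what the function itself set up above (the start_mm
	 * hold and the reset_overflow flag). The warning text is a
	 * placeholder, not the exact mainline message.
	 */
	mmput(start_mm);
	if (reset_overflow)
		printk(KERN_WARNING "swapoff: reset overflowed swap count\n");
	return retval;
}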
/* Sharing code of page_mkwrite method for rhel5 and rhel6 */
static int ll_page_mkwrite0(struct vm_area_struct *vma, struct page *vmpage,
			    bool *retry)
{
	struct lu_env *env;
	struct cl_io *io;
	struct vvp_io *vio;
	struct cl_env_nest nest;
	int result;
	sigset_t set;
	struct inode *inode;
	struct ll_inode_info *lli;

	io = ll_fault_io_init(vma, &env, &nest, vmpage->index, NULL);
	if (IS_ERR(io)) {
		result = PTR_ERR(io);
		goto out;
	}

	result = io->ci_result;
	if (result < 0)
		goto out_io;

	io->u.ci_fault.ft_mkwrite = 1;
	io->u.ci_fault.ft_writable = 1;

	vio = vvp_env_io(env);
	vio->u.fault.ft_vma = vma;
	vio->u.fault.ft_vmpage = vmpage;

	set = cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM));

	inode = vvp_object_inode(io->ci_obj);
	lli = ll_i2info(inode);

	result = cl_io_loop(env, io);

	cfs_restore_sigs(set);

	if (result == 0) {
		lock_page(vmpage);
		if (!vmpage->mapping) {
			unlock_page(vmpage);

			/* page was truncated and lock was cancelled, return
			 * ENODATA so that VM_FAULT_NOPAGE will be returned
			 * to handle_mm_fault(). */
			if (result == 0)
				result = -ENODATA;
		} else if (!PageDirty(vmpage)) {
			/* race, the page has been cleaned by ptlrpcd after
			 * it was unlocked, it has to be added into dirty
			 * cache again otherwise this soon-to-dirty page won't
			 * consume any grants, even worse if this page is being
			 * transferred because it will break RPC checksum. */
			unlock_page(vmpage);

			CDEBUG(D_MMAP, "Race on page_mkwrite %p/%lu, page has been written out, retry.\n",
			       vmpage, vmpage->index);

			*retry = true;
			result = -EAGAIN;
		}

		if (result == 0) {
			spin_lock(&lli->lli_lock);
			lli->lli_flags |= LLIF_DATA_MODIFIED;
			spin_unlock(&lli->lli_lock);
		}
	}

out_io:
	cl_io_fini(env, io);
	cl_env_nested_put(&nest, env);
out:
	CDEBUG(D_MMAP, "%s mkwrite with %d\n", current->comm, result);
	LASSERT(ergo(result == 0, PageLocked(vmpage)));

	return result;
}
/*
 * For address_spaces which do not use buffers nor write back.
 */
int yramfs_file_set_page_dirty_no_writeback(struct page *page)
{
	if (!PageDirty(page))
		return !TestSetPageDirty(page);
	return 0;
}
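/*
 * A minimal sketch of how a RAM-backed filesystem might wire the helper
 * above into its address_space_operations; the yramfs_aops name and the
 * use of simple_readpage are assumptions for illustration, not taken
 * from the original source.
 */
static const struct address_space_operations yramfs_aops = {
	.readpage	= simple_readpage,
	.set_page_dirty	= yramfs_file_set_page_dirty_no_writeback,
};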
/**
 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
 * @mapping: address space structure to write
 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
 * @writepage: function called for each page
 * @data: data passed to writepage function
 *
 * If a page is already under I/O, write_cache_pages() skips it, even
 * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
 * and msync() need to guarantee that all the data which was dirty at the time
 * the call was made get new I/O started against them. If wbc->sync_mode is
 * WB_SYNC_ALL then we were called for data integrity and we must wait for
 * existing IO to complete.
 *
 * To avoid livelocks (when other process dirties new pages), we first tag
 * pages which should be written back with TOWRITE tag and only then start
 * writing them. For data-integrity sync we have to be careful so that we do
 * not miss some pages (e.g., because some other process has cleared TOWRITE
 * tag we set). The rule we follow is that TOWRITE tag can be cleared only
 * by the process clearing the DIRTY tag (and submitting the page for IO).
 */
int write_cache_pages(struct address_space *mapping,
		      struct writeback_control *wbc, writepage_t writepage,
		      void *data)
{
	int ret = 0;
	int done = 0;
	struct pagevec pvec;
	int nr_pages;
	pgoff_t uninitialized_var(writeback_index);
	pgoff_t index;
	pgoff_t end;		/* Inclusive */
	pgoff_t done_index;
	int cycled;
	int range_whole = 0;
	int tag;

	pagevec_init(&pvec, 0);
	if (wbc->range_cyclic) {
		writeback_index = mapping->writeback_index; /* prev offset */
		index = writeback_index;
		if (index == 0)
			cycled = 1;
		else
			cycled = 0;
		end = -1;
	} else {
		index = wbc->range_start >> PAGE_CACHE_SHIFT;
		end = wbc->range_end >> PAGE_CACHE_SHIFT;
		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
			range_whole = 1;
		cycled = 1; /* ignore range_cyclic tests */
	}
	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
		tag = PAGECACHE_TAG_TOWRITE;
	else
		tag = PAGECACHE_TAG_DIRTY;
retry:
	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
		tag_pages_for_writeback(mapping, index, end);
	done_index = index;
	while (!done && (index <= end)) {
		int i;

		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
			      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
		if (nr_pages == 0)
			break;

		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			/*
			 * At this point, the page may be truncated or
			 * invalidated (changing page->mapping to NULL), or
			 * even swizzled back from swapper_space to tmpfs file
			 * mapping. However, page->index will not change
			 * because we have a reference on the page.
			 */
			if (page->index > end) {
				/*
				 * can't be range_cyclic (1st pass) because
				 * end == -1 in that case.
				 */
				done = 1;
				break;
			}

			done_index = page->index;

			lock_page(page);

			/*
			 * Page truncated or invalidated. We can freely skip it
			 * then, even for data integrity operations: the page
			 * has disappeared concurrently, so there could be no
			 * real expectation of this data integrity operation
			 * even if there is now a new, dirty page at the same
			 * pagecache address.
			 */
			if (unlikely(page->mapping != mapping)) {
continue_unlock:
				unlock_page(page);
				continue;
			}

			if (!PageDirty(page)) {
				/* someone wrote it for us */
				goto continue_unlock;
			}

			if (PageWriteback(page)) {
				if (wbc->sync_mode != WB_SYNC_NONE)
					wait_on_page_writeback(page);
				else
					goto continue_unlock;
			}

			BUG_ON(PageWriteback(page));
			if (!clear_page_dirty_for_io(page))
				goto continue_unlock;

			trace_wbc_writepage(wbc, mapping->backing_dev_info);
			ret = (*writepage)(page, wbc, data);
			if (unlikely(ret)) {
				if (ret == AOP_WRITEPAGE_ACTIVATE) {
					unlock_page(page);
					ret = 0;
				} else {
					/*
					 * done_index is set past this page,
					 * so media errors will not choke
					 * background writeout for the entire
					 * file. This has consequences for
					 * range_cyclic semantics (ie. it may
					 * not be suitable for data integrity
					 * writeout).
					 */
					done_index = page->index + 1;
					done = 1;
					break;
				}
			}

			/*
			 * We stop writing back only if we are not doing
			 * integrity sync. In case of integrity sync we have to
			 * keep going until we have written all the pages
			 * we tagged for writeback prior to entering this loop.
			 */
			if (--wbc->nr_to_write <= 0 &&
			    wbc->sync_mode == WB_SYNC_NONE) {
				done = 1;
				break;
			}
		}
		pagevec_release(&pvec);
		cond_resched();
	}
	if (!cycled && !done) {
		/*
		 * range_cyclic:
		 * We hit the last page and there is more work to be done: wrap
		 * back to the start of the file
		 */
		cycled = 1;
		index = 0;
		end = writeback_index - 1;
		goto retry;
	}
	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
		mapping->writeback_index = done_index;

	return ret;
}
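/*
 * For context: a sketch of the simplest real caller, mirroring how
 * generic_writepages() in mm/page-writeback.c drives write_cache_pages()
 * with a trivial callback that just invokes the mapping's ->writepage.
 * (Compare __f2fs_writepage() further below, which does the same for f2fs.)
 */
static int __writepage(struct page *page, struct writeback_control *wbc,
		       void *data)
{
	struct address_space *mapping = data;
	int ret = mapping->a_ops->writepage(page, wbc);

	mapping_set_error(mapping, ret);
	return ret;
}

int generic_writepages(struct address_space *mapping,
		       struct writeback_control *wbc)
{
	/* deal with chardevs and other special files */
	if (!mapping->a_ops->writepage)
		return 0;

	return write_cache_pages(mapping, wbc, __writepage, mapping);
}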
/**
 * nilfs_copy_back_pages -- copy back pages to original cache from shadow cache
 * @dmap: destination page cache
 * @smap: source page cache
 *
 * No pages must be added to the cache during this process.
 * This must be ensured by the caller.
 */
void nilfs_copy_back_pages(struct address_space *dmap,
			   struct address_space *smap)
{
	struct pagevec pvec;
	unsigned int i, n;
	pgoff_t index = 0;
	int err;

	pagevec_init(&pvec, 0);
repeat:
	n = pagevec_lookup(&pvec, smap, index, PAGEVEC_SIZE);
	if (!n)
		return;
	index = pvec.pages[n - 1]->index + 1;

	for (i = 0; i < pagevec_count(&pvec); i++) {
		struct page *page = pvec.pages[i], *dpage;
		pgoff_t offset = page->index;

		lock_page(page);
		dpage = find_lock_page(dmap, offset);
		if (dpage) {
			/* override existing page on the destination cache */
			WARN_ON(PageDirty(dpage));
			nilfs_copy_page(dpage, page, 0);
			unlock_page(dpage);
			page_cache_release(dpage);
		} else {
			struct page *page2;

			/* move the page to the destination cache */
			spin_lock_irq(&smap->tree_lock);
			page2 = radix_tree_delete(&smap->page_tree, offset);
			WARN_ON(page2 != page);

			smap->nrpages--;
			spin_unlock_irq(&smap->tree_lock);

			spin_lock_irq(&dmap->tree_lock);
			err = radix_tree_insert(&dmap->page_tree, offset, page);
			if (unlikely(err < 0)) {
				WARN_ON(err == -EEXIST);
				page->mapping = NULL;
				page_cache_release(page); /* for cache */
			} else {
				page->mapping = dmap;
				dmap->nrpages++;
				if (PageDirty(page))
					radix_tree_tag_set(&dmap->page_tree,
							   offset,
							   PAGECACHE_TAG_DIRTY);
			}
			spin_unlock_irq(&dmap->tree_lock);
		}
		unlock_page(page);
	}
	pagevec_release(&pvec);
	cond_resched();

	goto repeat;
}
/*
 * We completely avoid races by reading each swap page in advance,
 * and then search for the process using it. All the necessary
 * page table adjustments can then be made atomically.
 */
static int try_to_unuse(unsigned int type)
{
	struct swap_info_struct *si = &swap_info[type];
	struct mm_struct *start_mm;
	unsigned short *swap_map;
	unsigned short swcount;
	struct page *page;
	swp_entry_t entry;
	int i = 0;
	int retval = 0;
	int reset_overflow = 0;

	/*
	 * When searching mms for an entry, a good strategy is to
	 * start at the first mm we freed the previous entry from
	 * (though actually we don't notice whether we or coincidence
	 * freed the entry). Initialize this start_mm with a hold.
	 *
	 * A simpler strategy would be to start at the last mm we
	 * freed the previous entry from; but that would take less
	 * advantage of mmlist ordering (now preserved by swap_out()),
	 * which clusters forked address spaces together, most recent
	 * child immediately after parent. If we race with dup_mmap(),
	 * we very much want to resolve parent before child, otherwise
	 * we may miss some entries: using last mm would invert that.
	 */
	start_mm = &init_mm;
	atomic_inc(&init_mm.mm_users);

	/*
	 * Keep on scanning until all entries have gone. Usually,
	 * one pass through swap_map is enough, but not necessarily:
	 * mmput() removes mm from mmlist before exit_mmap() and its
	 * zap_page_range(). That's not too bad, those entries are
	 * on their way out, and handled faster there than here.
	 * do_munmap() behaves similarly, taking the range out of mm's
	 * vma list before zap_page_range(). But unfortunately, when
	 * unmapping a part of a vma, it takes the whole out first,
	 * then reinserts what's left after (might even reschedule if
	 * open() method called) - so swap entries may be invisible
	 * to swapoff for a while, then reappear - but that is rare.
	 */
	while ((i = find_next_to_unuse(si, i))) {
		/*
		 * Get a page for the entry, using the existing swap
		 * cache page if there is one. Otherwise, get a clean
		 * page and read the swap into it.
		 */
		swap_map = &si->swap_map[i];
		entry = SWP_ENTRY(type, i);
		page = read_swap_cache_async(entry);
		if (!page) {
			/*
			 * Either swap_duplicate() failed because entry
			 * has been freed independently, and will not be
			 * reused since sys_swapoff() already disabled
			 * allocation from here, or alloc_page() failed.
			 */
			if (!*swap_map)
				continue;
			retval = -ENOMEM;
			break;
		}

		/*
		 * Don't hold on to start_mm if it looks like exiting.
		 */
		if (atomic_read(&start_mm->mm_users) == 1) {
			mmput(start_mm);
			start_mm = &init_mm;
			atomic_inc(&init_mm.mm_users);
		}

		/*
		 * Wait for and lock page. When do_swap_page races with
		 * try_to_unuse, do_swap_page can handle the fault much
		 * faster than try_to_unuse can locate the entry. This
		 * apparently redundant "wait_on_page" lets try_to_unuse
		 * defer to do_swap_page in such a case - in some tests,
		 * do_swap_page and try_to_unuse repeatedly compete.
		 */
		wait_on_page(page);
		lock_page(page);

		/*
		 * Remove all references to entry, without blocking.
		 * Whenever we reach init_mm, there's no address space
		 * to search, but use it as a reminder to search shmem.
		 */
		swcount = *swap_map;
		if (swcount > 1) {
			flush_page_to_ram(page);
			if (start_mm == &init_mm)
				shmem_unuse(entry, page);
			else
				unuse_process(start_mm, entry, page);
		}
		if (*swap_map > 1) {
			int set_start_mm = (*swap_map >= swcount);
			struct list_head *p = &start_mm->mmlist;
			struct mm_struct *new_start_mm = start_mm;
			struct mm_struct *mm;

			spin_lock(&mmlist_lock);
			while (*swap_map > 1 &&
					(p = p->next) != &start_mm->mmlist) {
				mm = list_entry(p, struct mm_struct, mmlist);
				swcount = *swap_map;
				if (mm == &init_mm) {
					set_start_mm = 1;
					shmem_unuse(entry, page);
				} else
					unuse_process(mm, entry, page);
				if (set_start_mm && *swap_map < swcount) {
					new_start_mm = mm;
					set_start_mm = 0;
				}
			}
			atomic_inc(&new_start_mm->mm_users);
			spin_unlock(&mmlist_lock);
			mmput(start_mm);
			start_mm = new_start_mm;
		}

		/*
		 * How could swap count reach 0x7fff when the maximum
		 * pid is 0x7fff, and there's no way to repeat a swap
		 * page within an mm (except in shmem, where it's the
		 * shared object which takes the reference count)?
		 * We believe SWAP_MAP_MAX cannot occur in Linux 2.4.
		 *
		 * If that's wrong, then we should worry more about
		 * exit_mmap() and do_munmap() cases described above:
		 * we might be resetting SWAP_MAP_MAX too early here.
		 * We know "Undead"s can happen, they're okay, so don't
		 * report them; but do report if we reset SWAP_MAP_MAX.
		 */
		if (*swap_map == SWAP_MAP_MAX) {
			swap_list_lock();
			swap_device_lock(si);
			nr_swap_pages++;
			*swap_map = 1;
			swap_device_unlock(si);
			swap_list_unlock();
			reset_overflow = 1;
		}

		/*
		 * If a reference remains (rare), we would like to leave
		 * the page in the swap cache; but try_to_swap_out could
		 * then re-duplicate the entry once we drop page lock,
		 * so we might loop indefinitely; also, that page could
		 * not be swapped out to other storage meanwhile. So:
		 * delete from cache even if there's another reference,
		 * after ensuring that the data has been saved to disk -
		 * since if the reference remains (rarer), it will be
		 * read from disk into another page. Splitting into two
		 * pages would be incorrect if swap supported "shared
		 * private" pages, but they are handled by tmpfs files.
		 * Note shmem_unuse already deleted it from swap cache.
		 */
		swcount = *swap_map;
		if ((swcount > 0) != PageSwapCache(page))
			BUG();
		if ((swcount > 1) && PageDirty(page)) {
			rw_swap_page(WRITE, page);
			lock_page(page);
		}
		if (PageSwapCache(page))
			delete_from_swap_cache(page);

		/*
		 * So we could skip searching mms once swap count went
		 * to 1, we did not mark any present ptes as dirty: must
		 * mark page dirty so try_to_swap_out will preserve it.
		 */
		SetPageDirty(page);
		UnlockPage(page);
		page_cache_release(page);

		/*
		 * Make sure that we aren't completely killing
		 * interactive performance. Interruptible check on
		 * signal_pending() would be nice, but changes the spec?
		 */
		if (current->need_resched)
			schedule();
	}
/*
 * Synchronously write back the locked page and any subsequent non-locked dirty
 * pages.
 */
static int afs_write_back_from_locked_page(struct address_space *mapping,
					   struct writeback_control *wbc,
					   struct page *primary_page,
					   pgoff_t final_page)
{
	struct afs_vnode *vnode = AFS_FS_I(mapping->host);
	struct page *pages[8], *page;
	unsigned long count, priv;
	unsigned n, offset, to, f, t;
	pgoff_t start, first, last;
	int loop, ret;

	_enter(",%lx", primary_page->index);

	count = 1;
	if (test_set_page_writeback(primary_page))
		BUG();

	/* Find all consecutive lockable dirty pages that have contiguous
	 * written regions, stopping when we find a page that is not
	 * immediately lockable, is not dirty or is missing, or we reach the
	 * end of the range.
	 */
	start = primary_page->index;
	priv = page_private(primary_page);
	offset = priv & AFS_PRIV_MAX;
	to = priv >> AFS_PRIV_SHIFT;
	trace_afs_page_dirty(vnode, tracepoint_string("store"),
			     primary_page->index, priv);

	WARN_ON(offset == to);
	if (offset == to)
		trace_afs_page_dirty(vnode, tracepoint_string("WARN"),
				     primary_page->index, priv);

	if (start >= final_page ||
	    (to < PAGE_SIZE && !test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags)))
		goto no_more;

	start++;
	do {
		_debug("more %lx [%lx]", start, count);
		n = final_page - start + 1;
		if (n > ARRAY_SIZE(pages))
			n = ARRAY_SIZE(pages);
		n = find_get_pages_contig(mapping, start, ARRAY_SIZE(pages), pages);
		_debug("fgpc %u", n);
		if (n == 0)
			goto no_more;
		if (pages[0]->index != start) {
			do {
				put_page(pages[--n]);
			} while (n > 0);
			goto no_more;
		}

		for (loop = 0; loop < n; loop++) {
			page = pages[loop];
			if (to != PAGE_SIZE &&
			    !test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags))
				break;
			if (page->index > final_page)
				break;
			if (!trylock_page(page))
				break;
			if (!PageDirty(page) || PageWriteback(page)) {
				unlock_page(page);
				break;
			}

			priv = page_private(page);
			f = priv & AFS_PRIV_MAX;
			t = priv >> AFS_PRIV_SHIFT;
			if (f != 0 &&
			    !test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags)) {
				unlock_page(page);
				break;
			}
			to = t;

			trace_afs_page_dirty(vnode, tracepoint_string("store+"),
					     page->index, priv);

			if (!clear_page_dirty_for_io(page))
				BUG();
			if (test_set_page_writeback(page))
				BUG();
			unlock_page(page);
			put_page(page);
		}
		count += loop;
		if (loop < n) {
			for (; loop < n; loop++)
				put_page(pages[loop]);
			goto no_more;
		}

		start += loop;
	} while (start <= final_page && count < 65536);

no_more:
	/* We now have a contiguous set of dirty pages, each with writeback
	 * set; the first page is still locked at this point, but all the rest
	 * have been unlocked.
	 */
	unlock_page(primary_page);

	first = primary_page->index;
	last = first + count - 1;

	_debug("write back %lx[%u..] to %lx[..%u]", first, offset, last, to);

	ret = afs_store_data(mapping, first, last, offset, to);
	switch (ret) {
	case 0:
		ret = count;
		break;

	default:
		pr_notice("kAFS: Unexpected error from FS.StoreData %d\n", ret);
		/* Fall through */
	case -EACCES:
	case -EPERM:
	case -ENOKEY:
	case -EKEYEXPIRED:
	case -EKEYREJECTED:
	case -EKEYREVOKED:
		afs_redirty_pages(wbc, mapping, first, last);
		mapping_set_error(mapping, ret);
		break;

	case -EDQUOT:
	case -ENOSPC:
		afs_redirty_pages(wbc, mapping, first, last);
		mapping_set_error(mapping, -ENOSPC);
		break;

	case -EROFS:
	case -EIO:
	case -EREMOTEIO:
	case -EFBIG:
	case -ENOENT:
	case -ENOMEDIUM:
	case -ENXIO:
		afs_kill_pages(mapping, first, last);
		mapping_set_error(mapping, ret);
		break;
	}

	_leave(" = %d", ret);
	return ret;
}
static struct page *follow_page_pte(struct vm_area_struct *vma,
		unsigned long address, pmd_t *pmd, unsigned int flags)
{
	struct mm_struct *mm = vma->vm_mm;
	struct page *page;
	spinlock_t *ptl;
	pte_t *ptep, pte;

retry:
	if (unlikely(pmd_bad(*pmd)))
		return no_page_table(vma, flags);

	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
	pte = *ptep;
	if (!pte_present(pte)) {
		swp_entry_t entry;
		/*
		 * KSM's break_ksm() relies upon recognizing a ksm page
		 * even while it is being migrated, so for that case we
		 * need migration_entry_wait().
		 */
		if (likely(!(flags & FOLL_MIGRATION)))
			goto no_page;
		if (pte_none(pte) || pte_file(pte))
			goto no_page;
		entry = pte_to_swp_entry(pte);
		if (!is_migration_entry(entry))
			goto no_page;
		pte_unmap_unlock(ptep, ptl);
		migration_entry_wait(mm, pmd, address);
		goto retry;
	}
	if ((flags & FOLL_NUMA) && pte_numa(pte))
		goto no_page;
	if ((flags & FOLL_WRITE) && !pte_write(pte)) {
		pte_unmap_unlock(ptep, ptl);
		return NULL;
	}

	page = vm_normal_page(vma, address, pte);
	if (unlikely(!page)) {
		if ((flags & FOLL_DUMP) ||
		    !is_zero_pfn(pte_pfn(pte)))
			goto bad_page;
		page = pte_page(pte);
	}

	if (flags & FOLL_GET)
		get_page_foll(page);
	if (flags & FOLL_TOUCH) {
		if ((flags & FOLL_WRITE) &&
		    !pte_dirty(pte) && !PageDirty(page))
			set_page_dirty(page);
		/*
		 * pte_mkyoung() would be more correct here, but atomic care
		 * is needed to avoid losing the dirty bit: it is easier to use
		 * mark_page_accessed().
		 */
		mark_page_accessed(page);
	}
	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
		/*
		 * The preliminary mapping check is mainly to avoid the
		 * pointless overhead of lock_page on the ZERO_PAGE
		 * which might bounce very badly if there is contention.
		 *
		 * If the page is already locked, we don't need to
		 * handle it now - vmscan will handle it later if and
		 * when it attempts to reclaim the page.
		 */
		if (page->mapping && trylock_page(page)) {
			lru_add_drain();	/* push cached pages to LRU */
			/*
			 * Because we lock page here, and migration is
			 * blocked by the pte's page reference, and we
			 * know the page is still mapped, we don't even
			 * need to check for file-cache page truncation.
			 */
			mlock_vma_page(page);
			unlock_page(page);
		}
	}
	pte_unmap_unlock(ptep, ptl);
	return page;
bad_page:
	pte_unmap_unlock(ptep, ptl);
	return ERR_PTR(-EFAULT);

no_page:
	pte_unmap_unlock(ptep, ptl);
	if (!pte_none(pte))
		return NULL;
	return no_page_table(vma, flags);
}
static int tagfs_set_page_dirty(struct page *page)
{
	if (!PageDirty(page))
		return !TestSetPageDirty(page);
	return 0;
}
// check_swap - check the correctness of swap & page replacement algorithm
static void
check_swap(void) {
    size_t nr_free_pages_store = nr_free_pages();
    size_t slab_allocated_store = slab_allocated();

    size_t offset;
    for (offset = 2; offset < max_swap_offset; offset ++) {
        mem_map[offset] = 1;
    }

    struct mm_struct *mm = mm_create();
    assert(mm != NULL);

    extern struct mm_struct *check_mm_struct;
    assert(check_mm_struct == NULL);

    check_mm_struct = mm;

    pgd_t *pgdir = mm->pgdir = boot_pgdir;
    assert(pgdir[0] == 0);

    struct vma_struct *vma = vma_create(0, PTSIZE, VM_WRITE | VM_READ);
    assert(vma != NULL);

    insert_vma_struct(mm, vma);

    struct Page *rp0 = alloc_page(), *rp1 = alloc_page();
    assert(rp0 != NULL && rp1 != NULL);

    uint32_t perm = PTE_U | PTE_W;
    int ret = page_insert(pgdir, rp1, 0, perm);
    assert(ret == 0 && page_ref(rp1) == 1);

    page_ref_inc(rp1);
    ret = page_insert(pgdir, rp0, 0, perm);
    assert(ret == 0 && page_ref(rp1) == 1 && page_ref(rp0) == 1);

    // check try_alloc_swap_entry
    swap_entry_t entry = try_alloc_swap_entry();
    assert(swap_offset(entry) == 1);
    mem_map[1] = 1;
    assert(try_alloc_swap_entry() == 0);

    // set rp1, Swap, Active, add to hash_list, active_list
    swap_page_add(rp1, entry);
    swap_active_list_add(rp1);
    assert(PageSwap(rp1));

    mem_map[1] = 0;
    entry = try_alloc_swap_entry();
    assert(swap_offset(entry) == 1);
    assert(!PageSwap(rp1));

    // check swap_remove_entry
    assert(swap_hash_find(entry) == NULL);
    mem_map[1] = 2;
    swap_remove_entry(entry);
    assert(mem_map[1] == 1);

    swap_page_add(rp1, entry);
    swap_inactive_list_add(rp1);
    swap_remove_entry(entry);
    assert(PageSwap(rp1));
    assert(rp1->index == entry && mem_map[1] == 0);

    // check page_launder, move page from inactive_list to active_list
    assert(page_ref(rp1) == 1);
    assert(nr_active_pages == 0 && nr_inactive_pages == 1);
    assert(list_next(&(inactive_list.swap_list)) == &(rp1->swap_link));

    page_launder();
    assert(nr_active_pages == 1 && nr_inactive_pages == 0);
    assert(PageSwap(rp1) && PageActive(rp1));

    entry = try_alloc_swap_entry();
    assert(swap_offset(entry) == 1);
    assert(!PageSwap(rp1) && nr_active_pages == 0);
    assert(list_empty(&(active_list.swap_list)));

    // set rp1 inactive again
    assert(page_ref(rp1) == 1);
    swap_page_add(rp1, 0);
    assert(PageSwap(rp1) && swap_offset(rp1->index) == 1);
    swap_inactive_list_add(rp1);
    mem_map[1] = 1;
    assert(nr_inactive_pages == 1);
    page_ref_dec(rp1);

    size_t count = nr_free_pages();
    swap_remove_entry(entry);
    assert(nr_inactive_pages == 0 && nr_free_pages() == count + 1);

    // check swap_out_mm
    pte_t *ptep0 = get_pte(pgdir, 0, 0), *ptep1;
    assert(ptep0 != NULL && pte2page(*ptep0) == rp0);

    ret = swap_out_mm(mm, 0);
    assert(ret == 0);

    ret = swap_out_mm(mm, 10);
    assert(ret == 1 && mm->swap_address == PGSIZE);

    ret = swap_out_mm(mm, 10);
    assert(ret == 0 && *ptep0 == entry && mem_map[1] == 1);
    assert(PageDirty(rp0) && PageActive(rp0) && page_ref(rp0) == 0);
    assert(nr_active_pages == 1 && list_next(&(active_list.swap_list)) == &(rp0->swap_link));

    // check refill_inactive_scan()
    refill_inactive_scan();
    assert(!PageActive(rp0) && page_ref(rp0) == 0);
    assert(nr_inactive_pages == 1 && list_next(&(inactive_list.swap_list)) == &(rp0->swap_link));

    page_ref_inc(rp0);
    page_launder();
    assert(PageActive(rp0) && page_ref(rp0) == 1);
    assert(nr_active_pages == 1 && list_next(&(active_list.swap_list)) == &(rp0->swap_link));

    page_ref_dec(rp0);
    refill_inactive_scan();
    assert(!PageActive(rp0));

    // save data in rp0
    int i;
    for (i = 0; i < PGSIZE; i ++) {
        ((char *)page2kva(rp0))[i] = (char)i;
    }

    page_launder();
    assert(nr_inactive_pages == 0 && list_empty(&(inactive_list.swap_list)));
    assert(mem_map[1] == 1);

    rp1 = alloc_page();
    assert(rp1 != NULL);
    ret = swapfs_read(entry, rp1);
    assert(ret == 0);

    for (i = 0; i < PGSIZE; i ++) {
        assert(((char *)page2kva(rp1))[i] == (char)i);
    }

    // page fault now
    *(char *)0 = 0xEF;

    rp0 = pte2page(*ptep0);
    assert(page_ref(rp0) == 1);
    assert(PageSwap(rp0) && PageActive(rp0));

    entry = try_alloc_swap_entry();
    assert(swap_offset(entry) == 1 && mem_map[1] == SWAP_UNUSED);
    assert(!PageSwap(rp0) && nr_active_pages == 0 && nr_inactive_pages == 0);

    // clear accessed flag
    assert(rp0 == pte2page(*ptep0));
    assert(!PageSwap(rp0));

    ret = swap_out_mm(mm, 10);
    assert(ret == 0);
    assert(!PageSwap(rp0) && (*ptep0 & PTE_P));

    // change page table
    ret = swap_out_mm(mm, 10);
    assert(ret == 1);
    assert(*ptep0 == entry && page_ref(rp0) == 0 && mem_map[1] == 1);

    count = nr_free_pages();
    refill_inactive_scan();
    page_launder();
    assert(count + 1 == nr_free_pages());

    ret = swapfs_read(entry, rp1);
    assert(ret == 0 && *(char *)(page2kva(rp1)) == (char)0xEF);
    free_page(rp1);

    // duplicate *ptep0
    ptep1 = get_pte(pgdir, PGSIZE, 0);
    assert(ptep1 != NULL && *ptep1 == 0);
    swap_duplicate(*ptep0);
    *ptep1 = *ptep0;

    // page fault again
    // update for copy on write
    *(char *)1 = 0x88;
    *(char *)(PGSIZE) = 0x8F;
    *(char *)(PGSIZE + 1) = 0xFF;
    assert(pte2page(*ptep0) != pte2page(*ptep1));
    assert(*(char *)0 == (char)0xEF);
    assert(*(char *)1 == (char)0x88);
    assert(*(char *)(PGSIZE) == (char)0x8F);
    assert(*(char *)(PGSIZE + 1) == (char)0xFF);

    rp0 = pte2page(*ptep0);
    rp1 = pte2page(*ptep1);
    assert(!PageSwap(rp0) && PageSwap(rp1) && PageActive(rp1));

    entry = try_alloc_swap_entry();
    assert(!PageSwap(rp0) && !PageSwap(rp1));
    assert(swap_offset(entry) == 1 && mem_map[1] == SWAP_UNUSED);
    assert(list_empty(&(active_list.swap_list)));
    assert(list_empty(&(inactive_list.swap_list)));

    page_insert(pgdir, rp0, PGSIZE, perm | PTE_A);

    // check swap_out_mm
    *(char *)0 = *(char *)PGSIZE = 0xEE;
    mm->swap_address = PGSIZE * 2;
    ret = swap_out_mm(mm, 2);
    assert(ret == 0);
    assert((*ptep0 & PTE_P) && !(*ptep0 & PTE_A));
    assert((*ptep1 & PTE_P) && !(*ptep1 & PTE_A));

    ret = swap_out_mm(mm, 2);
    assert(ret == 2);
    assert(mem_map[1] == 2 && page_ref(rp0) == 0);

    refill_inactive_scan();
    page_launder();
    assert(mem_map[1] == 2 && swap_hash_find(entry) == NULL);

    // check copy entry
    swap_remove_entry(entry);
    *ptep1 = 0;
    assert(mem_map[1] == 1);

    swap_entry_t store;
    ret = swap_copy_entry(entry, &store);
    assert(ret == -E_NO_MEM);

    mem_map[2] = SWAP_UNUSED;
    ret = swap_copy_entry(entry, &store);
    assert(ret == 0 && swap_offset(store) == 2 && mem_map[2] == 0);
    mem_map[2] = 1;
    *ptep1 = store;

    assert(*(char *)PGSIZE == (char)0xEE && *(char *)(PGSIZE + 1) == (char)0x88);

    *(char *)PGSIZE = 1, *(char *)(PGSIZE + 1) = 2;
    assert(*(char *)0 == (char)0xEE && *(char *)1 == (char)0x88);

    ret = swap_in_page(entry, &rp0);
    assert(ret == 0);
    ret = swap_in_page(store, &rp1);
    assert(ret == 0);
    assert(rp1 != rp0);

    // free memory
    swap_list_del(rp0), swap_list_del(rp1);
    swap_page_del(rp0), swap_page_del(rp1);
    assert(page_ref(rp0) == 1 && page_ref(rp1) == 1);
    assert(nr_active_pages == 0 && list_empty(&(active_list.swap_list)));
    assert(nr_inactive_pages == 0 && list_empty(&(inactive_list.swap_list)));

    for (i = 0; i < HASH_LIST_SIZE; i ++) {
        assert(list_empty(hash_list + i));
    }

    page_remove(pgdir, 0);
    page_remove(pgdir, PGSIZE);

    free_page(pa2page(PMD_ADDR(*get_pmd(pgdir, 0, 0))));
    free_page(pa2page(PUD_ADDR(*get_pud(pgdir, 0, 0))));
    free_page(pa2page(PGD_ADDR(*get_pgd(pgdir, 0, 0))));
    pgdir[0] = 0;

    mm->pgdir = NULL;
    mm_destroy(mm);
    check_mm_struct = NULL;

    assert(nr_active_pages == 0 && nr_inactive_pages == 0);
    for (offset = 0; offset < max_swap_offset; offset ++) {
        mem_map[offset] = SWAP_UNUSED;
    }

    assert(nr_free_pages_store == nr_free_pages());
    assert(slab_allocated_store == slab_allocated());

    cprintf("check_swap() succeeded.\n");
}
static int gfs2_write_jdata_pagevec(struct address_space *mapping,
				    struct writeback_control *wbc,
				    struct pagevec *pvec,
				    int nr_pages, pgoff_t end,
				    pgoff_t *done_index)
{
	struct inode *inode = mapping->host;
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	unsigned nrblocks = nr_pages * (PAGE_CACHE_SIZE/inode->i_sb->s_blocksize);
	int i;
	int ret;

	ret = gfs2_trans_begin(sdp, nrblocks, nrblocks);
	if (ret < 0)
		return ret;

	for(i = 0; i < nr_pages; i++) {
		struct page *page = pvec->pages[i];

		/*
		 * At this point, the page may be truncated or
		 * invalidated (changing page->mapping to NULL), or
		 * even swizzled back from swapper_space to tmpfs file
		 * mapping. However, page->index will not change
		 * because we have a reference on the page.
		 */
		if (page->index > end) {
			/*
			 * can't be range_cyclic (1st pass) because
			 * end == -1 in that case.
			 */
			ret = 1;
			break;
		}

		*done_index = page->index;

		lock_page(page);

		if (unlikely(page->mapping != mapping)) {
continue_unlock:
			unlock_page(page);
			continue;
		}

		if (!PageDirty(page)) {
			/* someone wrote it for us */
			goto continue_unlock;
		}

		if (PageWriteback(page)) {
			if (wbc->sync_mode != WB_SYNC_NONE)
				wait_on_page_writeback(page);
			else
				goto continue_unlock;
		}

		BUG_ON(PageWriteback(page));
		if (!clear_page_dirty_for_io(page))
			goto continue_unlock;

		trace_wbc_writepage(wbc, mapping->backing_dev_info);

		ret = __gfs2_jdata_writepage(page, wbc);
		if (unlikely(ret)) {
			if (ret == AOP_WRITEPAGE_ACTIVATE) {
				unlock_page(page);
				ret = 0;
			} else {
				/*
				 * done_index is set past this page,
				 * so media errors will not choke
				 * background writeout for the entire
				 * file. This has consequences for
				 * range_cyclic semantics (ie. it may
				 * not be suitable for data integrity
				 * writeout).
				 */
				*done_index = page->index + 1;
				ret = 1;
				break;
			}
		}

		/*
		 * We stop writing back only if we are not doing
		 * integrity sync. In case of integrity sync we have to
		 * keep going until we have written all the pages
		 * we tagged for writeback prior to entering this loop.
		 */
		if (--wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) {
			ret = 1;
			break;
		}
	}
	gfs2_trans_end(sdp);
	return ret;
}
static int f2fs_write_data_page(struct page *page,
				struct writeback_control *wbc)
{
	struct inode *inode = page->mapping->host;
	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
	loff_t i_size = i_size_read(inode);
	const pgoff_t end_index = ((unsigned long long) i_size)
							>> PAGE_CACHE_SHIFT;
	unsigned offset = 0;
	bool need_balance_fs = false;
	int err = 0;
	struct f2fs_io_info fio = {
		.sbi = sbi,
		.type = DATA,
		.rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE,
		.page = page,
		.encrypted_page = NULL,
	};

	trace_f2fs_writepage(page, DATA);

	if (page->index < end_index)
		goto write;

	/*
	 * If the offset is out-of-range of file size,
	 * this page does not have to be written to disk.
	 */
	offset = i_size & (PAGE_CACHE_SIZE - 1);
	if ((page->index >= end_index + 1) || !offset)
		goto out;

	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
write:
	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
		goto redirty_out;
	if (f2fs_is_drop_cache(inode))
		goto out;
	if (f2fs_is_volatile_file(inode) && !wbc->for_reclaim &&
			available_free_memory(sbi, BASE_CHECK))
		goto redirty_out;

	/* Dentry blocks are controlled by checkpoint */
	if (S_ISDIR(inode->i_mode)) {
		if (unlikely(f2fs_cp_error(sbi)))
			goto redirty_out;
		err = do_write_data_page(&fio);
		goto done;
	}

	/* we should bypass data pages so that the kworker jobs can proceed */
	if (unlikely(f2fs_cp_error(sbi))) {
		SetPageError(page);
		goto out;
	}

	if (!wbc->for_reclaim)
		need_balance_fs = true;
	else if (has_not_enough_free_secs(sbi, 0))
		goto redirty_out;

	err = -EAGAIN;
	f2fs_lock_op(sbi);
	if (f2fs_has_inline_data(inode))
		err = f2fs_write_inline_data(inode, page);
	if (err == -EAGAIN)
		err = do_write_data_page(&fio);
	f2fs_unlock_op(sbi);
done:
	if (err && err != -ENOENT)
		goto redirty_out;

	clear_cold_data(page);
out:
	inode_dec_dirty_pages(inode);
	if (err)
		ClearPageUptodate(page);
	unlock_page(page);
	if (need_balance_fs)
		f2fs_balance_fs(sbi);
	if (wbc->for_reclaim)
		f2fs_submit_merged_bio(sbi, DATA, WRITE);
	return 0;

redirty_out:
	redirty_page_for_writepage(wbc, page);
	return AOP_WRITEPAGE_ACTIVATE;
}

static int __f2fs_writepage(struct page *page, struct writeback_control *wbc,
			    void *data)
{
	struct address_space *mapping = data;
	int ret = mapping->a_ops->writepage(page, wbc);

	mapping_set_error(mapping, ret);
	return ret;
}

/*
 * This function was copied from write_cache_pages in mm/page-writeback.c.
 * The major change is making the write step of cold data pages separate
 * from that of warm/hot data pages.
 */
static int f2fs_write_cache_pages(struct address_space *mapping,
			struct writeback_control *wbc, writepage_t writepage,
			void *data)
{
	int ret = 0;
	int done = 0;
	struct pagevec pvec;
	int nr_pages;
	pgoff_t uninitialized_var(writeback_index);
	pgoff_t index;
	pgoff_t end;		/* Inclusive */
	pgoff_t done_index;
	int cycled;
	int range_whole = 0;
	int tag;
	int step = 0;

	pagevec_init(&pvec, 0);
next:
	if (wbc->range_cyclic) {
		writeback_index = mapping->writeback_index; /* prev offset */
		index = writeback_index;
		if (index == 0)
			cycled = 1;
		else
			cycled = 0;
		end = -1;
	} else {
		index = wbc->range_start >> PAGE_CACHE_SHIFT;
		end = wbc->range_end >> PAGE_CACHE_SHIFT;
		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
			range_whole = 1;
		cycled = 1; /* ignore range_cyclic tests */
	}
	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
		tag = PAGECACHE_TAG_TOWRITE;
	else
		tag = PAGECACHE_TAG_DIRTY;
retry:
	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
		tag_pages_for_writeback(mapping, index, end);
	done_index = index;
	while (!done && (index <= end)) {
		int i;

		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1);
		if (nr_pages == 0)
			break;

		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			if (page->index > end) {
				done = 1;
				break;
			}

			done_index = page->index;

			lock_page(page);

			if (unlikely(page->mapping != mapping)) {
continue_unlock:
				unlock_page(page);
				continue;
			}

			if (!PageDirty(page)) {
				/* someone wrote it for us */
				goto continue_unlock;
			}

			if (step == is_cold_data(page))
				goto continue_unlock;

			if (PageWriteback(page)) {
				if (wbc->sync_mode != WB_SYNC_NONE)
					f2fs_wait_on_page_writeback(page, DATA);
				else
					goto continue_unlock;
			}

			BUG_ON(PageWriteback(page));
			if (!clear_page_dirty_for_io(page))
				goto continue_unlock;

			ret = (*writepage)(page, wbc, data);
			if (unlikely(ret)) {
				if (ret == AOP_WRITEPAGE_ACTIVATE) {
					unlock_page(page);
					ret = 0;
				} else {
					done_index = page->index + 1;
					done = 1;
					break;
				}
			}

			if (--wbc->nr_to_write <= 0 &&
					wbc->sync_mode == WB_SYNC_NONE) {
				done = 1;
				break;
			}
		}
		pagevec_release(&pvec);
		cond_resched();
	}

	if (step < 1) {
		step++;
		goto next;
	}

	if (!cycled && !done) {
		cycled = 1;
		index = 0;
		end = writeback_index - 1;
		goto retry;
	}
	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
		mapping->writeback_index = done_index;

	return ret;
}
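/*
 * For context: the loop above is driven from ->writepages. A condensed
 * sketch of that call site, pairing it with __f2fs_writepage() from
 * earlier; the surrounding bookkeeping of the real f2fs_write_data_pages()
 * (skip checks, merged-bio flush, dirty-page accounting) is omitted here.
 */
static int f2fs_write_data_pages_sketch(struct address_space *mapping,
					struct writeback_control *wbc)
{
	return f2fs_write_cache_pages(mapping, wbc, __f2fs_writepage,
				      mapping);
}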
void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask)
{
	struct page_ext *page_ext = lookup_page_ext(page);
	struct stack_trace trace = {
		.nr_entries = 0,
		.max_entries = ARRAY_SIZE(page_ext->trace_entries),
		.entries = &page_ext->trace_entries[0],
		.skip = 3,
	};

	save_stack_trace(&trace);

	page_ext->order = order;
	page_ext->gfp_mask = gfp_mask;
	page_ext->nr_entries = trace.nr_entries;

	__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
}

static ssize_t
print_page_owner(char __user *buf, size_t count, unsigned long pfn,
		 struct page *page, struct page_ext *page_ext)
{
	int ret;
	int pageblock_mt, page_mt;
	char *kbuf;
	struct stack_trace trace = {
		.nr_entries = page_ext->nr_entries,
		.entries = &page_ext->trace_entries[0],
	};

	kbuf = kmalloc(count, GFP_KERNEL);
	if (!kbuf)
		return -ENOMEM;

	ret = snprintf(kbuf, count,
			"Page allocated via order %u, mask 0x%x\n",
			page_ext->order, page_ext->gfp_mask);
	if (ret >= count)
		goto err;

	/* Print information relevant to grouping pages by mobility */
	pageblock_mt = get_pfnblock_migratetype(page, pfn);
	page_mt = gfpflags_to_migratetype(page_ext->gfp_mask);
	ret += snprintf(kbuf + ret, count - ret,
			"PFN %lu Block %lu type %d %s Flags %s%s%s%s%s%s%s%s%s%s%s%s\n",
			pfn,
			pfn >> pageblock_order,
			pageblock_mt,
			pageblock_mt != page_mt ? "Fallback" : " ",
			PageLocked(page) ? "K" : " ",
			PageError(page) ? "E" : " ",
			PageReferenced(page) ? "R" : " ",
			PageUptodate(page) ? "U" : " ",
			PageDirty(page) ? "D" : " ",
			PageLRU(page) ? "L" : " ",
			PageActive(page) ? "A" : " ",
			PageSlab(page) ? "S" : " ",
			PageWriteback(page) ? "W" : " ",
			PageCompound(page) ? "C" : " ",
			PageSwapCache(page) ? "B" : " ",
			PageMappedToDisk(page) ? "M" : " ");
	if (ret >= count)
		goto err;

	ret += snprint_stack_trace(kbuf + ret, count - ret, &trace, 0);
	if (ret >= count)
		goto err;

	ret += snprintf(kbuf + ret, count - ret, "\n");
	if (ret >= count)
		goto err;

	if (copy_to_user(buf, kbuf, ret))
		ret = -EFAULT;

	kfree(kbuf);
	return ret;

err:
	kfree(kbuf);
	return -ENOMEM;
}

static ssize_t
read_page_owner(struct file *file, char __user *buf, size_t count,
		loff_t *ppos)
{
	unsigned long pfn;
	struct page *page;
	struct page_ext *page_ext;

	if (!page_owner_inited)
		return -EINVAL;

	page = NULL;
	pfn = min_low_pfn + *ppos;

	/* Find a valid PFN or the start of a MAX_ORDER_NR_PAGES area */
	while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0)
		pfn++;

	drain_all_pages(NULL);

	/* Find an allocated page */
	for (; pfn < max_pfn; pfn++) {
		/*
		 * If the new page is in a new MAX_ORDER_NR_PAGES area,
		 * validate the area as existing, skip it if not
		 */
		if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0 && !pfn_valid(pfn)) {
			pfn += MAX_ORDER_NR_PAGES - 1;
			continue;
		}

		/* Check for holes within a MAX_ORDER area */
		if (!pfn_valid_within(pfn))
			continue;

		page = pfn_to_page(pfn);
		if (PageBuddy(page)) {
			unsigned long freepage_order = page_order_unsafe(page);

			if (freepage_order < MAX_ORDER)
				pfn += (1UL << freepage_order) - 1;
			continue;
		}

		page_ext = lookup_page_ext(page);

		/*
		 * Some pages could be missed by concurrent allocation or free,
		 * because we don't hold the zone lock.
		 */
		if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
			continue;

		/* Record the next PFN to read in the file offset */
		*ppos = (pfn - min_low_pfn) + 1;

		return print_page_owner(buf, count, pfn, page, page_ext);
	}

	return 0;
}
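/*
 * A hypothetical user-space companion to read_page_owner() above: the
 * records are exposed through debugfs, and each read() returns one
 * allocated page's record while the kernel advances the file offset to
 * the next PFN itself. This sketch assumes debugfs is mounted at
 * /sys/kernel/debug.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	ssize_t n;
	int fd = open("/sys/kernel/debug/page_owner", O_RDONLY);

	if (fd < 0) {
		perror("open page_owner");
		return 1;
	}
	/* dump each record until read_page_owner() runs out of pages */
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, (size_t)n, stdout);
	close(fd);
	return 0;
}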
/*
 * write a region of pages back to the server
 */
static int afs_writepages_region(struct address_space *mapping,
				 struct writeback_control *wbc,
				 pgoff_t index, pgoff_t end, pgoff_t *_next)
{
	struct page *page;
	int ret, n;

	_enter(",,%lx,%lx,", index, end);

	do {
		n = find_get_pages_range_tag(mapping, &index, end,
					     PAGECACHE_TAG_DIRTY, 1, &page);
		if (!n)
			break;

		_debug("wback %lx", page->index);

		/*
		 * at this point we hold neither the i_pages lock nor the
		 * page lock: the page may be truncated or invalidated
		 * (changing page->mapping to NULL), or even swizzled
		 * back from swapper_space to tmpfs file mapping
		 */
		ret = lock_page_killable(page);
		if (ret < 0) {
			put_page(page);
			_leave(" = %d", ret);
			return ret;
		}

		if (page->mapping != mapping || !PageDirty(page)) {
			unlock_page(page);
			put_page(page);
			continue;
		}

		if (PageWriteback(page)) {
			unlock_page(page);
			if (wbc->sync_mode != WB_SYNC_NONE)
				wait_on_page_writeback(page);
			put_page(page);
			continue;
		}

		if (!clear_page_dirty_for_io(page))
			BUG();
		ret = afs_write_back_from_locked_page(mapping, wbc, page, end);
		put_page(page);
		if (ret < 0) {
			_leave(" = %d", ret);
			return ret;
		}

		wbc->nr_to_write -= ret;

		cond_resched();
	} while (index < end && wbc->nr_to_write > 0);

	*_next = index;
	_leave(" = 0 [%lx]", *_next);
	return 0;
}
static unsigned int shrink_pages(struct mm_struct *mm,
				 struct zone **zone_id_0,
				 struct list_head *zone0_page_list,
				 struct zone **zone_id_1,
				 struct list_head *zone1_page_list,
				 unsigned int num_to_scan)
{
	unsigned long addr;
	unsigned int isolate_pages_counter = 0;
	struct vm_area_struct *vma = mm->mmap;

	while (vma != NULL) {
		for (addr = vma->vm_start; addr < vma->vm_end;
		     addr += PAGE_SIZE) {
			struct page *page;

			/* look up the page backing this virtual address */
			page = follow_page(vma, addr, FOLL_GET);
			if (page && !IS_ERR(page)) {
				put_page(page);
				/* only movable, anonymous and non-dirty pages can be swapped */
				if ((!PageUnevictable(page)) &&
				    (!PageDirty(page)) && (PageAnon(page)) &&
				    (0 == page_is_file_cache(page))) {
					switch (page_zone_id(page)) {
					case 0:
						if (!isolate_lru_page(page)) {
							/* isolate page from LRU and add to temp list */
							*zone_id_0 = page_zone(page);
							/* create new page list, it will be used in shrink_page_list */
							spin_lock_irq(&(*zone_id_0)->lru_lock);
							list_add_tail(&page->lru, zone0_page_list);
							spin_unlock_irq(&(*zone_id_0)->lru_lock);
							isolate_pages_counter++;
						}
						break;
					case 1:
						if (!isolate_lru_page(page)) {
							/* isolate page from LRU and add to temp list */
							*zone_id_1 = page_zone(page);
							/* create new page list, it will be used in shrink_page_list */
							spin_lock_irq(&(*zone_id_1)->lru_lock);
							list_add_tail(&page->lru, zone1_page_list);
							spin_unlock_irq(&(*zone_id_1)->lru_lock);
							isolate_pages_counter++;
						}
						break;
					default:
						break;
					}
				}
			}

			if (isolate_pages_counter >= num_to_scan)
				return isolate_pages_counter;
		}

		vma = vma->vm_next;
	}

	return isolate_pages_counter;
}
static struct page *follow_page_pte(struct vm_area_struct *vma,
		unsigned long address, pmd_t *pmd, unsigned int flags)
{
	struct mm_struct *mm = vma->vm_mm;
	struct dev_pagemap *pgmap = NULL;
	struct page *page;
	spinlock_t *ptl;
	pte_t *ptep, pte;

retry:
	if (unlikely(pmd_bad(*pmd)))
		return no_page_table(vma, flags);

	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
	pte = *ptep;
	if (!pte_present(pte)) {
		swp_entry_t entry;
		/*
		 * KSM's break_ksm() relies upon recognizing a ksm page
		 * even while it is being migrated, so for that case we
		 * need migration_entry_wait().
		 */
		if (likely(!(flags & FOLL_MIGRATION)))
			goto no_page;
		if (pte_none(pte))
			goto no_page;
		entry = pte_to_swp_entry(pte);
		if (!is_migration_entry(entry))
			goto no_page;
		pte_unmap_unlock(ptep, ptl);
		migration_entry_wait(mm, pmd, address);
		goto retry;
	}
	if ((flags & FOLL_NUMA) && pte_protnone(pte))
		goto no_page;
	if ((flags & FOLL_WRITE) && !pte_write(pte)) {
		pte_unmap_unlock(ptep, ptl);
		return NULL;
	}

	page = vm_normal_page(vma, address, pte);
	if (!page && pte_devmap(pte) && (flags & FOLL_GET)) {
		/*
		 * Only return device mapping pages in the FOLL_GET case since
		 * they are only valid while holding the pgmap reference.
		 */
		pgmap = get_dev_pagemap(pte_pfn(pte), NULL);
		if (pgmap)
			page = pte_page(pte);
		else
			goto no_page;
	} else if (unlikely(!page)) {
		if (flags & FOLL_DUMP) {
			/* Avoid special (like zero) pages in core dumps */
			page = ERR_PTR(-EFAULT);
			goto out;
		}

		if (is_zero_pfn(pte_pfn(pte))) {
			page = pte_page(pte);
		} else {
			int ret;

			ret = follow_pfn_pte(vma, address, ptep, flags);
			page = ERR_PTR(ret);
			goto out;
		}
	}

	if (flags & FOLL_SPLIT && PageTransCompound(page)) {
		int ret;

		get_page(page);
		pte_unmap_unlock(ptep, ptl);
		lock_page(page);
		ret = split_huge_page(page);
		unlock_page(page);
		put_page(page);
		if (ret)
			return ERR_PTR(ret);
		goto retry;
	}

	if (flags & FOLL_GET) {
		get_page(page);

		/* drop the pgmap reference now that we hold the page */
		if (pgmap) {
			put_dev_pagemap(pgmap);
			pgmap = NULL;
		}
	}
	if (flags & FOLL_TOUCH) {
		if ((flags & FOLL_WRITE) &&
		    !pte_dirty(pte) && !PageDirty(page))
			set_page_dirty(page);
		/*
		 * pte_mkyoung() would be more correct here, but atomic care
		 * is needed to avoid losing the dirty bit: it is easier to use
		 * mark_page_accessed().
		 */
		mark_page_accessed(page);
	}
	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
		/* Do not mlock pte-mapped THP */
		if (PageTransCompound(page))
			goto out;

		/*
		 * The preliminary mapping check is mainly to avoid the
		 * pointless overhead of lock_page on the ZERO_PAGE
		 * which might bounce very badly if there is contention.
		 *
		 * If the page is already locked, we don't need to
		 * handle it now - vmscan will handle it later if and
		 * when it attempts to reclaim the page.
		 */
		if (page->mapping && trylock_page(page)) {
			lru_add_drain();	/* push cached pages to LRU */
			/*
			 * Because we lock page here, and migration is
			 * blocked by the pte's page reference, and we
			 * know the page is still mapped, we don't even
			 * need to check for file-cache page truncation.
			 */
			mlock_vma_page(page);
			unlock_page(page);
		}
	}
out:
	pte_unmap_unlock(ptep, ptl);
	return page;
no_page:
	pte_unmap_unlock(ptep, ptl);
	if (!pte_none(pte))
		return NULL;
	return no_page_table(vma, flags);
}
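/*
 * A minimal sketch of the caller side, under the usual assumptions for
 * follow_page(): mmap_sem held for read and a stable vma. The helper
 * name is hypothetical; compare shrink_pages() above, which uses the
 * same FOLL_GET pattern.
 */
static bool addr_maps_dirty_page(struct vm_area_struct *vma,
				 unsigned long addr)
{
	struct page *page;
	bool dirty = false;

	page = follow_page(vma, addr, FOLL_GET);
	if (page && !IS_ERR(page)) {
		dirty = PageDirty(page);
		put_page(page);	/* drop the FOLL_GET reference */
	}
	return dirty;
}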
/**
 * write_one_page - write out a single page and optionally wait on I/O
 * @page: the page to write
 * @wait: if true, wait on writeout
 *
 * The page must be locked by the caller and will be unlocked upon return.
 *
 * write_one_page() returns a negative error code if I/O failed.
 */
int write_one_page(struct page *page, int wait)
{
	struct address_space *mapping = page->mapping;
	int ret = 0;
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_ALL,
		.nr_to_write = 1,
	};

	BUG_ON(!PageLocked(page));

	if (wait)
		wait_on_page_writeback(page);

	if (clear_page_dirty_for_io(page)) {
		page_cache_get(page);
		ret = mapping->a_ops->writepage(page, &wbc);
		if (ret == 0 && wait) {
			wait_on_page_writeback(page);
			if (PageError(page))
				ret = -EIO;
		}
		page_cache_release(page);
	} else {
		unlock_page(page);
	}
	return ret;
}
EXPORT_SYMBOL(write_one_page);

/*
 * For address_spaces which do not use buffers nor write back.
 */
int __set_page_dirty_no_writeback(struct page *page)
{
	if (!PageDirty(page))
		SetPageDirty(page);
	return 0;
}

/*
 * Helper function for set_page_dirty family.
 * NOTE: This relies on being atomic wrt interrupts.
 */
void account_page_dirtied(struct page *page, struct address_space *mapping)
{
	if (mapping_cap_account_dirty(mapping)) {
		__inc_zone_page_state(page, NR_FILE_DIRTY);
		__inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
		task_dirty_inc(current);
		task_io_account_write(PAGE_CACHE_SIZE);
	}
}

/*
 * For address_spaces which do not use buffers.  Just tag the page as dirty in
 * its radix tree.
 *
 * This is also used when a single buffer is being dirtied: we want to set the
 * page dirty in that case, but not all the buffers.  This is a "bottom-up"
 * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying.
 *
 * Most callers have locked the page, which pins the address_space in memory.
 * But zap_pte_range() does not lock the page, however in that case the
 * mapping is pinned by the vma's ->vm_file reference.
 *
 * We take care to handle the case where the page was truncated from the
 * mapping by re-checking page_mapping() inside tree_lock.
 */
int __set_page_dirty_nobuffers(struct page *page)
{
	if (!TestSetPageDirty(page)) {
		struct address_space *mapping = page_mapping(page);
		struct address_space *mapping2;

		if (!mapping)
			return 1;

		spin_lock_irq(&mapping->tree_lock);
		mapping2 = page_mapping(page);
		if (mapping2) { /* Race with truncate? */
			BUG_ON(mapping2 != mapping);
			WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
			account_page_dirtied(page, mapping);
			radix_tree_tag_set(&mapping->page_tree,
				page_index(page), PAGECACHE_TAG_DIRTY);
		}
		spin_unlock_irq(&mapping->tree_lock);
		if (mapping->host) {
			/* !PageAnon && !swapper_space */
			__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
		}
		return 1;
	}
	return 0;
}
EXPORT_SYMBOL(__set_page_dirty_nobuffers);

/*
 * When a writepage implementation decides that it doesn't want to write this
 * page for some reason, it should redirty the locked page via
 * redirty_page_for_writepage() and it should then unlock the page and return 0
 */
int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
{
	wbc->pages_skipped++;
	return __set_page_dirty_nobuffers(page);
}
void ub_io_save_context(struct page *page, size_t bytes_dirtied)
{
	struct user_beancounter *ub;
	struct page_beancounter *pb, *mapped_pb, *io_pb;

	if (unlikely(in_interrupt())) {
		WARN_ON_ONCE(1);
		return;
	}

	/*
	 * FIXME - this can happen from atomic context and
	 * it's probably not that good to lose some requests
	 */
	pb = io_pb_alloc();
	io_pb = NULL;

	spin_lock(&pb_lock);
	if (io_debug_precheck_save(page))
		goto out_unlock;

	mapped_pb = page_pbc(page);
	io_pb = iopb_to_pb(mapped_pb);
	if (io_pb != NULL) {
		/*
		 * this page has an IO - release it and force a new one
		 * We could also race with page cleaning - see below
		 */
		mapped_pb = io_pb->page_pb_list;
		put_page_io(page, io_pb);
	}

	/*
	 * If the page is mapped we must save the context
	 * it maps to. If the page isn't mapped we use current
	 * context as this is a regular write.
	 */
	if (mapped_pb != NULL)
		ub = top_beancounter(mapped_pb->ub);
	else
		ub = get_io_ub();

	if (!PageDirty(page)) {
		/*
		 * race with clear_page_dirty(_for_io) - account
		 * writes for ub_io_release_context()
		 */
		if (io_pb != NULL)
			io_pb->ub->bytes_wrote += PAGE_CACHE_SIZE;
		if (pb != NULL)
			io_pb_free(pb);
		goto out_unlock;
	}

	if (pb == NULL) {
		ub->bytes_dirty_missed += bytes_dirtied;
		goto out_unlock;
	}

	/*
	 * the page may become clean here, but the context will be seen
	 * in ub_io_release_context()
	 */
	pb->ub = get_beancounter(ub);
	pb->page_pb_list = mapped_pb;
	ub->bytes_dirtied += bytes_dirtied;

	set_page_io(page, pb, mapped_pb);

out_unlock:
	spin_unlock(&pb_lock);

	if (io_pb != NULL) {
		put_beancounter(io_pb->ub);
		io_pb_free(io_pb);
	}
}
/**
 * write_one_page - write out a single page and optionally wait on I/O
 * @page: the page to write
 * @wait: if true, wait on writeout
 *
 * The page must be locked by the caller and will be unlocked upon return.
 *
 * write_one_page() returns a negative error code if I/O failed.
 */
int write_one_page(struct page *page, int wait)
{
	struct address_space *mapping = page->mapping;
	int ret = 0;
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_ALL,
		.nr_to_write = 1,
	};

	BUG_ON(!PageLocked(page));

	if (wait)
		wait_on_page_writeback(page);

	if (clear_page_dirty_for_io(page)) {
		page_cache_get(page);
		ret = mapping->a_ops->writepage(page, &wbc);
		if (ret == 0 && wait) {
			wait_on_page_writeback(page);
			if (PageError(page))
				ret = -EIO;
		}
		page_cache_release(page);
	} else {
		unlock_page(page);
	}
	return ret;
}
EXPORT_SYMBOL(write_one_page);

/*
 * For address_spaces which do not use buffers nor write back.
 */
int __set_page_dirty_no_writeback(struct page *page)
{
	if (!PageDirty(page))
		return !TestSetPageDirty(page);
	return 0;
}

/*
 * Helper function for set_page_dirty family.
 * NOTE: This relies on being atomic wrt interrupts.
 */
void account_page_dirtied(struct page *page, struct address_space *mapping)
{
	if (mapping_cap_account_dirty(mapping)) {
		__inc_zone_page_state(page, NR_FILE_DIRTY);
		__inc_zone_page_state(page, NR_DIRTIED);
		__inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
		task_dirty_inc(current);
		task_io_account_write(PAGE_CACHE_SIZE);
	}
}
EXPORT_SYMBOL(account_page_dirtied);

/*
 * Helper function for set_page_writeback family.
 * NOTE: Unlike account_page_dirtied this does not rely on being atomic
 * wrt interrupts.
 */
void account_page_writeback(struct page *page)
{
	inc_zone_page_state(page, NR_WRITEBACK);
	inc_zone_page_state(page, NR_WRITTEN);
}
int write_one_page(struct page *page, int wait)
{
	struct address_space *mapping = page->mapping;
	int ret = 0;
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_ALL,
		.nr_to_write = 1,
	};

	BUG_ON(!PageLocked(page));

	if (wait)
		wait_on_page_writeback(page);

	if (clear_page_dirty_for_io(page)) {
		page_cache_get(page);
		ret = mapping->a_ops->writepage(page, &wbc);
		if (ret == 0 && wait) {
			wait_on_page_writeback(page);
			if (PageError(page))
				ret = -EIO;
		}
		page_cache_release(page);
	} else {
		unlock_page(page);
	}
	return ret;
}
EXPORT_SYMBOL(write_one_page);

int __set_page_dirty_no_writeback(struct page *page)
{
	if (!PageDirty(page))
		SetPageDirty(page);
	return 0;
}

void account_page_dirtied(struct page *page, struct address_space *mapping)
{
	if (mapping_cap_account_dirty(mapping)) {
		__inc_zone_page_state(page, NR_FILE_DIRTY);
		__inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
		task_dirty_inc(current);
		task_io_account_write(PAGE_CACHE_SIZE);
	}
}

int __set_page_dirty_nobuffers(struct page *page)
{
	if (!TestSetPageDirty(page)) {
		struct address_space *mapping = page_mapping(page);
		struct address_space *mapping2;

		if (!mapping)
			return 1;

		spin_lock_irq(&mapping->tree_lock);
		mapping2 = page_mapping(page);
		if (mapping2) { /* Race with truncate? */
			BUG_ON(mapping2 != mapping);
			WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
			account_page_dirtied(page, mapping);
			radix_tree_tag_set(&mapping->page_tree,
				page_index(page), PAGECACHE_TAG_DIRTY);
		}
		spin_unlock_irq(&mapping->tree_lock);
		if (mapping->host) {
			/* !PageAnon && !swapper_space */
			__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
		}
		return 1;
	}
	return 0;
}
EXPORT_SYMBOL(__set_page_dirty_nobuffers);

int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
{
	wbc->pages_skipped++;
	return __set_page_dirty_nobuffers(page);
}
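/*
 * A hypothetical caller of write_one_page(), shown to make the locking
 * contract concrete: the page must be locked going in, and it is
 * unlocked on return whether or not I/O was started.
 */
static int flush_one_dirty_page(struct page *page)
{
	int err = 0;

	lock_page(page);
	if (page->mapping)			/* not truncated meanwhile */
		err = write_one_page(page, 1);	/* unlocks the page */
	else
		unlock_page(page);		/* raced with truncate */
	return err;
}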