/** * free_swap_cache:page从交换区高速缓存中删除 */ static inline void free_swap_cache(struct page *page) { if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) { try_to_free_swap(page); unlock_page(page); } }
/* * If we are the only user, then try to free up the swap cache. * * Its ok to check for PageSwapCache without the page lock * here because we are going to recheck again inside * exclusive_swap_page() _with_ the lock. * - Marcelo */ static inline void free_swap_cache(struct page *page) { if (PageSwapCache(page) && trylock_page(page)) { remove_exclusive_swap_page(page); unlock_page(page); } }
/* * When an ext3-ordered file is truncated, it is possible that many pages are * not successfully freed, because they are attached to a committing transaction. * After the transaction commits, these pages are left on the LRU, with no * ->mapping, and with attached buffers. These pages are trivially reclaimable * by the VM, but their apparent absence upsets the VM accounting, and it makes * the numbers in /proc/meminfo look odd. * * So here, we have a buffer which has just come off the forget list. Look to * see if we can strip all buffers from the backing page. * * Called under journal->j_list_lock. The caller provided us with a ref * against the buffer, and we drop that here. */ static void release_buffer_page(struct buffer_head *bh) { struct page *page; if (buffer_dirty(bh)) goto nope; if (atomic_read(&bh->b_count) != 1) goto nope; page = bh->b_page; if (!page) goto nope; if (page->mapping) goto nope; /* OK, it's a truncated page */ if (!trylock_page(page)) goto nope; page_cache_get(page); __brelse(bh); try_to_free_buffers(page); unlock_page(page); page_cache_release(page); return; nope: __brelse(bh); }
/* * Free the swap entry like above, but also try to * free the page cache entry if it is the last user. */ void free_swap_and_cache(swp_entry_t entry) { struct swap_info_struct * p; struct page *page = NULL; if (is_migration_entry(entry)) return; p = swap_info_get(entry); if (p) { if (swap_entry_free(p, swp_offset(entry)) == 1) { page = find_get_page(&swapper_space, entry.val); if (page && unlikely(!trylock_page(page))) { page_cache_release(page); page = NULL; } } spin_unlock(&swap_lock); } if (page) { int one_user; BUG_ON(PagePrivate(page)); one_user = (page_count(page) == 2); /* Only cache user (+us), or swap space full? Free it! */ /* Also recheck PageSwapCache after page is locked (above) */ if (PageSwapCache(page) && !PageWriteback(page) && (one_user || vm_swap_full())) { delete_from_swap_cache(page); SetPageDirty(page); } unlock_page(page); page_cache_release(page); } }
/* * see if a page needs releasing upon read_cache_pages() failure * - the caller of read_cache_pages() may have set PG_private or PG_fscache * before calling, such as the NFS fs marking pages that are cached locally * on disk, thus we need to give the fs a chance to clean up in the event of * an error */ static void read_cache_pages_invalidate_page(struct address_space *mapping, struct page *page) { if (page_has_private(page)) { if (!trylock_page(page)) BUG(); page->mapping = mapping; do_invalidatepage(page, 0); page->mapping = NULL; unlock_page(page); } page_cache_release(page); }
/* * Try to drop buffers from the pages in a pagevec */ void pagevec_strip(struct pagevec *pvec) { int i; for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; if (page_has_private(page) && trylock_page(page)) { if (page_has_private(page)) try_to_release_page(page, 0); unlock_page(page); } } }
/** * pagevec_swap_free - try to free swap space from the pages in a pagevec * @pvec: pagevec with swapcache pages to free the swap space of * * The caller needs to hold an extra reference to each page and * not hold the page lock on the pages. This function uses a * trylock on the page lock so it may not always free the swap * space associated with a page. */ void pagevec_swap_free(struct pagevec *pvec) { int i; for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; if (PageSwapCache(page) && trylock_page(page)) { if (PageSwapCache(page)) remove_exclusive_swap_page_ref(page); unlock_page(page); } } }
/** * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode * @mapping: the address_space which holds the pages to invalidate * @start: the offset 'from' which to invalidate * @end: the offset 'to' which to invalidate (inclusive) * * This function only removes the unlocked pages, if you want to * remove all the pages of one inode, you must call truncate_inode_pages. * * invalidate_mapping_pages() will not block on IO activity. It will not * invalidate pages which are dirty, locked, under writeback or mapped into * pagetables. */ unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end) { pgoff_t indices[PAGEVEC_SIZE]; struct pagevec pvec; pgoff_t index = start; unsigned long ret; unsigned long count = 0; int i; pagevec_init(&pvec, 0); while (index <= end && __pagevec_lookup(&pvec, mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, indices)) { mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; /* We rely upon deletion not changing page->index */ index = indices[i]; if (index > end) break; if (radix_tree_exceptional_entry(page)) { clear_exceptional_entry(mapping, index, page); continue; } if (!trylock_page(page)) continue; WARN_ON(page->index != index); ret = invalidate_inode_page(page); unlock_page(page); /* * Invalidation is a hint that the page is no longer * of interest and try to speed up its reclaim. */ if (!ret) deactivate_page(page); count += ret; } pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); mem_cgroup_uncharge_end(); cond_resched(); index++; } return count; }
/** * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode * @mapping: the address_space which holds the pages to invalidate * @start: the offset 'from' which to invalidate * @end: the offset 'to' which to invalidate (inclusive) * * This function only removes the unlocked pages, if you want to * remove all the pages of one inode, you must call truncate_inode_pages. * * invalidate_mapping_pages() will not block on IO activity. It will not * invalidate pages which are dirty, locked, under writeback or mapped into * pagetables. */ unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end) { struct pagevec pvec; pgoff_t index = start; unsigned long ret; unsigned long count = 0; int i; /* * Note: this function may get called on a shmem/tmpfs mapping: * pagevec_lookup() might then return 0 prematurely (because it * got a gangful of swap entries); but it's hardly worth worrying * about - it can rarely have anything to free from such a mapping * (most pages are dirty), and already skips over any difficulties. */ pagevec_init(&pvec, 0); while (index <= end && pagevec_lookup(&pvec, mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; /* We rely upon deletion not changing page->index */ index = page->index; if (index > end) break; if (!trylock_page(page)) continue; WARN_ON(page->index != index); ret = invalidate_inode_page(page); unlock_page(page); /* * Invalidation is a hint that the page is no longer * of interest and try to speed up its reclaim. */ if (!ret) deactivate_page(page); count += ret; } pagevec_release(&pvec); mem_cgroup_uncharge_end(); cond_resched(); index++; } return count; }
static struct page *follow_page_pte(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, unsigned int flags) { struct mm_struct *mm = vma->vm_mm; struct dev_pagemap *pgmap = NULL; struct page *page; spinlock_t *ptl; pte_t *ptep, pte; retry: if (unlikely(pmd_bad(*pmd))) return no_page_table(vma, flags); ptep = pte_offset_map_lock(mm, pmd, address, &ptl); pte = *ptep; if (!pte_present(pte)) { swp_entry_t entry; /* * KSM's break_ksm() relies upon recognizing a ksm page * even while it is being migrated, so for that case we * need migration_entry_wait(). */ if (likely(!(flags & FOLL_MIGRATION))) goto no_page; if (pte_none(pte)) goto no_page; entry = pte_to_swp_entry(pte); if (!is_migration_entry(entry)) goto no_page; pte_unmap_unlock(ptep, ptl); migration_entry_wait(mm, pmd, address); goto retry; } if ((flags & FOLL_NUMA) && pte_protnone(pte)) goto no_page; if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) { pte_unmap_unlock(ptep, ptl); return NULL; } page = vm_normal_page(vma, address, pte); if (!page && pte_devmap(pte) && (flags & FOLL_GET)) { /* * Only return device mapping pages in the FOLL_GET case since * they are only valid while holding the pgmap reference. */ pgmap = get_dev_pagemap(pte_pfn(pte), NULL); if (pgmap) page = pte_page(pte); else goto no_page; } else if (unlikely(!page)) { if (flags & FOLL_DUMP) { /* Avoid special (like zero) pages in core dumps */ page = ERR_PTR(-EFAULT); goto out; } if (is_zero_pfn(pte_pfn(pte))) { page = pte_page(pte); } else { int ret; ret = follow_pfn_pte(vma, address, ptep, flags); page = ERR_PTR(ret); goto out; } } if (flags & FOLL_SPLIT && PageTransCompound(page)) { int ret; get_page(page); pte_unmap_unlock(ptep, ptl); lock_page(page); ret = split_huge_page(page); unlock_page(page); put_page(page); if (ret) return ERR_PTR(ret); goto retry; } if (flags & FOLL_GET) { get_page(page); /* drop the pgmap reference now that we hold the page */ if (pgmap) { put_dev_pagemap(pgmap); pgmap = NULL; } } if (flags & FOLL_TOUCH) { if ((flags & FOLL_WRITE) && !pte_dirty(pte) && !PageDirty(page)) set_page_dirty(page); /* * pte_mkyoung() would be more correct here, but atomic care * is needed to avoid losing the dirty bit: it is easier to use * mark_page_accessed(). */ mark_page_accessed(page); } if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { /* Do not mlock pte-mapped THP */ if (PageTransCompound(page)) goto out; /* * The preliminary mapping check is mainly to avoid the * pointless overhead of lock_page on the ZERO_PAGE * which might bounce very badly if there is contention. * * If the page is already locked, we don't need to * handle it now - vmscan will handle it later if and * when it attempts to reclaim the page. */ if (page->mapping && trylock_page(page)) { lru_add_drain(); /* push cached pages to LRU */ /* * Because we lock page here, and migration is * blocked by the pte's page reference, and we * know the page is still mapped, we don't even * need to check for file-cache page truncation. */ mlock_vma_page(page); unlock_page(page); } } out: pte_unmap_unlock(ptep, ptl); return page; no_page: pte_unmap_unlock(ptep, ptl); if (!pte_none(pte)) return NULL; return no_page_table(vma, flags); }
/* * journal_commit_transaction * * The primary function for committing a transaction to the log. This * function is called by the journal thread to begin a complete commit. */ void journal_commit_transaction(journal_t *journal) { transaction_t *commit_transaction; struct journal_head *jh, *new_jh, *descriptor; struct buffer_head **wbuf = journal->j_wbuf; int bufs; int flags; int err; unsigned int blocknr; ktime_t start_time; u64 commit_time; char *tagp = NULL; journal_header_t *header; journal_block_tag_t *tag = NULL; int space_left = 0; int first_tag = 0; int tag_flag; int i; struct blk_plug plug; /* * First job: lock down the current transaction and wait for * all outstanding updates to complete. */ /* Do we need to erase the effects of a prior journal_flush? */ if (journal->j_flags & JFS_FLUSHED) { jbd_debug(3, "super block updated\n"); journal_update_superblock(journal, 1); } else { jbd_debug(3, "superblock not updated\n"); } J_ASSERT(journal->j_running_transaction != NULL); J_ASSERT(journal->j_committing_transaction == NULL); commit_transaction = journal->j_running_transaction; J_ASSERT(commit_transaction->t_state == T_RUNNING); trace_jbd_start_commit(journal, commit_transaction); jbd_debug(1, "JBD: starting commit of transaction %d\n", commit_transaction->t_tid); spin_lock(&journal->j_state_lock); commit_transaction->t_state = T_LOCKED; trace_jbd_commit_locking(journal, commit_transaction); spin_lock(&commit_transaction->t_handle_lock); while (commit_transaction->t_updates) { DEFINE_WAIT(wait); prepare_to_wait(&journal->j_wait_updates, &wait, TASK_UNINTERRUPTIBLE); if (commit_transaction->t_updates) { spin_unlock(&commit_transaction->t_handle_lock); spin_unlock(&journal->j_state_lock); schedule(); spin_lock(&journal->j_state_lock); spin_lock(&commit_transaction->t_handle_lock); } finish_wait(&journal->j_wait_updates, &wait); } spin_unlock(&commit_transaction->t_handle_lock); J_ASSERT (commit_transaction->t_outstanding_credits <= journal->j_max_transaction_buffers); /* * First thing we are allowed to do is to discard any remaining * BJ_Reserved buffers. Note, it is _not_ permissible to assume * that there are no such buffers: if a large filesystem * operation like a truncate needs to split itself over multiple * transactions, then it may try to do a journal_restart() while * there are still BJ_Reserved buffers outstanding. These must * be released cleanly from the current transaction. * * In this case, the filesystem must still reserve write access * again before modifying the buffer in the new transaction, but * we do not require it to remember exactly which old buffers it * has reserved. This is consistent with the existing behaviour * that multiple journal_get_write_access() calls to the same * buffer are perfectly permissible. */ while (commit_transaction->t_reserved_list) { jh = commit_transaction->t_reserved_list; JBUFFER_TRACE(jh, "reserved, unused: refile"); /* * A journal_get_undo_access()+journal_release_buffer() may * leave undo-committed data. */ if (jh->b_committed_data) { struct buffer_head *bh = jh2bh(jh); jbd_lock_bh_state(bh); jbd_free(jh->b_committed_data, bh->b_size); jh->b_committed_data = NULL; jbd_unlock_bh_state(bh); } journal_refile_buffer(journal, jh); } /* * Now try to drop any written-back buffers from the journal's * checkpoint lists. We do this *before* commit because it potentially * frees some memory */ spin_lock(&journal->j_list_lock); __journal_clean_checkpoint_list(journal); spin_unlock(&journal->j_list_lock); jbd_debug (3, "JBD: commit phase 1\n"); /* * Clear revoked flag to reflect there is no revoked buffers * in the next transaction which is going to be started. */ journal_clear_buffer_revoked_flags(journal); /* * Switch to a new revoke table. */ journal_switch_revoke_table(journal); trace_jbd_commit_flushing(journal, commit_transaction); commit_transaction->t_state = T_FLUSH; journal->j_committing_transaction = commit_transaction; journal->j_running_transaction = NULL; start_time = ktime_get(); commit_transaction->t_log_start = journal->j_head; wake_up(&journal->j_wait_transaction_locked); spin_unlock(&journal->j_state_lock); jbd_debug (3, "JBD: commit phase 2\n"); /* * Now start flushing things to disk, in the order they appear * on the transaction lists. Data blocks go first. */ blk_start_plug(&plug); err = journal_submit_data_buffers(journal, commit_transaction, WRITE_SYNC); blk_finish_plug(&plug); /* * Wait for all previously submitted IO to complete. */ spin_lock(&journal->j_list_lock); while (commit_transaction->t_locked_list) { struct buffer_head *bh; jh = commit_transaction->t_locked_list->b_tprev; bh = jh2bh(jh); get_bh(bh); if (buffer_locked(bh)) { spin_unlock(&journal->j_list_lock); wait_on_buffer(bh); spin_lock(&journal->j_list_lock); } if (unlikely(!buffer_uptodate(bh))) { if (!trylock_page(bh->b_page)) { spin_unlock(&journal->j_list_lock); lock_page(bh->b_page); spin_lock(&journal->j_list_lock); } if (bh->b_page->mapping) set_bit(AS_EIO, &bh->b_page->mapping->flags); unlock_page(bh->b_page); SetPageError(bh->b_page); err = -EIO; } if (!inverted_lock(journal, bh)) { put_bh(bh); spin_lock(&journal->j_list_lock); continue; } if (buffer_jbd(bh) && bh2jh(bh) == jh && jh->b_transaction == commit_transaction && jh->b_jlist == BJ_Locked) __journal_unfile_buffer(jh); jbd_unlock_bh_state(bh); release_data_buffer(bh); cond_resched_lock(&journal->j_list_lock); } spin_unlock(&journal->j_list_lock); if (err) { char b[BDEVNAME_SIZE]; printk(KERN_WARNING "JBD: Detected IO errors while flushing file data " "on %s\n", bdevname(journal->j_fs_dev, b)); if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR) journal_abort(journal, err); err = 0; } blk_start_plug(&plug); journal_write_revoke_records(journal, commit_transaction, WRITE_SYNC); /* * If we found any dirty or locked buffers, then we should have * looped back up to the write_out_data label. If there weren't * any then journal_clean_data_list should have wiped the list * clean by now, so check that it is in fact empty. */ J_ASSERT (commit_transaction->t_sync_datalist == NULL); jbd_debug (3, "JBD: commit phase 3\n"); /* * Way to go: we have now written out all of the data for a * transaction! Now comes the tricky part: we need to write out * metadata. Loop over the transaction's entire buffer list: */ spin_lock(&journal->j_state_lock); commit_transaction->t_state = T_COMMIT; spin_unlock(&journal->j_state_lock); trace_jbd_commit_logging(journal, commit_transaction); J_ASSERT(commit_transaction->t_nr_buffers <= commit_transaction->t_outstanding_credits); descriptor = NULL; bufs = 0; while (commit_transaction->t_buffers) { /* Find the next buffer to be journaled... */ jh = commit_transaction->t_buffers; /* If we're in abort mode, we just un-journal the buffer and release it. */ if (is_journal_aborted(journal)) { clear_buffer_jbddirty(jh2bh(jh)); JBUFFER_TRACE(jh, "journal is aborting: refile"); journal_refile_buffer(journal, jh); /* If that was the last one, we need to clean up * any descriptor buffers which may have been * already allocated, even if we are now * aborting. */ if (!commit_transaction->t_buffers) goto start_journal_io; continue; } /* Make sure we have a descriptor block in which to record the metadata buffer. */ if (!descriptor) { struct buffer_head *bh; J_ASSERT (bufs == 0); jbd_debug(4, "JBD: get descriptor\n"); descriptor = journal_get_descriptor_buffer(journal); if (!descriptor) { journal_abort(journal, -EIO); continue; } bh = jh2bh(descriptor); jbd_debug(4, "JBD: got buffer %llu (%p)\n", (unsigned long long)bh->b_blocknr, bh->b_data); header = (journal_header_t *)&bh->b_data[0]; header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK); header->h_sequence = cpu_to_be32(commit_transaction->t_tid); tagp = &bh->b_data[sizeof(journal_header_t)]; space_left = bh->b_size - sizeof(journal_header_t); first_tag = 1; set_buffer_jwrite(bh); set_buffer_dirty(bh); wbuf[bufs++] = bh; /* Record it so that we can wait for IO completion later */ BUFFER_TRACE(bh, "ph3: file as descriptor"); journal_file_buffer(descriptor, commit_transaction, BJ_LogCtl); } /* Where is the buffer to be written? */ err = journal_next_log_block(journal, &blocknr); /* If the block mapping failed, just abandon the buffer and repeat this loop: we'll fall into the refile-on-abort condition above. */ if (err) { journal_abort(journal, err); continue; } /* * start_this_handle() uses t_outstanding_credits to determine * the free space in the log, but this counter is changed * by journal_next_log_block() also. */ commit_transaction->t_outstanding_credits--; /* Bump b_count to prevent truncate from stumbling over the shadowed buffer! @@@ This can go if we ever get rid of the BJ_IO/BJ_Shadow pairing of buffers. */ get_bh(jh2bh(jh)); /* Make a temporary IO buffer with which to write it out (this will requeue both the metadata buffer and the temporary IO buffer). new_bh goes on BJ_IO*/ set_buffer_jwrite(jh2bh(jh)); /* * akpm: journal_write_metadata_buffer() sets * new_bh->b_transaction to commit_transaction. * We need to clean this up before we release new_bh * (which is of type BJ_IO) */ JBUFFER_TRACE(jh, "ph3: write metadata"); flags = journal_write_metadata_buffer(commit_transaction, jh, &new_jh, blocknr); set_buffer_jwrite(jh2bh(new_jh)); wbuf[bufs++] = jh2bh(new_jh); /* Record the new block's tag in the current descriptor buffer */ tag_flag = 0; if (flags & 1) tag_flag |= JFS_FLAG_ESCAPE; if (!first_tag) tag_flag |= JFS_FLAG_SAME_UUID; tag = (journal_block_tag_t *) tagp; tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr); tag->t_flags = cpu_to_be32(tag_flag); tagp += sizeof(journal_block_tag_t); space_left -= sizeof(journal_block_tag_t); if (first_tag) { memcpy (tagp, journal->j_uuid, 16); tagp += 16; space_left -= 16; first_tag = 0; } /* If there's no more to do, or if the descriptor is full, let the IO rip! */ if (bufs == journal->j_wbufsize || commit_transaction->t_buffers == NULL || space_left < sizeof(journal_block_tag_t) + 16) { jbd_debug(4, "JBD: Submit %d IOs\n", bufs); /* Write an end-of-descriptor marker before submitting the IOs. "tag" still points to the last tag we set up. */ tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG); start_journal_io: for (i = 0; i < bufs; i++) { struct buffer_head *bh = wbuf[i]; lock_buffer(bh); clear_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = journal_end_buffer_io_sync; submit_bh(WRITE_SYNC, bh); } cond_resched(); /* Force a new descriptor to be generated next time round the loop. */ descriptor = NULL; bufs = 0; } } blk_finish_plug(&plug); /* Lo and behold: we have just managed to send a transaction to the log. Before we can commit it, wait for the IO so far to complete. Control buffers being written are on the transaction's t_log_list queue, and metadata buffers are on the t_iobuf_list queue. Wait for the buffers in reverse order. That way we are less likely to be woken up until all IOs have completed, and so we incur less scheduling load. */ jbd_debug(3, "JBD: commit phase 4\n"); /* * akpm: these are BJ_IO, and j_list_lock is not needed. * See __journal_try_to_free_buffer. */ wait_for_iobuf: while (commit_transaction->t_iobuf_list != NULL) { struct buffer_head *bh; jh = commit_transaction->t_iobuf_list->b_tprev; bh = jh2bh(jh); if (buffer_locked(bh)) { wait_on_buffer(bh); goto wait_for_iobuf; } if (cond_resched()) goto wait_for_iobuf; if (unlikely(!buffer_uptodate(bh))) err = -EIO; clear_buffer_jwrite(bh); JBUFFER_TRACE(jh, "ph4: unfile after journal write"); journal_unfile_buffer(journal, jh); /* * ->t_iobuf_list should contain only dummy buffer_heads * which were created by journal_write_metadata_buffer(). */ BUFFER_TRACE(bh, "dumping temporary bh"); journal_put_journal_head(jh); __brelse(bh); J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0); free_buffer_head(bh); /* We also have to unlock and free the corresponding shadowed buffer */ jh = commit_transaction->t_shadow_list->b_tprev; bh = jh2bh(jh); clear_buffer_jwrite(bh); J_ASSERT_BH(bh, buffer_jbddirty(bh)); /* The metadata is now released for reuse, but we need to remember it against this transaction so that when we finally commit, we can do any checkpointing required. */ JBUFFER_TRACE(jh, "file as BJ_Forget"); journal_file_buffer(jh, commit_transaction, BJ_Forget); /* * Wake up any transactions which were waiting for this * IO to complete. The barrier must be here so that changes * by journal_file_buffer() take effect before wake_up_bit() * does the waitqueue check. */ smp_mb(); wake_up_bit(&bh->b_state, BH_Unshadow); JBUFFER_TRACE(jh, "brelse shadowed buffer"); __brelse(bh); } J_ASSERT (commit_transaction->t_shadow_list == NULL); jbd_debug(3, "JBD: commit phase 5\n"); /* Here we wait for the revoke record and descriptor record buffers */ wait_for_ctlbuf: while (commit_transaction->t_log_list != NULL) { struct buffer_head *bh; jh = commit_transaction->t_log_list->b_tprev; bh = jh2bh(jh); if (buffer_locked(bh)) { wait_on_buffer(bh); goto wait_for_ctlbuf; } if (cond_resched()) goto wait_for_ctlbuf; if (unlikely(!buffer_uptodate(bh))) err = -EIO; BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); clear_buffer_jwrite(bh); journal_unfile_buffer(journal, jh); journal_put_journal_head(jh); __brelse(bh); /* One for getblk */ /* AKPM: bforget here */ } if (err) journal_abort(journal, err); jbd_debug(3, "JBD: commit phase 6\n"); /* All metadata is written, now write commit record and do cleanup */ spin_lock(&journal->j_state_lock); J_ASSERT(commit_transaction->t_state == T_COMMIT); commit_transaction->t_state = T_COMMIT_RECORD; spin_unlock(&journal->j_state_lock); if (journal_write_commit_record(journal, commit_transaction)) err = -EIO; if (err) journal_abort(journal, err); /* End of a transaction! Finally, we can do checkpoint processing: any buffers committed as a result of this transaction can be removed from any checkpoint list it was on before. */ jbd_debug(3, "JBD: commit phase 7\n"); J_ASSERT(commit_transaction->t_sync_datalist == NULL); J_ASSERT(commit_transaction->t_buffers == NULL); J_ASSERT(commit_transaction->t_checkpoint_list == NULL); J_ASSERT(commit_transaction->t_iobuf_list == NULL); J_ASSERT(commit_transaction->t_shadow_list == NULL); J_ASSERT(commit_transaction->t_log_list == NULL); restart_loop: /* * As there are other places (journal_unmap_buffer()) adding buffers * to this list we have to be careful and hold the j_list_lock. */ spin_lock(&journal->j_list_lock); while (commit_transaction->t_forget) { transaction_t *cp_transaction; struct buffer_head *bh; int try_to_free = 0; jh = commit_transaction->t_forget; spin_unlock(&journal->j_list_lock); bh = jh2bh(jh); /* * Get a reference so that bh cannot be freed before we are * done with it. */ get_bh(bh); jbd_lock_bh_state(bh); J_ASSERT_JH(jh, jh->b_transaction == commit_transaction || jh->b_transaction == journal->j_running_transaction); /* * If there is undo-protected committed data against * this buffer, then we can remove it now. If it is a * buffer needing such protection, the old frozen_data * field now points to a committed version of the * buffer, so rotate that field to the new committed * data. * * Otherwise, we can just throw away the frozen data now. */ if (jh->b_committed_data) { jbd_free(jh->b_committed_data, bh->b_size); jh->b_committed_data = NULL; if (jh->b_frozen_data) { jh->b_committed_data = jh->b_frozen_data; jh->b_frozen_data = NULL; } } else if (jh->b_frozen_data) { jbd_free(jh->b_frozen_data, bh->b_size); jh->b_frozen_data = NULL; } spin_lock(&journal->j_list_lock); cp_transaction = jh->b_cp_transaction; if (cp_transaction) { JBUFFER_TRACE(jh, "remove from old cp transaction"); __journal_remove_checkpoint(jh); } /* Only re-checkpoint the buffer_head if it is marked * dirty. If the buffer was added to the BJ_Forget list * by journal_forget, it may no longer be dirty and * there's no point in keeping a checkpoint record for * it. */ /* A buffer which has been freed while still being * journaled by a previous transaction may end up still * being dirty here, but we want to avoid writing back * that buffer in the future after the "add to orphan" * operation been committed, That's not only a performance * gain, it also stops aliasing problems if the buffer is * left behind for writeback and gets reallocated for another * use in a different page. */ if (buffer_freed(bh) && !jh->b_next_transaction) { clear_buffer_freed(bh); clear_buffer_jbddirty(bh); } if (buffer_jbddirty(bh)) { JBUFFER_TRACE(jh, "add to new checkpointing trans"); __journal_insert_checkpoint(jh, commit_transaction); if (is_journal_aborted(journal)) clear_buffer_jbddirty(bh); } else { J_ASSERT_BH(bh, !buffer_dirty(bh)); /* * The buffer on BJ_Forget list and not jbddirty means * it has been freed by this transaction and hence it * could not have been reallocated until this * transaction has committed. *BUT* it could be * reallocated once we have written all the data to * disk and before we process the buffer on BJ_Forget * list. */ if (!jh->b_next_transaction) try_to_free = 1; } JBUFFER_TRACE(jh, "refile or unfile freed buffer"); __journal_refile_buffer(jh); jbd_unlock_bh_state(bh); if (try_to_free) release_buffer_page(bh); else __brelse(bh); cond_resched_lock(&journal->j_list_lock); } spin_unlock(&journal->j_list_lock); /* * This is a bit sleazy. We use j_list_lock to protect transition * of a transaction into T_FINISHED state and calling * __journal_drop_transaction(). Otherwise we could race with * other checkpointing code processing the transaction... */ spin_lock(&journal->j_state_lock); spin_lock(&journal->j_list_lock); /* * Now recheck if some buffers did not get attached to the transaction * while the lock was dropped... */ if (commit_transaction->t_forget) { spin_unlock(&journal->j_list_lock); spin_unlock(&journal->j_state_lock); goto restart_loop; } /* Done with this transaction! */ jbd_debug(3, "JBD: commit phase 8\n"); J_ASSERT(commit_transaction->t_state == T_COMMIT_RECORD); commit_transaction->t_state = T_FINISHED; J_ASSERT(commit_transaction == journal->j_committing_transaction); journal->j_commit_sequence = commit_transaction->t_tid; journal->j_committing_transaction = NULL; commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); /* * weight the commit time higher than the average time so we don't * react too strongly to vast changes in commit time */ if (likely(journal->j_average_commit_time)) journal->j_average_commit_time = (commit_time*3 + journal->j_average_commit_time) / 4; else journal->j_average_commit_time = commit_time; spin_unlock(&journal->j_state_lock); if (commit_transaction->t_checkpoint_list == NULL && commit_transaction->t_checkpoint_io_list == NULL) { __journal_drop_transaction(journal, commit_transaction); } else { if (journal->j_checkpoint_transactions == NULL) { journal->j_checkpoint_transactions = commit_transaction; commit_transaction->t_cpnext = commit_transaction; commit_transaction->t_cpprev = commit_transaction; } else { commit_transaction->t_cpnext = journal->j_checkpoint_transactions; commit_transaction->t_cpprev = commit_transaction->t_cpnext->t_cpprev; commit_transaction->t_cpnext->t_cpprev = commit_transaction; commit_transaction->t_cpprev->t_cpnext = commit_transaction; } } spin_unlock(&journal->j_list_lock); trace_jbd_end_commit(journal, commit_transaction); jbd_debug(1, "JBD: commit %d complete, head %d\n", journal->j_commit_sequence, journal->j_tail_sequence); wake_up(&journal->j_wait_done_commit); }
/** * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets * @mapping: mapping to truncate * @lstart: offset from which to truncate * @lend: offset to which to truncate (inclusive) * * Truncate the page cache, removing the pages that are between * specified offsets (and zeroing out partial pages * if lstart or lend + 1 is not page aligned). * * Truncate takes two passes - the first pass is nonblocking. It will not * block on page locks and it will not block on writeback. The second pass * will wait. This is to prevent as much IO as possible in the affected region. * The first pass will remove most pages, so the search cost of the second pass * is low. * * We pass down the cache-hot hint to the page freeing code. Even if the * mapping is large, it is probably the case that the final pages are the most * recently touched, and freeing happens in ascending file offset order. * * Note that since ->invalidatepage() accepts range to invalidate * truncate_inode_pages_range is able to handle cases where lend + 1 is not * page aligned properly. */ void truncate_inode_pages_range(struct address_space *mapping, loff_t lstart, loff_t lend) { pgoff_t start; /* inclusive */ pgoff_t end; /* exclusive */ unsigned int partial_start; /* inclusive */ unsigned int partial_end; /* exclusive */ struct pagevec pvec; pgoff_t indices[PAGEVEC_SIZE]; pgoff_t index; int i; cleancache_invalidate_inode(mapping); if (mapping->nrpages == 0 && mapping->nrexceptional == 0) return; /* Offsets within partial pages */ partial_start = lstart & (PAGE_CACHE_SIZE - 1); partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1); /* * 'start' and 'end' always covers the range of pages to be fully * truncated. Partial pages are covered with 'partial_start' at the * start of the range and 'partial_end' at the end of the range. * Note that 'end' is exclusive while 'lend' is inclusive. */ start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; if (lend == -1) /* * lend == -1 indicates end-of-file so we have to set 'end' * to the highest possible pgoff_t and since the type is * unsigned we're using -1. */ end = -1; else end = (lend + 1) >> PAGE_CACHE_SHIFT; pagevec_init(&pvec, 0); index = start; while (index < end && pagevec_lookup_entries(&pvec, mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE), indices)) { for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; /* We rely upon deletion not changing page->index */ index = indices[i]; if (index >= end) break; if (radix_tree_exceptional_entry(page)) { clear_exceptional_entry(mapping, index, page); continue; } if (!trylock_page(page)) continue; WARN_ON(page->index != index); if (PageWriteback(page)) { unlock_page(page); continue; } truncate_inode_page(mapping, page); unlock_page(page); } pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); cond_resched(); index++; } if (partial_start) { struct page *page = find_lock_page(mapping, start - 1); if (page) { unsigned int top = PAGE_CACHE_SIZE; if (start > end) { /* Truncation within a single page */ top = partial_end; partial_end = 0; } wait_on_page_writeback(page); zero_user_segment(page, partial_start, top); cleancache_invalidate_page(mapping, page); if (page_has_private(page)) do_invalidatepage(page, partial_start, top - partial_start); unlock_page(page); page_cache_release(page); } } if (partial_end) { struct page *page = find_lock_page(mapping, end); if (page) { wait_on_page_writeback(page); zero_user_segment(page, 0, partial_end); cleancache_invalidate_page(mapping, page); if (page_has_private(page)) do_invalidatepage(page, 0, partial_end); unlock_page(page); page_cache_release(page); } } /* * If the truncation happened within a single page no pages * will be released, just zeroed, so we can bail out now. */ if (start >= end) return; index = start; for ( ; ; ) { cond_resched(); if (!pagevec_lookup_entries(&pvec, mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE), indices)) { /* If all gone from start onwards, we're done */ if (index == start) break; /* Otherwise restart to make sure all gone */ index = start; continue; } if (index == start && indices[0] >= end) { /* All gone out of hole to be punched, we're done */ pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); break; } for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; /* We rely upon deletion not changing page->index */ index = indices[i]; if (index >= end) { /* Restart punch to make sure all gone */ index = start - 1; break; } if (radix_tree_exceptional_entry(page)) { clear_exceptional_entry(mapping, index, page); continue; } lock_page(page); WARN_ON(page->index != index); wait_on_page_writeback(page); truncate_inode_page(mapping, page); unlock_page(page); } pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); index++; } cleancache_invalidate_inode(mapping); }
/** * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode * @mapping: the address_space which holds the pages to invalidate * @start: the offset 'from' which to invalidate * @end: the offset 'to' which to invalidate (inclusive) * * This function only removes the unlocked pages, if you want to * remove all the pages of one inode, you must call truncate_inode_pages. * * invalidate_mapping_pages() will not block on IO activity. It will not * invalidate pages which are dirty, locked, under writeback or mapped into * pagetables. */ unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end) { pgoff_t indices[PAGEVEC_SIZE]; struct pagevec pvec; pgoff_t index = start; unsigned long ret; unsigned long count = 0; int i; pagevec_init(&pvec, 0); while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, indices)) { for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; /* We rely upon deletion not changing page->index */ index = indices[i]; if (index > end) break; if (radix_tree_exceptional_entry(page)) { clear_exceptional_entry(mapping, index, page); continue; } if (!trylock_page(page)) continue; WARN_ON(page_to_index(page) != index); /* Middle of THP: skip */ if (PageTransTail(page)) { unlock_page(page); continue; } else if (PageTransHuge(page)) { index += HPAGE_PMD_NR - 1; i += HPAGE_PMD_NR - 1; /* 'end' is in the middle of THP */ if (index == round_down(end, HPAGE_PMD_NR)) continue; } ret = invalidate_inode_page(page); unlock_page(page); /* * Invalidation is a hint that the page is no longer * of interest and try to speed up its reclaim. */ if (!ret) deactivate_file_page(page); count += ret; } pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); cond_resched(); index++; } return count; }
/* * Synchronously write back the locked page and any subsequent non-locked dirty * pages. */ static int afs_write_back_from_locked_page(struct address_space *mapping, struct writeback_control *wbc, struct page *primary_page, pgoff_t final_page) { struct afs_vnode *vnode = AFS_FS_I(mapping->host); struct page *pages[8], *page; unsigned long count, priv; unsigned n, offset, to, f, t; pgoff_t start, first, last; int loop, ret; _enter(",%lx", primary_page->index); count = 1; if (test_set_page_writeback(primary_page)) BUG(); /* Find all consecutive lockable dirty pages that have contiguous * written regions, stopping when we find a page that is not * immediately lockable, is not dirty or is missing, or we reach the * end of the range. */ start = primary_page->index; priv = page_private(primary_page); offset = priv & AFS_PRIV_MAX; to = priv >> AFS_PRIV_SHIFT; trace_afs_page_dirty(vnode, tracepoint_string("store"), primary_page->index, priv); WARN_ON(offset == to); if (offset == to) trace_afs_page_dirty(vnode, tracepoint_string("WARN"), primary_page->index, priv); if (start >= final_page || (to < PAGE_SIZE && !test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags))) goto no_more; start++; do { _debug("more %lx [%lx]", start, count); n = final_page - start + 1; if (n > ARRAY_SIZE(pages)) n = ARRAY_SIZE(pages); n = find_get_pages_contig(mapping, start, ARRAY_SIZE(pages), pages); _debug("fgpc %u", n); if (n == 0) goto no_more; if (pages[0]->index != start) { do { put_page(pages[--n]); } while (n > 0); goto no_more; } for (loop = 0; loop < n; loop++) { page = pages[loop]; if (to != PAGE_SIZE && !test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags)) break; if (page->index > final_page) break; if (!trylock_page(page)) break; if (!PageDirty(page) || PageWriteback(page)) { unlock_page(page); break; } priv = page_private(page); f = priv & AFS_PRIV_MAX; t = priv >> AFS_PRIV_SHIFT; if (f != 0 && !test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags)) { unlock_page(page); break; } to = t; trace_afs_page_dirty(vnode, tracepoint_string("store+"), page->index, priv); if (!clear_page_dirty_for_io(page)) BUG(); if (test_set_page_writeback(page)) BUG(); unlock_page(page); put_page(page); } count += loop; if (loop < n) { for (; loop < n; loop++) put_page(pages[loop]); goto no_more; } start += loop; } while (start <= final_page && count < 65536); no_more: /* We now have a contiguous set of dirty pages, each with writeback * set; the first page is still locked at this point, but all the rest * have been unlocked. */ unlock_page(primary_page); first = primary_page->index; last = first + count - 1; _debug("write back %lx[%u..] to %lx[..%u]", first, offset, last, to); ret = afs_store_data(mapping, first, last, offset, to); switch (ret) { case 0: ret = count; break; default: pr_notice("kAFS: Unexpected error from FS.StoreData %d\n", ret); /* Fall through */ case -EACCES: case -EPERM: case -ENOKEY: case -EKEYEXPIRED: case -EKEYREJECTED: case -EKEYREVOKED: afs_redirty_pages(wbc, mapping, first, last); mapping_set_error(mapping, ret); break; case -EDQUOT: case -ENOSPC: afs_redirty_pages(wbc, mapping, first, last); mapping_set_error(mapping, -ENOSPC); break; case -EROFS: case -EIO: case -EREMOTEIO: case -EFBIG: case -ENOENT: case -ENOMEDIUM: case -ENXIO: afs_kill_pages(mapping, first, last); mapping_set_error(mapping, ret); break; } _leave(" = %d", ret); return ret; }
int __m4u_get_user_pages(int eModuleID, struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas) { int i; unsigned long vm_flags; int trycnt; if (nr_pages <= 0) return 0; //VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); if(!!pages != !!(gup_flags & FOLL_GET)) { M4UMSG(" error: __m4u_get_user_pages !!pages != !!(gup_flags & FOLL_GET), pages=0x%x, gup_flags & FOLL_GET=0x%x \n", (unsigned int)pages, gup_flags & FOLL_GET); } /* * Require read or write permissions. * If FOLL_FORCE is set, we only require the "MAY" flags. */ vm_flags = (gup_flags & FOLL_WRITE) ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); vm_flags &= (gup_flags & FOLL_FORCE) ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); i = 0; M4UDBG("Trying to get_user_pages from start vaddr 0x%08x with %d pages\n", start, nr_pages); do { struct vm_area_struct *vma; M4UDBG("For a new vma area from 0x%08x\n", start); vma = find_extend_vma(mm, start); if (!vma) { M4UMSG("error: the vma is not found, start=0x%x, module=%d \n", (unsigned int)start, eModuleID); return i ? i : -EFAULT; } if( ((~vma->vm_flags) & (VM_IO|VM_PFNMAP|VM_SHARED|VM_WRITE)) == 0 ) { M4UMSG("error: m4u_get_pages(): bypass pmem garbage pages! vma->vm_flags=0x%x, start=0x%x, module=%d \n", (unsigned int)(vma->vm_flags), (unsigned int)start, eModuleID); return i ? i : -EFAULT;; } if(vma->vm_flags & VM_IO) { M4UDBG("warning: vma is marked as VM_IO \n"); } if(vma->vm_flags & VM_PFNMAP) { M4UMSG("error: vma permission is not correct, vma->vm_flags=0x%x, start=0x%x, module=%d \n", (unsigned int)(vma->vm_flags), (unsigned int)start, eModuleID); M4UMSG("hint: maybe the memory is remapped with un-permitted vma->vm_flags! \n"); //m4u_dump_maps(start); return i ? i : -EFAULT;; } if(!(vm_flags & vma->vm_flags)) { M4UMSG("error: vm_flags invalid, vm_flags=0x%x, vma->vm_flags=0x%x, start=0x%x, module=%d \n", (unsigned int)vm_flags, (unsigned int)(vma->vm_flags), (unsigned int)start, eModuleID); //m4u_dump_maps(start); return i ? : -EFAULT; } do { struct page *page; unsigned int foll_flags = gup_flags; /* * If we have a pending SIGKILL, don't keep faulting * pages and potentially allocating memory. */ if (unlikely(fatal_signal_pending(current))) return i ? i : -ERESTARTSYS; MMProfileLogEx(M4U_MMP_Events[PROFILE_FOLLOW_PAGE], MMProfileFlagStart, eModuleID, start&(~0xFFF)); page = follow_page(vma, start, foll_flags); MMProfileLogEx(M4U_MMP_Events[PROFILE_FOLLOW_PAGE], MMProfileFlagEnd, eModuleID, 0x1000); while (!page) { int ret; M4UDBG("Trying to allocate for %dth page(vaddr: 0x%08x)\n", i, start); MMProfileLogEx(M4U_MMP_Events[PROFILE_FORCE_PAGING], MMProfileFlagStart, eModuleID, start&(~0xFFF)); ret = handle_mm_fault(mm, vma, start, (foll_flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0); MMProfileLogEx(M4U_MMP_Events[PROFILE_FORCE_PAGING], MMProfileFlagEnd, eModuleID, 0x1000); if (ret & VM_FAULT_ERROR) { if (ret & VM_FAULT_OOM) { M4UMSG("handle_mm_fault() error: no memory, aaddr:0x%08lx (%d pages are allocated), module=%d\n", start, i, eModuleID); //m4u_dump_maps(start); return i ? i : -ENOMEM; } if (ret & (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS)) { M4UMSG("handle_mm_fault() error: invalide memory address, vaddr:0x%lx (%d pages are allocated), module=%d\n", start, i, eModuleID); //m4u_dump_maps(start); return i ? i : -EFAULT; } BUG(); } if (ret & VM_FAULT_MAJOR) tsk->maj_flt++; else tsk->min_flt++; /* * The VM_FAULT_WRITE bit tells us that * do_wp_page has broken COW when necessary, * even if maybe_mkwrite decided not to set * pte_write. We can thus safely do subsequent * page lookups as if they were reads. But only * do so when looping for pte_write is futile: * in some cases userspace may also be wanting * to write to the gotten user page, which a * read fault here might prevent (a readonly * page might get reCOWed by userspace write). */ if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE)) foll_flags &= ~FOLL_WRITE; MMProfileLogEx(M4U_MMP_Events[PROFILE_FOLLOW_PAGE], MMProfileFlagStart, eModuleID, start&(~0xFFF)); page = follow_page(vma, start, foll_flags); MMProfileLogEx(M4U_MMP_Events[PROFILE_FOLLOW_PAGE], MMProfileFlagEnd, eModuleID, 0x1000); } if (IS_ERR(page)) { M4UMSG("handle_mm_fault() error: faulty page is returned, vaddr:0x%lx (%d pages are allocated), module=%d \n", start, i, eModuleID); //m4u_dump_maps(start); return i ? i : PTR_ERR(page); } if (pages) { pages[i] = page; MMProfileLogEx(M4U_MMP_Events[PROFILE_MLOCK], MMProfileFlagStart, eModuleID, start&(~0xFFF)); /* Use retry version to guarantee it will succeed in getting the lock */ trycnt = 3000; do { if (trylock_page(page)) { mlock_vma_page(page); unlock_page(page); //make sure hw pte is not 0 { int i; for(i=0; i<3000; i++) { if(!m4u_user_v2p(start)) { handle_mm_fault(mm, vma, start, (foll_flags & FOLL_WRITE)? FAULT_FLAG_WRITE : 0); cond_resched(); } else break; } if(i==3000) M4UMSG("error: cannot handle_mm_fault to get hw pte: va=0x%x\n", start); } break; } } while (trycnt-- > 0); if(PageMlocked(page)==0) { M4UMSG("Can't mlock page\n"); dump_page(page); } else { unsigned int pfn = page_to_pfn(page); if(pfn < mlock_cnt_size) { pMlock_cnt[page_to_pfn(page)]++; } else { M4UERR("mlock_cnt_size is too small: pfn=%d, size=%d\n", pfn, mlock_cnt_size); } //M4UMSG("lock page:\n"); //dump_page(page); } MMProfileLogEx(M4U_MMP_Events[PROFILE_MLOCK], MMProfileFlagEnd, eModuleID, 0x1000); } if (vmas) vmas[i] = vma; i++; start += PAGE_SIZE; nr_pages--; } while (nr_pages && start < vma->vm_end); } while (nr_pages);
/* * journal_commit_transaction * * The primary function for committing a transaction to the log. This * function is called by the journal thread to begin a complete commit. */ void journal_commit_transaction(journal_t *journal) { transaction_t *commit_transaction; struct journal_head *jh, *new_jh, *descriptor; struct buffer_head **wbuf = journal->j_wbuf; int bufs; int flags; int err; unsigned int blocknr; ktime_t start_time; u64 commit_time; char *tagp = NULL; journal_header_t *header; journal_block_tag_t *tag = NULL; int space_left = 0; int first_tag = 0; int tag_flag; int i; struct blk_plug plug; /* * First job: lock down the current transaction and wait for * all outstanding updates to complete. */ /* Do we need to erase the effects of a prior journal_flush? */ if (journal->j_flags & JFS_FLUSHED) { jbd_debug(3, "super block updated\n"); journal_update_superblock(journal, 1); } else { jbd_debug(3, "superblock not updated\n"); } J_ASSERT(journal->j_running_transaction != NULL); J_ASSERT(journal->j_committing_transaction == NULL); commit_transaction = journal->j_running_transaction; J_ASSERT(commit_transaction->t_state == T_RUNNING); trace_jbd_start_commit(journal, commit_transaction); jbd_debug(1, "JBD: starting commit of transaction %d\n", commit_transaction->t_tid); spin_lock(&journal->j_state_lock); commit_transaction->t_state = T_LOCKED; trace_jbd_commit_locking(journal, commit_transaction); spin_lock(&commit_transaction->t_handle_lock); while (commit_transaction->t_updates) { DEFINE_WAIT(wait); prepare_to_wait(&journal->j_wait_updates, &wait, TASK_UNINTERRUPTIBLE); if (commit_transaction->t_updates) { spin_unlock(&commit_transaction->t_handle_lock); spin_unlock(&journal->j_state_lock); schedule(); spin_lock(&journal->j_state_lock); spin_lock(&commit_transaction->t_handle_lock); } finish_wait(&journal->j_wait_updates, &wait); } spin_unlock(&commit_transaction->t_handle_lock); J_ASSERT (commit_transaction->t_outstanding_credits <= journal->j_max_transaction_buffers); /* * First thing we are allowed to do is to discard any remaining * BJ_Reserved buffers. Note, it is _not_ permissible to assume * that there are no such buffers: if a large filesystem * operation like a truncate needs to split itself over multiple * transactions, then it may try to do a journal_restart() while * there are still BJ_Reserved buffers outstanding. These must * be released cleanly from the current transaction. * * In this case, the filesystem must still reserve write access * again before modifying the buffer in the new transaction, but * we do not require it to remember exactly which old buffers it * has reserved. This is consistent with the existing behaviour * that multiple journal_get_write_access() calls to the same * buffer are perfectly permissible. */ while (commit_transaction->t_reserved_list) { jh = commit_transaction->t_reserved_list; JBUFFER_TRACE(jh, "reserved, unused: refile"); /* * A journal_get_undo_access()+journal_release_buffer() may * leave undo-committed data. */ if (jh->b_committed_data) { struct buffer_head *bh = jh2bh(jh); jbd_lock_bh_state(bh); jbd_free(jh->b_committed_data, bh->b_size); jh->b_committed_data = NULL; jbd_unlock_bh_state(bh); } journal_refile_buffer(journal, jh); } /* * Now try to drop any written-back buffers from the journal's * checkpoint lists. We do this *before* commit because it potentially * frees some memory */ spin_lock(&journal->j_list_lock); __journal_clean_checkpoint_list(journal); spin_unlock(&journal->j_list_lock); jbd_debug (3, "JBD: commit phase 1\n"); /* * Switch to a new revoke table. */ journal_switch_revoke_table(journal); trace_jbd_commit_flushing(journal, commit_transaction); commit_transaction->t_state = T_FLUSH; journal->j_committing_transaction = commit_transaction; journal->j_running_transaction = NULL; start_time = ktime_get(); commit_transaction->t_log_start = journal->j_head; wake_up(&journal->j_wait_transaction_locked); spin_unlock(&journal->j_state_lock); jbd_debug (3, "JBD: commit phase 2\n"); /* * Now start flushing things to disk, in the order they appear * on the transaction lists. Data blocks go first. */ blk_start_plug(&plug); err = journal_submit_data_buffers(journal, commit_transaction, WRITE_SYNC); blk_finish_plug(&plug); /* * Wait for all previously submitted IO to complete. */ spin_lock(&journal->j_list_lock); while (commit_transaction->t_locked_list) { struct buffer_head *bh; jh = commit_transaction->t_locked_list->b_tprev; bh = jh2bh(jh); get_bh(bh); if (buffer_locked(bh)) { spin_unlock(&journal->j_list_lock); wait_on_buffer(bh); spin_lock(&journal->j_list_lock); } if (unlikely(!buffer_uptodate(bh))) { if (!trylock_page(bh->b_page)) { spin_unlock(&journal->j_list_lock); lock_page(bh->b_page); spin_lock(&journal->j_list_lock); } if (bh->b_page->mapping) set_bit(AS_EIO, &bh->b_page->mapping->flags); unlock_page(bh->b_page); SetPageError(bh->b_page); err = -EIO; } if (!inverted_lock(journal, bh)) { put_bh(bh); spin_lock(&journal->j_list_lock); continue; } if (buffer_jbd(bh) && bh2jh(bh) == jh && jh->b_transaction == commit_transaction && jh->b_jlist == BJ_Locked) __journal_unfile_buffer(jh); jbd_unlock_bh_state(bh); release_data_buffer(bh); cond_resched_lock(&journal->j_list_lock); } spin_unlock(&journal->j_list_lock); if (err) { char b[BDEVNAME_SIZE]; printk(KERN_WARNING "JBD: Detected IO errors while flushing file data " "on %s\n", bdevname(journal->j_fs_dev, b)); if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR) journal_abort(journal, err); err = 0; } blk_start_plug(&plug); journal_write_revoke_records(journal, commit_transaction, WRITE_SYNC); /* * If we found any dirty or locked buffers, then we should have * looped back up to the write_out_data label. If there weren't * any then journal_clean_data_list should have wiped the list * clean by now, so check that it is in fact empty. */ J_ASSERT (commit_transaction->t_sync_datalist == NULL); jbd_debug (3, "JBD: commit phase 3\n"); /* * Way to go: we have now written out all of the data for a * transaction! Now comes the tricky part: we need to write out * metadata. Loop over the transaction's entire buffer list: */ spin_lock(&journal->j_state_lock); commit_transaction->t_state = T_COMMIT; spin_unlock(&journal->j_state_lock); trace_jbd_commit_logging(journal, commit_transaction); J_ASSERT(commit_transaction->t_nr_buffers <= commit_transaction->t_outstanding_credits); descriptor = NULL; bufs = 0; while (commit_transaction->t_buffers) { /* Find the next buffer to be journaled... */ jh = commit_transaction->t_buffers; /* If we're in abort mode, we just un-journal the buffer and release it. */ if (is_journal_aborted(journal)) { clear_buffer_jbddirty(jh2bh(jh)); JBUFFER_TRACE(jh, "journal is aborting: refile"); journal_refile_buffer(journal, jh); /* If that was the last one, we need to clean up * any descriptor buffers which may have been * already allocated, even if we are now * aborting. */ if (!commit_transaction->t_buffers) goto start_journal_io; continue; } /* Make sure we have a descriptor block in which to record the metadata buffer. */ if (!descriptor) { struct buffer_head *bh; J_ASSERT (bufs == 0); jbd_debug(4, "JBD: get descriptor\n"); descriptor = journal_get_descriptor_buffer(journal); if (!descriptor) { journal_abort(journal, -EIO); continue; } bh = jh2bh(descriptor); jbd_debug(4, "JBD: got buffer %llu (%p)\n", (unsigned long long)bh->b_blocknr, bh->b_data); header = (journal_header_t *)&bh->b_data[0]; header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK); header->h_sequence = cpu_to_be32(commit_transaction->t_tid); tagp = &bh->b_data[sizeof(journal_header_t)]; space_left = bh->b_size - sizeof(journal_header_t); first_tag = 1; set_buffer_jwrite(bh); set_buffer_dirty(bh); wbuf[bufs++] = bh; /* Record it so that we can wait for IO completion later */ BUFFER_TRACE(bh, "ph3: file as descriptor"); journal_file_buffer(descriptor, commit_transaction, BJ_LogCtl); } /* Where is the buffer to be written? */ err = journal_next_log_block(journal, &blocknr); /* If the block mapping failed, just abandon the buffer and repeat this loop: we'll fall into the refile-on-abort condition above. */ if (err) { journal_abort(journal, err); continue; } /*
static struct page *follow_page_pte(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, unsigned int flags) { struct mm_struct *mm = vma->vm_mm; struct page *page; spinlock_t *ptl; pte_t *ptep, pte; retry: if (unlikely(pmd_bad(*pmd))) return no_page_table(vma, flags); ptep = pte_offset_map_lock(mm, pmd, address, &ptl); pte = *ptep; if (!pte_present(pte)) { swp_entry_t entry; /* * KSM's break_ksm() relies upon recognizing a ksm page * even while it is being migrated, so for that case we * need migration_entry_wait(). */ if (likely(!(flags & FOLL_MIGRATION))) goto no_page; if (pte_none(pte) || pte_file(pte)) goto no_page; entry = pte_to_swp_entry(pte); if (!is_migration_entry(entry)) goto no_page; pte_unmap_unlock(ptep, ptl); migration_entry_wait(mm, pmd, address); goto retry; } if ((flags & FOLL_NUMA) && pte_numa(pte)) goto no_page; if ((flags & FOLL_WRITE) && !pte_write(pte)) { pte_unmap_unlock(ptep, ptl); return NULL; } page = vm_normal_page(vma, address, pte); if (unlikely(!page)) { if ((flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(pte))) goto bad_page; page = pte_page(pte); } if (flags & FOLL_GET) get_page_foll(page); if (flags & FOLL_TOUCH) { if ((flags & FOLL_WRITE) && !pte_dirty(pte) && !PageDirty(page)) set_page_dirty(page); /* * pte_mkyoung() would be more correct here, but atomic care * is needed to avoid losing the dirty bit: it is easier to use * mark_page_accessed(). */ mark_page_accessed(page); } if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { /* * The preliminary mapping check is mainly to avoid the * pointless overhead of lock_page on the ZERO_PAGE * which might bounce very badly if there is contention. * * If the page is already locked, we don't need to * handle it now - vmscan will handle it later if and * when it attempts to reclaim the page. */ if (page->mapping && trylock_page(page)) { lru_add_drain(); /* push cached pages to LRU */ /* * Because we lock page here, and migration is * blocked by the pte's page reference, and we * know the page is still mapped, we don't even * need to check for file-cache page truncation. */ mlock_vma_page(page); unlock_page(page); } } pte_unmap_unlock(ptep, ptl); return page; bad_page: pte_unmap_unlock(ptep, ptl); return ERR_PTR(-EFAULT); no_page: pte_unmap_unlock(ptep, ptl); if (!pte_none(pte)) return NULL; return no_page_table(vma, flags); }