static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) { transaction_t *transaction; struct journal_head *jh; int may_free = 1; int ret; BUFFER_TRACE(bh, "entry"); if (!buffer_jbd(bh)) goto zap_buffer_unlocked; write_lock(&journal->j_state_lock); jbd_lock_bh_state(bh); spin_lock(&journal->j_list_lock); jh = jbd2_journal_grab_journal_head(bh); if (!jh) goto zap_buffer_no_jh; transaction = jh->b_transaction; if (transaction == NULL) { if (!jh->b_cp_transaction) { JBUFFER_TRACE(jh, "not on any transaction: zap"); goto zap_buffer; } if (!buffer_dirty(bh)) { goto zap_buffer; } if (journal->j_running_transaction) { JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget"); ret = __dispose_buffer(jh, journal->j_running_transaction); jbd2_journal_put_journal_head(jh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); write_unlock(&journal->j_state_lock); return ret; } else { if (journal->j_committing_transaction) { JBUFFER_TRACE(jh, "give to committing trans"); ret = __dispose_buffer(jh, journal->j_committing_transaction); jbd2_journal_put_journal_head(jh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); write_unlock(&journal->j_state_lock); return ret; } else { clear_buffer_jbddirty(bh); goto zap_buffer; } } } else if (transaction == journal->j_committing_transaction) { JBUFFER_TRACE(jh, "on committing transaction"); set_buffer_freed(bh); if (journal->j_running_transaction && buffer_jbddirty(bh)) jh->b_next_transaction = journal->j_running_transaction; jbd2_journal_put_journal_head(jh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); write_unlock(&journal->j_state_lock); return 0; } else { J_ASSERT_JH(jh, transaction == journal->j_running_transaction); JBUFFER_TRACE(jh, "on running transaction"); may_free = __dispose_buffer(jh, transaction); } zap_buffer: jbd2_journal_put_journal_head(jh); zap_buffer_no_jh: spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); write_unlock(&journal->j_state_lock); zap_buffer_unlocked: clear_buffer_dirty(bh); J_ASSERT_BH(bh, !buffer_jbddirty(bh)); clear_buffer_mapped(bh); clear_buffer_req(bh); clear_buffer_new(bh); clear_buffer_delay(bh); clear_buffer_unwritten(bh); bh->b_bdev = NULL; return may_free; }
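/*
 * Illustration (not part of the jbd2 sources above): the interesting part of
 * journal_unmap_buffer() is the decision ladder - which transaction, if any,
 * the buffer belongs to decides whether it can be zapped immediately, must be
 * filed on a BJ_Forget list, or must be deferred. A minimal userspace model of
 * that ladder follows; struct jh_state, enum zap_action and decide_zap() are
 * hypothetical names invented for this sketch, not jbd2 API.
 */
#include <stdbool.h>
#include <stdio.h>

/* Condensed view of the state journal_unmap_buffer() inspects. */
struct jh_state {
	bool on_running;      /* b_transaction == j_running_transaction    */
	bool on_committing;   /* b_transaction == j_committing_transaction */
	bool checkpointed;    /* b_cp_transaction != NULL                  */
	bool dirty;           /* buffer_dirty(bh)                          */
	bool have_running;    /* journal->j_running_transaction != NULL    */
	bool have_committing; /* journal->j_committing_transaction != NULL */
};

enum zap_action { ZAP_NOW, FORGET_IN_RUNNING, FORGET_IN_COMMITTING, DEFER_FREED };

static enum zap_action decide_zap(const struct jh_state *s)
{
	if (s->on_committing)            /* data may still be journaled: defer */
		return DEFER_FREED;
	if (s->on_running)               /* dispose from the running transaction */
		return FORGET_IN_RUNNING;
	/* Not on any transaction. */
	if (!s->checkpointed || !s->dirty)
		return ZAP_NOW;
	if (s->have_running)
		return FORGET_IN_RUNNING;
	if (s->have_committing)
		return FORGET_IN_COMMITTING;
	return ZAP_NOW;
}

int main(void)
{
	struct jh_state s = { .checkpointed = true, .dirty = true, .have_running = true };
	printf("action = %d\n", decide_zap(&s));
	return 0;
}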
/** * write_mft_record_nolock - write out a mapped (extent) mft record * @ni: ntfs inode describing the mapped (extent) mft record * @m: mapped (extent) mft record to write * @sync: if true, wait for i/o completion * * Write the mapped (extent) mft record @m described by the (regular or extent) * ntfs inode @ni to backing store. If the mft record @m has a counterpart in * the mft mirror, that is also updated. * * On success, clean the mft record and return 0. On error, leave the mft * record dirty and return -errno. The caller should call make_bad_inode() on * the base inode to ensure no more access happens to this inode. We do not do * it here as the caller may want to finish writing other extent mft records * first to minimize on-disk metadata inconsistencies. * * NOTE: We always perform synchronous i/o and ignore the @sync parameter. * However, if the mft record has a counterpart in the mft mirror and @sync is * true, we write the mft record, wait for i/o completion, and only then write * the mft mirror copy. This ensures that if the system crashes either the mft * or the mft mirror will contain a self-consistent mft record @m. If @sync is * false on the other hand, we start i/o on both and then wait for completion * on them. This provides a speedup but no longer guarantees that you will end * up with a self-consistent mft record in the case of a crash but if you asked * for asynchronous writing you probably do not care about that anyway. * * TODO: If @sync is false, want to do truly asynchronous i/o, i.e. just * schedule i/o via ->writepage or do it via kntfsd or whatever. */ int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync) { ntfs_volume *vol = ni->vol; struct page *page = ni->page; unsigned int blocksize = vol->sb->s_blocksize; int max_bhs = vol->mft_record_size / blocksize; struct buffer_head *bhs[max_bhs]; struct buffer_head *bh, *head; unsigned int block_start, block_end, m_start, m_end; int i_bhs, nr_bhs, err = 0; ntfs_debug("Entering for inode 0x%lx.", ni->mft_no); BUG_ON(NInoAttr(ni)); BUG_ON(!max_bhs); BUG_ON(!PageLocked(page)); /* * If the ntfs_inode is clean no need to do anything. If it is dirty, * mark it as clean now so that it can be redirtied later on if needed. * There is no danger of races since the caller is holding the locks * for the mft record @m and the page it is in. */ if (!NInoTestClearDirty(ni)) goto done; /* Make sure we have mapped buffers. */ if (!page_has_buffers(page)) { no_buffers_err_out: ntfs_error(vol->sb, "Writing mft records without existing " "buffers is not implemented yet. %s", ntfs_please_email); err = -EOPNOTSUPP; goto err_out; } bh = head = page_buffers(page); if (!bh) goto no_buffers_err_out; nr_bhs = 0; block_start = 0; m_start = ni->page_ofs; m_end = m_start + vol->mft_record_size; do { block_end = block_start + blocksize; /* * If the buffer is outside the mft record, just skip it, * clearing it if it is dirty to make sure it is not written * out. It should never be marked dirty but better be safe. */ if ((block_end <= m_start) || (block_start >= m_end)) { if (buffer_dirty(bh)) { ntfs_warning(vol->sb, "Clearing dirty mft " "record page buffer. %s", ntfs_please_email); clear_buffer_dirty(bh); } continue; } if (!buffer_mapped(bh)) { ntfs_error(vol->sb, "Writing mft records without " "existing mapped buffers is not " "implemented yet. %s", ntfs_please_email); err = -EOPNOTSUPP; continue; } if (!buffer_uptodate(bh)) { ntfs_error(vol->sb, "Writing mft records without " "existing uptodate buffers is not " "implemented yet. 
%s", ntfs_please_email); err = -EOPNOTSUPP; continue; } BUG_ON(!nr_bhs && (m_start != block_start)); BUG_ON(nr_bhs >= max_bhs); bhs[nr_bhs++] = bh; BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end)); } while (block_start = block_end, (bh = bh->b_this_page) != head); if (unlikely(err)) goto cleanup_out; /* Apply the mst protection fixups. */ err = pre_write_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size); if (err) { ntfs_error(vol->sb, "Failed to apply mst fixups!"); goto cleanup_out; } flush_dcache_mft_record_page(ni); /* Lock buffers and start synchronous write i/o on them. */ for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { struct buffer_head *tbh = bhs[i_bhs]; if (unlikely(test_set_buffer_locked(tbh))) BUG(); BUG_ON(!buffer_uptodate(tbh)); if (buffer_dirty(tbh)) clear_buffer_dirty(tbh); get_bh(tbh); tbh->b_end_io = end_buffer_write_sync; submit_bh(WRITE, tbh); } /* Synchronize the mft mirror now if not @sync. */ if (!sync && ni->mft_no < vol->mftmirr_size) sync_mft_mirror(ni, m, sync); /* Wait on i/o completion of buffers. */ for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { struct buffer_head *tbh = bhs[i_bhs]; wait_on_buffer(tbh); if (unlikely(!buffer_uptodate(tbh))) { err = -EIO; /* * Set the buffer uptodate so the page & buffer states * don't become out of sync. */ if (PageUptodate(page)) set_buffer_uptodate(tbh); } } /* If @sync, now synchronize the mft mirror. */ if (sync && ni->mft_no < vol->mftmirr_size) sync_mft_mirror(ni, m, sync); /* Remove the mst protection fixups again. */ post_write_mst_fixup((NTFS_RECORD*)m); flush_dcache_mft_record_page(ni); if (unlikely(err)) { /* I/O error during writing. This is really bad! */ ntfs_error(vol->sb, "I/O error while writing mft record " "0x%lx! Marking base inode as bad. You " "should unmount the volume and run chkdsk.", ni->mft_no); goto err_out; } done: ntfs_debug("Done."); return 0; cleanup_out: /* Clean the buffers. */ for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) clear_buffer_dirty(bhs[i_bhs]); err_out: /* * Current state: all buffers are clean, unlocked, and uptodate. * The caller should mark the base inode as bad so that no more i/o * happens. ->clear_inode() will still be invoked so all extent inodes * and other allocated memory will be freed. */ if (err == -ENOMEM) { ntfs_error(vol->sb, "Not enough memory to write mft record. " "Redirtying so the write is retried later."); mark_mft_record_dirty(ni); err = 0; } return err; }
void cmd_show (char *cp) { char c, curbf, dirty, *p; const char *fname, *name; int l, rf; Buffer *buffer; uLong lines; if (*cp == 0) goto usage; do { p = uptospace (cp); /* Show buffer info */ if (strncasecmp ("buffers", cp, p - cp) == 0) { outfmt (0, "\nBuffers:\n"); l = 0; for (buffer = NULL; (buffer = buffer_next (buffer)) != NULL;) { name = buffer_name (buffer); if (strlen (name) > l) l = strlen (name); } for (buffer = NULL; (buffer = buffer_next (buffer)) != NULL;) { name = buffer_name (buffer); fname = buffer_filename (buffer); lines = buffer_linecount (buffer); rf = (buffer_getreadfile (buffer) != NULL); curbf = (buffer == cur_position.buffer) ? '>' : ' '; dirty = buffer_dirty (buffer, -1) ? '*' : ' '; outfmt (strlen (name) + 16, " %c %c %*.*s: %5u%c line%c", curbf, dirty, l, l, name, lines, rf ? '+' : ' ', (lines == 1) ? ' ' : 's'); if (fname != NULL) outfmt (strlen (fname), " => %s", fname); if (buffer == main_buffer) outstr (" (main buffer)"); outchr ('\n'); } continue; } /* Show info about files */ if (strncasecmp ("files", cp, p - cp) == 0) { outfmt (0, "\nFiles:\n"); outfmt (strlen (help_name), " Help: %s\n", help_name); outfmt (strlen (journal_name), " Journal: %s\n", journal_name[0] == 0 ? "<none>" : journal_name); continue; } /* Show keypad definitions */ if (strncasecmp ("keypad", cp, p - cp) == 0) { show_keypad (); continue; } goto usage; } while (*(cp = skipspaces (p)) != 0); return; usage: outerr (0, "specify BUFFERS, FILES, KEYPAD\n"); }
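/*
 * Illustration (not from the editor sources above): cmd_show() dispatches on
 * whatever prefix of a keyword the user typed, comparing with strncasecmp over
 * the token length. A minimal standalone sketch of that prefix-matching idiom;
 * match_keyword() and the keyword strings here are illustrative only.
 */
#include <stdio.h>
#include <string.h>
#include <strings.h>

/* Return 1 if 'typed' (of length 'len') is a case-insensitive prefix of
 * 'keyword' - the same test cmd_show() applies to each token. */
static int match_keyword(const char *keyword, const char *typed, size_t len)
{
	return len > 0 && strncasecmp(keyword, typed, len) == 0;
}

int main(void)
{
	const char *typed = "buf";
	size_t len = strlen(typed);

	if (match_keyword("buffers", typed, len))
		puts("show buffers");
	else if (match_keyword("files", typed, len))
		puts("show files");
	else if (match_keyword("keypad", typed, len))
		puts("show keypad");
	else
		puts("specify BUFFERS, FILES, KEYPAD");
	return 0;
}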
/* * journal_commit_transaction * * The primary function for committing a transaction to the log. This * function is called by the journal thread to begin a complete commit. */ void journal_commit_transaction(journal_t *journal) { transaction_t *commit_transaction; struct journal_head *jh, *new_jh, *descriptor; struct buffer_head **wbuf = journal->j_wbuf; int bufs; int flags; int err; unsigned int blocknr; ktime_t start_time; u64 commit_time; char *tagp = NULL; journal_header_t *header; journal_block_tag_t *tag = NULL; int space_left = 0; int first_tag = 0; int tag_flag; int i; int write_op = WRITE; /* * First job: lock down the current transaction and wait for * all outstanding updates to complete. */ #ifdef COMMIT_STATS spin_lock(&journal->j_list_lock); summarise_journal_usage(journal); spin_unlock(&journal->j_list_lock); #endif /* Do we need to erase the effects of a prior journal_flush? */ if (journal->j_flags & JFS_FLUSHED) { jbd_debug(3, "super block updated\n"); journal_update_superblock(journal, 1); } else { jbd_debug(3, "superblock not updated\n"); } J_ASSERT(journal->j_running_transaction != NULL); J_ASSERT(journal->j_committing_transaction == NULL); commit_transaction = journal->j_running_transaction; J_ASSERT(commit_transaction->t_state == T_RUNNING); jbd_debug(1, "JBD: starting commit of transaction %d\n", commit_transaction->t_tid); spin_lock(&journal->j_state_lock); commit_transaction->t_state = T_LOCKED; if (commit_transaction->t_synchronous_commit) write_op = WRITE_SYNC; spin_lock(&commit_transaction->t_handle_lock); while (commit_transaction->t_updates) { DEFINE_WAIT(wait); prepare_to_wait(&journal->j_wait_updates, &wait, TASK_UNINTERRUPTIBLE); if (commit_transaction->t_updates) { spin_unlock(&commit_transaction->t_handle_lock); spin_unlock(&journal->j_state_lock); schedule(); spin_lock(&journal->j_state_lock); spin_lock(&commit_transaction->t_handle_lock); } finish_wait(&journal->j_wait_updates, &wait); } spin_unlock(&commit_transaction->t_handle_lock); J_ASSERT (commit_transaction->t_outstanding_credits <= journal->j_max_transaction_buffers); /* * First thing we are allowed to do is to discard any remaining * BJ_Reserved buffers. Note, it is _not_ permissible to assume * that there are no such buffers: if a large filesystem * operation like a truncate needs to split itself over multiple * transactions, then it may try to do a journal_restart() while * there are still BJ_Reserved buffers outstanding. These must * be released cleanly from the current transaction. * * In this case, the filesystem must still reserve write access * again before modifying the buffer in the new transaction, but * we do not require it to remember exactly which old buffers it * has reserved. This is consistent with the existing behaviour * that multiple journal_get_write_access() calls to the same * buffer are perfectly permissable. */ while (commit_transaction->t_reserved_list) { jh = commit_transaction->t_reserved_list; JBUFFER_TRACE(jh, "reserved, unused: refile"); /* * A journal_get_undo_access()+journal_release_buffer() may * leave undo-committed data. */ if (jh->b_committed_data) { struct buffer_head *bh = jh2bh(jh); jbd_lock_bh_state(bh); jbd_free(jh->b_committed_data, bh->b_size); jh->b_committed_data = NULL; jbd_unlock_bh_state(bh); } journal_refile_buffer(journal, jh); } /* * Now try to drop any written-back buffers from the journal's * checkpoint lists. 
We do this *before* commit because it potentially * frees some memory */ spin_lock(&journal->j_list_lock); __journal_clean_checkpoint_list(journal); spin_unlock(&journal->j_list_lock); jbd_debug (3, "JBD: commit phase 1\n"); /* * Switch to a new revoke table. */ journal_switch_revoke_table(journal); commit_transaction->t_state = T_FLUSH; journal->j_committing_transaction = commit_transaction; journal->j_running_transaction = NULL; start_time = ktime_get(); commit_transaction->t_log_start = journal->j_head; wake_up(&journal->j_wait_transaction_locked); spin_unlock(&journal->j_state_lock); jbd_debug (3, "JBD: commit phase 2\n"); /* * Now start flushing things to disk, in the order they appear * on the transaction lists. Data blocks go first. */ err = journal_submit_data_buffers(journal, commit_transaction, write_op); /* * Wait for all previously submitted IO to complete. */ spin_lock(&journal->j_list_lock); while (commit_transaction->t_locked_list) { struct buffer_head *bh; jh = commit_transaction->t_locked_list->b_tprev; bh = jh2bh(jh); get_bh(bh); if (buffer_locked(bh)) { spin_unlock(&journal->j_list_lock); wait_on_buffer(bh); spin_lock(&journal->j_list_lock); } if (unlikely(!buffer_uptodate(bh))) { if (!trylock_page(bh->b_page)) { spin_unlock(&journal->j_list_lock); lock_page(bh->b_page); spin_lock(&journal->j_list_lock); } if (bh->b_page->mapping) set_bit(AS_EIO, &bh->b_page->mapping->flags); unlock_page(bh->b_page); SetPageError(bh->b_page); err = -EIO; } if (!inverted_lock(journal, bh)) { put_bh(bh); spin_lock(&journal->j_list_lock); continue; } if (buffer_jbd(bh) && bh2jh(bh) == jh && jh->b_transaction == commit_transaction && jh->b_jlist == BJ_Locked) __journal_unfile_buffer(jh); jbd_unlock_bh_state(bh); release_data_buffer(bh); cond_resched_lock(&journal->j_list_lock); } spin_unlock(&journal->j_list_lock); if (err) { char b[BDEVNAME_SIZE]; printk(KERN_WARNING "JBD: Detected IO errors while flushing file data " "on %s\n", bdevname(journal->j_fs_dev, b)); if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR) journal_abort(journal, err); err = 0; } journal_write_revoke_records(journal, commit_transaction); /* * If we found any dirty or locked buffers, then we should have * looped back up to the write_out_data label. If there weren't * any then journal_clean_data_list should have wiped the list * clean by now, so check that it is in fact empty. */ J_ASSERT (commit_transaction->t_sync_datalist == NULL); jbd_debug (3, "JBD: commit phase 3\n"); /* * Way to go: we have now written out all of the data for a * transaction! Now comes the tricky part: we need to write out * metadata. Loop over the transaction's entire buffer list: */ spin_lock(&journal->j_state_lock); commit_transaction->t_state = T_COMMIT; spin_unlock(&journal->j_state_lock); J_ASSERT(commit_transaction->t_nr_buffers <= commit_transaction->t_outstanding_credits); descriptor = NULL; bufs = 0; while (commit_transaction->t_buffers) { /* Find the next buffer to be journaled... */ jh = commit_transaction->t_buffers; /* If we're in abort mode, we just un-journal the buffer and release it. */ if (is_journal_aborted(journal)) { clear_buffer_jbddirty(jh2bh(jh)); JBUFFER_TRACE(jh, "journal is aborting: refile"); journal_refile_buffer(journal, jh); /* If that was the last one, we need to clean up * any descriptor buffers which may have been * already allocated, even if we are now * aborting. 
*/ if (!commit_transaction->t_buffers) goto start_journal_io; continue; } /* Make sure we have a descriptor block in which to record the metadata buffer. */ if (!descriptor) { struct buffer_head *bh; J_ASSERT (bufs == 0); jbd_debug(4, "JBD: get descriptor\n"); descriptor = journal_get_descriptor_buffer(journal); if (!descriptor) { journal_abort(journal, -EIO); continue; } bh = jh2bh(descriptor); jbd_debug(4, "JBD: got buffer %llu (%p)\n", (unsigned long long)bh->b_blocknr, bh->b_data); header = (journal_header_t *)&bh->b_data[0]; header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK); header->h_sequence = cpu_to_be32(commit_transaction->t_tid); tagp = &bh->b_data[sizeof(journal_header_t)]; space_left = bh->b_size - sizeof(journal_header_t); first_tag = 1; set_buffer_jwrite(bh); set_buffer_dirty(bh); wbuf[bufs++] = bh; /* Record it so that we can wait for IO completion later */ BUFFER_TRACE(bh, "ph3: file as descriptor"); journal_file_buffer(descriptor, commit_transaction, BJ_LogCtl); } /* Where is the buffer to be written? */ err = journal_next_log_block(journal, &blocknr); /* If the block mapping failed, just abandon the buffer and repeat this loop: we'll fall into the refile-on-abort condition above. */ if (err) { journal_abort(journal, err); continue; } /* * start_this_handle() uses t_outstanding_credits to determine * the free space in the log, but this counter is changed * by journal_next_log_block() also. */ commit_transaction->t_outstanding_credits--; /* Bump b_count to prevent truncate from stumbling over the shadowed buffer! @@@ This can go if we ever get rid of the BJ_IO/BJ_Shadow pairing of buffers. */ get_bh(jh2bh(jh)); /* Make a temporary IO buffer with which to write it out (this will requeue both the metadata buffer and the temporary IO buffer). new_bh goes on BJ_IO*/ set_bit(BH_JWrite, &jh2bh(jh)->b_state); /* * akpm: journal_write_metadata_buffer() sets * new_bh->b_transaction to commit_transaction. * We need to clean this up before we release new_bh * (which is of type BJ_IO) */ JBUFFER_TRACE(jh, "ph3: write metadata"); flags = journal_write_metadata_buffer(commit_transaction, jh, &new_jh, blocknr); set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); wbuf[bufs++] = jh2bh(new_jh); /* Record the new block's tag in the current descriptor buffer */ tag_flag = 0; if (flags & 1) tag_flag |= JFS_FLAG_ESCAPE; if (!first_tag) tag_flag |= JFS_FLAG_SAME_UUID; tag = (journal_block_tag_t *) tagp; tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr); tag->t_flags = cpu_to_be32(tag_flag); tagp += sizeof(journal_block_tag_t); space_left -= sizeof(journal_block_tag_t); if (first_tag) { memcpy (tagp, journal->j_uuid, 16); tagp += 16; space_left -= 16; first_tag = 0; } /* If there's no more to do, or if the descriptor is full, let the IO rip! */ if (bufs == journal->j_wbufsize || commit_transaction->t_buffers == NULL || space_left < sizeof(journal_block_tag_t) + 16) { jbd_debug(4, "JBD: Submit %d IOs\n", bufs); /* Write an end-of-descriptor marker before submitting the IOs. "tag" still points to the last tag we set up. */ tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG); start_journal_io: for (i = 0; i < bufs; i++) { struct buffer_head *bh = wbuf[i]; lock_buffer(bh); clear_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = journal_end_buffer_io_sync; submit_bh(write_op, bh); } cond_resched(); /* Force a new descriptor to be generated next time round the loop. 
*/ descriptor = NULL; bufs = 0; } } /* Lo and behold: we have just managed to send a transaction to the log. Before we can commit it, wait for the IO so far to complete. Control buffers being written are on the transaction's t_log_list queue, and metadata buffers are on the t_iobuf_list queue. Wait for the buffers in reverse order. That way we are less likely to be woken up until all IOs have completed, and so we incur less scheduling load. */ jbd_debug(3, "JBD: commit phase 4\n"); /* * akpm: these are BJ_IO, and j_list_lock is not needed. * See __journal_try_to_free_buffer. */ wait_for_iobuf: while (commit_transaction->t_iobuf_list != NULL) { struct buffer_head *bh; jh = commit_transaction->t_iobuf_list->b_tprev; bh = jh2bh(jh); if (buffer_locked(bh)) { wait_on_buffer(bh); goto wait_for_iobuf; } if (cond_resched()) goto wait_for_iobuf; if (unlikely(!buffer_uptodate(bh))) err = -EIO; clear_buffer_jwrite(bh); JBUFFER_TRACE(jh, "ph4: unfile after journal write"); journal_unfile_buffer(journal, jh); /* * ->t_iobuf_list should contain only dummy buffer_heads * which were created by journal_write_metadata_buffer(). */ BUFFER_TRACE(bh, "dumping temporary bh"); journal_put_journal_head(jh); __brelse(bh); J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0); free_buffer_head(bh); /* We also have to unlock and free the corresponding shadowed buffer */ jh = commit_transaction->t_shadow_list->b_tprev; bh = jh2bh(jh); clear_bit(BH_JWrite, &bh->b_state); J_ASSERT_BH(bh, buffer_jbddirty(bh)); /* The metadata is now released for reuse, but we need to remember it against this transaction so that when we finally commit, we can do any checkpointing required. */ JBUFFER_TRACE(jh, "file as BJ_Forget"); journal_file_buffer(jh, commit_transaction, BJ_Forget); /* * Wake up any transactions which were waiting for this * IO to complete. The barrier must be here so that changes * by journal_file_buffer() take effect before wake_up_bit() * does the waitqueue check. */ smp_mb(); wake_up_bit(&bh->b_state, BH_Unshadow); JBUFFER_TRACE(jh, "brelse shadowed buffer"); __brelse(bh); } J_ASSERT (commit_transaction->t_shadow_list == NULL); jbd_debug(3, "JBD: commit phase 5\n"); /* Here we wait for the revoke record and descriptor record buffers */ wait_for_ctlbuf: while (commit_transaction->t_log_list != NULL) { struct buffer_head *bh; jh = commit_transaction->t_log_list->b_tprev; bh = jh2bh(jh); if (buffer_locked(bh)) { wait_on_buffer(bh); goto wait_for_ctlbuf; } if (cond_resched()) goto wait_for_ctlbuf; if (unlikely(!buffer_uptodate(bh))) err = -EIO; BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); clear_buffer_jwrite(bh); journal_unfile_buffer(journal, jh); journal_put_journal_head(jh); __brelse(bh); /* One for getblk */ /* AKPM: bforget here */ } if (err) journal_abort(journal, err); jbd_debug(3, "JBD: commit phase 6\n"); if (journal_write_commit_record(journal, commit_transaction)) err = -EIO; if (err) journal_abort(journal, err); /* End of a transaction! Finally, we can do checkpoint processing: any buffers committed as a result of this transaction can be removed from any checkpoint list it was on before. 
*/ jbd_debug(3, "JBD: commit phase 7\n"); J_ASSERT(commit_transaction->t_sync_datalist == NULL); J_ASSERT(commit_transaction->t_buffers == NULL); J_ASSERT(commit_transaction->t_checkpoint_list == NULL); J_ASSERT(commit_transaction->t_iobuf_list == NULL); J_ASSERT(commit_transaction->t_shadow_list == NULL); J_ASSERT(commit_transaction->t_log_list == NULL); restart_loop: /* * As there are other places (journal_unmap_buffer()) adding buffers * to this list we have to be careful and hold the j_list_lock. */ spin_lock(&journal->j_list_lock); while (commit_transaction->t_forget) { transaction_t *cp_transaction; struct buffer_head *bh; int try_to_free = 0; jh = commit_transaction->t_forget; spin_unlock(&journal->j_list_lock); bh = jh2bh(jh); /* * Get a reference so that bh cannot be freed before we are * done with it. */ get_bh(bh); jbd_lock_bh_state(bh); J_ASSERT_JH(jh, jh->b_transaction == commit_transaction || jh->b_transaction == journal->j_running_transaction); /* * If there is undo-protected committed data against * this buffer, then we can remove it now. If it is a * buffer needing such protection, the old frozen_data * field now points to a committed version of the * buffer, so rotate that field to the new committed * data. * * Otherwise, we can just throw away the frozen data now. */ if (jh->b_committed_data) { jbd_free(jh->b_committed_data, bh->b_size); jh->b_committed_data = NULL; if (jh->b_frozen_data) { jh->b_committed_data = jh->b_frozen_data; jh->b_frozen_data = NULL; } } else if (jh->b_frozen_data) { jbd_free(jh->b_frozen_data, bh->b_size); jh->b_frozen_data = NULL; } spin_lock(&journal->j_list_lock); cp_transaction = jh->b_cp_transaction; if (cp_transaction) { JBUFFER_TRACE(jh, "remove from old cp transaction"); __journal_remove_checkpoint(jh); } /* Only re-checkpoint the buffer_head if it is marked * dirty. If the buffer was added to the BJ_Forget list * by journal_forget, it may no longer be dirty and * there's no point in keeping a checkpoint record for * it. */ /* * A buffer which has been freed while still being journaled by * a previous transaction. */ if (buffer_freed(bh)) { /* * If the running transaction is the one containing * "add to orphan" operation (b_next_transaction != * NULL), we have to wait for that transaction to * commit before we can really get rid of the buffer. * So just clear b_modified to not confuse transaction * credit accounting and refile the buffer to * BJ_Forget of the running transaction. If the just * committed transaction contains "add to orphan" * operation, we can completely invalidate the buffer * now. We are rather throughout in that since the * buffer may be still accessible when blocksize < * pagesize and it is attached to the last partial * page. */ jh->b_modified = 0; if (!jh->b_next_transaction) { clear_buffer_freed(bh); clear_buffer_jbddirty(bh); clear_buffer_mapped(bh); clear_buffer_new(bh); clear_buffer_req(bh); bh->b_bdev = NULL; } } if (buffer_jbddirty(bh)) { JBUFFER_TRACE(jh, "add to new checkpointing trans"); __journal_insert_checkpoint(jh, commit_transaction); if (is_journal_aborted(journal)) clear_buffer_jbddirty(bh); } else { J_ASSERT_BH(bh, !buffer_dirty(bh)); /* * The buffer on BJ_Forget list and not jbddirty means * it has been freed by this transaction and hence it * could not have been reallocated until this * transaction has committed. *BUT* it could be * reallocated once we have written all the data to * disk and before we process the buffer on BJ_Forget * list. 
*/ if (!jh->b_next_transaction) try_to_free = 1; } JBUFFER_TRACE(jh, "refile or unfile freed buffer"); __journal_refile_buffer(jh); jbd_unlock_bh_state(bh); if (try_to_free) release_buffer_page(bh); else __brelse(bh); cond_resched_lock(&journal->j_list_lock); } spin_unlock(&journal->j_list_lock); /* * This is a bit sleazy. We use j_list_lock to protect transition * of a transaction into T_FINISHED state and calling * __journal_drop_transaction(). Otherwise we could race with * other checkpointing code processing the transaction... */ spin_lock(&journal->j_state_lock); spin_lock(&journal->j_list_lock); /* * Now recheck if some buffers did not get attached to the transaction * while the lock was dropped... */ if (commit_transaction->t_forget) { spin_unlock(&journal->j_list_lock); spin_unlock(&journal->j_state_lock); goto restart_loop; } /* Done with this transaction! */ jbd_debug(3, "JBD: commit phase 8\n"); J_ASSERT(commit_transaction->t_state == T_COMMIT); commit_transaction->t_state = T_FINISHED; J_ASSERT(commit_transaction == journal->j_committing_transaction); journal->j_commit_sequence = commit_transaction->t_tid; journal->j_committing_transaction = NULL; commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); /* * weight the commit time higher than the average time so we don't * react too strongly to vast changes in commit time */ if (likely(journal->j_average_commit_time)) journal->j_average_commit_time = (commit_time*3 + journal->j_average_commit_time) / 4; else journal->j_average_commit_time = commit_time; spin_unlock(&journal->j_state_lock); if (commit_transaction->t_checkpoint_list == NULL && commit_transaction->t_checkpoint_io_list == NULL) { __journal_drop_transaction(journal, commit_transaction); } else { if (journal->j_checkpoint_transactions == NULL) { journal->j_checkpoint_transactions = commit_transaction; commit_transaction->t_cpnext = commit_transaction; commit_transaction->t_cpprev = commit_transaction; } else { commit_transaction->t_cpnext = journal->j_checkpoint_transactions; commit_transaction->t_cpprev = commit_transaction->t_cpnext->t_cpprev; commit_transaction->t_cpnext->t_cpprev = commit_transaction; commit_transaction->t_cpprev->t_cpnext = commit_transaction; } } spin_unlock(&journal->j_list_lock); jbd_debug(1, "JBD: commit %d complete, head %d\n", journal->j_commit_sequence, journal->j_tail_sequence); wake_up(&journal->j_wait_done_commit); }
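/*
 * Illustration (not jbd code): the tail of journal_commit_transaction() above
 * folds each measured commit time into j_average_commit_time, weighting the
 * new sample three times as heavily as the stored average:
 * avg = (3 * sample + avg) / 4, with the first sample seeding the average.
 * A tiny self-contained demonstration of that update rule; the sample values
 * are made up.
 */
#include <stdio.h>

static unsigned long long update_avg(unsigned long long avg, unsigned long long sample)
{
	if (avg == 0)                   /* first sample seeds the average */
		return sample;
	return (sample * 3 + avg) / 4;  /* same rule as the commit path above */
}

int main(void)
{
	unsigned long long avg = 0;
	unsigned long long samples[] = { 800, 1200, 400, 10000 }; /* ns, invented */

	for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		avg = update_avg(avg, samples[i]);
		printf("sample=%llu  avg=%llu\n", samples[i], avg);
	}
	return 0;
}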
int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, sector_t pblocknr, struct buffer_head **pbh, int newblk) { struct buffer_head *bh; struct inode *inode = NILFS_BTNC_I(btnc); int err; btnode_debug(3, "called: blocknr=%llu pblocknr=%llu new=%d ino=%lu\n", (unsigned long long)blocknr, (unsigned long long)pblocknr, newblk, inode->i_ino); bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node); if (unlikely(!bh)) return -ENOMEM; err = -EEXIST; /* internal code */ if (newblk) { if (unlikely(buffer_mapped(bh) || buffer_uptodate(bh) || buffer_dirty(bh))) { BH_DEBUG(bh, "invalid new bh"); BUG(); } bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev; bh->b_blocknr = blocknr; set_buffer_mapped(bh); set_buffer_uptodate(bh); goto found; } if (buffer_uptodate(bh) || buffer_dirty(bh)) goto found; if (pblocknr == 0) { pblocknr = blocknr; if (inode->i_ino != NILFS_DAT_INO) { struct inode *dat = nilfs_dat_inode(NILFS_I_NILFS(inode)); /* blocknr is a virtual block number */ err = nilfs_dat_translate(dat, blocknr, &pblocknr); if (unlikely(err)) { brelse(bh); btnode_debug(1, "return %d (xlate).\n", err); goto out_locked; } } } lock_buffer(bh); if (buffer_uptodate(bh)) { unlock_buffer(bh); err = -EEXIST; /* internal code */ goto found; } set_buffer_mapped(bh); bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev; bh->b_blocknr = pblocknr; /* set block address for read */ bh->b_end_io = end_buffer_read_sync; get_bh(bh); submit_bh(READ, bh); bh->b_blocknr = blocknr; /* set back to the given block address */ err = 0; found: *pbh = bh; out_locked: unlock_page(bh->b_page); page_cache_release(bh->b_page); btnode_debug(3, "done (err=%d)\n", err); return err; }
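/*
 * Illustration (not nilfs code): for non-DAT inodes the blocknr passed to
 * nilfs_btnode_submit_block() is a virtual block number. It is translated to
 * a physical block, b_blocknr is temporarily pointed at the physical block for
 * the duration of the read submission, and then restored to the virtual number
 * the node cache is keyed on. A userspace sketch of that swap; the toy_bh
 * type and the translate callback only stand in for the real structures and
 * for nilfs_dat_translate().
 */
#include <stdio.h>

struct toy_bh { unsigned long long blocknr; };

typedef int (*translate_fn)(unsigned long long vblocknr, unsigned long long *pblocknr);

static void submit_read(struct toy_bh *bh)
{
	printf("submit read of physical block %llu\n", bh->blocknr);
}

static int submit_node_read(struct toy_bh *bh, unsigned long long vblocknr,
			    translate_fn xlate)
{
	unsigned long long pblocknr = vblocknr;
	int err;

	if (xlate) {
		err = xlate(vblocknr, &pblocknr);  /* virtual -> physical */
		if (err)
			return err;
	}
	bh->blocknr = pblocknr;   /* point at the physical block for the I/O    */
	submit_read(bh);
	bh->blocknr = vblocknr;   /* restore the key the node cache indexes on  */
	return 0;
}

static int fake_dat(unsigned long long v, unsigned long long *p)
{
	*p = v + 1000;            /* made-up mapping */
	return 0;
}

int main(void)
{
	struct toy_bh bh = { 0 };
	return submit_node_read(&bh, 42, fake_dat);
}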
static void go_through (reiserfs_filsys_t fs) { struct buffer_head * bh; int i; int what_node; unsigned long done = 0, total; if (fsck_mode (fs) == DO_TEST) { /* just to test pass0_correct_leaf */ bh = bread (fs->s_dev, stats(fs)->test, fs->s_blocksize); /* if (is_leaf_bad (bh)) { fsck_progress ("############### bad #################\n"); } */ pass0_correct_leaf (fs, bh); print_block (stdout, fs, bh, 3, -1, -1); if (is_leaf_bad (bh)) { fsck_progress ("############### still bad #################\n"); } brelse (bh); reiserfs_free (fs); exit(4); } total = reiserfs_bitmap_ones (fsck_disk_bitmap (fs)); fsck_progress ("\nPass 0 (%lu (of %lu) blocks will be read):\n", total, SB_BLOCK_COUNT (fs)); for (i = 0; i < SB_BLOCK_COUNT (fs); i ++) { if (!is_to_be_read (fs, i)) continue; print_how_far (&done, total, 1, fsck_quiet (fs)); bh = bread (fs->s_dev, i, fs->s_blocksize); if (!bh) { /* we were reading one block at time, and failed, so mark block bad */ fsck_progress ("pass0: reading block %lu failed\n", i); continue; } if (not_data_block (fs, i)) reiserfs_panic ("not data block found"); stats (fs)->analyzed ++; what_node = who_is_this (bh->b_data, fs->s_blocksize); if ( what_node != THE_LEAF ) { brelse (bh); continue; } pass0_correct_leaf (fs, bh); brelse (bh); } #if 0 for (i = 0; i < SB_BLOCK_COUNT (fs); i += nr_to_read) { to_scan = how_many_to_scan (fs, i, nr_to_read); if (to_scan) { print_how_far (&done, total, to_scan, fsck_quiet (fs)); /* at least one of nr_to_read blocks is to be checked */ bbh = bread (fs->s_dev, i / nr_to_read, fs->s_blocksize * nr_to_read); if (bbh) { for (j = 0; j < nr_to_read; j ++) { if (!is_to_be_read (fs, i + j)) continue; if (not_data_block (fs, i + j)) reiserfs_panic ("not data block found"); stats (fs)->analyzed ++; data = bbh->b_data + j * fs->s_blocksize; what_node = who_is_this (data, fs->s_blocksize); if ( what_node != THE_LEAF ) { continue; } /* the node looks like a leaf, but it still can be not perfect */ bh = make_buffer (fs->s_dev, i + j, fs->s_blocksize, data); /*printf ("block %lu .. ", bh->b_blocknr);fflush(stdout);*/ pass0_correct_leaf (fs, bh); /*printf ("ok\n");fflush(stdout);*/ brelse (bh); } if (buffer_dirty (bbh)) bwrite (bbh); bforget (bbh); } else { done -= to_scan; /* bread failed */ if (nr_to_read != 1) { /* we tryied to read bunch of blocks. Try to read them by one */ nr_to_read = 1; i --; continue; } else { /* we were reading one block at time, and failed, so mark block bad */ fsck_progress ("pass0: block %lu is bad, marked used\n", i); } } } if (nr_to_read == 1 && ((i + 1) % NR_TO_READ) == 0) { /* we have read NR_TO_READ blocks one at time, switch back to reading NR_TO_READ blocks at time */ i -= (NR_TO_READ - 1); nr_to_read = NR_TO_READ; } } #endif /* just in case */ mark_objectid_really_used (proper_id_map (fs), REISERFS_ROOT_OBJECTID); fsck_progress ("\n"); if (fsck_save_leaf_bitmap (fs)) { reiserfs_bitmap_save (stats (fs)->new_bitmap_file_name, leaves_bitmap); } }
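/*
 * Illustration (not reiserfsck code): pass 0 above walks every block the
 * bitmap marks as in use, reads it, and only hands blocks that classify as
 * leaves to pass0_correct_leaf(). The sketch below reproduces that scan shape
 * with a made-up bitmap and a made-up classifier standing in for
 * is_to_be_read() and who_is_this().
 */
#include <stdio.h>

#define NBLOCKS 16
enum node_kind { NODE_OTHER, NODE_LEAF };

static enum node_kind classify(unsigned long block)
{
	return (block % 3 == 0) ? NODE_LEAF : NODE_OTHER;  /* invented rule */
}

int main(void)
{
	unsigned char to_read[NBLOCKS] = { 0,1,1,0, 1,1,0,1, 1,0,1,1, 0,1,1,0 };
	unsigned long done = 0, analyzed = 0;

	for (unsigned long i = 0; i < NBLOCKS; i++) {
		if (!to_read[i])
			continue;          /* skip blocks the bitmap says are free */
		done++;
		analyzed++;
		if (classify(i) != NODE_LEAF)
			continue;          /* only leaves are corrected in pass 0  */
		printf("block %lu: correct leaf\n", i);
	}
	printf("read %lu blocks, analyzed %lu\n", done, analyzed);
	return 0;
}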
/* recursive function processing all tree nodes */
static unsigned long move_formatted_block(unsigned long block, unsigned long bnd, int h)
{
	struct buffer_head * bh;
	struct item_head *ih;
	unsigned long new_blocknr = 0;
	int node_is_internal = 0;
	int i, j;

	bh = bread(fs->s_dev, block, fs->s_blocksize);
	if (!bh)
		/* a formatted node we cannot read cannot be relocated */
		die ("resize_reiserfs: unable to read block %lu\n", block);

	if (is_leaf_node (bh)) {
		leaf_node_cnt++;

		for (i = 0; i < B_NR_ITEMS(bh); i++) {
			ih = B_N_PITEM_HEAD(bh, i);
			if (is_indirect_ih(ih)) {
				__u32 * indirect;

				/* relocate every unformatted block the indirect item points to */
				indirect = (__u32 *)B_I_PITEM (bh, ih);
				for (j = 0; j < I_UNFM_NUM(ih); j++) {
					unsigned long unfm_block;

					if (indirect [j] == 0) /* hole */
						continue;
					unfm_block = move_unformatted_block(le32_to_cpu (indirect [j]), bnd, h + 1);
					if (unfm_block) {
						indirect [j] = cpu_to_le32 (unfm_block);
						mark_buffer_dirty(bh);
					}
				}
			}
		}
	} else if (is_internal_node (bh)) { /* internal node */
		int_node_cnt++;
		node_is_internal = 1;

		for (i = 0; i <= B_NR_ITEMS(bh); i++) {
			unsigned long moved_block;

			moved_block = move_formatted_block(B_N_CHILD_NUM(bh, i), bnd, h + 1);
			if (moved_block) {
				set_child_block_number (bh, i, moved_block);
				mark_buffer_dirty(bh);
			}
		}
	} else {
		die ("resize_reiserfs: block (%lu) has invalid format\n", block);
	}

	if (buffer_dirty(bh)) {
		mark_buffer_uptodate(bh, 1);
		bwrite(bh);
	}
	brelse(bh);

	new_blocknr = move_generic_block(block, bnd, h);
	if (new_blocknr) {
		if (node_is_internal)
			int_moved_cnt++;
		else
			leaf_moved_cnt++;
	}
	return new_blocknr;
}
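/*
 * Illustration (not resize_reiserfs code): the recursion above relocates
 * every formatted node that lies beyond the new boundary and patches the
 * pointer held by its parent. A minimal in-memory model of that walk; the
 * toy_node type, relocate() and the boundary value are invented for the
 * sketch.
 */
#include <stdio.h>

struct toy_node {
	unsigned long blocknr;
	int nr_children;
	struct toy_node **child;
};

/* Pretend allocator: blocks at or beyond 'bnd' get a new home below it. */
static unsigned long relocate(unsigned long blocknr, unsigned long bnd)
{
	static unsigned long next_free = 1;
	return (blocknr >= bnd) ? next_free++ : 0;
}

/* Mirrors move_formatted_block(): fix the children first, then move this node. */
static unsigned long move_node(struct toy_node *n, unsigned long bnd)
{
	for (int i = 0; i < n->nr_children; i++) {
		unsigned long moved = move_node(n->child[i], bnd);
		if (moved)
			n->child[i]->blocknr = moved;  /* like set_child_block_number() */
	}
	return relocate(n->blocknr, bnd);
}

int main(void)
{
	struct toy_node leaf = { .blocknr = 120 };
	struct toy_node *kids[] = { &leaf };
	struct toy_node root = { .blocknr = 130, .nr_children = 1, .child = kids };
	unsigned long moved = move_node(&root, 100);

	printf("leaf now at %lu, root moved to %lu\n", leaf.blocknr, moved);
	return 0;
}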
loff_t tux_create_entry(struct inode *dir, const char *name, unsigned len, inum_t inum, umode_t mode, loff_t *size) { unsigned delta = tux3_get_current_delta(); struct sb *sb = tux_sb(dir->i_sb); tux_dirent *entry; struct buffer_head *buffer, *clone; unsigned reclen = TUX_REC_LEN(len), rec_len, name_len, offset; unsigned blocksize = sb->blocksize; block_t block, blocks = *size >> sb->blockbits; void *olddata; for (block = 0; block < blocks; block++) { buffer = blockread(mapping(dir), block); if (!buffer) return -EIO; entry = bufdata(buffer); tux_dirent *limit = bufdata(buffer) + blocksize - reclen; while (entry <= limit) { if (entry->rec_len == 0) { blockput(buffer); tux_zero_len_error(dir, block); return -EIO; } name_len = TUX_REC_LEN(entry->name_len); rec_len = tux_rec_len_from_disk(entry->rec_len); if (is_deleted(entry) && rec_len >= reclen) goto create; if (rec_len >= name_len + reclen) goto create; entry = (void *)entry + rec_len; } blockput(buffer); } entry = NULL; buffer = blockget(mapping(dir), block); assert(!buffer_dirty(buffer)); create: /* * The directory is protected by i_mutex. * blockdirty() should never return -EAGAIN. */ olddata = bufdata(buffer); clone = blockdirty(buffer, delta); if (IS_ERR(clone)) { assert(PTR_ERR(clone) != -EAGAIN); blockput(buffer); return PTR_ERR(clone); } if (!entry) { /* Expanding the directory size. Initialize block. */ entry = bufdata(clone); memset(entry, 0, blocksize); entry->rec_len = tux_rec_len_to_disk(blocksize); assert(is_deleted(entry)); *size += blocksize; } else { entry = ptr_redirect(entry, olddata, bufdata(clone)); if (!is_deleted(entry)) { tux_dirent *newent = (void *)entry + name_len; unsigned rest_rec_len = rec_len - name_len; newent->rec_len = tux_rec_len_to_disk(rest_rec_len); entry->rec_len = tux_rec_len_to_disk(name_len); entry = newent; } } entry->name_len = len; memcpy(entry->name, name, len); offset = (void *)entry - bufdata(clone); /* this releases buffer */ tux_update_entry(clone, entry, inum, mode); return (block << sb->blockbits) + offset; /* only for xattr create */ }
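/*
 * Illustration (not tux3 code): tux_create_entry() reuses directory space
 * either by taking over a deleted record or by splitting a live record whose
 * rec_len has enough slack after its own name. The split arithmetic is shown
 * in isolation below; rec_len_for() is a hypothetical stand-in that only
 * mimics the role of TUX_REC_LEN() (8-byte header, 4-byte alignment is an
 * assumed layout, not the real definition).
 */
#include <stdio.h>

static unsigned rec_len_for(unsigned name_len)
{
	return (8 + name_len + 3) & ~3u;   /* assumed per-entry overhead */
}

int main(void)
{
	unsigned existing_name_len = 5;    /* live entry "hello"         */
	unsigned existing_rec_len = 48;    /* record with slack after it */
	unsigned new_name_len = 11;        /* entry we want to create    */

	unsigned name_len = rec_len_for(existing_name_len); /* space the live entry keeps */
	unsigned reclen = rec_len_for(new_name_len);        /* space the new entry needs  */

	if (existing_rec_len >= name_len + reclen) {
		/* Same split as above: the live entry shrinks to name_len and
		 * the new entry inherits the remainder of the record. */
		unsigned rest_rec_len = existing_rec_len - name_len;
		printf("live entry keeps %u bytes, new entry gets %u bytes\n",
		       name_len, rest_rec_len);
	} else {
		printf("record too small, keep scanning\n");
	}
	return 0;
}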
/* * This function writes length bytes of data from a specified buffer to the destination address within the device * parameters * offset : start physical page number * length : bytes to write * buffer : buffer to write data * return value * The return code should indicate ERROR (1) or OK(0). If the return code indicates ERROR, the dev_ptr->status field should indicate * the return error code. */ int FlashDevWrite(j4fs_device_info *dev_ptr, DWORD offset, DWORD length, BYTE *buffer) { // J4FS for Block Devices fron NiTRo #ifndef J4FS_USE_BLK DWORD nVol=0; int part_id=dev_ptr->device; // J4FS for moviNAND merged from ROSSI int ret=-1; #endif // J4FS for Block Devices fron NiTRo #if defined(J4FS_USE_MOVI) // J4FS for Block Devices fron NiTRo mm_segment_t oldfs; #elif defined(J4FS_USE_BLK) struct buffer_head *bh; unsigned long boffset; struct super_block *sb = dev_ptr->device; #endif // J4FS for moviNAND merged from ROSSI #if defined(J4FS_USE_XSR) ret = STL_Write(nVol, part_id, offset/512, length/512, buffer); if (ret != STL_SUCCESS) { T(J4FS_TRACE_ALWAYS,("%s %d: Error(offset,length,j4fs_end,nErr)=(0x%x,0x%x,0x%x,0x%x)\n",__FUNCTION__,__LINE__,offset,length,device_info.j4fs_end,ret)); return J4FS_FAIL; } #elif defined(J4FS_USE_FSR) ret = FSR_STL_Write(nVol, part_id, offset/512, length/512, buffer, FSR_STL_FLAG_DEFAULT); if (ret != FSR_STL_SUCCESS) { T(J4FS_TRACE_ALWAYS,("%s %d: Error(offset,length,j4fs_end,nErr)=(0x%x,0x%x,0x%x,0x%x)\n",__FUNCTION__,__LINE__,offset,length,device_info.j4fs_end,ret)); return J4FS_FAIL; } // J4FS for Block Devices fron NiTRo #elif defined(J4FS_USE_MOVI) // J4FS for Block Devices fron NiTRo if (!j4fs_filp) { printk("J4FS not available\n"); return J4FS_FAIL; } j4fs_filp->f_flags |= O_NONBLOCK; oldfs = get_fs(); set_fs(get_ds()); ret = j4fs_filp->f_op->llseek(j4fs_filp, offset, SEEK_SET); ret = j4fs_filp->f_op->write(j4fs_filp, buffer, length, &j4fs_filp->f_pos); set_fs(oldfs); j4fs_filp->f_flags &= ~O_NONBLOCK; if (ret < 0) { printk("j4fs_filp->write() failed: %d\n", ret); return J4FS_FAIL; } // J4FS for moviNAND merged from ROSSI #elif defined(J4FS_USE_BLK) boffset = offset / sb->s_blocksize; bh = sb_getblk(dev_ptr->device, boffset); if (!bh) { T(J4FS_TRACE_ALWAYS,("%s %d: cannot get block=%lu, length=%lu, bh=%p\n",__FUNCTION__,__LINE__,boffset,(unsigned long)length,bh)); return J4FS_FAIL; } lock_buffer(bh); bh->b_end_io = end_buffer_write_sync; get_bh(bh); memcpy(bh->b_data, buffer, length); if (!buffer_uptodate(bh)) set_buffer_uptodate(bh); if (buffer_dirty(bh)) clear_buffer_dirty(bh); submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh); wait_on_buffer(bh); if (unlikely(!buffer_uptodate(bh))) { T(J4FS_TRACE_ALWAYS,("%s %d: cannot write block %lu\n",__FUNCTION__,__LINE__,boffset)); return J4FS_FAIL; } brelse(bh); #else #error #endif return J4FS_SUCCESS; }
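/*
 * Illustration (not j4fs code): the STL back ends above take the byte offset
 * and length in 512-byte sector units, while the block-device back end turns
 * the offset into a filesystem block index. The small sketch below only shows
 * those unit conversions with made-up values; the alignment checks are an
 * assumption added here, since FlashDevWrite() itself trusts its callers.
 */
#include <stdio.h>

#define SECTOR_SIZE 512u

int main(void)
{
	unsigned long offset = 0x4000;     /* byte offset into the partition */
	unsigned long length = 0x1000;     /* bytes to write                 */
	unsigned long blocksize = 4096;    /* plays the role of sb->s_blocksize */

	if (offset % SECTOR_SIZE || length % SECTOR_SIZE) {
		fprintf(stderr, "unaligned request\n");
		return 1;
	}

	printf("STL path:   start sector %lu, %lu sectors\n",
	       offset / SECTOR_SIZE, length / SECTOR_SIZE);
	printf("block path: block index %lu\n", offset / blocksize);
	return 0;
}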
/* * Try to flush one buffer from the checkpoint list to disk. * * Return 1 if something happened which requires us to abort the current * scan of the checkpoint list. Return <0 if the buffer has failed to * be written out. * * Called with j_list_lock held and drops it if 1 is returned */ static int __process_buffer(journal_t *journal, struct journal_head *jh, int *batch_count, transaction_t *transaction) { struct buffer_head *bh = jh2bh(jh); int ret = 0; if (buffer_locked(bh)) { get_bh(bh); spin_unlock(&journal->j_list_lock); wait_on_buffer(bh); /* the journal_head may have gone by now */ BUFFER_TRACE(bh, "brelse"); __brelse(bh); ret = 1; } else if (jh->b_transaction != NULL) { transaction_t *t = jh->b_transaction; tid_t tid = t->t_tid; transaction->t_chp_stats.cs_forced_to_close++; spin_unlock(&journal->j_list_lock); if (unlikely(journal->j_flags & JBD2_UNMOUNT)) /* * The journal thread is dead; so starting and * waiting for a commit to finish will cause * us to wait for a _very_ long time. */ printk(KERN_ERR "JBD2: %s: " "Waiting for Godot: block %llu\n", journal->j_devname, (unsigned long long) bh->b_blocknr); jbd2_log_start_commit(journal, tid); jbd2_log_wait_commit(journal, tid); ret = 1; } else if (!buffer_dirty(bh)) { ret = 1; if (unlikely(buffer_write_io_error(bh))) ret = -EIO; get_bh(bh); BUFFER_TRACE(bh, "remove from checkpoint"); __jbd2_journal_remove_checkpoint(jh); spin_unlock(&journal->j_list_lock); __brelse(bh); } else { /* * Important: we are about to write the buffer, and * possibly block, while still holding the journal lock. * We cannot afford to let the transaction logic start * messing around with this buffer before we write it to * disk, as that would break recoverability. */ BUFFER_TRACE(bh, "queue"); get_bh(bh); J_ASSERT_BH(bh, !buffer_jwrite(bh)); journal->j_chkpt_bhs[*batch_count] = bh; __buffer_relink_io(jh); transaction->t_chp_stats.cs_written++; (*batch_count)++; if (*batch_count == JBD2_NR_BATCH) { spin_unlock(&journal->j_list_lock); __flush_batch(journal, batch_count); ret = 1; } } return ret; }
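/*
 * Illustration (not jbd2 code): the final branch of __process_buffer() queues
 * dirty checkpoint buffers into journal->j_chkpt_bhs and only issues the write
 * once JBD2_NR_BATCH of them have accumulated, returning 1 so the caller
 * restarts its scan. The batching shape, detached from the journal types;
 * flush_batch(), queue_buffer() and BATCH_MAX are illustrative names.
 */
#include <stdio.h>

#define BATCH_MAX 8   /* plays the role of JBD2_NR_BATCH */

static int batch[BATCH_MAX];
static int batch_count;

static void flush_batch(void)
{
	printf("writing %d buffers in one go\n", batch_count);
	batch_count = 0;
}

/* Returns 1 if the caller must restart its scan (the batch was flushed). */
static int queue_buffer(int bh)
{
	batch[batch_count++] = bh;
	if (batch_count == BATCH_MAX) {
		flush_batch();
		return 1;
	}
	return 0;
}

int main(void)
{
	for (int bh = 0; bh < 20; bh++)
		if (queue_buffer(bh))
			printf("restart scan after buffer %d\n", bh);
	if (batch_count)          /* the caller flushes any leftovers */
		flush_batch();
	return 0;
}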
int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, unsigned int nr, struct buffer_head *bhs[]) { int status = 0; unsigned int i; struct buffer_head *bh; if (!nr) { mlog(ML_BH_IO, "No buffers will be read!\n"); goto bail; } for (i = 0 ; i < nr ; i++) { if (bhs[i] == NULL) { bhs[i] = sb_getblk(osb->sb, block++); if (bhs[i] == NULL) { status = -EIO; mlog_errno(status); goto bail; } } bh = bhs[i]; if (buffer_jbd(bh)) { mlog(ML_ERROR, "trying to sync read a jbd " "managed bh (blocknr = %llu), skipping\n", (unsigned long long)bh->b_blocknr); continue; } if (buffer_dirty(bh)) { /* This should probably be a BUG, or * at least return an error. */ mlog(ML_ERROR, "trying to sync read a dirty " "buffer! (blocknr = %llu), skipping\n", (unsigned long long)bh->b_blocknr); continue; } lock_buffer(bh); if (buffer_jbd(bh)) { mlog(ML_ERROR, "block %llu had the JBD bit set " "while I was in lock_buffer!", (unsigned long long)bh->b_blocknr); BUG(); } clear_buffer_uptodate(bh); get_bh(bh); /* for end_buffer_read_sync() */ bh->b_end_io = end_buffer_read_sync; submit_bh(READ, bh); } for (i = nr; i > 0; i--) { bh = bhs[i - 1]; if (buffer_jbd(bh)) { mlog(ML_ERROR, "the journal got the buffer while it was " "locked for io! (blocknr = %llu)\n", (unsigned long long)bh->b_blocknr); BUG(); } wait_on_buffer(bh); if (!buffer_uptodate(bh)) { /* Status won't be cleared from here on out, * so we can safely record this and loop back * to cleanup the other buffers. */ status = -EIO; put_bh(bh); bhs[i - 1] = NULL; } } bail: return status; }
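/*
 * Illustration (not ocfs2 code): ocfs2_read_blocks_sync() issues every read
 * first and only then waits for completion, walking the array in reverse and
 * recording an I/O error without aborting the loop, so every buffer still gets
 * cleaned up. The two-phase shape in isolation; submit_read() and wait_read()
 * are placeholders, not ocfs2 APIs.
 */
#include <stdio.h>

#define NR 4

static void submit_read(int i) { printf("submit block %d\n", i); }
static int  wait_read(int i)   { printf("wait on block %d\n", i); return 0; }

int main(void)
{
	int status = 0;

	/* Phase 1: fire off all reads without blocking between them. */
	for (int i = 0; i < NR; i++)
		submit_read(i);

	/* Phase 2: wait in reverse order; remember the error but keep going. */
	for (int i = NR; i > 0; i--) {
		int err = wait_read(i - 1);
		if (err && !status)
			status = err;
	}
	return status;
}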
int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, struct buffer_head *bhs[], int flags) { int status = 0; int i, ignore_cache = 0; struct buffer_head *bh; mlog_entry("(inode=%p, block=(%llu), nr=(%d), flags=%d)\n", inode, (unsigned long long)block, nr, flags); BUG_ON(!inode); BUG_ON((flags & OCFS2_BH_READAHEAD) && (flags & OCFS2_BH_IGNORE_CACHE)); if (bhs == NULL) { status = -EINVAL; mlog_errno(status); goto bail; } if (nr < 0) { mlog(ML_ERROR, "asked to read %d blocks!\n", nr); status = -EINVAL; mlog_errno(status); goto bail; } if (nr == 0) { mlog(ML_BH_IO, "No buffers will be read!\n"); status = 0; goto bail; } mutex_lock(&OCFS2_I(inode)->ip_io_mutex); for (i = 0 ; i < nr ; i++) { if (bhs[i] == NULL) { bhs[i] = sb_getblk(inode->i_sb, block++); if (bhs[i] == NULL) { mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); status = -EIO; mlog_errno(status); goto bail; } } bh = bhs[i]; ignore_cache = (flags & OCFS2_BH_IGNORE_CACHE); /* There are three read-ahead cases here which we need to * be concerned with. All three assume a buffer has * previously been submitted with OCFS2_BH_READAHEAD * and it hasn't yet completed I/O. * * 1) The current request is sync to disk. This rarely * happens these days, and never when performance * matters - the code can just wait on the buffer * lock and re-submit. * * 2) The current request is cached, but not * readahead. ocfs2_buffer_uptodate() will return * false anyway, so we'll wind up waiting on the * buffer lock to do I/O. We re-check the request * with after getting the lock to avoid a re-submit. * * 3) The current request is readahead (and so must * also be a caching one). We short circuit if the * buffer is locked (under I/O) and if it's in the * uptodate cache. The re-check from #2 catches the * case that the previous read-ahead completes just * before our is-it-in-flight check. */ if (!ignore_cache && !ocfs2_buffer_uptodate(inode, bh)) { mlog(ML_UPTODATE, "bh (%llu), inode %llu not uptodate\n", (unsigned long long)bh->b_blocknr, (unsigned long long)OCFS2_I(inode)->ip_blkno); /* We're using ignore_cache here to say * "go to disk" */ ignore_cache = 1; } /* XXX: Can we ever get this and *not* have the cached * flag set? */ if (buffer_jbd(bh)) { if (ignore_cache) mlog(ML_BH_IO, "trying to sync read a jbd " "managed bh (blocknr = %llu)\n", (unsigned long long)bh->b_blocknr); continue; } if (ignore_cache) { if (buffer_dirty(bh)) { /* This should probably be a BUG, or * at least return an error. */ mlog(ML_BH_IO, "asking me to sync read a dirty " "buffer! (blocknr = %llu)\n", (unsigned long long)bh->b_blocknr); continue; } /* A read-ahead request was made - if the * buffer is already under read-ahead from a * previously submitted request than we are * done here. */ if ((flags & OCFS2_BH_READAHEAD) && ocfs2_buffer_read_ahead(inode, bh)) continue; lock_buffer(bh); if (buffer_jbd(bh)) { #ifdef CATCH_BH_JBD_RACES mlog(ML_ERROR, "block %llu had the JBD bit set " "while I was in lock_buffer!", (unsigned long long)bh->b_blocknr); BUG(); #else unlock_buffer(bh); continue; #endif } /* Re-check ocfs2_buffer_uptodate() as a * previously read-ahead buffer may have * completed I/O while we were waiting for the * buffer lock. 
*/ if (!(flags & OCFS2_BH_IGNORE_CACHE) && !(flags & OCFS2_BH_READAHEAD) && ocfs2_buffer_uptodate(inode, bh)) { unlock_buffer(bh); continue; } clear_buffer_uptodate(bh); get_bh(bh); /* for end_buffer_read_sync() */ bh->b_end_io = end_buffer_read_sync; submit_bh(READ, bh); continue; } } status = 0; for (i = (nr - 1); i >= 0; i--) { bh = bhs[i]; if (!(flags & OCFS2_BH_READAHEAD)) { /* We know this can't have changed as we hold the * inode sem. Avoid doing any work on the bh if the * journal has it. */ if (!buffer_jbd(bh)) wait_on_buffer(bh); if (!buffer_uptodate(bh)) { /* Status won't be cleared from here on out, * so we can safely record this and loop back * to cleanup the other buffers. Don't need to * remove the clustered uptodate information * for this bh as it's not marked locally * uptodate. */ status = -EIO; put_bh(bh); bhs[i] = NULL; continue; } } /* Always set the buffer in the cache, even if it was * a forced read, or read-ahead which hasn't yet * completed. */ ocfs2_set_buffer_uptodate(inode, bh); } mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n", (unsigned long long)block, nr, ((flags & OCFS2_BH_IGNORE_CACHE) || ignore_cache) ? "no" : "yes", flags); bail: mlog_exit(status); return status; }
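/*
 * Illustration (not ocfs2 code): the comment block in ocfs2_read_blocks()
 * above enumerates three interactions between a new request and an earlier
 * OCFS2_BH_READAHEAD submission. The predicate below is a loose condensation
 * of the decision they boil down to; the struct req fields and must_submit()
 * are invented for the sketch and only approximate the real checks.
 */
#include <stdbool.h>
#include <stdio.h>

struct req {
	bool readahead;       /* OCFS2_BH_READAHEAD request             */
	bool ignore_cache;    /* OCFS2_BH_IGNORE_CACHE / forced read    */
	bool cached_uptodate; /* ocfs2_buffer_uptodate() said yes       */
	bool in_flight;       /* earlier readahead I/O still running    */
};

/* Return true if this request should submit a fresh read. */
static bool must_submit(const struct req *r)
{
	if (r->readahead && (r->in_flight || r->cached_uptodate))
		return false;   /* readahead short-circuits on in-flight or cached */
	if (!r->ignore_cache && r->cached_uptodate)
		return false;   /* cached and clean: nothing to do */
	return true;            /* otherwise wait on the lock, re-check, submit */
}

int main(void)
{
	struct req sync_hit  = { .cached_uptodate = true };
	struct req ra_racing = { .readahead = true, .in_flight = true };

	printf("cached sync read submits?  %d\n", must_submit(&sync_hit));
	printf("racing readahead submits?  %d\n", must_submit(&ra_racing));
	return 0;
}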
/* * journal_commit_transaction * * The primary function for committing a transaction to the log. This * function is called by the journal thread to begin a complete commit. */ void journal_commit_transaction(journal_t *journal) { transaction_t *commit_transaction; struct journal_head *jh, *new_jh, *descriptor; struct buffer_head *wbuf[64]; int bufs; int flags; int err; unsigned long blocknr; char *tagp = NULL; journal_header_t *header; journal_block_tag_t *tag = NULL; int space_left = 0; int first_tag = 0; int tag_flag; int i; /* * First job: lock down the current transaction and wait for * all outstanding updates to complete. */ #ifdef COMMIT_STATS spin_lock(&journal->j_list_lock); summarise_journal_usage(journal); spin_unlock(&journal->j_list_lock); #endif /* Do we need to erase the effects of a prior journal_flush? */ if (journal->j_flags & JFS_FLUSHED) { jbd_debug(3, "super block updated\n"); journal_update_superblock(journal, 1); } else { jbd_debug(3, "superblock not updated\n"); } J_ASSERT(journal->j_running_transaction != NULL); J_ASSERT(journal->j_committing_transaction == NULL); commit_transaction = journal->j_running_transaction; J_ASSERT(commit_transaction->t_state == T_RUNNING); jbd_debug(1, "JBD: starting commit of transaction %d\n", commit_transaction->t_tid); spin_lock(&journal->j_state_lock); commit_transaction->t_state = T_LOCKED; spin_lock(&commit_transaction->t_handle_lock); while (commit_transaction->t_updates) { DEFINE_WAIT(wait); prepare_to_wait(&journal->j_wait_updates, &wait, TASK_UNINTERRUPTIBLE); if (commit_transaction->t_updates) { spin_unlock(&commit_transaction->t_handle_lock); spin_unlock(&journal->j_state_lock); schedule(); spin_lock(&journal->j_state_lock); spin_lock(&commit_transaction->t_handle_lock); } finish_wait(&journal->j_wait_updates, &wait); } spin_unlock(&commit_transaction->t_handle_lock); J_ASSERT (commit_transaction->t_outstanding_credits <= journal->j_max_transaction_buffers); /* * First thing we are allowed to do is to discard any remaining * BJ_Reserved buffers. Note, it is _not_ permissible to assume * that there are no such buffers: if a large filesystem * operation like a truncate needs to split itself over multiple * transactions, then it may try to do a journal_restart() while * there are still BJ_Reserved buffers outstanding. These must * be released cleanly from the current transaction. * * In this case, the filesystem must still reserve write access * again before modifying the buffer in the new transaction, but * we do not require it to remember exactly which old buffers it * has reserved. This is consistent with the existing behaviour * that multiple journal_get_write_access() calls to the same * buffer are perfectly permissable. */ while (commit_transaction->t_reserved_list) { jh = commit_transaction->t_reserved_list; JBUFFER_TRACE(jh, "reserved, unused: refile"); /* * A journal_get_undo_access()+journal_release_buffer() may * leave undo-committed data. */ if (jh->b_committed_data) { struct buffer_head *bh = jh2bh(jh); jbd_lock_bh_state(bh); if (jh->b_committed_data) { kfree(jh->b_committed_data); jh->b_committed_data = NULL; } jbd_unlock_bh_state(bh); } journal_refile_buffer(journal, jh); } /* * Now try to drop any written-back buffers from the journal's * checkpoint lists. We do this *before* commit because it potentially * frees some memory */ spin_lock(&journal->j_list_lock); __journal_clean_checkpoint_list(journal); spin_unlock(&journal->j_list_lock); jbd_debug (3, "JBD: commit phase 1\n"); /* * Switch to a new revoke table. 
*/ journal_switch_revoke_table(journal); commit_transaction->t_state = T_FLUSH; journal->j_committing_transaction = commit_transaction; journal->j_running_transaction = NULL; commit_transaction->t_log_start = journal->j_head; wake_up(&journal->j_wait_transaction_locked); spin_unlock(&journal->j_state_lock); jbd_debug (3, "JBD: commit phase 2\n"); /* * Now start flushing things to disk, in the order they appear * on the transaction lists. Data blocks go first. */ err = 0; /* * Whenever we unlock the journal and sleep, things can get added * onto ->t_sync_datalist, so we have to keep looping back to * write_out_data until we *know* that the list is empty. */ bufs = 0; /* * Cleanup any flushed data buffers from the data list. Even in * abort mode, we want to flush this out as soon as possible. */ write_out_data: cond_resched(); spin_lock(&journal->j_list_lock); while (commit_transaction->t_sync_datalist) { struct buffer_head *bh; jh = commit_transaction->t_sync_datalist; commit_transaction->t_sync_datalist = jh->b_tnext; bh = jh2bh(jh); if (buffer_locked(bh)) { BUFFER_TRACE(bh, "locked"); if (!inverted_lock(journal, bh)) goto write_out_data; __journal_unfile_buffer(jh); __journal_file_buffer(jh, commit_transaction, BJ_Locked); jbd_unlock_bh_state(bh); if (lock_need_resched(&journal->j_list_lock)) { spin_unlock(&journal->j_list_lock); goto write_out_data; } } else { if (buffer_dirty(bh)) { BUFFER_TRACE(bh, "start journal writeout"); get_bh(bh); wbuf[bufs++] = bh; if (bufs == ARRAY_SIZE(wbuf)) { jbd_debug(2, "submit %d writes\n", bufs); spin_unlock(&journal->j_list_lock); ll_rw_block(WRITE, bufs, wbuf); journal_brelse_array(wbuf, bufs); bufs = 0; goto write_out_data; } } else { BUFFER_TRACE(bh, "writeout complete: unfile"); if (!inverted_lock(journal, bh)) goto write_out_data; __journal_unfile_buffer(jh); jbd_unlock_bh_state(bh); journal_remove_journal_head(bh); put_bh(bh); if (lock_need_resched(&journal->j_list_lock)) { spin_unlock(&journal->j_list_lock); goto write_out_data; } } } } if (bufs) { spin_unlock(&journal->j_list_lock); ll_rw_block(WRITE, bufs, wbuf); journal_brelse_array(wbuf, bufs); spin_lock(&journal->j_list_lock); } /* * Wait for all previously submitted IO to complete. */ while (commit_transaction->t_locked_list) { struct buffer_head *bh; jh = commit_transaction->t_locked_list->b_tprev; bh = jh2bh(jh); get_bh(bh); if (buffer_locked(bh)) { spin_unlock(&journal->j_list_lock); wait_on_buffer(bh); if (unlikely(!buffer_uptodate(bh))) err = -EIO; spin_lock(&journal->j_list_lock); } if (!inverted_lock(journal, bh)) { put_bh(bh); spin_lock(&journal->j_list_lock); continue; } if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) { __journal_unfile_buffer(jh); jbd_unlock_bh_state(bh); journal_remove_journal_head(bh); put_bh(bh); } else { jbd_unlock_bh_state(bh); } put_bh(bh); // cond_resched_lock(&journal->j_list_lock); } spin_unlock(&journal->j_list_lock); journal_write_revoke_records(journal, commit_transaction); jbd_debug(3, "JBD: commit phase 2\n"); /* * If we found any dirty or locked buffers, then we should have * looped back up to the write_out_data label. If there weren't * any then journal_clean_data_list should have wiped the list * clean by now, so check that it is in fact empty. */ J_ASSERT (commit_transaction->t_sync_datalist == NULL); jbd_debug (3, "JBD: commit phase 3\n"); /* * Way to go: we have now written out all of the data for a * transaction! Now comes the tricky part: we need to write out * metadata. 
Loop over the transaction's entire buffer list: */ commit_transaction->t_state = T_COMMIT; descriptor = NULL; bufs = 0; while (commit_transaction->t_buffers) { /* Find the next buffer to be journaled... */ jh = commit_transaction->t_buffers; /* If we're in abort mode, we just un-journal the buffer and release it for background writing. */ if (is_journal_aborted(journal)) { JBUFFER_TRACE(jh, "journal is aborting: refile"); journal_refile_buffer(journal, jh); /* If that was the last one, we need to clean up * any descriptor buffers which may have been * already allocated, even if we are now * aborting. */ if (!commit_transaction->t_buffers) goto start_journal_io; continue; } /* Make sure we have a descriptor block in which to record the metadata buffer. */ if (!descriptor) { struct buffer_head *bh; J_ASSERT (bufs == 0); jbd_debug(4, "JBD: get descriptor\n"); descriptor = journal_get_descriptor_buffer(journal); if (!descriptor) { __journal_abort_hard(journal); continue; } bh = jh2bh(descriptor); jbd_debug(4, "JBD: got buffer %llu (%p)\n", (unsigned long long)bh->b_blocknr, bh->b_data); header = (journal_header_t *)&bh->b_data[0]; header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK); header->h_sequence = cpu_to_be32(commit_transaction->t_tid); tagp = &bh->b_data[sizeof(journal_header_t)]; space_left = bh->b_size - sizeof(journal_header_t); first_tag = 1; set_buffer_jwrite(bh); set_buffer_dirty(bh); wbuf[bufs++] = bh; /* Record it so that we can wait for IO completion later */ BUFFER_TRACE(bh, "ph3: file as descriptor"); journal_file_buffer(descriptor, commit_transaction, BJ_LogCtl); } /* Where is the buffer to be written? */ err = journal_next_log_block(journal, &blocknr); /* If the block mapping failed, just abandon the buffer and repeat this loop: we'll fall into the refile-on-abort condition above. */ if (err) { __journal_abort_hard(journal); continue; } /* * start_this_handle() uses t_outstanding_credits to determine * the free space in the log, but this counter is changed * by journal_next_log_block() also. */ commit_transaction->t_outstanding_credits--; /* Bump b_count to prevent truncate from stumbling over the shadowed buffer! @@@ This can go if we ever get rid of the BJ_IO/BJ_Shadow pairing of buffers. */ atomic_inc(&jh2bh(jh)->b_count); /* Make a temporary IO buffer with which to write it out (this will requeue both the metadata buffer and the temporary IO buffer). new_bh goes on BJ_IO*/ set_bit(BH_JWrite, &jh2bh(jh)->b_state); /* * akpm: journal_write_metadata_buffer() sets * new_bh->b_transaction to commit_transaction. * We need to clean this up before we release new_bh * (which is of type BJ_IO) */ JBUFFER_TRACE(jh, "ph3: write metadata"); flags = journal_write_metadata_buffer(commit_transaction, jh, &new_jh, blocknr); set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); wbuf[bufs++] = jh2bh(new_jh); /* Record the new block's tag in the current descriptor buffer */ tag_flag = 0; if (flags & 1) tag_flag |= JFS_FLAG_ESCAPE; if (!first_tag) tag_flag |= JFS_FLAG_SAME_UUID; tag = (journal_block_tag_t *) tagp; tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr); tag->t_flags = cpu_to_be32(tag_flag); tagp += sizeof(journal_block_tag_t); space_left -= sizeof(journal_block_tag_t); if (first_tag) { memcpy (tagp, journal->j_uuid, 16); tagp += 16; space_left -= 16; first_tag = 0; } /* If there's no more to do, or if the descriptor is full, let the IO rip! 
*/ if (bufs == ARRAY_SIZE(wbuf) || commit_transaction->t_buffers == NULL || space_left < sizeof(journal_block_tag_t) + 16) { jbd_debug(4, "JBD: Submit %d IOs\n", bufs); /* Write an end-of-descriptor marker before submitting the IOs. "tag" still points to the last tag we set up. */ tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG); start_journal_io: for (i = 0; i < bufs; i++) { struct buffer_head *bh = wbuf[i]; lock_buffer(bh); clear_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = journal_end_buffer_io_sync; submit_bh(WRITE, bh); } cond_resched(); /* Force a new descriptor to be generated next time round the loop. */ descriptor = NULL; bufs = 0; } } /* Lo and behold: we have just managed to send a transaction to the log. Before we can commit it, wait for the IO so far to complete. Control buffers being written are on the transaction's t_log_list queue, and metadata buffers are on the t_iobuf_list queue. Wait for the buffers in reverse order. That way we are less likely to be woken up until all IOs have completed, and so we incur less scheduling load. */ jbd_debug(3, "JBD: commit phase 4\n"); /* * akpm: these are BJ_IO, and j_list_lock is not needed. * See __journal_try_to_free_buffer. */ wait_for_iobuf: while (commit_transaction->t_iobuf_list != NULL) { struct buffer_head *bh; jh = commit_transaction->t_iobuf_list->b_tprev; bh = jh2bh(jh); if (buffer_locked(bh)) { wait_on_buffer(bh); goto wait_for_iobuf; } if (cond_resched()) goto wait_for_iobuf; if (unlikely(!buffer_uptodate(bh))) err = -EIO; clear_buffer_jwrite(bh); JBUFFER_TRACE(jh, "ph4: unfile after journal write"); journal_unfile_buffer(journal, jh); /* * ->t_iobuf_list should contain only dummy buffer_heads * which were created by journal_write_metadata_buffer(). */ BUFFER_TRACE(bh, "dumping temporary bh"); journal_put_journal_head(jh); __brelse(bh); J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0); free_buffer_head(bh); /* We also have to unlock and free the corresponding shadowed buffer */ jh = commit_transaction->t_shadow_list->b_tprev; bh = jh2bh(jh); clear_bit(BH_JWrite, &bh->b_state); J_ASSERT_BH(bh, buffer_jbddirty(bh)); /* The metadata is now released for reuse, but we need to remember it against this transaction so that when we finally commit, we can do any checkpointing required. */ JBUFFER_TRACE(jh, "file as BJ_Forget"); journal_file_buffer(jh, commit_transaction, BJ_Forget); /* Wake up any transactions which were waiting for this IO to complete */ wake_up_bit(&bh->b_state, BH_Unshadow); JBUFFER_TRACE(jh, "brelse shadowed buffer"); __brelse(bh); } J_ASSERT (commit_transaction->t_shadow_list == NULL); jbd_debug(3, "JBD: commit phase 5\n"); /* Here we wait for the revoke record and descriptor record buffers */ wait_for_ctlbuf: while (commit_transaction->t_log_list != NULL) { struct buffer_head *bh; jh = commit_transaction->t_log_list->b_tprev; bh = jh2bh(jh); if (buffer_locked(bh)) { wait_on_buffer(bh); goto wait_for_ctlbuf; } if (cond_resched()) goto wait_for_ctlbuf; if (unlikely(!buffer_uptodate(bh))) err = -EIO; BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); clear_buffer_jwrite(bh); journal_unfile_buffer(journal, jh); journal_put_journal_head(jh); __brelse(bh); /* One for getblk */ /* AKPM: bforget here */ } jbd_debug(3, "JBD: commit phase 6\n"); if (is_journal_aborted(journal)) goto skip_commit; /* Done it all: now write the commit record. We should have * cleaned up our previous buffers by now, so if we are in abort * mode we can now just skip the rest of the journal write * entirely. 
*/ descriptor = journal_get_descriptor_buffer(journal); if (!descriptor) { __journal_abort_hard(journal); goto skip_commit; } /* Write the commit record header into each 512-byte sector of the descriptor block. */ for (i = 0; i < jh2bh(descriptor)->b_size; i += 512) { journal_header_t *tmp = (journal_header_t *)(jh2bh(descriptor)->b_data + i); tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK); tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); } JBUFFER_TRACE(descriptor, "write commit block"); { struct buffer_head *bh = jh2bh(descriptor); int ret; int barrier_done = 0; set_buffer_dirty(bh); if (journal->j_flags & JFS_BARRIER) { set_buffer_ordered(bh); barrier_done = 1; } ret = sync_dirty_buffer(bh); /* is it possible for another commit to fail at roughly * the same time as this one? If so, we don't want to * trust the barrier flag in the super, but instead want * to remember if we sent a barrier request */ if (ret == -EOPNOTSUPP && barrier_done) { char b[BDEVNAME_SIZE]; printk(KERN_WARNING "JBD: barrier-based sync failed on %s - " "disabling barriers\n", bdevname(journal->j_dev, b)); spin_lock(&journal->j_state_lock); journal->j_flags &= ~JFS_BARRIER; spin_unlock(&journal->j_state_lock); /* And try again, without the barrier */ clear_buffer_ordered(bh); set_buffer_uptodate(bh); set_buffer_dirty(bh); ret = sync_dirty_buffer(bh); } if (unlikely(ret == -EIO)) err = -EIO; put_bh(bh); /* One for getblk() */ journal_put_journal_head(descriptor); } /* End of a transaction! Finally, we can do checkpoint processing: any buffers committed as a result of this transaction can be removed from any checkpoint list they were on before. */ skip_commit: /* The journal should be unlocked by now. */ if (err) __journal_abort_hard(journal); jbd_debug(3, "JBD: commit phase 7\n"); J_ASSERT(commit_transaction->t_sync_datalist == NULL); J_ASSERT(commit_transaction->t_buffers == NULL); J_ASSERT(commit_transaction->t_checkpoint_list == NULL); J_ASSERT(commit_transaction->t_iobuf_list == NULL); J_ASSERT(commit_transaction->t_shadow_list == NULL); J_ASSERT(commit_transaction->t_log_list == NULL); restart_loop: while (commit_transaction->t_forget) { transaction_t *cp_transaction; struct buffer_head *bh; jh = commit_transaction->t_forget; bh = jh2bh(jh); jbd_lock_bh_state(bh); J_ASSERT_JH(jh, jh->b_transaction == commit_transaction || jh->b_transaction == journal->j_running_transaction); /* * If there is undo-protected committed data against * this buffer, then we can remove it now. If it is a * buffer needing such protection, the old frozen_data * field now points to a committed version of the * buffer, so rotate that field to the new committed * data. * * Otherwise, we can just throw away the frozen data now. */ if (jh->b_committed_data) { kfree(jh->b_committed_data); jh->b_committed_data = NULL; if (jh->b_frozen_data) { jh->b_committed_data = jh->b_frozen_data; jh->b_frozen_data = NULL; } } else if (jh->b_frozen_data) { kfree(jh->b_frozen_data); jh->b_frozen_data = NULL; } spin_lock(&journal->j_list_lock); cp_transaction = jh->b_cp_transaction; if (cp_transaction) { JBUFFER_TRACE(jh, "remove from old cp transaction"); __journal_remove_checkpoint(jh); } /* Only re-checkpoint the buffer_head if it is marked * dirty. If the buffer was added to the BJ_Forget list * by journal_forget, it may no longer be dirty and * there's no point in keeping a checkpoint record for * it.
*/ /* A buffer which has been freed while still being * journaled by a previous transaction may end up still * being dirty here, but we want to avoid writing back * that buffer in the future now that the last use has * been committed. That's not only a performance gain, * it also stops aliasing problems if the buffer is left * behind for writeback and gets reallocated for another * use in a different page. */ if (buffer_freed(bh)) { clear_buffer_freed(bh); clear_buffer_jbddirty(bh); } if (buffer_jbddirty(bh)) { JBUFFER_TRACE(jh, "add to new checkpointing trans"); __journal_insert_checkpoint(jh, commit_transaction); JBUFFER_TRACE(jh, "refile for checkpoint writeback"); __journal_refile_buffer(jh); jbd_unlock_bh_state(bh); } else { J_ASSERT_BH(bh, !buffer_dirty(bh)); J_ASSERT_JH(jh, jh->b_next_transaction == NULL); __journal_unfile_buffer(jh); jbd_unlock_bh_state(bh); journal_remove_journal_head(bh); /* needs a brelse */ release_buffer_page(bh); } spin_unlock(&journal->j_list_lock); if (cond_resched()) goto restart_loop; } /* Done with this transaction! */ jbd_debug(3, "JBD: commit phase 8\n"); J_ASSERT(commit_transaction->t_state == T_COMMIT); /* * This is a bit sleazy. We borrow j_list_lock to protect * journal->j_committing_transaction in __journal_remove_checkpoint. * Really, __journal_remove_checkpoint should be using j_state_lock but * it's a bit of a hassle to hold that across __journal_remove_checkpoint */ spin_lock(&journal->j_state_lock); spin_lock(&journal->j_list_lock); commit_transaction->t_state = T_FINISHED; J_ASSERT(commit_transaction == journal->j_committing_transaction); journal->j_commit_sequence = commit_transaction->t_tid; journal->j_committing_transaction = NULL; spin_unlock(&journal->j_state_lock); if (commit_transaction->t_checkpoint_list == NULL) { __journal_drop_transaction(journal, commit_transaction); } else { if (journal->j_checkpoint_transactions == NULL) { journal->j_checkpoint_transactions = commit_transaction; commit_transaction->t_cpnext = commit_transaction; commit_transaction->t_cpprev = commit_transaction; } else { commit_transaction->t_cpnext = journal->j_checkpoint_transactions; commit_transaction->t_cpprev = commit_transaction->t_cpnext->t_cpprev; commit_transaction->t_cpnext->t_cpprev = commit_transaction; commit_transaction->t_cpprev->t_cpnext = commit_transaction; } } spin_unlock(&journal->j_list_lock); jbd_debug(1, "JBD: commit %d complete, head %d\n", journal->j_commit_sequence, journal->j_tail_sequence); wake_up(&journal->j_wait_done_commit); }
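/*
 * Illustrative sketch (not part of the original source): the commit-record
 * write above falls back from an ordered (barrier) write to a plain write
 * when the device reports -EOPNOTSUPP.  The helper below isolates that
 * pattern for clarity; the function name and the use_barriers flag are
 * hypothetical, but it relies only on the same buffer_head calls used by
 * the code above.
 */
static int example_sync_with_barrier_fallback(struct buffer_head *bh,
					      int *use_barriers)
{
	int ret;

	set_buffer_dirty(bh);
	if (*use_barriers)
		set_buffer_ordered(bh);

	ret = sync_dirty_buffer(bh);
	if (ret == -EOPNOTSUPP && *use_barriers) {
		/* The device rejected the ordered write: remember that
		 * barriers do not work here and retry as a normal write. */
		*use_barriers = 0;
		clear_buffer_ordered(bh);
		set_buffer_dirty(bh);
		ret = sync_dirty_buffer(bh);
	}
	return ret;
}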
int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, struct buffer_head *bhs[], int flags, int (*validate)(struct super_block *sb, struct buffer_head *bh)) { int status = 0; int i, ignore_cache = 0; struct buffer_head *bh; struct super_block *sb = ocfs2_metadata_cache_get_super(ci); trace_ocfs2_read_blocks_begin(ci, (unsigned long long)block, nr, flags); BUG_ON(!ci); BUG_ON((flags & OCFS2_BH_READAHEAD) && (flags & OCFS2_BH_IGNORE_CACHE)); if (bhs == NULL) { status = -EINVAL; mlog_errno(status); goto bail; } if (nr < 0) { mlog(ML_ERROR, "asked to read %d blocks!\n", nr); status = -EINVAL; mlog_errno(status); goto bail; } if (nr == 0) { status = 0; goto bail; } ocfs2_metadata_cache_io_lock(ci); for (i = 0 ; i < nr ; i++) { if (bhs[i] == NULL) { bhs[i] = sb_getblk(sb, block++); if (bhs[i] == NULL) { ocfs2_metadata_cache_io_unlock(ci); status = -ENOMEM; mlog_errno(status); goto bail; } } bh = bhs[i]; ignore_cache = (flags & OCFS2_BH_IGNORE_CACHE); /* There are three read-ahead cases here which we need to * be concerned with. All three assume a buffer has * previously been submitted with OCFS2_BH_READAHEAD * and it hasn't yet completed I/O. * * 1) The current request is sync to disk. This rarely * happens these days, and never when performance * matters - the code can just wait on the buffer * lock and re-submit. * * 2) The current request is cached, but not * readahead. ocfs2_buffer_uptodate() will return * false anyway, so we'll wind up waiting on the * buffer lock to do I/O. We re-check the request * after getting the lock to avoid a re-submit. * * 3) The current request is readahead (and so must * also be a caching one). We short circuit if the * buffer is locked (under I/O) and if it's in the * uptodate cache. The re-check from #2 catches the * case that the previous read-ahead completes just * before our is-it-in-flight check. */ if (!ignore_cache && !ocfs2_buffer_uptodate(ci, bh)) { trace_ocfs2_read_blocks_from_disk( (unsigned long long)bh->b_blocknr, (unsigned long long)ocfs2_metadata_cache_owner(ci)); /* We're using ignore_cache here to say * "go to disk" */ ignore_cache = 1; } trace_ocfs2_read_blocks_bh((unsigned long long)bh->b_blocknr, ignore_cache, buffer_jbd(bh), buffer_dirty(bh)); if (buffer_jbd(bh)) { continue; } if (ignore_cache) { if (buffer_dirty(bh)) { /* This should probably be a BUG, or * at least return an error. */ continue; } /* A read-ahead request was made - if the * buffer is already under read-ahead from a * previously submitted request then we are * done here. */ if ((flags & OCFS2_BH_READAHEAD) && ocfs2_buffer_read_ahead(ci, bh)) continue; lock_buffer(bh); if (buffer_jbd(bh)) { #ifdef CATCH_BH_JBD_RACES mlog(ML_ERROR, "block %llu had the JBD bit set " "while I was in lock_buffer!", (unsigned long long)bh->b_blocknr); BUG(); #else unlock_buffer(bh); continue; #endif } /* Re-check ocfs2_buffer_uptodate() as a * previously read-ahead buffer may have * completed I/O while we were waiting for the * buffer lock.
*/ if (!(flags & OCFS2_BH_IGNORE_CACHE) && !(flags & OCFS2_BH_READAHEAD) && ocfs2_buffer_uptodate(ci, bh)) { unlock_buffer(bh); continue; } clear_buffer_uptodate(bh); get_bh(bh); /* for end_buffer_read_sync() */ if (validate) set_buffer_needs_validate(bh); bh->b_end_io = end_buffer_read_sync; submit_bh(REQ_OP_READ, 0, bh); continue; } } status = 0; for (i = (nr - 1); i >= 0; i--) { bh = bhs[i]; if (!(flags & OCFS2_BH_READAHEAD)) { if (status) { /* Clear the rest of the buffers on error */ put_bh(bh); bhs[i] = NULL; continue; } /* We know this can't have changed as we hold the * owner sem. Avoid doing any work on the bh if the * journal has it. */ if (!buffer_jbd(bh)) wait_on_buffer(bh); if (!buffer_uptodate(bh)) { /* Status won't be cleared from here on out, * so we can safely record this and loop back * to cleanup the other buffers. Don't need to * remove the clustered uptodate information * for this bh as it's not marked locally * uptodate. */ status = -EIO; put_bh(bh); bhs[i] = NULL; continue; } if (buffer_needs_validate(bh)) { /* We never set NeedsValidate if the * buffer was held by the journal, so * that better not have changed */ BUG_ON(buffer_jbd(bh)); clear_buffer_needs_validate(bh); status = validate(sb, bh); if (status) { put_bh(bh); bhs[i] = NULL; continue; } } } /* Always set the buffer in the cache, even if it was * a forced read, or read-ahead which hasn't yet * completed. */ ocfs2_set_buffer_uptodate(ci, bh); } ocfs2_metadata_cache_io_unlock(ci); trace_ocfs2_read_blocks_end((unsigned long long)block, nr, flags, ignore_cache); bail: return status; }
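/*
 * Illustrative sketch (not part of the original source): two typical ways a
 * caller might drive ocfs2_read_blocks() given the flag semantics described
 * above.  The wrapper names and the validate callback are hypothetical; only
 * the ocfs2_read_blocks() signature shown above is assumed.
 */
static int example_read_block_cached(struct ocfs2_caching_info *ci, u64 blkno,
				     struct buffer_head **bh,
				     int (*validate)(struct super_block *sb,
						     struct buffer_head *bh))
{
	/* Cached, synchronous read: waits for I/O and runs @validate. */
	return ocfs2_read_blocks(ci, blkno, 1, bh, 0, validate);
}

static void example_readahead_block(struct ocfs2_caching_info *ci, u64 blkno,
				    struct buffer_head **bh)
{
	/* Opportunistic read-ahead: does not wait for I/O completion and
	 * must never be combined with OCFS2_BH_IGNORE_CACHE (the function
	 * BUG()s on that combination). */
	(void)ocfs2_read_blocks(ci, blkno, 1, bh, OCFS2_BH_READAHEAD, NULL);
}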
/* * Clean up a transaction's checkpoint list. * * We wait for any pending IO to complete and make sure any clean * buffers are removed from the transaction. * * Return 1 if we performed any actions which might have destroyed the * checkpoint. (journal_remove_checkpoint() deletes the transaction when * the last checkpoint buffer is cleansed) * * Called with j_list_lock held. */ static int __cleanup_transaction(journal_t *journal, transaction_t *transaction) { struct journal_head *jh, *next_jh, *last_jh; struct buffer_head *bh; int ret = 0; assert_spin_locked(&journal->j_list_lock); jh = transaction->t_checkpoint_list; if (!jh) return 0; last_jh = jh->b_cpprev; next_jh = jh; do { jh = next_jh; bh = jh2bh(jh); if (buffer_locked(bh)) { atomic_inc(&bh->b_count); spin_unlock(&journal->j_list_lock); wait_on_buffer(bh); /* the journal_head may have gone by now */ BUFFER_TRACE(bh, "brelse"); __brelse(bh); goto out_return_1; } /* * This is foul */ if (!jbd_trylock_bh_state(bh)) { jbd_sync_bh(journal, bh); goto out_return_1; } if (jh->b_transaction != NULL) { transaction_t *t = jh->b_transaction; tid_t tid = t->t_tid; spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); log_start_commit(journal, tid); log_wait_commit(journal, tid); goto out_return_1; } /* * AKPM: I think the buffer_jbddirty test is redundant - it * shouldn't have NULL b_transaction? */ next_jh = jh->b_cpnext; if (!buffer_dirty(bh) && !buffer_jbddirty(bh)) { BUFFER_TRACE(bh, "remove from checkpoint"); __journal_remove_checkpoint(jh); jbd_unlock_bh_state(bh); journal_remove_journal_head(bh); __brelse(bh); ret = 1; } else { jbd_unlock_bh_state(bh); } } while (jh != last_jh); return ret; out_return_1: spin_lock(&journal->j_list_lock); return 1; }
int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, sector_t pblocknr, int mode, int mode_flags, struct buffer_head **pbh, sector_t *submit_ptr) { struct buffer_head *bh; struct inode *inode = NILFS_BTNC_I(btnc); struct page *page; int err; bh = nilfs_grab_buffer(inode, btnc, blocknr, BIT(BH_NILFS_Node)); if (unlikely(!bh)) return -ENOMEM; err = -EEXIST; /* internal code */ page = bh->b_page; if (buffer_uptodate(bh) || buffer_dirty(bh)) goto found; if (pblocknr == 0) { pblocknr = blocknr; if (inode->i_ino != NILFS_DAT_INO) { struct the_nilfs *nilfs = inode->i_sb->s_fs_info; /* blocknr is a virtual block number */ err = nilfs_dat_translate(nilfs->ns_dat, blocknr, &pblocknr); if (unlikely(err)) { brelse(bh); goto out_locked; } } } if (mode_flags & REQ_RAHEAD) { if (pblocknr != *submit_ptr + 1 || !trylock_buffer(bh)) { err = -EBUSY; /* internal code */ brelse(bh); goto out_locked; } } else { /* mode == READ */ lock_buffer(bh); } if (buffer_uptodate(bh)) { unlock_buffer(bh); err = -EEXIST; /* internal code */ goto found; } set_buffer_mapped(bh); bh->b_bdev = inode->i_sb->s_bdev; bh->b_blocknr = pblocknr; /* set block address for read */ bh->b_end_io = end_buffer_read_sync; get_bh(bh); submit_bh(mode, mode_flags, bh); bh->b_blocknr = blocknr; /* set back to the given block address */ *submit_ptr = pblocknr; err = 0; found: *pbh = bh; out_locked: unlock_page(page); put_page(page); return err; }
int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, sector_t pblocknr, struct buffer_head **pbh, int newblk) { struct buffer_head *bh; struct inode *inode = NILFS_BTNC_I(btnc); int err; bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node); if (unlikely(!bh)) return -ENOMEM; err = -EEXIST; /* internal code */ if (newblk) { if (unlikely(buffer_mapped(bh) || buffer_uptodate(bh) || buffer_dirty(bh))) { brelse(bh); BUG(); } memset(bh->b_data, 0, 1 << inode->i_blkbits); bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev; bh->b_blocknr = blocknr; set_buffer_mapped(bh); set_buffer_uptodate(bh); goto found; } if (buffer_uptodate(bh) || buffer_dirty(bh)) goto found; if (pblocknr == 0) { pblocknr = blocknr; if (inode->i_ino != NILFS_DAT_INO) { struct inode *dat = nilfs_dat_inode(NILFS_I_NILFS(inode)); /* blocknr is a virtual block number */ err = nilfs_dat_translate(dat, blocknr, &pblocknr); if (unlikely(err)) { brelse(bh); goto out_locked; } } } lock_buffer(bh); if (buffer_uptodate(bh)) { unlock_buffer(bh); err = -EEXIST; /* internal code */ goto found; } set_buffer_mapped(bh); bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev; bh->b_blocknr = pblocknr; /* set block address for read */ bh->b_end_io = end_buffer_read_sync; get_bh(bh); submit_bh(READ, bh); bh->b_blocknr = blocknr; /* set back to the given block address */ err = 0; found: *pbh = bh; out_locked: unlock_page(bh->b_page); page_cache_release(bh->b_page); return err; }
/* * Clean up a transaction's checkpoint list. * * We wait for any pending IO to complete and make sure any clean * buffers are removed from the transaction. * * Return 1 if we performed any actions which might have destroyed the * checkpoint. (journal_remove_checkpoint() deletes the transaction when * the last checkpoint buffer is cleansed) * * Called with the journal locked. * Called with journal_datalist_lock held. */ static int __cleanup_transaction(journal_t *journal, transaction_t *transaction) { struct journal_head *jh, *next_jh, *last_jh; struct buffer_head *bh; int ret = 0; assert_spin_locked(&journal_datalist_lock); jh = transaction->t_checkpoint_list; if (!jh) return 0; last_jh = jh->b_cpprev; next_jh = jh; do { jh = next_jh; bh = jh2bh(jh); if (buffer_locked(bh)) { atomic_inc(&bh->b_count); spin_unlock(&journal_datalist_lock); unlock_journal(journal); wait_on_buffer(bh); /* the journal_head may have gone by now */ BUFFER_TRACE(bh, "brelse"); __brelse(bh); goto out_return_1; } if (jh->b_transaction != NULL) { transaction_t *transaction = jh->b_transaction; tid_t tid = transaction->t_tid; spin_unlock(&journal_datalist_lock); log_start_commit(journal, transaction); unlock_journal(journal); log_wait_commit(journal, tid); goto out_return_1; } /* * We used to test for (jh->b_list != BUF_CLEAN) here. * But unmap_underlying_metadata() can place buffer onto * BUF_CLEAN. Since refile_buffer() no longer takes buffers * off checkpoint lists, we cope with it here */ /* * AKPM: I think the buffer_jdirty test is redundant - it * shouldn't have NULL b_transaction? */ next_jh = jh->b_cpnext; if (!buffer_dirty(bh) && !buffer_jdirty(bh)) { BUFFER_TRACE(bh, "remove from checkpoint"); __journal_remove_checkpoint(jh); __journal_remove_journal_head(bh); refile_buffer(bh); __brelse(bh); ret = 1; } jh = next_jh; } while (jh != last_jh); return ret; out_return_1: lock_journal(journal); spin_lock(&journal_datalist_lock); return 1; }
static int ocfs2_read_locked_inode(struct inode *inode, struct ocfs2_find_inode_args *args) { struct super_block *sb; struct ocfs2_super *osb; struct ocfs2_dinode *fe; struct buffer_head *bh = NULL; int status, can_lock, lock_level = 0; u32 generation = 0; status = -EINVAL; sb = inode->i_sb; osb = OCFS2_SB(sb); /* * To improve performance of cold-cache inode stats, we take * the cluster lock here if possible. * * Generally, OCFS2 never trusts the contents of an inode * unless it's holding a cluster lock, so taking it here isn't * a correctness issue as much as it is a performance * improvement. * * There are three times when taking the lock is not a good idea: * * 1) During startup, before we have initialized the DLM. * * 2) If we are reading certain system files which never get * cluster locks (local alloc, truncate log). * * 3) If the process doing the iget() is responsible for * orphan dir recovery. We're holding the orphan dir lock and * can get into a deadlock with another process on another * node in ->delete_inode(). * * #1 and #2 can be simply solved by never taking the lock * here for system files (which are the only type we read * during mount). It's a heavier approach, but our main * concern is user-accessible files anyway. * * #3 works itself out because we'll eventually take the * cluster lock before trusting anything anyway. */ can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE) && !(args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) && !ocfs2_mount_local(osb); trace_ocfs2_read_locked_inode( (unsigned long long)OCFS2_I(inode)->ip_blkno, can_lock); /* * To maintain backwards compatibility with older versions of * ocfs2-tools, we still store the generation value for system * files. The only ones that actually matter to userspace are * the journals, but it's easier and inexpensive to just flag * all system files similarly. */ if (args->fi_flags & OCFS2_FI_FLAG_SYSFILE) generation = osb->fs_generation; ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_inode_lockres, OCFS2_LOCK_TYPE_META, generation, inode); ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres, OCFS2_LOCK_TYPE_OPEN, 0, inode); if (can_lock) { status = ocfs2_open_lock(inode); if (status) { make_bad_inode(inode); mlog_errno(status); return status; } status = ocfs2_inode_lock(inode, NULL, lock_level); if (status) { make_bad_inode(inode); mlog_errno(status); return status; } } if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) { status = ocfs2_try_open_lock(inode, 0); if (status) { make_bad_inode(inode); return status; } } if (can_lock) { if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_CHK) status = ocfs2_filecheck_read_inode_block_full(inode, &bh, OCFS2_BH_IGNORE_CACHE, 0); else if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_FIX) status = ocfs2_filecheck_read_inode_block_full(inode, &bh, OCFS2_BH_IGNORE_CACHE, 1); else status = ocfs2_read_inode_block_full(inode, &bh, OCFS2_BH_IGNORE_CACHE); } else { status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh); /* * If buffer is in jbd, then its checksum may not have been * computed as yet. */ if (!status && !buffer_jbd(bh)) { if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_CHK) status = ocfs2_filecheck_validate_inode_block( osb->sb, bh); else if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_FIX) status = ocfs2_filecheck_repair_inode_block( osb->sb, bh); else status = ocfs2_validate_inode_block( osb->sb, bh); } } if (status < 0) { mlog_errno(status); goto bail; } status = -EINVAL; fe = (struct ocfs2_dinode *) bh->b_data; /* * This is a code bug. 
Right now the caller needs to * understand whether it is asking for a system file inode or * not so the proper lock names can be built. */ mlog_bug_on_msg(!!(fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) != !!(args->fi_flags & OCFS2_FI_FLAG_SYSFILE), "Inode %llu: system file state is ambiguous\n", (unsigned long long)args->fi_blkno); if (S_ISCHR(le16_to_cpu(fe->i_mode)) || S_ISBLK(le16_to_cpu(fe->i_mode))) inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); ocfs2_populate_inode(inode, fe, 0); BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno)); if (buffer_dirty(bh) && !buffer_jbd(bh)) { if (can_lock) { ocfs2_inode_unlock(inode, lock_level); lock_level = 1; ocfs2_inode_lock(inode, NULL, lock_level); } status = ocfs2_write_block(osb, bh, INODE_CACHE(inode)); if (status < 0) { mlog_errno(status); goto bail; } } status = 0; bail: if (can_lock) ocfs2_inode_unlock(inode, lock_level); if (status < 0) make_bad_inode(inode); brelse(bh); return status; }
static int __process_buffer(journal_t *journal, struct journal_head *jh, int *batch_count, transaction_t *transaction) { struct buffer_head *bh = jh2bh(jh); int ret = 0; if (buffer_locked(bh)) { get_bh(bh); spin_unlock(&journal->j_list_lock); wait_on_buffer(bh); /* the journal_head may have gone by now */ BUFFER_TRACE(bh, "brelse"); __brelse(bh); ret = 1; } else if (jh->b_transaction != NULL) { transaction_t *t = jh->b_transaction; tid_t tid = t->t_tid; transaction->t_chp_stats.cs_forced_to_close++; spin_unlock(&journal->j_list_lock); if (unlikely(journal->j_flags & JBD2_UNMOUNT)) /* The journal thread is being shut down; waiting for this commit could block for a very long time. */ printk(KERN_ERR "JBD2: %s: " "Waiting for Godot: block %llu\n", journal->j_devname, (unsigned long long) bh->b_blocknr); jbd2_log_start_commit(journal, tid); jbd2_log_wait_commit(journal, tid); ret = 1; } else if (!buffer_dirty(bh)) { ret = 1; if (unlikely(buffer_write_io_error(bh))) ret = -EIO; get_bh(bh); BUFFER_TRACE(bh, "remove from checkpoint"); __jbd2_journal_remove_checkpoint(jh); spin_unlock(&journal->j_list_lock); __brelse(bh); } else { /* The buffer is dirty: queue it for batched writeback and flush the batch once it is full. */ BUFFER_TRACE(bh, "queue"); get_bh(bh); J_ASSERT_BH(bh, !buffer_jwrite(bh)); journal->j_chkpt_bhs[*batch_count] = bh; __buffer_relink_io(jh); transaction->t_chp_stats.cs_written++; (*batch_count)++; if (*batch_count == JBD2_NR_BATCH) { spin_unlock(&journal->j_list_lock); __flush_batch(journal, batch_count); ret = 1; } } return ret; }
static int do_get_write_access(handle_t *handle, struct journal_head *jh, int force_copy) { struct buffer_head *bh; transaction_t *transaction; journal_t *journal; int error; char *frozen_buffer = NULL; int need_copy = 0; if (is_handle_aborted(handle)) return -EROFS; transaction = handle->h_transaction; journal = transaction->t_journal; jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy); JBUFFER_TRACE(jh, "entry"); repeat: bh = jh2bh(jh); lock_buffer(bh); jbd_lock_bh_state(bh); if (buffer_dirty(bh)) { if (jh->b_transaction) { J_ASSERT_JH(jh, jh->b_transaction == transaction || jh->b_transaction == journal->j_committing_transaction); if (jh->b_next_transaction) J_ASSERT_JH(jh, jh->b_next_transaction == transaction); warn_dirty_buffer(bh); } JBUFFER_TRACE(jh, "Journalling dirty buffer"); clear_buffer_dirty(bh); set_buffer_jbddirty(bh); } unlock_buffer(bh); error = -EROFS; if (is_handle_aborted(handle)) { jbd_unlock_bh_state(bh); goto out; } error = 0; if (jh->b_transaction == transaction || jh->b_next_transaction == transaction) goto done; jh->b_modified = 0; if (jh->b_frozen_data) { JBUFFER_TRACE(jh, "has frozen data"); J_ASSERT_JH(jh, jh->b_next_transaction == NULL); jh->b_next_transaction = transaction; goto done; } if (jh->b_transaction && jh->b_transaction != transaction) { JBUFFER_TRACE(jh, "owned by older transaction"); J_ASSERT_JH(jh, jh->b_next_transaction == NULL); J_ASSERT_JH(jh, jh->b_transaction == journal->j_committing_transaction); if (jh->b_jlist == BJ_Shadow) { DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow); wait_queue_head_t *wqh; wqh = bit_waitqueue(&bh->b_state, BH_Unshadow); JBUFFER_TRACE(jh, "on shadow: sleep"); jbd_unlock_bh_state(bh); for ( ; ; ) { prepare_to_wait(wqh, &wait.wait, TASK_UNINTERRUPTIBLE); if (jh->b_jlist != BJ_Shadow) break; schedule(); } finish_wait(wqh, &wait.wait); goto repeat; } if (jh->b_jlist != BJ_Forget || force_copy) { JBUFFER_TRACE(jh, "generate frozen data"); if (!frozen_buffer) { JBUFFER_TRACE(jh, "allocate memory for buffer"); jbd_unlock_bh_state(bh); frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS); if (!frozen_buffer) { printk(KERN_EMERG "%s: OOM for frozen_buffer\n", __func__); JBUFFER_TRACE(jh, "oom!"); error = -ENOMEM; jbd_lock_bh_state(bh); goto done; } goto repeat; } jh->b_frozen_data = frozen_buffer; frozen_buffer = NULL; need_copy = 1; } jh->b_next_transaction = transaction; } /* * Finally, if the buffer is not journaled right now, we need to make * sure it doesn't get written to disk before the caller actually * commits the new data */ if (!jh->b_transaction) { JBUFFER_TRACE(jh, "no transaction"); J_ASSERT_JH(jh, !jh->b_next_transaction); JBUFFER_TRACE(jh, "file as BJ_Reserved"); spin_lock(&journal->j_list_lock); __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved); spin_unlock(&journal->j_list_lock); } done: if (need_copy) { struct page *page; int offset; char *source; J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)), "Possible IO failure.\n"); page = jh2bh(jh)->b_page; offset = offset_in_page(jh2bh(jh)->b_data); source = kmap_atomic(page); jbd2_buffer_frozen_trigger(jh, source + offset, jh->b_triggers); memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); kunmap_atomic(source); jh->b_frozen_triggers = jh->b_triggers; } jbd_unlock_bh_state(bh); jbd2_journal_cancel_revoke(handle, jh); out: if (unlikely(frozen_buffer)) jbd2_free(frozen_buffer, bh->b_size); JBUFFER_TRACE(jh, "exit"); return error; }
int balance_internal(struct tree_balance *tb, int h, /* level of the tree */ int child_pos, /* key for insertion on higher level */ struct item_head *insert_key, /* node for insertion on higher level */ struct buffer_head **insert_ptr) { struct buffer_head *tbSh = PATH_H_PBUFFER(tb->tb_path, h); struct buffer_info bi; /* * we return this: it is 0 if there is no S[h], * else it is tb->S[h]->b_item_order */ int order; int insert_num, n, k; struct buffer_head *S_new; struct item_head new_insert_key; struct buffer_head *new_insert_ptr = NULL; struct item_head *new_insert_key_addr = insert_key; RFALSE(h < 1, "h (%d) can not be < 1 on internal level", h); PROC_INFO_INC(tb->tb_sb, balance_at[h]); order = (tbSh) ? PATH_H_POSITION(tb->tb_path, h + 1) /*tb->S[h]->b_item_order */ : 0; /* * Using insert_size[h] calculate the number insert_num of items * that must be inserted to or deleted from S[h]. */ insert_num = tb->insert_size[h] / ((int)(KEY_SIZE + DC_SIZE)); /* Check whether insert_num is proper * */ RFALSE(insert_num < -2 || insert_num > 2, "incorrect number of items inserted to the internal node (%d)", insert_num); RFALSE(h > 1 && (insert_num > 1 || insert_num < -1), "incorrect number of items (%d) inserted to the internal node on a level (h=%d) higher than last internal level", insert_num, h); /* Make balance in case insert_num < 0 */ if (insert_num < 0) { balance_internal_when_delete(tb, h, child_pos); return order; } k = 0; if (tb->lnum[h] > 0) { /* * shift lnum[h] items from S[h] to the left neighbor L[h]. * check how many of new items fall into L[h] or CFL[h] after * shifting */ n = B_NR_ITEMS(tb->L[h]); /* number of items in L[h] */ if (tb->lnum[h] <= child_pos) { /* new items don't fall into L[h] or CFL[h] */ internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, tb->lnum[h]); child_pos -= tb->lnum[h]; } else if (tb->lnum[h] > child_pos + insert_num) { /* all new items fall into L[h] */ internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, tb->lnum[h] - insert_num); /* insert insert_num keys and node-pointers into L[h] */ bi.tb = tb; bi.bi_bh = tb->L[h]; bi.bi_parent = tb->FL[h]; bi.bi_position = get_left_neighbor_position(tb, h); internal_insert_childs(&bi, /*tb->L[h], tb->S[h-1]->b_next */ n + child_pos + 1, insert_num, insert_key, insert_ptr); insert_num = 0; } else { struct disk_child *dc; /* * some items fall into L[h] or CFL[h], * but some don't fall */ internal_shift1_left(tb, h, child_pos + 1); /* calculate number of new items that fall into L[h] */ k = tb->lnum[h] - child_pos - 1; bi.tb = tb; bi.bi_bh = tb->L[h]; bi.bi_parent = tb->FL[h]; bi.bi_position = get_left_neighbor_position(tb, h); internal_insert_childs(&bi, /*tb->L[h], tb->S[h-1]->b_next, */ n + child_pos + 1, k, insert_key, insert_ptr); replace_lkey(tb, h, insert_key + k); /* * replace the first node-ptr in S[h] by * node-ptr to insert_ptr[k] */ dc = B_N_CHILD(tbSh, 0); put_dc_size(dc, MAX_CHILD_SIZE(insert_ptr[k]) - B_FREE_SPACE(insert_ptr[k])); put_dc_block_number(dc, insert_ptr[k]->b_blocknr); do_balance_mark_internal_dirty(tb, tbSh, 0); k++; insert_key += k; insert_ptr += k; insert_num -= k; child_pos = 0; } } /* tb->lnum[h] > 0 */ if (tb->rnum[h] > 0) { /*shift rnum[h] items from S[h] to the right neighbor R[h] */ /* * check how many of new items fall into R or CFR * after shifting */ n = B_NR_ITEMS(tbSh); /* number of items in S[h] */ if (n - tb->rnum[h] >= child_pos) /* new items fall into S[h] */ internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h, tb->rnum[h]); else if (n + insert_num - tb->rnum[h] < 
child_pos) { /* all new items fall into R[h] */ internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h, tb->rnum[h] - insert_num); /* insert insert_num keys and node-pointers into R[h] */ bi.tb = tb; bi.bi_bh = tb->R[h]; bi.bi_parent = tb->FR[h]; bi.bi_position = get_right_neighbor_position(tb, h); internal_insert_childs(&bi, /*tb->R[h],tb->S[h-1]->b_next */ child_pos - n - insert_num + tb->rnum[h] - 1, insert_num, insert_key, insert_ptr); insert_num = 0; } else { struct disk_child *dc; /* one of the items falls into CFR[h] */ internal_shift1_right(tb, h, n - child_pos + 1); /* calculate number of new items that fall into R[h] */ k = tb->rnum[h] - n + child_pos - 1; bi.tb = tb; bi.bi_bh = tb->R[h]; bi.bi_parent = tb->FR[h]; bi.bi_position = get_right_neighbor_position(tb, h); internal_insert_childs(&bi, /*tb->R[h], tb->R[h]->b_child, */ 0, k, insert_key + 1, insert_ptr + 1); replace_rkey(tb, h, insert_key + insert_num - k - 1); /* * replace the first node-ptr in R[h] by * node-ptr insert_ptr[insert_num-k-1] */ dc = B_N_CHILD(tb->R[h], 0); put_dc_size(dc, MAX_CHILD_SIZE(insert_ptr [insert_num - k - 1]) - B_FREE_SPACE(insert_ptr [insert_num - k - 1])); put_dc_block_number(dc, insert_ptr[insert_num - k - 1]->b_blocknr); do_balance_mark_internal_dirty(tb, tb->R[h], 0); insert_num -= (k + 1); } } /** Fill new node that appears instead of S[h] **/ RFALSE(tb->blknum[h] > 2, "blknum can not be > 2 for internal level"); RFALSE(tb->blknum[h] < 0, "blknum can not be < 0"); if (!tb->blknum[h]) { /* node S[h] is empty now */ RFALSE(!tbSh, "S[h] is equal NULL"); /* do what is needed for buffer thrown from tree */ reiserfs_invalidate_buffer(tb, tbSh); return order; } if (!tbSh) { /* create new root */ struct disk_child *dc; struct buffer_head *tbSh_1 = PATH_H_PBUFFER(tb->tb_path, h - 1); struct block_head *blkh; if (tb->blknum[h] != 1) reiserfs_panic(NULL, "ibalance-3", "One new node " "required for creating the new root"); /* S[h] = empty buffer from the list FEB. */ tbSh = get_FEB(tb); blkh = B_BLK_HEAD(tbSh); set_blkh_level(blkh, h + 1); /* Put the unique node-pointer to S[h] that points to S[h-1]. */ dc = B_N_CHILD(tbSh, 0); put_dc_block_number(dc, tbSh_1->b_blocknr); put_dc_size(dc, (MAX_CHILD_SIZE(tbSh_1) - B_FREE_SPACE(tbSh_1))); tb->insert_size[h] -= DC_SIZE; set_blkh_free_space(blkh, blkh_free_space(blkh) - DC_SIZE); do_balance_mark_internal_dirty(tb, tbSh, 0); /*&&&&&&&&&&&&&&&&&&&&&&&& */ check_internal(tbSh); /*&&&&&&&&&&&&&&&&&&&&&&&& */ /* put new root into path structure */ PATH_OFFSET_PBUFFER(tb->tb_path, ILLEGAL_PATH_ELEMENT_OFFSET) = tbSh; /* Change root in structure super block. 
*/ PUT_SB_ROOT_BLOCK(tb->tb_sb, tbSh->b_blocknr); PUT_SB_TREE_HEIGHT(tb->tb_sb, SB_TREE_HEIGHT(tb->tb_sb) + 1); do_balance_mark_sb_dirty(tb, REISERFS_SB(tb->tb_sb)->s_sbh, 1); } if (tb->blknum[h] == 2) { int snum; struct buffer_info dest_bi, src_bi; /* S_new = free buffer from list FEB */ S_new = get_FEB(tb); set_blkh_level(B_BLK_HEAD(S_new), h + 1); dest_bi.tb = tb; dest_bi.bi_bh = S_new; dest_bi.bi_parent = NULL; dest_bi.bi_position = 0; src_bi.tb = tb; src_bi.bi_bh = tbSh; src_bi.bi_parent = PATH_H_PPARENT(tb->tb_path, h); src_bi.bi_position = PATH_H_POSITION(tb->tb_path, h + 1); n = B_NR_ITEMS(tbSh); /* number of items in S[h] */ snum = (insert_num + n + 1) / 2; if (n - snum >= child_pos) { /* new items don't fall into S_new */ /* store the delimiting key for the next level */ /* new_insert_key = (n - snum)'th key in S[h] */ memcpy(&new_insert_key, internal_key(tbSh, n - snum), KEY_SIZE); /* last parameter is del_par */ internal_move_pointers_items(&dest_bi, &src_bi, LAST_TO_FIRST, snum, 0); } else if (n + insert_num - snum < child_pos) { /* all new items fall into S_new */ /* store the delimiting key for the next level */ /* * new_insert_key = (n + insert_item - snum)'th * key in S[h] */ memcpy(&new_insert_key, internal_key(tbSh, n + insert_num - snum), KEY_SIZE); /* last parameter is del_par */ internal_move_pointers_items(&dest_bi, &src_bi, LAST_TO_FIRST, snum - insert_num, 0); /* * insert insert_num keys and node-pointers * into S_new */ internal_insert_childs(&dest_bi, /*S_new,tb->S[h-1]->b_next, */ child_pos - n - insert_num + snum - 1, insert_num, insert_key, insert_ptr); insert_num = 0; } else { struct disk_child *dc; /* some items fall into S_new, but some don't fall */ /* last parameter is del_par */ internal_move_pointers_items(&dest_bi, &src_bi, LAST_TO_FIRST, n - child_pos + 1, 1); /* calculate number of new items that fall into S_new */ k = snum - n + child_pos - 1; internal_insert_childs(&dest_bi, /*S_new, */ 0, k, insert_key + 1, insert_ptr + 1); /* new_insert_key = insert_key[insert_num - k - 1] */ memcpy(&new_insert_key, insert_key + insert_num - k - 1, KEY_SIZE); /* * replace first node-ptr in S_new by node-ptr * to insert_ptr[insert_num-k-1] */ dc = B_N_CHILD(S_new, 0); put_dc_size(dc, (MAX_CHILD_SIZE (insert_ptr[insert_num - k - 1]) - B_FREE_SPACE(insert_ptr [insert_num - k - 1]))); put_dc_block_number(dc, insert_ptr[insert_num - k - 1]->b_blocknr); do_balance_mark_internal_dirty(tb, S_new, 0); insert_num -= (k + 1); } /* new_insert_ptr = node_pointer to S_new */ new_insert_ptr = S_new; RFALSE(!buffer_journaled(S_new) || buffer_journal_dirty(S_new) || buffer_dirty(S_new), "cm-00001: bad S_new (%b)", S_new); /* S_new is released in unfix_nodes */ } n = B_NR_ITEMS(tbSh); /*number of items in S[h] */ if (0 <= child_pos && child_pos <= n && insert_num > 0) { bi.tb = tb; bi.bi_bh = tbSh; bi.bi_parent = PATH_H_PPARENT(tb->tb_path, h); bi.bi_position = PATH_H_POSITION(tb->tb_path, h + 1); internal_insert_childs(&bi, /*tbSh, */ /* ( tb->S[h-1]->b_parent == tb->S[h] ) ? tb->S[h-1]->b_next : tb->S[h]->b_child->b_next, */ child_pos, insert_num, insert_key, insert_ptr); } memcpy(new_insert_key_addr, &new_insert_key, KEY_SIZE); insert_ptr[0] = new_insert_ptr; return order; }
static int __mpage_writepage(struct page *page, struct writeback_control *wbc, void *data) { struct mpage_data *mpd = data; struct bio *bio = mpd->bio; struct address_space *mapping = page->mapping; struct inode *inode = page->mapping->host; const unsigned blkbits = inode->i_blkbits; unsigned long end_index; const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits; sector_t last_block; sector_t block_in_file; sector_t blocks[MAX_BUF_PER_PAGE]; unsigned page_block; unsigned first_unmapped = blocks_per_page; struct block_device *bdev = NULL; int boundary = 0; sector_t boundary_block = 0; struct block_device *boundary_bdev = NULL; int length; struct buffer_head map_bh; loff_t i_size = i_size_read(inode); int ret = 0; int wr = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); if (page_has_buffers(page)) { struct buffer_head *head = page_buffers(page); struct buffer_head *bh = head; /* If they're all mapped and dirty, do it */ page_block = 0; do { BUG_ON(buffer_locked(bh)); if (!buffer_mapped(bh)) { /* * unmapped dirty buffers are created by * __set_page_dirty_buffers -> mmapped data */ if (buffer_dirty(bh)) goto confused; if (first_unmapped == blocks_per_page) first_unmapped = page_block; continue; } if (first_unmapped != blocks_per_page) goto confused; /* hole -> non-hole */ if (!buffer_dirty(bh) || !buffer_uptodate(bh)) goto confused; if (page_block) { if (bh->b_blocknr != blocks[page_block-1] + 1) goto confused; } blocks[page_block++] = bh->b_blocknr; boundary = buffer_boundary(bh); if (boundary) { boundary_block = bh->b_blocknr; boundary_bdev = bh->b_bdev; } bdev = bh->b_bdev; } while ((bh = bh->b_this_page) != head); if (first_unmapped) goto page_is_mapped; /* * Page has buffers, but they are all unmapped. The page was * created by pagein or read over a hole which was handled by * block_read_full_page(). If this address_space is also * using mpage_readpages then this can rarely happen. */ goto confused; } /* * The page has no buffers: map it to disk */ BUG_ON(!PageUptodate(page)); block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); last_block = (i_size - 1) >> blkbits; map_bh.b_page = page; for (page_block = 0; page_block < blocks_per_page; ) { map_bh.b_state = 0; map_bh.b_size = 1 << blkbits; if (mpd->get_block(inode, block_in_file, &map_bh, 1)) goto confused; if (buffer_new(&map_bh)) unmap_underlying_metadata(map_bh.b_bdev, map_bh.b_blocknr); if (buffer_boundary(&map_bh)) { boundary_block = map_bh.b_blocknr; boundary_bdev = map_bh.b_bdev; } if (page_block) { if (map_bh.b_blocknr != blocks[page_block-1] + 1) goto confused; } blocks[page_block++] = map_bh.b_blocknr; boundary = buffer_boundary(&map_bh); bdev = map_bh.b_bdev; if (block_in_file == last_block) break; block_in_file++; } BUG_ON(page_block == 0); first_unmapped = page_block; page_is_mapped: end_index = i_size >> PAGE_CACHE_SHIFT; if (page->index >= end_index) { /* * The page straddles i_size. It must be zeroed out on each * and every writepage invocation because it may be mmapped. * "A file is mapped in multiples of the page size. For a file * that is not a multiple of the page size, the remaining memory * is zeroed when mapped, and writes to that region are not * written out to the file." */ unsigned offset = i_size & (PAGE_CACHE_SIZE - 1); if (page->index > end_index || !offset) goto confused; zero_user_segment(page, offset, PAGE_CACHE_SIZE); } /* * This page will go to BIO. Do we need to send this BIO off first? 
*/ if (bio && mpd->last_block_in_bio != blocks[0] - 1) bio = mpage_bio_submit(wr, bio); alloc_new: if (bio == NULL) { if (first_unmapped == blocks_per_page) { if (!bdev_write_page(bdev, blocks[0] << (blkbits - 9), page, wbc)) { clean_buffers(page, first_unmapped); goto out; } } bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), bio_get_nr_vecs(bdev), GFP_NOFS|__GFP_HIGH); if (bio == NULL) goto confused; } /* * Must try to add the page before marking the buffer clean or * the confused fail path above (OOM) will be very confused when * it finds all bh marked clean (i.e. it will not write anything) */ length = first_unmapped << blkbits; if (bio_add_page(bio, page, length, 0) < length) { bio = mpage_bio_submit(wr, bio); goto alloc_new; } clean_buffers(page, first_unmapped); BUG_ON(PageWriteback(page)); set_page_writeback(page); unlock_page(page); if (boundary || (first_unmapped != blocks_per_page)) { bio = mpage_bio_submit(wr, bio); if (boundary_block) { write_boundary_block(boundary_bdev, boundary_block, 1 << blkbits); } } else { mpd->last_block_in_bio = blocks[blocks_per_page - 1]; } goto out; confused: if (bio) bio = mpage_bio_submit(wr, bio); if (mpd->use_writepage) { ret = mapping->a_ops->writepage(page, wbc); } else { ret = -EAGAIN; goto out; } /* * The caller has a ref on the inode, so *mapping is stable */ mapping_set_error(mapping, ret); out: mpd->bio = bio; return ret; }
/* * Submit all the data buffers to disk */ static int journal_submit_data_buffers(journal_t *journal, transaction_t *commit_transaction, int write_op) { struct journal_head *jh; struct buffer_head *bh; int locked; int bufs = 0; struct buffer_head **wbuf = journal->j_wbuf; int err = 0; /* * Whenever we unlock the journal and sleep, things can get added * onto ->t_sync_datalist, so we have to keep looping back to * write_out_data until we *know* that the list is empty. * * Cleanup any flushed data buffers from the data list. Even in * abort mode, we want to flush this out as soon as possible. */ write_out_data: cond_resched(); spin_lock(&journal->j_list_lock); while (commit_transaction->t_sync_datalist) { jh = commit_transaction->t_sync_datalist; bh = jh2bh(jh); locked = 0; /* Get reference just to make sure buffer does not disappear * when we are forced to drop various locks */ get_bh(bh); /* If the buffer is dirty, we need to submit IO and hence * we need the buffer lock. We try to lock the buffer without * blocking. If we fail, we need to drop j_list_lock and do * blocking lock_buffer(). */ if (buffer_dirty(bh)) { if (!trylock_buffer(bh)) { BUFFER_TRACE(bh, "needs blocking lock"); spin_unlock(&journal->j_list_lock); /* Write out all data to prevent deadlocks */ journal_do_submit_data(wbuf, bufs, write_op); bufs = 0; lock_buffer(bh); spin_lock(&journal->j_list_lock); } locked = 1; } /* We have to get bh_state lock. Again out of order, sigh. */ if (!inverted_lock(journal, bh)) { jbd_lock_bh_state(bh); spin_lock(&journal->j_list_lock); } /* Someone already cleaned up the buffer? */ if (!buffer_jbd(bh) || bh2jh(bh) != jh || jh->b_transaction != commit_transaction || jh->b_jlist != BJ_SyncData) { jbd_unlock_bh_state(bh); if (locked) unlock_buffer(bh); BUFFER_TRACE(bh, "already cleaned up"); release_data_buffer(bh); continue; } if (locked && test_clear_buffer_dirty(bh)) { BUFFER_TRACE(bh, "needs writeout, adding to array"); wbuf[bufs++] = bh; __journal_file_buffer(jh, commit_transaction, BJ_Locked); jbd_unlock_bh_state(bh); if (bufs == journal->j_wbufsize) { spin_unlock(&journal->j_list_lock); journal_do_submit_data(wbuf, bufs, write_op); bufs = 0; goto write_out_data; } } else if (!locked && buffer_locked(bh)) { __journal_file_buffer(jh, commit_transaction, BJ_Locked); jbd_unlock_bh_state(bh); put_bh(bh); } else { BUFFER_TRACE(bh, "writeout complete: unfile"); if (unlikely(!buffer_uptodate(bh))) err = -EIO; __journal_unfile_buffer(jh); jbd_unlock_bh_state(bh); if (locked) unlock_buffer(bh); release_data_buffer(bh); } if (lock_need_resched(&journal->j_list_lock)) { spin_unlock(&journal->j_list_lock); goto write_out_data; } } spin_unlock(&journal->j_list_lock); journal_do_submit_data(wbuf, bufs, write_op); return err; }
static void ext4_end_bio(struct bio *bio, int error) { ext4_io_end_t *io_end = bio->bi_private; struct workqueue_struct *wq; struct inode *inode; unsigned long flags; int i; BUG_ON(!io_end); bio->bi_private = NULL; bio->bi_end_io = NULL; if (test_bit(BIO_UPTODATE, &bio->bi_flags)) error = 0; bio_put(bio); for (i = 0; i < io_end->num_io_pages; i++) { struct page *page = io_end->pages[i]->p_page; struct buffer_head *bh, *head; int partial_write = 0; head = page_buffers(page); if (error) SetPageError(page); BUG_ON(!head); if (head->b_size == PAGE_CACHE_SIZE) clear_buffer_dirty(head); else { loff_t offset; loff_t io_end_offset = io_end->offset + io_end->size; offset = (sector_t) page->index << PAGE_CACHE_SHIFT; bh = head; do { if ((offset >= io_end->offset) && (offset+bh->b_size <= io_end_offset)) { if (error) buffer_io_error(bh); clear_buffer_dirty(bh); } if (buffer_delay(bh)) partial_write = 1; else if (!buffer_mapped(bh)) clear_buffer_dirty(bh); else if (buffer_dirty(bh)) partial_write = 1; offset += bh->b_size; bh = bh->b_this_page; } while (bh != head); } /* * If this is a partial write which happened to make * all buffers uptodate then we can optimize away a * bogus readpage() for the next read(). Here we * 'discover' whether the page went uptodate as a * result of this (potentially partial) write. */ if (!partial_write) SetPageUptodate(page); put_io_page(io_end->pages[i]); } io_end->num_io_pages = 0; inode = io_end->inode; if (error) { io_end->flag |= EXT4_IO_END_ERROR; ext4_warning(inode->i_sb, "I/O error writing to inode %lu " "(offset %llu size %ld starting block %llu)", inode->i_ino, (unsigned long long) io_end->offset, (long) io_end->size, (unsigned long long) bio->bi_sector >> (inode->i_blkbits - 9)); } /* Add the io_end to per-inode completed io list*/ spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list); spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq; /* queue the work to convert unwritten extents to written */ queue_work(wq, &io_end->work); }
/* * Since the page cache of B-tree node pages or data page cache of pseudo * inodes does not have a valid mapping->host pointer, calling * mark_buffer_dirty() for their buffers causes a NULL pointer dereference; * it calls __mark_inode_dirty(NULL) through __set_page_dirty(). * To avoid this problem, the old style mark_buffer_dirty() is used instead. */ void nilfs_mark_buffer_dirty(struct buffer_head *bh) { if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh)) __set_page_dirty_nobuffers(bh->b_page); }
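/*
 * Illustrative sketch (not part of the original source): a caller that has
 * just modified a B-tree node buffer would use the helper above rather than
 * mark_buffer_dirty(), since the node page's mapping->host is not a valid
 * inode.  The wrapper name is hypothetical.
 */
static void example_dirty_btnode(struct buffer_head *bh)
{
	/* Safe for buffers whose page belongs to the btnode or pseudo-inode
	 * page cache; dirties the page without touching mapping->host. */
	nilfs_mark_buffer_dirty(bh);
}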
int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, unsigned int nr, struct buffer_head *bhs[]) { int status = 0; unsigned int i; struct buffer_head *bh; trace_ocfs2_read_blocks_sync((unsigned long long)block, nr); if (!nr) goto bail; for (i = 0 ; i < nr ; i++) { if (bhs[i] == NULL) { bhs[i] = sb_getblk(osb->sb, block++); if (bhs[i] == NULL) { status = -ENOMEM; mlog_errno(status); goto bail; } } bh = bhs[i]; if (buffer_jbd(bh)) { trace_ocfs2_read_blocks_sync_jbd( (unsigned long long)bh->b_blocknr); continue; } if (buffer_dirty(bh)) { /* This should probably be a BUG, or * at least return an error. */ mlog(ML_ERROR, "trying to sync read a dirty " "buffer! (blocknr = %llu), skipping\n", (unsigned long long)bh->b_blocknr); continue; } lock_buffer(bh); if (buffer_jbd(bh)) { #ifdef CATCH_BH_JBD_RACES mlog(ML_ERROR, "block %llu had the JBD bit set " "while I was in lock_buffer!", (unsigned long long)bh->b_blocknr); BUG(); #else unlock_buffer(bh); continue; #endif } clear_buffer_uptodate(bh); get_bh(bh); /* for end_buffer_read_sync() */ bh->b_end_io = end_buffer_read_sync; submit_bh(REQ_OP_READ, 0, bh); } for (i = nr; i > 0; i--) { bh = bhs[i - 1]; /* No need to wait on the buffer if it's managed by JBD. */ if (!buffer_jbd(bh)) wait_on_buffer(bh); if (!buffer_uptodate(bh)) { /* Status won't be cleared from here on out, * so we can safely record this and loop back * to cleanup the other buffers. */ status = -EIO; put_bh(bh); bhs[i - 1] = NULL; } } bail: return status; }
void __jbd2_journal_file_buffer(struct journal_head *jh, transaction_t *transaction, int jlist) { struct journal_head **list = NULL; int was_dirty = 0; struct buffer_head *bh = jh2bh(jh); J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); assert_spin_locked(&transaction->t_journal->j_list_lock); J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); J_ASSERT_JH(jh, jh->b_transaction == transaction || jh->b_transaction == NULL); if (jh->b_transaction && jh->b_jlist == jlist) return; if (jlist == BJ_Metadata || jlist == BJ_Reserved || jlist == BJ_Shadow || jlist == BJ_Forget) { if (buffer_dirty(bh)) warn_dirty_buffer(bh); if (test_clear_buffer_dirty(bh) || test_clear_buffer_jbddirty(bh)) was_dirty = 1; } if (jh->b_transaction) __jbd2_journal_temp_unlink_buffer(jh); else jbd2_journal_grab_journal_head(bh); jh->b_transaction = transaction; switch (jlist) { case BJ_None: J_ASSERT_JH(jh, !jh->b_committed_data); J_ASSERT_JH(jh, !jh->b_frozen_data); return; case BJ_Metadata: transaction->t_nr_buffers++; list = &transaction->t_buffers; break; case BJ_Forget: list = &transaction->t_forget; break; case BJ_IO: list = &transaction->t_iobuf_list; break; case BJ_Shadow: list = &transaction->t_shadow_list; break; case BJ_LogCtl: list = &transaction->t_log_list; break; case BJ_Reserved: list = &transaction->t_reserved_list; break; } __blist_add_buffer(list, jh); jh->b_jlist = jlist; if (was_dirty) set_buffer_jbddirty(bh); }
/** * sync_mft_mirror - synchronize an mft record to the mft mirror * @ni: ntfs inode whose mft record to synchronize * @m: mapped, mst protected (extent) mft record to synchronize * @sync: if true, wait for i/o completion * * Write the mapped, mst protected (extent) mft record @m described by the * (regular or extent) ntfs inode @ni to the mft mirror ($MFTMirr). * * On success return 0. On error return -errno and set the volume errors flag * in the ntfs_volume to which @ni belongs. * * NOTE: We always perform synchronous i/o and ignore the @sync parameter. * * TODO: If @sync is false, want to do truly asynchronous i/o, i.e. just * schedule i/o via ->writepage or do it via kntfsd or whatever. */ static int sync_mft_mirror(ntfs_inode *ni, MFT_RECORD *m, int sync) { ntfs_volume *vol = ni->vol; struct page *page; unsigned int blocksize = vol->sb->s_blocksize; int max_bhs = vol->mft_record_size / blocksize; struct buffer_head *bhs[max_bhs]; struct buffer_head *bh, *head; u8 *kmirr; unsigned int block_start, block_end, m_start, m_end; int i_bhs, nr_bhs, err = 0; ntfs_debug("Entering for inode 0x%lx.", ni->mft_no); BUG_ON(!max_bhs); if (unlikely(!vol->mftmirr_ino)) { /* This could happen during umount... */ err = sync_mft_mirror_umount(ni, m); if (likely(!err)) return err; goto err_out; } /* Get the page containing the mirror copy of the mft record @m. */ page = ntfs_map_page(vol->mftmirr_ino->i_mapping, ni->mft_no >> (PAGE_CACHE_SHIFT - vol->mft_record_size_bits)); if (unlikely(IS_ERR(page))) { ntfs_error(vol->sb, "Failed to map mft mirror page."); err = PTR_ERR(page); goto err_out; } /* * Exclusion against other writers. This should never be a problem * since the page in which the mft record @m resides is also locked and * hence any other writers would be held up there but it is better to * make sure no one is writing from elsewhere. */ lock_page(page); /* The address in the page of the mirror copy of the mft record @m. */ kmirr = page_address(page) + ((ni->mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK); /* Copy the mst protected mft record to the mirror. */ memcpy(kmirr, m, vol->mft_record_size); /* Make sure we have mapped buffers. */ if (!page_has_buffers(page)) { no_buffers_err_out: ntfs_error(vol->sb, "Writing mft mirror records without " "existing buffers is not implemented yet. %s", ntfs_please_email); err = -EOPNOTSUPP; goto unlock_err_out; } bh = head = page_buffers(page); if (!bh) goto no_buffers_err_out; nr_bhs = 0; block_start = 0; m_start = kmirr - (u8*)page_address(page); m_end = m_start + vol->mft_record_size; do { block_end = block_start + blocksize; /* * If the buffer is outside the mft record, just skip it, * clearing it if it is dirty to make sure it is not written * out. It should never be marked dirty but better be safe. */ if ((block_end <= m_start) || (block_start >= m_end)) { if (buffer_dirty(bh)) { ntfs_warning(vol->sb, "Clearing dirty mft " "record page buffer. %s", ntfs_please_email); clear_buffer_dirty(bh); } continue; } if (!buffer_mapped(bh)) { ntfs_error(vol->sb, "Writing mft mirror records " "without existing mapped buffers is " "not implemented yet. %s", ntfs_please_email); err = -EOPNOTSUPP; continue; } if (!buffer_uptodate(bh)) { ntfs_error(vol->sb, "Writing mft mirror records " "without existing uptodate buffers is " "not implemented yet. 
%s", ntfs_please_email); err = -EOPNOTSUPP; continue; } BUG_ON(!nr_bhs && (m_start != block_start)); BUG_ON(nr_bhs >= max_bhs); bhs[nr_bhs++] = bh; BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end)); } while (block_start = block_end, (bh = bh->b_this_page) != head); if (likely(!err)) { /* Lock buffers and start synchronous write i/o on them. */ for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { struct buffer_head *tbh = bhs[i_bhs]; if (unlikely(test_set_buffer_locked(tbh))) BUG(); BUG_ON(!buffer_uptodate(tbh)); if (buffer_dirty(tbh)) clear_buffer_dirty(tbh); get_bh(tbh); tbh->b_end_io = end_buffer_write_sync; submit_bh(WRITE, tbh); } /* Wait on i/o completion of buffers. */ for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { struct buffer_head *tbh = bhs[i_bhs]; wait_on_buffer(tbh); if (unlikely(!buffer_uptodate(tbh))) { err = -EIO; /* * Set the buffer uptodate so the page & buffer * states don't become out of sync. */ if (PageUptodate(page)) set_buffer_uptodate(tbh); } } } else /* if (unlikely(err)) */ { /* Clean the buffers. */ for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) clear_buffer_dirty(bhs[i_bhs]); } unlock_err_out: /* Current state: all buffers are clean, unlocked, and uptodate. */ /* Remove the mst protection fixups again. */ post_write_mst_fixup((NTFS_RECORD*)kmirr); flush_dcache_page(page); unlock_page(page); ntfs_unmap_page(page); if (unlikely(err)) { /* I/O error during writing. This is really bad! */ ntfs_error(vol->sb, "I/O error while writing mft mirror " "record 0x%lx! You should unmount the volume " "and run chkdsk or ntfsfix.", ni->mft_no); goto err_out; } ntfs_debug("Done."); return 0; err_out: ntfs_error(vol->sb, "Failed to synchronize $MFTMirr (error code %i). " "Volume will be left marked dirty on umount. Run " "ntfsfix on the partition after umounting to correct " "this.", -err); /* We don't want to clear the dirty bit on umount. */ NVolSetErrors(vol); return err; }