Example #1
0
/**
 * ntfs_file_fsync - sync a file to disk
 * @filp:	file to be synced
 * @dentry:	dentry describing the file to sync
 * @datasync:	if non-zero only flush user data and not metadata
 *
 * Data integrity sync of a file to disk.  Used for fsync, fdatasync, and msync
 * system calls.  This function is inspired by fs/buffer.c::file_fsync().
 *
 * If @datasync is false, write the mft record and all associated extent mft
 * records as well as the $DATA attribute and then sync the block device.
 *
 * If @datasync is true and the attribute is non-resident, we skip the writing
 * of the mft record and all associated extent mft records (this might still
 * happen due to the write_inode_now() call).
 *
 * Also, if @datasync is true, we do not wait on the inode to be written out
 * but we always wait on the page cache pages to be written out.
 *
 * Note: In the past @filp could be NULL so we ignore it as we don't need it
 * anyway.
 *
 * Locking: Caller must hold i_sem on the inode.
 *
 * TODO: We should probably also write all attribute/index inodes associated
 * with this inode but since we have no simple way of getting to them we ignore
 * this problem for now.
 */
static int ntfs_file_fsync(struct file *filp, struct dentry *dentry,
		int datasync)
{
	struct inode *vi = dentry->d_inode;
	int err, ret = 0;

	ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
	BUG_ON(S_ISDIR(vi->i_mode));
	if (!datasync || !NInoNonResident(NTFS_I(vi)))
		ret = ntfs_write_inode(vi, 1);
	write_inode_now(vi, !datasync);
	/*
	 * NOTE: If we were to use mapping->private_list (see ext2 and
	 * fs/buffer.c) for dirty blocks then we could optimize the below to be
	 * sync_mapping_buffers(vi->i_mapping).
	 */
	err = sync_blockdev(vi->i_sb->s_bdev);
	if (unlikely(err && !ret))
		ret = err;
	if (likely(!ret))
		ntfs_debug("Done.");
	else
		ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx.  Error "
				"%u.", datasync ? "data" : "", vi->i_ino, -ret);
	return ret;
}
Example #2
0
/**
 * ntfs_page_unmap - unmap a page belonging to a vnode from memory
 * @ni:		ntfs inode to which the page belongs
 * @upl:	page list of the page
 * @pl:		array of pages containing the page itself
 * @mark_dirty:	mark the page dirty
 *
 * Unmap the page belonging to the ntfs inode @ni from memory releasing it back
 * to the vm.
 *
 * The page is described by the page list @upl, the array of pages containing
 * the page @pl and the address of the mapped page contents @kaddr.
 *
 * If @mark_dirty is TRUE, tell the vm to mark the page dirty when releasing
 * the page.
 *
 * Locking: Caller must hold an iocount reference on the vnode of @ni.
 */
void ntfs_page_unmap(ntfs_inode *ni, upl_t upl, upl_page_info_array_t pl,
		const BOOL mark_dirty)
{
	kern_return_t kerr;
	BOOL was_valid, was_dirty;

	was_valid = upl_valid_page(pl, 0);
	/* The page dirty bit is only valid if the page was valid. */
	was_dirty = (was_valid && upl_dirty_page(pl, 0));
	ntfs_debug("Entering for inode 0x%llx, page was %svalid %s %sdirty%s.",
			(unsigned long long)ni->mft_no,
			was_valid ? "" : "not ",
			(int)was_valid ^ (int)was_dirty ? "but" : "and",
			was_dirty ? "" : "not ",
			mark_dirty ? ", marking it dirty" : "");
	/* Unmap the page from the kernel's address space. */
	kerr = ubc_upl_unmap(upl);
	if (kerr != KERN_SUCCESS)
		ntfs_warning(ni->vol->mp, "ubc_upl_unmap() failed (error %d).",
				(int)kerr);
	/*
	 * If the page was valid and dirty or is being made dirty or if caching
	 * for the vnode is enabled (as it will usually be the case for all
	 * metadata files), commit it thus releasing it into the vm taking care
	 * to preserve the dirty state and marking the page dirty if requested
	 * when committing the page.
	 *
	 * If the page was not valid or was valid but not dirty, it is not
	 * being marked dirty, and caching is disabled on the vnode, dump the
	 * page.
	 */
	if (was_dirty || mark_dirty || !vnode_isnocache(ni->vn)) {
		int commit_flags;

		commit_flags = UPL_COMMIT_FREE_ON_EMPTY |
				UPL_COMMIT_INACTIVATE;
		if (!was_valid && !mark_dirty)
			commit_flags |= UPL_COMMIT_CLEAR_DIRTY;
		else if (was_dirty || mark_dirty)
			commit_flags |= UPL_COMMIT_SET_DIRTY;
		ubc_upl_commit_range(upl, 0, PAGE_SIZE, commit_flags);
		ntfs_debug("Done (committed page).");
	} else {
		ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES |
				UPL_ABORT_FREE_ON_EMPTY);
		ntfs_debug("Done (dumped page).");
	}
}
Example #3
0
/**
 * ntfs_page_dump - discard a page belonging to a vnode from memory
 * @ni:		ntfs inode to which the page belongs
 * @upl:	page list of the page
 * @pl:		array of pages containing the page itself
 *
 * Unmap the page belonging to the ntfs inode @ni from memory throwing it away.
 * Note that if the page is dirty all changes to the page will be lost as it
 * will be discarded so use this function with extreme caution.
 *
 * The page is described by the page list @upl, the array of pages containing
 * the page @pl and the address of the mapped page contents @kaddr.
 *
 * Locking: Caller must hold an iocount reference on the vnode of @ni.
 */
void ntfs_page_dump(ntfs_inode *ni, upl_t upl,
		upl_page_info_array_t pl __unused)
{
	kern_return_t kerr;

	ntfs_debug("Entering for inode 0x%llx, page is %svalid, %sdirty.",
			(unsigned long long)ni->mft_no,
			upl_valid_page(pl, 0) ? "" : "not ",
			upl_dirty_page(pl, 0) ? "" : "not ");
	/* Unmap the page from the kernel's address space. */
	kerr = ubc_upl_unmap(upl);
	if (kerr != KERN_SUCCESS)
		ntfs_warning(ni->vol->mp, "ubc_upl_unmap() failed (error %d).",
				(int)kerr);
	/* Dump the page. */
	ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES |
			UPL_ABORT_FREE_ON_EMPTY);
	ntfs_debug("Done.");
}
Example #4
0
/**
 * ntfs_file_fsync - sync a file to disk
 * @filp:	file to be synced
 * @dentry:	dentry describing the file to sync
 * @datasync:	if non-zero only flush user data and not metadata
 *
 * Data integrity sync of a file to disk.  Used for fsync, fdatasync, and msync
 * system calls.  This function is inspired by fs/buffer.c::file_fsync().
 *
 * If @datasync is false, write the mft record and all associated extent mft
 * records as well as the $DATA attribute and then sync the block device.
 *
 * If @datasync is true and the attribute is non-resident, we skip the writing
 * of the mft record and all associated extent mft records (this might still
 * happen due to the write_inode_now() call).
 *
 * Also, if @datasync is true, we do not wait on the inode to be written out
 * but we always wait on the page cache pages to be written out.
 *
 * Note: In the past @filp could be NULL so we ignore it as we don't need it
 * anyway.
 *
 * Locking: Caller must hold i_sem on the inode.
 *
 * TODO: We should probably also write all attribute/index inodes associated
 * with this inode but since we have no simple way of getting to them we ignore
 * this problem for now.
 */
static int ntfs_file_fsync(struct file *filp, struct dentry *dentry,
		int datasync)
{
	struct inode *vi = dentry->d_inode;
	int err, ret = 0;

	ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
	BUG_ON(S_ISDIR(vi->i_mode));
	if (!datasync || !NInoNonResident(NTFS_I(vi)))
		ret = ntfs_write_inode(vi, 1);
	write_inode_now(vi, !datasync);
	err = sync_blockdev(vi->i_sb->s_bdev);
	if (unlikely(err && !ret))
		ret = err;
	if (likely(!ret))
		ntfs_debug("Done.");
	else
		ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx.  Error "
				"%u.", datasync ? "data" : "", vi->i_ino, -ret);
	return ret;
}
Example #5
0
/**
 * ntfs_pagein - read a range of pages into memory
 * @ni:		ntfs inode whose data to read into the page range
 * @attr_ofs:	byte offset in the inode at which to start
 * @size:	number of bytes to read from the inode
 * @upl:	page list describing destination page range
 * @upl_ofs:	byte offset into page list at which to start
 * @flags:	flags further describing the pagein request
 *
 * Read @size bytes from the ntfs inode @ni, starting at byte offset @attr_ofs
 * into the inode, into the range of pages specified by the page list @upl,
 * starting at byte offset @upl_ofs into the page list.
 *
 * The @flags further describe the pagein request.  The following pagein flags
 * are currently defined in OSX kernel:
 *	UPL_IOSYNC	- Perform synchronous i/o.
 *	UPL_NOCOMMIT	- Do not commit/abort the page range.
 *	UPL_NORDAHEAD	- Do not perform any speculative read-ahead.
 *	IO_PASSIVE	- This is background i/o so do not throttle other i/o.
 *
 * Inside the ntfs driver we have the need to perform pageins whilst the inode
 * is locked for writing (@ni->lock) thus we cheat and set UPL_NESTED_PAGEOUT
 * in @flags when this is the case.  We make sure to clear it in @flags before
 * calling into the cluster layer so we do not accidentally cause confusion.
 *
 * For encrypted attributes we abort for now as we do not support them yet.
 *
 * For non-resident, non-compressed attributes we use cluster_pagein_ext()
 * which deals with both normal and multi sector transfer protected attributes.
 *
 * For resident attributes and non-resident, compressed attributes we read the
 * data ourselves by mapping the page list, and in the resident case, mapping
 * the mft record, looking up the attribute in it, and copying the requested
 * data from the mapped attribute into the page list, then unmapping the mft
 * record, whilst for non-resident, compressed attributes, we get the raw inode
 * and use it with ntfs_read_compressed() to read and decompress the data into
 * our mapped page list.  We then unmap the page list and finally, if
 * UPL_NOCOMMIT is not specified, we commit (success) or abort (error) the page
 * range.
 *
 * Return 0 on success and errno on error.
 *
 * Note the pages in the page list are marked busy on entry and the busy bit is
 * cleared when we commit the page range.  Thus it is perfectly safe for us to
 * fill the pages with encrypted or mst protected data and to decrypt or mst
 * deprotect in place before committing the page range.
 *
 * Adapted from cluster_pagein_ext().
 *
 * Locking: - Caller must hold an iocount reference on the vnode of @ni.
 *	    - Caller must not hold @ni->lock or if it is held it must be for
 *	      reading unless UPL_NESTED_PAGEOUT is set in @flags in which case
 *	      the caller must hold @ni->lock for reading or writing.
 */
int ntfs_pagein(ntfs_inode *ni, s64 attr_ofs, unsigned size, upl_t upl,
		upl_offset_t upl_ofs, int flags)
{
	s64 attr_size;
	u8 *kaddr;
	kern_return_t kerr;
	unsigned to_read;
	int err;
	BOOL locked = FALSE;

	ntfs_debug("Entering for mft_no 0x%llx, offset 0x%llx, size 0x%x, "
			"pagein flags 0x%x, page list offset 0x%llx.",
			(unsigned long long)ni->mft_no,
			(unsigned long long)attr_ofs, size, flags,
			(unsigned long long)upl_ofs);
	/*
	 * If the caller did not specify any i/o, then we are done.  We cannot
	 * issue an abort because we do not have a upl or we do not know its
	 * size.
	 */
	if (!upl) {
		ntfs_error(ni->vol->mp, "NULL page list passed in (error "
				"EINVAL).");
		return EINVAL;
	}
	if (S_ISDIR(ni->mode)) {
		ntfs_error(ni->vol->mp, "Called for directory vnode.");
		err = EISDIR;
		goto err;
	}
	/*
	 * Protect against changes in initialized_size and thus against
	 * truncation also unless UPL_NESTED_PAGEOUT is set in which case the
	 * caller has already taken @ni->lock for exclusive access.  We simply
	 * leave @locked to be FALSE in this case so we do not try to drop the
	 * lock later on.
	 *
	 * If UPL_NESTED_PAGEOUT is set we clear it in @flags to ensure we do
	 * not cause confusion in the cluster layer or the VM.
	 */
	if (flags & UPL_NESTED_PAGEOUT)
		flags &= ~UPL_NESTED_PAGEOUT;
	else {
		locked = TRUE;
		lck_rw_lock_shared(&ni->lock);
	}
	/* Do not allow messing with the inode once it has been deleted. */
	if (NInoDeleted(ni)) {
		/* Remove the inode from the name cache. */
		cache_purge(ni->vn);
		err = ENOENT;
		goto err;
	}
retry_pagein:
	/*
	 * We guarantee that the size in the ubc will be smaller or equal to
	 * the size in the ntfs inode thus no need to check @ni->data_size.
	 */
	attr_size = ubc_getsize(ni->vn);
	/*
	 * Only $DATA attributes can be encrypted/compressed.  Index root can
	 * have the flags set but this means to create compressed/encrypted
	 * files, not that the attribute is compressed/encrypted.  Note we need
	 * to check for AT_INDEX_ALLOCATION since this is the type of directory
	 * index inodes.
	 */
	if (ni->type != AT_INDEX_ALLOCATION) {
		/* TODO: Deny access to encrypted attributes, just like NT4. */
		if (NInoEncrypted(ni)) {
			if (ni->type != AT_DATA)
				panic("%s(): Encrypted non-data attribute.\n",
						__FUNCTION__);
			ntfs_warning(ni->vol->mp, "Denying access to "
					"encrypted attribute (EACCES).");
			err = EACCES;
			goto err;
		}
		/* Compressed data streams need special handling. */
		if (NInoNonResident(ni) && NInoCompressed(ni) && !NInoRaw(ni)) {
			if (ni->type != AT_DATA)
				panic("%s(): Compressed non-data attribute.\n",
						__FUNCTION__);
			goto compressed;
		}
	}
	/* NInoNonResident() == NInoIndexAllocPresent() */
	if (NInoNonResident(ni)) {
		int (*callback)(buf_t, void *);

		callback = NULL;
		if (NInoMstProtected(ni) || NInoEncrypted(ni))
			callback = ntfs_cluster_iodone;
		/* Non-resident, possibly mst protected, attribute. */
		err = cluster_pagein_ext(ni->vn, upl, upl_ofs, attr_ofs, size,
				attr_size, flags, callback, NULL);
		if (!err)
			ntfs_debug("Done (cluster_pagein_ext()).");
		else
			ntfs_error(ni->vol->mp, "Failed (cluster_pagein_ext(), "
					"error %d).", err);
		if (locked)
			lck_rw_unlock_shared(&ni->lock);
		return err;
	}
compressed:
	/*
	 * The attribute is resident and/or compressed.
	 *
	 * Cannot pagein from a negative offset or if we are starting beyond
	 * the end of the attribute or if the attribute offset is not page
	 * aligned or the size requested is not a multiple of PAGE_SIZE.
	 */
	if (attr_ofs < 0 || attr_ofs >= attr_size || attr_ofs & PAGE_MASK_64 ||
			size & PAGE_MASK || upl_ofs & PAGE_MASK) {
		err = EINVAL;
		goto err;
	}
	to_read = size;
	attr_size -= attr_ofs;
	if (to_read > attr_size)
		to_read = attr_size;
	/*
	 * We do not need @attr_size any more so reuse it to hold the number of
	 * bytes available in the attribute starting at offset @attr_ofs up to
	 * a maximum of the requested number of bytes rounded up to a multiple
	 * of the system page size.
	 */
	attr_size = (to_read + PAGE_MASK) & ~PAGE_MASK;
	/* Abort any pages outside the end of the attribute. */
	if (size > attr_size && !(flags & UPL_NOCOMMIT)) {
		ubc_upl_abort_range(upl, upl_ofs + attr_size, size - attr_size,
				UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
		/* Update @size. */
		size = attr_size;
	}
	/* To access the page list contents, we need to map the page list. */
	kerr = ubc_upl_map(upl, (vm_offset_t*)&kaddr);
	if (kerr != KERN_SUCCESS) {
		ntfs_error(ni->vol->mp, "ubc_upl_map() failed (error %d).",
				(int)kerr);
		err = EIO;
		goto err;
	}
	if (!NInoNonResident(ni)) {
		/*
		 * Read the data from the resident attribute into the page
		 * list.
		 */
		err = ntfs_resident_attr_read(ni, attr_ofs, size,
				kaddr + upl_ofs);
		if (err && err != EAGAIN)
			ntfs_error(ni->vol->mp, "ntfs_resident_attr_read() "
					"failed (error %d).", err);
	} else {
		ntfs_inode *raw_ni;
		int ioflags;

		/*
		 * Get the raw inode.  We take the inode lock shared to protect
		 * against concurrent writers as the compressed data is invalid
		 * whilst a write is in progress.
		 */
		err = ntfs_raw_inode_get(ni, LCK_RW_TYPE_SHARED, &raw_ni);
		if (err)
			ntfs_error(ni->vol->mp, "Failed to get raw inode "
					"(error %d).", err);
		else {
			if (!NInoRaw(raw_ni))
				panic("%s(): Requested raw inode but got "
						"non-raw one.\n", __FUNCTION__);
			ioflags = 0;
			if (vnode_isnocache(ni->vn) ||
					vnode_isnocache(raw_ni->vn))
				ioflags |= IO_NOCACHE;
			if (vnode_isnoreadahead(ni->vn) ||
					vnode_isnoreadahead(raw_ni->vn))
				ioflags |= IO_RAOFF;
			err = ntfs_read_compressed(ni, raw_ni, attr_ofs, size,
					kaddr + upl_ofs, NULL, ioflags);
			if (err)
				ntfs_error(ni->vol->mp,
						"ntfs_read_compressed() "
						"failed (error %d).", err);
			lck_rw_unlock_shared(&raw_ni->lock);
			(void)vnode_put(raw_ni->vn);
		}
	}
	kerr = ubc_upl_unmap(upl);
	if (kerr != KERN_SUCCESS) {
		ntfs_error(ni->vol->mp, "ubc_upl_unmap() failed (error %d).",
				(int)kerr);
		if (!err)
			err = EIO;
	}
	if (!err) {
		if (!(flags & UPL_NOCOMMIT)) {
			/* Commit the page range we brought up to date. */
			ubc_upl_commit_range(upl, upl_ofs, size,
					UPL_COMMIT_FREE_ON_EMPTY);
		}
		ntfs_debug("Done (%s).", !NInoNonResident(ni) ?
				"ntfs_resident_attr_read()" :
				"ntfs_read_compressed()");
	} else /* if (err) */ {
		/*
		 * If the attribute was converted to non-resident under our
		 * nose, retry the pagein.
		 *
		 * TODO: This may no longer be possible to happen now that we
		 * lock against changes in initialized size and thus
		 * truncation...  Revisit this issue when the write code has
		 * been written and remove the check + goto if appropriate.
		 */
		if (err == EAGAIN)
			goto retry_pagein;
err:
		if (!(flags & UPL_NOCOMMIT)) {
			int upl_flags = UPL_ABORT_FREE_ON_EMPTY;
			if (err != ENOMEM)
				upl_flags |= UPL_ABORT_ERROR;
			ubc_upl_abort_range(upl, upl_ofs, size, upl_flags);
		}
		ntfs_error(ni->vol->mp, "Failed (error %d).", err);
	}
	if (locked)
		lck_rw_unlock_shared(&ni->lock);
	return err;
}
/**
 * write_mft_record_nolock - write out a mapped (extent) mft record
 * @ni:		ntfs inode describing the mapped (extent) mft record
 * @m:		mapped (extent) mft record to write
 * @sync:	if true, wait for i/o completion
 *
 * Write the mapped (extent) mft record @m described by the (regular or extent)
 * ntfs inode @ni to backing store.  If the mft record @m has a counterpart in
 * the mft mirror, that is also updated.
 *
 * On success, clean the mft record and return 0.  On error, leave the mft
 * record dirty and return -errno.  The caller should call make_bad_inode() on
 * the base inode to ensure no more access happens to this inode.  We do not do
 * it here as the caller may want to finish writing other extent mft records
 * first to minimize on-disk metadata inconsistencies.
 *
 * NOTE:  We always perform synchronous i/o and ignore the @sync parameter.
 * However, if the mft record has a counterpart in the mft mirror and @sync is
 * true, we write the mft record, wait for i/o completion, and only then write
 * the mft mirror copy.  This ensures that if the system crashes either the mft
 * or the mft mirror will contain a self-consistent mft record @m.  If @sync is
 * false on the other hand, we start i/o on both and then wait for completion
 * on them.  This provides a speedup but no longer guarantees that you will end
 * up with a self-consistent mft record in the case of a crash but if you asked
 * for asynchronous writing you probably do not care about that anyway.
 *
 * TODO:  If @sync is false, want to do truly asynchronous i/o, i.e. just
 * schedule i/o via ->writepage or do it via kntfsd or whatever.
 */
int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync)
{
	ntfs_volume *vol = ni->vol;
	struct page *page = ni->page;
	unsigned int blocksize = vol->sb->s_blocksize;
	int max_bhs = vol->mft_record_size / blocksize;
	struct buffer_head *bhs[max_bhs];
	struct buffer_head *bh, *head;
	unsigned int block_start, block_end, m_start, m_end;
	int i_bhs, nr_bhs, err = 0;

	ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
	BUG_ON(NInoAttr(ni));
	BUG_ON(!max_bhs);
	BUG_ON(!PageLocked(page));
	/*
	 * If the ntfs_inode is clean no need to do anything.  If it is dirty,
	 * mark it as clean now so that it can be redirtied later on if needed.
	 * There is no danger of races since the caller is holding the locks
	 * for the mft record @m and the page it is in.
	 */
	if (!NInoTestClearDirty(ni))
		goto done;
	/* Make sure we have mapped buffers. */
	if (!page_has_buffers(page)) {
no_buffers_err_out:
		ntfs_error(vol->sb, "Writing mft records without existing "
				"buffers is not implemented yet.  %s",
				ntfs_please_email);
		err = -EOPNOTSUPP;
		goto err_out;
	}
	bh = head = page_buffers(page);
	if (!bh)
		goto no_buffers_err_out;
	nr_bhs = 0;
	block_start = 0;
	m_start = ni->page_ofs;
	m_end = m_start + vol->mft_record_size;
	do {
		block_end = block_start + blocksize;
		/*
		 * If the buffer is outside the mft record, just skip it,
		 * clearing it if it is dirty to make sure it is not written
		 * out.  It should never be marked dirty but better be safe.
		 */
		if ((block_end <= m_start) || (block_start >= m_end)) {
			if (buffer_dirty(bh)) {
				ntfs_warning(vol->sb, "Clearing dirty mft "
						"record page buffer.  %s",
						ntfs_please_email);
				clear_buffer_dirty(bh);
			}
			continue;
		}
		if (!buffer_mapped(bh)) {
			ntfs_error(vol->sb, "Writing mft records without "
					"existing mapped buffers is not "
					"implemented yet.  %s",
					ntfs_please_email);
			err = -EOPNOTSUPP;
			continue;
		}
		if (!buffer_uptodate(bh)) {
			ntfs_error(vol->sb, "Writing mft records without "
					"existing uptodate buffers is not "
					"implemented yet.  %s",
					ntfs_please_email);
			err = -EOPNOTSUPP;
			continue;
		}
		BUG_ON(!nr_bhs && (m_start != block_start));
		BUG_ON(nr_bhs >= max_bhs);
		bhs[nr_bhs++] = bh;
		BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end));
	} while (block_start = block_end, (bh = bh->b_this_page) != head);
	if (unlikely(err))
		goto cleanup_out;
	/* Apply the mst protection fixups. */
	err = pre_write_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size);
	if (err) {
		ntfs_error(vol->sb, "Failed to apply mst fixups!");
		goto cleanup_out;
	}
	flush_dcache_mft_record_page(ni);
	/* Lock buffers and start synchronous write i/o on them. */
	for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
		struct buffer_head *tbh = bhs[i_bhs];

		if (unlikely(test_set_buffer_locked(tbh)))
			BUG();
		BUG_ON(!buffer_uptodate(tbh));
		if (buffer_dirty(tbh))
			clear_buffer_dirty(tbh);
		get_bh(tbh);
		tbh->b_end_io = end_buffer_write_sync;
		submit_bh(WRITE, tbh);
	}
	/* Synchronize the mft mirror now if not @sync. */
	if (!sync && ni->mft_no < vol->mftmirr_size)
		sync_mft_mirror(ni, m, sync);
	/* Wait on i/o completion of buffers. */
	for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
		struct buffer_head *tbh = bhs[i_bhs];

		wait_on_buffer(tbh);
		if (unlikely(!buffer_uptodate(tbh))) {
			err = -EIO;
			/*
			 * Set the buffer uptodate so the page & buffer states
			 * don't become out of sync.
			 */
			if (PageUptodate(page))
				set_buffer_uptodate(tbh);
		}
	}
	/* If @sync, now synchronize the mft mirror. */
	if (sync && ni->mft_no < vol->mftmirr_size)
		sync_mft_mirror(ni, m, sync);
	/* Remove the mst protection fixups again. */
	post_write_mst_fixup((NTFS_RECORD*)m);
	flush_dcache_mft_record_page(ni);
	if (unlikely(err)) {
		/* I/O error during writing.  This is really bad! */
		ntfs_error(vol->sb, "I/O error while writing mft record "
				"0x%lx!  Marking base inode as bad.  You "
				"should unmount the volume and run chkdsk.",
				ni->mft_no);
		goto err_out;
	}
done:
	ntfs_debug("Done.");
	return 0;
cleanup_out:
	/* Clean the buffers. */
	for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++)
		clear_buffer_dirty(bhs[i_bhs]);
err_out:
	/*
	 * Current state: all buffers are clean, unlocked, and uptodate.
	 * The caller should mark the base inode as bad so that no more i/o
	 * happens.  ->clear_inode() will still be invoked so all extent inodes
	 * and other allocated memory will be freed.
	 */
	if (err == -ENOMEM) {
		ntfs_error(vol->sb, "Not enough memory to write mft record.  "
				"Redirtying so the write is retried later.");
		mark_mft_record_dirty(ni);
		err = 0;
	}
	return err;
}
/**
 * sync_mft_mirror - synchronize an mft record to the mft mirror
 * @ni:		ntfs inode whose mft record to synchronize
 * @m:		mapped, mst protected (extent) mft record to synchronize
 * @sync:	if true, wait for i/o completion
 *
 * Write the mapped, mst protected (extent) mft record @m described by the
 * (regular or extent) ntfs inode @ni to the mft mirror ($MFTMirr).
 *
 * On success return 0.  On error return -errno and set the volume errors flag
 * in the ntfs_volume to which @ni belongs.
 *
 * NOTE:  We always perform synchronous i/o and ignore the @sync parameter.
 *
 * TODO:  If @sync is false, want to do truly asynchronous i/o, i.e. just
 * schedule i/o via ->writepage or do it via kntfsd or whatever.
 */
static int sync_mft_mirror(ntfs_inode *ni, MFT_RECORD *m, int sync)
{
	ntfs_volume *vol = ni->vol;
	struct page *page;
	unsigned int blocksize = vol->sb->s_blocksize;
	int max_bhs = vol->mft_record_size / blocksize;
	struct buffer_head *bhs[max_bhs];
	struct buffer_head *bh, *head;
	u8 *kmirr;
	unsigned int block_start, block_end, m_start, m_end;
	int i_bhs, nr_bhs, err = 0;

	ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
	BUG_ON(!max_bhs);
	if (unlikely(!vol->mftmirr_ino)) {
		/* This could happen during umount... */
		err = sync_mft_mirror_umount(ni, m);
		if (likely(!err))
			return err;
		goto err_out;
	}
	/* Get the page containing the mirror copy of the mft record @m. */
	page = ntfs_map_page(vol->mftmirr_ino->i_mapping, ni->mft_no >>
			(PAGE_CACHE_SHIFT - vol->mft_record_size_bits));
	if (unlikely(IS_ERR(page))) {
		ntfs_error(vol->sb, "Failed to map mft mirror page.");
		err = PTR_ERR(page);
		goto err_out;
	}
	/*
	 * Exclusion against other writers.   This should never be a problem
	 * since the page in which the mft record @m resides is also locked and
	 * hence any other writers would be held up there but it is better to
	 * make sure no one is writing from elsewhere.
	 */
	lock_page(page);
	/* The address in the page of the mirror copy of the mft record @m. */
	kmirr = page_address(page) + ((ni->mft_no << vol->mft_record_size_bits)
			& ~PAGE_CACHE_MASK);
	/* Copy the mst protected mft record to the mirror. */
	memcpy(kmirr, m, vol->mft_record_size);
	/* Make sure we have mapped buffers. */
	if (!page_has_buffers(page)) {
no_buffers_err_out:
		ntfs_error(vol->sb, "Writing mft mirror records without "
				"existing buffers is not implemented yet.  %s",
				ntfs_please_email);
		err = -EOPNOTSUPP;
		goto unlock_err_out;
	}
	bh = head = page_buffers(page);
	if (!bh)
		goto no_buffers_err_out;
	nr_bhs = 0;
	block_start = 0;
	m_start = kmirr - (u8*)page_address(page);
	m_end = m_start + vol->mft_record_size;
	do {
		block_end = block_start + blocksize;
		/*
		 * If the buffer is outside the mft record, just skip it,
		 * clearing it if it is dirty to make sure it is not written
		 * out.  It should never be marked dirty but better be safe.
		 */
		if ((block_end <= m_start) || (block_start >= m_end)) {
			if (buffer_dirty(bh)) {
				ntfs_warning(vol->sb, "Clearing dirty mft "
						"record page buffer.  %s",
						ntfs_please_email);
				clear_buffer_dirty(bh);
			}
			continue;
		}
		if (!buffer_mapped(bh)) {
			ntfs_error(vol->sb, "Writing mft mirror records "
					"without existing mapped buffers is "
					"not implemented yet.  %s",
					ntfs_please_email);
			err = -EOPNOTSUPP;
			continue;
		}
		if (!buffer_uptodate(bh)) {
			ntfs_error(vol->sb, "Writing mft mirror records "
					"without existing uptodate buffers is "
					"not implemented yet.  %s",
					ntfs_please_email);
			err = -EOPNOTSUPP;
			continue;
		}
		BUG_ON(!nr_bhs && (m_start != block_start));
		BUG_ON(nr_bhs >= max_bhs);
		bhs[nr_bhs++] = bh;
		BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end));
	} while (block_start = block_end, (bh = bh->b_this_page) != head);
	if (likely(!err)) {
		/* Lock buffers and start synchronous write i/o on them. */
		for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
			struct buffer_head *tbh = bhs[i_bhs];

			if (unlikely(test_set_buffer_locked(tbh)))
				BUG();
			BUG_ON(!buffer_uptodate(tbh));
			if (buffer_dirty(tbh))
				clear_buffer_dirty(tbh);
			get_bh(tbh);
			tbh->b_end_io = end_buffer_write_sync;
			submit_bh(WRITE, tbh);
		}
		/* Wait on i/o completion of buffers. */
		for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
			struct buffer_head *tbh = bhs[i_bhs];

			wait_on_buffer(tbh);
			if (unlikely(!buffer_uptodate(tbh))) {
				err = -EIO;
				/*
				 * Set the buffer uptodate so the page & buffer
				 * states don't become out of sync.
				 */
				if (PageUptodate(page))
					set_buffer_uptodate(tbh);
			}
		}
	} else /* if (unlikely(err)) */ {
		/* Clean the buffers. */
		for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++)
			clear_buffer_dirty(bhs[i_bhs]);
	}
unlock_err_out:
	/* Current state: all buffers are clean, unlocked, and uptodate. */
	/* Remove the mst protection fixups again. */
	post_write_mst_fixup((NTFS_RECORD*)kmirr);
	flush_dcache_page(page);
	unlock_page(page);
	ntfs_unmap_page(page);
	if (unlikely(err)) {
		/* I/O error during writing.  This is really bad! */
		ntfs_error(vol->sb, "I/O error while writing mft mirror "
				"record 0x%lx!  You should unmount the volume "
				"and run chkdsk or ntfsfix.", ni->mft_no);
		goto err_out;
	}
	ntfs_debug("Done.");
	return 0;
err_out:
	ntfs_error(vol->sb, "Failed to synchronize $MFTMirr (error code %i).  "
			"Volume will be left marked dirty on umount.  Run "
			"ntfsfix on the partition after umounting to correct "
			"this.", -err);
	/* We don't want to clear the dirty bit on umount. */
	NVolSetErrors(vol);
	return err;
}