Example #1
0
static int
ocfs2_filecheck_read_inode_block_full(struct inode *inode,
				      struct buffer_head **bh,
				      int flags, int type)
{
	int rc;
	struct buffer_head *tmp = *bh;

	if (!type) /* Check inode block */
		rc = ocfs2_read_blocks(INODE_CACHE(inode),
				OCFS2_I(inode)->ip_blkno,
				1, &tmp, flags,
				ocfs2_filecheck_validate_inode_block);
	else /* Repair inode block */
		rc = ocfs2_read_blocks(INODE_CACHE(inode),
				OCFS2_I(inode)->ip_blkno,
				1, &tmp, flags,
				ocfs2_filecheck_repair_inode_block);

	/* If ocfs2_read_blocks() got us a new bh, pass it up. */
	if (!rc && !*bh)
		*bh = tmp;

	return rc;
}
Example #2
0
/*
 * Go through the dirblocks pre-filling them.  We try to coalesce adjacent
 * ones.  Don't care to return errors, because it's a cache pre-fill.
 */
static int try_to_cache(ocfs2_filesys *fs, struct rb_node *node,
			char *pre_cache_buf, int pre_cache_blocks)
{
	int cached_blocks = 0;
	o2fsck_dirblock_entry *dbe;
	uint64_t io_blkno = 0, next_blkno = 0;
	int count = 0;
	errcode_t err;
	uint64_t blocks_seen = 0;

	o2fsck_reset_blocks_cached();
	for (; node; node = rb_next(node)) {
		blocks_seen++;
		dbe = rb_entry(node, o2fsck_dirblock_entry, e_node);
		if (io_blkno) {
			assert(count);
			assert(next_blkno > io_blkno);

			if ((next_blkno == dbe->e_blkno) &&
			    (count < pre_cache_blocks)) {
				count++;
				next_blkno++;
				continue;
			}

			if (!o2fsck_worth_caching(count)) {
				io_blkno = 0;
				break;
			}

			err = ocfs2_read_blocks(fs, io_blkno, count,
						pre_cache_buf);
			io_blkno = 0;
			next_blkno = 0;

			if (err)
				break;

			cached_blocks += count;
			count = 0;
		}

		assert(!io_blkno);
		io_blkno = dbe->e_blkno;
		next_blkno = io_blkno + 1;
		count = 1;
	}

	/* Catch the last pre-fill buffer */
	if (io_blkno && o2fsck_worth_caching(count)) {
		assert(count);
		err = ocfs2_read_blocks(fs, io_blkno, count, pre_cache_buf);
		if (!err)
			cached_blocks += count;
	}

	return cached_blocks;
}
Example #3
0
static int walk_blocks_func(ocfs2_filesys *fs,
			    uint64_t blkno,
			    uint64_t bcount,
			    uint16_t ext_flags,
			    void *priv_data)
{
	struct walk_block *wb = priv_data;
	errcode_t ret;
	int i;
	uint32_t *up;

	ret = ocfs2_read_blocks(fs, blkno, 1, wb->buf);
	if (ret) {
		com_err("walk_blocks_func", ret,
			"while reading block %"PRIu64, blkno);
		return OCFS2_BLOCK_ABORT;
	}

	/* set every other bit */
	up = (uint32_t *) wb->buf;
	for(i = 0; i < (fs->fs_blocksize / sizeof(uint32_t)); i++) {
		up[i] |= 0x55555555;
		wb->used += BITCOUNT(up[i]);
	}

	ret = io_write_block(fs->fs_io, blkno, 1, wb->buf);
	if (ret) {
		com_err("walk_blocks_func", ret,
			"while writing block %"PRIu64, blkno);
		return OCFS2_BLOCK_ABORT;
	}
	return 0;
}
Example #4
0
static errcode_t ocfs2_validate_ocfs1_header(ocfs2_filesys *fs)
{
	errcode_t ret;
	char *blk;
	struct ocfs1_vol_disk_hdr *hdr;
	
	ret = ocfs2_malloc_block(fs->fs_io, &blk);
	if (ret)
		return ret;

	ret = ocfs2_read_blocks(fs, 0, 1, blk);
	if (ret)
		goto out;
	hdr = (struct ocfs1_vol_disk_hdr *)blk;

	ret = OCFS2_ET_OCFS_REV;
	if (le32_to_cpu(hdr->major_version) == OCFS1_MAJOR_VERSION)
		goto out;
	if (!memcmp(hdr->signature, OCFS1_VOLUME_SIGNATURE,
		    strlen(OCFS1_VOLUME_SIGNATURE)))
		goto out;

	ret = 0;

out:
	ocfs2_free(&blk);

	return ret;
}
Example #5
0
int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
{
	int ret;
	struct ocfs2_slot_info *si = osb->slot_info;

	if (si == NULL)
		return 0;

	BUG_ON(si->si_blocks == 0);
	BUG_ON(si->si_bh == NULL);

	trace_ocfs2_refresh_slot_info(si->si_blocks);

	/*
	 * We pass -1 as blocknr because we expect all of si->si_bh to
	 * be !NULL.  Thus, ocfs2_read_blocks() will ignore blocknr.  If
	 * this is not true, the read of -1 (UINT64_MAX) will fail.
	 */
	ret = ocfs2_read_blocks(INODE_CACHE(si->si_inode), -1, si->si_blocks,
				si->si_bh, OCFS2_BH_IGNORE_CACHE, NULL);
	if (ret == 0) {
		spin_lock(&osb->osb_lock);
		ocfs2_update_slot_info(si);
		spin_unlock(&osb->osb_lock);
	}

	return ret;
}
int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
{
	int ret;
	struct ocfs2_slot_info *si = osb->slot_info;

	if (si == NULL)
		return 0;

	BUG_ON(si->si_blocks == 0);
	BUG_ON(si->si_bh == NULL);

	trace_ocfs2_refresh_slot_info(si->si_blocks);

	/*
                                                               
                                                                 
                                                            
  */
	ret = ocfs2_read_blocks(INODE_CACHE(si->si_inode), -1, si->si_blocks,
				si->si_bh, OCFS2_BH_IGNORE_CACHE, NULL);
	if (ret == 0) {
		spin_lock(&osb->osb_lock);
		ocfs2_update_slot_info(si);
		spin_unlock(&osb->osb_lock);
	}

	return ret;
}
Example #7
0
/*
 * JBD Might read a cached version of another nodes journal file. We
 * don't want this as this file changes often and we get no
 * notification on those changes. The only way to be sure that we've
 * got the most up to date version of those blocks then is to force
 * read them off disk. Just searching through the buffer cache won't
 * work as there may be pages backing this file which are still marked
 * up to date. We know things can't change on this file underneath us
 * as we have the lock by now :)
 */
static int ocfs2_force_read_journal(struct inode *inode)
{
	int status = 0;
	int i;
	u64 v_blkno, p_blkno, p_blocks, num_blocks;
#define CONCURRENT_JOURNAL_FILL 32ULL
	struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL];

	mlog_entry_void();

	memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL);

	num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, inode->i_size);
	v_blkno = 0;
	while (v_blkno < num_blocks) {
		status = ocfs2_extent_map_get_blocks(inode, v_blkno,
						     &p_blkno, &p_blocks, NULL);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		if (p_blocks > CONCURRENT_JOURNAL_FILL)
			p_blocks = CONCURRENT_JOURNAL_FILL;

		/* We are reading journal data which should not
		 * be put in the uptodate cache */
		status = ocfs2_read_blocks(OCFS2_SB(inode->i_sb),
					   p_blkno, p_blocks, bhs, 0,
					   NULL);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		for(i = 0; i < p_blocks; i++) {
			brelse(bhs[i]);
			bhs[i] = NULL;
		}

		v_blkno += p_blocks;
	}

bail:
	for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++)
		if (bhs[i])
			brelse(bhs[i]);
	mlog_exit(status);
	return status;
}
Example #8
0
int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
				int flags)
{
	int rc;
	struct buffer_head *tmp = *bh;

	rc = ocfs2_read_blocks(INODE_CACHE(inode), OCFS2_I(inode)->ip_blkno,
			       1, &tmp, flags, ocfs2_validate_inode_block);

	/* If ocfs2_read_blocks() got us a new bh, pass it up. */
	if (!rc && !*bh)
		*bh = tmp;

	return rc;
}
Example #9
0
errcode_t ocfs2_read_extent_block_nocheck(ocfs2_filesys *fs,
					  uint64_t blkno,
					  char *eb_buf)
{
	errcode_t ret;
	char *blk;
	struct ocfs2_extent_block *eb;

	if ((blkno < OCFS2_SUPER_BLOCK_BLKNO) ||
	    (blkno > fs->fs_blocks))
		return OCFS2_ET_BAD_BLKNO;

	ret = ocfs2_malloc_block(fs->fs_io, &blk);
	if (ret)
		return ret;

	ret = ocfs2_read_blocks(fs, blkno, 1, blk);
	if (ret)
		goto out;

	eb = (struct ocfs2_extent_block *)blk;

	ret = ocfs2_validate_meta_ecc(fs, blk, &eb->h_check);
	if (ret)
		goto out;

	if (memcmp(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE,
		   strlen(OCFS2_EXTENT_BLOCK_SIGNATURE))) {
		ret = OCFS2_ET_BAD_EXTENT_BLOCK_MAGIC;
		goto out;
	}

	memcpy(eb_buf, blk, fs->fs_blocksize);

	eb = (struct ocfs2_extent_block *) eb_buf;
	ocfs2_swap_extent_block_to_cpu(fs, eb);

out:
	ocfs2_free(&blk);

	return ret;
}
Example #10
0
static errcode_t read_journal_block(ocfs2_filesys *fs, 
				    struct journal_info *ji, 
				    uint64_t blkoff, 
				    char *buf,
				    int check_dup)
{
	errcode_t err;
	uint64_t	blkno;

	err = lookup_journal_block(fs, ji, blkoff, &blkno, check_dup);
	if (err)
		return err;

	err = ocfs2_read_blocks(fs, blkno, 1, buf);
	if (err)
		com_err(whoami, err, "while reading block %"PRIu64" of slot "
			"%d's journal", blkno, ji->ji_slot);

	return err;
}
Example #11
0
errcode_t ocfs2_read_group_desc(ocfs2_filesys *fs, uint64_t blkno,
				char *gd_buf)
{
	errcode_t ret;
	char *blk;
	struct ocfs2_group_desc *gd;

	if ((blkno < OCFS2_SUPER_BLOCK_BLKNO) ||
	    (blkno > fs->fs_blocks))
		return OCFS2_ET_BAD_BLKNO;

	ret = ocfs2_malloc_block(fs->fs_io, &blk);
	if (ret)
		return ret;

	ret = ocfs2_read_blocks(fs, blkno, 1, blk);
	if (ret)
		goto out;

	gd = (struct ocfs2_group_desc *)blk;

	ret = ocfs2_validate_meta_ecc(fs, blk, &gd->bg_check);
	if (ret)
		goto out;

	ret = OCFS2_ET_BAD_GROUP_DESC_MAGIC;
	if (memcmp(gd->bg_signature, OCFS2_GROUP_DESC_SIGNATURE,
		   strlen(OCFS2_GROUP_DESC_SIGNATURE)))
		goto out;

	memcpy(gd_buf, blk, fs->fs_blocksize);

	gd = (struct ocfs2_group_desc *)gd_buf;
	ocfs2_swap_group_desc_to_cpu(fs, gd);

	ret = 0;
out:
	ocfs2_free(&blk);

	return ret;
}
Example #12
0
static int read_whole_func(ocfs2_filesys *fs,
			   uint64_t blkno,
			   uint64_t bcount,
			   uint16_t ext_flags,
			   void *priv_data)
{
	struct read_whole_context *ctx = priv_data;

	if (ext_flags & OCFS2_EXT_UNWRITTEN) {
		memset(ctx->ptr, 0, fs->fs_blocksize);
		ctx->errcode = 0;
	} else
		ctx->errcode = ocfs2_read_blocks(fs, blkno, 1, ctx->ptr);

	if (ctx->errcode)
		return OCFS2_BLOCK_ABORT;

	ctx->ptr += fs->fs_blocksize;
	ctx->offset += fs->fs_blocksize;

	return 0;
}
Example #13
0
static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
				  struct ocfs2_slot_info *si)
{
	int status = 0;
	u64 blkno;
	unsigned long long blocks, bytes = 0;
	unsigned int i;
	struct buffer_head *bh;

	status = ocfs2_slot_map_physical_size(osb, si->si_inode, &bytes);
	if (status)
		goto bail;

	blocks = ocfs2_blocks_for_bytes(si->si_inode->i_sb, bytes);
	BUG_ON(blocks > UINT_MAX);
	si->si_blocks = blocks;
	if (!si->si_blocks)
		goto bail;

	if (si->si_extended)
		si->si_slots_per_block =
			(osb->sb->s_blocksize /
			 sizeof(struct ocfs2_extended_slot));
	else
		si->si_slots_per_block = osb->sb->s_blocksize / sizeof(__le16);

	/* The size checks above should ensure this */
	BUG_ON((osb->max_slots / si->si_slots_per_block) > blocks);

	trace_ocfs2_map_slot_buffers(bytes, si->si_blocks);

	si->si_bh = kcalloc(si->si_blocks, sizeof(struct buffer_head *),
			    GFP_KERNEL);
	if (!si->si_bh) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}

	for (i = 0; i < si->si_blocks; i++) {
		status = ocfs2_extent_map_get_blocks(si->si_inode, i,
						     &blkno, NULL, NULL);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		trace_ocfs2_map_slot_buffers_block((unsigned long long)blkno, i);

		bh = NULL;  /* Acquire a fresh bh */
		status = ocfs2_read_blocks(INODE_CACHE(si->si_inode), blkno,
					   1, &bh, OCFS2_BH_IGNORE_CACHE, NULL);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		si->si_bh[i] = bh;
	}

bail:
	return status;
}
Example #14
0
static int ocfs2_read_locked_inode(struct inode *inode,
				   struct ocfs2_find_inode_args *args)
{
	struct super_block *sb;
	struct ocfs2_super *osb;
	struct ocfs2_dinode *fe;
	struct buffer_head *bh = NULL;
	int status, can_lock;
	u32 generation = 0;

	mlog_entry("(0x%p, 0x%p)\n", inode, args);

	status = -EINVAL;
	if (inode == NULL || inode->i_sb == NULL) {
		mlog(ML_ERROR, "bad inode\n");
		return status;
	}
	sb = inode->i_sb;
	osb = OCFS2_SB(sb);

	if (!args) {
		mlog(ML_ERROR, "bad inode args\n");
		make_bad_inode(inode);
		return status;
	}

	/*
	 * To improve performance of cold-cache inode stats, we take
	 * the cluster lock here if possible.
	 *
	 * Generally, OCFS2 never trusts the contents of an inode
	 * unless it's holding a cluster lock, so taking it here isn't
	 * a correctness issue as much as it is a performance
	 * improvement.
	 *
	 * There are three times when taking the lock is not a good idea:
	 *
	 * 1) During startup, before we have initialized the DLM.
	 *
	 * 2) If we are reading certain system files which never get
	 *    cluster locks (local alloc, truncate log).
	 *
	 * 3) If the process doing the iget() is responsible for
	 *    orphan dir recovery. We're holding the orphan dir lock and
	 *    can get into a deadlock with another process on another
	 *    node in ->delete_inode().
	 *
	 * #1 and #2 can be simply solved by never taking the lock
	 * here for system files (which are the only type we read
	 * during mount). It's a heavier approach, but our main
	 * concern is user-accesible files anyway.
	 *
	 * #3 works itself out because we'll eventually take the
	 * cluster lock before trusting anything anyway.
	 */
	can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE)
		&& !(args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY)
		&& !ocfs2_mount_local(osb);

	/*
	 * To maintain backwards compatibility with older versions of
	 * ocfs2-tools, we still store the generation value for system
	 * files. The only ones that actually matter to userspace are
	 * the journals, but it's easier and inexpensive to just flag
	 * all system files similarly.
	 */
	if (args->fi_flags & OCFS2_FI_FLAG_SYSFILE)
		generation = osb->fs_generation;

	ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_inode_lockres,
				  OCFS2_LOCK_TYPE_META,
				  generation, inode);

	ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
				  OCFS2_LOCK_TYPE_OPEN,
				  0, inode);

	if (can_lock) {
		status = ocfs2_open_lock(inode);
		if (status) {
			make_bad_inode(inode);
			mlog_errno(status);
			return status;
		}
		status = ocfs2_inode_lock(inode, NULL, 0);
		if (status) {
			make_bad_inode(inode);
			mlog_errno(status);
			return status;
		}
	}

	if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) {
		status = ocfs2_try_open_lock(inode, 0);
		if (status) {
			make_bad_inode(inode);	
			return status;
		}
	}

	if (can_lock)
		status = ocfs2_read_blocks(inode, args->fi_blkno, 1, &bh,
					   OCFS2_BH_IGNORE_CACHE);
	else
		status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	status = -EINVAL;
	fe = (struct ocfs2_dinode *) bh->b_data;
	if (!OCFS2_IS_VALID_DINODE(fe)) {
		mlog(0, "Invalid dinode #%llu: signature = %.*s\n",
		     (unsigned long long)args->fi_blkno, 7,
		     fe->i_signature);
		goto bail;
	}

	/*
	 * This is a code bug. Right now the caller needs to
	 * understand whether it is asking for a system file inode or
	 * not so the proper lock names can be built.
	 */
	mlog_bug_on_msg(!!(fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) !=
			!!(args->fi_flags & OCFS2_FI_FLAG_SYSFILE),
			"Inode %llu: system file state is ambigous\n",
			(unsigned long long)args->fi_blkno);

	if (S_ISCHR(le16_to_cpu(fe->i_mode)) ||
	    S_ISBLK(le16_to_cpu(fe->i_mode)))
    		inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));

	if (ocfs2_populate_inode(inode, fe, 0) < 0)
		goto bail;

	BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno));

	status = 0;

bail:
	if (can_lock)
		ocfs2_inode_unlock(inode, 0);

	if (status < 0)
		make_bad_inode(inode);

	if (args && bh)
		brelse(bh);

	mlog_exit(status);
	return status;
}
Example #15
0
errcode_t ocfs2_read_super(ocfs2_filesys *fs, uint64_t superblock, char *sb)
{
	errcode_t ret;
	char *blk, *swapblk;
	struct ocfs2_dinode *di, *orig_super;
	int orig_blocksize;
	int blocksize = io_get_blksize(fs->fs_io);

	ret = ocfs2_malloc_block(fs->fs_io, &blk);
	if (ret)
		return ret;

	ret = ocfs2_read_blocks(fs, superblock, 1, blk);
	if (ret)
		goto out_blk;

	di = (struct ocfs2_dinode *)blk;

	ret = OCFS2_ET_BAD_MAGIC;
	if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE,
		   strlen(OCFS2_SUPER_BLOCK_SIGNATURE)))
		goto out_blk;

	/*
	 * We want to use the latest superblock to validate.  We need
	 * a local-endian copy in fs->fs_super, and the unswapped copy to
	 * check in blk.  ocfs2_validate_meta_ecc() uses fs->fs_super and
	 * fs->fs_blocksize.
	 */
	ret = ocfs2_malloc_block(fs->fs_io, &swapblk);
	if (ret)
		goto out_blk;

	memcpy(swapblk, blk, blocksize);
	orig_super = fs->fs_super;
	orig_blocksize = fs->fs_blocksize;
	fs->fs_super = (struct ocfs2_dinode *)swapblk;
	fs->fs_blocksize = blocksize;
	ocfs2_swap_inode_to_cpu(fs, fs->fs_super);

	ret = ocfs2_validate_meta_ecc(fs, blk, &di->i_check);

	fs->fs_super = orig_super;
	fs->fs_blocksize = orig_blocksize;
	ocfs2_free(&swapblk);

	if (ret)
		goto out_blk;

	ocfs2_swap_inode_to_cpu(fs, di);
	if (!sb)
		fs->fs_super = di;
	else {
		memcpy(sb, blk, fs->fs_blocksize);
		ocfs2_free(&blk);
	}

	return 0;

out_blk:
	ocfs2_free(&blk);

	return ret;
}
Example #16
0
/*
 * Zero the area past i_size but still within an allocated
 * cluster. This avoids exposing nonzero data on subsequent file
 * extends.
 */
static errcode_t ocfs2_zero_tail_for_truncate(ocfs2_cached_inode *ci,
					      uint64_t new_size)
{
	errcode_t ret;
	char *buf = NULL;
	ocfs2_filesys *fs = ci->ci_fs;
	uint64_t start_blk, p_blkno, contig_blocks, start_off;
	int count, byte_counts, bpc = fs->fs_clustersize /fs->fs_blocksize;
	uint16_t ext_flags;

	if (new_size == 0)
		return 0;

	start_blk = new_size / fs->fs_blocksize;

	ret = ocfs2_extent_map_get_blocks(ci, start_blk, 1,
					  &p_blkno, &contig_blocks, &ext_flags);
	if (ret)
		goto out;

	/* Tail is a hole. */
	if (!p_blkno)
		goto out;

	if (ext_flags & OCFS2_EXT_REFCOUNTED) {
		uint32_t cpos = ocfs2_blocks_to_clusters(fs, start_blk);
		ret = ocfs2_refcount_cow(ci, cpos, 1, cpos + 1);
		if (ret)
			goto out;

		ret = ocfs2_extent_map_get_blocks(ci, start_blk, 1,
						  &p_blkno, &contig_blocks,
						  &ext_flags);
		if (ret)
			goto out;

		assert(!(ext_flags & OCFS2_EXT_REFCOUNTED) && p_blkno);

	}

	/* calculate the total blocks we need to empty. */
	count = bpc - (p_blkno & (bpc - 1));
	ret = ocfs2_malloc_blocks(fs->fs_io, count, &buf);
	if (ret)
		goto out;

	ret = ocfs2_read_blocks(fs, p_blkno, count, buf);
	if (ret)
		goto out;

	/* empty the content after the new_size and within the same cluster. */
	start_off = new_size % fs->fs_blocksize;
	byte_counts = count * fs->fs_blocksize - start_off;
	memset(buf + start_off, 0, byte_counts);

	ret = io_write_block(fs->fs_io, p_blkno, count, buf);

out:
	if (buf)
		ocfs2_free(&buf);
	return ret;
}
Example #17
0
errcode_t ocfs2_file_read(ocfs2_cached_inode *ci, void *buf, uint32_t count,
			  uint64_t offset, uint32_t *got)
{
	ocfs2_filesys	*fs = ci->ci_fs;
	errcode_t	ret = 0;
	char		*ptr = (char *) buf;
	uint32_t	wanted_blocks;
	uint64_t	contig_blocks;
	uint64_t	v_blkno;
	uint64_t	p_blkno;
	uint32_t	tmp;
	uint64_t	num_blocks;
	uint16_t	extent_flags;

	if (ci->ci_inode->i_dyn_features & OCFS2_INLINE_DATA_FL)
		return ocfs2_inline_data_read(ci->ci_inode, buf, count,
					      offset, got);

	/* o_direct requires aligned io */
	tmp = fs->fs_blocksize - 1;
	if ((count & tmp) || (offset & (uint64_t)tmp) ||
	    ((unsigned long)ptr & tmp))
		return OCFS2_ET_INVALID_ARGUMENT;

	wanted_blocks = count >> OCFS2_RAW_SB(fs->fs_super)->s_blocksize_bits;
	v_blkno = offset >> OCFS2_RAW_SB(fs->fs_super)->s_blocksize_bits;
	*got = 0;

	num_blocks = (ci->ci_inode->i_size + fs->fs_blocksize - 1) >>
			OCFS2_RAW_SB(fs->fs_super)->s_blocksize_bits;

	if (v_blkno >= num_blocks)
		return 0;

	if (v_blkno + wanted_blocks > num_blocks)
		wanted_blocks = (uint32_t) (num_blocks - v_blkno);

	while(wanted_blocks) {
		ret = ocfs2_extent_map_get_blocks(ci, v_blkno, 1,
						  &p_blkno, &contig_blocks,
						  &extent_flags);
		if (ret)
			return ret;

		if (contig_blocks > wanted_blocks)
			contig_blocks = wanted_blocks;

		if (!p_blkno || extent_flags & OCFS2_EXT_UNWRITTEN) {
			/*
			 * we meet with a hole or an unwritten extent,
			 * so just empty the content.
			 */
			memset(ptr, 0, contig_blocks * fs->fs_blocksize);
		} else {
			ret = ocfs2_read_blocks(fs, p_blkno, contig_blocks,
						ptr);
			if (ret)
				return ret;
		}

		*got += (contig_blocks <<
			 OCFS2_RAW_SB(fs->fs_super)->s_blocksize_bits);
		wanted_blocks -= contig_blocks;

		if (wanted_blocks) {
			ptr += (contig_blocks <<
				OCFS2_RAW_SB(fs->fs_super)->s_blocksize_bits);
			v_blkno += (uint64_t)contig_blocks;
		} else {
			if (*got + offset > ci->ci_inode->i_size)
				*got = (uint32_t) (ci->ci_inode->i_size - offset);
			/* break */
		}
	}

	return ret;
}