/*
 * A node has stopped heartbeating.  Drop it from the heartbeat accounting
 * and release whatever hold was being kept on its behalf (such holds come
 * from conn_up, conn_err, or hb_up).
 */
void o2quo_hb_down(u8 node)
{
	struct o2quo_state *state = &o2quo_state;

	spin_lock(&state->qs_lock);

	/* one fewer heartbeating node; the count must never go negative */
	state->qs_heartbeating -= 1;
	mlog_bug_on_msg(state->qs_heartbeating < 0,
			"node %u, %d heartbeating\n",
			node, state->qs_heartbeating);

	/* the node must have been marked as heartbeating before this call */
	mlog_bug_on_msg(!test_bit(node, state->qs_hb_bm), "node %u\n", node);
	clear_bit(node, state->qs_hb_bm);

	mlog(0, "node %u, %d total\n", node, state->qs_heartbeating);

	o2quo_clear_hold(state, node);

	spin_unlock(&state->qs_lock);
}
/*
 * Record a hold for @node.  The caller must hold qs->qs_lock.  The hold
 * counter is only bumped the first time the node's bit gets set, so calling
 * this repeatedly for the same node counts a single hold.
 */
static void o2quo_set_hold(struct o2quo_state *qs, u8 node)
{
	assert_spin_locked(&qs->qs_lock);

	/* bit was already set: this node is already counted */
	if (test_and_set_bit(node, qs->qs_hold_bm))
		return;

	qs->qs_holds++;
	mlog_bug_on_msg(qs->qs_holds == O2NM_MAX_NODES, "node %u\n", node);
	mlog(0, "node %u, %d total\n", node, qs->qs_holds);
}
/*
 * Connection-side counterpart of hb_up.  When a node's connection comes up
 * the quorum decision is delayed until the node is also seen heartbeating:
 * if it is not heartbeating yet a hold is taken, otherwise any hold kept for
 * the node is dropped.  A hold taken here is released by hb_up or hb_down,
 * and may be kept alive by conn_err until hb_down.
 */
void o2quo_conn_up(u8 node)
{
	struct o2quo_state *state = &o2quo_state;

	spin_lock(&state->qs_lock);

	state->qs_connected += 1;
	mlog_bug_on_msg(state->qs_connected == O2NM_MAX_NODES,
			"node %u\n", node);

	/* a node must not be reported as connected twice */
	mlog_bug_on_msg(test_bit(node, state->qs_conn_bm), "node %u\n", node);
	set_bit(node, state->qs_conn_bm);

	mlog(0, "node %u, %d total\n", node, state->qs_connected);

	if (test_bit(node, state->qs_hb_bm))
		o2quo_clear_hold(state, node);
	else
		o2quo_set_hold(state, node);

	spin_unlock(&state->qs_lock);
}
/*
 * Release the hold kept for @node, if there is one.  The caller must hold
 * qs->qs_lock.  When the last hold goes away and a decision was marked as
 * pending, the pending flag is cleared and qs->qs_work is scheduled.
 */
static void o2quo_clear_hold(struct o2quo_state *qs, u8 node)
{
	assert_spin_locked(&qs->qs_lock);

	/* no hold recorded for this node: nothing to release */
	if (!test_and_clear_bit(node, qs->qs_hold_bm))
		return;

	mlog(0, "node %u, %d total\n", node, qs->qs_holds - 1);

	qs->qs_holds--;
	if (qs->qs_holds == 0 && qs->qs_pending) {
		qs->qs_pending = 0;
		schedule_work(&qs->qs_work);
	}

	mlog_bug_on_msg(qs->qs_holds < 0, "node %u, holds %d\n",
			node, qs->qs_holds);
}
Exemple #5
0
/*
 * Match callback: @opaque is a struct ocfs2_find_inode_args.  Returns 1 when
 * the inode's ip_blkno equals the requested fi_blkno, 0 otherwise.
 */
static int ocfs2_find_actor(struct inode *inode, void *opaque)
{
	struct ocfs2_find_inode_args *args = opaque;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);

	mlog_bug_on_msg(!inode, "No inode in find actor!\n");

	trace_ocfs2_find_actor(inode, inode->i_ino, opaque, args->fi_blkno);

	return oi->ip_blkno == args->fi_blkno;
}
Exemple #6
0
/*
 * Match callback: @opaque is a struct ocfs2_find_inode_args.  Returns 1 when
 * the inode's ip_blkno equals the requested fi_blkno, 0 otherwise.
 */
static int ocfs2_find_actor(struct inode *inode, void *opaque)
{
	struct ocfs2_find_inode_args *args = opaque;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	int ret = 0;

	/*
	 * Assert before the entry trace: the trace reads inode->i_ino, so
	 * checking afterwards could never catch a NULL inode.
	 */
	mlog_bug_on_msg(!inode, "No inode in find actor!\n");

	mlog_entry("(0x%p, %lu, 0x%p)\n", inode, inode->i_ino, opaque);

	if (oi->ip_blkno != args->fi_blkno)
		goto bail;

	ret = 1;
bail:
	mlog_exit(ret);
	return ret;
}
Exemple #7
0
static void user_ast(struct ocfs2_dlm_lksb *lksb)
{
	struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);
	int status;

	mlog(ML_BASTS, "AST fired for lockres %.*s, level %d => %d\n",
	     lockres->l_namelen, lockres->l_name, lockres->l_level,
	     lockres->l_requested);

	spin_lock(&lockres->l_lock);

	status = ocfs2_dlm_lock_status(&lockres->l_lksb);
	if (status) {
		mlog(ML_ERROR, "lksb status value of %u on lockres %.*s\n",
		     status, lockres->l_namelen, lockres->l_name);
		spin_unlock(&lockres->l_lock);
		return;
	}

	mlog_bug_on_msg(lockres->l_requested == DLM_LOCK_IV,
			"Lockres %.*s, requested ivmode. flags 0x%x\n",
			lockres->l_namelen, lockres->l_name, lockres->l_flags);

	/* we're downconverting. */
	if (lockres->l_requested < lockres->l_level) {
		if (lockres->l_requested <=
		    user_highest_compat_lock_level(lockres->l_blocking)) {
			lockres->l_blocking = DLM_LOCK_NL;
			lockres->l_flags &= ~USER_LOCK_BLOCKED;
		}
	}

	lockres->l_level = lockres->l_requested;
	lockres->l_requested = DLM_LOCK_IV;
	lockres->l_flags |= USER_LOCK_ATTACHED;
	lockres->l_flags &= ~USER_LOCK_BUSY;

	spin_unlock(&lockres->l_lock);

	wake_up(&lockres->l_event);
}
Exemple #8
0
/*
 * AST callback for a user lock resource; @opaque is the user_lock_res.
 *
 * Under l_lock: bail out (after logging) if the lksb status is not
 * DLM_NORMAL, otherwise commit the requested level as the current one, mark
 * the lock attached and not busy, and finally wake waiters on l_event.
 */
static void user_ast(void *opaque)
{
	struct user_lock_res *lockres = opaque;
	struct dlm_lockstatus *lksb = &lockres->l_lksb;

	mlog(0, "AST fired for lockres %.*s\n", lockres->l_namelen,
	     lockres->l_name);

	spin_lock(&lockres->l_lock);

	if (lksb->status != DLM_NORMAL) {
		mlog(ML_ERROR, "lksb status value of %u on lockres %.*s\n",
		     lksb->status, lockres->l_namelen, lockres->l_name);
		spin_unlock(&lockres->l_lock);
		return;
	}

	mlog_bug_on_msg(lockres->l_requested == LKM_IVMODE,
			"Lockres %.*s, requested ivmode. flags 0x%x\n",
			lockres->l_namelen, lockres->l_name, lockres->l_flags);

	/*
	 * On a downconvert, stop being "blocked" once the new level is
	 * compatible with the level we were blocking.
	 */
	if (lockres->l_requested < lockres->l_level &&
	    lockres->l_requested <=
	    user_highest_compat_lock_level(lockres->l_blocking)) {
		lockres->l_blocking = LKM_NLMODE;
		lockres->l_flags &= ~USER_LOCK_BLOCKED;
	}

	lockres->l_level = lockres->l_requested;
	lockres->l_requested = LKM_IVMODE;
	lockres->l_flags |= USER_LOCK_ATTACHED;
	lockres->l_flags &= ~USER_LOCK_BUSY;

	spin_unlock(&lockres->l_lock);

	wake_up(&lockres->l_event);
}
/*
 * We have given up on ever connecting to this node again.  If it is still
 * heartbeating, take a hold that delays decisions until either the node
 * stops heartbeating (hb_down) or the caller decides the node is still up
 * and calls still_up.
 */
void o2quo_conn_err(u8 node)
{
	struct o2quo_state *state = &o2quo_state;

	spin_lock(&state->qs_lock);

	/* only undo the connection accounting if the node was counted */
	if (test_bit(node, state->qs_conn_bm)) {
		state->qs_connected -= 1;
		mlog_bug_on_msg(state->qs_connected < 0,
				"node %u, connected %d\n",
				node, state->qs_connected);

		clear_bit(node, state->qs_conn_bm);
	}

	mlog(0, "node %u, %d total\n", node, state->qs_connected);

	if (test_bit(node, state->qs_hb_bm))
		o2quo_set_hold(state, node);

	spin_unlock(&state->qs_lock);
}
Exemple #10
0
/*
 * Tear down the ocfs2-private state hanging off @inode.
 *
 * In order: calls clear_inode() and dquot_drop(), releases the open lock,
 * marks the rw/inode/open lock resources as freeing, discards and
 * re-initialises the local-alloc data reservation, checkpoints the inode
 * unless it carries OCFS2_INODE_DELETED, truncates the extent map, drops and
 * frees the cluster lock resources and shuts down the metadata cache.  A run
 * of mlog_bug_on_msg() sanity checks then verifies that nothing is still
 * cached, locked or open, after which the per-inode fields are reset and the
 * jbd2 inode is released.
 */
static void ocfs2_clear_inode(struct inode *inode)
{
	int status;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);

	clear_inode(inode);
	trace_ocfs2_clear_inode((unsigned long long)oi->ip_blkno,
				inode->i_nlink);

	mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
			"Inode=%lu\n", inode->i_ino);

	dquot_drop(inode);

	/* To prevent remote deletes we held the open lock until now; it
	 * is time to unlock the PR and EX open locks. */
	ocfs2_open_unlock(inode);

	/* Do these before all the other work so that we don't bounce
	 * the downconvert thread while waiting to destroy the locks. */
	ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres);
	ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres);
	ocfs2_mark_lockres_freeing(&oi->ip_open_lockres);

	ocfs2_resv_discard(&OCFS2_SB(inode->i_sb)->osb_la_resmap,
			   &oi->ip_la_data_resv);
	ocfs2_resv_init_once(&oi->ip_la_data_resv);

	/* We very well may get a clear_inode before all of an inode's
	 * metadata has hit disk. Of course, we can't drop any cluster
	 * locks until the journal has finished with it. The only
	 * exception here are successfully wiped inodes - their
	 * metadata can now be considered to be part of the system
	 * inodes from which it came. */
	if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED))
		ocfs2_checkpoint_inode(inode);

	mlog_bug_on_msg(!list_empty(&oi->ip_io_markers),
			"Clear inode of %llu, inode has io markers\n",
			(unsigned long long)oi->ip_blkno);

	ocfs2_extent_map_trunc(inode, 0);

	/* A failure to drop the locks is logged but does not stop teardown. */
	status = ocfs2_drop_inode_locks(inode);
	if (status < 0)
		mlog_errno(status);

	ocfs2_lock_res_free(&oi->ip_rw_lockres);
	ocfs2_lock_res_free(&oi->ip_inode_lockres);
	ocfs2_lock_res_free(&oi->ip_open_lockres);

	ocfs2_metadata_cache_exit(INODE_CACHE(inode));

	mlog_bug_on_msg(INODE_CACHE(inode)->ci_num_cached,
			"Clear inode of %llu, inode has %u cache items\n",
			(unsigned long long)oi->ip_blkno,
			INODE_CACHE(inode)->ci_num_cached);

	mlog_bug_on_msg(!(INODE_CACHE(inode)->ci_flags & OCFS2_CACHE_FL_INLINE),
			"Clear inode of %llu, inode has a bad flag\n",
			(unsigned long long)oi->ip_blkno);

	mlog_bug_on_msg(spin_is_locked(&oi->ip_lock),
			"Clear inode of %llu, inode is locked\n",
			(unsigned long long)oi->ip_blkno);

	/* The trylock is the assertion itself: it has to succeed, and the
	 * mutex is released again straight away. */
	mlog_bug_on_msg(!mutex_trylock(&oi->ip_io_mutex),
			"Clear inode of %llu, io_mutex is locked\n",
			(unsigned long long)oi->ip_blkno);
	mutex_unlock(&oi->ip_io_mutex);

	/*
	 * down_trylock() returns 0, down_write_trylock() returns 1
	 * kernel 1, world 0
	 */
	mlog_bug_on_msg(!down_write_trylock(&oi->ip_alloc_sem),
			"Clear inode of %llu, alloc_sem is locked\n",
			(unsigned long long)oi->ip_blkno);
	up_write(&oi->ip_alloc_sem);

	mlog_bug_on_msg(oi->ip_open_count,
			"Clear inode of %llu has open count %d\n",
			(unsigned long long)oi->ip_blkno, oi->ip_open_count);

	/* Clear all other flags. */
	oi->ip_flags = 0;
	oi->ip_dir_start_lookup = 0;
	oi->ip_blkno = 0ULL;

	/*
	 * ip_jinode is used to track txns against this inode. We ensure that
	 * the journal is flushed before journal shutdown. Thus it is safe to
	 * have inodes get cleaned up after journal shutdown.
	 */
	jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal,
				       &oi->ip_jinode);
}
Exemple #11
0
/*
 * Network handler for a proxied AST/BAST message; msg->buf carries a
 * struct dlm_proxy_ast and @data is the dlm_ctxt.
 *
 * The lock is looked up by cookie, first on the converting queue and then on
 * the blocked queue (AST) or granted queue (BAST).  For an AST the lock is
 * moved to the granted queue, a pending convert type is committed and the
 * LVB is copied into the lksb when LKM_GET_LVB was requested; the local
 * ast/bast is then delivered after res->spinlock has been dropped.
 *
 * locking:
 *   taken:         takes and drops res->spinlock
 *   held on exit:  none
 * returns: DLM_REJECTED (dlm_grab failed), DLM_IVBUFLEN, DLM_BADARGS,
 *          DLM_IVLOCKID, DLM_RECOVERING, DLM_MIGRATING, or DLM_NORMAL.
 *          DLM_NORMAL is also returned when no lock matches the cookie.
 */
int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
			  void **ret_data)
{
	int ret;
	unsigned int locklen;
	struct dlm_ctxt *dlm = data;
	struct dlm_lock_resource *res = NULL;
	struct dlm_lock *lock = NULL;
	struct dlm_proxy_ast *past = (struct dlm_proxy_ast *) msg->buf;
	char *name;
	struct list_head *head = NULL;
	__be64 cookie;
	u32 flags;
	u8 node;

	if (!dlm_grab(dlm)) {
		dlm_error(DLM_REJECTED);
		return DLM_REJECTED;
	}

	mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
			"Domain %s not fully joined!\n", dlm->name);

	/* cookie stays big-endian; it is only converted for logging */
	name = past->name;
	locklen = past->namelen;
	cookie = past->cookie;
	flags = be32_to_cpu(past->flags);
	node = past->node_idx;

	if (locklen > DLM_LOCKID_NAME_MAX) {
		ret = DLM_IVBUFLEN;
		mlog(ML_ERROR, "Invalid name length (%d) in proxy ast "
		     "handler!\n", locklen);
		goto leave;
	}

	if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) ==
	     (LKM_PUT_LVB|LKM_GET_LVB)) {
		mlog(ML_ERROR, "Both PUT and GET lvb specified, (0x%x)\n",
		     flags);
		ret = DLM_BADARGS;
		goto leave;
	}

	mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" :
		  (flags & LKM_GET_LVB ? "get lvb" : "none"));

	mlog(0, "type=%d, blocked_type=%d\n", past->type, past->blocked_type);

	if (past->type != DLM_AST &&
	    past->type != DLM_BAST) {
		/* keep ", " between the cookie and name fields, as in the
		 * other messages below */
		mlog(ML_ERROR, "Unknown ast type! %d, cookie=%u:%llu, "
		     "name=%.*s, node=%u\n", past->type,
		     dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
		     dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
		     locklen, name, node);
		ret = DLM_IVLOCKID;
		goto leave;
	}

	res = dlm_lookup_lockres(dlm, name, locklen);
	if (!res) {
		mlog(0, "Got %sast for unknown lockres! cookie=%u:%llu, "
		     "name=%.*s, node=%u\n", (past->type == DLM_AST ? "" : "b"),
		     dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
		     dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
		     locklen, name, node);
		ret = DLM_IVLOCKID;
		goto leave;
	}

	/* cannot get a proxy ast message if this node owns it */
	BUG_ON(res->owner == dlm->node_num);

	mlog(0, "%s: res %.*s\n", dlm->name, res->lockname.len,
	     res->lockname.name);

	spin_lock(&res->spinlock);
	if (res->state & DLM_LOCK_RES_RECOVERING) {
		mlog(0, "Responding with DLM_RECOVERING!\n");
		ret = DLM_RECOVERING;
		goto unlock_out;
	}
	if (res->state & DLM_LOCK_RES_MIGRATING) {
		mlog(0, "Responding with DLM_MIGRATING!\n");
		ret = DLM_MIGRATING;
		goto unlock_out;
	}
	/* try convert queue for both ast/bast */
	head = &res->converting;
	lock = NULL;
	list_for_each_entry(lock, head, list) {
		if (lock->ml.cookie == cookie)
			goto do_ast;
	}

	/* if not on convert, try blocked for ast, granted for bast */
	if (past->type == DLM_AST)
		head = &res->blocked;
	else
		head = &res->granted;

	list_for_each_entry(lock, head, list) {
		if (lock->ml.cookie == cookie)
			goto do_ast;
	}

	/* no match on any queue: logged, but still answered with DLM_NORMAL */
	mlog(0, "Got %sast for unknown lock! cookie=%u:%llu, name=%.*s, "
	     "node=%u\n", past->type == DLM_AST ? "" : "b",
	     dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
	     dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
	     locklen, name, node);

	ret = DLM_NORMAL;
unlock_out:
	spin_unlock(&res->spinlock);
	goto leave;

do_ast:
	/* reached with res->spinlock held and lock pointing at the match */
	ret = DLM_NORMAL;
	if (past->type == DLM_AST) {
		/* do not alter lock refcount.  switching lists. */
		list_move_tail(&lock->list, &res->granted);
		mlog(0, "%s: res %.*s, lock %u:%llu, Granted type %d => %d\n",
		     dlm->name, res->lockname.len, res->lockname.name,
		     dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
		     dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
		     lock->ml.type, lock->ml.convert_type);

		if (lock->ml.convert_type != LKM_IVMODE) {
			lock->ml.type = lock->ml.convert_type;
			lock->ml.convert_type = LKM_IVMODE;
		} else {
			// should already be there....
		}

		lock->lksb->status = DLM_NORMAL;

		/* if we requested the lvb, fetch it into our lksb now */
		if (flags & LKM_GET_LVB) {
			BUG_ON(!(lock->lksb->flags & DLM_LKSB_GET_LVB));
			memcpy(lock->lksb->lvb, past->lvb, DLM_LVB_LEN);
		}
	}
	spin_unlock(&res->spinlock);

	/* deliver the callback only after the resource lock is released */
	if (past->type == DLM_AST)
		dlm_do_local_ast(dlm, res, lock);
	else
		dlm_do_local_bast(dlm, res, lock, past->blocked_type);

leave:
	if (res)
		dlm_lockres_put(res);

	dlm_put(dlm);
	return ret;
}
/*
 * locking:
 *   caller needs:  none
 *   taken:         takes and drops res->spinlock
 *   held on exit:  none
 * returns: DLM_NORMAL, DLM_BADARGS, DLM_IVLOCKID,
 *          return value from dlmunlock_master
 */
int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data,
			    void **ret_data)
{
	struct dlm_ctxt *dlm = data;
	struct dlm_unlock_lock *unlock = (struct dlm_unlock_lock *)msg->buf;
	struct dlm_lock_resource *res = NULL;
	struct list_head *iter;
	struct dlm_lock *lock = NULL;
	enum dlm_status status = DLM_NORMAL;
	int found = 0, i;
	struct dlm_lockstatus *lksb = NULL;
	int ignore;
	u32 flags;
	struct list_head *queue;

	flags = be32_to_cpu(unlock->flags);

	if (flags & LKM_GET_LVB) {
		mlog(ML_ERROR, "bad args!  GET_LVB specified on unlock!\n");
		return DLM_BADARGS;
	}

	if ((flags & (LKM_PUT_LVB|LKM_CANCEL)) == (LKM_PUT_LVB|LKM_CANCEL)) {
		mlog(ML_ERROR, "bad args!  cannot modify lvb on a CANCEL "
		     "request!\n");
		return DLM_BADARGS;
	}

	if (unlock->namelen > DLM_LOCKID_NAME_MAX) {
		mlog(ML_ERROR, "Invalid name length in unlock handler!\n");
		return DLM_IVBUFLEN;
	}

	if (!dlm_grab(dlm))
		return DLM_REJECTED;

	mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
			"Domain %s not fully joined!\n", dlm->name);

	mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" : "none");

	res = dlm_lookup_lockres(dlm, unlock->name, unlock->namelen);
	if (!res) {
		/* We assume here that a no lock resource simply means
		 * it was migrated away and destroyed before the other
		 * node could detect it. */
		mlog(0, "returning DLM_FORWARD -- res no longer exists\n");
		status = DLM_FORWARD;
		goto not_found;
	}

	queue=&res->granted;
	found = 0;
	spin_lock(&res->spinlock);
	if (res->state & DLM_LOCK_RES_RECOVERING) {
		spin_unlock(&res->spinlock);
		mlog(0, "returning DLM_RECOVERING\n");
		status = DLM_RECOVERING;
		goto leave;
	}

	if (res->state & DLM_LOCK_RES_MIGRATING) {
		spin_unlock(&res->spinlock);
		mlog(0, "returning DLM_MIGRATING\n");
		status = DLM_MIGRATING;
		goto leave;
	}

	if (res->owner != dlm->node_num) {
		spin_unlock(&res->spinlock);
		mlog(0, "returning DLM_FORWARD -- not master\n");
		status = DLM_FORWARD;
		goto leave;
	}

	for (i=0; i<3; i++) {
		list_for_each(iter, queue) {
			lock = list_entry(iter, struct dlm_lock, list);
			if (lock->ml.cookie == unlock->cookie &&
		    	    lock->ml.node == unlock->node_idx) {
				dlm_lock_get(lock);
				found = 1;
				break;
			}
		}
		if (found)
			break;
		/* scan granted -> converting -> blocked queues */
		queue++;
	}
Exemple #13
0
/*
 * Append @rec to the tail of the inode's extent map.  The record must be
 * tree_depth 0.  It may extend a record that is already in the map, which
 * has to be handled, e.g.:
 *
 * Existing record in the extent map:
 *
 *	cpos = 10, len = 10
 *	|---------|
 *
 * New record:
 *
 *	cpos = 10, len = 20
 *	|------------------|
 *
 * @rec is the new on-disk record and @new_clusters is the number of clusters
 * that were added to the file.  For a contiguous append new_clusters has
 * already been added to rec->e_clusters; for an entirely new extent
 * rec->e_clusters == new_clusters.
 *
 * Returns 0 when an existing leaf record was extended in place, otherwise
 * the result of ocfs2_extent_map_insert().
 */
int ocfs2_extent_map_append(struct inode *inode,
			    struct ocfs2_extent_rec *rec,
			    u32 new_clusters)
{
	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
	struct ocfs2_extent_map_entry *ent = NULL;
	struct ocfs2_extent_rec *old;
	u32 rec_cpos, rec_clusters, rec_end;
	int ret = -ENOENT;

	BUG_ON(!new_clusters);
	BUG_ON(le32_to_cpu(rec->e_clusters) < new_clusters);

	if (em->em_clusters < OCFS2_I(inode)->ip_clusters) {
		/*
		 * Size changed underneath us on disk.  Drop any
		 * straddling records and update our idea of
		 * i_clusters
		 */
		ocfs2_extent_map_drop(inode, em->em_clusters - 1);
		em->em_clusters = OCFS2_I(inode)->ip_clusters;
	}

	/* cpu-endian view of the new record, reused by the checks below */
	rec_cpos = le32_to_cpu(rec->e_cpos);
	rec_clusters = le32_to_cpu(rec->e_clusters);
	rec_end = rec_cpos + rec_clusters;

	mlog_bug_on_msg(rec_end != (em->em_clusters + new_clusters),
			"Inode %llu:\n"
			"rec->e_cpos = %u + rec->e_clusters = %u = %u\n"
			"em->em_clusters = %u + new_clusters = %u = %u\n",
			(unsigned long long)OCFS2_I(inode)->ip_blkno,
			rec_cpos, rec_clusters, rec_end,
			em->em_clusters, new_clusters,
			em->em_clusters + new_clusters);

	em->em_clusters += new_clusters;

	/* more clusters in the record than were added: contiguous append */
	if (rec_clusters > new_clusters)
		ent = ocfs2_extent_map_lookup(em, rec_cpos, 1, NULL, NULL);

	if (ent) {
		old = &ent->e_rec;
		BUG_ON(rec_end != (le32_to_cpu(old->e_cpos) +
				   le32_to_cpu(old->e_clusters) +
				   new_clusters));
		if (ent->e_tree_depth == 0) {
			BUG_ON(le32_to_cpu(old->e_cpos) != rec_cpos);
			BUG_ON(le64_to_cpu(old->e_blkno) !=
			       le64_to_cpu(rec->e_blkno));
			ret = 0;
		}
		/*
		 * Non-leafs keep ret == -ENOENT so that the new leaf
		 * is inserted below.
		 */
		le32_add_cpu(&old->e_clusters, new_clusters);
	}

	if (ret == -ENOENT)
		ret = ocfs2_extent_map_insert(inode, rec, 0);
	if (ret < 0)
		mlog_errno(ret);
	return ret;
}
Exemple #14
0
/*
 * Work item run for a queued user lock resource.
 *
 * Clears USER_LOCK_QUEUED and then either gives up (not blocked any more,
 * in teardown, cancel already in flight, or incompatible holders remain),
 * cancels an in-flight request via dlmunlock(LKM_CANCEL), or issues a
 * downconvert to the highest level compatible with l_blocking.  Every path
 * ends by calling user_dlm_drop_inode_ref().
 */
static void user_dlm_unblock_lock(struct work_struct *work)
{
	int new_level, status;
	struct user_lock_res *lockres =
		container_of(work, struct user_lock_res, l_work);
	struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);

	mlog(0, "processing lockres %.*s\n", lockres->l_namelen,
	     lockres->l_name);

	spin_lock(&lockres->l_lock);

	mlog_bug_on_msg(!(lockres->l_flags & USER_LOCK_QUEUED),
			"Lockres %.*s, flags 0x%x\n",
			lockres->l_namelen, lockres->l_name, lockres->l_flags);

	/* USER_LOCK_BLOCKED is deliberately left alone here: if it is
	 * set, user_ast is the one that clears it. */
	lockres->l_flags &= ~USER_LOCK_QUEUED;

	/* Getting here while no longer blocked is valid - with several
	 * basts in a row we might be queued by the first one, have the
	 * unblock thread run and clear the queued flag, and then get
	 * another bast which re-queues us before the ast for the
	 * downconvert is called. */
	if (!(lockres->l_flags & USER_LOCK_BLOCKED))
		goto unlock_drop;

	if (lockres->l_flags & USER_LOCK_IN_TEARDOWN)
		goto unlock_drop;

	if (lockres->l_flags & USER_LOCK_BUSY) {
		/* a cancel is already in flight */
		if (lockres->l_flags & USER_LOCK_IN_CANCEL)
			goto unlock_drop;

		lockres->l_flags |= USER_LOCK_IN_CANCEL;
		spin_unlock(&lockres->l_lock);

		status = dlmunlock(dlm,
				   &lockres->l_lksb,
				   LKM_CANCEL,
				   user_unlock_ast,
				   lockres);
		if (status != DLM_NORMAL)
			user_log_dlm_error("dlmunlock", status, lockres);
		goto drop_ref;
	}

	/* With incompatible holders still around we can exit safely
	 * without re-queueing this lock; that happens on the last call
	 * to user_cluster_unlock. */
	if (lockres->l_blocking == LKM_EXMODE &&
	    (lockres->l_ex_holders || lockres->l_ro_holders)) {
		spin_unlock(&lockres->l_lock);
		mlog(0, "can't downconvert for ex: ro = %u, ex = %u\n",
			lockres->l_ro_holders, lockres->l_ex_holders);
		goto drop_ref;
	}

	if (lockres->l_blocking == LKM_PRMODE && lockres->l_ex_holders) {
		spin_unlock(&lockres->l_lock);
		mlog(0, "can't downconvert for pr: ex = %u\n",
			lockres->l_ex_holders);
		goto drop_ref;
	}

	/* nothing stands in the way of the downconvert any more */
	new_level = user_highest_compat_lock_level(lockres->l_blocking);
	lockres->l_requested = new_level;
	lockres->l_flags |= USER_LOCK_BUSY;
	mlog(0, "Downconvert lock from %d to %d\n",
		lockres->l_level, new_level);
	spin_unlock(&lockres->l_lock);

	/* issue the downconvert request itself */
	status = dlmlock(dlm,
			 new_level,
			 &lockres->l_lksb,
			 LKM_CONVERT|LKM_VALBLK,
			 lockres->l_name,
			 lockres->l_namelen,
			 user_ast,
			 lockres,
			 user_bast);
	if (status != DLM_NORMAL) {
		user_log_dlm_error("dlmlock", status, lockres);
		user_recover_from_dlm_error(lockres);
	}
	goto drop_ref;

unlock_drop:
	/* bail-out paths that still hold l_lock land here */
	spin_unlock(&lockres->l_lock);
drop_ref:
	user_dlm_drop_inode_ref(lockres);
}
Exemple #15
0
/*
 * Read the on-disk dinode at args->fi_blkno and populate @inode from it.
 *
 * Returns 0 on success.  On failure a non-zero status is returned: -EINVAL
 * for a NULL inode / superblock or NULL @args, otherwise the status of the
 * lock or read helper that failed.  Every failure path after the initial
 * inode check calls make_bad_inode().
 */
static int ocfs2_read_locked_inode(struct inode *inode,
				   struct ocfs2_find_inode_args *args)
{
	struct super_block *sb;
	struct ocfs2_super *osb;
	struct ocfs2_dinode *fe;
	struct buffer_head *bh = NULL;
	int status, can_lock;
	u32 generation = 0;

	status = -EINVAL;
	if (inode == NULL || inode->i_sb == NULL) {
		mlog(ML_ERROR, "bad inode\n");
		return status;
	}
	sb = inode->i_sb;
	osb = OCFS2_SB(sb);

	if (!args) {
		mlog(ML_ERROR, "bad inode args\n");
		make_bad_inode(inode);
		return status;
	}

	/*
	 * To improve performance of cold-cache inode stats, we take
	 * the cluster lock here if possible.
	 *
	 * Generally, OCFS2 never trusts the contents of an inode
	 * unless it's holding a cluster lock, so taking it here isn't
	 * a correctness issue as much as it is a performance
	 * improvement.
	 *
	 * There are three times when taking the lock is not a good idea:
	 *
	 * 1) During startup, before we have initialized the DLM.
	 *
	 * 2) If we are reading certain system files which never get
	 *    cluster locks (local alloc, truncate log).
	 *
	 * 3) If the process doing the iget() is responsible for
	 *    orphan dir recovery. We're holding the orphan dir lock and
	 *    can get into a deadlock with another process on another
	 *    node in ->delete_inode().
	 *
	 * #1 and #2 can be simply solved by never taking the lock
	 * here for system files (which are the only type we read
	 * during mount). It's a heavier approach, but our main
	 * concern is user-accessible files anyway.
	 *
	 * #3 works itself out because we'll eventually take the
	 * cluster lock before trusting anything anyway.
	 */
	can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE)
		&& !(args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY)
		&& !ocfs2_mount_local(osb);

	trace_ocfs2_read_locked_inode(
		(unsigned long long)OCFS2_I(inode)->ip_blkno, can_lock);

	/*
	 * To maintain backwards compatibility with older versions of
	 * ocfs2-tools, we still store the generation value for system
	 * files. The only ones that actually matter to userspace are
	 * the journals, but it's easier and inexpensive to just flag
	 * all system files similarly.
	 */
	if (args->fi_flags & OCFS2_FI_FLAG_SYSFILE)
		generation = osb->fs_generation;

	ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_inode_lockres,
				  OCFS2_LOCK_TYPE_META,
				  generation, inode);

	ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
				  OCFS2_LOCK_TYPE_OPEN,
				  0, inode);

	/* Take the open lock first, then the inode lock. */
	if (can_lock) {
		status = ocfs2_open_lock(inode);
		if (status) {
			make_bad_inode(inode);
			mlog_errno(status);
			return status;
		}
		status = ocfs2_inode_lock(inode, NULL, 0);
		if (status) {
			make_bad_inode(inode);
			mlog_errno(status);
			return status;
		}
	}

	/*
	 * can_lock is always false when OCFS2_FI_FLAG_ORPHAN_RECOVERY is set,
	 * so this try-lock never runs together with the locking above.
	 */
	if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) {
		status = ocfs2_try_open_lock(inode, 0);
		if (status) {
			make_bad_inode(inode);
			return status;
		}
	}

	if (can_lock) {
		status = ocfs2_read_inode_block_full(inode, &bh,
						     OCFS2_BH_IGNORE_CACHE);
	} else {
		status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
		/*
		 * If buffer is in jbd, then its checksum may not have been
		 * computed as yet.
		 */
		if (!status && !buffer_jbd(bh))
			status = ocfs2_validate_inode_block(osb->sb, bh);
	}
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/* Pessimistic default; reset to 0 once the inode is populated. */
	status = -EINVAL;
	fe = (struct ocfs2_dinode *) bh->b_data;

	/*
	 * This is a code bug. Right now the caller needs to
	 * understand whether it is asking for a system file inode or
	 * not so the proper lock names can be built.
	 */
	mlog_bug_on_msg(!!(fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) !=
			!!(args->fi_flags & OCFS2_FI_FLAG_SYSFILE),
			"Inode %llu: system file state is ambigous\n",
			(unsigned long long)args->fi_blkno);

	if (S_ISCHR(le16_to_cpu(fe->i_mode)) ||
	    S_ISBLK(le16_to_cpu(fe->i_mode)))
		inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));

	ocfs2_populate_inode(inode, fe, 0);

	BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno));

	status = 0;

bail:
	/*
	 * Only the inode lock taken above is dropped here; nothing in this
	 * function releases the open lock.
	 */
	if (can_lock)
		ocfs2_inode_unlock(inode, 0);

	if (status < 0)
		make_bad_inode(inode);

	/* args was already checked for NULL above, so this tests bh only. */
	if (args && bh)
		brelse(bh);

	return status;
}
Exemple #16
0
/*
 * Tear down the ocfs2-private state hanging off @inode.  Does nothing
 * (beyond the entry/exit traces) when @inode is NULL.
 *
 * In order: releases the open lock, marks the rw/meta/data/open lock
 * resources as freeing, checkpoints the inode unless it carries
 * OCFS2_INODE_DELETED, truncates the extent map, drops and frees the cluster
 * lock resources and purges the metadata cache.  A run of mlog_bug_on_msg()
 * sanity checks then verifies that nothing is still cached, locked or open,
 * after which the per-inode fields are reset.
 */
void ocfs2_clear_inode(struct inode *inode)
{
	int status;
	/* NOTE(review): oi is derived from inode before the NULL test below. */
	struct ocfs2_inode_info *oi = OCFS2_I(inode);

	mlog_entry_void();

	if (!inode)
		goto bail;

	mlog(0, "Clearing inode: %llu, nlink = %u\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_nlink);

	mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
			"Inode=%lu\n", inode->i_ino);

	/* For the remote delete_inode vote we held the open lock until
	 * now; it is time to unlock the PR and EX open locks. */
	ocfs2_open_unlock(inode);

	/* Do these before all the other work so that we don't bounce
	 * the vote thread while waiting to destroy the locks. */
	ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres);
	ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres);
	ocfs2_mark_lockres_freeing(&oi->ip_data_lockres);
	ocfs2_mark_lockres_freeing(&oi->ip_open_lockres);

	/* We very well may get a clear_inode before all of an inode's
	 * metadata has hit disk. Of course, we can't drop any cluster
	 * locks until the journal has finished with it. The only
	 * exception here are successfully wiped inodes - their
	 * metadata can now be considered to be part of the system
	 * inodes from which it came. */
	if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED))
		ocfs2_checkpoint_inode(inode);

	mlog_bug_on_msg(!list_empty(&oi->ip_io_markers),
			"Clear inode of %llu, inode has io markers\n",
			(unsigned long long)oi->ip_blkno);

	ocfs2_extent_map_trunc(inode, 0);

	/* A failure to drop the locks is logged but does not stop teardown. */
	status = ocfs2_drop_inode_locks(inode);
	if (status < 0)
		mlog_errno(status);

	ocfs2_lock_res_free(&oi->ip_rw_lockres);
	ocfs2_lock_res_free(&oi->ip_meta_lockres);
	ocfs2_lock_res_free(&oi->ip_data_lockres);
	ocfs2_lock_res_free(&oi->ip_open_lockres);

	ocfs2_metadata_cache_purge(inode);

	mlog_bug_on_msg(oi->ip_metadata_cache.ci_num_cached,
			"Clear inode of %llu, inode has %u cache items\n",
			(unsigned long long)oi->ip_blkno, oi->ip_metadata_cache.ci_num_cached);

	mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE),
			"Clear inode of %llu, inode has a bad flag\n",
			(unsigned long long)oi->ip_blkno);

	mlog_bug_on_msg(spin_is_locked(&oi->ip_lock),
			"Clear inode of %llu, inode is locked\n",
			(unsigned long long)oi->ip_blkno);

	/* The trylock is the assertion itself: it has to succeed, and the
	 * mutex is released again straight away. */
	mlog_bug_on_msg(!mutex_trylock(&oi->ip_io_mutex),
			"Clear inode of %llu, io_mutex is locked\n",
			(unsigned long long)oi->ip_blkno);
	mutex_unlock(&oi->ip_io_mutex);

	/*
	 * down_trylock() returns 0, down_write_trylock() returns 1
	 * kernel 1, world 0
	 */
	mlog_bug_on_msg(!down_write_trylock(&oi->ip_alloc_sem),
			"Clear inode of %llu, alloc_sem is locked\n",
			(unsigned long long)oi->ip_blkno);
	up_write(&oi->ip_alloc_sem);

	mlog_bug_on_msg(oi->ip_open_count,
			"Clear inode of %llu has open count %d\n",
			(unsigned long long)oi->ip_blkno, oi->ip_open_count);

	/* Clear all other flags; only OCFS2_INODE_CACHE_INLINE is kept set. */
	oi->ip_flags = OCFS2_INODE_CACHE_INLINE;
	oi->ip_created_trans = 0;
	oi->ip_last_trans = 0;
	oi->ip_dir_start_lookup = 0;
	oi->ip_blkno = 0ULL;

bail:
	mlog_exit_void();
}
int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
			  void **ret_data)
{
	int ret;
	unsigned int locklen;
	struct dlm_ctxt *dlm = data;
	struct dlm_lock_resource *res = NULL;
	struct dlm_lock *lock = NULL;
	struct dlm_proxy_ast *past = (struct dlm_proxy_ast *) msg->buf;
	char *name;
	struct list_head *iter, *head=NULL;
	u64 cookie;
	u32 flags;
	u8 node;

	if (!dlm_grab(dlm)) {
		dlm_error(DLM_REJECTED);
		return DLM_REJECTED;
	}

	mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
			"Domain %s not fully joined!\n", dlm->name);

	name = past->name;
	locklen = past->namelen;
	cookie = past->cookie;
	flags = be32_to_cpu(past->flags);
	node = past->node_idx;

	if (locklen > DLM_LOCKID_NAME_MAX) {
		ret = DLM_IVBUFLEN;
		mlog(ML_ERROR, "Invalid name length (%d) in proxy ast "
		     "handler!\n", locklen);
		goto leave;
	}

	if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) ==
	     (LKM_PUT_LVB|LKM_GET_LVB)) {
		mlog(ML_ERROR, "Both PUT and GET lvb specified, (0x%x)\n",
		     flags);
		ret = DLM_BADARGS;
		goto leave;
	}

	mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" :
		  (flags & LKM_GET_LVB ? "get lvb" : "none"));

	mlog(0, "type=%d, blocked_type=%d\n", past->type, past->blocked_type);

	if (past->type != DLM_AST &&
	    past->type != DLM_BAST) {
		mlog(ML_ERROR, "Unknown ast type! %d, cookie=%u:%llu"
		     "name=%.*s, node=%u\n", past->type,
		     dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
		     dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
		     locklen, name, node);
		ret = DLM_IVLOCKID;
		goto leave;
	}

	res = dlm_lookup_lockres(dlm, name, locklen);
	if (!res) {
		mlog(0, "Got %sast for unknown lockres! cookie=%u:%llu, "
		     "name=%.*s, node=%u\n", (past->type == DLM_AST ? "" : "b"),
		     dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
		     dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
		     locklen, name, node);
		ret = DLM_IVLOCKID;
		goto leave;
	}

	/* cannot get a proxy ast message if this node owns it */
	BUG_ON(res->owner == dlm->node_num);

	mlog(0, "lockres %.*s\n", res->lockname.len, res->lockname.name);

	spin_lock(&res->spinlock);
	if (res->state & DLM_LOCK_RES_RECOVERING) {
		mlog(0, "Responding with DLM_RECOVERING!\n");
		ret = DLM_RECOVERING;
		goto unlock_out;
	}
	if (res->state & DLM_LOCK_RES_MIGRATING) {
		mlog(0, "Responding with DLM_MIGRATING!\n");
		ret = DLM_MIGRATING;
		goto unlock_out;
	}
	/* try convert queue for both ast/bast */
	head = &res->converting;
	lock = NULL;
	list_for_each(iter, head) {
		lock = list_entry (iter, struct dlm_lock, list);
		if (lock->ml.cookie == cookie)
			goto do_ast;
	}
Exemple #18
0
/*
 * Resolve logical block @iblock of @inode and, when it has a usable
 * physical location, map it into @bh_result.
 *
 * Symlink inodes are handed to ocfs2_symlink_get_block().  For all other
 * inodes the physical block comes from ocfs2_extent_map_get_blocks(); it
 * is mapped only when it is non-zero and its extent is not flagged
 * OCFS2_EXT_UNWRITTEN.  When ocfs2_sparse_alloc() is false, a zero
 * physical block is treated as an error, and a @create request at or
 * past the block count for i_size marks the buffer new.
 *
 * Returns the lookup status; any negative status is reported as -EIO.
 */
static int ocfs2_get_block(struct inode *inode, sector_t iblock,
			   struct buffer_head *bh_result, int create)
{
	int err = 0;
	unsigned int ext_flags;
	/* Zeroed so the lookup-failure message below never prints garbage. */
	u64 p_blkno = 0, past_eof;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
		   (unsigned long long)iblock, bh_result, create);

	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
		mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n",
		     inode, inode->i_ino);

	if (S_ISLNK(inode->i_mode)) {
		/* this always does I/O for some reason. */
		err = ocfs2_symlink_get_block(inode, iblock, bh_result, create);
		goto bail;
	}

	err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, NULL,
					  &ext_flags);
	if (err) {
		mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
		     "%llu, NULL)\n", err, inode, (unsigned long long)iblock,
		     (unsigned long long)p_blkno);
		goto bail;
	}

	/*
	 * ocfs2 never allocates in this function - the only time we
	 * need to use BH_New is when we're extending i_size on a file
	 * system which doesn't support holes, in which case BH_New
	 * allows block_prepare_write() to zero.
	 */
	mlog_bug_on_msg(create && p_blkno == 0 && ocfs2_sparse_alloc(osb),
			"ino %lu, iblock %llu\n", inode->i_ino,
			(unsigned long long)iblock);

	/* Treat the unwritten extent as a hole for zeroing purposes. */
	if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
		map_bh(bh_result, inode->i_sb, p_blkno);

	if (!ocfs2_sparse_alloc(osb)) {
		if (p_blkno == 0) {
			err = -EIO;
			mlog(ML_ERROR,
			     "iblock = %llu p_blkno = %llu blkno=(%llu)\n",
			     (unsigned long long)iblock,
			     (unsigned long long)p_blkno,
			     (unsigned long long)OCFS2_I(inode)->ip_blkno);
			mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters);
			dump_stack();
			/*
			 * Nothing was mapped, so stop here: flagging an
			 * unmapped buffer as new while returning an error
			 * would leave @bh_result in an inconsistent state.
			 */
			goto bail;
		}

		past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
		mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
		     (unsigned long long)past_eof);

		if (create && (iblock >= past_eof))
			set_buffer_new(bh_result);
	}

bail:
	/* Callers only ever see -EIO for failures, whatever the cause. */
	if (err < 0)
		err = -EIO;

	mlog_exit(err);
	return err;
}
Exemple #19
0
/*
 * Work callback for the l_work member of a struct user_lock_res.
 *
 * Clears USER_LOCK_QUEUED and then decides, under l_lock, what to do
 * about the level recorded in l_blocking:
 *   - not blocked any more, or in teardown: nothing to do;
 *   - a request is in flight (USER_LOCK_BUSY): cancel it with
 *     DLM_LKF_CANCEL unless a cancel is already pending;
 *   - incompatible holders remain: nothing to do yet;
 *   - otherwise: request a convert to the level returned by
 *     user_highest_compat_lock_level().
 *
 * Every path ends by calling user_dlm_drop_inode_ref() on the lockres.
 */
static void user_dlm_unblock_lock(struct work_struct *work)
{
	int new_level, status;
	struct user_lock_res *lockres =
		container_of(work, struct user_lock_res, l_work);
	struct ocfs2_cluster_connection *conn =
		cluster_connection_from_user_lockres(lockres);

	mlog(0, "lockres %.*s\n", lockres->l_namelen, lockres->l_name);

	spin_lock(&lockres->l_lock);

	mlog_bug_on_msg(!(lockres->l_flags & USER_LOCK_QUEUED),
			"Lockres %.*s, flags 0x%x\n",
			lockres->l_namelen, lockres->l_name, lockres->l_flags);

	/* notice that we don't clear USER_LOCK_BLOCKED here. If it's
	 * set, we want user_ast clear it. */
	lockres->l_flags &= ~USER_LOCK_QUEUED;

	/* It's valid to get here and no longer be blocked - if we get
	 * several basts in a row, we might be queued by the first
	 * one, the unblock thread might run and clear the queued
	 * flag, and finally we might get another bast which re-queues
	 * us before our ast for the downconvert is called. */
	if (!(lockres->l_flags & USER_LOCK_BLOCKED)) {
		mlog(ML_BASTS, "lockres %.*s USER_LOCK_BLOCKED\n",
		     lockres->l_namelen, lockres->l_name);
		spin_unlock(&lockres->l_lock);
		goto drop_ref;
	}

	if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
		mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_TEARDOWN\n",
		     lockres->l_namelen, lockres->l_name);
		spin_unlock(&lockres->l_lock);
		goto drop_ref;
	}

	if (lockres->l_flags & USER_LOCK_BUSY) {
		/* A cancel is already pending; do not issue a second one. */
		if (lockres->l_flags & USER_LOCK_IN_CANCEL) {
			mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_CANCEL\n",
			     lockres->l_namelen, lockres->l_name);
			spin_unlock(&lockres->l_lock);
			goto drop_ref;
		}

		lockres->l_flags |= USER_LOCK_IN_CANCEL;
		spin_unlock(&lockres->l_lock);

		status = ocfs2_dlm_unlock(conn, &lockres->l_lksb,
					  DLM_LKF_CANCEL);
		if (status)
			user_log_dlm_error("ocfs2_dlm_unlock", status, lockres);
		goto drop_ref;
	}

	/* If there are still incompat holders, we can exit safely
	 * without worrying about re-queueing this lock as that will
	 * happen on the last call to user_cluster_unlock. */
	if ((lockres->l_blocking == DLM_LOCK_EX)
	    && (lockres->l_ex_holders || lockres->l_ro_holders)) {
		/* Log before unlocking so the counts printed are the
		 * ones that were just tested under l_lock. */
		mlog(ML_BASTS, "lockres %.*s, EX/PR Holders %u,%u\n",
		     lockres->l_namelen, lockres->l_name,
		     lockres->l_ex_holders, lockres->l_ro_holders);
		spin_unlock(&lockres->l_lock);
		goto drop_ref;
	}

	if ((lockres->l_blocking == DLM_LOCK_PR)
	    && lockres->l_ex_holders) {
		/* Same as above: read the holder count while still locked. */
		mlog(ML_BASTS, "lockres %.*s, EX Holders %u\n",
		     lockres->l_namelen, lockres->l_name,
		     lockres->l_ex_holders);
		spin_unlock(&lockres->l_lock);
		goto drop_ref;
	}

	/* yay, we can downconvert now. */
	new_level = user_highest_compat_lock_level(lockres->l_blocking);
	lockres->l_requested = new_level;
	lockres->l_flags |= USER_LOCK_BUSY;
	mlog(ML_BASTS, "lockres %.*s, downconvert %d => %d\n",
	     lockres->l_namelen, lockres->l_name, lockres->l_level, new_level);
	spin_unlock(&lockres->l_lock);

	/* need lock downconvert request now... */
	status = ocfs2_dlm_lock(conn, new_level, &lockres->l_lksb,
				DLM_LKF_CONVERT|DLM_LKF_VALBLK,
				lockres->l_name,
				lockres->l_namelen);
	if (status) {
		user_log_dlm_error("ocfs2_dlm_lock", status, lockres);
		user_recover_from_dlm_error(lockres);
	}

drop_ref:
	user_dlm_drop_inode_ref(lockres);
}
Exemple #20
0
/*
 * Truncate @inode down to @new_i_size, using the on-disk inode in @di_bh.
 *
 * The page cache beyond @new_i_size is dropped first.  The dinode is then
 * validated, and a request to grow the file is rejected with -EINVAL.  If
 * the size is unchanged nothing more happens.  Otherwise the data lock is
 * taken, the inode is orphaned for truncate, and the truncate is prepared
 * and committed before the data lock is released.
 *
 * Returns 0 on success, -EIO for an invalid dinode, -EINVAL when
 * @new_i_size exceeds the on-disk size, or the negative status of the
 * first helper that failed.
 */
static int ocfs2_truncate_file(struct inode *inode,
			       struct buffer_head *di_bh,
			       u64 new_i_size)
{
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_truncate_context *tc = NULL;
	struct ocfs2_dinode *di = NULL;
	u64 di_size;
	int status = 0;

	mlog_entry("(inode = %llu, new_i_size = %llu\n",
		   (unsigned long long)OCFS2_I(inode)->ip_blkno,
		   (unsigned long long)new_i_size);

	/* Throw away mappings and cached pages past the new end of file. */
	unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
	truncate_inode_pages(inode->i_mapping, new_i_size);

	di = (struct ocfs2_dinode *) di_bh->b_data;
	if (!OCFS2_IS_VALID_DINODE(di)) {
		OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, di);
		status = -EIO;
		goto out;
	}

	/* All size checks below compare against this one on-disk value. */
	di_size = le64_to_cpu(di->i_size);

	mlog_bug_on_msg(di_size != i_size_read(inode),
			"Inode %llu, inode i_size = %lld != di "
			"i_size = %llu, i_flags = 0x%x\n",
			(unsigned long long)OCFS2_I(inode)->ip_blkno,
			i_size_read(inode),
			(unsigned long long)di_size,
			le32_to_cpu(di->i_flags));

	/* This path only shrinks; a larger target size is a caller error. */
	if (new_i_size > di_size) {
		mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n",
		     (unsigned long long)di_size,
		     (unsigned long long)new_i_size);
		status = -EINVAL;
		mlog_errno(status);
		goto out;
	}

	mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n",
	     (unsigned long long)le64_to_cpu(di->i_blkno),
	     (unsigned long long)di_size,
	     (unsigned long long)new_i_size);

	/* Same size: finished before any further cluster locking. */
	if (new_i_size == di_size)
		goto out;

	/*
	 * Taking the data lock makes other nodes sync and drop their
	 * pages.  It is needed even when the allocation does not change,
	 * because ocfs2 clusters can be far larger than a page.
	 */
	status = ocfs2_data_lock(inode, 1);
	if (status < 0) {
		mlog_errno(status);
		goto out;
	}

	/*
	 * A full allocation size change follows.  Orphan the inode so
	 * recovery can finish the truncate if necessary; this step also
	 * takes care of marking i_size.
	 */
	status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
	if (status < 0) {
		mlog_errno(status);
		goto out_unlock_data;
	}

	status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
	if (status < 0) {
		mlog_errno(status);
		goto out_unlock_data;
	}

	status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
	if (status < 0) {
		mlog_errno(status);
		goto out_unlock_data;
	}

	/* TODO: orphan dir cleanup here. */
out_unlock_data:
	ocfs2_data_unlock(inode, 1);

out:
	mlog_exit(status);
	return status;
}