/* hb going down releases any holds we might have had due to this node from * conn_up, conn_err, or hb_up */ void o2quo_hb_down(u8 node) { struct o2quo_state *qs = &o2quo_state; spin_lock(&qs->qs_lock); qs->qs_heartbeating--; mlog_bug_on_msg(qs->qs_heartbeating < 0, "node %u, %d heartbeating\n", node, qs->qs_heartbeating); mlog_bug_on_msg(!test_bit(node, qs->qs_hb_bm), "node %u\n", node); clear_bit(node, qs->qs_hb_bm); mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating); o2quo_clear_hold(qs, node); spin_unlock(&qs->qs_lock); }
static void o2quo_set_hold(struct o2quo_state *qs, u8 node) { assert_spin_locked(&qs->qs_lock); if (!test_and_set_bit(node, qs->qs_hold_bm)) { qs->qs_holds++; mlog_bug_on_msg(qs->qs_holds == O2NM_MAX_NODES, "node %u\n", node); mlog(0, "node %u, %d total\n", node, qs->qs_holds); } }
/* This is analogous to hb_up. as a node's connection comes up we delay the * quorum decision until we see it heartbeating. the hold will be droped in * hb_up or hb_down. it might be perpetuated by con_err until hb_down. if * it's already heartbeating we we might be dropping a hold that conn_up got. * */ void o2quo_conn_up(u8 node) { struct o2quo_state *qs = &o2quo_state; spin_lock(&qs->qs_lock); qs->qs_connected++; mlog_bug_on_msg(qs->qs_connected == O2NM_MAX_NODES, "node %u\n", node); mlog_bug_on_msg(test_bit(node, qs->qs_conn_bm), "node %u\n", node); set_bit(node, qs->qs_conn_bm); mlog(0, "node %u, %d total\n", node, qs->qs_connected); if (!test_bit(node, qs->qs_hb_bm)) o2quo_set_hold(qs, node); else o2quo_clear_hold(qs, node); spin_unlock(&qs->qs_lock); }
static void o2quo_clear_hold(struct o2quo_state *qs, u8 node) { assert_spin_locked(&qs->qs_lock); if (test_and_clear_bit(node, qs->qs_hold_bm)) { mlog(0, "node %u, %d total\n", node, qs->qs_holds - 1); if (--qs->qs_holds == 0) { if (qs->qs_pending) { qs->qs_pending = 0; schedule_work(&qs->qs_work); } } mlog_bug_on_msg(qs->qs_holds < 0, "node %u, holds %d\n", node, qs->qs_holds); } }
static int ocfs2_find_actor(struct inode *inode, void *opaque) { struct ocfs2_find_inode_args *args = NULL; struct ocfs2_inode_info *oi = OCFS2_I(inode); int ret = 0; args = opaque; mlog_bug_on_msg(!inode, "No inode in find actor!\n"); trace_ocfs2_find_actor(inode, inode->i_ino, opaque, args->fi_blkno); if (oi->ip_blkno != args->fi_blkno) goto bail; ret = 1; bail: return ret; }
static int ocfs2_find_actor(struct inode *inode, void *opaque) { struct ocfs2_find_inode_args *args = NULL; struct ocfs2_inode_info *oi = OCFS2_I(inode); int ret = 0; mlog_entry("(0x%p, %lu, 0x%p)\n", inode, inode->i_ino, opaque); args = opaque; mlog_bug_on_msg(!inode, "No inode in find actor!\n"); if (oi->ip_blkno != args->fi_blkno) goto bail; ret = 1; bail: mlog_exit(ret); return ret; }
static void user_ast(struct ocfs2_dlm_lksb *lksb) { struct user_lock_res *lockres = user_lksb_to_lock_res(lksb); int status; mlog(ML_BASTS, "AST fired for lockres %.*s, level %d => %d\n", lockres->l_namelen, lockres->l_name, lockres->l_level, lockres->l_requested); spin_lock(&lockres->l_lock); status = ocfs2_dlm_lock_status(&lockres->l_lksb); if (status) { mlog(ML_ERROR, "lksb status value of %u on lockres %.*s\n", status, lockres->l_namelen, lockres->l_name); spin_unlock(&lockres->l_lock); return; } mlog_bug_on_msg(lockres->l_requested == DLM_LOCK_IV, "Lockres %.*s, requested ivmode. flags 0x%x\n", lockres->l_namelen, lockres->l_name, lockres->l_flags); /* we're downconverting. */ if (lockres->l_requested < lockres->l_level) { if (lockres->l_requested <= user_highest_compat_lock_level(lockres->l_blocking)) { lockres->l_blocking = DLM_LOCK_NL; lockres->l_flags &= ~USER_LOCK_BLOCKED; } } lockres->l_level = lockres->l_requested; lockres->l_requested = DLM_LOCK_IV; lockres->l_flags |= USER_LOCK_ATTACHED; lockres->l_flags &= ~USER_LOCK_BUSY; spin_unlock(&lockres->l_lock); wake_up(&lockres->l_event); }
static void user_ast(void *opaque) { struct user_lock_res *lockres = opaque; struct dlm_lockstatus *lksb; mlog(0, "AST fired for lockres %.*s\n", lockres->l_namelen, lockres->l_name); spin_lock(&lockres->l_lock); lksb = &(lockres->l_lksb); if (lksb->status != DLM_NORMAL) { mlog(ML_ERROR, "lksb status value of %u on lockres %.*s\n", lksb->status, lockres->l_namelen, lockres->l_name); spin_unlock(&lockres->l_lock); return; } mlog_bug_on_msg(lockres->l_requested == LKM_IVMODE, "Lockres %.*s, requested ivmode. flags 0x%x\n", lockres->l_namelen, lockres->l_name, lockres->l_flags); /* we're downconverting. */ if (lockres->l_requested < lockres->l_level) { if (lockres->l_requested <= user_highest_compat_lock_level(lockres->l_blocking)) { lockres->l_blocking = LKM_NLMODE; lockres->l_flags &= ~USER_LOCK_BLOCKED; } } lockres->l_level = lockres->l_requested; lockres->l_requested = LKM_IVMODE; lockres->l_flags |= USER_LOCK_ATTACHED; lockres->l_flags &= ~USER_LOCK_BUSY; spin_unlock(&lockres->l_lock); wake_up(&lockres->l_event); }
/* we've decided that we won't ever be connecting to the node again. if it's * still heartbeating we grab a hold that will delay decisions until either the * node stops heartbeating from hb_down or the caller decides that the node is * still up and calls still_up */ void o2quo_conn_err(u8 node) { struct o2quo_state *qs = &o2quo_state; spin_lock(&qs->qs_lock); if (test_bit(node, qs->qs_conn_bm)) { qs->qs_connected--; mlog_bug_on_msg(qs->qs_connected < 0, "node %u, connected %d\n", node, qs->qs_connected); clear_bit(node, qs->qs_conn_bm); } mlog(0, "node %u, %d total\n", node, qs->qs_connected); if (test_bit(node, qs->qs_hb_bm)) o2quo_set_hold(qs, node); spin_unlock(&qs->qs_lock); }
static void ocfs2_clear_inode(struct inode *inode) { int status; struct ocfs2_inode_info *oi = OCFS2_I(inode); clear_inode(inode); trace_ocfs2_clear_inode((unsigned long long)oi->ip_blkno, inode->i_nlink); mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, "Inode=%lu\n", inode->i_ino); dquot_drop(inode); /* To preven remote deletes we hold open lock before, now it * is time to unlock PR and EX open locks. */ ocfs2_open_unlock(inode); /* Do these before all the other work so that we don't bounce * the downconvert thread while waiting to destroy the locks. */ ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres); ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres); ocfs2_mark_lockres_freeing(&oi->ip_open_lockres); ocfs2_resv_discard(&OCFS2_SB(inode->i_sb)->osb_la_resmap, &oi->ip_la_data_resv); ocfs2_resv_init_once(&oi->ip_la_data_resv); /* We very well may get a clear_inode before all an inodes * metadata has hit disk. Of course, we can't drop any cluster * locks until the journal has finished with it. The only * exception here are successfully wiped inodes - their * metadata can now be considered to be part of the system * inodes from which it came. */ if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED)) ocfs2_checkpoint_inode(inode); mlog_bug_on_msg(!list_empty(&oi->ip_io_markers), "Clear inode of %llu, inode has io markers\n", (unsigned long long)oi->ip_blkno); ocfs2_extent_map_trunc(inode, 0); status = ocfs2_drop_inode_locks(inode); if (status < 0) mlog_errno(status); ocfs2_lock_res_free(&oi->ip_rw_lockres); ocfs2_lock_res_free(&oi->ip_inode_lockres); ocfs2_lock_res_free(&oi->ip_open_lockres); ocfs2_metadata_cache_exit(INODE_CACHE(inode)); mlog_bug_on_msg(INODE_CACHE(inode)->ci_num_cached, "Clear inode of %llu, inode has %u cache items\n", (unsigned long long)oi->ip_blkno, INODE_CACHE(inode)->ci_num_cached); mlog_bug_on_msg(!(INODE_CACHE(inode)->ci_flags & OCFS2_CACHE_FL_INLINE), "Clear inode of %llu, inode has a bad flag\n", (unsigned long long)oi->ip_blkno); mlog_bug_on_msg(spin_is_locked(&oi->ip_lock), "Clear inode of %llu, inode is locked\n", (unsigned long long)oi->ip_blkno); mlog_bug_on_msg(!mutex_trylock(&oi->ip_io_mutex), "Clear inode of %llu, io_mutex is locked\n", (unsigned long long)oi->ip_blkno); mutex_unlock(&oi->ip_io_mutex); /* * down_trylock() returns 0, down_write_trylock() returns 1 * kernel 1, world 0 */ mlog_bug_on_msg(!down_write_trylock(&oi->ip_alloc_sem), "Clear inode of %llu, alloc_sem is locked\n", (unsigned long long)oi->ip_blkno); up_write(&oi->ip_alloc_sem); mlog_bug_on_msg(oi->ip_open_count, "Clear inode of %llu has open count %d\n", (unsigned long long)oi->ip_blkno, oi->ip_open_count); /* Clear all other flags. */ oi->ip_flags = 0; oi->ip_dir_start_lookup = 0; oi->ip_blkno = 0ULL; /* * ip_jinode is used to track txns against this inode. We ensure that * the journal is flushed before journal shutdown. Thus it is safe to * have inodes get cleaned up after journal shutdown. */ jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal, &oi->ip_jinode); }
int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, void **ret_data) { int ret; unsigned int locklen; struct dlm_ctxt *dlm = data; struct dlm_lock_resource *res = NULL; struct dlm_lock *lock = NULL; struct dlm_proxy_ast *past = (struct dlm_proxy_ast *) msg->buf; char *name; struct list_head *head = NULL; __be64 cookie; u32 flags; u8 node; if (!dlm_grab(dlm)) { dlm_error(DLM_REJECTED); return DLM_REJECTED; } mlog_bug_on_msg(!dlm_domain_fully_joined(dlm), "Domain %s not fully joined!\n", dlm->name); name = past->name; locklen = past->namelen; cookie = past->cookie; flags = be32_to_cpu(past->flags); node = past->node_idx; if (locklen > DLM_LOCKID_NAME_MAX) { ret = DLM_IVBUFLEN; mlog(ML_ERROR, "Invalid name length (%d) in proxy ast " "handler!\n", locklen); goto leave; } if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) == (LKM_PUT_LVB|LKM_GET_LVB)) { mlog(ML_ERROR, "Both PUT and GET lvb specified, (0x%x)\n", flags); ret = DLM_BADARGS; goto leave; } mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" : (flags & LKM_GET_LVB ? "get lvb" : "none")); mlog(0, "type=%d, blocked_type=%d\n", past->type, past->blocked_type); if (past->type != DLM_AST && past->type != DLM_BAST) { mlog(ML_ERROR, "Unknown ast type! %d, cookie=%u:%llu" "name=%.*s, node=%u\n", past->type, dlm_get_lock_cookie_node(be64_to_cpu(cookie)), dlm_get_lock_cookie_seq(be64_to_cpu(cookie)), locklen, name, node); ret = DLM_IVLOCKID; goto leave; } res = dlm_lookup_lockres(dlm, name, locklen); if (!res) { mlog(0, "Got %sast for unknown lockres! cookie=%u:%llu, " "name=%.*s, node=%u\n", (past->type == DLM_AST ? "" : "b"), dlm_get_lock_cookie_node(be64_to_cpu(cookie)), dlm_get_lock_cookie_seq(be64_to_cpu(cookie)), locklen, name, node); ret = DLM_IVLOCKID; goto leave; } /* cannot get a proxy ast message if this node owns it */ BUG_ON(res->owner == dlm->node_num); mlog(0, "%s: res %.*s\n", dlm->name, res->lockname.len, res->lockname.name); spin_lock(&res->spinlock); if (res->state & DLM_LOCK_RES_RECOVERING) { mlog(0, "Responding with DLM_RECOVERING!\n"); ret = DLM_RECOVERING; goto unlock_out; } if (res->state & DLM_LOCK_RES_MIGRATING) { mlog(0, "Responding with DLM_MIGRATING!\n"); ret = DLM_MIGRATING; goto unlock_out; } /* try convert queue for both ast/bast */ head = &res->converting; lock = NULL; list_for_each_entry(lock, head, list) { if (lock->ml.cookie == cookie) goto do_ast; } /* if not on convert, try blocked for ast, granted for bast */ if (past->type == DLM_AST) head = &res->blocked; else head = &res->granted; list_for_each_entry(lock, head, list) { if (lock->ml.cookie == cookie) goto do_ast; } mlog(0, "Got %sast for unknown lock! cookie=%u:%llu, name=%.*s, " "node=%u\n", past->type == DLM_AST ? "" : "b", dlm_get_lock_cookie_node(be64_to_cpu(cookie)), dlm_get_lock_cookie_seq(be64_to_cpu(cookie)), locklen, name, node); ret = DLM_NORMAL; unlock_out: spin_unlock(&res->spinlock); goto leave; do_ast: ret = DLM_NORMAL; if (past->type == DLM_AST) { /* do not alter lock refcount. switching lists. */ list_move_tail(&lock->list, &res->granted); mlog(0, "%s: res %.*s, lock %u:%llu, Granted type %d => %d\n", dlm->name, res->lockname.len, res->lockname.name, dlm_get_lock_cookie_node(be64_to_cpu(cookie)), dlm_get_lock_cookie_seq(be64_to_cpu(cookie)), lock->ml.type, lock->ml.convert_type); if (lock->ml.convert_type != LKM_IVMODE) { lock->ml.type = lock->ml.convert_type; lock->ml.convert_type = LKM_IVMODE; } else { // should already be there.... } lock->lksb->status = DLM_NORMAL; /* if we requested the lvb, fetch it into our lksb now */ if (flags & LKM_GET_LVB) { BUG_ON(!(lock->lksb->flags & DLM_LKSB_GET_LVB)); memcpy(lock->lksb->lvb, past->lvb, DLM_LVB_LEN); } } spin_unlock(&res->spinlock); if (past->type == DLM_AST) dlm_do_local_ast(dlm, res, lock); else dlm_do_local_bast(dlm, res, lock, past->blocked_type); leave: if (res) dlm_lockres_put(res); dlm_put(dlm); return ret; }
/* * locking: * caller needs: none * taken: takes and drops res->spinlock * held on exit: none * returns: DLM_NORMAL, DLM_BADARGS, DLM_IVLOCKID, * return value from dlmunlock_master */ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data, void **ret_data) { struct dlm_ctxt *dlm = data; struct dlm_unlock_lock *unlock = (struct dlm_unlock_lock *)msg->buf; struct dlm_lock_resource *res = NULL; struct list_head *iter; struct dlm_lock *lock = NULL; enum dlm_status status = DLM_NORMAL; int found = 0, i; struct dlm_lockstatus *lksb = NULL; int ignore; u32 flags; struct list_head *queue; flags = be32_to_cpu(unlock->flags); if (flags & LKM_GET_LVB) { mlog(ML_ERROR, "bad args! GET_LVB specified on unlock!\n"); return DLM_BADARGS; } if ((flags & (LKM_PUT_LVB|LKM_CANCEL)) == (LKM_PUT_LVB|LKM_CANCEL)) { mlog(ML_ERROR, "bad args! cannot modify lvb on a CANCEL " "request!\n"); return DLM_BADARGS; } if (unlock->namelen > DLM_LOCKID_NAME_MAX) { mlog(ML_ERROR, "Invalid name length in unlock handler!\n"); return DLM_IVBUFLEN; } if (!dlm_grab(dlm)) return DLM_REJECTED; mlog_bug_on_msg(!dlm_domain_fully_joined(dlm), "Domain %s not fully joined!\n", dlm->name); mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" : "none"); res = dlm_lookup_lockres(dlm, unlock->name, unlock->namelen); if (!res) { /* We assume here that a no lock resource simply means * it was migrated away and destroyed before the other * node could detect it. */ mlog(0, "returning DLM_FORWARD -- res no longer exists\n"); status = DLM_FORWARD; goto not_found; } queue=&res->granted; found = 0; spin_lock(&res->spinlock); if (res->state & DLM_LOCK_RES_RECOVERING) { spin_unlock(&res->spinlock); mlog(0, "returning DLM_RECOVERING\n"); status = DLM_RECOVERING; goto leave; } if (res->state & DLM_LOCK_RES_MIGRATING) { spin_unlock(&res->spinlock); mlog(0, "returning DLM_MIGRATING\n"); status = DLM_MIGRATING; goto leave; } if (res->owner != dlm->node_num) { spin_unlock(&res->spinlock); mlog(0, "returning DLM_FORWARD -- not master\n"); status = DLM_FORWARD; goto leave; } for (i=0; i<3; i++) { list_for_each(iter, queue) { lock = list_entry(iter, struct dlm_lock, list); if (lock->ml.cookie == unlock->cookie && lock->ml.node == unlock->node_idx) { dlm_lock_get(lock); found = 1; break; } } if (found) break; /* scan granted -> converting -> blocked queues */ queue++; }
/* * Append this record to the tail of the extent map. It must be * tree_depth 0. The record might be an extension of an existing * record, and as such that needs to be handled. eg: * * Existing record in the extent map: * * cpos = 10, len = 10 * |---------| * * New Record: * * cpos = 10, len = 20 * |------------------| * * The passed record is the new on-disk record. The new_clusters value * is how many clusters were added to the file. If the append is a * contiguous append, the new_clusters has been added to * rec->e_clusters. If the append is an entirely new extent, then * rec->e_clusters is == new_clusters. */ int ocfs2_extent_map_append(struct inode *inode, struct ocfs2_extent_rec *rec, u32 new_clusters) { int ret; struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; struct ocfs2_extent_map_entry *ent; struct ocfs2_extent_rec *old; BUG_ON(!new_clusters); BUG_ON(le32_to_cpu(rec->e_clusters) < new_clusters); if (em->em_clusters < OCFS2_I(inode)->ip_clusters) { /* * Size changed underneath us on disk. Drop any * straddling records and update our idea of * i_clusters */ ocfs2_extent_map_drop(inode, em->em_clusters - 1); em->em_clusters = OCFS2_I(inode)->ip_clusters; } mlog_bug_on_msg((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) != (em->em_clusters + new_clusters), "Inode %llu:\n" "rec->e_cpos = %u + rec->e_clusters = %u = %u\n" "em->em_clusters = %u + new_clusters = %u = %u\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters), le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters), em->em_clusters, new_clusters, em->em_clusters + new_clusters); em->em_clusters += new_clusters; ret = -ENOENT; if (le32_to_cpu(rec->e_clusters) > new_clusters) { /* This is a contiguous append */ ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), 1, NULL, NULL); if (ent) { old = &ent->e_rec; BUG_ON((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) != (le32_to_cpu(old->e_cpos) + le32_to_cpu(old->e_clusters) + new_clusters)); if (ent->e_tree_depth == 0) { BUG_ON(le32_to_cpu(old->e_cpos) != le32_to_cpu(rec->e_cpos)); BUG_ON(le64_to_cpu(old->e_blkno) != le64_to_cpu(rec->e_blkno)); ret = 0; } /* * Let non-leafs fall through as -ENOENT to * force insertion of the new leaf. */ le32_add_cpu(&old->e_clusters, new_clusters); } } if (ret == -ENOENT) ret = ocfs2_extent_map_insert(inode, rec, 0); if (ret < 0) mlog_errno(ret); return ret; }
static void user_dlm_unblock_lock(struct work_struct *work) { int new_level, status; struct user_lock_res *lockres = container_of(work, struct user_lock_res, l_work); struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres); mlog(0, "processing lockres %.*s\n", lockres->l_namelen, lockres->l_name); spin_lock(&lockres->l_lock); mlog_bug_on_msg(!(lockres->l_flags & USER_LOCK_QUEUED), "Lockres %.*s, flags 0x%x\n", lockres->l_namelen, lockres->l_name, lockres->l_flags); /* notice that we don't clear USER_LOCK_BLOCKED here. If it's * set, we want user_ast clear it. */ lockres->l_flags &= ~USER_LOCK_QUEUED; /* It's valid to get here and no longer be blocked - if we get * several basts in a row, we might be queued by the first * one, the unblock thread might run and clear the queued * flag, and finally we might get another bast which re-queues * us before our ast for the downconvert is called. */ if (!(lockres->l_flags & USER_LOCK_BLOCKED)) { spin_unlock(&lockres->l_lock); goto drop_ref; } if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { spin_unlock(&lockres->l_lock); goto drop_ref; } if (lockres->l_flags & USER_LOCK_BUSY) { if (lockres->l_flags & USER_LOCK_IN_CANCEL) { spin_unlock(&lockres->l_lock); goto drop_ref; } lockres->l_flags |= USER_LOCK_IN_CANCEL; spin_unlock(&lockres->l_lock); status = dlmunlock(dlm, &lockres->l_lksb, LKM_CANCEL, user_unlock_ast, lockres); if (status != DLM_NORMAL) user_log_dlm_error("dlmunlock", status, lockres); goto drop_ref; } /* If there are still incompat holders, we can exit safely * without worrying about re-queueing this lock as that will * happen on the last call to user_cluster_unlock. */ if ((lockres->l_blocking == LKM_EXMODE) && (lockres->l_ex_holders || lockres->l_ro_holders)) { spin_unlock(&lockres->l_lock); mlog(0, "can't downconvert for ex: ro = %u, ex = %u\n", lockres->l_ro_holders, lockres->l_ex_holders); goto drop_ref; } if ((lockres->l_blocking == LKM_PRMODE) && lockres->l_ex_holders) { spin_unlock(&lockres->l_lock); mlog(0, "can't downconvert for pr: ex = %u\n", lockres->l_ex_holders); goto drop_ref; } /* yay, we can downconvert now. */ new_level = user_highest_compat_lock_level(lockres->l_blocking); lockres->l_requested = new_level; lockres->l_flags |= USER_LOCK_BUSY; mlog(0, "Downconvert lock from %d to %d\n", lockres->l_level, new_level); spin_unlock(&lockres->l_lock); /* need lock downconvert request now... */ status = dlmlock(dlm, new_level, &lockres->l_lksb, LKM_CONVERT|LKM_VALBLK, lockres->l_name, lockres->l_namelen, user_ast, lockres, user_bast); if (status != DLM_NORMAL) { user_log_dlm_error("dlmlock", status, lockres); user_recover_from_dlm_error(lockres); } drop_ref: user_dlm_drop_inode_ref(lockres); }
static int ocfs2_read_locked_inode(struct inode *inode, struct ocfs2_find_inode_args *args) { struct super_block *sb; struct ocfs2_super *osb; struct ocfs2_dinode *fe; struct buffer_head *bh = NULL; int status, can_lock; u32 generation = 0; status = -EINVAL; if (inode == NULL || inode->i_sb == NULL) { mlog(ML_ERROR, "bad inode\n"); return status; } sb = inode->i_sb; osb = OCFS2_SB(sb); if (!args) { mlog(ML_ERROR, "bad inode args\n"); make_bad_inode(inode); return status; } /* * To improve performance of cold-cache inode stats, we take * the cluster lock here if possible. * * Generally, OCFS2 never trusts the contents of an inode * unless it's holding a cluster lock, so taking it here isn't * a correctness issue as much as it is a performance * improvement. * * There are three times when taking the lock is not a good idea: * * 1) During startup, before we have initialized the DLM. * * 2) If we are reading certain system files which never get * cluster locks (local alloc, truncate log). * * 3) If the process doing the iget() is responsible for * orphan dir recovery. We're holding the orphan dir lock and * can get into a deadlock with another process on another * node in ->delete_inode(). * * #1 and #2 can be simply solved by never taking the lock * here for system files (which are the only type we read * during mount). It's a heavier approach, but our main * concern is user-accessible files anyway. * * #3 works itself out because we'll eventually take the * cluster lock before trusting anything anyway. */ can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE) && !(args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) && !ocfs2_mount_local(osb); trace_ocfs2_read_locked_inode( (unsigned long long)OCFS2_I(inode)->ip_blkno, can_lock); /* * To maintain backwards compatibility with older versions of * ocfs2-tools, we still store the generation value for system * files. The only ones that actually matter to userspace are * the journals, but it's easier and inexpensive to just flag * all system files similarly. */ if (args->fi_flags & OCFS2_FI_FLAG_SYSFILE) generation = osb->fs_generation; ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_inode_lockres, OCFS2_LOCK_TYPE_META, generation, inode); ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres, OCFS2_LOCK_TYPE_OPEN, 0, inode); if (can_lock) { status = ocfs2_open_lock(inode); if (status) { make_bad_inode(inode); mlog_errno(status); return status; } status = ocfs2_inode_lock(inode, NULL, 0); if (status) { make_bad_inode(inode); mlog_errno(status); return status; } } if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) { status = ocfs2_try_open_lock(inode, 0); if (status) { make_bad_inode(inode); return status; } } if (can_lock) { status = ocfs2_read_inode_block_full(inode, &bh, OCFS2_BH_IGNORE_CACHE); } else { status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh); /* * If buffer is in jbd, then its checksum may not have been * computed as yet. */ if (!status && !buffer_jbd(bh)) status = ocfs2_validate_inode_block(osb->sb, bh); } if (status < 0) { mlog_errno(status); goto bail; } status = -EINVAL; fe = (struct ocfs2_dinode *) bh->b_data; /* * This is a code bug. Right now the caller needs to * understand whether it is asking for a system file inode or * not so the proper lock names can be built. */ mlog_bug_on_msg(!!(fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) != !!(args->fi_flags & OCFS2_FI_FLAG_SYSFILE), "Inode %llu: system file state is ambigous\n", (unsigned long long)args->fi_blkno); if (S_ISCHR(le16_to_cpu(fe->i_mode)) || S_ISBLK(le16_to_cpu(fe->i_mode))) inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); ocfs2_populate_inode(inode, fe, 0); BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno)); status = 0; bail: if (can_lock) ocfs2_inode_unlock(inode, 0); if (status < 0) make_bad_inode(inode); if (args && bh) brelse(bh); return status; }
void ocfs2_clear_inode(struct inode *inode) { int status; struct ocfs2_inode_info *oi = OCFS2_I(inode); mlog_entry_void(); if (!inode) goto bail; mlog(0, "Clearing inode: %llu, nlink = %u\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_nlink); mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, "Inode=%lu\n", inode->i_ino); /* For remove delete_inode vote, we hold open lock before, * now it is time to unlock PR and EX open locks. */ ocfs2_open_unlock(inode); /* Do these before all the other work so that we don't bounce * the vote thread while waiting to destroy the locks. */ ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres); ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres); ocfs2_mark_lockres_freeing(&oi->ip_data_lockres); ocfs2_mark_lockres_freeing(&oi->ip_open_lockres); /* We very well may get a clear_inode before all an inodes * metadata has hit disk. Of course, we can't drop any cluster * locks until the journal has finished with it. The only * exception here are successfully wiped inodes - their * metadata can now be considered to be part of the system * inodes from which it came. */ if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED)) ocfs2_checkpoint_inode(inode); mlog_bug_on_msg(!list_empty(&oi->ip_io_markers), "Clear inode of %llu, inode has io markers\n", (unsigned long long)oi->ip_blkno); ocfs2_extent_map_trunc(inode, 0); status = ocfs2_drop_inode_locks(inode); if (status < 0) mlog_errno(status); ocfs2_lock_res_free(&oi->ip_rw_lockres); ocfs2_lock_res_free(&oi->ip_meta_lockres); ocfs2_lock_res_free(&oi->ip_data_lockres); ocfs2_lock_res_free(&oi->ip_open_lockres); ocfs2_metadata_cache_purge(inode); mlog_bug_on_msg(oi->ip_metadata_cache.ci_num_cached, "Clear inode of %llu, inode has %u cache items\n", (unsigned long long)oi->ip_blkno, oi->ip_metadata_cache.ci_num_cached); mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE), "Clear inode of %llu, inode has a bad flag\n", (unsigned long long)oi->ip_blkno); mlog_bug_on_msg(spin_is_locked(&oi->ip_lock), "Clear inode of %llu, inode is locked\n", (unsigned long long)oi->ip_blkno); mlog_bug_on_msg(!mutex_trylock(&oi->ip_io_mutex), "Clear inode of %llu, io_mutex is locked\n", (unsigned long long)oi->ip_blkno); mutex_unlock(&oi->ip_io_mutex); /* * down_trylock() returns 0, down_write_trylock() returns 1 * kernel 1, world 0 */ mlog_bug_on_msg(!down_write_trylock(&oi->ip_alloc_sem), "Clear inode of %llu, alloc_sem is locked\n", (unsigned long long)oi->ip_blkno); up_write(&oi->ip_alloc_sem); mlog_bug_on_msg(oi->ip_open_count, "Clear inode of %llu has open count %d\n", (unsigned long long)oi->ip_blkno, oi->ip_open_count); /* Clear all other flags. */ oi->ip_flags = OCFS2_INODE_CACHE_INLINE; oi->ip_created_trans = 0; oi->ip_last_trans = 0; oi->ip_dir_start_lookup = 0; oi->ip_blkno = 0ULL; bail: mlog_exit_void(); }
int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, void **ret_data) { int ret; unsigned int locklen; struct dlm_ctxt *dlm = data; struct dlm_lock_resource *res = NULL; struct dlm_lock *lock = NULL; struct dlm_proxy_ast *past = (struct dlm_proxy_ast *) msg->buf; char *name; struct list_head *iter, *head=NULL; u64 cookie; u32 flags; u8 node; if (!dlm_grab(dlm)) { dlm_error(DLM_REJECTED); return DLM_REJECTED; } mlog_bug_on_msg(!dlm_domain_fully_joined(dlm), "Domain %s not fully joined!\n", dlm->name); name = past->name; locklen = past->namelen; cookie = past->cookie; flags = be32_to_cpu(past->flags); node = past->node_idx; if (locklen > DLM_LOCKID_NAME_MAX) { ret = DLM_IVBUFLEN; mlog(ML_ERROR, "Invalid name length (%d) in proxy ast " "handler!\n", locklen); goto leave; } if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) == (LKM_PUT_LVB|LKM_GET_LVB)) { mlog(ML_ERROR, "Both PUT and GET lvb specified, (0x%x)\n", flags); ret = DLM_BADARGS; goto leave; } mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" : (flags & LKM_GET_LVB ? "get lvb" : "none")); mlog(0, "type=%d, blocked_type=%d\n", past->type, past->blocked_type); if (past->type != DLM_AST && past->type != DLM_BAST) { mlog(ML_ERROR, "Unknown ast type! %d, cookie=%u:%llu" "name=%.*s, node=%u\n", past->type, dlm_get_lock_cookie_node(be64_to_cpu(cookie)), dlm_get_lock_cookie_seq(be64_to_cpu(cookie)), locklen, name, node); ret = DLM_IVLOCKID; goto leave; } res = dlm_lookup_lockres(dlm, name, locklen); if (!res) { mlog(0, "Got %sast for unknown lockres! cookie=%u:%llu, " "name=%.*s, node=%u\n", (past->type == DLM_AST ? "" : "b"), dlm_get_lock_cookie_node(be64_to_cpu(cookie)), dlm_get_lock_cookie_seq(be64_to_cpu(cookie)), locklen, name, node); ret = DLM_IVLOCKID; goto leave; } /* cannot get a proxy ast message if this node owns it */ BUG_ON(res->owner == dlm->node_num); mlog(0, "lockres %.*s\n", res->lockname.len, res->lockname.name); spin_lock(&res->spinlock); if (res->state & DLM_LOCK_RES_RECOVERING) { mlog(0, "Responding with DLM_RECOVERING!\n"); ret = DLM_RECOVERING; goto unlock_out; } if (res->state & DLM_LOCK_RES_MIGRATING) { mlog(0, "Responding with DLM_MIGRATING!\n"); ret = DLM_MIGRATING; goto unlock_out; } /* try convert queue for both ast/bast */ head = &res->converting; lock = NULL; list_for_each(iter, head) { lock = list_entry (iter, struct dlm_lock, list); if (lock->ml.cookie == cookie) goto do_ast; }
static int ocfs2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { int err = 0; unsigned int ext_flags; u64 p_blkno, past_eof; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, (unsigned long long)iblock, bh_result, create); if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n", inode, inode->i_ino); if (S_ISLNK(inode->i_mode)) { /* this always does I/O for some reason. */ err = ocfs2_symlink_get_block(inode, iblock, bh_result, create); goto bail; } err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, NULL, &ext_flags); if (err) { mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, " "%llu, NULL)\n", err, inode, (unsigned long long)iblock, (unsigned long long)p_blkno); goto bail; } /* * ocfs2 never allocates in this function - the only time we * need to use BH_New is when we're extending i_size on a file * system which doesn't support holes, in which case BH_New * allows block_prepare_write() to zero. */ mlog_bug_on_msg(create && p_blkno == 0 && ocfs2_sparse_alloc(osb), "ino %lu, iblock %llu\n", inode->i_ino, (unsigned long long)iblock); /* Treat the unwritten extent as a hole for zeroing purposes. */ if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) map_bh(bh_result, inode->i_sb, p_blkno); if (!ocfs2_sparse_alloc(osb)) { if (p_blkno == 0) { err = -EIO; mlog(ML_ERROR, "iblock = %llu p_blkno = %llu blkno=(%llu)\n", (unsigned long long)iblock, (unsigned long long)p_blkno, (unsigned long long)OCFS2_I(inode)->ip_blkno); mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters); dump_stack(); } past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino, (unsigned long long)past_eof); if (create && (iblock >= past_eof)) set_buffer_new(bh_result); } bail: if (err < 0) err = -EIO; mlog_exit(err); return err; }
static void user_dlm_unblock_lock(struct work_struct *work) { int new_level, status; struct user_lock_res *lockres = container_of(work, struct user_lock_res, l_work); struct ocfs2_cluster_connection *conn = cluster_connection_from_user_lockres(lockres); mlog(0, "lockres %.*s\n", lockres->l_namelen, lockres->l_name); spin_lock(&lockres->l_lock); mlog_bug_on_msg(!(lockres->l_flags & USER_LOCK_QUEUED), "Lockres %.*s, flags 0x%x\n", lockres->l_namelen, lockres->l_name, lockres->l_flags); /* notice that we don't clear USER_LOCK_BLOCKED here. If it's * set, we want user_ast clear it. */ lockres->l_flags &= ~USER_LOCK_QUEUED; /* It's valid to get here and no longer be blocked - if we get * several basts in a row, we might be queued by the first * one, the unblock thread might run and clear the queued * flag, and finally we might get another bast which re-queues * us before our ast for the downconvert is called. */ if (!(lockres->l_flags & USER_LOCK_BLOCKED)) { mlog(ML_BASTS, "lockres %.*s USER_LOCK_BLOCKED\n", lockres->l_namelen, lockres->l_name); spin_unlock(&lockres->l_lock); goto drop_ref; } if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_TEARDOWN\n", lockres->l_namelen, lockres->l_name); spin_unlock(&lockres->l_lock); goto drop_ref; } if (lockres->l_flags & USER_LOCK_BUSY) { if (lockres->l_flags & USER_LOCK_IN_CANCEL) { mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_CANCEL\n", lockres->l_namelen, lockres->l_name); spin_unlock(&lockres->l_lock); goto drop_ref; } lockres->l_flags |= USER_LOCK_IN_CANCEL; spin_unlock(&lockres->l_lock); status = ocfs2_dlm_unlock(conn, &lockres->l_lksb, DLM_LKF_CANCEL); if (status) user_log_dlm_error("ocfs2_dlm_unlock", status, lockres); goto drop_ref; } /* If there are still incompat holders, we can exit safely * without worrying about re-queueing this lock as that will * happen on the last call to user_cluster_unlock. */ if ((lockres->l_blocking == DLM_LOCK_EX) && (lockres->l_ex_holders || lockres->l_ro_holders)) { spin_unlock(&lockres->l_lock); mlog(ML_BASTS, "lockres %.*s, EX/PR Holders %u,%u\n", lockres->l_namelen, lockres->l_name, lockres->l_ex_holders, lockres->l_ro_holders); goto drop_ref; } if ((lockres->l_blocking == DLM_LOCK_PR) && lockres->l_ex_holders) { spin_unlock(&lockres->l_lock); mlog(ML_BASTS, "lockres %.*s, EX Holders %u\n", lockres->l_namelen, lockres->l_name, lockres->l_ex_holders); goto drop_ref; } /* yay, we can downconvert now. */ new_level = user_highest_compat_lock_level(lockres->l_blocking); lockres->l_requested = new_level; lockres->l_flags |= USER_LOCK_BUSY; mlog(ML_BASTS, "lockres %.*s, downconvert %d => %d\n", lockres->l_namelen, lockres->l_name, lockres->l_level, new_level); spin_unlock(&lockres->l_lock); /* need lock downconvert request now... */ status = ocfs2_dlm_lock(conn, new_level, &lockres->l_lksb, DLM_LKF_CONVERT|DLM_LKF_VALBLK, lockres->l_name, lockres->l_namelen); if (status) { user_log_dlm_error("ocfs2_dlm_lock", status, lockres); user_recover_from_dlm_error(lockres); } drop_ref: user_dlm_drop_inode_ref(lockres); }
static int ocfs2_truncate_file(struct inode *inode, struct buffer_head *di_bh, u64 new_i_size) { int status = 0; struct ocfs2_dinode *fe = NULL; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_truncate_context *tc = NULL; mlog_entry("(inode = %llu, new_i_size = %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, (unsigned long long)new_i_size); unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); truncate_inode_pages(inode->i_mapping, new_i_size); fe = (struct ocfs2_dinode *) di_bh->b_data; if (!OCFS2_IS_VALID_DINODE(fe)) { OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); status = -EIO; goto bail; } mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), "Inode %llu, inode i_size = %lld != di " "i_size = %llu, i_flags = 0x%x\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode), (unsigned long long)le64_to_cpu(fe->i_size), le32_to_cpu(fe->i_flags)); if (new_i_size > le64_to_cpu(fe->i_size)) { mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n", (unsigned long long)le64_to_cpu(fe->i_size), (unsigned long long)new_i_size); status = -EINVAL; mlog_errno(status); goto bail; } mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n", (unsigned long long)le64_to_cpu(fe->i_blkno), (unsigned long long)le64_to_cpu(fe->i_size), (unsigned long long)new_i_size); /* lets handle the simple truncate cases before doing any more * cluster locking. */ if (new_i_size == le64_to_cpu(fe->i_size)) goto bail; /* This forces other nodes to sync and drop their pages. Do * this even if we have a truncate without allocation change - * ocfs2 cluster sizes can be much greater than page size, so * we have to truncate them anyway. */ status = ocfs2_data_lock(inode, 1); if (status < 0) { mlog_errno(status); goto bail; } /* alright, we're going to need to do a full blown alloc size * change. Orphan the inode so that recovery can complete the * truncate if necessary. This does the task of marking * i_size. */ status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); if (status < 0) { mlog_errno(status); goto bail_unlock_data; } status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); if (status < 0) { mlog_errno(status); goto bail_unlock_data; } status = ocfs2_commit_truncate(osb, inode, di_bh, tc); if (status < 0) { mlog_errno(status); goto bail_unlock_data; } /* TODO: orphan dir cleanup here. */ bail_unlock_data: ocfs2_data_unlock(inode, 1); bail: mlog_exit(status); return status; }