rc_t btree_impl::_sx_split_if_needed (btree_page_h &page, const w_keystr_t &new_key) {
    // A split is needed either because the new key no longer fits on the page,
    // or because insertions are extremely right-skewed and an opportunistic
    // no-record split looks profitable. Guard clause: if neither holds, done.
    // (Predicate evaluation order matches the original short-circuit exactly.)
    if (page.check_space_for_insert_node(new_key)
        && !(page.is_insertion_extremely_skewed_right()
             && page.check_chance_for_norecord_split(new_key))) {
        return RCOK; // plenty of room and no skew-driven split wanted
    }

    // We are running a user transaction, so simply delegate to the SSX split.
    PageID foster_child_id;
    W_DO(_sx_split_foster(page, foster_child_id, new_key));

    // After the split, new_key may now belong to the newly created page rather
    // than the shrunk original. If the current page's fences no longer cover
    // the key, latch the new page first, then release the old one, so the
    // caller always comes back holding the page that contains new_key.
    if (page.fence_contains(new_key)) {
        return RCOK;
    }
    btree_page_h foster_child;
    W_DO(foster_child.fix_nonroot(page, foster_child_id, LATCH_EX));
    w_assert1(foster_child.fence_contains(new_key));
    page.unfix();
    page = foster_child;
    return RCOK;
}
// Advance the cursor by exactly one slot in the scan direction, hopping across
// page boundaries (possibly several in a row, to skip empty pages) until we
// either find the next qualifying record or exhaust the scan range.
//
// @param[in,out] p    currently fixed leaf page; on return, the page holding
//                     the new current slot (may differ from the input page)
// @param[out]    eof  set true when the scan is exhausted
// Precondition: p is latched (is_fixed) and _slot is within [?, nrecs()].
// On success with eof==false, _key holds the key at the new cursor position.
rc_t bt_cursor_t::_advance_one_slot(btree_page_h &p, bool &eof)
{
    w_assert1(p.is_fixed());
    w_assert1(_slot <= p.nrecs());

    // step one slot in the direction of the scan
    if(_forward) {
        ++_slot;
    } else {
        --_slot;
    }
    eof = false;

    // keep following the next page.
    // because we might see empty pages to skip consecutively!
    while (true) {
        // ran off this page's slot range? then move to the sibling page
        bool time2move = _forward ? (_slot >= p.nrecs()) : _slot < 0;

        if (time2move) {
            // Move to right(left) sibling.
            // A supremum high-fence (resp. infimum low-fence) means there is
            // no sibling in that direction: the B-tree itself is exhausted.
            bool reached_end = _forward ? p.is_fence_high_supremum() : p.is_fence_low_infimum();
            if (reached_end) {
                eof = true;
                return RCOK;
            }
            // now, use fence keys to tell where the neighboring page exists
            w_keystr_t neighboring_fence;
            btree_impl::traverse_mode_t traverse_mode;
            bool only_low_fence_exact_match = false;
            if (_forward) {
                // forward: the sibling's low fence equals our high fence
                p.copy_fence_high_key(neighboring_fence);
                traverse_mode = btree_impl::t_fence_low_match;
                int d = _upper.compare(neighboring_fence);
                if (d < 0 || (d == 0 && !_upper_inclusive)) {
                    // the sibling starts at/after the scan's upper bound: done
                    eof = true;
                    return RCOK;
                }
                if (d == 0 && _upper_inclusive) {
                    // we will check the next page, but the only
                    // possible matching is an entry with
                    // the low-fence..
                    only_low_fence_exact_match = true;
                }
            } else {
                // if we are going backwards, the current page had
                // low = [current-fence-low], high = [current-fence-high]
                // and the previous page should have
                // low = [?], high = [current-fence-low].
                p.copy_fence_low_key(neighboring_fence);
                // let's find a page which has this value as high-fence
                traverse_mode = btree_impl::t_fence_high_match;
                int d = _lower.compare(neighboring_fence);
                if (d >= 0) {
                    // everything on the previous page is below the lower bound
                    eof = true;
                    return RCOK;
                }
            }
            // release the latch BEFORE taking the fence-key lock below;
            // this avoids lock-waits while holding a latch
            p.unfix();

            // take lock for the fence key
            if (_needs_lock) {
                lockid_t lid (_store, (const unsigned char*) neighboring_fence.buffer_as_keystr(),
                              neighboring_fence.get_length_as_keystr());
                okvl_mode lock_mode;
                if (only_low_fence_exact_match) {
                    // only the fence key itself can match: no gap lock needed
                    // (presumably *_GAP_N means "no gap portion" — OKVL naming)
                    lock_mode = _ex_lock ? ALL_X_GAP_N: ALL_S_GAP_N;
                } else {
                    lock_mode = _ex_lock ? ALL_X_GAP_X : ALL_S_GAP_S;
                }
                // we can unconditionally request lock because we already released latch
                W_DO(ss_m::lm->lock(lid.hash(), lock_mode, true, true, true));
            }

            // TODO this part should check if we find an exact match of fence keys.
            // because we unlatch above, it's possible to not find exact match.
            // in that case, we should change the traverse_mode to fence_contains and continue
            W_DO(btree_impl::_ux_traverse(_store, neighboring_fence, traverse_mode, LATCH_SH, p));

            // restart at the first (forward) / last (backward) slot of the
            // newly latched page; the loop re-checks for emptiness/bounds
            _slot = _forward ? 0 : p.nrecs() - 1;
            _set_current_page(p);
            continue;
        }

        // take lock on the next key.
        // NOTE: until we get locks, we aren't sure the key really becomes
        // the next key. So, we use the temporary variable _tmp_next_key_buf.
        const okvl_mode *mode = NULL;
        {
            p.get_key(_slot, _tmp_next_key_buf);
            if (_forward) {
                int d = _tmp_next_key_buf.compare(_upper);
                if (d < 0) {
                    // strictly inside the range: lock key and following gap
                    mode = _ex_lock ? &ALL_X_GAP_X : &ALL_S_GAP_S;
                } else if (d == 0 && _upper_inclusive) {
                    // exact match on an inclusive upper bound: key only, no gap
                    mode = _ex_lock ? &ALL_X_GAP_N : &ALL_S_GAP_N;
                } else {
                    // past the upper bound; ALL_N_GAP_N presumably denotes an
                    // empty mode, so the lock request below is skipped
                    eof = true;
                    mode = &ALL_N_GAP_N;
                }
            } else {
                int d = _tmp_next_key_buf.compare(_lower);
                if (d > 0) {
                    mode = _ex_lock ? &ALL_X_GAP_X : &ALL_S_GAP_S;
                } else if (d == 0 && _lower_inclusive) {
                    // NOTE(review): unlike the forward/upper case, the gap is
                    // still locked on an exact lower-bound match — looks
                    // intentionally conservative, but worth confirming
                    mode = _ex_lock ? &ALL_X_GAP_X : &ALL_S_GAP_S;
                } else {
                    // below the lower bound: at eof, but still lock the gap
                    // (gap-only mode) to protect the scanned range's boundary
                    eof = true;
                    mode = _ex_lock ? &ALL_N_GAP_X : &ALL_N_GAP_S;
                }
            }
        }
        if (_needs_lock && !mode->is_empty()) {
            rc_t rc = btree_impl::_ux_lock_key (_store, p, _tmp_next_key_buf,
                                                LATCH_SH, *mode, false);
            if (rc.is_error()) {
                if (rc.err_num() == eLOCKRETRY) {
                    // the page changed while we waited for the lock: revalidate
                    // and re-read the slot from the top of the loop
                    W_DO(_check_page_update(p));
                    continue;
                } else {
                    return rc;
                }
            }
        }
        // okay, now we are sure the _tmp_next_key_buf is the key we want to use
        _key = _tmp_next_key_buf;
        return RCOK; // found a record! (or eof)
    }
    return RCOK; // unreachable: the loop only exits via return
}
// Acquire (or, if check_only, merely probe) a key lock on 'keystr'/'keylen'
// while holding the latch on 'leaf', using the try-conditionally-then-wait
// protocol: a conditional request under the latch, and on timeout an
// unconditional (blocking) retry after releasing the latch.
//
// @param store      store the key belongs to (used to build the lock id)
// @param leaf       latched leaf page; may be unfixed/refixed internally
// @param keystr     raw key bytes
// @param keylen     length of keystr in bytes
// @param latch_mode mode used to re-acquire the latch after a blocking wait
// @param lock_mode  requested OKVL lock mode
// @param check_only if true, the lock is probed but not retained (no acquire)
// @return RCOK on success; eDEADLOCK if the caller must abort; eLOCKRETRY if
//         the page changed while unlatched and the caller must re-read it.
rc_t btree_impl::_ux_lock_key(
    const StoreID&      store,
    btree_page_h&       leaf,
    const void*         keystr,
    size_t              keylen,
    latch_mode_t        latch_mode,
    const okvl_mode&    lock_mode,
    bool                check_only
)
{
    // Callers:
    // 1. Top level _ux_lock_key() - I/U/D and search operations, lock conflict is possible
    // 2. _ux_lock_range() - lock conflict is possible
    //
    // Lock conflict:
    // 1. Deadlock - the asking lock is held by another transaction currently, and the
    //               current transaction is holding other locks already, failed
    // 2. Timeout - the asking lock is held by another transaction currently, but the
    //              current transaction does not hold other locks, okay to retry
    //
    // For restart operation using lock re-acquisition:
    // 1. On_demand or mixed UNDO - when lock conflict, it triggers UNDO transaction rollback
    //                              this is a blocking operation, meaning the other concurrent
    //                              transactions asking for the same lock are blocked, no deadlock
    // 2. Traditional UNDO - original behavior, either deadlock error or timeout and retry
    lockid_t lid (store, (const unsigned char*) keystr, keylen);

    // first, try conditionally. we utilize the inserted lock entry even if it fails
    RawLock* entry = nullptr;

    // The lock request does the following:
    // If the lock() failed to acquire lock (trying to acquire lock while holding the latch) and
    // if the transaction doesn't have any other locks, because 'condition' is true, lock()
    // returns immediately with eCONDLOCKTIMEOUT which indicates it failed to
    // acquire lock but no deadlock worry and the lock entry has been created already.
    // In this case caller (this function) releases latch and tries again using retry_lock()
    // which is a blocking operation; this is safe because we can retry forever without
    // risking deadlock.
    // If the lock() returns eDEADLOCK, it means lock acquisition failed and
    // the current transaction already held other locks; it is not safe to retry (would cause
    // further deadlocks), therefore the caller must abort the current transaction.
    rc_t lock_rc = lm->lock(lid.hash(), lock_mode, true /* check */, false /* wait */,
                            !check_only /* acquire */, smthread_t::xct(),
                            timeout_t::WAIT_IMMEDIATE, &entry);
    if (!lock_rc.is_error()) {
        // lucky! we got it immediately. just return.
        return RCOK;
    } else {
        // if it caused deadlock and it was chosen to be victim, give up! (not retry)
        if (lock_rc.err_num() == eDEADLOCK) {
            // The user transaction will abort and rollback itself upon deadlock detection.
            // Because Express does not have a deadlock monitor and policy to determine
            // which transaction to rollback during a deadlock (should abort the cheaper
            // transaction), the user transaction which detects deadlock will be aborted.
            w_assert1(entry == nullptr);
            return lock_rc;
        }

        // couldn't immediately get it. then we unlatch the page and wait.
        w_assert1(lock_rc.err_num() == eCONDLOCKTIMEOUT);
        w_assert1(entry != nullptr);

        // we release the latch here. However, we increment the pin count before that
        // to prevent the page from being evicted while we are not holding the latch.
        pin_for_refix_holder pin_holder(leaf.pin_for_refix()); // automatically releases the pin
        lsn_t prelsn = leaf.get_page_lsn(); // to check if it's modified after this unlatch
        leaf.unfix();

        // then, we try it unconditionally (this will block until the lock is granted)
        W_DO(lm->retry_lock(&entry, !check_only /* acquire */));

        // now we got the lock.. but the page might have changed because we unlatched.
        w_rc_t refix_rc = leaf.refix_direct(pin_holder._idx, latch_mode);
        if (refix_rc.is_error() || leaf.get_page_lsn() != prelsn) {
            // either the refix failed or the page was modified while unlatched:
            // the lock we just took may no longer protect the right key, so
            // release it (if we actually acquired one) before bailing out
            if (entry != nullptr) {
                w_assert1(!check_only);
                lm->unlock(entry);
            } else {
                // check_only probes don't retain a lock entry to release
                w_assert1(check_only);
            }
            if (refix_rc.is_error()) {
                return refix_rc;
            } else {
                w_assert1(leaf.get_page_lsn() != prelsn); // unluckily, it's the case
                return RC(eLOCKRETRY); // retry! caller must re-read the page
            }
        }
        return RCOK;
    }
}