rc_t btree_impl::_sx_adopt_foster_all_core(
    btree_page_h &parent, bool is_root, bool recursive)
{
    // TODO this should use the improved tree-walk-through
    // See jira ticket:60 "Tree walk-through without more than 2 pages latched"
    // (originally trac ticket:62)
    w_assert1(xct()->is_sys_xct());
    w_assert1(parent.is_fixed());
    w_assert1(parent.latch_mode() == LATCH_EX);
    if (parent.is_node()) {
        w_assert1(parent.pid0());
        W_DO(_sx_adopt_foster_sweep(parent));
        if (recursive) {
            // also adopt at all children recursively
            for (int i = -1; i < parent.nrecs(); ++i) {
                btree_page_h child;
                PageID shpid_opaqueptr = (i == -1)
                    ? parent.get_foster_opaqueptr()
                    : parent.child_opaqueptr(i);
                W_DO(child.fix_nonroot(parent, shpid_opaqueptr, LATCH_EX));
                W_DO(_sx_adopt_foster_all_core(child, false, true));
            }
        }
    }
    // after all adopts, if this parent is the root and still has a foster child,
    // let's grow the tree
    if (is_root && parent.get_foster()) {
        W_DO(_sx_grow_tree(parent));
        W_DO(_sx_adopt_foster_sweep(parent));
    }
    w_assert3(parent.is_consistent(true, true));
    return RCOK;
}
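/*
 * Note on the root case above (an observation on this code, not authoritative
 * documentation): a foster child hanging off the root has no parent that could
 * adopt it, so the only way to clear it is to add a level -- _sx_grow_tree
 * moves the root's contents into a new child, after which the second sweep can
 * adopt the former foster chain under the (now one-level-deeper) root.
 */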
void bt_cursor_t::_set_current_page(btree_page_h &page)
{
    if (_pid != 0) {
        _release_current_page();
    }
    w_assert1(_pid == 0);
    w_assert1(_pid_bfidx.idx() == 0);
    _pid = page.pid();
    // pin this page for subsequent refix()
    _pid_bfidx.set(page.pin_for_refix());
    _lsn = page.get_page_lsn();
#ifndef USE_ATOMIC_COMMIT
    w_assert1(_lsn.valid()); // must have a valid LSN for _check_page_update to work
#endif
}
rc_t btree_impl::_sx_adopt_foster(btree_page_h &parent, btree_page_h &child)
{
    w_keystr_t new_child_key;
    child.copy_fence_high_key(new_child_key);
    W_DO(_sx_split_if_needed(parent, new_child_key));

    // Now, another SSX to move the pointer
    sys_xct_section_t sxs(true);
    W_DO(sxs.check_error_on_start());
    rc_t ret = _ux_adopt_foster_core(parent, child, new_child_key);
    W_DO(sxs.end_sys_xct(ret));
    DBG(<< "Adopted " << child.pid() << " into " << parent.pid());
    return ret;
}
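/*
 * The sys_xct_section_t idiom above is the single-log system transaction (SSX)
 * pattern used throughout this file (see also _sx_split_foster below). A
 * minimal sketch of the pattern, using only calls that appear in this file --
 * do_redo_only_work() is a hypothetical placeholder for whatever single-log
 * operation the SSX performs:
 *
 *     sys_xct_section_t sxs(true);        // true: single-log system xct
 *     W_DO(sxs.check_error_on_start());
 *     rc_t ret = do_redo_only_work();     // hypothetical redo-only step
 *     W_DO(sxs.end_sys_xct(ret));         // commits iff ret == RCOK
 *
 * The SSX commits or aborts independently of the enclosing user transaction,
 * so a structural change such as an adoption survives even if the user
 * transaction later rolls back.
 */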
rc_t btree_impl::_ux_assure_fence_low_entry(btree_page_h &leaf)
{
    w_assert1(leaf.is_fixed());
    w_assert1(leaf.latch_mode() == LATCH_EX);
    if (!leaf.is_leaf()) {
        // locks are taken only on leaf-page entries, so this case isn't an issue
        return RCOK;
    }
    w_keystr_t fence_low;
    leaf.copy_fence_low_key(fence_low);
    bool needs_to_create = false;
    if (leaf.nrecs() == 0) {
        if (leaf.compare_with_fence_high(fence_low) == 0) {
            // low==high happens only during page split. In that case, no one can
            // hold a lock on the page being created. No need to assure the record.
            return RCOK;
        }
        needs_to_create = true;
    } else {
        w_keystr_t first_key;
        leaf.get_key(0, first_key);
        w_assert1(fence_low.compare(first_key) <= 0); // can't be fence_low > first_key
        if (fence_low.compare(first_key) < 0) {
            // fence-low doesn't exist as an entry!
            needs_to_create = true;
        }
    }
    if (needs_to_create) {
        W_DO(_sx_reserve_ghost(leaf, fence_low, 0)); // no data is needed
    }
    return RCOK;
}
w_rc_t bt_cursor_t::_refix_current_key(btree_page_h &p)
{
    while (true) {
        w_rc_t fix_rt = p.refix_direct(_pid_bfidx.idx(), LATCH_SH);
        if (!fix_rt.is_error()) {
            break; // usually no error
        }
        if (fix_rt.err_num() != eBF_DIRECTFIX_SWIZZLED_PTR) {
            return fix_rt; // unexpected error code
        }
        W_DO(btree_impl::_ux_traverse(_store, _key, btree_impl::t_fence_contain,
                                      LATCH_SH, p));
        _slot = _forward ? 0 : p.nrecs() - 1;
        _set_current_page(p); // now let's re-locate the key
    }
    return RCOK;
}
rc_t bt_cursor_t::_make_rec(const btree_page_h& page)
{
    // Copy the record to the buffer
    bool ghost;
    _elen = sizeof(_elbuf);
    page.copy_element(_slot, _elbuf, _elen, ghost);

#if W_DEBUG_LEVEL > 0
    w_assert1(_elen <= sizeof(_elbuf));
    // ghosts should have been skipped at _advance_one_slot()
    w_assert1(!ghost);
    w_keystr_t key_again;
    page.get_key(_slot, key_again);
    w_assert1(key_again.compare(_key) == 0);
#endif // W_DEBUG_LEVEL > 0
    return RCOK;
}
rc_t bt_cursor_t::_check_page_update(btree_page_h &p)
{
    // was the page changed?
    if (_pid != p.pid() || p.get_page_lsn() != _lsn) {
        // check if the page still contains the key we are based on
        bool found = false;
        if (p.fence_contains(_key)) {
            // it still contains the key; just re-locate _slot
            p.search(_key, found, _slot);
        } else {
            // we have to re-locate the page
            W_DO(btree_impl::_ux_traverse(_store, _key, btree_impl::t_fence_contain,
                                          LATCH_SH, p));
            p.search(_key, found, _slot);
        }
        w_assert1(found || !_needs_lock
                  || (!_forward && !_upper_inclusive && !_dont_move_next)); // see _locate_first
        _set_current_page(p);
    }
    return RCOK;
}
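/*
 * Why comparing (_pid, _lsn) suffices above (an observation, assuming the
 * usual ARIES-style invariant that every page update advances the page LSN):
 * an unchanged (pid, LSN) pair means the page has not been modified since the
 * cursor recorded both values in _set_current_page, so _slot is still valid
 * and no re-search is needed. This is also why _set_current_page asserts a
 * valid LSN when USE_ATOMIC_COMMIT is not defined.
 */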
rc_t btree_impl::_sx_split_if_needed(btree_page_h &page, const w_keystr_t &new_key)
{
    bool need_split =
        !page.check_space_for_insert_node(new_key)
        || (page.is_insertion_extremely_skewed_right()
            && page.check_chance_for_norecord_split(new_key));
    if (!need_split) {
        return RCOK; // easy
    }

    PageID new_page_id;
    // we are running a user transaction. simply call the SSX split.
    W_DO(_sx_split_foster(page, new_page_id, new_key));

    // After the split, the new page might be the one that covers new_key now.
    if (!page.fence_contains(new_key)) {
        btree_page_h new_page;
        W_DO(new_page.fix_nonroot(page, new_page_id, LATCH_EX));
        w_assert1(new_page.fence_contains(new_key));
        page.unfix();
        page = new_page;
    }
    return RCOK;
}
rc_t bt_cursor_t::_find_next(btree_page_h &p, bool &eof)
{
    while (true) {
        if (_dont_move_next) {
            _dont_move_next = false;
        } else {
            W_DO(_advance_one_slot(p, eof));
        }
        if (eof) {
            break;
        }
        // skip ghost entries
        if (p.is_ghost(_slot)) {
            continue;
        }
        break;
    }
    return RCOK;
}
rc_t btree_impl::_sx_opportunistic_adopt_foster(btree_page_h &parent, btree_page_h &child,
                                                bool &pushedup, const bool from_recovery)
{
    w_assert1(parent.is_fixed());
    w_assert1(parent.is_node());
    w_assert1(child.is_fixed());
    pushedup = false;

    // let's try upgrading the parent to an EX latch. This is highly likely to fail
    // under high load, so we do it here to avoid the system-transaction creation cost.
    // we start from the parent because the EX latch on the child is assured to be
    // available in this order
    if (!parent.upgrade_latch_conditional()) {
        DBGOUT1(<< "opportunistic_adopt gave it up because of parent. "
                << parent.pid() << ". do nothing.");
        increase_ex_need(parent.pid()); // give a hint to subsequent accesses
        return RCOK;
    }
rc_t btree_impl::_sx_split_foster(btree_page_h& page, PageID& new_page_id,
                                  const w_keystr_t& triggering_key)
{
    sys_xct_section_t sxs(true);
    W_DO(sxs.check_error_on_start());

    w_assert1(page.latch_mode() == LATCH_EX);
    // DBG(<< "SPLITTING " << page);

    /*
     * Step 1: Allocate a new page for the foster child
     */
    W_DO(smlevel_0::vol->alloc_a_page(new_page_id));

    /*
     * Step 2: Create the new foster child and move records into it, logging its
     * raw contents as a page_img_format operation
     */
    btree_page_h new_page;
    rc_t rc = new_page.fix_nonroot(page, new_page_id, LATCH_EX, false, true);
    if (rc.is_error()) {
        W_DO(smlevel_0::vol->deallocate_page(new_page_id));
        return rc;
    }

    // assure the foster-child page has an entry same as fence-low for locking
    // correctness. See jira ticket:84 "Key Range Locking" (originally trac ticket:86).
    // CS TODO - why is this required?
    // There may be a bug, since an error happens if we uncomment this:
    // W_DO(_ux_assure_fence_low_entry(new_page)); // this might be another SSX

    int move_count = 0;
    w_keystr_t split_key;
    W_DO(new_page.format_foster_child(page, new_page_id, triggering_key, split_key,
                                      move_count));
    w_assert0(move_count > 0);
    // DBG5(<< "NEW FOSTER CHILD " << new_page);

    /*
     * Step 3: Delete moved records and update the foster child pointer and high
     * fence on the overflowing page. The foster parent is not recompressed after
     * moving records (CS TODO)
     */
    page.delete_range(page.nrecs() - move_count, page.nrecs());
    // DBG5(<< "AFTER RANGE DELETE " << page);
    w_keystr_t new_chain;
    new_page.copy_chain_fence_high_key(new_chain);
    bool foster_set = page.set_foster_child(new_page_id, split_key, new_chain);
    w_assert0(foster_set);

    /*
     * Step 4: Update parent pointers of the moved records. The new foster
     * child will have the correct parent set by the fix call above. This is
     * only required because of swizzling.
     */
    // set parent pointer on hash table
    // smlevel_0::bf->switch_parent(new_page_id, page.get_generic_page());

    // set parent pointer for children that moved to the new page
    int max_slot = new_page.max_child_slot();
    for (general_recordid_t i = GeneralRecordIds::FOSTER_CHILD; i <= max_slot; ++i) {
        // CS TODO: Slot 1 (which is actually 0 in the internal page
        // representation) is not used when inserting into an empty node (see
        // my comment on btree_page_h.cpp::insert_nonghost), so in *some*
        // cases, the slot i=1 will yield an invalid page in switch_parent
        // below. Because of this great design feature, switch_parent has to
        // cope with an invalid page.
        smlevel_0::bf->switch_parent(*new_page.child_slot_address(i),
                                     new_page.get_generic_page());
    }

    /*
     * Step 5: Log the bulk deletion and foster update on the parent
     */
    W_DO(log_btree_split(new_page, page, move_count, split_key, new_chain));
    w_assert1(new_page.get_page_lsn() != lsn_t::null);

    // hint for subsequent accesses
    increase_forster_child(page.pid());

    W_DO(sxs.end_sys_xct(RCOK));

    DBG1(<< "Split page " << page.pid() << " into " << new_page_id);
    return RCOK;
}
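/*
 * Resulting state, as implied by set_foster_child() above (a sketch of the
 * foster-B-tree split invariant, not authoritative documentation): the
 * overflowing page keeps the key range [fence_low, split_key) and reaches the
 * new page via its foster pointer; the new page covers [split_key, old
 * fence_high) and inherits the chain-fence-high key, so both pages agree on
 * the high end of the whole foster chain:
 *
 *     before:  [page: low .. high)
 *     after:   [page: low .. split_key) --foster--> [new_page: split_key .. high)
 *
 * A later adopt (_sx_adopt_foster) turns the foster pointer into a real child
 * entry in the parent.
 */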
rc_t btree_impl::_ux_norec_alloc_core(btree_page_h &page, PageID &new_page_id)
{
    // This is called only in a REDO-only SSX, so no compensation logging. Just apply.
    w_assert1(xct()->is_single_log_sys_xct());
    w_assert1(page.latch_mode() == LATCH_EX);

    W_DO(smlevel_0::vol->alloc_a_page(new_page_id));
    btree_page_h new_page;
    w_rc_t rc;
    rc = new_page.fix_nonroot(page, new_page_id, LATCH_EX, false, true);
    if (rc.is_error()) {
        // if the fix failed for any reason, we release the allocated page.
        W_DO(smlevel_0::vol->deallocate_page(new_page_id));
        return rc;
    }

    // The new page has an empty key range: parent's high to parent's high.
    w_keystr_t fence, chain_high;
    page.copy_fence_high_key(fence);
    bool was_right_most = (page.get_chain_fence_high_length() == 0);
    page.copy_chain_fence_high_key(chain_high);
    if (was_right_most) {
        // this means there was no chain or the page was the right-most of it
        // (so its high = high of chain).
        // upon the first foster split, we start setting the chain-high.
        page.copy_fence_high_key(chain_high);
    }

#if W_DEBUG_LEVEL >= 3
    lsn_t old_lsn = page.get_page_lsn();
#endif // W_DEBUG_LEVEL

    W_DO(log_btree_norec_alloc(page, new_page, new_page_id, fence, chain_high));
    DBGOUT3(<< "btree_impl::_ux_norec_alloc_core, fence=" << fence
            << ", old-LSN=" << old_lsn << ", new-LSN=" << page.get_page_lsn()
            << ", PID=" << new_page_id);

    // initialize as an empty child:
    new_page.format_steal(page.get_page_lsn(), new_page_id, page.store(),
                          page.root(), page.level(), 0, lsn_t::null,
                          page.get_foster_opaqueptr(), page.get_foster_emlsn(),
                          fence, fence, chain_high, false);
    page.accept_empty_child(page.get_page_lsn(), new_page_id, false /* not from redo */);

    // in this operation, the log contains everything we need to recover without any
    // write-order dependency. So, no registration for WOD.
    w_assert3(new_page.is_consistent(true, true));
    w_assert1(new_page.is_fixed());
    w_assert1(new_page.latch_mode() == LATCH_EX);
    w_assert3(page.is_consistent(true, true));
    w_assert1(page.is_fixed());
    return RCOK;
}
rc_t btree_impl::_ux_adopt_foster_core(btree_page_h &parent, btree_page_h &child,
                                       const w_keystr_t &new_child_key)
{
    w_assert1(g_xct()->is_single_log_sys_xct());
    w_assert1(parent.is_fixed());
    w_assert1(parent.latch_mode() == LATCH_EX);
    w_assert1(parent.is_node());
    w_assert1(child.is_fixed());
    w_assert1(child.latch_mode() == LATCH_EX);
    w_assert0(child.get_foster() != 0);

    PageID new_child_pid = child.get_foster();
    if (smlevel_0::bf->is_swizzled_pointer(new_child_pid)) {
        smlevel_0::bf->unswizzle(parent.get_generic_page(),
                                 GeneralRecordIds::FOSTER_CHILD, true, &new_child_pid);
    }
    w_assert1(!smlevel_0::bf->is_swizzled_pointer(new_child_pid));

    lsn_t child_emlsn = child.get_foster_emlsn();
    W_DO(log_btree_foster_adopt(parent, child, new_child_pid, child_emlsn,
                                new_child_key));
    _ux_adopt_foster_apply_parent(parent, new_child_pid, child_emlsn, new_child_key);
    _ux_adopt_foster_apply_child(child);

    // Switch parent of newly adopted child
    // CS TODO: I'm not sure we can do this because we don't hold a latch on new_child_pid
    smlevel_0::bf->switch_parent(new_child_pid, parent.get_generic_page());

    w_assert3(parent.is_consistent(true, true));
    w_assert3(child.is_consistent(true, true));
    return RCOK;
}
rc_t bt_cursor_t::_advance_one_slot(btree_page_h &p, bool &eof)
{
    w_assert1(p.is_fixed());
    w_assert1(_slot <= p.nrecs());

    if (_forward) {
        ++_slot;
    } else {
        --_slot;
    }
    eof = false;

    // keep following the next page,
    // because we might see consecutive empty pages to skip!
    while (true) {
        bool time2move = _forward ? (_slot >= p.nrecs()) : (_slot < 0);
        if (time2move) {
            // Move to the right (left) sibling
            bool reached_end = _forward ? p.is_fence_high_supremum()
                                        : p.is_fence_low_infimum();
            if (reached_end) {
                eof = true;
                return RCOK;
            }
            // now, use fence keys to tell where the neighboring page exists
            w_keystr_t neighboring_fence;
            btree_impl::traverse_mode_t traverse_mode;
            bool only_low_fence_exact_match = false;
            if (_forward) {
                p.copy_fence_high_key(neighboring_fence);
                traverse_mode = btree_impl::t_fence_low_match;
                int d = _upper.compare(neighboring_fence);
                if (d < 0 || (d == 0 && !_upper_inclusive)) {
                    eof = true;
                    return RCOK;
                }
                if (d == 0 && _upper_inclusive) {
                    // we will check the next page, but the only possible match
                    // there is an entry equal to the low fence..
                    only_low_fence_exact_match = true;
                }
            } else {
                // if we are going backwards, the current page had
                //   low = [current-fence-low], high = [current-fence-high]
                // and the previous page should have
                //   low = [?], high = [current-fence-low].
                p.copy_fence_low_key(neighboring_fence);
                // let's find a page which has this value as its high fence
                traverse_mode = btree_impl::t_fence_high_match;
                int d = _lower.compare(neighboring_fence);
                if (d >= 0) {
                    eof = true;
                    return RCOK;
                }
            }
            p.unfix();

            // take a lock on the fence key
            if (_needs_lock) {
                lockid_t lid(_store,
                             (const unsigned char*) neighboring_fence.buffer_as_keystr(),
                             neighboring_fence.get_length_as_keystr());
                okvl_mode lock_mode;
                if (only_low_fence_exact_match) {
                    lock_mode = _ex_lock ? ALL_X_GAP_N : ALL_S_GAP_N;
                } else {
                    lock_mode = _ex_lock ? ALL_X_GAP_X : ALL_S_GAP_S;
                }
                // we can unconditionally request the lock because we already
                // released the latch
                W_DO(ss_m::lm->lock(lid.hash(), lock_mode, true, true, true));
            }

            // TODO this part should check if we find an exact match of fence keys.
            // because we unlatch above, it's possible to not find an exact match.
            // in that case, we should change the traverse_mode to fence_contains
            // and continue
            W_DO(btree_impl::_ux_traverse(_store, neighboring_fence, traverse_mode,
                                          LATCH_SH, p));
            _slot = _forward ? 0 : p.nrecs() - 1;
            _set_current_page(p);
            continue;
        }

        // take a lock on the next key.
        // NOTE: until we get the lock, we aren't sure the key really becomes
        // the next key. So, we use the temporary variable _tmp_next_key_buf.
        const okvl_mode *mode = NULL;
        {
            p.get_key(_slot, _tmp_next_key_buf);
            if (_forward) {
                int d = _tmp_next_key_buf.compare(_upper);
                if (d < 0) {
                    mode = _ex_lock ? &ALL_X_GAP_X : &ALL_S_GAP_S;
                } else if (d == 0 && _upper_inclusive) {
                    mode = _ex_lock ? &ALL_X_GAP_N : &ALL_S_GAP_N;
                } else {
                    eof = true;
                    mode = &ALL_N_GAP_N;
                }
            } else {
                int d = _tmp_next_key_buf.compare(_lower);
                if (d > 0) {
                    mode = _ex_lock ? &ALL_X_GAP_X : &ALL_S_GAP_S;
                } else if (d == 0 && _lower_inclusive) {
                    mode = _ex_lock ? &ALL_X_GAP_X : &ALL_S_GAP_S;
                } else {
                    eof = true;
                    mode = _ex_lock ? &ALL_N_GAP_X : &ALL_N_GAP_S;
                }
            }
        }
        if (_needs_lock && !mode->is_empty()) {
            rc_t rc = btree_impl::_ux_lock_key(_store, p, _tmp_next_key_buf,
                                               LATCH_SH, *mode, false);
            if (rc.is_error()) {
                if (rc.err_num() == eLOCKRETRY) {
                    W_DO(_check_page_update(p));
                    continue;
                } else {
                    return rc;
                }
            }
        }
        // okay, now we are sure _tmp_next_key_buf is the key we want to use
        _key = _tmp_next_key_buf;
        return RCOK; // found a record! (or eof)
    }
    return RCOK;
}
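/*
 * Two things worth noting about _advance_one_slot (observations on the code
 * above, not additional authoritative documentation):
 *
 * 1. Page hops are driven purely by fence keys: to reach a neighbor, the
 *    cursor unfixes the current page and re-traverses from the root with
 *    t_fence_low_match / t_fence_high_match, so it never keeps more than one
 *    leaf latched while moving sideways.
 *
 * 2. The eLOCKRETRY branch cooperates with _ux_lock_key below: when the lock
 *    could not be granted conditionally, _ux_lock_key released the latch and
 *    blocked, so the page may have changed; the cursor then re-validates via
 *    _check_page_update and retries the advance.
 */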
rc_t btree_impl::_ux_lock_key(const StoreID& store, btree_page_h& leaf,
                              const void* keystr, size_t keylen,
                              latch_mode_t latch_mode, const okvl_mode& lock_mode,
                              bool check_only)
{
    // Callers:
    // 1. Top-level _ux_lock_key() - I/U/D and search operations, lock conflict is possible
    // 2. _ux_lock_range() - lock conflict is possible
    //
    // Lock conflict:
    // 1. Deadlock - the requested lock is held by another transaction, and the
    //    current transaction already holds other locks: fail
    // 2. Timeout - the requested lock is held by another transaction, but the
    //    current transaction holds no other locks: okay to retry
    //
    // For restart operation using lock re-acquisition:
    // 1. On-demand or mixed UNDO - on lock conflict, triggers UNDO transaction rollback.
    //    this is a blocking operation, meaning the other concurrent transactions
    //    asking for the same lock are blocked; no deadlock
    // 2. Traditional UNDO - original behavior, either deadlock error or timeout and retry

    lockid_t lid(store, (const unsigned char*) keystr, keylen);
    // first, try conditionally. we utilize the inserted lock entry even if it fails
    RawLock* entry = nullptr;

    // The lock request does the following:
    // If lock() failed to acquire the lock (we are trying to acquire it while holding
    // the latch) and the transaction doesn't hold any other locks, then, because the
    // request is conditional, lock() returns immediately with eCONDLOCKTIMEOUT, which
    // indicates it failed to acquire the lock but there is no deadlock concern and the
    // lock entry has already been created. In this case the caller (this function)
    // releases the latch and tries again using retry_lock(), a blocking operation;
    // this is safe because we can retry forever without risking deadlock.
    // If lock() returns eDEADLOCK, lock acquisition failed and the current transaction
    // already holds other locks; it is not safe to retry (that would cause further
    // deadlocks), therefore the caller must abort the current transaction.
    rc_t lock_rc = lm->lock(lid.hash(), lock_mode, true /* check */, false /* wait */,
                            !check_only /* acquire */, smthread_t::xct(),
                            timeout_t::WAIT_IMMEDIATE, &entry);
    if (!lock_rc.is_error()) {
        // lucky! we got it immediately. just return.
        return RCOK;
    } else {
        // if it caused a deadlock and we were chosen to be the victim, give up (no retry)
        if (lock_rc.err_num() == eDEADLOCK) {
            // The user transaction will abort and roll back itself upon deadlock
            // detection. Because Express does not have a deadlock monitor and a policy
            // to decide which transaction to roll back during a deadlock (it should
            // abort the cheaper transaction), the user transaction which detects the
            // deadlock is the one aborted.
            w_assert1(entry == nullptr);
            return lock_rc;
        }

        // couldn't immediately get it. then we unlatch the page and wait.
        w_assert1(lock_rc.err_num() == eCONDLOCKTIMEOUT);
        w_assert1(entry != nullptr);

        // we release the latch here. However, we increment the pin count before that
        // to prevent the page from being evicted.
        pin_for_refix_holder pin_holder(leaf.pin_for_refix()); // automatically releases the pin
        lsn_t prelsn = leaf.get_page_lsn(); // to check if it's modified after this unlatch
        leaf.unfix();

        // then, we try it unconditionally (this will block)
        W_DO(lm->retry_lock(&entry, !check_only /* acquire */));
        // now we have the lock, but the page might have changed while we were unlatched.
        w_rc_t refix_rc = leaf.refix_direct(pin_holder._idx, latch_mode);
        if (refix_rc.is_error() || leaf.get_page_lsn() != prelsn) {
            // release the acquired lock
            if (entry != nullptr) {
                w_assert1(!check_only);
                lm->unlock(entry);
            } else {
                w_assert1(check_only);
            }
            if (refix_rc.is_error()) {
                return refix_rc;
            } else {
                w_assert1(leaf.get_page_lsn() != prelsn); // unluckily, the page has changed
                return RC(eLOCKRETRY); // retry!
            }
        }
        return RCOK;
    }
}
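/*
 * Caller-side pattern for eLOCKRETRY (this mirrors the actual use in
 * _advance_one_slot above; repeated here only to make the contract explicit):
 *
 *     rc_t rc = btree_impl::_ux_lock_key(_store, p, _tmp_next_key_buf,
 *                                        LATCH_SH, *mode, false);
 *     if (rc.is_error()) {
 *         if (rc.err_num() == eLOCKRETRY) {
 *             W_DO(_check_page_update(p)); // re-validate the page, re-search the slot
 *             continue;                    // and retry the advance
 *         }
 *         return rc;
 *     }
 *
 * The contract: a successful return means the lock is held (unless check_only)
 * and the leaf is re-latched in the same state as when the key was read;
 * eLOCKRETRY means the lock was given up because that state could not be
 * re-established.
 */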
rc_t btree_impl::_ux_lock_range(const StoreID& stid, btree_page_h& leaf,
                                const void* keystr, size_t keylen, slotid_t slot,
                                latch_mode_t latch_mode,
                                const okvl_mode& exact_hit_lock_mode,
                                const okvl_mode& miss_lock_mode, bool check_only)
{
    w_assert1(slot >= -1 && slot <= leaf.nrecs());
    w_assert1(exact_hit_lock_mode.get_gap_mode() == okvl_mode::N);
    w_assert1(miss_lock_mode.is_keylock_empty());

    if (slot == -1) { // this means we should search for it again
        bool found;
        leaf.search((const char*) keystr, keylen, found, slot);
        w_assert1(!found); // precondition
    }
    w_assert1(slot >= 0 && slot <= leaf.nrecs());

#if W_DEBUG_LEVEL > 1
    w_keystr_t key, key_at_slot;
    key.construct_from_keystr(keystr, keylen);
    if (slot < leaf.nrecs()) {
        leaf.get_key(slot, key_at_slot);
        w_assert1(key_at_slot.compare(key) > 0);
    }
#endif // W_DEBUG_LEVEL > 1

    slot--; // we want the range lock from the previous key
    if (slot == -1
        && w_keystr_t::compare_bin_str(keystr, keylen, leaf.get_fence_low_key(),
                                       leaf.get_fence_low_length()) == 0) {
        // We were searching for the low-fence key! Then we take a key lock on it, and
        // subsequent structural modifications (e.g., merge) will add the low-fence as
        // a ghost record to be aware of the lock.
        W_DO(_ux_lock_key(stid, leaf,
                          leaf.get_fence_low_key(), leaf.get_fence_low_length(),
                          latch_mode, exact_hit_lock_mode, check_only));
    } else {
        w_keystr_t prevkey;
        if (slot == -1) {
            leaf.copy_fence_low_key(prevkey);
        } else {
            leaf.get_key(slot, prevkey);
        }
#if W_DEBUG_LEVEL > 1
        w_assert1(prevkey.compare(key) < 0);
#endif // W_DEBUG_LEVEL > 1
        W_DO(_ux_lock_key(stid, leaf, prevkey, latch_mode, miss_lock_mode, check_only));
    }
    return RCOK;
}
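/*
 * Why the *previous* key is locked (an observation on the code above): in
 * key-range locking, the gap lock attached to a key covers the open interval
 * up to the next key, so protecting a miss at position `slot` means locking
 * the entry at `slot - 1`, or the low fence when there is no previous entry.
 * This matches the assertion that miss_lock_mode carries only a gap component
 * (miss_lock_mode.is_keylock_empty()).
 */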