/**
 * Look up @a key in the btree behind @a be and, on success, optionally
 * couple @a cursor to the matching slot and read back the key and record.
 *
 * The function runs in these phases:
 *  1. btree_find_get_hints() fills `hints`; when it flags the key as out of
 *     bounds the lookup fails right away.
 *  2. When the hints suggest a fast track, the hinted leaf page is fetched
 *     from the cache only and searched; only a hit strictly inside the node
 *     (not on either edge) is accepted.
 *  3. Otherwise (or when the fast track produced nothing) the tree is
 *     descended from the root page down to a leaf, which is then searched.
 *  4. For LT/GT style lookups the resulting index is shifted by one slot,
 *     loading the left/right sibling page when the shift leaves the node.
 *  5. The cursor (if any) is coupled to the page/index, and the key (only
 *     for approximate matches) and the record (if requested) are read.
 *
 * @param be      btree backend; its database handle and root page address
 *                are used
 * @param cursor  optional; when non-NULL it is asserted to be neither
 *                coupled nor uncoupled and is then coupled to the result
 * @param key     search key; its internal flags are read and may be
 *                rewritten (KEY_IS_APPROXIMATE cleared, KEY_IS_LT/KEY_IS_GT
 *                set); passed to util_read_key() for approximate matches
 * @param record  optional; when non-NULL its _intflags/_rid are set from
 *                the found entry and util_read_record() is called on it
 * @param flags   find flags; used to seed the first two members of `hints`
 *                (presumably `flags` and `original_flags` -- confirm against
 *                the find_hints_t declaration) and forwarded to
 *                util_read_record()
 *
 * @return 0 on success; HAM_KEY_NOT_FOUND when no suitable entry exists;
 *         a status forwarded from db_fetch_page(), btree_traverse_tree(),
 *         util_read_key() or util_read_record(); a negative value returned
 *         by btree_node_search_by_key() cast to ham_status_t; or
 *         HAM_INTERNAL_ERROR when a page fetch yields neither a page nor a
 *         status.
 */
ham_status_t btree_find_cursor(ham_btree_t *be, ham_bt_cursor_t *cursor,
        ham_key_t *key, ham_record_t *record, ham_u32_t flags)
{
    ham_status_t st;
    ham_page_t *page = NULL;
    btree_node_t *node = NULL;
    int_key_t *entry;
    ham_s32_t idx = -1;     /* -1 means "no hit (yet)"; values < -1 are treated as error codes below */
    ham_db_t *db=be_get_db(be);
    find_hints_t hints = {flags, flags, 0, HAM_FALSE, HAM_FALSE, 1};

    btree_find_get_hints(&hints, db, key);

    if (hints.key_is_out_of_bounds) {
        stats_update_find_fail_oob(db, &hints);
        return HAM_KEY_NOT_FOUND;
    }

    if (hints.try_fast_track) {
        /*
         * see if we get a sure hit within this btree leaf; if not, revert to
         * regular scan
         *
         * As this is a speed-improvement hint re-using recent material, the
         * page should still sit in the cache, or we're using old info, which
         * should be discarded.
         */
        st = db_fetch_page(&page, db, hints.leaf_page_addr, DB_ONLY_FROM_CACHE);
        ham_assert(st ? !page : 1, (0));
        /*
         * NOTE(review): unlike the other failure exits in this function,
         * this one returns without calling stats_update_find_fail() --
         * confirm this is intentional.
         */
        if (st)
            return st;
        if (page) {
            node=ham_page_get_btree_node(page);
            ham_assert(btree_node_is_leaf(node), (0));
            ham_assert(btree_node_get_count(node) >= 3, (0)); /* edges + middle match */

            idx = btree_node_search_by_key(db, page, key, hints.flags);
            /*
             * if we didn't hit a match OR a match at either edge, FAIL.
             * A match at one of the edges is very risky, as this can also
             * signal a match far away from the current node, so we need
             * the full tree traversal then.
             */
            if (idx <= 0 || idx >= btree_node_get_count(node) - 1) {
                idx = -1;
            }
            /*
             * else: we landed in the middle of the node, so we don't need to
             * traverse the entire tree now.
             */
        }

        /* Reset any errors which may have been collected during the hinting
         * phase -- this is done by setting 'idx = -1' above as that effectively
         * clears the possible error code stored in there when (idx < -1)
         */
    }

    /* regular search: taken when no fast track was tried or it gave no usable hit */
    if (idx == -1) {
        /* get the address of the root page */
        if (!btree_get_rootpage(be)) {
            stats_update_find_fail(db, &hints);
            return HAM_KEY_NOT_FOUND;
        }

        /* load the root page */
        st=db_fetch_page(&page, db, btree_get_rootpage(be), 0);
        ham_assert(st ? !page : 1, (0));
        if (!page) {
            ham_assert(st, (0));
            stats_update_find_fail(db, &hints);
            return st ? st : HAM_INTERNAL_ERROR;
        }

        /* now traverse the root to the leaf nodes, till we find a leaf */
        node=ham_page_get_btree_node(page);
        if (!btree_node_is_leaf(node)) {
            /* signal 'don't care' when we have multiple pages; we resolve
               this once we've got a hit further down */
            if (hints.flags & (HAM_FIND_LT_MATCH | HAM_FIND_GT_MATCH))
                hints.flags |= (HAM_FIND_LT_MATCH | HAM_FIND_GT_MATCH);

            for (;;) {
                /* one cost unit per level descended */
                hints.cost++;
                st=btree_traverse_tree(&page, 0, db, page, key);
                if (!page) {
                    stats_update_find_fail(db, &hints);
                    return st ? st : HAM_KEY_NOT_FOUND;
                }

                node=ham_page_get_btree_node(page);
                if (btree_node_is_leaf(node))
                    break;
            }
        }

        /* check the leaf page for the key */
        idx=btree_node_search_by_key(db, page, key, hints.flags);
        /* values below -1 are returned to the caller as a status code */
        if (idx < -1) {
            stats_update_find_fail(db, &hints);
            return (ham_status_t)idx;
        }
    } /* end of regular search */

    /*
     * When we are performing an approximate match, the worst case
     * scenario is where we've picked the wrong side of the fence
     * while sitting at a page/node boundary: that's what this
     * next piece of code resolves:
     *
     * essentially it moves one record forwards or backward when
     * the flags tell us this is mandatory and we're not in the proper
     * position yet.
     *
     * The whole trick works, because the code above detects when
     * we need to traverse a multi-page btree -- where this worst-case
     * scenario can happen -- and adjusted the flags to accept
     * both LT and GT approximate matches so that btree_node_search_by_key()
     * will be hard pressed to return a 'key not found' signal (idx==-1),
     * instead delivering the nearest LT or GT match; all we need to
     * do now is ensure we've got the right one and if not,
     * shift by one.
     *
     * NOTE(review): `key` is passed to ham_key_get_intflags() here without
     * a NULL check, while the key-reading step further down tests
     * `key &&` first -- confirm whether a NULL key can reach this point.
     */
    if (idx >= 0) {
        /*
         * case 1: the search reported an approximate hit and the caller
         * asked for only one direction (LT or GT, not both)
         */
        if ((ham_key_get_intflags(key) & KEY_IS_APPROXIMATE)
                && (hints.original_flags & (HAM_FIND_LT_MATCH | HAM_FIND_GT_MATCH)) != (HAM_FIND_LT_MATCH | HAM_FIND_GT_MATCH)) {
            /* got a GT hit but the caller wants LT: step one entry to the left */
            if ((ham_key_get_intflags(key) & KEY_IS_GT)
                    && (hints.original_flags & HAM_FIND_LT_MATCH)) {
                /*
                 * if the index-1 is still in the page, just decrement the
                 * index
                 */
                if (idx > 0) {
                    idx--;
                }
                else {
                    /*
                     * otherwise load the left sibling page
                     */
                    if (!btree_node_get_left(node)) {
                        stats_update_find_fail(db, &hints);
                        ham_assert(node == ham_page_get_btree_node(page), (0));
                        stats_update_any_bound(db, page, key, hints.original_flags, -1);
                        return HAM_KEY_NOT_FOUND;
                    }

                    hints.cost++;
                    st = db_fetch_page(&page, db, btree_node_get_left(node), 0);
                    ham_assert(st ? !page : 1, (0));
                    if (!page) {
                        ham_assert(st, (0));
                        stats_update_find_fail(db, &hints);
                        return st ? st : HAM_INTERNAL_ERROR;
                    }
                    node = ham_page_get_btree_node(page);
                    /* continue at the last entry of the left sibling */
                    idx = btree_node_get_count(node) - 1;
                }
                ham_key_set_intflags(key, (ham_key_get_intflags(key) & ~KEY_IS_APPROXIMATE) | KEY_IS_LT);
            }
            /* got an LT hit but the caller wants GT: step one entry to the right */
            else if ((ham_key_get_intflags(key) & KEY_IS_LT)
                    && (hints.original_flags & HAM_FIND_GT_MATCH)) {
                /*
                 * if the index+1 is still in the page, just increment the
                 * index
                 */
                if (idx + 1 < btree_node_get_count(node)) {
                    idx++;
                }
                else {
                    /*
                     * otherwise load the right sibling page
                     */
                    if (!btree_node_get_right(node)) {
                        stats_update_find_fail(db, &hints);
                        ham_assert(node == ham_page_get_btree_node(page), (0));
                        stats_update_any_bound(db, page, key, hints.original_flags, -1);
                        return HAM_KEY_NOT_FOUND;
                    }

                    hints.cost++;
                    /*
                     * NOTE(review): this fetch (and the other right-sibling
                     * fetches below) is not followed by the
                     * `ham_assert(st ? !page : 1, ...)` check used after
                     * the other db_fetch_page() calls above.
                     */
                    st = db_fetch_page(&page, db, btree_node_get_right(node), 0);
                    if (!page) {
                        ham_assert(st, (0));
                        stats_update_find_fail(db, &hints);
                        return st ? st : HAM_INTERNAL_ERROR;
                    }
                    node = ham_page_get_btree_node(page);
                    /* continue at the first entry of the right sibling */
                    idx = 0;
                }
                ham_key_set_intflags(key, (ham_key_get_intflags(key) & ~KEY_IS_APPROXIMATE) | KEY_IS_GT);
            }
        }
        /*
         * case 2: the search reported an exact hit, but the caller's flags
         * are non-zero and do not include HAM_FIND_EXACT_MATCH
         */
        else if (!(ham_key_get_intflags(key) & KEY_IS_APPROXIMATE)
                && !(hints.original_flags & HAM_FIND_EXACT_MATCH)
                && (hints.original_flags != 0)) {
            /*
             * 'true GT/LT' has been added @ 2009/07/18 to complete
             * the EQ/LEQ/GEQ/LT/GT functionality;
             *
             * 'true LT/GT' is simply an extension upon the already existing
             * LEQ/GEQ logic just above; all we do here is move one record
             * up/down as it just happens that we get an exact ('equal')
             * match here.
             *
             * The fact that the LT/GT constants share their bits with the
             * LEQ/GEQ flags so that LEQ==(LT|EXACT) and GEQ==(GT|EXACT)
             * ensures that we can restrict our work to a simple adjustment
             * right here; everything else has already been taken care of by
             * the LEQ/GEQ logic in the section above when the key has been
             * flagged with the KEY_IS_APPROXIMATE flag.
             */
            if (hints.original_flags & HAM_FIND_LT_MATCH) {
                /*
                 * if the index-1 is still in the page, just decrement the
                 * index
                 */
                if (idx > 0) {
                    idx--;
                    ham_key_set_intflags(key, (ham_key_get_intflags(key) & ~KEY_IS_APPROXIMATE) | KEY_IS_LT);
                }
                else {
                    /*
                     * otherwise load the left sibling page
                     */
                    if (!btree_node_get_left(node)) {
                        /* when an error is otherwise unavoidable, see if
                           we have an escape route through GT? */
                        if (hints.original_flags & HAM_FIND_GT_MATCH) {
                            /*
                             * if the index+1 is still in the page, just
                             * increment the index
                             */
                            if (idx + 1 < btree_node_get_count(node)) {
                                idx++;
                            }
                            else {
                                /*
                                 * otherwise load the right sibling page
                                 */
                                if (!btree_node_get_right(node)) {
                                    stats_update_find_fail(db, &hints);
                                    ham_assert(node == ham_page_get_btree_node(page), (0));
                                    stats_update_any_bound(db, page, key, hints.original_flags, -1);
                                    return HAM_KEY_NOT_FOUND;
                                }

                                hints.cost++;
                                st = db_fetch_page(&page, db, btree_node_get_right(node), 0);
                                if (!page) {
                                    ham_assert(st, (0));
                                    stats_update_find_fail(db, &hints);
                                    return st ? st : HAM_INTERNAL_ERROR;
                                }
                                node = ham_page_get_btree_node(page);
                                idx = 0;
                            }
                            ham_key_set_intflags(key, (ham_key_get_intflags(key) & ~KEY_IS_APPROXIMATE) | KEY_IS_GT);
                        }
                        else {
                            /* no left sibling and GT was not requested: nothing to return */
                            stats_update_find_fail(db, &hints);
                            ham_assert(node == ham_page_get_btree_node(page), (0));
                            stats_update_any_bound(db, page, key, hints.original_flags, -1);
                            return HAM_KEY_NOT_FOUND;
                        }
                    }
                    else {
                        hints.cost++;
                        st = db_fetch_page(&page, db, btree_node_get_left(node), 0);
                        if (!page) {
                            ham_assert(st, (0));
                            stats_update_find_fail(db, &hints);
                            return st ? st : HAM_INTERNAL_ERROR;
                        }
                        node = ham_page_get_btree_node(page);
                        /* continue at the last entry of the left sibling */
                        idx = btree_node_get_count(node) - 1;

                        ham_key_set_intflags(key, (ham_key_get_intflags(key) & ~KEY_IS_APPROXIMATE) | KEY_IS_LT);
                    }
                }
            }
            else if (hints.original_flags & HAM_FIND_GT_MATCH) {
                /*
                 * if the index+1 is still in the page, just increment the
                 * index
                 */
                if (idx + 1 < btree_node_get_count(node)) {
                    idx++;
                }
                else {
                    /*
                     * otherwise load the right sibling page
                     */
                    if (!btree_node_get_right(node)) {
                        stats_update_find_fail(db, &hints);
                        ham_assert(node == ham_page_get_btree_node(page), (0));
                        stats_update_any_bound(db, page, key, hints.original_flags, -1);
                        return HAM_KEY_NOT_FOUND;
                    }

                    hints.cost++;
                    st = db_fetch_page(&page, db, btree_node_get_right(node), 0);
                    if (!page) {
                        ham_assert(st, (0));
                        stats_update_find_fail(db, &hints);
                        return st ? st : HAM_INTERNAL_ERROR;
                    }
                    node = ham_page_get_btree_node(page);
                    /* continue at the first entry of the right sibling */
                    idx = 0;
                }
                ham_key_set_intflags(key, (ham_key_get_intflags(key) & ~KEY_IS_APPROXIMATE) | KEY_IS_GT);
            }
        }
    }

    /* still no usable index: the leaf search found nothing */
    if (idx<0) {
        stats_update_find_fail(db, &hints);
        ham_assert(node, (0));
        ham_assert(page, (0));
        ham_assert(node == ham_page_get_btree_node(page), (0));
        stats_update_any_bound(db, page, key, hints.original_flags, -1);
        return HAM_KEY_NOT_FOUND;
    }

    /* load the entry, and store record ID and key flags */
    entry=btree_node_get_key(db, node, idx);

    /* set the cursor-position to this key */
    if (cursor) {
        ham_assert(!(bt_cursor_get_flags(cursor)&BT_CURSOR_FLAG_UNCOUPLED),
                ("coupling an uncoupled cursor, but need a nil-cursor"));
        ham_assert(!(bt_cursor_get_flags(cursor)&BT_CURSOR_FLAG_COUPLED),
                ("coupling a coupled cursor, but need a nil-cursor"));
        page_add_cursor(page, (ham_cursor_t *)cursor);
        bt_cursor_set_flags(cursor, bt_cursor_get_flags(cursor)|BT_CURSOR_FLAG_COUPLED);
        bt_cursor_set_coupled_page(cursor, page);
        bt_cursor_set_coupled_index(cursor, idx);
    }

    /*
     * during util_read_key and util_read_record, new pages might be needed,
     * and the page at which we're pointing could be moved out of memory;
     * that would mean that the cursor would be uncoupled, and we're losing
     * the 'entry'-pointer. therefore we 'lock' the page by incrementing
     * the reference counter
     *
     * every exit below this point releases the reference again
     */
    page_add_ref(page);
    ham_assert(btree_node_is_leaf(node), ("iterator points to internal node"));

    /* no need to load the key if we have an exact match: */
    if (key && (ham_key_get_intflags(key) & KEY_IS_APPROXIMATE)) {
        /* NOTE(review): this local `st` shadows the function-level `st` */
        ham_status_t st=util_read_key(db, entry, key);
        if (st) {
            page_release_ref(page);
            stats_update_find_fail(db, &hints);
            return (st);
        }
    }

    /* hand the entry's flags and record id to the record, then read it */
    if (record) {
        /* NOTE(review): this local `st` shadows the function-level `st` */
        ham_status_t st;
        record->_intflags=key_get_flags(entry);
        record->_rid=key_get_ptr(entry);
        st=util_read_record(db, record, flags);
        if (st) {
            page_release_ref(page);
            stats_update_find_fail(db, &hints);
            return (st);
        }
    }

    page_release_ref(page);

    /* success: record the find statistics and the bound information */
    stats_update_find(db, page, &hints);
    ham_assert(node == ham_page_get_btree_node(page), (0));
    stats_update_any_bound(db, page, key, hints.original_flags, idx);

    return (0);
}
/**
 * Place @a key into the btree node stored in @a page without splitting it.
 *
 * The function does not check for free space itself; presumably the caller
 * only invokes it when the node can take one more entry -- confirm against
 * the caller.
 *
 * Behaviour in short:
 *  - the target slot is 0 for an empty node or a forced prepend, the current
 *    key count for a forced append, and otherwise comes from
 *    btree_get_slot();
 *  - when the key is already present, HAM_OVERWRITE / HAM_DUPLICATE in
 *    hints->flags decide whether the existing slot is reused; without
 *    either flag HAM_DUPLICATE_KEY is returned. An overwrite in a non-leaf
 *    node returns HAM_SUCCESS immediately;
 *  - for a new key, the entries from the slot onwards are moved up by one
 *    entry and the slot is zeroed;
 *  - a leaf stores the record through key_set_record(), any other node
 *    stores @a rid as the pointer;
 *  - the cursor (if any) is reset to nil and coupled to the slot;
 *  - for a new key the key bytes are written (plus an extended-key blob
 *    when the key is larger than the database key size) and the node's key
 *    count is incremented.
 *
 * @param page    page holding the btree node to modify
 * @param key     key to insert
 * @param rid     pointer value stored with the key in non-leaf nodes
 * @param record  record handed to key_set_record() in leaf nodes
 * @param cursor  optional cursor to couple to the written slot
 * @param hints   insert hints; `flags`, `force_append` and `force_prepend`
 *                are read, `cost`, `processed_leaf_page` and
 *                `processed_slot` are updated
 *
 * @return 0 (HAM_SUCCESS) on success, HAM_DUPLICATE_KEY for an existing key
 *         without overwrite/duplicate flag, HAM_INTERNAL_ERROR when no
 *         extended-key blob id was produced without a status, or a status
 *         forwarded from one of the called helpers.
 */
static ham_status_t __insert_nosplit(ham_page_t *page, ham_key_t *key,
        ham_offset_t rid, ham_record_t *record, ham_bt_cursor_t *cursor,
        insert_hints_t *hints)
{
    ham_db_t *db = page_get_owner(page);
    btree_node_t *node;
    int_key_t *bte = 0;
    ham_status_t st;
    ham_u16_t num_keys;
    ham_size_t max_keysize;
    ham_size_t dupe_id = 0;
    ham_bool_t replace = HAM_FALSE;
    /* an empty node and a forced prepend both insert at the very front */
    ham_s32_t pos = 0;

    ham_assert(page_get_owner(page), (0));
    ham_assert(device_get_env(page_get_device(page)) == db_get_env(page_get_owner(page)), (0));

    node = ham_page_get_btree_node(page);
    num_keys = btree_node_get_count(node);
    max_keysize = db_get_keysize(db);

    /*
     * step 1: pick the slot
     */
    if (num_keys != 0) {
        if (hints->force_append) {
            pos = num_keys;
        }
        else if (!hints->force_prepend) {
            int cmp;

            hints->cost++;
            st = btree_get_slot(db, page, key, &pos, &cmp);
            if (st)
                return (st);

            if (pos == -1) {
                /* the new key goes in front of everything else */
                pos = 0;
            }
            else if (cmp == 0) {
                /* the key is already stored in this node */
                if (hints->flags & HAM_OVERWRITE) {
                    /*
                     * the key itself stays as it is; only the data of a
                     * leaf entry has to be replaced
                     */
                    if (!btree_node_is_leaf(node))
                        return (HAM_SUCCESS);
                }
                else if (!(hints->flags & HAM_DUPLICATE)) {
                    return (HAM_DUPLICATE_KEY);
                }

                /* reuse the existing slot; nothing is shifted */
                replace = HAM_TRUE;
            }
            else if (cmp > 0) {
                /* the new key sorts behind the slot key */
                pos++;
            }
        }
    }

    /*
     * step 2: make room for a new entry
     */
    bte = btree_node_get_key(db, node, pos);
    ham_assert(bte, (0));

    if (!replace) {
        /* bytes occupied by one entry: key header plus inline key */
        const size_t entry_bytes = db_get_int_key_header_size()+max_keysize;

        if (num_keys > pos) {
            /* cursors are uncoupled before the entries behind them move */
            st = bt_uncouple_all_cursors(page, pos);
            if (st)
                return (st);

            hints->cost += stats_memmove_cost(entry_bytes*(num_keys-pos));
            memmove(((char *)bte)+entry_bytes, bte,
                    entry_bytes*(num_keys-pos));
        }

        /* a fresh entry starts out zeroed */
        memset(bte, 0, entry_bytes);
    }

    /*
     * step 3: store the payload -- a child pointer in internal nodes, the
     * record (insert, overwrite or duplicate, depending on the flags) in
     * leaf nodes
     */
    if (!btree_node_is_leaf(node)) {
        key_set_ptr(bte, rid);
    }
    else {
        hints->cost++;
        st = key_set_record(db, bte, record,
                cursor ? bt_cursor_get_dupe_id(cursor) : 0,
                hints->flags, &dupe_id);
        if (st)
            return (st);

        hints->processed_leaf_page = page;
        hints->processed_slot = pos;
    }

    page_set_dirty(page, db_get_env(db));
    key_set_size(bte, key->size);

    /* keys larger than the inline key size are flagged as extended */
    if (key->size > max_keysize)
        key_set_flags(bte, key_get_flags(bte)|KEY_IS_EXTENDED);

    /*
     * step 4: reset the cursor to nil, then couple it to the slot
     */
    if (cursor) {
        st = bt_cursor_set_to_nil(cursor);
        if (st)
            return (st);

        ham_assert(!(bt_cursor_get_flags(cursor)&BT_CURSOR_FLAG_UNCOUPLED),
                ("coupling an uncoupled cursor, but need a nil-cursor"));
        ham_assert(!(bt_cursor_get_flags(cursor)&BT_CURSOR_FLAG_COUPLED),
                ("coupling a coupled cursor, but need a nil-cursor"));

        bt_cursor_set_flags(cursor,
                bt_cursor_get_flags(cursor)|BT_CURSOR_FLAG_COUPLED);
        bt_cursor_set_coupled_page(cursor, page);
        bt_cursor_set_coupled_index(cursor, pos);
        bt_cursor_set_dupe_id(cursor, dupe_id);
        memset(bt_cursor_get_dupe_cache(cursor), 0, sizeof(dupe_entry_t));
        page_add_cursor(page, (ham_cursor_t *)cursor);
    }

    /* an existing key keeps its key bytes and the node keeps its count */
    if (replace)
        return (0);

    /*
     * step 5: write the key bytes; at most the inline key size is stored
     * in the node itself
     */
    key_set_key(bte, key->data,
            key->size > max_keysize ? max_keysize : key->size);

    /* larger keys are also written to a blob whose id is kept in the entry */
    if (key->size > max_keysize) {
        ham_offset_t blobid;

        key_set_key(bte, key->data, max_keysize);

        st = key_insert_extended(&blobid, db, page, key);
        ham_assert(st ? blobid == 0 : 1, (0));
        if (!blobid)
            return st ? st : HAM_INTERNAL_ERROR;

        key_set_extended_rid(db, bte, blobid);
    }

    /*
     * step 6: the node now holds one more key
     */
    btree_node_set_count(node, num_keys+1);

    return (0);
}