/*
 * __free_page_row_leaf --
 *     Discard a WT_PAGE_ROW_LEAF page.
 */
static void
__free_page_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    WT_IKEY *ikey;
    WT_ROW *rip;
    uint32_t i;

    /*
     * Free the in-memory index array.
     *
     * For each entry, see if the key was an allocation (that is, if it
     * points somewhere other than the original page), and if so, free
     * the memory.
     */
    WT_ROW_FOREACH(page, rip, i)
        if ((ikey = rip->key) != NULL && __wt_off_page(page, ikey))
            __wt_free(session, ikey);
    __wt_free(session, page->u.row.d);

    /*
     * Free the insert array.
     *
     * Row-store tables have one additional slot in the insert array: it
     * holds keys that sort before any key found on the original page.
     */
    if (page->u.row.ins != NULL)
        __free_skip_array(session, page->u.row.ins, page->entries + 1);

    /* Free the update array. */
    if (page->u.row.upd != NULL)
        __free_update(session, page->u.row.upd, page->entries);
}
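/*
 * The loop above relies on an "off-page" test: a key pointer either aims
 * into the page's disk image (nothing to free) or at a separate heap
 * allocation (which must be freed).  What follows is a minimal,
 * self-contained sketch of that idea, not WiredTiger code -- the names
 * (demo_page, demo_off_page) are hypothetical, and it assumes the disk
 * image is a single contiguous buffer.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct demo_page {
    uint8_t *image;       /* Start of the page's disk image */
    size_t image_size;    /* Size of the disk image in bytes */
};

/*
 * demo_off_page --
 *     Return true if the pointer does NOT fall inside the page's disk
 *     image, i.e., it references separately allocated memory.  (Comparing
 *     unrelated pointers is technically implementation-defined; this is
 *     the practical range check the pattern depends on.)
 */
static bool
demo_off_page(const struct demo_page *page, const void *p)
{
    const uint8_t *cp = p;

    return (page->image == NULL ||
        cp < page->image || cp >= page->image + page->image_size);
}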
/*
 * __free_page_row_int --
 *     Discard a WT_PAGE_ROW_INT page.
 */
static void
__free_page_row_int(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    WT_IKEY *ikey;
    WT_REF *ref;
    uint32_t i;

    /*
     * Free any allocated keys and addresses.
     *
     * For each WT_REF, free the instantiated key, and if the referenced
     * addr was an allocation, free it as well.
     */
    WT_REF_FOREACH(page, ref, i) {
        if ((ikey = ref->u.key) != NULL)
            __wt_free(session, ikey);
        if (ref->addr != NULL && __wt_off_page(page, ref->addr)) {
            __wt_free(session, ((WT_ADDR *)ref->addr)->addr);
            __wt_free(session, ref->addr);
        }
    }

    /* Free the subtree-reference array. */
    __wt_free(session, page->u.intl.t);
}
/*
 * __rec_page_dirty_update --
 *     Update a dirty page's reference on eviction.
 */
static int
__rec_page_dirty_update(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    WT_ADDR *addr;
    WT_PAGE_MODIFY *mod;
    WT_REF *parent_ref;

    mod = page->modify;
    parent_ref = page->ref;

    switch (F_ISSET(mod, WT_PM_REC_MASK)) {
    case WT_PM_REC_REPLACE:                        /* 1-for-1 page swap */
        if (parent_ref->addr != NULL &&
            __wt_off_page(page->parent, parent_ref->addr)) {
            __wt_free(session, ((WT_ADDR *)parent_ref->addr)->addr);
            __wt_free(session, parent_ref->addr);
        }

        /*
         * Update the parent to reference the replacement page.
         *
         * Publish: a barrier to ensure the structure fields are set
         * before the state change makes the page available to readers.
         */
        WT_RET(__wt_calloc(session, 1, sizeof(WT_ADDR), &addr));
        *addr = mod->u.replace;
        mod->u.replace.addr = NULL;
        mod->u.replace.size = 0;

        parent_ref->page = NULL;
        parent_ref->addr = addr;
        WT_PUBLISH(parent_ref->state, WT_REF_DISK);
        break;
    case WT_PM_REC_SPLIT:                          /* Page split */
        /*
         * Update the parent to reference new internal page(s).
         *
         * Publish: a barrier to ensure the structure fields are set
         * before the state change makes the page available to readers.
         */
        parent_ref->page = mod->u.split;
        WT_PUBLISH(parent_ref->state, WT_REF_MEM);

        /* Clear the reference else discarding the page will free it. */
        mod->u.split = NULL;
        F_CLR(mod, WT_PM_REC_SPLIT);
        break;
    case WT_PM_REC_EMPTY:                          /* Page is empty */
        /* We checked if the page was empty when we reviewed it. */
        /* FALLTHROUGH */
    WT_ILLEGAL_VALUE(session);
    }

    return (0);
}
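/*
 * The "Publish:" comments above describe a lock-free handoff: fill in all of
 * a structure's fields first, then issue a write barrier, then flip the
 * state word that makes the structure visible to concurrent readers.  Below
 * is a minimal, self-contained sketch of that pattern using C11 atomics;
 * the names (demo_ref, DEMO_REF_DISK, ...) are hypothetical, and the
 * release/acquire pair stands in for the barrier-then-assign idiom the
 * comments describe.
 */
#include <stdatomic.h>
#include <stddef.h>

enum demo_ref_state { DEMO_REF_DISK, DEMO_REF_MEM };

struct demo_ref {
    void *page;                         /* In-memory page, if any */
    const void *addr;                   /* On-disk address, if any */
    _Atomic enum demo_ref_state state;  /* Published last */
};

/* Writer: set the fields, then publish the new state with release order. */
static void
demo_publish_disk(struct demo_ref *ref, const void *addr)
{
    ref->page = NULL;
    ref->addr = addr;
    atomic_store_explicit(&ref->state, DEMO_REF_DISK, memory_order_release);
}

/* Reader: an acquire load of the state makes the fields safely visible. */
static const void *
demo_read_addr(struct demo_ref *ref)
{
    if (atomic_load_explicit(&ref->state, memory_order_acquire) ==
        DEMO_REF_DISK)
        return (ref->addr);
    return (NULL);
}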
/*
 * __free_page_col_int --
 *     Discard a WT_PAGE_COL_INT page.
 */
static void
__free_page_col_int(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    WT_REF *ref;
    uint32_t i;

    /*
     * For each referenced addr, see if the addr was an allocation, and if
     * so, free it.
     */
    WT_REF_FOREACH(page, ref, i)
        if (ref->addr != NULL && __wt_off_page(page, ref->addr)) {
            __wt_free(session, ((WT_ADDR *)ref->addr)->addr);
            __wt_free(session, ref->addr);
        }
}
/*
 * __wt_free_ref --
 *     Discard the contents of a WT_REF structure (optionally including the
 *     pages it references).
 */
void
__wt_free_ref(
    WT_SESSION_IMPL *session, WT_PAGE *page, WT_REF *ref, bool free_pages)
{
    WT_IKEY *ikey;

    if (ref == NULL)
        return;

    /*
     * Optionally free the referenced pages. (The path that frees referenced
     * pages is only used for error cleanup; no instantiated-and-then-
     * discarded page should have WT_REF entries with real pages. The page
     * may have been marked dirty as well; page discard checks for that, so
     * we mark it clean explicitly.)
     */
    if (free_pages && ref->page != NULL) {
        __wt_page_modify_clear(session, ref->page);
        __wt_page_out(session, &ref->page);
    }

    /* Free any key allocation. */
    switch (page->type) {
    case WT_PAGE_ROW_INT:
    case WT_PAGE_ROW_LEAF:
        if ((ikey = __wt_ref_key_instantiated(ref)) != NULL)
            __wt_free(session, ikey);
        break;
    }

    /* Free any address allocation. */
    if (ref->addr != NULL && __wt_off_page(page, ref->addr)) {
        __wt_free(session, ((WT_ADDR *)ref->addr)->addr);
        __wt_free(session, ref->addr);
    }

    /* Free any page-deleted information. */
    if (ref->page_del != NULL) {
        __wt_free(session, ref->page_del->update_list);
        __wt_free(session, ref->page_del);
    }

    __wt_overwrite_and_free(session, ref);
}
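/*
 * __wt_overwrite_and_free above points at a useful debugging technique:
 * before freeing a structure, overwrite it with a recognizable byte pattern
 * so any stale reader sees obvious garbage rather than subtly reused memory.
 * A minimal, self-contained sketch of the idea follows; the name
 * demo_overwrite_and_free and the 0xab fill byte are assumptions for
 * illustration, not WiredTiger's actual implementation.
 */
#include <stdlib.h>
#include <string.h>

static void
demo_overwrite_and_free(void *p, size_t size)
{
    if (p == NULL)
        return;

    memset(p, 0xab, size);    /* Poison the memory first... */
    free(p);                  /* ...then release it. */
}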
/*
 * __free_page_row_leaf --
 *     Discard a WT_PAGE_ROW_LEAF page.
 */
static void
__free_page_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    WT_IKEY *ikey;
    WT_ROW *rip;
    uint32_t i;

    /*
     * Free the in-memory index array.
     *
     * For each entry, see if the key was an allocation (that is, if it
     * points somewhere other than the original page), and if so, free
     * the memory.
     */
    WT_ROW_FOREACH(page, rip, i) {
        ikey = WT_ROW_KEY_COPY(rip);
        if (ikey != NULL && __wt_off_page(page, ikey))
            __wt_free(session, ikey);
    }
}
/*
 * __wt_delete_page --
 *     If deleting a range, try to delete the page without instantiating it.
 */
int
__wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
{
    WT_DECL_RET;
    WT_PAGE *parent;

    *skipp = false;

    /* If we have a clean page in memory, attempt to evict it. */
    if (ref->state == WT_REF_MEM &&
        __wt_atomic_casv32(&ref->state, WT_REF_MEM, WT_REF_LOCKED)) {
        if (__wt_page_is_modified(ref->page)) {
            WT_PUBLISH(ref->state, WT_REF_MEM);
            return (0);
        }

        (void)__wt_atomic_addv32(&S2BT(session)->evict_busy, 1);
        ret = __wt_evict_page(session, ref);
        (void)__wt_atomic_subv32(&S2BT(session)->evict_busy, 1);
        WT_RET_BUSY_OK(ret);
    }

    /*
     * Atomically switch the page's state to lock it. If the page is not
     * on disk, other threads may be using it, so there's no fast delete.
     *
     * Possible optimization: if the page is already deleted and the delete
     * is visible to us (the delete has been committed), we could skip the
     * page instead of instantiating it and figuring out there are no rows
     * in the page. While that's a huge amount of work to no purpose, it's
     * unclear optimizing for overlapping range deletes is worth the effort.
     */
    if (ref->state != WT_REF_DISK ||
        !__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_LOCKED))
        return (0);

    /*
     * We cannot fast-delete pages that have overflow key/value items, as
     * the overflow blocks have to be discarded. The way we figure that
     * out is to check the on-page cell type for the page: cells for leaf
     * pages that have no overflow items are special.
     *
     * In some cases, the reference address may not reference an on-page
     * cell (for example, some combination of page splits), in which case
     * we can't check the original cell value and we fail.
     *
     * To look at an on-page cell, we need to look at the parent page, and
     * that's dangerous: our parent page could change without warning if
     * it were to split, deepening the tree. Even so, it's safe: the page's
     * reference will always point to some valid page, and if we find any
     * problems we simply fail the fast-delete optimization.
     *
     * !!!
     * I doubt it's worth the effort, but we could copy the cell's type
     * into the reference structure, and then we wouldn't need an on-page
     * cell.
     */
    parent = ref->home;
    if (__wt_off_page(parent, ref->addr) ||
        __wt_cell_type_raw(ref->addr) != WT_CELL_ADDR_LEAF_NO)
        goto err;

    /*
     * This action dirties the parent page: mark it dirty now, there's no
     * future reconciliation of the child leaf page that will dirty it as
     * we write the tree.
     */
    WT_ERR(__wt_page_parent_modify_set(session, ref, false));

    /*
     * Record the change in the transaction structure and set the change's
     * transaction ID.
     */
    WT_ERR(__wt_calloc_one(session, &ref->page_del));
    ref->page_del->txnid = session->txn.id;

    WT_ERR(__wt_txn_modify_ref(session, ref));

    *skipp = true;
    WT_PUBLISH(ref->state, WT_REF_DELETED);
    return (0);

err:
    __wt_free(session, ref->page_del);

    /* Restore the page to on-disk status; we'll have to instantiate it. */
    WT_PUBLISH(ref->state, WT_REF_DISK);
    return (ret);
}
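/*
 * The fast-delete path above hinges on taking a per-reference "lock" with a
 * single compare-and-swap on the state word: only the thread that moves the
 * state from DISK to LOCKED may proceed, everyone else backs off.  Here is
 * a minimal, self-contained sketch of that idiom with C11 atomics; the
 * names are hypothetical and error handling is reduced to a bool.
 */
#include <stdatomic.h>
#include <stdbool.h>

enum demo_state { DEMO_DISK, DEMO_LOCKED, DEMO_DELETED };

static bool
demo_try_fast_delete(_Atomic enum demo_state *state)
{
    enum demo_state expected = DEMO_DISK;

    /* Lock the reference: only succeed if it is currently on disk. */
    if (!atomic_compare_exchange_strong(state, &expected, DEMO_LOCKED))
        return (false);

    /*
     * ... record the delete, dirty the parent, and so on ...
     * On success, publish the new state; on error, a real implementation
     * would restore DEMO_DISK instead.
     */
    atomic_store_explicit(state, DEMO_DELETED, memory_order_release);
    return (true);
}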
/*
 * __split_ref_deepen_move --
 *     Move a WT_REF from a parent to a child in service of a split to deepen
 *     the tree, including updating the accounting information.
 */
static int
__split_ref_deepen_move(WT_SESSION_IMPL *session,
    WT_PAGE *parent, WT_REF *ref, size_t *parent_decrp, size_t *child_incrp)
{
    WT_ADDR *addr;
    WT_CELL_UNPACK unpack;
    WT_DECL_RET;
    WT_IKEY *ikey;
    size_t size;
    void *key;

    /*
     * Instantiate row-store keys, and column- and row-store addresses in
     * the WT_REF structures referenced by a page that's being split (and
     * deepening the tree). The WT_REF structures aren't moving, but the
     * index references are moving from the page we're splitting to a set
     * of child pages, and so we can no longer reference the block image
     * that remains with the page being split.
     *
     * No locking is required to update the WT_REF structure because we're
     * the only thread splitting the parent page, and there's no way for
     * readers to race with our updates of single pointers. The changes
     * have to be written before the page goes away, of course; our caller
     * owns that problem.
     *
     * Row-store keys, first.
     */
    if (parent->type == WT_PAGE_ROW_INT) {
        if ((ikey = __wt_ref_key_instantiated(ref)) == NULL) {
            __wt_ref_key(parent, ref, &key, &size);
            WT_RET(__wt_row_ikey(session, 0, key, size, ref));
            ikey = ref->key.ikey;
        } else {
            WT_RET(__split_ovfl_key_cleanup(session, parent, ref));
            *parent_decrp += sizeof(WT_IKEY) + ikey->size;
        }
        *child_incrp += sizeof(WT_IKEY) + ikey->size;
    }

    /*
     * If there's no address (the page has never been written), or the
     * address has been instantiated, there's no work to do. Otherwise,
     * get the address from the on-page cell.
     */
    addr = ref->addr;
    if (addr != NULL && !__wt_off_page(parent, addr)) {
        __wt_cell_unpack((WT_CELL *)ref->addr, &unpack);
        WT_RET(__wt_calloc_one(session, &addr));
        if ((ret = __wt_strndup(
            session, unpack.data, unpack.size, &addr->addr)) != 0) {
            __wt_free(session, addr);
            return (ret);
        }
        addr->size = (uint8_t)unpack.size;
        addr->type =
            unpack.raw == WT_CELL_ADDR_INT ? WT_ADDR_INT : WT_ADDR_LEAF;
        ref->addr = addr;
    }

    /* And finally, the WT_REF itself. */
    WT_MEM_TRANSFER(*parent_decrp, *child_incrp, sizeof(WT_REF));

    return (0);
}
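/*
 * WT_MEM_TRANSFER above is bookkeeping: the same bytes stop being charged
 * to the parent page and start being charged to the new child.  A minimal
 * sketch of that accounting follows; the helper name is hypothetical, and
 * accumulating the deltas in caller-supplied counters (applied once at the
 * end of the split) mirrors the parent_decrp/child_incrp parameters above,
 * but the code itself is illustrative only.
 */
#include <stddef.h>

static void
demo_mem_transfer(size_t *parent_decrp, size_t *child_incrp, size_t len)
{
    *parent_decrp += len;    /* Bytes to subtract from the parent's footprint */
    *child_incrp += len;     /* Bytes to add to the child's footprint */
}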
/*
 * __rec_page_dirty_update --
 *     Update a dirty page's reference on eviction.
 */
static int
__rec_page_dirty_update(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    WT_ADDR *addr;
    WT_PAGE_MODIFY *mod;
    WT_REF *parent_ref;

    mod = page->modify;
    parent_ref = page->ref;

    switch (F_ISSET(mod, WT_PM_REC_MASK)) {
    case WT_PM_REC_EMPTY:                          /* Page is empty */
        if (parent_ref->addr != NULL &&
            __wt_off_page(page->parent, parent_ref->addr)) {
            __wt_free(session, ((WT_ADDR *)parent_ref->addr)->addr);
            __wt_free(session, parent_ref->addr);
        }

        /*
         * Update the parent to reference an empty page.
         *
         * Set the transaction ID to WT_TXN_NONE because the fact that
         * reconciliation left the page "empty" means there's no older
         * transaction in the system that might need to see an earlier
         * version of the page. It isn't necessary (WT_TXN_NONE is 0),
         * but it's the right thing to do.
         *
         * Publish: a barrier to ensure the structure fields are set
         * before the state change makes the page available to readers.
         */
        parent_ref->page = NULL;
        parent_ref->addr = NULL;
        parent_ref->txnid = WT_TXN_NONE;
        WT_PUBLISH(parent_ref->state, WT_REF_DELETED);
        break;
    case WT_PM_REC_REPLACE:                        /* 1-for-1 page swap */
        if (parent_ref->addr != NULL &&
            __wt_off_page(page->parent, parent_ref->addr)) {
            __wt_free(session, ((WT_ADDR *)parent_ref->addr)->addr);
            __wt_free(session, parent_ref->addr);
        }

        /*
         * Update the parent to reference the replacement page.
         *
         * Publish: a barrier to ensure the structure fields are set
         * before the state change makes the page available to readers.
         */
        WT_RET(__wt_calloc(session, 1, sizeof(WT_ADDR), &addr));
        *addr = mod->u.replace;
        mod->u.replace.addr = NULL;
        mod->u.replace.size = 0;

        parent_ref->page = NULL;
        parent_ref->addr = addr;
        WT_PUBLISH(parent_ref->state, WT_REF_DISK);
        break;
    case WT_PM_REC_SPLIT:                          /* Page split */
        /*
         * Update the parent to reference new internal page(s).
         *
         * Publish: a barrier to ensure the structure fields are set
         * before the state change makes the page available to readers.
         */
        parent_ref->page = mod->u.split;
        WT_PUBLISH(parent_ref->state, WT_REF_MEM);

        /* Clear the reference else discarding the page will free it. */
        mod->u.split = NULL;
        F_CLR(mod, WT_PM_REC_SPLIT);
        break;
    WT_ILLEGAL_VALUE(session);
    }

    return (0);
}
/*
 * __wt_delete_page --
 *     If deleting a range, try to delete the page without instantiating it.
 */
int
__wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
{
    WT_DECL_RET;
    WT_PAGE *parent;

    *skipp = false;

    /* If we have a clean page in memory, attempt to evict it. */
    if (ref->state == WT_REF_MEM &&
        __wt_atomic_casv32(&ref->state, WT_REF_MEM, WT_REF_LOCKED)) {
        if (__wt_page_is_modified(ref->page)) {
            WT_PUBLISH(ref->state, WT_REF_MEM);
            return (0);
        }

        (void)__wt_atomic_addv32(&S2BT(session)->evict_busy, 1);
        ret = __wt_evict(session, ref, false);
        (void)__wt_atomic_subv32(&S2BT(session)->evict_busy, 1);
        WT_RET_BUSY_OK(ret);
    }

    /*
     * Atomically switch the page's state to lock it. If the page is not
     * on disk, other threads may be using it, so there's no fast delete.
     */
    if (ref->state != WT_REF_DISK ||
        !__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_LOCKED))
        return (0);

    /*
     * We cannot fast-delete pages that have overflow key/value items, as
     * the overflow blocks have to be discarded. The way we figure that
     * out is to check the page's cell type: cells for leaf pages without
     * overflow items are special.
     *
     * To look at an on-page cell, we need to look at the parent page, and
     * that's dangerous: our parent page could change without warning if
     * it were to split, deepening the tree. Even so, it's safe: the page's
     * reference will always point to some valid page, and if we find any
     * problems we simply fail the fast-delete optimization.
     */
    parent = ref->home;
    if (__wt_off_page(parent, ref->addr) ?
        ((WT_ADDR *)ref->addr)->type != WT_ADDR_LEAF_NO :
        __wt_cell_type_raw(ref->addr) != WT_CELL_ADDR_LEAF_NO)
        goto err;

    /*
     * This action dirties the parent page: mark it dirty now, there's no
     * future reconciliation of the child leaf page that will dirty it as
     * we write the tree.
     */
    WT_ERR(__wt_page_parent_modify_set(session, ref, false));

    /*
     * Record the change in the transaction structure and set the change's
     * transaction ID.
     */
    WT_ERR(__wt_calloc_one(session, &ref->page_del));
    ref->page_del->txnid = session->txn.id;

    WT_ERR(__wt_txn_modify_ref(session, ref));

    *skipp = true;
    WT_STAT_CONN_INCR(session, rec_page_delete_fast);
    WT_STAT_DATA_INCR(session, rec_page_delete_fast);

    WT_PUBLISH(ref->state, WT_REF_DELETED);
    return (0);

err:
    __wt_free(session, ref->page_del);

    /* Restore the page to on-disk status; we'll have to instantiate it. */
    WT_PUBLISH(ref->state, WT_REF_DISK);
    return (ret);
}
/*
 * __wt_kv_return --
 *     Return a page referenced key/value pair to the application.
 */
int
__wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int key_ret)
{
    WT_BTREE *btree;
    WT_CELL *cell;
    WT_CELL_UNPACK *unpack, _unpack;
    WT_CURSOR *cursor;
    WT_IKEY *ikey;
    WT_PAGE *page;
    WT_ROW *rip;
    WT_UPDATE *upd;
    uint8_t v;

    btree = session->btree;
    unpack = &_unpack;
    page = cbt->page;
    cursor = &cbt->iface;

    switch (page->type) {
    case WT_PAGE_COL_FIX:
        if (key_ret)
            cursor->recno = cbt->recno;

        /*
         * If the cursor references a WT_INSERT item, take the related
         * WT_UPDATE item.
         */
        if (cbt->ins != NULL) {
            upd = cbt->ins->upd;
            cursor->value.data = WT_UPDATE_DATA(upd);
            cursor->value.size = upd->size;
            return (0);
        }
        v = __bit_getv_recno(page, cbt->iface.recno, btree->bitcnt);
        return (__wt_buf_set(session, &cursor->value, &v, 1));
    case WT_PAGE_COL_VAR:
        if (key_ret)
            cursor->recno = cbt->recno;

        /*
         * If the cursor references a WT_INSERT item, take the related
         * WT_UPDATE item.
         */
        if (cbt->ins != NULL) {
            upd = cbt->ins->upd;
            cursor->value.data = WT_UPDATE_DATA(upd);
            cursor->value.size = upd->size;
            return (0);
        }
        cell = WT_COL_PTR(page, &page->u.col_var.d[cbt->slot]);
        break;
    case WT_PAGE_ROW_LEAF:
        rip = &page->u.row.d[cbt->slot];

        /*
         * If the cursor references a WT_INSERT item, take the key and
         * related WT_UPDATE item. Otherwise, take the key from the
         * original page, and the value from any related WT_UPDATE item,
         * or the page if the key was never updated.
         */
        if (cbt->ins == NULL) {
            if (key_ret) {
                if (__wt_off_page(page, rip->key)) {
                    ikey = rip->key;
                    cursor->key.data = WT_IKEY_DATA(ikey);
                    cursor->key.size = ikey->size;
                } else
                    WT_RET(__wt_row_key(
                        session, page, rip, &cursor->key));
            }
            upd = WT_ROW_UPDATE(page, rip);
        } else {
            if (key_ret) {
                cursor->key.data = WT_INSERT_KEY(cbt->ins);
                cursor->key.size = WT_INSERT_KEY_SIZE(cbt->ins);
            }
            upd = cbt->ins->upd;
        }

        if (upd != NULL) {
            cursor->value.data = WT_UPDATE_DATA(upd);
            cursor->value.size = upd->size;
            return (0);
        }

        /* Take the original cell (which may be empty). */
        if ((cell = __wt_row_value(page, rip)) == NULL) {
            cursor->value.size = 0;
            return (0);
        }
        break;
    WT_ILLEGAL_VALUE(session);
    }

    /* It's a cell, unpack and expand it as necessary. */
    __wt_cell_unpack(cell, unpack);
    if (btree->huffman_value == NULL && unpack->type == WT_CELL_VALUE) {
        cursor->value.data = unpack->data;
        cursor->value.size = unpack->size;
        return (0);
    }
    return (__wt_cell_unpack_copy(session, unpack, &cursor->value));
}
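/*
 * The row-leaf case above illustrates the general value-return rule: an
 * in-memory update (if any) shadows the value stored in the on-page cell,
 * so the lookup checks the update first and only unpacks the cell as a
 * fallback, treating a missing cell as an empty value.  A minimal,
 * self-contained sketch of that precedence follows; the types and the
 * demo_get_value helper are hypothetical, for illustration only.
 */
#include <stddef.h>

struct demo_update {          /* Newest in-memory modification, if any */
    const void *data;
    size_t size;
};

struct demo_cell {            /* Value as it appears in the on-page image */
    const void *data;
    size_t size;
};

static void
demo_get_value(const struct demo_update *upd, const struct demo_cell *cell,
    const void **datap, size_t *sizep)
{
    if (upd != NULL) {            /* In-memory update wins... */
        *datap = upd->data;
        *sizep = upd->size;
    } else if (cell != NULL) {    /* ...otherwise use the on-page cell... */
        *datap = cell->data;
        *sizep = cell->size;
    } else {                      /* ...and an absent cell means empty. */
        *datap = NULL;
        *sizep = 0;
    }
}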