/*
 * __wt_delete_page_skip --
 *	Decide whether a cursor walk may step over a deleted page: true when
 *	the delete is visible to the calling session, false otherwise (including
 *	when the reference could not be locked).
 */
bool
__wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref)
{
	bool visible;

	/*
	 * A reference reaches the WT_REF_DELETED state in one of two ways: the
	 * page was fast-deleted, or other operations emptied the page and
	 * eviction then deleted it.
	 *
	 * A fast-deleted page carries a WT_PAGE_DELETED structure holding the
	 * ID of the deleting transaction; the delete is visible exactly when
	 * that transaction ID is visible. An emptied page carries no such
	 * structure and its delete is visible by definition: eviction could
	 * not have deleted the page while it held changes that were not
	 * globally visible.
	 *
	 * No structure means nothing to check and no need to lock.
	 */
	if (ref->page_del == NULL)
		return (true);

	/*
	 * The caller saw WT_REF_DELETED, but the page may be in the middle of
	 * being read into memory and can switch to an in-memory state at any
	 * moment. Lock the reference before examining it; if the lock can't
	 * be taken, the page must not be skipped.
	 */
	if (!__wt_atomic_casv32(&ref->state, WT_REF_DELETED, WT_REF_LOCKED))
		return (false);

	/* Re-test under the lock, then ask the transaction layer. */
	if (ref->page_del == NULL)
		visible = true;
	else
		visible = __wt_txn_visible(session, ref->page_del->txnid);

	/* Unlock by restoring the deleted state. */
	WT_PUBLISH(ref->state, WT_REF_DELETED);
	return (visible);
}
/*
 * __wt_delete_page_rollback --
 *	Undo a fast-delete: abort a page deletion that was recorded without
 *	the page having been instantiated by the deleting transaction.
 */
void
__wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
{
	WT_UPDATE **updp;

	/*
	 * Either the reference is still "deleted", exactly as the transaction
	 * left it, and flipping it back to on-disk finishes the job; or some
	 * other thread has instantiated (or is instantiating) the page. Keep
	 * retrying, yielding between attempts, because a failed instantiation
	 * can put the reference back into the deleted state.
	 */
	for (;;) {
		switch (ref->state) {
		case WT_REF_DELETED:
			/*
			 * Untouched since the delete: restore the on-disk
			 * state. Losing the race means the state changed,
			 * go around again.
			 */
			if (__wt_atomic_casv32(
			    &ref->state, WT_REF_DELETED, WT_REF_DISK))
				return;
			break;
		case WT_REF_LOCKED:
			/* Legal: the page is being instantiated, wait. */
			break;
		case WT_REF_MEM:
		case WT_REF_SPLIT:
			/*
			 * The normal read path isn't available for getting at
			 * the page: the session may already have closed the
			 * cursor, so there's no tree reference with which to
			 * acquire a hazard pointer. That's safe, a page with
			 * unresolved transactions isn't going anywhere.
			 *
			 * The page is in memory: mark every update recorded
			 * for the delete as aborted.
			 */
			updp = ref->page_del->update_list;
			while (*updp != NULL) {
				(*updp)->txnid = WT_TXN_ABORTED;
				++updp;
			}

			/*
			 * A transaction can't abort twice, release the
			 * bookkeeping.
			 */
			__wt_free(session, ref->page_del->update_list);
			__wt_free(session, ref->page_del);
			return;
		case WT_REF_DISK:
		case WT_REF_READING:
			/* These states should be impossible here. */
			WT_ASSERT(session, 0);
			break;
		}

		/* Give other threads a chance before looking again. */
		__wt_yield();
	}
}
/*
 * __wt_delete_page_skip --
 *	Decide whether a cursor walk may step over a deleted page. With
 *	visible_all set the delete must be globally visible, otherwise it only
 *	has to be visible to the calling session.
 */
bool
__wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all)
{
	bool skip;

	/*
	 * A reference reaches the WT_REF_DELETED state in one of two ways: the
	 * page was fast-deleted, or other operations emptied the page and
	 * eviction then deleted it.
	 *
	 * A fast-deleted page carries a WT_PAGE_DELETED structure holding the
	 * ID of the deleting transaction; the delete is visible exactly when
	 * that transaction ID is visible. An emptied page carries no such
	 * structure and its delete is visible by definition: eviction could
	 * not have deleted the page while it held changes that were not
	 * globally visible.
	 *
	 * No structure means nothing to check and no need to lock.
	 */
	if (ref->page_del == NULL)
		return (true);

	/*
	 * The caller saw WT_REF_DELETED, but the page may be in the middle of
	 * being read into memory and can switch to an in-memory state at any
	 * moment. Lock the reference before examining it; if the lock can't
	 * be taken, the page must not be skipped.
	 */
	if (!__wt_atomic_casv32(&ref->state, WT_REF_DELETED, WT_REF_LOCKED))
		return (false);

	/* Re-test under the lock, then apply the requested visibility rule. */
	if (ref->page_del == NULL)
		skip = true;
	else if (visible_all)
		skip = __wt_txn_visible_all(session, ref->page_del->txnid,
		    WT_TIMESTAMP_NULL(&ref->page_del->timestamp));
	else
		skip = __wt_txn_visible(session, ref->page_del->txnid,
		    WT_TIMESTAMP_NULL(&ref->page_del->timestamp));

	/*
	 * Once the delete is stable the page_del structure is dead weight: it
	 * is only read while the reference is in the WT_REF_DELETED state.
	 * Test for that on every pass, because after the structure is freed
	 * the reference can be checked without any synchronization. When the
	 * caller asked for global visibility the answer is already in hand,
	 * otherwise ask explicitly.
	 */
	if (skip && ref->page_del != NULL) {
		if (visible_all ||
		    __wt_txn_visible_all(session, ref->page_del->txnid,
		    WT_TIMESTAMP_NULL(&ref->page_del->timestamp))) {
			__wt_free(session, ref->page_del->update_list);
			__wt_free(session, ref->page_del);
		}
	}

	/* Unlock by restoring the deleted state. */
	WT_PUBLISH(ref->state, WT_REF_DELETED);
	return (skip);
}
/*
 * __wt_delete_page --
 *	If deleting a range, try to delete the page without instantiating it.
 *
 *	On return, *skipp is true only if the page was fast-deleted (the
 *	reference was switched to WT_REF_DELETED); it is false whenever the
 *	optimization was declined or failed. The return value is 0 when the
 *	page was fast-deleted or the optimization was simply declined, and
 *	the helper's error code when one of the calls below fails.
 */
int
__wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
{
	WT_DECL_RET;
	WT_PAGE *parent;

	*skipp = false;

	/* If we have a clean page in memory, attempt to evict it. */
	if (ref->state == WT_REF_MEM &&
	    __wt_atomic_casv32(&ref->state, WT_REF_MEM, WT_REF_LOCKED)) {
		/* A modified page can't be fast-deleted, unlock and give up. */
		if (__wt_page_is_modified(ref->page)) {
			WT_PUBLISH(ref->state, WT_REF_MEM);
			return (0);
		}

		(void)__wt_atomic_addv32(&S2BT(session)->evict_busy, 1);
		ret = __wt_evict_page(session, ref);
		(void)__wt_atomic_subv32(&S2BT(session)->evict_busy, 1);
		WT_RET_BUSY_OK(ret);

		/*
		 * A tolerated "busy" result may still be sitting in ret
		 * (NOTE(review): WT_RET_BUSY_OK is defined elsewhere; by its
		 * name it returns on errors other than busy and does not
		 * reset ret). Clear it: a failed eviction is not an error for
		 * this function, and the bare "goto err" below must not
		 * return a stale error when it merely declines the
		 * fast-delete.
		 */
		ret = 0;
	}

	/*
	 * Atomically switch the page's state to lock it. If the page is not
	 * on-disk, other threads may be using it, no fast delete.
	 *
	 * Possible optimization: if the page is already deleted and the delete
	 * is visible to us (the delete has been committed), we could skip the
	 * page instead of instantiating it and figuring out there are no rows
	 * in the page. While that's a huge amount of work to no purpose, it's
	 * unclear optimizing for overlapping range deletes is worth the effort.
	 */
	if (ref->state != WT_REF_DISK ||
	    !__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_LOCKED))
		return (0);

	/*
	 * We cannot fast-delete pages that have overflow key/value items as
	 * the overflow blocks have to be discarded. The way we figure that
	 * out is to check the on-page cell type for the page, cells for leaf
	 * pages that have no overflow items are special.
	 *
	 * In some cases, the reference address may not reference an on-page
	 * cell (for example, some combination of page splits), in which case
	 * we can't check the original cell value and we fail.
	 *
	 * To look at an on-page cell, we need to look at the parent page, and
	 * that's dangerous, our parent page could change without warning if
	 * the parent page were to split, deepening the tree. It's safe: the
	 * page's reference will always point to some valid page, and if we find
	 * any problems we simply fail the fast-delete optimization.
	 *
	 * !!!
	 * I doubt it's worth the effort, but we could copy the cell's type into
	 * the reference structure, and then we wouldn't need an on-page cell.
	 */
	parent = ref->home;
	if (__wt_off_page(parent, ref->addr) ||
	    __wt_cell_type_raw(ref->addr) != WT_CELL_ADDR_LEAF_NO)
		goto err;	/* Declined, not failed: ret is 0 here. */

	/*
	 * This action dirties the parent page: mark it dirty now, there's no
	 * future reconciliation of the child leaf page that will dirty it as
	 * we write the tree.
	 */
	WT_ERR(__wt_page_parent_modify_set(session, ref, false));

	/*
	 * Record the change in the transaction structure and set the change's
	 * transaction ID.
	 */
	WT_ERR(__wt_calloc_one(session, &ref->page_del));
	ref->page_del->txnid = session->txn.id;

	WT_ERR(__wt_txn_modify_ref(session, ref));

	*skipp = true;
	WT_PUBLISH(ref->state, WT_REF_DELETED);
	return (0);

err:	__wt_free(session, ref->page_del);

	/*
	 * Restore the page to on-disk status, we'll have to instantiate it.
	 */
	WT_PUBLISH(ref->state, WT_REF_DISK);
	return (ret);
}
/*
 * __page_read --
 *	Read a page from the file.
 *
 *	Claims the reference by switching it from WT_REF_DISK to
 *	WT_REF_READING, or from WT_REF_DELETED to WT_REF_LOCKED, builds the
 *	in-memory page and publishes WT_REF_MEM. Returns 0 on success, and
 *	also 0 without doing any work when neither state switch succeeds.
 *	On error the reference is put back into the state it was claimed
 *	from and the error is returned.
 */
static int
__page_read(WT_SESSION_IMPL *session, WT_REF *ref)
{
	const WT_PAGE_HEADER *dsk;
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_ITEM tmp;
	WT_PAGE *page;
	size_t addr_size;
	uint32_t previous_state;
	const uint8_t *addr;

	btree = S2BT(session);
	page = NULL;

	/*
	 * Don't pass an allocated buffer to the underlying block read function,
	 * force allocation of new memory of the appropriate size.
	 */
	WT_CLEAR(tmp);

	/*
	 * Attempt to set the state to WT_REF_READING for normal reads, or
	 * WT_REF_LOCKED, for deleted pages. If successful, we've won the
	 * race, read the page. The state we claimed the reference from is
	 * remembered so the error path can restore it.
	 */
	if (__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_READING))
		previous_state = WT_REF_DISK;
	else if (__wt_atomic_casv32(&ref->state, WT_REF_DELETED, WT_REF_LOCKED))
		previous_state = WT_REF_DELETED;
	else
		return (0);

	/*
	 * Get the address: if there is no address, the page was deleted, but a
	 * subsequent search or insert is forcing re-creation of the name space.
	 * In that case there is nothing to read, an empty leaf page is created
	 * instead.
	 */
	WT_ERR(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
	if (addr == NULL) {
		WT_ASSERT(session, previous_state == WT_REF_DELETED);

		WT_ERR(__wt_btree_new_leaf_page(session, &page));
		ref->page = page;
		goto done;
	}

	/*
	 * There's an address, read or map the backing disk page and build an
	 * in-memory version of the page.
	 */
	WT_ERR(__wt_bt_read(session, &tmp, addr, addr_size));
	WT_ERR(__wt_page_inmem(session, ref, tmp.data, tmp.memsize,
	    WT_DATA_IN_ITEM(&tmp) ?
	    WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page));

	/*
	 * Clear the local reference to an allocated copy of the disk image on
	 * return; the page steals it, errors in this code should not free it.
	 * Note tmp.data is left alone and is still used below to inspect the
	 * page header.
	 */
	tmp.mem = NULL;

	/*
	 * If reading for a checkpoint, there's no additional work to do, the
	 * page on disk is correct as written.
	 */
	if (session->dhandle->checkpoint != NULL)
		goto done;

	/* If the page was deleted, instantiate that information. */
	if (previous_state == WT_REF_DELETED)
		WT_ERR(__wt_delete_page_instantiate(session, ref));

	/*
	 * Instantiate updates from the database's lookaside table. The page
	 * flag was set when the page was written, potentially a long time ago.
	 * We only care if the lookaside table is currently active, check that
	 * before doing any work.
	 */
	dsk = tmp.data;
	if (F_ISSET(dsk, WT_PAGE_LAS_UPDATE) && __wt_las_is_written(session)) {
		WT_STAT_FAST_CONN_INCR(session, cache_read_lookaside);
		WT_STAT_FAST_DATA_INCR(session, cache_read_lookaside);

		WT_ERR(__las_page_instantiate(
		    session, ref, btree->id, addr, addr_size));
	}

	/* Success: make the in-memory page available to other threads. */
done:	WT_PUBLISH(ref->state, WT_REF_MEM);
	return (0);

err:
	/*
	 * If the function building an in-memory version of the page failed,
	 * it discarded the page, but not the disk image. Discard the page
	 * and separately discard the disk image in all cases.
	 *
	 * The reference goes back to the state it was claimed from
	 * (WT_REF_DISK or WT_REF_DELETED).
	 */
	if (ref->page != NULL)
		__wt_ref_out(session, ref);
	WT_PUBLISH(ref->state, previous_state);

	__wt_buf_free(session, &tmp);

	return (ret);
}
/*
 * __wt_delete_page --
 *	If deleting a range, try to delete the page without instantiating it.
 *
 *	On return, *skipp is true only if the page was fast-deleted (the
 *	reference was switched to WT_REF_DELETED); it is false whenever the
 *	optimization was declined or failed. The return value is 0 when the
 *	page was fast-deleted or the optimization was simply declined, and
 *	the helper's error code when one of the calls below fails.
 */
int
__wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
{
	WT_DECL_RET;
	WT_PAGE *parent;

	*skipp = false;

	/* If we have a clean page in memory, attempt to evict it. */
	if (ref->state == WT_REF_MEM &&
	    __wt_atomic_casv32(&ref->state, WT_REF_MEM, WT_REF_LOCKED)) {
		/* A modified page can't be fast-deleted, unlock and give up. */
		if (__wt_page_is_modified(ref->page)) {
			WT_PUBLISH(ref->state, WT_REF_MEM);
			return (0);
		}

		(void)__wt_atomic_addv32(&S2BT(session)->evict_busy, 1);
		ret = __wt_evict(session, ref, false);
		(void)__wt_atomic_subv32(&S2BT(session)->evict_busy, 1);
		WT_RET_BUSY_OK(ret);

		/*
		 * A tolerated "busy" result may still be sitting in ret
		 * (NOTE(review): WT_RET_BUSY_OK is defined elsewhere; by its
		 * name it returns on errors other than busy and does not
		 * reset ret). Clear it: a failed eviction is not an error for
		 * this function, and the bare "goto err" below must not
		 * return a stale error when it merely declines the
		 * fast-delete.
		 */
		ret = 0;
	}

	/*
	 * Atomically switch the page's state to lock it. If the page is not
	 * on-disk, other threads may be using it, no fast delete.
	 */
	if (ref->state != WT_REF_DISK ||
	    !__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_LOCKED))
		return (0);

	/*
	 * We cannot fast-delete pages that have overflow key/value items as
	 * the overflow blocks have to be discarded. The way we figure that
	 * out is to check the page's cell type, cells for leaf pages without
	 * overflow items are special. The type is taken from the WT_ADDR
	 * structure when the address is off-page, and from the raw on-page
	 * cell otherwise.
	 *
	 * To look at an on-page cell, we need to look at the parent page, and
	 * that's dangerous, our parent page could change without warning if
	 * the parent page were to split, deepening the tree. It's safe: the
	 * page's reference will always point to some valid page, and if we find
	 * any problems we simply fail the fast-delete optimization.
	 */
	parent = ref->home;
	if (__wt_off_page(parent, ref->addr) ?
	    ((WT_ADDR *)ref->addr)->type != WT_ADDR_LEAF_NO :
	    __wt_cell_type_raw(ref->addr) != WT_CELL_ADDR_LEAF_NO)
		goto err;	/* Declined, not failed: ret is 0 here. */

	/*
	 * This action dirties the parent page: mark it dirty now, there's no
	 * future reconciliation of the child leaf page that will dirty it as
	 * we write the tree.
	 */
	WT_ERR(__wt_page_parent_modify_set(session, ref, false));

	/*
	 * Record the change in the transaction structure and set the change's
	 * transaction ID.
	 */
	WT_ERR(__wt_calloc_one(session, &ref->page_del));
	ref->page_del->txnid = session->txn.id;

	WT_ERR(__wt_txn_modify_ref(session, ref));

	*skipp = true;
	WT_STAT_CONN_INCR(session, rec_page_delete_fast);
	WT_STAT_DATA_INCR(session, rec_page_delete_fast);
	WT_PUBLISH(ref->state, WT_REF_DELETED);
	return (0);

err:	__wt_free(session, ref->page_del);

	/*
	 * Restore the page to on-disk status, we'll have to instantiate it.
	 */
	WT_PUBLISH(ref->state, WT_REF_DISK);
	return (ret);
}
/*
 * __wt_delete_page_rollback --
 *	Undo a fast-delete: abort a page deletion that was recorded without
 *	the page having been instantiated by the deleting transaction.
 */
void
__wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
{
	WT_UPDATE **updp;
	uint64_t sleep_count, yield_count;

	sleep_count = yield_count = 0;

	/*
	 * Either the reference is still "deleted", exactly as the transaction
	 * left it, and flipping it back to on-disk finishes the job; or some
	 * other thread has instantiated (or is instantiating) the page. Keep
	 * retrying because a failed instantiation can put the reference back
	 * into the deleted state.
	 */
	for (;;) {
		switch (ref->state) {
		case WT_REF_DELETED:
			/*
			 * Untouched since the delete: restore the on-disk
			 * state. Losing the race means the state changed,
			 * go around again.
			 */
			if (__wt_atomic_casv32(
			    &ref->state, WT_REF_DELETED, WT_REF_DISK))
				return;
			break;
		case WT_REF_LOCKED:
			/* Legal: the page is being instantiated, wait. */
			break;
		case WT_REF_MEM:
		case WT_REF_SPLIT:
			/*
			 * The normal read path isn't available for getting at
			 * the page: the session may already have closed the
			 * cursor, so there's no tree reference with which to
			 * acquire a hazard pointer. That's safe, a page with
			 * unresolved transactions isn't going anywhere.
			 *
			 * The page is in memory: mark every update recorded
			 * for the delete as aborted.
			 */
			updp = ref->page_del->update_list;
			while (*updp != NULL) {
				(*updp)->txnid = WT_TXN_ABORTED;
				++updp;
			}

			/*
			 * A transaction can't abort twice, release the
			 * bookkeeping.
			 */
			__wt_free(session, ref->page_del->update_list);
			__wt_free(session, ref->page_del);
			return;
		case WT_REF_DISK:
		case WT_REF_LOOKASIDE:
		case WT_REF_READING:
			/* These states should be impossible here. */
			WT_ASSERT(session, 0);
			break;
		}

		/*
		 * Still waiting on a state change. Yield before the next
		 * attempt, and after enough yields fall back to sleeping so
		 * the wait doesn't burn CPU to no purpose. The sleep counter
		 * is added to the rollback-blocked statistic on every pass.
		 */
		__wt_ref_state_yield_sleep(&yield_count, &sleep_count);
		WT_STAT_CONN_INCRV(session,
		    page_del_rollback_blocked, sleep_count);
	}
}
/*
 * __wt_compact_page_skip --
 *	Tell compaction whether it can avoid reading this page: *skipp is set
 *	to true when the page need not be read, false when it must be.
 */
int
__wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
{
	WT_BM *block_mgr;
	WT_DECL_RET;
	size_t cookie_size;
	u_int cell_type;
	const uint8_t *cookie;

	/*
	 * Deleted pages are skipped, rewriting them doesn't seem useful; in a
	 * better world we'd write the parent to delete the page. For every
	 * other state the default answer is to read the page.
	 */
	*skipp = ref->state == WT_REF_DELETED;
	if (*skipp)
		return (0);

	/*
	 * An in-memory page has to be looked at: it may have been modified
	 * and written, and for compaction the current location is the
	 * interesting one, not the original location. The state test comes
	 * first because it is cheap and the lock attempt is not.
	 *
	 * Nothing stops the WT_REF state from changing underfoot, and that
	 * can change its address: for example, the address might reference an
	 * on-page cell and page eviction can free that memory. So the WT_REF
	 * is locked before its address is examined; if the lock can't be
	 * taken, fall back to reading the page.
	 */
	if (ref->state != WT_REF_DISK ||
	    !__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_LOCKED))
		return (0);

	/*
	 * An on-disk page had better have an address: assert it, and also
	 * test at run-time to avoid the core dump.
	 *
	 * Internal pages must be read to walk the tree. For leaf pages, let
	 * the block manager say whether a rewrite would help, and don't do
	 * the I/O if it won't.
	 */
	__wt_ref_info(ref, &cookie, &cookie_size, &cell_type);
	WT_ASSERT(session, cookie != NULL);
	if (cookie != NULL && cell_type != WT_CELL_ADDR_INT) {
		block_mgr = S2BT(session)->bm;
		ret = block_mgr->compact_page_skip(
		    block_mgr, session, cookie, cookie_size, skipp);
	}

	/*
	 * Unlock by restoring the on-disk state and push the change. The
	 * full barrier isn't strictly necessary, but it's better to keep
	 * pages in circulation than not.
	 */
	ref->state = WT_REF_DISK;
	WT_FULL_BARRIER();

	return (ret);
}