/*
 * __wt_ovfl_remove --
 *     Remove an overflow value: cache it first unless the page is being
 *     evicted, then queue the on-page cell for discard.
 *
 *     session:  the session handle, passed through to the helpers.
 *     page:     the page holding the overflow cell.
 *     unpack:   the unpacked on-page cell; unpack->cell is what gets queued.
 *     evicting: when true, the caching step is skipped.
 *
 *     Returns the result of __wt_ovfl_discard_add; a failure from the
 *     caching step is handed to WT_RET (expected to return it to the caller
 *     immediately -- confirm against the macro definition).
 */
int
__wt_ovfl_remove(WT_SESSION_IMPL *session,
    WT_PAGE *page, WT_CELL_UNPACK *unpack, bool evicting)
{
    /*
     * Two distinct reconciliation problems are handled here.
     *
     * Problem one: a snapshot reader may still need an on-page overflow
     * value after it has been removed. The sequence looks like this:
     *
     * - a leaf page referencing an overflow item is reconciled
     * - the item is updated and that update commits
     * - a checkpoint runs and frees the backing overflow blocks
     * - a snapshot transaction asks for the item's original version
     *
     * Put differently: the original overflow item can be wanted by a
     * snapshot transaction after it was deleted from a page that has since
     * been checkpointed, and the checkpoint has to know the blocks were
     * freed. There is no mechanism for postponing the free of the
     * underlying blocks until some set of transactions has exited (nor is
     * this expected to be a common case), so the overflow value is cached
     * in memory instead.
     *
     * The difficulty is that the snapshot reader might:
     * - walk the WT_UPDATE list without finding a useful entry
     * - read the overflow value's address out of the on-page cell
     * - go to sleep
     * - have a checkpoint cache the overflow value and free the blocks
     * - have another thread allocate and overwrite those blocks
     * - wake up and read the wrong value
     *
     * A read/write lock combined with the on-page cell closes that window:
     * the write lock is held while the cell type is switched from
     * WT_CELL_VALUE_OVFL to WT_CELL_VALUE_OVFL_RM, and a read lock is held
     * while an overflow item is read.
     *
     * The lock is per btree. It could be per page, or even per overflow
     * item, but overflow values are supposed to be rare and contention on
     * the lock is not expected, so nothing finer-grained is done.
     *
     * Only checkpoints need this step: in any eviction mode there cannot
     * be threads sitting in our update lists.
     */
    if (!evicting) {
        WT_RET(__ovfl_cache(session, page, unpack));
    }

    /*
     * Problem two: the underlying blocks must be removed exactly once,
     * which is what the WT_CELL_VALUE_OVFL_RM flag provides.
     *
     * Queue the on-page cell to be marked WT_CELL_VALUE_OVFL_RM, and the
     * overflow value's underlying blocks to be freed, once reconciliation
     * completes.
     */
    return (__wt_ovfl_discard_add(session, page, unpack->cell));
}
/*
 * __wt_ovfl_cache --
 *     Handle deletion of an overflow value: cache the value when no
 *     globally visible update exists, then queue the on-page cell for
 *     discard.
 *
 *     session: the session handle, passed through to the helpers.
 *     page:    the page holding the overflow cell; its type selects the
 *              visibility check.
 *     cookie:  opaque here; handed unchanged to the per-page-type
 *              visibility helper.
 *     vpack:   the unpacked on-page cell; vpack->cell is what gets queued.
 *
 *     Returns the result of __wt_ovfl_discard_add; a failure from the
 *     caching step is handed to WT_RET, and an unexpected page type is
 *     handed to WT_ILLEGAL_VALUE (both expected to return an error to the
 *     caller -- confirm against the macro definitions).
 */
int
__wt_ovfl_cache(WT_SESSION_IMPL *session,
    WT_PAGE *page, void *cookie, WT_CELL_UNPACK *vpack)
{
    int globally_visible;

    /*
     * The reconciliation problem being solved:
     * - a leaf page referencing an overflow item is reconciled
     * - the item is updated and that update commits
     * - a checkpoint runs and frees the backing overflow blocks
     * - a snapshot transaction asks for the item's original version
     *
     * Put differently: the original overflow item can be wanted by a
     * snapshot transaction after it was deleted from a page that has since
     * been checkpointed, and the checkpoint has to know the blocks were
     * freed. There is no mechanism for postponing the free of the
     * underlying blocks until some set of transactions has exited (nor is
     * this expected to be a common case), so the overflow value is cached
     * in memory instead.
     *
     * The difficulty is that the snapshot reader might:
     * - walk the WT_UPDATE list without finding a useful entry
     * - read the overflow value's address out of the on-page cell
     * - go to sleep
     * - have a checkpoint cache the overflow value and free the blocks
     * - have another thread allocate and overwrite those blocks
     * - wake up and read the wrong value
     *
     * A read/write lock combined with the on-page cell closes that window:
     * the write lock is held while the cell type is switched from
     * WT_CELL_VALUE_OVFL to WT_CELL_VALUE_OVFL_RM, and a read lock is held
     * while an overflow item is read.
     *
     * The lock is per btree. It could be per page, or even per overflow
     * item, but overflow values are supposed to be rare and contention on
     * the lock is not expected, so nothing finer-grained is done.
     *
     * Start by looking for a globally visible update. When one exists the
     * item need not be cached, because no running thread can have moved
     * past it.
     */
    switch (page->type) {
    case WT_PAGE_ROW_LEAF:
        globally_visible =
            __ovfl_cache_row_visible(session, page, cookie);
        break;
    case WT_PAGE_COL_VAR:
        globally_visible =
            __ovfl_cache_col_visible(session, cookie, vpack);
        break;
    WT_ILLEGAL_VALUE(session);
    }

    /*
     * Without a globally visible update, some reader in the system might
     * still try to read the old value, so cache it.
     */
    if (!globally_visible) {
        WT_RET(__ovfl_cache(session, page, vpack));
        WT_STAT_FAST_DATA_INCR(session, cache_overflow_value);
    }

    /*
     * Queue the on-page cell to be marked WT_CELL_VALUE_OVFL_RM, and the
     * overflow value's underlying blocks to be freed, once reconciliation
     * completes.
     */
    return (__wt_ovfl_discard_add(session, page, vpack->cell));
}