/*
 * __wt_ovfl_read --
 *    Bring an overflow item into memory.
 */
int
__wt_ovfl_read(WT_SESSION_IMPL *session,
    WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store)
{
    WT_DECL_RET;

    /*
     * If no page specified, there's no need to lock and there's no cache
     * to search, we don't care about WT_CELL_VALUE_OVFL_RM cells.
     */
    if (page == NULL)
        return (
            __ovfl_read(session, unpack->data, unpack->size, store));

    /*
     * WT_CELL_VALUE_OVFL_RM cells: If reconciliation deleted an overflow
     * value, but there was still a reader in the system that might need it,
     * the on-page cell type will have been reset to WT_CELL_VALUE_OVFL_RM
     * and we will be passed a page so we can look-aside into the cache of
     * such values.
     *
     * Acquire the overflow lock, and retest the on-page cell's value inside
     * the lock.
     */
    WT_RET(__wt_readlock(session, S2BT(session)->ovfl_lock));
    /*
     * Removed cells are served from the transaction cache, all others are
     * read from disk.
     */
    ret = __wt_cell_type_raw(unpack->cell) == WT_CELL_VALUE_OVFL_RM ?
        __wt_ovfl_txnc_search(page, unpack->data, unpack->size, store) :
        __ovfl_read(session, unpack->data, unpack->size, store);
    /* Fold any unlock failure into the result without losing it. */
    WT_TRET(__wt_readunlock(session, S2BT(session)->ovfl_lock));

    return (ret);
}
/*
 * __stat_tree_walk --
 *    Gather btree statistics that require traversing the tree.
 */
static int
__stat_tree_walk(WT_SESSION_IMPL *session)
{
    WT_BTREE *btree;
    WT_DECL_RET;
    WT_DSRC_STATS **stats;
    WT_REF *walk;

    btree = S2BT(session);
    stats = btree->dhandle->stats;

    /* Zero the counters we're about to recompute from the tree. */
    WT_STAT_SET(session, stats, btree_column_deleted, 0);
    WT_STAT_SET(session, stats, btree_column_fix, 0);
    WT_STAT_SET(session, stats, btree_column_internal, 0);
    WT_STAT_SET(session, stats, btree_column_rle, 0);
    WT_STAT_SET(session, stats, btree_column_variable, 0);
    WT_STAT_SET(session, stats, btree_entries, 0);
    WT_STAT_SET(session, stats, btree_overflow, 0);
    WT_STAT_SET(session, stats, btree_row_internal, 0);
    WT_STAT_SET(session, stats, btree_row_leaf, 0);

    /* Visit every page, accumulating per-page statistics. */
    for (walk = NULL;;) {
        if ((ret = __wt_tree_walk(session, &walk, 0)) != 0 ||
            walk == NULL)
            break;
        WT_WITH_PAGE_INDEX(session,
            ret = __stat_page(session, walk->page, stats));
        WT_RET(ret);
    }

    /* Walking off the end of the tree is the expected termination. */
    return (ret == WT_NOTFOUND ? 0 : ret);
}
/*
 * __wt_btree_stat_init --
 *    Initialize the Btree statistics.
 */
int
__wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst)
{
    WT_BM *blkmgr;
    WT_BTREE *btree;
    WT_DSRC_STATS **dstats;

    btree = S2BT(session);
    blkmgr = btree->bm;
    dstats = btree->dhandle->stats;

    /* Let the block manager fill in its own statistics first. */
    WT_RET(blkmgr->stat(blkmgr, session, dstats[0]));

    /* Copy the btree's configured and accumulated values. */
    WT_STAT_SET(session, dstats, btree_fixed_len, btree->bitcnt);
    WT_STAT_SET(session, dstats, btree_maximum_depth, btree->maximum_depth);
    WT_STAT_SET(session, dstats, btree_maxintlkey, btree->maxintlkey);
    WT_STAT_SET(session, dstats, btree_maxintlpage, btree->maxintlpage);
    WT_STAT_SET(session, dstats, btree_maxleafkey, btree->maxleafkey);
    WT_STAT_SET(session, dstats, btree_maxleafpage, btree->maxleafpage);
    WT_STAT_SET(session, dstats, btree_maxleafvalue, btree->maxleafvalue);
    WT_STAT_SET(session, dstats,
        cache_bytes_inuse, __wt_btree_bytes_inuse(session));

    /* The walking statistics are optional, they can be expensive. */
    if (F_ISSET(cst, WT_STAT_TYPE_CACHE_WALK))
        __wt_curstat_cache_walk(session);
    if (F_ISSET(cst, WT_STAT_TYPE_TREE_WALK))
        WT_RET(__stat_tree_walk(session));

    return (0);
}
/*
 * __inmem_col_int --
 *    Build in-memory index for column-store internal pages.
 */
static void
__inmem_col_int(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    WT_BTREE *btree;
    WT_CELL *cell;
    WT_CELL_UNPACK *unpack, _unpack;
    WT_PAGE_HEADER *dsk;
    WT_REF *ref;
    uint32_t i;

    btree = S2BT(session);
    dsk = page->dsk;
    unpack = &_unpack;    /* Stack-allocated unpack scratch area. */

    /*
     * Walk the page, building references: the page contains value items.
     * The value items are on-page items (WT_CELL_VALUE).
     */
    ref = page->u.intl.t;
    WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
        __wt_cell_unpack(cell, unpack);
        /* The reference's address is the on-page cell itself. */
        ref->addr = cell;
        /* Starting record number comes from the unpacked cell. */
        ref->u.recno = unpack->v;
        ++ref;
    }
    /*
     * NOTE(review): the function's closing brace is not visible in this
     * chunk -- the definition appears truncated here.
     */
/*
 * __wt_compact_page_skip --
 *    Return if the block-manager wants us to re-write this page.
 */
int
__wt_compact_page_skip(
    WT_SESSION_IMPL *session, WT_PAGE *parent, WT_REF *ref, int *skipp)
{
    WT_BM *bm;
    const uint8_t *addr;
    uint32_t addr_size;

    /*
     * There's one compaction test we do before we read the page, to see
     * if the block-manager thinks it useful to rewrite the page. If a
     * rewrite won't help, we don't want to do I/O for nothing. For that
     * reason, this check is done in a call from inside the tree-walking
     * routine.
     *
     * Ignore everything but on-disk pages, we've already done a pass over
     * the in-memory pages.
     */
    if (ref->state != WT_REF_DISK) {
        *skipp = 1;
        return (0);
    }

    /* A page without an address can't usefully be rewritten. */
    __wt_get_addr(parent, ref, &addr, &addr_size);
    if (addr == NULL) {
        *skipp = 1;
        return (0);
    }

    /* Let the block manager decide whether the rewrite is worthwhile. */
    bm = S2BT(session)->bm;
    return (bm->compact_page_skip(bm, session, addr, addr_size, skipp));
}
/*
 * __lsm_discard_handle --
 *    Try to discard a handle from cache.
 */
static int
__lsm_discard_handle(
    WT_SESSION_IMPL *session, const char *uri, const char *checkpoint)
{
    WT_DECL_RET;
    int locked;    /* True once we hold the checkpoint lock. */

    /* This will fail with EBUSY if the file is still in use. */
    WT_RET(__wt_session_get_btree(session, uri, checkpoint,
        NULL, WT_DHANDLE_EXCLUSIVE | WT_DHANDLE_LOCK_ONLY));
    /* A handle we're about to discard must have no unwritten changes. */
    WT_ASSERT(session, S2BT(session)->modified == 0);

    /*
     * We need the checkpoint lock to discard in-memory handles: otherwise,
     * an application checkpoint could see this file locked and fail with
     * EBUSY.
     *
     * We can't get the checkpoint lock earlier or it will deadlock with
     * the schema lock.
     */
    locked = 0;
    if (checkpoint == NULL && (ret = __wt_spin_trylock(
        session, &S2C(session)->checkpoint_lock)) == 0)
        locked = 1;
    /* Only mark the handle for discard when the trylock didn't fail. */
    if (ret == 0)
        F_SET(session->dhandle, WT_DHANDLE_DISCARD);
    /* Release the handle in all cases, folding any error into ret. */
    WT_TRET(__wt_session_release_btree(session));
    if (locked)
        __wt_spin_unlock(session, &S2C(session)->checkpoint_lock);

    return (ret);
}
/*
 * __evict_force_check --
 *    Check if a page matches the criteria for forced eviction.
 */
static int
__evict_force_check(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags)
{
    WT_BTREE *btree;

    btree = S2BT(session);

    /* Small pages -- the common case -- never need forced eviction. */
    if (page->memory_footprint < btree->maxmempage)
        return (0);

    /*
     * Restrict forced eviction to modified leaf pages, and only when
     * eviction hasn't been disabled by the caller or for the tree.
     */
    if (WT_PAGE_IS_INTERNAL(page) ||
        LF_ISSET(WT_READ_NO_EVICT) ||
        F_ISSET(btree, WT_BTREE_NO_EVICTION) ||
        page->modify == NULL)
        return (0);

    /* Trigger eviction on the next page release. */
    __wt_page_evict_soon(page);

    /* If eviction cannot succeed, don't try. */
    return (__wt_page_can_evict(session, page, 1));
}
/*
 * __truncate_file --
 *    WT_SESSION::truncate for a file.
 */
static int
__truncate_file(WT_SESSION_IMPL *session, const char *name)
{
    WT_DECL_RET;
    uint32_t allocsize;
    const char *filename;

    /* The URI must name a file object. */
    filename = name;
    if (!WT_PREFIX_SKIP(filename, "file:"))
        return (EINVAL);

    /*
     * Open and lock the file to read the allocation size, then release
     * the handle again: we only needed the configuration value.
     */
    WT_RET(__wt_session_get_btree(
        session, name, NULL, NULL, WT_DHANDLE_EXCLUSIVE));
    allocsize = S2BT(session)->allocsize;
    WT_RET(__wt_session_release_btree(session));

    /* Close any btree handles in the file. */
    WT_WITH_DHANDLE_LOCK(session,
        ret = __wt_conn_dhandle_close_all(session, name, 0));
    WT_RET(ret);

    /* Delete the root address and truncate the file. */
    WT_RET(__wt_meta_checkpoint_clear(session, name));
    WT_RET(__wt_block_manager_truncate(session, filename, allocsize));

    return (0);
}
/*
 * __wt_metadata_open --
 *    Opens the metadata file, sets session->meta_dhandle.
 */
int
__wt_metadata_open(WT_SESSION_IMPL *session)
{
    WT_BTREE *meta;

    /* Nothing to do if the metadata handle is already cached. */
    if (session->meta_dhandle != NULL)
        return (0);

    WT_RET(__wt_session_get_btree(session, WT_METAFILE_URI, NULL, NULL, 0));

    session->meta_dhandle = session->dhandle;
    WT_ASSERT(session, session->meta_dhandle != NULL);

    /*
     * Set special flags for the metadata file: eviction (the metadata file
     * is in-memory and never evicted), logging (the metadata file is always
     * logged if possible).
     *
     * Test flags before setting them so updates can't race in subsequent
     * opens (the first update is safe because it's single-threaded from
     * wiredtiger_open).
     */
    meta = S2BT(session);
    if (!F_ISSET(meta, WT_BTREE_IN_MEMORY))
        F_SET(meta, WT_BTREE_IN_MEMORY);
    if (!F_ISSET(meta, WT_BTREE_NO_EVICTION))
        F_SET(meta, WT_BTREE_NO_EVICTION);
    if (F_ISSET(meta, WT_BTREE_NO_LOGGING))
        F_CLR(meta, WT_BTREE_NO_LOGGING);

    /* The metadata handle doesn't need to stay locked -- release it. */
    return (__wt_session_release_btree(session));
}
/*
 * __txn_log_file_sync --
 *    Write a log record for a file sync.
 */
static int
__txn_log_file_sync(WT_SESSION_IMPL *session, uint32_t flags, WT_LSN *lsnp)
{
    WT_BTREE *btree;
    WT_DECL_RET;
    WT_DECL_ITEM(logrec);
    /* Record payload is three integers: type, file id, start flag. */
    const char *fmt = WT_UNCHECKED_STRING(III);
    size_t header_size;
    uint32_t rectype = WT_LOGREC_FILE_SYNC;
    int start;

    btree = S2BT(session);
    /* Whether this sync marks the start of a checkpoint. */
    start = LF_ISSET(WT_TXN_LOG_CKPT_START);

    /* Size the record, then allocate the log record buffer. */
    WT_RET(__wt_struct_size(
        session, &header_size, fmt, rectype, btree->id, start));
    WT_RET(__wt_logrec_alloc(session, header_size, &logrec));

    /* Pack the payload in place and account for its size. */
    WT_ERR(__wt_struct_pack(session,
        (uint8_t *)logrec->data + logrec->size, header_size,
        fmt, rectype, btree->id, start));
    logrec->size += (uint32_t)header_size;

    WT_ERR(__wt_log_write(session, logrec, lsnp, 0));
err:    /* Free the record on both the success and error paths. */
    __wt_logrec_free(session, &logrec);
    return (ret);
}
/*
 * __wt_debug_offset --
 *    Read and dump a disk page in debugging mode, using a file
 * offset/size/checksum triplet.
 */
int
__wt_debug_offset(WT_SESSION_IMPL *session,
    wt_off_t offset, uint32_t size, uint32_t cksum, const char *ofile)
{
    WT_DECL_ITEM(buf);
    WT_DECL_RET;
    uint8_t addr[WT_BTREE_MAX_ADDR_COOKIE], *endp;

    /* We need a btree in the session to read through. */
    WT_ASSERT(session, S2BT_SAFE(session) != NULL);

    /*
     * This routine depends on the default block manager's view of files,
     * where an address consists of a file offset, length, and checksum.
     * This is for debugging only: other block managers might not see a
     * file or address the same way, that's why there's no block manager
     * method.
     *
     * Convert the triplet into an address structure.
     */
    endp = addr;
    WT_RET(__wt_block_addr_to_buffer(
        S2BT(session)->bm->block, &endp, offset, size, cksum));

    /*
     * Read the address through the btree I/O functions (so the block is
     * decompressed as necessary).
     */
    WT_RET(__wt_scr_alloc(session, 0, &buf));
    WT_ERR(__wt_bt_read(session, buf, addr, WT_PTRDIFF(endp, addr)));
    ret = __wt_debug_disk(session, buf->mem, ofile);
err:    /* Free the scratch buffer on all paths. */
    __wt_scr_free(session, &buf);
    return (ret);
}
/*
 * __wt_las_cursor_create --
 *    Open a new lookaside table cursor.
 */
int
__wt_las_cursor_create(WT_SESSION_IMPL *session, WT_CURSOR **cursorp)
{
    WT_BTREE *las_tree;
    const char *cfg[] = {
        WT_CONFIG_BASE(session, WT_SESSION_open_cursor), NULL };

    /* Open the cursor on the lookaside table's fixed URI. */
    WT_RET(__wt_open_cursor(
        session, WT_LAS_URI, NULL, cfg, cursorp));

    /*
     * Set special flags for the lookaside table: the lookaside flag (used,
     * for example, to avoid writing records during reconciliation), also
     * turn off checkpoints and logging.
     *
     * Test flags before setting them so updates can't race in subsequent
     * opens (the first update is safe because it's single-threaded from
     * wiredtiger_open).
     */
    las_tree = S2BT(session);
    if (!F_ISSET(las_tree, WT_BTREE_LOOKASIDE))
        F_SET(las_tree, WT_BTREE_LOOKASIDE);
    if (!F_ISSET(las_tree, WT_BTREE_NO_CHECKPOINT))
        F_SET(las_tree, WT_BTREE_NO_CHECKPOINT);
    if (!F_ISSET(las_tree, WT_BTREE_NO_LOGGING))
        F_SET(las_tree, WT_BTREE_NO_LOGGING);

    return (0);
}
/*
 * __ovfl_read --
 *    Read an overflow item from the disk.
 */
static int
__ovfl_read(WT_SESSION_IMPL *session,
    const uint8_t *addr, size_t addr_size, WT_ITEM *store)
{
    WT_BTREE *btree;
    const WT_PAGE_HEADER *header;

    btree = S2BT(session);

    /*
     * Read the overflow item from the block manager, then reference the
     * start of the data and set the data's length.
     *
     * Overflow reads are synchronous. That may bite me at some point, but
     * WiredTiger supports large page sizes, overflow items should be rare.
     */
    WT_RET(__wt_bt_read(session, store, addr, addr_size));
    header = store->data;
    store->data = WT_PAGE_HEADER_BYTE(btree, header);
    store->size = header->u.datalen;

    WT_STAT_FAST_DATA_INCR(session, cache_read_overflow);
    return (0);
}
/*
 * __inmem_col_int --
 *    Build in-memory index for column-store internal pages.
 */
static void
__inmem_col_int(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    WT_BTREE *btree;
    WT_CELL *cell;
    WT_CELL_UNPACK *unpack, _unpack;
    const WT_PAGE_HEADER *dsk;
    WT_PAGE_INDEX *pindex;
    WT_REF **refp, *ref;
    uint32_t i;

    btree = S2BT(session);
    dsk = page->dsk;
    unpack = &_unpack;    /* Stack-allocated unpack scratch area. */

    /*
     * Walk the page, building references: the page contains value items.
     * The value items are on-page items (WT_CELL_VALUE).
     */
    pindex = WT_INTL_INDEX_COPY(page);
    refp = pindex->index;
    WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
        ref = *refp++;
        /* Each child reference points back at this page as its home. */
        ref->home = page;
        __wt_cell_unpack(cell, unpack);
        ref->addr = cell;
        /* Starting record number comes from the unpacked cell. */
        ref->key.recno = unpack->v;
    }
    /*
     * NOTE(review): the function's closing brace is not visible in this
     * chunk -- the definition appears truncated here.
     */
/*
 * __evict_force_check --
 *    Check if a page matches the criteria for forced eviction.
 */
static int
__evict_force_check(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    WT_BTREE *btree;

    btree = S2BT(session);

    /* Small pages -- the common case -- never need forced eviction. */
    if (page->memory_footprint < btree->maxmempage)
        return (0);

    /* Leaf pages only. */
    switch (page->type) {
    case WT_PAGE_COL_FIX:
    case WT_PAGE_COL_VAR:
    case WT_PAGE_ROW_LEAF:
        break;
    default:
        return (0);
    }

    /* Eviction may be turned off, although that's rare. */
    if (F_ISSET(btree, WT_BTREE_NO_EVICTION))
        return (0);

    /*
     * It's hard to imagine a page with a huge memory footprint that has
     * never been modified, but check to be sure.
     */
    if (page->modify == NULL)
        return (0);

    /* Trigger eviction on the next page release. */
    __wt_page_evict_soon(page);

    return (1);
}
/*
 * __wt_cache_op --
 *    Cache operations.
 */
int
__wt_cache_op(WT_SESSION_IMPL *session, WT_CACHE_OP op)
{
    WT_DECL_RET;

    /*
     * Dispatch on the operation: checkpoint and leaf-write operations go
     * through file sync, close and discard go through file eviction.
     *
     * For checkpoint and close, make sure the checkpoint reference is set
     * for reconciliation; it's ugly, but drilling a function parameter
     * path from our callers to the reconciliation of the tree's root page
     * is going to be worse.
     */
    switch (op) {
    case WT_SYNC_CHECKPOINT:
        WT_ASSERT(session, S2BT(session)->ckpt != NULL);
        ret = __sync_file(session, op);
        break;
    case WT_SYNC_WRITE_LEAVES:
        ret = __sync_file(session, op);
        break;
    case WT_SYNC_CLOSE:
        WT_ASSERT(session, S2BT(session)->ckpt != NULL);
        ret = __wt_evict_file(session, op);
        break;
    case WT_SYNC_DISCARD:
        ret = __wt_evict_file(session, op);
        break;
    }

    return (ret);
}
/*
 * __wt_btree_stat_init --
 *    Initialize the Btree statistics.
 */
int
__wt_btree_stat_init(WT_SESSION_IMPL *session, uint32_t flags)
{
    WT_BM *bm;
    WT_BTREE *btree;
    WT_DECL_RET;
    WT_DSRC_STATS *stats;
    WT_PAGE *walk;

    btree = S2BT(session);
    bm = btree->bm;
    stats = &btree->dhandle->stats;

    /* Let the block manager fill in its own statistics first. */
    WT_RET(bm->stat(bm, session, stats));

    /* Copy the btree's configured and accumulated values. */
    WT_STAT_SET(stats, btree_fixed_len, btree->bitcnt);
    WT_STAT_SET(stats, btree_maximum_depth, btree->maximum_depth);
    WT_STAT_SET(stats, btree_maxintlitem, btree->maxintlitem);
    WT_STAT_SET(stats, btree_maxintlpage, btree->maxintlpage);
    WT_STAT_SET(stats, btree_maxleafitem, btree->maxleafitem);
    WT_STAT_SET(stats, btree_maxleafpage, btree->maxleafpage);

    walk = NULL;

    /* Fast statistics skip the expensive tree walk. */
    if (LF_ISSET(WT_STATISTICS_FAST))
        return (0);

    /* Otherwise, visit every page and accumulate per-page statistics. */
    for (;;) {
        if ((ret = __wt_tree_walk(session, &walk, 0)) != 0 ||
            walk == NULL)
            break;
        WT_RET(__stat_page(session, walk, stats));
    }

    /* Walking off the end of the tree is the expected termination. */
    return (ret == WT_NOTFOUND ? 0 : ret);
}
/*
 * __evict_force_check --
 *    Check if a page matches the criteria for forced eviction.
 */
static int
__evict_force_check(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    WT_BTREE *btree;

    btree = S2BT(session);

    /* Small pages -- the common case -- never need forced eviction. */
    if (page->memory_footprint < btree->maxmempage)
        return (0);

    /*
     * Restrict forced eviction to modified leaf pages: it's hard to
     * imagine a page with a huge memory footprint that has never been
     * modified, but check to be sure.
     */
    if (WT_PAGE_IS_INTERNAL(page) || page->modify == NULL)
        return (0);

    /* Trigger eviction on the next page release. */
    __wt_page_evict_soon(page);

    /* Bump the oldest ID, we're about to do some visibility checks. */
    __wt_txn_update_oldest(session, 0);

    /* If eviction cannot succeed, don't try. */
    return (__wt_page_can_evict(session, page, 1, NULL));
}
/*
 * __wt_compact_page_skip --
 *    Return if compaction requires we read this page.
 */
int
__wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
{
    WT_BM *bm;
    const uint8_t *addr;
    size_t addr_size;
    u_int type;

    *skipp = false;    /* Default to reading. */
    type = 0;          /* Keep compiler quiet. */

    bm = S2BT(session)->bm;

    /*
     * We aren't holding a hazard pointer, so we can't look at the page
     * itself, all we can look at is the WT_REF information. If there's no
     * address, the page isn't on disk, but we have to read internal pages
     * to walk the tree regardless; throw up our hands and read it.
     */
    __wt_ref_info(ref, &addr, &addr_size, &type);
    if (addr == NULL)
        return (0);

    /* Internal pages must always be read to walk the tree. */
    if (type == WT_CELL_ADDR_INT)
        return (0);

    /*
     * For leaf pages, ask the block manager whether a rewrite is useful;
     * don't do the I/O if a rewrite won't help.
     */
    return (bm->compact_page_skip(bm, session, addr, addr_size, skipp));
}
/*
 * __wt_btcur_init --
 *    Initialize a cursor used for internal purposes.
 */
void
__wt_btcur_init(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
{
    /* Start from a zeroed cursor, then attach the session and tree. */
    memset(cbt, 0, sizeof(*cbt));

    cbt->iface.session = &session->iface;
    cbt->btree = S2BT(session);
}
/*
 * __compact_end --
 *    End object compaction.
 */
static int
__compact_end(WT_SESSION_IMPL *session)
{
    /* Hand the end-of-compaction notification to the block manager. */
    WT_BM *bm = S2BT(session)->bm;

    return (bm->compact_end(bm, session));
}
/*
 * __wt_compact --
 *    Compact a file.
 */
int
__wt_compact(WT_SESSION_IMPL *session, const char *cfg[])
{
    WT_BM *bm;
    WT_CONFIG_ITEM cval;
    WT_DECL_RET;
    WT_PAGE *page;
    int trigger, skip;

    bm = S2BT(session)->bm;

    WT_DSTAT_INCR(session, session_compact);

    /* Pull the compaction trigger out of the configuration. */
    WT_RET(__wt_config_gets(session, cfg, "trigger", &cval));
    trigger = (int)cval.val;

    /* Check if compaction might be useful. */
    WT_RET(bm->compact_skip(bm, session, trigger, &skip));
    if (skip)
        return (0);

    /*
     * Walk the cache reviewing in-memory pages to see if they need to be
     * re-written. This requires looking at page reconciliation results,
     * which means the page cannot be reconciled at the same time as it's
     * being reviewed for compaction. The underlying functions ensure we
     * don't collide with page eviction, but we need to make sure we don't
     * collide with checkpoints either, they are the other operation that
     * can reconcile a page.
     *
     * Capture the error and release the metadata lock before returning:
     * the previous WT_RET here returned while still holding the spinlock,
     * leaving it locked forever on error.
     */
    __wt_spin_lock(session, &S2C(session)->metadata_lock);
    ret = __wt_bt_cache_op(session, NULL, WT_SYNC_COMPACT);
    __wt_spin_unlock(session, &S2C(session)->metadata_lock);
    WT_RET(ret);

    /*
     * Walk the tree, reviewing on-disk pages to see if they need to be
     * re-written.
     */
    for (page = NULL;;) {
        WT_RET(__wt_tree_walk(session, &page, WT_TREE_COMPACT));
        if (page == NULL)
            break;

        /*
         * The only pages returned by the tree walk function are pages
         * we want to re-write; mark the page and tree dirty.
         */
        if ((ret = __wt_page_modify_init(session, page)) != 0) {
            /* Release the page before propagating the error. */
            WT_TRET(__wt_page_release(session, page));
            WT_RET(ret);
        }
        __wt_page_and_tree_modify_set(session, page);

        WT_DSTAT_INCR(session, btree_compact_rewrite);
    }

    return (0);
}
/*
 * __wt_ovfl_read --
 *    Bring an overflow item into memory.
 */
int
__wt_ovfl_read(WT_SESSION_IMPL *session,
    WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store, bool *decoded)
{
    WT_DECL_RET;
    WT_OVFL_TRACK *track;
    size_t i;

    *decoded = false;    /* Set only when served from the removed-item cache. */

    /*
     * If no page specified, there's no need to lock and there's no cache
     * to search, we don't care about WT_CELL_VALUE_OVFL_RM cells.
     */
    if (page == NULL)
        return (
            __ovfl_read(session, unpack->data, unpack->size, store));

    /*
     * WT_CELL_VALUE_OVFL_RM cells: If reconciliation deleted an overflow
     * value, but there was still a reader in the system that might need it,
     * the on-page cell type will have been reset to WT_CELL_VALUE_OVFL_RM
     * and we will be passed a page so we can check the on-page cell.
     *
     * Acquire the overflow lock, and retest the on-page cell's value inside
     * the lock.
     */
    __wt_readlock(session, &S2BT(session)->ovfl_lock);
    if (__wt_cell_type_raw(unpack->cell) == WT_CELL_VALUE_OVFL_RM) {
        /* Search the page's removed-overflow cache for this cell. */
        track = page->modify->ovfl_track;
        for (i = 0; i < track->remove_next; ++i)
            if (track->remove[i].cell == unpack->cell) {
                store->data = track->remove[i].data;
                store->size = track->remove[i].size;
                break;
            }
        /* A removed cell must have a cached copy. */
        WT_ASSERT(session, i < track->remove_next);
        *decoded = true;
    } else
        ret = __ovfl_read(session, unpack->data, unpack->size, store);
    __wt_readunlock(session, &S2BT(session)->ovfl_lock);

    return (ret);
}
/*
 * __wt_conn_btree_sync_and_close --
 *    Sync and close the underlying btree handle.
 */
int
__wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, int force)
{
    WT_BTREE *btree;
    WT_DATA_HANDLE *dhandle;
    WT_DECL_RET;
    int no_schema_lock;    /* True if we set WT_SESSION_NO_SCHEMA_LOCK. */

    dhandle = session->dhandle;
    btree = S2BT(session);

    /* Nothing to do for a handle that isn't open. */
    if (!F_ISSET(dhandle, WT_DHANDLE_OPEN))
        return (0);

    /*
     * If we don't already have the schema lock, make it an error to try
     * to acquire it. The problem is that we are holding an exclusive
     * lock on the handle, and if we attempt to acquire the schema lock
     * we might deadlock with a thread that has the schema lock and wants
     * a handle lock (specifically, checkpoint).
     */
    no_schema_lock = 0;
    if (!F_ISSET(session, WT_SESSION_SCHEMA_LOCKED)) {
        no_schema_lock = 1;
        F_SET(session, WT_SESSION_NO_SCHEMA_LOCK);
    }

    /*
     * We may not be holding the schema lock, and threads may be walking
     * the list of open handles (for example, checkpoint). Acquire the
     * handle's close lock.
     */
    __wt_spin_lock(session, &dhandle->close_lock);

    /*
     * The close can fail if an update cannot be written, return the EBUSY
     * error to our caller for eventual retry.
     */
    if (!F_ISSET(btree,
        WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY))
        WT_ERR(__wt_checkpoint_close(session, force));

    /* Only non-checkpoint handles count against the open-handle total. */
    if (dhandle->checkpoint == NULL)
        --S2C(session)->open_btree_count;

    WT_TRET(__wt_btree_close(session));
    F_CLR(dhandle, WT_DHANDLE_OPEN);
    F_CLR(btree, WT_BTREE_SPECIAL_FLAGS);

err:    __wt_spin_unlock(session, &dhandle->close_lock);

    /* Restore the session's schema-lock behavior if we changed it. */
    if (no_schema_lock)
        F_CLR(session, WT_SESSION_NO_SCHEMA_LOCK);

    return (ret);
}
/*
 * __ovfl_reuse_wrapup_err --
 *    Resolve the page's overflow reuse list after an error occurs.
 */
static int
__ovfl_reuse_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    WT_BM *bm;
    WT_DECL_RET;
    WT_OVFL_REUSE **e, **head, *reuse;
    size_t decr;
    int i;

    bm = S2BT(session)->bm;
    head = page->modify->ovfl_track->ovfl_reuse;

    /*
     * Discard any overflow records that were just added, freeing underlying
     * blocks.
     *
     * First, walk the overflow reuse lists (except for the lowest one),
     * fixing up skiplist links.
     */
    for (i = WT_SKIP_MAXDEPTH - 1; i > 0; --i)
        for (e = &head[i]; (reuse = *e) != NULL;) {
            if (!F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED)) {
                e = &reuse->next[i];
                continue;
            }
            *e = reuse->next[i];
        }

    /*
     * Second, discard any overflow record with a just-added flag, clear the
     * flags for the next run.
     */
    decr = 0;
    for (e = &head[0]; (reuse = *e) != NULL;) {
        if (!F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED)) {
            F_CLR(reuse, WT_OVFL_REUSE_INUSE);
            e = &reuse->next[0];
            continue;
        }
        *e = reuse->next[0];

        if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
            WT_RET(
                __ovfl_reuse_verbose(session, page, reuse, "free"));

        /* Accumulate free errors, keep walking to free the rest. */
        WT_TRET(bm->free(
            bm, session, WT_OVFL_REUSE_ADDR(reuse), reuse->addr_size));
        decr += WT_OVFL_SIZE(reuse, WT_OVFL_REUSE);
        __wt_free(session, reuse);
    }

    if (decr != 0)
        __wt_cache_page_inmem_decr(session, page, decr);

    /*
     * Return the accumulated error: the previous "return (0)" silently
     * discarded any block-manager free failure collected via WT_TRET.
     */
    return (ret);
}
/*
 * __inmem_col_fix --
 *    Build in-memory index for fixed-length column-store leaf pages.
 */
static void
__inmem_col_fix(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    const WT_PAGE_HEADER *dsk;

    dsk = page->dsk;

    /* The bit field begins immediately after the page header. */
    page->pg_fix_bitf = WT_PAGE_HEADER_BYTE(S2BT(session), dsk);
}
/*
 * __wt_ref_out --
 *    Discard an in-memory page, freeing all memory associated with it.
 */
void
__wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref)
{
    /*
     * A diagnostic wrapper around the page-out function: the page being
     * discarded must not be the tree's current eviction candidate.
     */
    WT_ASSERT(session, S2BT(session)->evict_ref != ref);

    __wt_page_out(session, &ref->page);
}
/*
 * __compact_rewrite --
 *    Return if a page needs to be re-written.
 */
static int
__compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
{
    WT_BM *bm;
    WT_DECL_RET;
    WT_PAGE *page;
    WT_PAGE_MODIFY *mod;
    size_t addr_size;
    const uint8_t *addr;

    *skipp = true;    /* Default skip. */

    bm = S2BT(session)->bm;
    page = ref->page;
    mod = page->modify;

    /*
     * Ignore the root: it may not have a replacement address, and besides,
     * if anything else gets written, so will it.
     */
    if (__wt_ref_is_root(ref))
        return (0);

    /* Ignore currently dirty pages, they will be written regardless. */
    if (__wt_page_is_modified(page))
        return (0);

    /*
     * If the page is clean, test the original addresses.
     * If the page is a 1-to-1 replacement, test the replacement addresses.
     * Ignore empty pages, they get merged into the parent.
     */
    if (mod == NULL || mod->rec_result == 0) {
        WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
        if (addr == NULL)
            return (0);
        WT_RET(
            bm->compact_page_skip(bm, session, addr, addr_size, skipp));
    } else if (mod->rec_result == WT_PM_REC_REPLACE) {
        /*
         * The page's modification information can change underfoot if
         * the page is being reconciled, serialize with reconciliation.
         */
        WT_RET(__wt_fair_lock(session, &page->page_lock));
        ret = bm->compact_page_skip(bm, session,
            mod->mod_replace.addr, mod->mod_replace.size, skipp);
        /* Always release the lock, folding an unlock failure into ret. */
        WT_TRET(__wt_fair_unlock(session, &page->page_lock));
        WT_RET(ret);
    }
    return (0);
}
/*
 * __split_should_deepen --
 *    Return if we should deepen the tree.
 */
static int
__split_should_deepen(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    WT_BTREE *btree;
    WT_PAGE_INDEX *pindex;

    btree = S2BT(session);

    /*
     * Splits are based on either the number of child pages that will be
     * created by the split (splitting an internal page that will be slow
     * to search), or by the memory footprint of the parent page (avoiding
     * an internal page that will eat up all of the cache and put eviction
     * pressure on the system).
     */
    pindex = WT_INTL_INDEX_COPY(page);

    /*
     * Deepen the tree if the page's memory footprint is larger than the
     * maximum size for a page in memory. We need an absolute minimum
     * number of entries in order to split the page: if there is a single
     * huge key, splitting won't help.
     */
    if (page->memory_footprint > btree->maxmempage &&
        pindex->entries >= __split_deepen_min_child)
        return (1);

    /*
     * Deepen the tree if the page's memory footprint is at least N
     * times the maximum internal page size chunk in the backing file and
     * the split will result in at least N children in the newly created
     * intermediate layer.
     */
    if (page->memory_footprint >
        __split_deepen_max_internal_image * btree->maxintlpage &&
        pindex->entries >=
        (__split_deepen_per_child * __split_deepen_split_child))
        return (1);

    return (0);
}
/*
 * __wt_ovfl_track_wrapup_err --
 *    Resolve the page's overflow tracking on reconciliation error.
 */
int
__wt_ovfl_track_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    WT_DECL_RET;
    WT_OVFL_TRACK *track;

    /* Nothing to do if the page has no overflow tracking information. */
    if (page->modify == NULL || page->modify->ovfl_track == NULL)
        return (0);

    track = page->modify->ovfl_track;
    if (track->discard != NULL)
        WT_RET(__ovfl_discard_wrapup_err(session, page));

    if (track->ovfl_reuse[0] != NULL)
        WT_RET(__ovfl_reuse_wrapup_err(session, page));

    /* The transaction cache is shared, resolve it under the overflow lock. */
    if (track->ovfl_txnc[0] != NULL) {
        WT_RET(__wt_writelock(session, S2BT(session)->ovfl_lock));
        ret = __ovfl_txnc_wrapup(session, page);
        WT_TRET(__wt_writeunlock(session, S2BT(session)->ovfl_lock));
    }

    /*
     * Return the accumulated error: the previous "return (0)" silently
     * discarded any failure from the wrapup or the unlock collected in
     * ret via WT_TRET.
     */
    return (ret);
}