/*
 * __wt_compact --
 *	Compact a file.
 */
int
__wt_compact(WT_SESSION_IMPL *session, const char *cfg[])
{
    WT_BM *bm;
    WT_CONFIG_ITEM cval;
    WT_DECL_RET;
    WT_PAGE *page;
    int trigger, skip;

    bm = S2BT(session)->bm;

    WT_DSTAT_INCR(session, session_compact);

    WT_RET(__wt_config_gets(session, cfg, "trigger", &cval));
    trigger = (int)cval.val;

    /* Check if compaction might be useful. */
    WT_RET(bm->compact_skip(bm, session, trigger, &skip));
    if (skip)
        return (0);

    /*
     * Walk the cache reviewing in-memory pages to see if they need to be
     * re-written.  This requires looking at page reconciliation results,
     * which means the page cannot be reconciled at the same time as it's
     * being reviewed for compaction.  The underlying functions ensure we
     * don't collide with page eviction, but we need to make sure we don't
     * collide with checkpoints either, they are the other operation that
     * can reconcile a page.
     */
    __wt_spin_lock(session, &S2C(session)->metadata_lock);
    ret = __wt_bt_cache_op(session, NULL, WT_SYNC_COMPACT);
    __wt_spin_unlock(session, &S2C(session)->metadata_lock);
    WT_RET(ret);

    /*
     * Walk the tree, reviewing on-disk pages to see if they need to be
     * re-written.
     */
    for (page = NULL;;) {
        WT_RET(__wt_tree_walk(session, &page, WT_TREE_COMPACT));
        if (page == NULL)
            break;

        /*
         * The only pages returned by the tree walk function are pages
         * we want to re-write; mark the page and tree dirty.
         */
        if ((ret = __wt_page_modify_init(session, page)) != 0) {
            WT_TRET(__wt_page_release(session, page));
            WT_RET(ret);
        }
        __wt_page_and_tree_modify_set(session, page);

        WT_DSTAT_INCR(session, btree_compact_rewrite);
    }

    return (0);
}
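/*
 * A minimal standalone sketch of the driver pattern above: ask whether
 * compaction can help at all, then walk the pages and dirty the ones whose
 * blocks would move toward the start of the file when rewritten.  The toy_*
 * types and the block_would_move() policy are hypothetical stand-ins, not
 * the WiredTiger structures or block-manager API.
 */
#include <stdbool.h>
#include <stddef.h>

struct toy_page {
    size_t disk_offset;      /* where the page's blocks currently live */
    bool dirty;              /* page must be rewritten */
    struct toy_page *next;   /* next leaf page in file order */
};

struct toy_tree {
    struct toy_page *pages;  /* list of leaf pages */
    size_t file_size;        /* current file size in bytes */
};

/* A block is worth moving if it lives in the last tenth of the file. */
static bool
block_would_move(const struct toy_tree *tree, size_t offset)
{
    return (offset > tree->file_size - tree->file_size / 10);
}

/*
 * Compaction driver: mark pages dirty whose blocks would move toward the
 * start of the file when rewritten; return the number of pages marked so
 * the caller can tell whether another pass is worthwhile.
 */
static int
toy_compact(struct toy_tree *tree)
{
    struct toy_page *page;
    int marked;

    marked = 0;
    for (page = tree->pages; page != NULL; page = page->next)
        if (!page->dirty && block_would_move(tree, page->disk_offset)) {
            page->dirty = true;     /* rewritten on the next checkpoint */
            ++marked;
        }
    return (marked);
}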
/*
 * __wt_bt_cache_force_write --
 *	Dirty the root page of the tree so it gets written.
 */
int
__wt_bt_cache_force_write(WT_SESSION_IMPL *session)
{
    WT_BTREE *btree;
    WT_PAGE *page;

    btree = session->btree;
    page = btree->root_page;

    /* Dirty the root page to ensure a write. */
    WT_RET(__wt_page_modify_init(session, page));
    __wt_page_modify_set(session, page);

    return (0);
}
/*
 * __merge_new_page --
 *	Create a new in-memory internal page.
 */
static int
__merge_new_page(WT_SESSION_IMPL *session,
    uint8_t type, uint32_t entries, int merge, WT_PAGE **pagep)
{
    WT_DECL_RET;
    WT_PAGE *newpage;

    /* Allocate a new internal page and fill it in. */
    WT_RET(__wt_page_alloc(session, type, entries, &newpage));
    newpage->read_gen = WT_READ_GEN_NOTSET;
    newpage->entries = entries;

    WT_ERR(__wt_page_modify_init(session, newpage));
    if (merge)
        F_SET(newpage->modify, WT_PM_REC_SPLIT_MERGE);
    else
        __wt_page_modify_set(session, newpage);

    *pagep = newpage;
    return (0);

err:    __wt_page_out(session, &newpage);
    return (ret);
}
/* * __wt_delete_page_instantiate -- * Instantiate an entirely deleted row-store leaf page. */ int __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) { WT_BTREE *btree; WT_DECL_RET; WT_PAGE *page; WT_PAGE_DELETED *page_del; WT_UPDATE **upd_array, *upd; size_t size; uint32_t i; btree = S2BT(session); page = ref->page; page_del = ref->page_del; /* * Give the page a modify structure. * * If the tree is already dirty and so will be written, mark the page * dirty. (We'd like to free the deleted pages, but if the handle is * read-only or if the application never modifies the tree, we're not * able to do so.) */ if (btree->modified) { WT_RET(__wt_page_modify_init(session, page)); __wt_page_modify_set(session, page); } /* * An operation is accessing a "deleted" page, and we're building an * in-memory version of the page (making it look like all entries in * the page were individually updated by a remove operation). There * are two cases where we end up here: * * First, a running transaction used a truncate call to delete the page * without reading it, in which case the page reference includes a * structure with a transaction ID; the page we're building might split * in the future, so we update that structure to include references to * all of the update structures we create, so the transaction can abort. * * Second, a truncate call deleted a page and the truncate committed, * but an older transaction in the system forced us to keep the old * version of the page around, then we crashed and recovered, and now * we're being forced to read that page. * * In the first case, we have a page reference structure, in the second * second, we don't. * * Allocate the per-reference update array; in the case of instantiating * a page, deleted by a running transaction that might eventually abort, * we need a list of the update structures so we can do that abort. The * hard case is if a page splits: the update structures might be moved * to different pages, and we still have to find them all for an abort. */ if (page_del != NULL) WT_RET(__wt_calloc_def( session, page->pg_row_entries + 1, &page_del->update_list)); /* Allocate the per-page update array. */ WT_ERR(__wt_calloc_def(session, page->pg_row_entries, &upd_array)); page->pg_row_upd = upd_array; /* * Fill in the per-reference update array with references to update * structures, fill in the per-page update array with references to * deleted items. */ for (i = 0, size = 0; i < page->pg_row_entries; ++i) { WT_ERR(__wt_calloc_one(session, &upd)); WT_UPDATE_DELETED_SET(upd); if (page_del == NULL) upd->txnid = WT_TXN_NONE; /* Globally visible */ else { upd->txnid = page_del->txnid; page_del->update_list[i] = upd; } upd->next = upd_array[i]; upd_array[i] = upd; size += sizeof(WT_UPDATE *) + WT_UPDATE_MEMSIZE(upd); } __wt_cache_page_inmem_incr(session, page, size); return (0); err: /* * There's no need to free the page update structures on error, our * caller will discard the page and do that work for us. We could * similarly leave the per-reference update array alone because it * won't ever be used by any page that's not in-memory, but cleaning * it up makes sense, especially if we come back in to this function * attempting to instantiate this page again. */ if (page_del != NULL) __wt_free(session, page_del->update_list); return (ret); }
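/*
 * A minimal standalone sketch of the instantiation step above: make a wholly
 * deleted page look as if every row had been removed individually by giving
 * each row a tombstone at the head of its update list.  The toy_* types are
 * hypothetical stand-ins for WT_PAGE/WT_UPDATE, not the real structures.
 */
#include <stdbool.h>
#include <stdlib.h>

struct toy_tombstone {
    unsigned long txnid;            /* transaction that removed the row */
    bool deleted;                   /* tombstone marker */
    struct toy_tombstone *next;     /* older update, if any */
};

struct toy_row_page {
    unsigned int entries;           /* rows on the page */
    struct toy_tombstone **upd;     /* per-row update lists */
};

static int
toy_instantiate_deleted(struct toy_row_page *page, unsigned long txnid)
{
    struct toy_tombstone *upd;
    unsigned int i;

    /* Allocate the per-row update array. */
    if ((page->upd = calloc(page->entries, sizeof(*page->upd))) == NULL)
        return (-1);

    /*
     * Prepend one tombstone per row; a txnid of 0 could stand in for
     * "globally visible", mirroring the WT_TXN_NONE case above.
     */
    for (i = 0; i < page->entries; ++i) {
        if ((upd = calloc(1, sizeof(*upd))) == NULL)
            return (-1);
        upd->txnid = txnid;
        upd->deleted = true;
        upd->next = page->upd[i];
        page->upd[i] = upd;
    }
    return (0);
}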
/* * __wt_row_modify -- * Row-store insert, update and delete. */ int __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove) { WT_DECL_RET; WT_INSERT *ins; WT_INSERT_HEAD *ins_head, **ins_headp; WT_ITEM *key, *value; WT_PAGE *page; WT_UPDATE *old_upd, *upd, **upd_entry; size_t ins_size, upd_size; uint32_t ins_slot; u_int i, skipdepth; int logged; key = &cbt->iface.key; value = is_remove ? NULL : &cbt->iface.value; page = cbt->page; /* If we don't yet have a modify structure, we'll need one. */ WT_RET(__wt_page_modify_init(session, page)); ins = NULL; upd = NULL; logged = 0; /* * Modify: allocate an update array as necessary, build a WT_UPDATE * structure, and call a serialized function to insert the WT_UPDATE * structure. * * Insert: allocate an insert array as necessary, build a WT_INSERT * and WT_UPDATE structure pair, and call a serialized function to * insert the WT_INSERT structure. */ if (cbt->compare == 0) { if (cbt->ins == NULL) { /* Allocate an update array as necessary. */ WT_PAGE_ALLOC_AND_SWAP(session, page, page->u.row.upd, upd_entry, page->entries); /* Set the WT_UPDATE array reference. */ upd_entry = &page->u.row.upd[cbt->slot]; } else upd_entry = &cbt->ins->upd; /* Make sure the update can proceed. */ WT_ERR(__wt_txn_update_check(session, old_upd = *upd_entry)); /* Allocate the WT_UPDATE structure and transaction ID. */ WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size)); WT_ERR(__wt_txn_modify(session, cbt, upd)); logged = 1; /* * Point the new WT_UPDATE item to the next element in the list. * If we get it right, the serialization function lock acts as * our memory barrier to flush this write. */ upd->next = old_upd; /* Serialize the update. */ WT_ERR(__wt_update_serial( session, page, upd_entry, &upd, upd_size)); } else { /* * Allocate the insert array as necessary. * * We allocate an additional insert array slot for insert keys * sorting less than any key on the page. The test to select * that slot is baroque: if the search returned the first page * slot, we didn't end up processing an insert list, and the * comparison value indicates the search key was smaller than * the returned slot, then we're using the smallest-key insert * slot. That's hard, so we set a flag. */ WT_PAGE_ALLOC_AND_SWAP(session, page, page->u.row.ins, ins_headp, page->entries + 1); ins_slot = F_ISSET(cbt, WT_CBT_SEARCH_SMALLEST) ? page->entries : cbt->slot; ins_headp = &page->u.row.ins[ins_slot]; /* Allocate the WT_INSERT_HEAD structure as necessary. */ WT_PAGE_ALLOC_AND_SWAP(session, page, *ins_headp, ins_head, 1); ins_head = *ins_headp; /* Choose a skiplist depth for this insert. */ skipdepth = __wt_skip_choose_depth(); /* * Allocate a WT_INSERT/WT_UPDATE pair and transaction ID, and * update the cursor to reference it. */ WT_ERR(__wt_row_insert_alloc( session, key, skipdepth, &ins, &ins_size)); WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size)); ins->upd = upd; ins_size += upd_size; /* * Update the cursor: the WT_INSERT_HEAD might be allocated, * the WT_INSERT was allocated. */ cbt->ins_head = ins_head; cbt->ins = ins; WT_ERR(__wt_txn_modify(session, cbt, upd)); logged = 1; /* * If there was no insert list during the search, the cursor's * information cannot be correct, search couldn't have * initialized it. * * Otherwise, point the new WT_INSERT item's skiplist to the * next elements in the insert list (which we will check are * still valid inside the serialization function). 
* * The serial mutex acts as our memory barrier to flush these * writes before inserting them into the list. */ if (WT_SKIP_FIRST(ins_head) == NULL) for (i = 0; i < skipdepth; i++) { cbt->ins_stack[i] = &ins_head->head[i]; ins->next[i] = cbt->next_stack[i] = NULL; } else for (i = 0; i < skipdepth; i++) ins->next[i] = cbt->next_stack[i]; /* Insert the WT_INSERT structure. */ WT_ERR(__wt_insert_serial( session, page, cbt->ins_head, cbt->ins_stack, &ins, ins_size, skipdepth)); } if (0) { err: /* * Remove the update from the current transaction, so we don't * try to modify it on rollback. */ if (logged) __wt_txn_unmodify(session); __wt_free(session, ins); cbt->ins = NULL; __wt_free(session, upd); } return (ret); }
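/*
 * A minimal standalone sketch of the skiplist-insert pattern used above:
 * pick a random depth, record the per-level insert points (the analogue of
 * cbt->ins_stack[]), and link the new element in under a lock whose release
 * publishes the writes.  All toy_* names are hypothetical; the real code
 * allows lock-free readers and defers the linking to a serialization
 * function.  A caller would initialize the list as
 * "struct toy_skiplist list = { .lock = PTHREAD_MUTEX_INITIALIZER };".
 */
#include <pthread.h>
#include <stdlib.h>

#define TOY_MAXDEPTH 8

struct toy_insert {
    int key;
    struct toy_insert *next[TOY_MAXDEPTH];  /* per-level forward pointers */
};

struct toy_skiplist {
    struct toy_insert *head[TOY_MAXDEPTH];  /* per-level list heads */
    pthread_mutex_t lock;                   /* "serialization" lock */
};

/* Choose a depth: each extra level with probability 1/4. */
static unsigned
toy_choose_depth(void)
{
    unsigned depth;

    for (depth = 1; depth < TOY_MAXDEPTH && (rand() % 4) == 0; ++depth)
        ;
    return (depth);
}

/* Insert a key (duplicates end up adjacent; none are rejected). */
static int
toy_skip_insert(struct toy_skiplist *list, int key)
{
    struct toy_insert *cur, *new, **stack[TOY_MAXDEPTH];
    unsigned depth, i;
    int level;

    if ((new = calloc(1, sizeof(*new))) == NULL)
        return (-1);
    new->key = key;
    depth = toy_choose_depth();

    pthread_mutex_lock(&list->lock);

    /*
     * Walk down from the highest level, recording for each level the
     * address of the pointer the new element must be linked into.
     */
    cur = NULL;
    for (level = TOY_MAXDEPTH - 1; level >= 0; --level) {
        stack[level] = (cur == NULL) ?
            &list->head[level] : &cur->next[level];
        while (*stack[level] != NULL && (*stack[level])->key < key) {
            cur = *stack[level];
            stack[level] = &cur->next[level];
        }
    }

    /* Link the new element into the lowest "depth" levels. */
    for (i = 0; i < depth; ++i) {
        new->next[i] = *stack[i];
        *stack[i] = new;
    }

    /* The unlock is the release barrier that publishes the writes. */
    pthread_mutex_unlock(&list->lock);
    return (0);
}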
/* * __wt_compact -- * Compact a file. */ int __wt_compact(WT_SESSION_IMPL *session, const char *cfg[]) { WT_BM *bm; WT_BTREE *btree; WT_DECL_RET; WT_REF *ref; bool block_manager_begin, skip; WT_UNUSED(cfg); btree = S2BT(session); bm = btree->bm; ref = NULL; block_manager_begin = false; WT_STAT_FAST_DATA_INCR(session, session_compact); /* * Check if compaction might be useful -- the API layer will quit trying * to compact the data source if we make no progress, set a flag if the * block layer thinks compaction is possible. */ WT_RET(bm->compact_skip(bm, session, &skip)); if (skip) return (0); /* * Reviewing in-memory pages requires looking at page reconciliation * results, because we care about where the page is stored now, not * where the page was stored when we first read it into the cache. * We need to ensure we don't race with page reconciliation as it's * writing the page modify information. * * There are three ways we call reconciliation: checkpoints, threads * writing leaf pages (usually in preparation for a checkpoint or if * closing a file), and eviction. * * We're holding the schema lock which serializes with checkpoints. */ WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SCHEMA)); /* * Get the tree handle's flush lock which blocks threads writing leaf * pages. */ __wt_spin_lock(session, &btree->flush_lock); /* Start compaction. */ WT_ERR(bm->compact_start(bm, session)); block_manager_begin = true; /* Walk the tree reviewing pages to see if they should be re-written. */ for (;;) { /* * Pages read for compaction aren't "useful"; don't update the * read generation of pages already in memory, and if a page is * read, set its generation to a low value so it is evicted * quickly. */ WT_ERR(__wt_tree_walk(session, &ref, WT_READ_COMPACT | WT_READ_NO_GEN | WT_READ_WONT_NEED)); if (ref == NULL) break; WT_ERR(__compact_rewrite(session, ref, &skip)); if (skip) continue; session->compact_state = WT_COMPACT_SUCCESS; /* Rewrite the page: mark the page and tree dirty. */ WT_ERR(__wt_page_modify_init(session, ref->page)); __wt_page_modify_set(session, ref->page); WT_STAT_FAST_DATA_INCR(session, btree_compact_rewrite); } err: if (ref != NULL) WT_TRET(__wt_page_release(session, ref, 0)); if (block_manager_begin) WT_TRET(bm->compact_end(bm, session)); /* Unblock threads writing leaf pages. */ __wt_spin_unlock(session, &btree->flush_lock); return (ret); }
/* * __wt_col_modify -- * Column-store delete, insert, and update. */ int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint64_t recno, WT_ITEM *value, WT_UPDATE *upd, int is_remove) { WT_BTREE *btree; WT_DECL_RET; WT_INSERT *ins; WT_INSERT_HEAD *ins_head, **ins_headp; WT_ITEM _value; WT_PAGE *page; WT_UPDATE *old_upd; size_t ins_size, upd_size; u_int i, skipdepth; int append, logged; btree = cbt->btree; ins = NULL; page = cbt->ref->page; append = logged = 0; /* This code expects a remove to have a NULL value. */ if (is_remove) { if (btree->type == BTREE_COL_FIX) { value = &_value; value->data = ""; value->size = 1; } else value = NULL; } else { /* * There's some chance the application specified a record past * the last record on the page. If that's the case, and we're * inserting a new WT_INSERT/WT_UPDATE pair, it goes on the * append list, not the update list. In addition, a recno of 0 * implies an append operation, we're allocating a new row. */ if (recno == 0 || recno > (btree->type == BTREE_COL_VAR ? __col_var_last_recno(page) : __col_fix_last_recno(page))) append = 1; } /* If we don't yet have a modify structure, we'll need one. */ WT_RET(__wt_page_modify_init(session, page)); /* * Delete, insert or update a column-store entry. * * If modifying a previously modified record, create a new WT_UPDATE * entry and have a serialized function link it into an existing * WT_INSERT entry's WT_UPDATE list. * * Else, allocate an insert array as necessary, build a WT_INSERT and * WT_UPDATE structure pair, and call a serialized function to insert * the WT_INSERT structure. */ if (cbt->compare == 0 && cbt->ins != NULL) { /* * If we are restoring updates that couldn't be evicted, the * key must not exist on the new page. */ WT_ASSERT(session, upd == NULL); /* Make sure the update can proceed. */ WT_ERR(__wt_txn_update_check( session, old_upd = cbt->ins->upd)); /* Allocate a WT_UPDATE structure and transaction ID. */ WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size)); WT_ERR(__wt_txn_modify(session, upd)); logged = 1; /* Avoid a data copy in WT_CURSOR.update. */ cbt->modify_update = upd; /* * Point the new WT_UPDATE item to the next element in the list. * If we get it right, the serialization function lock acts as * our memory barrier to flush this write. */ upd->next = old_upd; /* Serialize the update. */ WT_ERR(__wt_update_serial( session, page, &cbt->ins->upd, &upd, upd_size)); } else { /* Allocate the append/update list reference as necessary. */ if (append) { WT_PAGE_ALLOC_AND_SWAP(session, page, page->modify->mod_append, ins_headp, 1); ins_headp = &page->modify->mod_append[0]; } else if (page->type == WT_PAGE_COL_FIX) { WT_PAGE_ALLOC_AND_SWAP(session, page, page->modify->mod_update, ins_headp, 1); ins_headp = &page->modify->mod_update[0]; } else { WT_PAGE_ALLOC_AND_SWAP(session, page, page->modify->mod_update, ins_headp, page->pg_var_entries); ins_headp = &page->modify->mod_update[cbt->slot]; } /* Allocate the WT_INSERT_HEAD structure as necessary. */ WT_PAGE_ALLOC_AND_SWAP(session, page, *ins_headp, ins_head, 1); ins_head = *ins_headp; /* Choose a skiplist depth for this insert. */ skipdepth = __wt_skip_choose_depth(session); /* * Allocate a WT_INSERT/WT_UPDATE pair and transaction ID, and * update the cursor to reference it (the WT_INSERT_HEAD might * be allocated, the WT_INSERT was allocated). 
*/ WT_ERR(__col_insert_alloc( session, recno, skipdepth, &ins, &ins_size)); cbt->ins_head = ins_head; cbt->ins = ins; if (upd == NULL) { WT_ERR( __wt_update_alloc(session, value, &upd, &upd_size)); WT_ERR(__wt_txn_modify(session, upd)); logged = 1; /* Avoid a data copy in WT_CURSOR.update. */ cbt->modify_update = upd; } else upd_size = __wt_update_list_memsize(upd); ins->upd = upd; ins_size += upd_size; /* * If there was no insert list during the search, or there was * no search because the record number has not been allocated * yet, the cursor's information cannot be correct, search * couldn't have initialized it. * * Otherwise, point the new WT_INSERT item's skiplist to the * next elements in the insert list (which we will check are * still valid inside the serialization function). * * The serial mutex acts as our memory barrier to flush these * writes before inserting them into the list. */ if (WT_SKIP_FIRST(ins_head) == NULL || recno == 0) for (i = 0; i < skipdepth; i++) { cbt->ins_stack[i] = &ins_head->head[i]; ins->next[i] = cbt->next_stack[i] = NULL; } else for (i = 0; i < skipdepth; i++) ins->next[i] = cbt->next_stack[i]; /* Append or insert the WT_INSERT structure. */ if (append) WT_ERR(__wt_col_append_serial( session, page, cbt->ins_head, cbt->ins_stack, &ins, ins_size, &cbt->recno, skipdepth)); else WT_ERR(__wt_insert_serial( session, page, cbt->ins_head, cbt->ins_stack, &ins, ins_size, skipdepth)); } /* If the update was successful, add it to the in-memory log. */ if (logged) WT_ERR(__wt_txn_log_op(session, cbt)); if (0) { err: /* * Remove the update from the current transaction, so we don't * try to modify it on rollback. */ if (logged) __wt_txn_unmodify(session); __wt_free(session, ins); __wt_free(session, upd); } return (ret); }
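/*
 * A minimal standalone sketch of the update-list handling above: each record
 * keeps a newest-first chain of updates, and a new update is published by
 * setting its next pointer before swapping the list head, all inside a
 * serialized section whose unlock flushes both writes.  The toy_* names and
 * the single global mutex are hypothetical simplifications, not the real
 * WT_UPDATE machinery.
 */
#include <pthread.h>
#include <stdlib.h>
#include <string.h>

struct toy_update {
    unsigned long txnid;
    char *value;                /* NULL means "deleted" */
    struct toy_update *next;    /* next-older update */
};

static pthread_mutex_t toy_serial_lock = PTHREAD_MUTEX_INITIALIZER;

static int
toy_update_add(struct toy_update **headp, unsigned long txnid,
    const char *value)
{
    struct toy_update *upd;

    if ((upd = calloc(1, sizeof(*upd))) == NULL)
        return (-1);
    upd->txnid = txnid;
    if (value != NULL && (upd->value = strdup(value)) == NULL) {
        free(upd);
        return (-1);
    }

    pthread_mutex_lock(&toy_serial_lock);
    upd->next = *headp;     /* point at the previous newest update */
    *headp = upd;           /* publish the new head */
    pthread_mutex_unlock(&toy_serial_lock);
    return (0);
}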
/* * __wt_btcur_prev -- * Move to the previous record in the tree. */ int __wt_btcur_prev(WT_CURSOR_BTREE *cbt, int discard) { WT_DECL_RET; WT_PAGE *page; WT_SESSION_IMPL *session; uint32_t flags; int newpage; session = (WT_SESSION_IMPL *)cbt->iface.session; WT_DSTAT_INCR(session, cursor_prev); flags = WT_TREE_SKIP_INTL | WT_TREE_PREV; /* Tree walk flags. */ if (discard) LF_SET(WT_TREE_DISCARD); retry: WT_RET(__cursor_func_init(cbt, 0)); __cursor_position_clear(cbt); /* * If we aren't already iterating in the right direction, there's * some setup to do. */ if (!F_ISSET(cbt, WT_CBT_ITERATE_PREV)) __wt_btcur_iterate_setup(cbt, 0); /* * If this is a modification, we're about to read information from the * page, save the write generation. */ page = cbt->page; if (discard && page != NULL) { WT_ERR(__wt_page_modify_init(session, page)); WT_ORDERED_READ(cbt->write_gen, page->modify->write_gen); } /* * Walk any page we're holding until the underlying call returns not- * found. Then, move to the previous page, until we reach the start * of the file. */ for (newpage = 0;; newpage = 1) { if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) { switch (page->type) { case WT_PAGE_COL_FIX: ret = __cursor_fix_append_prev(cbt, newpage); break; case WT_PAGE_COL_VAR: ret = __cursor_var_append_prev(cbt, newpage); break; WT_ILLEGAL_VALUE_ERR(session); } if (ret == 0) break; F_CLR(cbt, WT_CBT_ITERATE_APPEND); if (ret != WT_NOTFOUND) break; newpage = 1; } if (page != NULL) { switch (page->type) { case WT_PAGE_COL_FIX: ret = __cursor_fix_prev(cbt, newpage); break; case WT_PAGE_COL_VAR: ret = __cursor_var_prev(cbt, newpage); break; case WT_PAGE_ROW_LEAF: ret = __cursor_row_prev(cbt, newpage); break; WT_ILLEGAL_VALUE_ERR(session); } if (ret != WT_NOTFOUND) break; } cbt->page = NULL; WT_ERR(__wt_tree_walk(session, &page, flags)); WT_ERR_TEST(page == NULL, WT_NOTFOUND); WT_ASSERT(session, page->type != WT_PAGE_COL_INT && page->type != WT_PAGE_ROW_INT); cbt->page = page; /* Initialize the page's modification information */ if (discard) { WT_ERR(__wt_page_modify_init(session, page)); WT_ORDERED_READ( cbt->write_gen, page->modify->write_gen); } /* * The last page in a column-store has appended entries. * We handle it separately from the usual cursor code: * it's only that one page and it's in a simple format. */ if (page->type != WT_PAGE_ROW_LEAF && (cbt->ins_head = WT_COL_APPEND(page)) != NULL) F_SET(cbt, WT_CBT_ITERATE_APPEND); } err: if (ret == WT_RESTART) goto retry; WT_TRET(__cursor_func_resolve(cbt, ret)); return (ret); }
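/*
 * A minimal standalone sketch of the traversal shape above: keep returning
 * values from the page the cursor already holds, and only when that page is
 * exhausted step to the previous page, until the start of the tree.  The
 * toy_* types are hypothetical; the caller is assumed to position
 * cursor->page at the last leaf before the first call.
 */
#include <stdbool.h>

struct toy_leaf {
    int *values;
    unsigned int entries;
    struct toy_leaf *prev;      /* previous leaf in key order */
};

struct toy_cursor {
    struct toy_leaf *page;      /* current page */
    unsigned int slot;          /* slot of the last value returned */
    bool positioned;
};

static bool
toy_cursor_prev(struct toy_cursor *cbt, int *valuep)
{
    for (;;) {
        /* Keep walking the page the cursor already holds. */
        if (cbt->positioned && cbt->slot > 0) {
            *valuep = cbt->page->values[--cbt->slot];
            return (true);
        }
        /* First visit to this page: start from its last entry. */
        if (!cbt->positioned && cbt->page->entries > 0) {
            cbt->slot = cbt->page->entries - 1;
            *valuep = cbt->page->values[cbt->slot];
            cbt->positioned = true;
            return (true);
        }
        /* This page is exhausted: move to the previous page. */
        if (cbt->page->prev == NULL)
            return (false);
        cbt->page = cbt->page->prev;
        cbt->positioned = false;
    }
}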
/* * __wt_col_modify -- * Column-store delete, insert, and update. */ int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int op) { WT_BTREE *btree; WT_DECL_RET; WT_INSERT *ins, *ins_copy; WT_INSERT_HEAD **inshead, *new_inshead, **new_inslist; WT_ITEM *value, _value; WT_PAGE *page; WT_UPDATE *old_upd, *upd, *upd_obsolete; size_t ins_size, new_inshead_size, new_inslist_size, upd_size; uint64_t recno; u_int skipdepth; int i, logged; btree = cbt->btree; page = cbt->page; recno = cbt->iface.recno; logged = 0; WT_ASSERT(session, op != 1); switch (op) { case 2: /* Remove */ if (btree->type == BTREE_COL_FIX) { value = &_value; value->data = ""; value->size = 1; } else value = NULL; break; case 3: /* Insert/Update */ default: value = &cbt->iface.value; /* * There's some chance the application specified a record past * the last record on the page. If that's the case, and we're * inserting a new WT_INSERT/WT_UPDATE pair, it goes on the * append list, not the update list. */ if (recno == 0 || recno > __col_last_recno(page)) op = 1; break; } /* If we don't yet have a modify structure, we'll need one. */ WT_RET(__wt_page_modify_init(session, page)); ins = NULL; new_inshead = NULL; new_inslist = NULL; upd = NULL; /* * Delete, insert or update a column-store entry. * * If modifying a previously modified record, create a new WT_UPDATE * entry and have a serialized function link it into an existing * WT_INSERT entry's WT_UPDATE list. * * Else, allocate an insert array as necessary, build a WT_INSERT and * WT_UPDATE structure pair, and call a serialized function to insert * the WT_INSERT structure. */ if (cbt->compare == 0 && cbt->ins != NULL) { /* Make sure the update can proceed. */ WT_ERR( __wt_update_check(session, page, old_upd = cbt->ins->upd)); /* Allocate the WT_UPDATE structure and transaction ID. */ WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size)); WT_ERR(__wt_txn_modify(session, &upd->txnid)); logged = 1; /* Serialize the update. */ WT_ERR(__wt_update_serial(session, page, cbt->write_gen, &cbt->ins->upd, old_upd, NULL, 0, &upd, upd_size, &upd_obsolete)); /* Discard any obsolete WT_UPDATE structures. */ if (upd_obsolete != NULL) __wt_update_obsolete_free(session, page, upd_obsolete); } else { /* Make sure the update can proceed. */ WT_ERR(__wt_update_check(session, page, NULL)); /* There may be no insert list, allocate as necessary. */ new_inshead_size = new_inslist_size = 0; if (op == 1) { if (page->modify->append == NULL) { new_inslist_size = 1 * sizeof(WT_INSERT_HEAD *); WT_ERR( __wt_calloc_def(session, 1, &new_inslist)); inshead = &new_inslist[0]; } else inshead = &page->modify->append[0]; cbt->ins_head = *inshead; } else if (page->type == WT_PAGE_COL_FIX) { if (page->modify->update == NULL) { new_inslist_size = 1 * sizeof(WT_INSERT_HEAD *); WT_ERR( __wt_calloc_def(session, 1, &new_inslist)); inshead = &new_inslist[0]; } else inshead = &page->modify->update[0]; } else { if (page->modify->update == NULL) { new_inslist_size = page->entries * sizeof(WT_INSERT_HEAD *); WT_ERR(__wt_calloc_def( session, page->entries, &new_inslist)); inshead = &new_inslist[cbt->slot]; } else inshead = &page->modify->update[cbt->slot]; } /* There may be no WT_INSERT list, allocate as necessary. 
*/ if (*inshead == NULL) { new_inshead_size = sizeof(WT_INSERT_HEAD); WT_ERR(__wt_calloc_def(session, 1, &new_inshead)); for (i = 0; i < WT_SKIP_MAXDEPTH; i++) { cbt->ins_stack[i] = &new_inshead->head[i]; cbt->next_stack[i] = NULL; } cbt->ins_head = new_inshead; } /* Choose a skiplist depth for this insert. */ skipdepth = __wt_skip_choose_depth(); /* * Allocate a WT_INSERT/WT_UPDATE pair and transaction ID, and * update the cursor to reference it. */ WT_ERR(__col_insert_alloc( session, recno, skipdepth, &ins, &ins_size)); WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size)); WT_ERR(__wt_txn_modify(session, &upd->txnid)); logged = 1; ins->upd = upd; ins_size += upd_size; cbt->ins = ins; /* Insert or append the WT_INSERT structure. */ if (op == 1) { /* * The serialized function clears ins: take a copy of * the pointer so we can look up the record number. */ ins_copy = ins; WT_ERR(__wt_col_append_serial(session, page, cbt->write_gen, inshead, cbt->ins_stack, cbt->next_stack, &new_inslist, new_inslist_size, &new_inshead, new_inshead_size, &ins, ins_size, skipdepth)); /* Put the new recno into the cursor. */ cbt->recno = WT_INSERT_RECNO(ins_copy); } else WT_ERR(__wt_insert_serial(session, page, cbt->write_gen, inshead, cbt->ins_stack, cbt->next_stack, &new_inslist, new_inslist_size, &new_inshead, new_inshead_size, &ins, ins_size, skipdepth)); } if (0) { err: /* * Remove the update from the current transaction, so we don't * try to modify it on rollback. */ if (logged) __wt_txn_unmodify(session); __wt_free(session, ins); __wt_free(session, upd); } __wt_free(session, new_inslist); __wt_free(session, new_inshead); return (ret); }
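/*
 * A minimal standalone sketch of the "allocate as necessary" pattern the
 * WT_PAGE_ALLOC_AND_SWAP calls above rely on: the per-page array is created
 * lazily by whichever thread needs it first, installed with a compare-and-
 * swap, and the losing thread frees its copy and uses the winner's.  The
 * toy_* types are hypothetical stand-ins, using C11 atomics rather than the
 * WiredTiger macros.
 */
#include <stdatomic.h>
#include <stdlib.h>

struct toy_insert_head {
    int unused;
};

struct toy_page_modify {
    unsigned int entries;
    /* Per-slot insert lists; the array is allocated only when needed. */
    _Atomic(struct toy_insert_head **) update;
};

static struct toy_insert_head **
toy_alloc_and_swap(struct toy_page_modify *mod)
{
    struct toy_insert_head **newa, **expected;

    if ((newa = atomic_load(&mod->update)) != NULL)
        return (newa);

    if ((newa = calloc(mod->entries, sizeof(*newa))) == NULL)
        return (NULL);

    expected = NULL;
    if (!atomic_compare_exchange_strong(&mod->update, &expected, newa)) {
        /* Another thread beat us to it: discard our copy. */
        free(newa);
        newa = expected;
    }
    return (newa);
}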
/*
 * __wt_compact --
 *	Compact a file.
 */
int
__wt_compact(WT_SESSION_IMPL *session, const char *cfg[])
{
    WT_BM *bm;
    WT_BTREE *btree;
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_REF *ref;
    int block_manager_begin, evict_reset, skip;

    WT_UNUSED(cfg);

    conn = S2C(session);
    btree = S2BT(session);
    bm = btree->bm;
    ref = NULL;
    block_manager_begin = 0;

    WT_STAT_FAST_DATA_INCR(session, session_compact);

    /*
     * Ask the block manager whether the file's blocks can usefully be
     * compacted; if not, return immediately.
     */
    WT_RET(bm->compact_skip(bm, session, &skip));
    if (skip)
        return (0);

    /*
     * Reviewing in-memory pages requires looking at page reconciliation
     * results, because we care about where the page is stored now, not
     * where the page was stored when we first read it into the cache.
     * We need to ensure we don't race with page reconciliation as it's
     * writing the page modify information.
     *
     * There are three ways we call reconciliation: checkpoints, threads
     * writing leaf pages (usually in preparation for a checkpoint or if
     * closing a file), and eviction.
     *
     * We're holding the schema lock which serializes with checkpoints.
     */
    WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED));

    /*
     * Acquire the btree flush_lock so no other thread writes leaf pages
     * while the file space is being compacted.
     */
    __wt_spin_lock(session, &btree->flush_lock);

    conn->compact_in_memory_pass = 1;
    WT_ERR(__wt_evict_file_exclusive_on(session, &evict_reset));
    if (evict_reset)
        __wt_evict_file_exclusive_off(session);

    /* Start compaction. */
    WT_ERR(bm->compact_start(bm, session));
    block_manager_begin = 1;

    session->compaction = 1;
    for (;;) {
        WT_ERR(__wt_tree_walk(session, &ref, NULL,
            WT_READ_COMPACT | WT_READ_NO_GEN | WT_READ_WONT_NEED));
        if (ref == NULL)
            break;

        /* Decide whether this page should be rewritten for compaction. */
        WT_ERR(__compact_rewrite(session, ref, &skip));
        if (skip)
            continue;

        /*
         * Mark pages that need compaction dirty; the rewritten version
         * is written back when the page is reconciled or evicted from
         * memory.
         */
        WT_ERR(__wt_page_modify_init(session, ref->page));
        __wt_page_modify_set(session, ref->page);

        WT_STAT_FAST_DATA_INCR(session, btree_compact_rewrite);
    }

err:    if (ref != NULL)
        WT_TRET(__wt_page_release(session, ref, 0));

    /* End the compaction pass. */
    if (block_manager_begin)
        WT_TRET(bm->compact_end(bm, session));

    /*
     * Unlock will be a release barrier, use it to update the compaction
     * status for reconciliation.
     */
    conn->compact_in_memory_pass = 0;
    __wt_spin_unlock(session, &btree->flush_lock);

    return (ret);
}
/* * __wt_col_modify -- * Column-store delete, insert, and update. */ int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint64_t recno, const WT_ITEM *value, WT_UPDATE *upd_arg, u_int modify_type, bool exclusive) { static const WT_ITEM col_fix_remove = { "", 1, NULL, 0, 0 }; WT_BTREE *btree; WT_DECL_RET; WT_INSERT *ins; WT_INSERT_HEAD *ins_head, **ins_headp; WT_PAGE *page; WT_PAGE_MODIFY *mod; WT_UPDATE *old_upd, *upd; size_t ins_size, upd_size; u_int i, skipdepth; bool append, logged; btree = cbt->btree; ins = NULL; page = cbt->ref->page; upd = upd_arg; append = logged = false; if (upd_arg == NULL) { if (modify_type == WT_UPDATE_RESERVE || modify_type == WT_UPDATE_TOMBSTONE) { /* * Fixed-size column-store doesn't have on-page deleted * values, it's a nul byte. */ if (modify_type == WT_UPDATE_TOMBSTONE && btree->type == BTREE_COL_FIX) { modify_type = WT_UPDATE_STANDARD; value = &col_fix_remove; } } /* * There's a chance the application specified a record past the * last record on the page. If that's the case and we're * inserting a new WT_INSERT/WT_UPDATE pair, it goes on the * append list, not the update list. Also, an out-of-band recno * implies an append operation, we're allocating a new row. * Ignore any information obtained from the search. */ WT_ASSERT(session, recno != WT_RECNO_OOB || cbt->compare != 0); if (cbt->compare != 0 && (recno == WT_RECNO_OOB || recno > (btree->type == BTREE_COL_VAR ? __col_var_last_recno(cbt->ref) : __col_fix_last_recno(cbt->ref)))) { append = true; cbt->ins = NULL; cbt->ins_head = NULL; } } /* We're going to modify the page, we should have loaded history. */ WT_ASSERT(session, cbt->ref->state != WT_REF_LIMBO); /* If we don't yet have a modify structure, we'll need one. */ WT_RET(__wt_page_modify_init(session, page)); mod = page->modify; /* * If modifying a record not previously modified, but which is in the * same update slot as a previously modified record, cursor.ins will * not be set because there's no list of update records for this recno, * but cursor.ins_head will be set to point to the correct update slot. * Acquire the necessary insert information, then create a new update * entry and link it into the existing list. We get here if a page has * a single cell representing multiple records (the records have the * same value), and then a record in the cell is updated or removed, * creating the update list for the cell, and then a cursor iterates * into that same cell to update/remove a different record. We find the * correct slot in the update array, but we don't find an update list * (because it doesn't exist), and don't have the information we need * to do the insert. Normally, we wouldn't care (we could fail and do * a search for the record which would configure everything for the * insert), but range truncation does this pattern for every record in * the cell, and the performance is terrible. For that reason, catch it * here. */ if (cbt->ins == NULL && cbt->ins_head != NULL) { cbt->ins = __col_insert_search( cbt->ins_head, cbt->ins_stack, cbt->next_stack, recno); if (cbt->ins != NULL) { if (WT_INSERT_RECNO(cbt->ins) == recno) cbt->compare = 0; else { /* * The test below is for cursor.compare set to 0 * and cursor.ins set: cursor.compare wasn't set * by the search we just did, and has an unknown * value. Clear cursor.ins to avoid the test. */ cbt->ins = NULL; } } } /* * Delete, insert or update a column-store entry. * * If modifying a previously modified record, cursor.ins will be set to * point to the correct update list. 
Create a new update entry and link * it into the existing list. * * Else, allocate an insert array as necessary, build an insert/update * structure pair, and link it into place. */ if (cbt->compare == 0 && cbt->ins != NULL) { /* * If we are restoring updates that couldn't be evicted, the * key must not exist on the new page. */ WT_ASSERT(session, upd_arg == NULL); /* Make sure the update can proceed. */ WT_ERR(__wt_txn_update_check(session, old_upd = cbt->ins->upd)); /* Allocate a WT_UPDATE structure and transaction ID. */ WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size, modify_type)); WT_ERR(__wt_txn_modify(session, upd)); logged = true; /* Avoid a data copy in WT_CURSOR.update. */ cbt->modify_update = upd; /* * Point the new WT_UPDATE item to the next element in the list. * If we get it right, the serialization function lock acts as * our memory barrier to flush this write. */ upd->next = old_upd; /* Serialize the update. */ WT_ERR(__wt_update_serial( session, page, &cbt->ins->upd, &upd, upd_size, false)); } else { /* Allocate the append/update list reference as necessary. */ if (append) { WT_PAGE_ALLOC_AND_SWAP(session, page, mod->mod_col_append, ins_headp, 1); ins_headp = &mod->mod_col_append[0]; } else if (page->type == WT_PAGE_COL_FIX) { WT_PAGE_ALLOC_AND_SWAP(session, page, mod->mod_col_update, ins_headp, 1); ins_headp = &mod->mod_col_update[0]; } else { WT_PAGE_ALLOC_AND_SWAP(session, page, mod->mod_col_update, ins_headp, page->entries); ins_headp = &mod->mod_col_update[cbt->slot]; } /* Allocate the WT_INSERT_HEAD structure as necessary. */ WT_PAGE_ALLOC_AND_SWAP(session, page, *ins_headp, ins_head, 1); ins_head = *ins_headp; /* Choose a skiplist depth for this insert. */ skipdepth = __wt_skip_choose_depth(session); /* * Allocate a WT_INSERT/WT_UPDATE pair and transaction ID, and * update the cursor to reference it (the WT_INSERT_HEAD might * be allocated, the WT_INSERT was allocated). */ WT_ERR(__col_insert_alloc( session, recno, skipdepth, &ins, &ins_size)); cbt->ins_head = ins_head; cbt->ins = ins; /* * Check for insert split and checkpoint races in column-store: * it's easy (as opposed to in row-store) and a difficult bug to * otherwise diagnose. */ WT_ASSERT(session, mod->mod_col_split_recno == WT_RECNO_OOB || (recno != WT_RECNO_OOB && mod->mod_col_split_recno > recno)); if (upd_arg == NULL) { WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size, modify_type)); WT_ERR(__wt_txn_modify(session, upd)); logged = true; /* Avoid a data copy in WT_CURSOR.update. */ cbt->modify_update = upd; } else upd_size = __wt_update_list_memsize(upd); ins->upd = upd; ins_size += upd_size; /* * If there was no insert list during the search, or there was * no search because the record number has not been allocated * yet, the cursor's information cannot be correct, search * couldn't have initialized it. * * Otherwise, point the new WT_INSERT item's skiplist to the * next elements in the insert list (which we will check are * still valid inside the serialization function). * * The serial mutex acts as our memory barrier to flush these * writes before inserting them into the list. */ if (cbt->ins_stack[0] == NULL || recno == WT_RECNO_OOB) for (i = 0; i < skipdepth; i++) { cbt->ins_stack[i] = &ins_head->head[i]; ins->next[i] = cbt->next_stack[i] = NULL; } else for (i = 0; i < skipdepth; i++) ins->next[i] = cbt->next_stack[i]; /* Append or insert the WT_INSERT structure. 
*/ if (append) WT_ERR(__wt_col_append_serial( session, page, cbt->ins_head, cbt->ins_stack, &ins, ins_size, &cbt->recno, skipdepth, exclusive)); else WT_ERR(__wt_insert_serial( session, page, cbt->ins_head, cbt->ins_stack, &ins, ins_size, skipdepth, exclusive)); } /* If the update was successful, add it to the in-memory log. */ if (logged && modify_type != WT_UPDATE_RESERVE) { WT_ERR(__wt_txn_log_op(session, cbt)); /* * In case of append, the recno (key) for the value is assigned * now. Set the recno in the transaction operation to be used * incase this transaction is prepared to retrieve the update * corresponding to this operation. */ __wt_txn_op_set_recno(session, cbt->recno); } if (0) { err: /* * Remove the update from the current transaction, so we don't * try to modify it on rollback. */ if (logged) __wt_txn_unmodify(session); __wt_free(session, ins); if (upd_arg == NULL) __wt_free(session, upd); } return (ret); }
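/*
 * A minimal standalone sketch of the append path noted above: the record
 * number is only assigned inside the serialized section, then handed back
 * so the cursor (and the transaction's operation record) can be updated
 * with the key that was actually allocated.  The toy_* type and single
 * mutex are hypothetical simplifications of the append serialization.
 */
#include <pthread.h>

struct toy_col_append {
    pthread_mutex_t lock;
    unsigned long last_recno;   /* highest record number in use */
};

static unsigned long
toy_col_append(struct toy_col_append *app)
{
    unsigned long recno;

    pthread_mutex_lock(&app->lock);
    recno = ++app->last_recno;  /* allocate the new row's key */
    /* ...link the new insert structure into the append list here... */
    pthread_mutex_unlock(&app->lock);

    return (recno);
}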
/*
 * __wt_compact_evict --
 *	Helper routine to decide if a file's size would benefit from
 * re-writing this page.
 */
int
__wt_compact_evict(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    WT_BM *bm;
    WT_PAGE_MODIFY *mod;
    int skip;
    uint32_t addr_size;
    const uint8_t *addr;

    bm = S2BT(session)->bm;
    mod = page->modify;

    /*
     * We have to review page reconciliation information as an in-memory
     * page's original disk addresses might have been fine for compaction
     * but its replacement addresses might be a problem.  To review page
     * reconciliation information, we have to lock out both eviction and
     * checkpoints, as those are the other two operations that can write
     * a page.
     *
     * Ignore the root: it may not have a replacement address, and besides,
     * if anything else gets written, so will it.
     */
    if (WT_PAGE_IS_ROOT(page))
        return (0);

    /*
     * If the page is already dirty, skip some work, it will be written in
     * any case.
     */
    if (__wt_page_is_modified(page))
        return (0);

    /*
     * If the page is clean, test the original addresses.
     * If the page is a 1-to-1 replacement, test the replacement addresses.
     * If the page is a split, ignore it, it will be merged into the parent.
     */
    if (mod == NULL)
        goto disk;

    switch (F_ISSET(mod, WT_PM_REC_MASK)) {
    case 0:
disk:       __wt_get_addr(page->parent, page->ref, &addr, &addr_size);
        if (addr == NULL)
            return (0);
        WT_RET(
            bm->compact_page_skip(bm, session, addr, addr_size, &skip));
        if (skip)
            return (0);
        break;
    case WT_PM_REC_EMPTY:
        return (0);
    case WT_PM_REC_REPLACE:
        WT_RET(bm->compact_page_skip(bm, session,
            mod->u.replace.addr, mod->u.replace.size, &skip));
        if (skip)
            return (0);
        break;
    case WT_PM_REC_SPLIT:
    case WT_PM_REC_SPLIT_MERGE:
        return (0);
    }

    /* Mark the page and tree dirty, we want to write this page. */
    WT_RET(__wt_page_modify_init(session, page));
    __wt_page_and_tree_modify_set(session, page);

    WT_DSTAT_INCR(session, btree_compact_rewrite);
    return (0);
}
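/*
 * A minimal standalone sketch of the decision above: only clean, non-root
 * pages whose current blocks (original or one-to-one replacement) sit in the
 * region the block layer wants to reclaim are worth dirtying.  The toy_*
 * names, the enum, and the block_helps flag are hypothetical stand-ins for
 * the reconciliation state and the compact_page_skip() answer.
 */
#include <stdbool.h>

enum toy_rec_result {
    TOY_REC_NONE,       /* never reconciled: use the original address */
    TOY_REC_EMPTY,      /* reconciled to nothing */
    TOY_REC_REPLACE,    /* reconciled to a single replacement block */
    TOY_REC_SPLIT       /* reconciled into multiple blocks */
};

struct toy_page_state {
    bool is_root;
    bool is_dirty;
    enum toy_rec_result rec;
    bool block_helps;   /* block layer says rewriting this block helps */
};

static bool
toy_compact_should_rewrite(const struct toy_page_state *p)
{
    /* Root and dirty pages will be written anyway. */
    if (p->is_root || p->is_dirty)
        return (false);

    switch (p->rec) {
    case TOY_REC_NONE:
    case TOY_REC_REPLACE:
        return (p->block_helps);
    case TOY_REC_EMPTY:
    case TOY_REC_SPLIT:
    default:
        return (false);
    }
}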
/*
 * __wt_compact --
 *	Compact a file.
 */
int
__wt_compact(WT_SESSION_IMPL *session)
{
    WT_BM *bm;
    WT_BTREE *btree;
    WT_DECL_RET;
    WT_REF *ref;
    u_int i;
    bool skip;

    btree = S2BT(session);
    bm = btree->bm;
    ref = NULL;

    WT_STAT_DATA_INCR(session, session_compact);

    /*
     * Check if compaction might be useful -- the API layer will quit trying
     * to compact the data source if we make no progress, set a flag if the
     * block layer thinks compaction is possible.
     */
    WT_RET(bm->compact_skip(bm, session, &skip));
    if (skip)
        return (0);

    /*
     * Reviewing in-memory pages requires looking at page reconciliation
     * results, because we care about where the page is stored now, not
     * where the page was stored when we first read it into the cache.
     * We need to ensure we don't race with page reconciliation as it's
     * writing the page modify information.
     *
     * There are two ways we call reconciliation: checkpoints and eviction.
     * Get the tree's flush lock which blocks threads writing pages for
     * checkpoints.
     */
    __wt_spin_lock(session, &btree->flush_lock);

    /* Walk the tree reviewing pages to see if they should be re-written. */
    for (i = 0;;) {
        /* Periodically check if we've run out of time. */
        if (++i > 100) {
            WT_ERR(__wt_session_compact_check_timeout(session));
            i = 0;
        }

        /*
         * Pages read for compaction aren't "useful"; don't update the
         * read generation of pages already in memory, and if a page is
         * read, set its generation to a low value so it is evicted
         * quickly.
         */
        WT_ERR(__wt_tree_walk(session, &ref,
            WT_READ_COMPACT | WT_READ_NO_GEN | WT_READ_WONT_NEED));
        if (ref == NULL)
            break;

        WT_ERR(__compact_rewrite(session, ref, &skip));
        if (skip)
            continue;
        session->compact_state = WT_COMPACT_SUCCESS;

        /* Rewrite the page: mark the page and tree dirty. */
        WT_ERR(__wt_page_modify_init(session, ref->page));
        __wt_page_modify_set(session, ref->page);

        WT_STAT_DATA_INCR(session, btree_compact_rewrite);
    }

err:    if (ref != NULL)
        WT_TRET(__wt_page_release(session, ref, 0));

    /* Unblock threads writing leaf pages. */
    __wt_spin_unlock(session, &btree->flush_lock);

    return (ret);
}
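/*
 * A minimal standalone sketch of the periodic timeout check above: consult
 * the wall clock only every N iterations so the time lookup doesn't dominate
 * a tight loop.  The helper name and the errno-style return are hypothetical,
 * not the WiredTiger session-compact API.
 */
#include <errno.h>
#include <time.h>

static int
toy_check_timeout(time_t deadline, unsigned int *counterp)
{
    if (++*counterp < 100)
        return (0);
    *counterp = 0;
    return (time(NULL) > deadline ? ETIMEDOUT : 0);
}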
/* * __wt_compact -- * Compact a file. */ int __wt_compact(WT_SESSION_IMPL *session, const char *cfg[]) { WT_BM *bm; WT_BTREE *btree; WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_REF *ref; int block_manager_begin, evict_reset, skip; WT_UNUSED(cfg); conn = S2C(session); btree = S2BT(session); bm = btree->bm; ref = NULL; block_manager_begin = 0; WT_STAT_FAST_DATA_INCR(session, session_compact); /* * Check if compaction might be useful -- the API layer will quit trying * to compact the data source if we make no progress, set a flag if the * block layer thinks compaction is possible. */ WT_RET(bm->compact_skip(bm, session, &skip)); if (skip) return (0); /* * Reviewing in-memory pages requires looking at page reconciliation * results, because we care about where the page is stored now, not * where the page was stored when we first read it into the cache. * We need to ensure we don't race with page reconciliation as it's * writing the page modify information. * * There are three ways we call reconciliation: checkpoints, threads * writing leaf pages (usually in preparation for a checkpoint or if * closing a file), and eviction. * * We're holding the schema lock which serializes with checkpoints. */ WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED)); /* * Get the tree handle's flush lock which blocks threads writing leaf * pages. */ __wt_spin_lock(session, &btree->flush_lock); /* * That leaves eviction, we don't want to block eviction. Set a flag * so reconciliation knows compaction is running. If reconciliation * sees the flag it locks the page it's writing, we acquire the same * lock when reading the page's modify information, serializing access. * The same page lock blocks work on the page, but compaction is an * uncommon, heavy-weight operation. If it's ever a problem, there's * no reason we couldn't use an entirely separate lock than the page * lock. * * We also need to ensure we don't race with an on-going reconciliation. * After we set the flag, wait for eviction of this file to drain, and * then let eviction continue; */ conn->compact_in_memory_pass = 1; WT_ERR(__wt_evict_file_exclusive_on(session, &evict_reset)); if (evict_reset) __wt_evict_file_exclusive_off(session); /* Start compaction. */ WT_ERR(bm->compact_start(bm, session)); block_manager_begin = 1; /* Walk the tree reviewing pages to see if they should be re-written. */ session->compaction = 1; for (;;) { /* * Pages read for compaction aren't "useful"; don't update the * read generation of pages already in memory, and if a page is * read, set its generation to a low value so it is evicted * quickly. */ WT_ERR(__wt_tree_walk(session, &ref, NULL, WT_READ_COMPACT | WT_READ_NO_GEN | WT_READ_WONT_NEED)); if (ref == NULL) break; WT_ERR(__compact_rewrite(session, ref, &skip)); if (skip) continue; /* Rewrite the page: mark the page and tree dirty. */ WT_ERR(__wt_page_modify_init(session, ref->page)); __wt_page_modify_set(session, ref->page); WT_STAT_FAST_DATA_INCR(session, btree_compact_rewrite); } err: if (ref != NULL) WT_TRET(__wt_page_release(session, ref, 0)); if (block_manager_begin) WT_TRET(bm->compact_end(bm, session)); /* * Unlock will be a release barrier, use it to update the compaction * status for reconciliation. */ conn->compact_in_memory_pass = 0; __wt_spin_unlock(session, &btree->flush_lock); return (ret); }