/*
 * __wt_compact --
 *	Compact a file: decide whether compaction is worthwhile, then mark
 *	rewritable pages (both in-memory and on-disk) dirty so they are
 *	written to more compactable locations.
 *
 *	Returns 0 on success or an error code from configuration parsing,
 *	the block manager, or the tree walk.
 */
int
__wt_compact(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_BM *bm;
	WT_CONFIG_ITEM cval;
	WT_DECL_RET;
	WT_PAGE *page;
	int trigger, skip;

	bm = S2BT(session)->bm;

	WT_DSTAT_INCR(session, session_compact);

	/* The "trigger" configuration sets the compaction threshold. */
	WT_RET(__wt_config_gets(session, cfg, "trigger", &cval));
	trigger = (int)cval.val;

	/* Check if compaction might be useful. */
	WT_RET(bm->compact_skip(bm, session, trigger, &skip));
	if (skip)
		return (0);

	/*
	 * Walk the cache reviewing in-memory pages to see if they need to be
	 * re-written. This requires looking at page reconciliation results,
	 * which means the page cannot be reconciled at the same time as it's
	 * being reviewed for compaction. The underlying functions ensure we
	 * don't collide with page eviction, but we need to make sure we don't
	 * collide with checkpoints either, they are the other operation that
	 * can reconcile a page.
	 *
	 * Capture the error and release the lock before returning: the
	 * original code returned with the metadata lock still held if the
	 * cache operation failed.
	 */
	__wt_spin_lock(session, &S2C(session)->metadata_lock);
	ret = __wt_bt_cache_op(session, NULL, WT_SYNC_COMPACT);
	__wt_spin_unlock(session, &S2C(session)->metadata_lock);
	WT_RET(ret);

	/*
	 * Walk the tree, reviewing on-disk pages to see if they need to be
	 * re-written.
	 */
	for (page = NULL;;) {
		WT_RET(__wt_tree_walk(session, &page, WT_TREE_COMPACT));
		if (page == NULL)
			break;

		/*
		 * The only pages returned by the tree walk function are pages
		 * we want to re-write; mark the page and tree dirty.
		 */
		if ((ret = __wt_page_modify_init(session, page)) != 0) {
			/* Release the page before returning the error. */
			WT_TRET(__wt_page_release(session, page));
			WT_RET(ret);
		}
		__wt_page_and_tree_modify_set(session, page);

		WT_DSTAT_INCR(session, btree_compact_rewrite);
	}

	return (0);
}
/*
 * __wt_update_serial_func --
 *	Server function to add an WT_UPDATE entry in the page array.
 *
 *	Runs serialized (presumably via the serialization scheduler that
 *	packed "args" -- confirm against the caller); it links a new update
 *	into a page's update chain and publishes it for concurrent readers.
 */
int
__wt_update_serial_func(WT_SESSION_IMPL *session, void *args)
{
	WT_PAGE *page;
	WT_UPDATE **new_upd, *upd, **upd_entry, **upd_obsolete;
	uint32_t write_gen;

	/* Unpack the arguments marshalled by the requesting thread. */
	__wt_update_unpack(
	    args, &page, &write_gen, &upd_entry, &new_upd, &upd, &upd_obsolete);

	/* Check the page's write-generation. */
	WT_RET(__wt_page_write_gen_check(session, page, write_gen));

	/* Link the new update at the head of its chain. */
	upd->next = *upd_entry;
	/*
	 * Publish: there must be a barrier to ensure the new entry's next
	 * pointer is set before we update the linked list.
	 */
	WT_PUBLISH(*upd_entry, upd);

	/* Record that the WT_UPDATE's memory is now owned by the page. */
	__wt_update_upd_taken(session, args, page);

	/*
	 * If the page needs an update array (column-store pages and inserts on
	 * row-store pages do not use the update array), our caller passed us
	 * one of the correct size. Check the page still needs one (the write
	 * generation test should have caught that, though).
	 *
	 * NOTE: it is important to do this after publishing that the update is
	 * set. Code can assume that if the array is set, it is non-empty.
	 */
	if (new_upd != NULL && page->u.row.upd == NULL) {
		page->u.row.upd = new_upd;
		__wt_update_new_upd_taken(session, args, page);
	}

	/* Discard obsolete WT_UPDATE structures. */
	*upd_obsolete = __wt_update_obsolete_check(session, upd->next);

	/* Mark the page and its tree dirty so the change is written. */
	__wt_page_and_tree_modify_set(session, page);

	return (0);
}
/*
 * __wt_insert_serial_func --
 *	Server function to add an WT_INSERT entry to the page.
 *
 *	Splices a new WT_INSERT into a skiplist at the position recorded in
 *	ins_stack, then publishes the insert head and insert array in an
 *	order that keeps the structures consistent for concurrent readers.
 */
int
__wt_insert_serial_func(WT_SESSION_IMPL *session, void *args)
{
	WT_INSERT *new_ins, ***ins_stack;
	WT_INSERT_HEAD *inshead, **insheadp, **new_inslist, *new_inshead;
	WT_PAGE *page;
	uint32_t write_gen;
	u_int i, skipdepth;

	/* Unpack the arguments marshalled by the requesting thread. */
	__wt_insert_unpack(args, &page, &write_gen, &insheadp,
	    &ins_stack, &new_inslist, &new_inshead, &new_ins, &skipdepth);

	/* Check the page's write-generation. */
	WT_RET(__wt_page_write_gen_check(session, page, write_gen));

	/*
	 * Publish: First, point the new WT_INSERT item's skiplist references
	 * to the next elements in the insert list, then flush memory. Second,
	 * update the skiplist elements that reference the new WT_INSERT item,
	 * this ensures the list is never inconsistent.
	 */
	/* If the page has no insert head yet, use the one the caller passed. */
	if ((inshead = *insheadp) == NULL)
		inshead = new_inshead;

	/* Step one: fill in the new item's forward pointers. */
	for (i = 0; i < skipdepth; i++)
		new_ins->next[i] = *ins_stack[i];
	/* Barrier: the next pointers must be visible before the splice. */
	WT_WRITE_BARRIER();
	/* Step two: splice the item in, maintaining the per-level tails. */
	for (i = 0; i < skipdepth; i++) {
		if (inshead->tail[i] == NULL ||
		    ins_stack[i] == &inshead->tail[i]->next[i])
			inshead->tail[i] = new_ins;
		*ins_stack[i] = new_ins;
	}

	/* Record that the WT_INSERT's memory is now owned by the page. */
	__wt_insert_new_ins_taken(session, args, page);

	/*
	 * If the insert head does not yet have an insert list, our caller
	 * passed us one.
	 *
	 * NOTE: it is important to do this after the item has been added to
	 * the list. Code can assume that if the list is set, it is non-empty.
	 */
	if (*insheadp == NULL) {
		WT_PUBLISH(*insheadp, new_inshead);
		__wt_insert_new_inshead_taken(session, args, page);
	}

	/*
	 * If the page does not yet have an insert array, our caller passed
	 * us one.
	 *
	 * NOTE: it is important to do this after publishing the list entry.
	 * Code can assume that if the array is set, it is non-empty.
	 */
	if (page->type == WT_PAGE_ROW_LEAF) {
		/* Row-store leaf pages keep inserts in the row structure. */
		if (page->u.row.ins == NULL) {
			page->u.row.ins = new_inslist;
			__wt_insert_new_inslist_taken(session, args, page);
		}
	} else if (page->modify->update == NULL) {
		/* Other page types keep them in the modify structure. */
		page->modify->update = new_inslist;
		__wt_insert_new_inslist_taken(session, args, page);
	}

	/* Mark the page and its tree dirty so the change is written. */
	__wt_page_and_tree_modify_set(session, page);

	return (0);
}
/*
 * __wt_col_append_serial_func --
 *	Server function to append an WT_INSERT entry to the tree.
 *
 *	Column-store append: allocates a record number if the caller did not
 *	supply one, detects races with concurrent appends of the same record,
 *	then splices the new WT_INSERT into the append skiplist using the
 *	same publish ordering as __wt_insert_serial_func.
 */
int
__wt_col_append_serial_func(WT_SESSION_IMPL *session, void *args)
{
	WT_BTREE *btree;
	WT_INSERT *ins, *new_ins, ***ins_stack, **next_stack;
	WT_INSERT_HEAD *inshead, **insheadp, **new_inslist, *new_inshead;
	WT_PAGE *page;
	uint64_t recno;
	uint32_t write_gen;
	u_int i, skipdepth;

	btree = S2BT(session);

	/* Unpack the arguments marshalled by the requesting thread. */
	__wt_col_append_unpack(args, &page, &write_gen, &insheadp,
	    &ins_stack, &next_stack, &new_inslist, &new_inshead,
	    &new_ins, &skipdepth);

	/* Check the page's write-generation. */
	WT_RET(__wt_page_write_gen_check(session, page, write_gen));

	/* If the page has no insert head yet, use the one the caller passed. */
	if ((inshead = *insheadp) == NULL)
		inshead = new_inshead;

	/*
	 * If the application specified a record number, there's a race: the
	 * application may have searched for the record, not found it, then
	 * called into the append code, and another thread might have added
	 * the record. Fortunately, we're in the right place because if the
	 * record didn't exist at some point, it can only have been created
	 * on this list. Search for the record, if specified.
	 */
	/* A record number of 0 means "allocate the next record number". */
	if ((recno = WT_INSERT_RECNO(new_ins)) == 0)
		recno = WT_INSERT_RECNO(new_ins) = ++btree->last_recno;

	ins = __col_insert_search(inshead, ins_stack, next_stack, recno);

	/* If we find the record number, there's been a race. */
	if (ins != NULL && WT_INSERT_RECNO(ins) == recno)
		WT_RET(WT_RESTART);

	/*
	 * Publish: First, point the new WT_INSERT item's skiplist references
	 * to the next elements in the insert list, then flush memory. Second,
	 * update the skiplist elements that reference the new WT_INSERT item,
	 * this ensures the list is never inconsistent.
	 */
	/* Step one: fill in the new item's forward pointers. */
	for (i = 0; i < skipdepth; i++)
		new_ins->next[i] = *ins_stack[i];
	/* Barrier: the next pointers must be visible before the splice. */
	WT_WRITE_BARRIER();
	/* Step two: splice the item in, maintaining the per-level tails. */
	for (i = 0; i < skipdepth; i++) {
		if (inshead->tail[i] == NULL ||
		    ins_stack[i] == &inshead->tail[i]->next[i])
			inshead->tail[i] = new_ins;
		*ins_stack[i] = new_ins;
	}

	/* Record that the WT_INSERT's memory is now owned by the page. */
	__wt_col_append_new_ins_taken(args);

	/*
	 * If the insert head does not yet have an insert list, our caller
	 * passed us one.
	 *
	 * NOTE: it is important to do this after the item has been added to
	 * the list. Code can assume that if the list is set, it is non-empty.
	 */
	if (*insheadp == NULL) {
		WT_PUBLISH(*insheadp, new_inshead);
		__wt_col_append_new_inshead_taken(args);
	}

	/*
	 * If the page does not yet have an insert array, our caller passed
	 * us one.
	 *
	 * NOTE: it is important to do this after publishing the list entry.
	 * Code can assume that if the array is set, it is non-empty.
	 */
	if (page->modify->append == NULL) {
		page->modify->append = new_inslist;
		__wt_col_append_new_inslist_taken(args);
	}

	/*
	 * If we don't find the record, check to see if we extended the file,
	 * and update the last record number.
	 */
	if (recno > btree->last_recno)
		btree->last_recno = recno;

	/* Mark the page and its tree dirty so the change is written. */
	__wt_page_and_tree_modify_set(session, page);

	return (0);
}
/*
 * __wt_compact_evict --
 *	Helper routine to decide if a file's size would benefit from re-writing
 *	this page.
 *
 *	Asks the block manager whether the page's current disk addresses live
 *	in a region compaction wants to free; if so, marks the page and tree
 *	dirty so the page is re-written elsewhere.
 */
int
__wt_compact_evict(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_BM *bm;
	WT_PAGE_MODIFY *mod;
	int skip_page;
	uint32_t addr_size;
	const uint8_t *addr;

	bm = S2BT(session)->bm;
	mod = page->modify;

	/*
	 * We have to review page reconciliation information as an in-memory
	 * page's original disk addresses might have been fine for compaction
	 * but its replacement addresses might be a problem. To review page
	 * reconciliation information, we have to lock out both eviction and
	 * checkpoints, as those are the other two operations that can write
	 * a page.
	 *
	 * Ignore the root: it may not have a replacement address, and besides,
	 * if anything else gets written, so will it. Also skip pages that are
	 * already dirty, they will be written in any case.
	 */
	if (WT_PAGE_IS_ROOT(page))
		return (0);
	if (__wt_page_is_modified(page))
		return (0);

	/*
	 * A clean page with no reconciliation result (or one whose last
	 * reconciliation left it unchanged) is judged by its original disk
	 * address; a 1-to-1 replacement is judged by the replacement address;
	 * an empty or split page is ignored, it will be handled through its
	 * parent.
	 */
	if (mod == NULL || F_ISSET(mod, WT_PM_REC_MASK) == 0) {
		__wt_get_addr(page->parent, page->ref, &addr, &addr_size);
		if (addr == NULL)
			return (0);
		WT_RET(bm->compact_page_skip(
		    bm, session, addr, addr_size, &skip_page));
		if (skip_page)
			return (0);
	} else
		switch (F_ISSET(mod, WT_PM_REC_MASK)) {
		case WT_PM_REC_REPLACE:
			WT_RET(bm->compact_page_skip(bm, session,
			    mod->u.replace.addr,
			    mod->u.replace.size, &skip_page));
			if (skip_page)
				return (0);
			break;
		case WT_PM_REC_EMPTY:
		case WT_PM_REC_SPLIT:
		case WT_PM_REC_SPLIT_MERGE:
			return (0);
		}

	/* Mark the page and tree dirty, we want to write this page. */
	WT_RET(__wt_page_modify_init(session, page));
	__wt_page_and_tree_modify_set(session, page);

	WT_DSTAT_INCR(session, btree_compact_rewrite);
	return (0);
}