/* * __wt_free_ref -- * Discard the contents of a WT_REF structure (optionally including the * pages it references). */ void __wt_free_ref( WT_SESSION_IMPL *session, WT_REF *ref, int page_type, bool free_pages) { WT_IKEY *ikey; if (ref == NULL) return; /* * Optionally free the referenced pages. (The path to free referenced * page is used for error cleanup, no instantiated and then discarded * page should have WT_REF entries with real pages. The page may have * been marked dirty as well; page discard checks for that, so we mark * it clean explicitly.) */ if (free_pages && ref->page != NULL) { __wt_page_modify_clear(session, ref->page); __wt_page_out(session, &ref->page); } /* * Optionally free row-store WT_REF key allocation. Historic versions of * this code looked in a passed-in page argument, but that is dangerous, * some of our error-path callers create WT_REF structures without ever * setting WT_REF.home or having a parent page to which the WT_REF will * be linked. Those WT_REF structures invariably have instantiated keys, * (they obviously cannot be on-page keys), and we must free the memory. */ switch (page_type) { case WT_PAGE_ROW_INT: case WT_PAGE_ROW_LEAF: if ((ikey = __wt_ref_key_instantiated(ref)) != NULL) __wt_free(session, ikey); break; } /* * Free any address allocation; if there's no linked WT_REF page, it * must be allocated. */ __wt_ref_addr_free(session, ref); /* Free any page-deleted information. */ if (ref->page_del != NULL) { __wt_free(session, ref->page_del->update_list); __wt_free(session, ref->page_del); } __wt_overwrite_and_free(session, ref); }
/* * __wt_free_ref -- * Discard the contents of a WT_REF structure (optionally including the * pages it references). */ void __wt_free_ref( WT_SESSION_IMPL *session, WT_PAGE *page, WT_REF *ref, bool free_pages) { WT_IKEY *ikey; if (ref == NULL) return; /* * Optionally free the referenced pages. (The path to free referenced * page is used for error cleanup, no instantiated and then discarded * page should have WT_REF entries with real pages. The page may have * been marked dirty as well; page discard checks for that, so we mark * it clean explicitly.) */ if (free_pages && ref->page != NULL) { __wt_page_modify_clear(session, ref->page); __wt_page_out(session, &ref->page); } /* Free any key allocation. */ switch (page->type) { case WT_PAGE_ROW_INT: case WT_PAGE_ROW_LEAF: if ((ikey = __wt_ref_key_instantiated(ref)) != NULL) __wt_free(session, ikey); break; } /* Free any address allocation. */ if (ref->addr != NULL && __wt_off_page(page, ref->addr)) { __wt_free(session, ((WT_ADDR *)ref->addr)->addr); __wt_free(session, ref->addr); } /* Free any page-deleted information. */ if (ref->page_del != NULL) { __wt_free(session, ref->page_del->update_list); __wt_free(session, ref->page_del); } __wt_overwrite_and_free(session, ref); }
/* * __las_page_instantiate -- * Instantiate lookaside update records in a recently read page. */ static int __las_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t read_id, const uint8_t *addr, size_t addr_size) { WT_CURSOR *cursor; WT_CURSOR_BTREE cbt; WT_DECL_ITEM(current_key); WT_DECL_ITEM(las_addr); WT_DECL_ITEM(las_key); WT_DECL_ITEM(las_value); WT_DECL_RET; WT_PAGE *page; WT_UPDATE *first_upd, *last_upd, *upd; size_t incr, total_incr; uint64_t current_recno, las_counter, las_txnid, recno, upd_txnid; uint32_t las_id, upd_size, session_flags; int exact; const uint8_t *p; cursor = NULL; page = ref->page; first_upd = last_upd = upd = NULL; total_incr = 0; current_recno = recno = WT_RECNO_OOB; session_flags = 0; /* [-Werror=maybe-uninitialized] */ __wt_btcur_init(session, &cbt); __wt_btcur_open(&cbt); WT_ERR(__wt_scr_alloc(session, 0, ¤t_key)); WT_ERR(__wt_scr_alloc(session, 0, &las_addr)); WT_ERR(__wt_scr_alloc(session, 0, &las_key)); WT_ERR(__wt_scr_alloc(session, 0, &las_value)); /* Open a lookaside table cursor. */ WT_ERR(__wt_las_cursor(session, &cursor, &session_flags)); /* * The lookaside records are in key and update order, that is, there * will be a set of in-order updates for a key, then another set of * in-order updates for a subsequent key. We process all of the updates * for a key and then insert those updates into the page, then all the * updates for the next key, and so on. * * Search for the block's unique prefix, stepping through any matching * records. */ las_addr->data = addr; las_addr->size = addr_size; las_key->size = 0; cursor->set_key( cursor, read_id, las_addr, (uint64_t)0, (uint32_t)0, las_key); if ((ret = cursor->search_near(cursor, &exact)) == 0 && exact < 0) ret = cursor->next(cursor); for (; ret == 0; ret = cursor->next(cursor)) { WT_ERR(cursor->get_key(cursor, &las_id, las_addr, &las_counter, &las_txnid, las_key)); /* * Confirm the search using the unique prefix; if not a match, * we're done searching for records for this page. */ if (las_id != read_id || las_addr->size != addr_size || memcmp(las_addr->data, addr, addr_size) != 0) break; /* * If the on-page value has become globally visible, this record * is no longer needed. */ if (__wt_txn_visible_all(session, las_txnid)) continue; /* Allocate the WT_UPDATE structure. */ WT_ERR(cursor->get_value( cursor, &upd_txnid, &upd_size, las_value)); WT_ERR(__wt_update_alloc(session, (upd_size == WT_UPDATE_DELETED_VALUE) ? NULL : las_value, &upd, &incr)); total_incr += incr; upd->txnid = upd_txnid; switch (page->type) { case WT_PAGE_COL_FIX: case WT_PAGE_COL_VAR: p = las_key->data; WT_ERR(__wt_vunpack_uint(&p, 0, &recno)); if (current_recno == recno) break; WT_ASSERT(session, current_recno < recno); if (first_upd != NULL) { WT_ERR(__col_instantiate(session, current_recno, ref, &cbt, first_upd)); first_upd = NULL; } current_recno = recno; break; case WT_PAGE_ROW_LEAF: if (current_key->size == las_key->size && memcmp(current_key->data, las_key->data, las_key->size) == 0) break; if (first_upd != NULL) { WT_ERR(__row_instantiate(session, current_key, ref, &cbt, first_upd)); first_upd = NULL; } WT_ERR(__wt_buf_set(session, current_key, las_key->data, las_key->size)); break; WT_ILLEGAL_VALUE_ERR(session); } /* Append the latest update to the list. */ if (first_upd == NULL) first_upd = last_upd = upd; else { last_upd->next = upd; last_upd = upd; } upd = NULL; } WT_ERR_NOTFOUND_OK(ret); /* Insert the last set of updates, if any. */ if (first_upd != NULL) switch (page->type) { case WT_PAGE_COL_FIX: case WT_PAGE_COL_VAR: WT_ERR(__col_instantiate(session, current_recno, ref, &cbt, first_upd)); first_upd = NULL; break; case WT_PAGE_ROW_LEAF: WT_ERR(__row_instantiate(session, current_key, ref, &cbt, first_upd)); first_upd = NULL; break; WT_ILLEGAL_VALUE_ERR(session); } /* Discard the cursor. */ WT_ERR(__wt_las_cursor_close(session, &cursor, session_flags)); if (total_incr != 0) { __wt_cache_page_inmem_incr(session, page, total_incr); /* * We've modified/dirtied the page, but that's not necessary and * if we keep the page clean, it's easier to evict. We leave the * lookaside table updates in place, so if we evict this page * without dirtying it, any future instantiation of it will find * the records it needs. If the page is dirtied before eviction, * then we'll write any needed lookaside table records for the * new location of the page. */ __wt_page_modify_clear(session, page); } err: WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags)); WT_TRET(__wt_btcur_close(&cbt, 1)); /* * On error, upd points to a single unlinked WT_UPDATE structure, * first_upd points to a list. */ if (upd != NULL) __wt_free(session, upd); if (first_upd != NULL) __wt_free_update_list(session, first_upd); __wt_scr_free(session, ¤t_key); __wt_scr_free(session, &las_addr); __wt_scr_free(session, &las_key); __wt_scr_free(session, &las_value); return (ret); }
/* * __wt_page_out -- * Discard an in-memory page, freeing all memory associated with it. */ void __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep) { WT_PAGE *page; WT_PAGE_HEADER *dsk; WT_PAGE_MODIFY *mod; /* * Kill our caller's reference, do our best to catch races. */ page = *pagep; *pagep = NULL; if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD)) __wt_page_modify_clear(session, page); /* * We should never discard: * - a dirty page, * - a page queued for eviction, or * - a locked page. */ WT_ASSERT(session, !__wt_page_is_modified(page)); WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU)); WT_ASSERT(session, !__wt_fair_islocked(session, &page->page_lock)); #ifdef HAVE_DIAGNOSTIC { WT_HAZARD *hp; int i; /* * Make sure no other thread has a hazard pointer on the page we are * about to discard. This is complicated by the fact that readers * publish their hazard pointer before re-checking the page state, so * our check can race with readers without indicating a real problem. * Wait for up to a second for hazard pointers to be cleared. */ for (hp = NULL, i = 0; i < 100; i++) { if ((hp = __wt_page_hazard_check(session, page)) == NULL) break; __wt_sleep(0, 10000); } if (hp != NULL) __wt_errx(session, "discarded page has hazard pointer: (%p: %s, line %d)", hp->page, hp->file, hp->line); WT_ASSERT(session, hp == NULL); } #endif /* * If a root page split, there may be one or more pages linked from the * page; walk the list, discarding pages. */ switch (page->type) { case WT_PAGE_COL_INT: case WT_PAGE_ROW_INT: mod = page->modify; if (mod != NULL && mod->mod_root_split != NULL) __wt_page_out(session, &mod->mod_root_split); break; } /* Update the cache's information. */ __wt_cache_page_evict(session, page); /* * If discarding the page as part of process exit, the application may * configure to leak the memory rather than do the work. */ if (F_ISSET(S2C(session), WT_CONN_LEAK_MEMORY)) return; /* Free the page modification information. */ if (page->modify != NULL) __free_page_modify(session, page); switch (page->type) { case WT_PAGE_COL_FIX: break; case WT_PAGE_COL_INT: case WT_PAGE_ROW_INT: __free_page_int(session, page); break; case WT_PAGE_COL_VAR: __free_page_col_var(session, page); break; case WT_PAGE_ROW_LEAF: __free_page_row_leaf(session, page); break; } /* Discard any disk image. */ dsk = (WT_PAGE_HEADER *)page->dsk; if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_ALLOC)) __wt_overwrite_and_free_len(session, dsk, dsk->mem_size); if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_MAPPED)) (void)__wt_mmap_discard(session, dsk, dsk->mem_size); __wt_overwrite_and_free(session, page); }
/* * __wt_evict_file -- * Discard pages for a specific file. */ int __wt_evict_file(WT_SESSION_IMPL *session, int syncop) { WT_DECL_RET; WT_PAGE *page; WT_REF *next_ref, *ref; bool evict_reset; /* * We need exclusive access to the file -- disable ordinary eviction * and drain any blocks already queued. */ WT_RET(__wt_evict_file_exclusive_on(session, &evict_reset)); /* Make sure the oldest transaction ID is up-to-date. */ __wt_txn_update_oldest(session, true); /* Walk the tree, discarding pages. */ next_ref = NULL; WT_ERR(__wt_tree_walk(session, &next_ref, NULL, WT_READ_CACHE | WT_READ_NO_EVICT)); while ((ref = next_ref) != NULL) { page = ref->page; /* * Eviction can fail when a page in the evicted page's subtree * switches state. For example, if we don't evict a page marked * empty, because we expect it to be merged into its parent, it * might no longer be empty after it's reconciled, in which case * eviction of its parent would fail. We can either walk the * tree multiple times (until it's finally empty), or reconcile * each page to get it to its final state before considering if * it's an eviction target or will be merged into its parent. * * Don't limit this test to any particular page type, that tends * to introduce bugs when the reconciliation of other page types * changes, and there's no advantage to doing so. * * Eviction can also fail because an update cannot be written. * If sessions have disjoint sets of files open, updates in a * no-longer-referenced file may not yet be globally visible, * and the write will fail with EBUSY. Our caller handles that * error, retrying later. */ if (syncop == WT_SYNC_CLOSE && __wt_page_is_modified(page)) WT_ERR(__wt_reconcile(session, ref, NULL, WT_EVICTING)); /* * We can't evict the page just returned to us (it marks our * place in the tree), so move the walk to one page ahead of * the page being evicted. Note, we reconciled the returned * page first: if reconciliation of that page were to change * the shape of the tree, and we did the next walk call before * the reconciliation, the next walk call could miss a page in * the tree. */ WT_ERR(__wt_tree_walk(session, &next_ref, NULL, WT_READ_CACHE | WT_READ_NO_EVICT)); switch (syncop) { case WT_SYNC_CLOSE: /* * Evict the page. */ WT_ERR(__wt_evict(session, ref, 1)); break; case WT_SYNC_DISCARD: /* * Dead handles may reference dirty pages; clean the * page, both to keep statistics correct, and to let * the page-discard function assert no dirty page is * ever discarded. */ if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD)) __wt_page_modify_clear(session, page); WT_ASSERT(session, F_ISSET(session->dhandle, WT_DHANDLE_DEAD) || __wt_page_can_evict(session, ref, false, NULL)); __wt_evict_page_clean_update(session, ref, 1); break; WT_ILLEGAL_VALUE_ERR(session); } } if (0) { err: /* On error, clear any left-over tree walk. */ if (next_ref != NULL) WT_TRET(__wt_page_release( session, next_ref, WT_READ_NO_EVICT)); } if (evict_reset) __wt_evict_file_exclusive_off(session); return (ret); }
/* * __wt_page_out -- * Discard an in-memory page, freeing all memory associated with it. */ void __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep) { WT_PAGE *page; WT_PAGE_HEADER *dsk; WT_PAGE_MODIFY *mod; /* * Kill our caller's reference, do our best to catch races. */ page = *pagep; *pagep = NULL; if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD)) __wt_page_modify_clear(session, page); /* * We should never discard: * - a dirty page, * - a page queued for eviction, or * - a locked page. */ WT_ASSERT(session, !__wt_page_is_modified(page)); WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU)); WT_ASSERT(session, !__wt_rwlock_islocked(session, &page->page_lock)); /* * If a root page split, there may be one or more pages linked from the * page; walk the list, discarding pages. */ switch (page->type) { case WT_PAGE_COL_INT: case WT_PAGE_ROW_INT: mod = page->modify; if (mod != NULL && mod->mod_root_split != NULL) __wt_page_out(session, &mod->mod_root_split); break; } /* Update the cache's information. */ __wt_cache_page_evict(session, page); dsk = (WT_PAGE_HEADER *)page->dsk; if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_ALLOC)) __wt_cache_page_image_decr(session, dsk->mem_size); /* Discard any mapped image. */ if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_MAPPED)) (void)S2BT(session)->bm->map_discard( S2BT(session)->bm, session, dsk, (size_t)dsk->mem_size); /* * If discarding the page as part of process exit, the application may * configure to leak the memory rather than do the work. */ if (F_ISSET(S2C(session), WT_CONN_LEAK_MEMORY)) return; /* Free the page modification information. */ if (page->modify != NULL) __free_page_modify(session, page); switch (page->type) { case WT_PAGE_COL_FIX: break; case WT_PAGE_COL_INT: case WT_PAGE_ROW_INT: __free_page_int(session, page); break; case WT_PAGE_COL_VAR: __free_page_col_var(session, page); break; case WT_PAGE_ROW_LEAF: __free_page_row_leaf(session, page); break; } /* Discard any allocated disk image. */ if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_ALLOC)) __wt_overwrite_and_free_len(session, dsk, dsk->mem_size); __wt_overwrite_and_free(session, page); }