/*
 * __wt_page_modify_alloc --
 *     Allocate a page's modification structure.
 */
int
__wt_page_modify_alloc(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    WT_CONNECTION_IMPL *conn;
    WT_PAGE_MODIFY *mod;

    conn = S2C(session);

    WT_RET(__wt_calloc_one(session, &mod));

    /*
     * Assign one of the connection's spinlocks to the page; the counter
     * increment can race, but the atomic operation below acts as a barrier
     * and keeps the damage bounded.
     */
    mod->page_lock = ++conn->page_lock_cnt % WT_PAGE_LOCKS;

    /*
     * Multiple threads may be searching and deciding to modify the page at
     * the same time. Try to publish our structure; if another thread beat
     * us to it, discard ours, otherwise charge the allocation to the page's
     * memory footprint.
     */
    if (!__wt_atomic_cas_ptr(&page->modify, NULL, mod))
        __wt_free(session, mod);
    else
        __wt_cache_page_inmem_incr(session, page, sizeof(*mod));
    return (0);
}
/*
 * __wt_ovfl_txnc_add --
 *     Add a new entry to the page's list of transaction-cached overflow
 *     records.
 */
int
__wt_ovfl_txnc_add(WT_SESSION_IMPL *session, WT_PAGE *page,
    const uint8_t *addr, size_t addr_size,
    const void *value, size_t value_size)
{
    WT_OVFL_TXNC **head, **stack[WT_SKIP_MAXDEPTH], *txnc;
    size_t size;
    u_int i, skipdepth;
    uint8_t *p;

    /* Lazily create the page's overflow-tracking structure. */
    if (page->modify->ovfl_track == NULL)
        WT_RET(__ovfl_track_init(session, page));

    head = page->modify->ovfl_track->ovfl_txnc;

    /* Choose a skiplist depth for this insert. */
    skipdepth = __wt_skip_choose_depth(session);

    /*
     * Allocate the WT_OVFL_TXNC structure, next pointers for the skip
     * list, room for the address and value, then copy everything into
     * place.
     *
     * To minimize the WT_OVFL_TXNC structure size, the address offset
     * and size are single bytes: that's safe because the address follows
     * the structure (which can't be more than about 100B), and address
     * cookies are limited to 255B.
     */
    size = sizeof(WT_OVFL_TXNC) +
        skipdepth * sizeof(WT_OVFL_TXNC *) + addr_size + value_size;
    WT_RET(__wt_calloc(session, 1, size, &txnc));
    /* The address bytes follow the structure and the next[] array. */
    p = (uint8_t *)txnc +
        sizeof(WT_OVFL_TXNC) + skipdepth * sizeof(WT_OVFL_TXNC *);
    txnc->addr_offset = (uint8_t)WT_PTRDIFF(p, txnc);
    txnc->addr_size = (uint8_t)addr_size;
    memcpy(p, addr, addr_size);
    p += addr_size;
    /* The value bytes immediately follow the address bytes. */
    txnc->value_offset = WT_PTRDIFF32(p, txnc);
    txnc->value_size = WT_STORE_SIZE(value_size);
    memcpy(p, value, value_size);
    txnc->current = __wt_txn_new_id(session);

    /* Charge the single allocation to the page's memory footprint. */
    __wt_cache_page_inmem_incr(
        session, page, WT_OVFL_SIZE(txnc, WT_OVFL_TXNC));

    /* Insert the new entry into the skiplist. */
    __ovfl_txnc_skip_search_stack(head, stack, addr, addr_size);
    for (i = 0; i < skipdepth; ++i) {
        txnc->next[i] = *stack[i];
        *stack[i] = txnc;
    }

    if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
        WT_RET(__ovfl_txnc_verbose(session, page, txnc, "add"));

    return (0);
}
/*
 * __wt_delete_page_instantiate --
 *     Instantiate an entirely deleted row-store leaf page.
 */
int
__wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref)
{
    WT_BTREE *btree;
    WT_DECL_RET;
    WT_PAGE *page;
    WT_PAGE_DELETED *page_del;
    WT_UPDATE **upd_array, *upd;
    size_t size;
    uint32_t i;

    btree = S2BT(session);
    page = ref->page;
    page_del = ref->page_del;

    /*
     * Give the page a modify structure.
     *
     * If the tree is already dirty and so will be written, mark the page
     * dirty. (We'd like to free the deleted pages, but if the handle is
     * read-only or if the application never modifies the tree, we're not
     * able to do so.)
     */
    if (btree->modified) {
        WT_RET(__wt_page_modify_init(session, page));
        __wt_page_modify_set(session, page);
    }

    /*
     * An operation is accessing a "deleted" page, and we're building an
     * in-memory version of the page (making it look like all entries in
     * the page were individually updated by a remove operation). There
     * are two cases where we end up here:
     *
     * First, a running transaction used a truncate call to delete the page
     * without reading it, in which case the page reference includes a
     * structure with a transaction ID; the page we're building might split
     * in the future, so we update that structure to include references to
     * all of the update structures we create, so the transaction can abort.
     *
     * Second, a truncate call deleted a page and the truncate committed,
     * but an older transaction in the system forced us to keep the old
     * version of the page around, then we crashed and recovered, and now
     * we're being forced to read that page.
     *
     * In the first case, we have a page reference structure, in the second
     * case, we don't.
     *
     * Allocate the per-reference update array; in the case of instantiating
     * a page, deleted by a running transaction that might eventually abort,
     * we need a list of the update structures so we can do that abort. The
     * hard case is if a page splits: the update structures might be moved
     * to different pages, and we still have to find them all for an abort.
     */
    if (page_del != NULL)
        WT_RET(__wt_calloc_def(
            session, page->pg_row_entries + 1, &page_del->update_list));

    /* Allocate the per-page update array. */
    WT_ERR(__wt_calloc_def(session, page->pg_row_entries, &upd_array));
    page->pg_row_upd = upd_array;

    /*
     * Fill in the per-reference update array with references to update
     * structures, fill in the per-page update array with references to
     * deleted items.
     */
    for (i = 0, size = 0; i < page->pg_row_entries; ++i) {
        WT_ERR(__wt_calloc_one(session, &upd));
        WT_UPDATE_DELETED_SET(upd);

        if (page_del == NULL)
            upd->txnid = WT_TXN_NONE;    /* Globally visible */
        else {
            upd->txnid = page_del->txnid;
            page_del->update_list[i] = upd;
        }

        upd->next = upd_array[i];
        upd_array[i] = upd;

        size += sizeof(WT_UPDATE *) + WT_UPDATE_MEMSIZE(upd);
    }

    __wt_cache_page_inmem_incr(session, page, size);

    return (0);

err:    /*
     * There's no need to free the page update structures on error, our
     * caller will discard the page and do that work for us. We could
     * similarly leave the per-reference update array alone because it
     * won't ever be used by any page that's not in-memory, but cleaning
     * it up makes sense, especially if we come back in to this function
     * attempting to instantiate this page again.
     */
    if (page_del != NULL)
        __wt_free(session, page_del->update_list);

    return (ret);
}
/*
 * __las_page_instantiate --
 *     Instantiate lookaside update records in a recently read page.
 *
 * Walks the lookaside table records whose key prefix matches this page's
 * block address, rebuilding the per-key update chains and inserting them
 * into the in-memory page. Returns 0 or a WiredTiger error code.
 */
static int
__las_page_instantiate(WT_SESSION_IMPL *session,
    WT_REF *ref, uint32_t read_id, const uint8_t *addr, size_t addr_size)
{
    WT_CURSOR *cursor;
    WT_CURSOR_BTREE cbt;
    WT_DECL_ITEM(current_key);
    WT_DECL_ITEM(las_addr);
    WT_DECL_ITEM(las_key);
    WT_DECL_ITEM(las_value);
    WT_DECL_RET;
    WT_PAGE *page;
    WT_UPDATE *first_upd, *last_upd, *upd;
    size_t incr, total_incr;
    uint64_t current_recno, las_counter, las_txnid, recno, upd_txnid;
    uint32_t las_id, upd_size, session_flags;
    int exact;
    const uint8_t *p;

    cursor = NULL;
    page = ref->page;
    first_upd = last_upd = upd = NULL;
    total_incr = 0;
    current_recno = recno = WT_RECNO_OOB;
    session_flags = 0;        /* [-Werror=maybe-uninitialized] */

    __wt_btcur_init(session, &cbt);
    __wt_btcur_open(&cbt);

    /*
     * BUGFIX: the "&current_key" argument in the allocation and free calls
     * had been corrupted by an HTML-entity mangling ("&curren" rendered as
     * a currency sign); restored to take the buffer's address.
     */
    WT_ERR(__wt_scr_alloc(session, 0, &current_key));
    WT_ERR(__wt_scr_alloc(session, 0, &las_addr));
    WT_ERR(__wt_scr_alloc(session, 0, &las_key));
    WT_ERR(__wt_scr_alloc(session, 0, &las_value));

    /* Open a lookaside table cursor. */
    WT_ERR(__wt_las_cursor(session, &cursor, &session_flags));

    /*
     * The lookaside records are in key and update order, that is, there
     * will be a set of in-order updates for a key, then another set of
     * in-order updates for a subsequent key. We process all of the updates
     * for a key and then insert those updates into the page, then all the
     * updates for the next key, and so on.
     *
     * Search for the block's unique prefix, stepping through any matching
     * records.
     */
    las_addr->data = addr;
    las_addr->size = addr_size;
    las_key->size = 0;
    cursor->set_key(
        cursor, read_id, las_addr, (uint64_t)0, (uint32_t)0, las_key);
    if ((ret = cursor->search_near(cursor, &exact)) == 0 && exact < 0)
        ret = cursor->next(cursor);
    for (; ret == 0; ret = cursor->next(cursor)) {
        WT_ERR(cursor->get_key(cursor,
            &las_id, las_addr, &las_counter, &las_txnid, las_key));

        /*
         * Confirm the search using the unique prefix; if not a match,
         * we're done searching for records for this page.
         */
        if (las_id != read_id ||
            las_addr->size != addr_size ||
            memcmp(las_addr->data, addr, addr_size) != 0)
            break;

        /*
         * If the on-page value has become globally visible, this record
         * is no longer needed.
         */
        if (__wt_txn_visible_all(session, las_txnid))
            continue;

        /* Allocate the WT_UPDATE structure. */
        WT_ERR(cursor->get_value(
            cursor, &upd_txnid, &upd_size, las_value));
        WT_ERR(__wt_update_alloc(session,
            (upd_size == WT_UPDATE_DELETED_VALUE) ? NULL : las_value,
            &upd, &incr));
        total_incr += incr;
        upd->txnid = upd_txnid;

        switch (page->type) {
        case WT_PAGE_COL_FIX:
        case WT_PAGE_COL_VAR:
            /* Column-store keys are packed record numbers. */
            p = las_key->data;
            WT_ERR(__wt_vunpack_uint(&p, 0, &recno));
            if (current_recno == recno)
                break;
            WT_ASSERT(session, current_recno < recno);

            /* A new key: flush the previous key's update list. */
            if (first_upd != NULL) {
                WT_ERR(__col_instantiate(session,
                    current_recno, ref, &cbt, first_upd));
                first_upd = NULL;
            }
            current_recno = recno;
            break;
        case WT_PAGE_ROW_LEAF:
            if (current_key->size == las_key->size &&
                memcmp(current_key->data,
                las_key->data, las_key->size) == 0)
                break;

            /* A new key: flush the previous key's update list. */
            if (first_upd != NULL) {
                WT_ERR(__row_instantiate(session,
                    current_key, ref, &cbt, first_upd));
                first_upd = NULL;
            }
            WT_ERR(__wt_buf_set(session,
                current_key, las_key->data, las_key->size));
            break;
        WT_ILLEGAL_VALUE_ERR(session);
        }

        /* Append the latest update to the list. */
        if (first_upd == NULL)
            first_upd = last_upd = upd;
        else {
            last_upd->next = upd;
            last_upd = upd;
        }
        upd = NULL;
    }
    WT_ERR_NOTFOUND_OK(ret);

    /* Insert the last set of updates, if any. */
    if (first_upd != NULL)
        switch (page->type) {
        case WT_PAGE_COL_FIX:
        case WT_PAGE_COL_VAR:
            WT_ERR(__col_instantiate(session,
                current_recno, ref, &cbt, first_upd));
            first_upd = NULL;
            break;
        case WT_PAGE_ROW_LEAF:
            WT_ERR(__row_instantiate(session,
                current_key, ref, &cbt, first_upd));
            first_upd = NULL;
            break;
        WT_ILLEGAL_VALUE_ERR(session);
        }

    /*
     * Discard the cursor. NOTE(review): this close is repeated on the err
     * path below; presumably __wt_las_cursor_close clears the cursor
     * pointer so the second call is a no-op -- confirm.
     */
    WT_ERR(__wt_las_cursor_close(session, &cursor, session_flags));

    if (total_incr != 0) {
        __wt_cache_page_inmem_incr(session, page, total_incr);

        /*
         * We've modified/dirtied the page, but that's not necessary and
         * if we keep the page clean, it's easier to evict. We leave the
         * lookaside table updates in place, so if we evict this page
         * without dirtying it, any future instantiation of it will find
         * the records it needs. If the page is dirtied before eviction,
         * then we'll write any needed lookaside table records for the
         * new location of the page.
         */
        __wt_page_modify_clear(session, page);
    }

err:    WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
    WT_TRET(__wt_btcur_close(&cbt, 1));

    /*
     * On error, upd points to a single unlinked WT_UPDATE structure,
     * first_upd points to a list.
     */
    if (upd != NULL)
        __wt_free(session, upd);
    if (first_upd != NULL)
        __wt_free_update_list(session, first_upd);

    __wt_scr_free(session, &current_key);
    __wt_scr_free(session, &las_addr);
    __wt_scr_free(session, &las_key);
    __wt_scr_free(session, &las_value);

    return (ret);
}
/*
 * __wt_page_inmem --
 *     Build in-memory page information.
 */
int
__wt_page_inmem(WT_SESSION_IMPL *session,
    WT_REF *ref, const void *image, uint32_t flags, WT_PAGE **pagep)
{
    WT_DECL_RET;
    WT_PAGE *page;
    const WT_PAGE_HEADER *dsk;
    uint32_t alloc_entries;
    size_t size;

    *pagep = NULL;

    dsk = image;
    alloc_entries = 0;

    /*
     * Figure out how many underlying objects the page references so we can
     * allocate them along with the page.
     */
    switch (dsk->type) {
    case WT_PAGE_COL_FIX:
    case WT_PAGE_COL_INT:
    case WT_PAGE_COL_VAR:
        /*
         * Column-store leaf page entries map one-to-one to the number
         * of physical entries on the page (each physical entry is a
         * value item).
         *
         * Column-store internal page entries map one-to-one to the
         * number of physical entries on the page (each entry is a
         * location cookie).
         */
        alloc_entries = dsk->u.entries;
        break;
    case WT_PAGE_ROW_INT:
        /*
         * Row-store internal page entries map one-to-two to the number
         * of physical entries on the page (each entry is a key and
         * location cookie pair).
         */
        alloc_entries = dsk->u.entries / 2;
        break;
    case WT_PAGE_ROW_LEAF:
        /*
         * If the "no empty values" flag is set, row-store leaf page
         * entries map one-to-one to the number of physical entries
         * on the page (each physical entry is a key or value item).
         * If that flag is not set, there are more keys than values,
         * we have to walk the page to figure it out.
         */
        if (F_ISSET(dsk, WT_PAGE_EMPTY_V_ALL))
            alloc_entries = dsk->u.entries;
        else if (F_ISSET(dsk, WT_PAGE_EMPTY_V_NONE))
            alloc_entries = dsk->u.entries / 2;
        else
            WT_RET(__inmem_row_leaf_entries(
                session, dsk, &alloc_entries));
        break;
    WT_ILLEGAL_VALUE(session);
    }

    /* Allocate and initialize a new WT_PAGE. */
    WT_RET(__wt_page_alloc(
        session, dsk->type, dsk->recno, alloc_entries, 1, &page));
    page->dsk = dsk;
    F_SET_ATOMIC(page, flags);

    /*
     * Track the memory allocated to build this page so we can update the
     * cache statistics in a single call. If we allocated the disk image,
     * its size counts against the cache; otherwise it's accounted for
     * elsewhere.
     */
    size = LF_ISSET(WT_PAGE_DISK_ALLOC) ? dsk->mem_size : 0;

    /* Build the per-type in-memory representation. */
    switch (page->type) {
    case WT_PAGE_COL_FIX:
        __inmem_col_fix(session, page);
        break;
    case WT_PAGE_COL_INT:
        __inmem_col_int(session, page);
        break;
    case WT_PAGE_COL_VAR:
        WT_ERR(__inmem_col_var(session, page, &size));
        break;
    case WT_PAGE_ROW_INT:
        WT_ERR(__inmem_row_int(session, page, &size));
        break;
    case WT_PAGE_ROW_LEAF:
        WT_ERR(__inmem_row_leaf(session, page));
        break;
    WT_ILLEGAL_VALUE_ERR(session);
    }

    /* Update the page's in-memory size and the cache statistics. */
    __wt_cache_page_inmem_incr(session, page, size);

    /* Link the new internal page to the parent. */
    if (ref != NULL) {
        switch (page->type) {
        case WT_PAGE_COL_INT:
        case WT_PAGE_ROW_INT:
            page->pg_intl_parent_ref = ref;
            break;
        }
        ref->page = page;
    }

    *pagep = page;
    return (0);

err:    __wt_page_out(session, &page);
    return (ret);
}
/*
 * __wt_page_alloc --
 *     Create or read a page into the cache.
 */
int
__wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type,
    uint64_t recno, uint32_t alloc_entries, int alloc_refs, WT_PAGE **pagep)
{
    WT_CACHE *cache;
    WT_DECL_RET;
    WT_PAGE *page;
    WT_PAGE_INDEX *pindex;
    size_t size;
    uint32_t i;
    void *p;

    *pagep = NULL;

    cache = S2C(session)->cache;
    page = NULL;

    /*
     * Size the allocation: the WT_PAGE itself plus, for leaf pages, the
     * per-entry array describing the page's contents.
     */
    size = sizeof(WT_PAGE);
    switch (type) {
    case WT_PAGE_COL_FIX:
    case WT_PAGE_COL_INT:
    case WT_PAGE_ROW_INT:
        break;
    case WT_PAGE_COL_VAR:
        /*
         * Variable-length column-store leaf page: allocate memory to
         * describe the page's contents with the initial allocation.
         */
        size += alloc_entries * sizeof(WT_COL);
        break;
    case WT_PAGE_ROW_LEAF:
        /*
         * Row-store leaf page: allocate memory to describe the page's
         * contents with the initial allocation.
         */
        size += alloc_entries * sizeof(WT_ROW);
        break;
    WT_ILLEGAL_VALUE(session);
    }

    WT_RET(__wt_calloc(session, 1, size, &page));

    page->type = type;
    page->read_gen = WT_READGEN_NOTSET;

    switch (type) {
    case WT_PAGE_COL_FIX:
        page->pg_fix_recno = recno;
        page->pg_fix_entries = alloc_entries;
        break;
    case WT_PAGE_COL_INT:
    case WT_PAGE_ROW_INT:
        page->pg_intl_recno = recno;

        /*
         * Internal pages have an array of references to objects so they
         * can split. Allocate the array of references and optionally,
         * the objects to which they point.
         */
        WT_ERR(__wt_calloc(session, 1,
            sizeof(WT_PAGE_INDEX) + alloc_entries * sizeof(WT_REF *),
            &p));
        size +=
            sizeof(WT_PAGE_INDEX) + alloc_entries * sizeof(WT_REF *);
        pindex = p;
        /* The WT_REF pointer array immediately follows the index. */
        pindex->index = (WT_REF **)((WT_PAGE_INDEX *)p + 1);
        pindex->entries = alloc_entries;
        WT_INTL_INDEX_SET(page, pindex);
        if (alloc_refs)
            for (i = 0; i < pindex->entries; ++i) {
                WT_ERR(__wt_calloc_def(
                    session, 1, &pindex->index[i]));
                size += sizeof(WT_REF);
            }
        /*
         * Error-cleanup label embedded in the dead "if (0)" branch:
         * only reachable via WT_ERR jumps above. Free any allocated
         * references, the index array and the page itself.
         */
        if (0) {
err:            if ((pindex = WT_INTL_INDEX_COPY(page)) != NULL) {
                for (i = 0; i < pindex->entries; ++i)
                    __wt_free(session, pindex->index[i]);
                __wt_free(session, pindex);
            }
            __wt_free(session, page);
            return (ret);
        }
        break;
    case WT_PAGE_COL_VAR:
        page->pg_var_recno = recno;
        /* The WT_COL array was allocated in-line after the WT_PAGE. */
        page->pg_var_d = (WT_COL *)((uint8_t *)page + sizeof(WT_PAGE));
        page->pg_var_entries = alloc_entries;
        break;
    case WT_PAGE_ROW_LEAF:
        /* The WT_ROW array was allocated in-line after the WT_PAGE. */
        page->pg_row_d = (WT_ROW *)((uint8_t *)page + sizeof(WT_PAGE));
        page->pg_row_entries = alloc_entries;
        break;
    WT_ILLEGAL_VALUE(session);
    }

    /* Increment the cache statistics. */
    __wt_cache_page_inmem_incr(session, page, size);
    (void)WT_ATOMIC_ADD8(cache->pages_inmem, 1);

    *pagep = page;
    return (0);
}
/*
 * __wt_page_inmem --
 *     Build in-memory page information.
 */
int
__wt_page_inmem(
    WT_SESSION_IMPL *session, WT_PAGE *parent, WT_REF *parent_ref,
    WT_PAGE_HEADER *dsk, int disk_not_alloc, WT_PAGE **pagep)
{
    WT_DECL_RET;
    WT_PAGE *page;
    uint32_t alloc_entries;
    size_t size;

    alloc_entries = 0;
    *pagep = NULL;

    /*
     * Figure out how many underlying objects the page references so
     * we can allocate them along with the page.
     */
    switch (dsk->type) {
    case WT_PAGE_COL_FIX:
        break;
    case WT_PAGE_COL_INT:
        /*
         * Column-store internal page entries map one-to-one to the
         * number of physical entries on the page (each physical entry
         * is an offset object).
         */
        alloc_entries = dsk->u.entries;
        break;
    case WT_PAGE_COL_VAR:
        /*
         * Column-store leaf page entries map one-to-one to the number
         * of physical entries on the page (each physical entry is a
         * data item).
         */
        alloc_entries = dsk->u.entries;
        break;
    case WT_PAGE_ROW_INT:
        /*
         * Row-store internal page entries map one-to-two to the number
         * of physical entries on the page (each in-memory entry is a
         * key item and location cookie).
         */
        alloc_entries = dsk->u.entries / 2;
        break;
    case WT_PAGE_ROW_LEAF:
        /*
         * Row-store leaf page entries map in an indeterminate way to
         * the physical entries on the page, we have to walk the page
         * to figure it out.
         */
        WT_RET(__inmem_row_leaf_entries(session, dsk, &alloc_entries));
        break;
    WT_ILLEGAL_VALUE(session);
    }

    /* Allocate and initialize a new WT_PAGE. */
    WT_RET(__wt_page_alloc(session, dsk->type, alloc_entries, &page));
    page->dsk = dsk;
    page->read_gen = WT_READ_GEN_NOTSET;
    if (disk_not_alloc)
        F_SET_ATOMIC(page, WT_PAGE_DISK_NOT_ALLOC);

    /*
     * Track the memory allocated to build this page so we can update the
     * cache statistics in a single call. If the disk image wasn't allocated
     * by us, its bytes aren't charged to the cache here.
     */
    size = disk_not_alloc ? 0 : dsk->mem_size;

    /* Build the per-type in-memory representation. */
    switch (page->type) {
    case WT_PAGE_COL_FIX:
        page->entries = dsk->u.entries;
        page->u.col_fix.recno = dsk->recno;
        __inmem_col_fix(session, page);
        break;
    case WT_PAGE_COL_INT:
        page->entries = dsk->u.entries;
        page->u.intl.recno = dsk->recno;
        __inmem_col_int(session, page);
        break;
    case WT_PAGE_COL_VAR:
        page->entries = dsk->u.entries;
        page->u.col_var.recno = dsk->recno;
        WT_ERR(__inmem_col_var(session, page, &size));
        break;
    case WT_PAGE_ROW_INT:
        page->entries = dsk->u.entries / 2;
        WT_ERR(__inmem_row_int(session, page, &size));
        break;
    case WT_PAGE_ROW_LEAF:
        page->entries = alloc_entries;
        WT_ERR(__inmem_row_leaf(session, page));
        break;
    WT_ILLEGAL_VALUE_ERR(session);
    }

    /* Update the page's in-memory size and the cache statistics. */
    __wt_cache_page_inmem_incr(session, page, size);

    /* Link the new page into the parent. */
    if (parent_ref != NULL)
        WT_LINK_PAGE(parent, parent_ref, page);

    *pagep = page;
    return (0);

err:    __wt_page_out(session, &page);
    return (ret);
}
/*
 * __wt_page_alloc --
 *     Create or read a page into the cache.
 */
int
__wt_page_alloc(WT_SESSION_IMPL *session,
    uint8_t type, uint32_t alloc_entries, WT_PAGE **pagep)
{
    WT_CACHE *cache;
    WT_PAGE *page;
    size_t alloc_size;
    void *array;

    *pagep = NULL;
    cache = S2C(session)->cache;

    /*
     * Compute the allocation size: the WT_PAGE structure itself plus, for
     * most page types, a trailing array describing the disk image's
     * entries.
     */
    alloc_size = sizeof(WT_PAGE);
    switch (type) {
    case WT_PAGE_COL_FIX:
        break;
    case WT_PAGE_COL_INT:
    case WT_PAGE_ROW_INT:
        alloc_size += alloc_entries * sizeof(WT_REF);
        break;
    case WT_PAGE_COL_VAR:
        alloc_size += alloc_entries * sizeof(WT_COL);
        break;
    case WT_PAGE_ROW_LEAF:
        alloc_size += alloc_entries * sizeof(WT_ROW);
        break;
    WT_ILLEGAL_VALUE(session);
    }

    WT_RET(__wt_calloc(session, 1, alloc_size, &page));

    /* The per-entry array immediately follows the WT_PAGE structure. */
    array = (uint8_t *)page + sizeof(WT_PAGE);
    switch (type) {
    case WT_PAGE_COL_FIX:
        break;
    case WT_PAGE_COL_INT:
    case WT_PAGE_ROW_INT:
        page->u.intl.t = array;
        break;
    case WT_PAGE_COL_VAR:
        page->u.col_var.d = array;
        break;
    case WT_PAGE_ROW_LEAF:
        page->u.row.d = array;
        break;
    WT_ILLEGAL_VALUE(session);
    }

    /* Increment the cache statistics. */
    __wt_cache_page_inmem_incr(session, page, alloc_size);
    (void)WT_ATOMIC_ADD(cache->pages_inmem, 1);

    /* The one page field we set is the type. */
    page->type = type;

    *pagep = page;
    return (0);
}
/*
 * __ovfl_reuse_wrapup --
 *     Resolve the page's overflow reuse list after a page is written.
 */
static int
__ovfl_reuse_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    WT_BM *bm;
    WT_OVFL_REUSE **e, **head, *reuse;
    size_t incr, decr;
    int i;

    bm = S2BT(session)->bm;
    head = page->modify->ovfl_track->ovfl_reuse;

    /*
     * Discard any overflow records that aren't in-use, freeing underlying
     * blocks.
     *
     * First, walk the overflow reuse lists (except for the lowest one),
     * fixing up skiplist links.
     */
    for (i = WT_SKIP_MAXDEPTH - 1; i > 0; --i)
        for (e = &head[i]; *e != NULL;) {
            if (F_ISSET(*e, WT_OVFL_REUSE_INUSE)) {
                /* Keep in-use entries, step past them. */
                e = &(*e)->next[i];
                continue;
            }
            /* Unlink the entry from this level of the skiplist. */
            *e = (*e)->next[i];
        }

    /*
     * Second, discard any overflow record without an in-use flag, clear
     * the flags for the next run.
     *
     * As part of the pass through the lowest level, figure out how much
     * space we added/subtracted from the page, and update its footprint.
     * We don't get it exactly correct because we don't know the depth of
     * the skiplist here, but it's close enough, and figuring out the
     * memory footprint change in the reconciliation wrapup code means
     * fewer atomic updates and less code overall.
     */
    incr = decr = 0;
    for (e = &head[0]; (reuse = *e) != NULL;) {
        if (F_ISSET(reuse, WT_OVFL_REUSE_INUSE)) {
            /* Newly-added entries are charged to the page now. */
            if (F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED))
                incr += WT_OVFL_SIZE(WT_OVFL_REUSE) +
                    reuse->addr_size + reuse->value_size;

            F_CLR(reuse,
                WT_OVFL_REUSE_INUSE | WT_OVFL_REUSE_JUST_ADDED);
            e = &(*e)->next[0];
            continue;
        }
        /* Unlink, account for, and free the unused entry. */
        *e = (*e)->next[0];
        WT_ASSERT(session, !F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED));
        decr += WT_OVFL_SIZE(WT_OVFL_REUSE) +
            reuse->addr_size + reuse->value_size;

        if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
            WT_RET(
                __ovfl_reuse_verbose(session, page, reuse, "free"));

        /* Free the underlying block before freeing the entry. */
        WT_RET(bm->free(
            bm, session, WT_OVFL_REUSE_ADDR(reuse), reuse->addr_size));
        __wt_free(session, reuse);
    }

    /* Apply the net footprint change to the page. */
    if (incr > decr)
        __wt_cache_page_inmem_incr(session, page, incr - decr);
    if (decr > incr)
        __wt_cache_page_inmem_decr(session, page, decr - incr);
    return (0);
}
/*
 * __wt_rec_track --
 *     Add an object to the page's list of tracked objects.
 */
int
__wt_rec_track(WT_SESSION_IMPL *session, WT_PAGE *page,
    const uint8_t *addr, uint32_t addr_size,
    const void *data, uint32_t data_size, uint32_t flags)
{
    WT_PAGE_MODIFY *mod;
    WT_PAGE_TRACK *empty, *track;
    uint8_t *p;
    uint32_t i;

    mod = page->modify;

    /* Find an empty slot. */
    empty = NULL;
    for (track = mod->track, i = 0; i < mod->track_entries; ++track, ++i)
        if (!F_ISSET(track, WT_TRK_OBJECT)) {
            empty = track;
            break;
        }

    /* Reallocate space as necessary. */
    if (empty == NULL) {
        WT_RET(__rec_track_extend(session, page));
        /* The extension appends a slot at the end of the array. */
        empty = &mod->track[mod->track_entries - 1];
    }
    track = empty;

    /*
     * Minor optimization: allocate a single chunk of space instead of two
     * separate ones: be careful when it's freed.
     */
    WT_RET(__wt_calloc_def(session, addr_size + data_size, &p));

    /*
     * Set the just-added flag so we clean up should reconciliation fail,
     * except for cached overflow values, which don't get discarded, even
     * if reconciliation fails.
     */
    track->flags = (uint8_t)flags | WT_TRK_OBJECT;
    if (!F_ISSET(track, WT_TRK_OVFL_VALUE))
        F_SET(track, WT_TRK_JUST_ADDED);
    track->addr.addr = p;
    track->addr.size = addr_size;
    memcpy(track->addr.addr, addr, addr_size);
    if (data_size) {
        /* The data bytes follow the address in the same chunk. */
        p += addr_size;
        track->data = p;
        track->size = data_size;
        memcpy(track->data, data, data_size);
    }

    /*
     * Overflow items are potentially large and on-page items remain in the
     * tracking list until the page is evicted. If we're tracking a lot of
     * them, their memory might matter: increment the page and cache memory
     * totals. This is unlikely to matter, but it's inexpensive (unless
     * there are lots of them, in which case I guess the memory matters).
     *
     * If this reconciliation were to fail, we would reasonably perform the
     * inverse operation in __wt_rec_track_wrapup_err. I'm not bothering
     * with that because we'd have to crack the structure itself to figure
     * out how much to decrement and I don't think it's worth the effort.
     * The potential problem is repeatedly failing reconciliation of a page
     * with a large number of overflow items, which causes the page's
     * memory footprint to become incorrectly high, causing us to push the
     * page out of cache unnecessarily. Like I said, not worth the effort.
     */
    if (LF_ISSET(WT_TRK_ONPAGE))
        __wt_cache_page_inmem_incr(
            session, page, addr_size + data_size);

    if (WT_VERBOSE_ISSET(session, reconcile))
        WT_RET(__track_msg(session, page, "add", track));
    return (0);
}