/*
 * __wt_cache_read --
 *	Read a page from the file.
 *
 *	The reference is claimed by atomically switching its state from
 *	WT_REF_DISK to WT_REF_READING (normal reads) or from WT_REF_DELETED
 *	to WT_REF_LOCKED (deleted pages).  If neither switch succeeds the
 *	function returns 0 without doing any work.  On success the state is
 *	published as WT_REF_MEM; on failure the previous state is restored
 *	and the error is returned.
 */
int
__wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref)
{
	WT_DECL_RET;
	WT_ITEM tmp;
	WT_PAGE *page;
	WT_PAGE_STATE previous_state;
	size_t addr_size;
	const uint8_t *addr;

	page = NULL;

	/*
	 * Don't pass an allocated buffer to the underlying block read function,
	 * force allocation of new memory of the appropriate size.
	 */
	WT_CLEAR(tmp);

	/*
	 * Attempt to set the state to WT_REF_READING for normal reads, or
	 * WT_REF_LOCKED, for deleted pages.  If successful, we've won the
	 * race, read the page.
	 */
	if (WT_ATOMIC_CAS4(ref->state, WT_REF_DISK, WT_REF_READING))
		previous_state = WT_REF_DISK;
	else if (WT_ATOMIC_CAS4(ref->state, WT_REF_DELETED, WT_REF_LOCKED))
		previous_state = WT_REF_DELETED;
	else
		return (0);

	/*
	 * Get the address: if there is no address, the page was deleted, but a
	 * subsequent search or insert is forcing re-creation of the name space.
	 * Otherwise, there's an address, read the backing disk page and build
	 * an in-memory version of the page.
	 */
	WT_ERR(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
	if (addr == NULL) {
		WT_ASSERT(session, previous_state == WT_REF_DELETED);

		WT_ERR(__wt_btree_new_leaf_page(session, &page));
		ref->page = page;
	} else {
		/* Read the backing disk page. */
		WT_ERR(__wt_bt_read(session, &tmp, addr, addr_size));

		/* Build the in-memory version of the page. */
		WT_ERR(__wt_page_inmem(session, ref, tmp.data,
		    WT_DATA_IN_ITEM(&tmp) ?
		    WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page));

		/*
		 * The page now owns any allocated copy of the disk image (the
		 * success path never frees the buffer).  Drop the local
		 * reference so a later failure, which discards the page, does
		 * not free the same memory a second time through the buffer.
		 */
		tmp.mem = NULL;

		/* If the page was deleted, instantiate that information. */
		if (previous_state == WT_REF_DELETED)
			WT_ERR(__wt_delete_page_instantiate(session, ref));
	}

	WT_ERR(__wt_verbose(session, WT_VERB_READ,
	    "page %p: %s", page, __wt_page_type_string(page->type)));

	WT_PUBLISH(ref->state, WT_REF_MEM);
	return (0);

err:
	/*
	 * If the function building an in-memory version of the page failed,
	 * it discarded the page, but not the disk image: the buffer still
	 * references the image and it is freed below.  If the page was built,
	 * discarding the page releases the image and the buffer no longer
	 * references it, so freeing the buffer is harmless.
	 */
	if (ref->page != NULL)
		__wt_ref_out(session, ref);
	WT_PUBLISH(ref->state, previous_state);

	__wt_buf_free(session, &tmp);

	return (ret);
}
/*
 * __page_read --
 *	Bring a page into memory from the file.
 *
 *	Returns 0 immediately if the reference cannot be switched out of the
 *	WT_REF_DISK or WT_REF_DELETED state.  On success the reference is
 *	published as WT_REF_MEM; on failure its previous state is restored
 *	and the error is returned.
 */
static int
__page_read(WT_SESSION_IMPL *session, WT_REF *ref)
{
	const WT_PAGE_HEADER *dsk;
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_ITEM image;
	WT_PAGE *page;
	size_t addr_len;
	uint32_t disk_flags, prior_state;
	const uint8_t *addr;

	btree = S2BT(session);
	page = NULL;

	/*
	 * Start from an empty buffer: the underlying block read function must
	 * allocate new memory of the appropriate size rather than reuse an
	 * allocated buffer.
	 */
	WT_CLEAR(image);

	/*
	 * Race to claim the reference: WT_REF_DISK becomes WT_REF_READING for
	 * normal reads, WT_REF_DELETED becomes WT_REF_LOCKED for deleted
	 * pages.  If neither switch succeeds there's nothing to do here.
	 */
	if (__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_READING))
		prior_state = WT_REF_DISK;
	else {
		if (!__wt_atomic_casv32(
		    &ref->state, WT_REF_DELETED, WT_REF_LOCKED))
			return (0);
		prior_state = WT_REF_DELETED;
	}

	WT_ERR(__wt_ref_info(session, ref, &addr, &addr_len, NULL));
	if (addr == NULL) {
		/*
		 * No address: the page was deleted, but a subsequent search or
		 * insert is forcing re-creation of the name space.
		 */
		WT_ASSERT(session, prior_state == WT_REF_DELETED);

		WT_ERR(__wt_btree_new_leaf_page(session, &page));
		ref->page = page;
	} else {
		/*
		 * There's an address: read or map the backing disk page, then
		 * build the in-memory version of the page from it.
		 */
		WT_ERR(__wt_bt_read(session, &image, addr, addr_len));
		if (WT_DATA_IN_ITEM(&image))
			disk_flags = WT_PAGE_DISK_ALLOC;
		else
			disk_flags = WT_PAGE_DISK_MAPPED;
		WT_ERR(__wt_page_inmem(session, ref,
		    image.data, image.memsize, disk_flags, &page));

		/*
		 * The page steals the allocated copy of the disk image; clear
		 * the local reference so errors from here on don't free it.
		 */
		image.mem = NULL;

		/*
		 * When reading for a checkpoint the page on disk is correct as
		 * written, so the remaining work is skipped.
		 */
		if (session->dhandle->checkpoint == NULL) {
			/* A deleted page needs its delete information. */
			if (prior_state == WT_REF_DELETED)
				WT_ERR(__wt_delete_page_instantiate(
				    session, ref));

			/*
			 * Instantiate updates from the database's lookaside
			 * table.  The page flag was set when the page was
			 * written, potentially a long time ago; only do the
			 * work if the lookaside table is currently active.
			 */
			dsk = image.data;
			if (F_ISSET(dsk, WT_PAGE_LAS_UPDATE) &&
			    __wt_las_is_written(session)) {
				WT_STAT_FAST_CONN_INCR(
				    session, cache_read_lookaside);
				WT_STAT_FAST_DATA_INCR(
				    session, cache_read_lookaside);

				WT_ERR(__las_page_instantiate(
				    session, ref, btree->id, addr, addr_len));
			}
		}
	}

	WT_PUBLISH(ref->state, WT_REF_MEM);
	return (0);

err:
	/*
	 * If building the in-memory version of the page failed, that function
	 * discarded the page, but not the disk image.  Discard the page and
	 * separately discard the disk image in all cases.
	 */
	if (ref->page != NULL)
		__wt_ref_out(session, ref);
	WT_PUBLISH(ref->state, prior_state);

	__wt_buf_free(session, &image);

	return (ret);
}
/*
 * __wt_evict_file --
 *	Discard pages for a specific file.
 *
 *	With WT_SYNC_CLOSE each page is reconciled if modified and then
 *	evicted; with WT_SYNC_DISCARD each page is discarded regardless of
 *	whether it is dirty.  Any other operation is an illegal value.
 */
int
__wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
{
	WT_DECL_RET;
	WT_REF *cur, *walk;
	const uint32_t walk_flags = WT_READ_CACHE | WT_READ_NO_EVICT;

	/*
	 * Exclusive access to the file is required: turn off ordinary
	 * eviction and drain any blocks already queued.
	 */
	WT_RET(__wt_evict_file_exclusive_on(session));

	/* Bring the oldest transaction ID up-to-date. */
	__wt_txn_update_oldest(session, true);

	/* Walk the tree, discarding pages as we go. */
	walk = NULL;
	WT_ERR(__wt_tree_walk(session, &walk, walk_flags));
	for (cur = walk; cur != NULL; cur = walk) {
		/*
		 * Reconcile a modified page before deciding what to do with
		 * it.  Eviction can fail when a page in the evicted page's
		 * subtree switches state: for example, a page marked empty
		 * isn't evicted because it's expected to merge into its
		 * parent, but it might no longer be empty once reconciled, and
		 * then eviction of the parent fails.  The choices are walking
		 * the tree repeatedly until it's empty, or reconciling every
		 * page to its final state first; this does the latter.
		 *
		 * The test deliberately isn't limited to particular page
		 * types: doing so tends to introduce bugs when reconciliation
		 * of other page types changes, and there's no advantage.
		 *
		 * Eviction can also fail because an update cannot be written.
		 * If sessions have disjoint sets of files open, updates in a
		 * no-longer-referenced file may not yet be globally visible,
		 * and the write fails with EBUSY.  Our caller handles that
		 * error, retrying later.
		 */
		if (syncop == WT_SYNC_CLOSE) {
			if (__wt_page_is_modified(cur->page))
				WT_ERR(__wt_reconcile(
				    session, cur, NULL, WT_EVICTING));
		}

		/*
		 * The page just returned marks our place in the tree and so
		 * can't be evicted yet: advance the walk one page beyond it
		 * first.  Reconciliation was done before this call on purpose;
		 * if it changed the shape of the tree after the walk had
		 * already advanced, the walk could miss a page.
		 */
		WT_ERR(__wt_tree_walk(session, &walk, walk_flags));

		switch (syncop) {
		case WT_SYNC_DISCARD:
			/* Throw the page away whether or not it is dirty. */
			WT_ASSERT(session,
			    F_ISSET(session->dhandle, WT_DHANDLE_DEAD) ||
			    __wt_page_can_evict(session, cur, NULL));
			__wt_ref_out(session, cur);
			break;
		case WT_SYNC_CLOSE:
			/* Evict the page. */
			WT_ERR(__wt_evict(session, cur, true));
			break;
		WT_ILLEGAL_VALUE_ERR(session);
		}
	}

	if (0) {
err:		/* On error, release any page the walk is still holding. */
		if (walk != NULL)
			WT_TRET(__wt_page_release(
			    session, walk, WT_READ_NO_EVICT));
	}

	__wt_evict_file_exclusive_off(session);

	return (ret);
}