/*
 * __hazard_exclusive --
 *	Request exclusive access to a page.
 */
static int
__hazard_exclusive(WT_SESSION_IMPL *session, WT_REF *ref, int top)
{
	/*
	 * Make sure there is space to track exclusive access so we can unlock
	 * to clean up.
	 */
	WT_RET(__wt_realloc_def(session, &session->excl_allocated,
	    session->excl_next + 1, &session->excl));

	/*
	 * Hazard pointers are acquired down the tree, which means we can't
	 * deadlock.
	 *
	 * Request exclusive access to the page.  The top-level page should
	 * already be in the locked state, lock child pages in memory.
	 * If another thread already has this page, give up.
	 */
	if (!top && !WT_ATOMIC_CAS(ref->state, WT_REF_MEM, WT_REF_LOCKED))
		return (EBUSY);		/* We couldn't change the state. */
	WT_ASSERT(session, ref->state == WT_REF_LOCKED);

	session->excl[session->excl_next++] = ref;

	/* Check for a matching hazard pointer. */
	if (__wt_page_hazard_check(session, ref->page) == NULL)
		return (0);

	WT_STAT_FAST_DATA_INCR(session, cache_eviction_hazard);
	WT_STAT_FAST_CONN_INCR(session, cache_eviction_hazard);

	WT_VERBOSE_RET(
	    session, evict, "page %p hazard request failed", ref->page);
	return (EBUSY);
}
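/*
 * Illustrative sketch, not WiredTiger code: the WT_ATOMIC_CAS call above
 * locks a page by atomically moving its state from "in memory" to "locked",
 * and gives up if any other thread got there first.  A minimal standalone
 * version of the same pattern, using C11 atomics and hypothetical demo_
 * names, might look like this.
 */
#include <stdatomic.h>
#include <stdbool.h>

enum demo_ref_state { DEMO_REF_MEM, DEMO_REF_LOCKED };

struct demo_ref {
	_Atomic int state;			/* current page state */
};

/*
 * demo_ref_trylock --
 *	Try to take exclusive access by swapping MEM -> LOCKED; return false
 * if another thread changed the state first.
 */
static bool
demo_ref_trylock(struct demo_ref *ref)
{
	int expected = DEMO_REF_MEM;

	return (atomic_compare_exchange_strong(
	    &ref->state, &expected, DEMO_REF_LOCKED));
}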
/*
 * __wt_lsm_tree_switch --
 *	Switch to a new in-memory tree.
 */
int
__wt_lsm_tree_switch(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	uint32_t new_id;

	new_id = WT_ATOMIC_ADD(lsm_tree->last, 1);

	WT_VERBOSE_RET(session, lsm, "Tree switch to: %" PRIu32, new_id);

	if ((lsm_tree->nchunks + 1) * sizeof(*lsm_tree->chunk) >
	    lsm_tree->chunk_alloc)
		WT_ERR(__wt_realloc(session,
		    &lsm_tree->chunk_alloc,
		    WT_MAX(10 * sizeof(*lsm_tree->chunk),
		    2 * lsm_tree->chunk_alloc),
		    &lsm_tree->chunk));

	WT_ERR(__wt_calloc_def(session, 1, &chunk));
	chunk->id = new_id;
	lsm_tree->chunk[lsm_tree->nchunks++] = chunk;
	WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));

	++lsm_tree->dsk_gen;
	WT_ERR(__wt_lsm_meta_write(session, lsm_tree));

err:	/* TODO: mark lsm_tree bad on error(?) */
	return (ret);
}
/*
 * __wt_cond_wait --
 *	Wait on a mutex, optionally timing out.
 */
int
__wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, long usecs)
{
	struct timespec ts;
	WT_DECL_RET;
	int locked;

	locked = 0;

	/*
	 * !!!
	 * This function MUST handle a NULL session handle.
	 */
	if (session != NULL) {
		WT_VERBOSE_RET(
		    session, mutex, "wait %s cond (%p)", cond->name, cond);
		WT_CSTAT_INCR(session, cond_wait);
	}

	WT_ERR(pthread_mutex_lock(&cond->mtx));
	locked = 1;

	while (!cond->signalled) {
		if (usecs > 0) {
			WT_ERR(__wt_epoch(session, &ts));
			ts.tv_sec += (ts.tv_nsec + 1000 * usecs) / WT_BILLION;
			ts.tv_nsec = (ts.tv_nsec + 1000 * usecs) % WT_BILLION;
			ret = pthread_cond_timedwait(
			    &cond->cond, &cond->mtx, &ts);
			if (ret == ETIMEDOUT) {
				ret = 0;
				break;
			}
		} else
			ret = pthread_cond_wait(&cond->cond, &cond->mtx);

		/*
		 * Check pthread_cond_wait() return for EINTR, ETIME and
		 * ETIMEDOUT, some systems return these errors.
		 */
		if (ret == EINTR ||
#ifdef ETIME
		    ret == ETIME ||
#endif
		    ret == ETIMEDOUT)
			ret = 0;
		WT_ERR(ret);
	}

	cond->signalled = 0;

err:	if (locked)
		WT_TRET(pthread_mutex_unlock(&cond->mtx));
	if (ret == 0)
		return (0);
	WT_RET_MSG(session, ret, "pthread_cond_wait");
}
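/*
 * Illustrative sketch, not WiredTiger code: pthread_cond_timedwait() takes an
 * absolute deadline, so the function above converts a microsecond delay into
 * "now + usecs" before waiting.  A minimal standalone version of that
 * conversion and wait loop, with a caller-supplied flag standing in for
 * cond->signalled, might look like this.
 */
#include <errno.h>
#include <pthread.h>
#include <time.h>

#define	DEMO_BILLION	1000000000

/*
 * demo_cond_timedwait --
 *	Wait for *signalled to become nonzero, giving up after usecs
 * microseconds; returns 0 on wakeup or timeout, else an errno value.
 */
static int
demo_cond_timedwait(pthread_mutex_t *mtx,
    pthread_cond_t *cond, int *signalled, long usecs)
{
	struct timespec ts;
	int ret;

	ret = 0;
	pthread_mutex_lock(mtx);
	while (!*signalled) {
		/* Build the absolute deadline: current time plus usecs. */
		clock_gettime(CLOCK_REALTIME, &ts);
		ts.tv_sec += (ts.tv_nsec + 1000 * usecs) / DEMO_BILLION;
		ts.tv_nsec = (ts.tv_nsec + 1000 * usecs) % DEMO_BILLION;

		ret = pthread_cond_timedwait(cond, mtx, &ts);
		if (ret == ETIMEDOUT) {		/* Timeout isn't an error. */
			ret = 0;
			break;
		}
		if (ret != 0)
			break;
	}
	*signalled = 0;
	pthread_mutex_unlock(mtx);
	return (ret);
}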
/*
 * __wt_rwunlock --
 *	Release a read/write lock.
 */
int
__wt_rwunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
{
	WT_DECL_RET;

	WT_VERBOSE_RET(session, mutex,
	    "rwlock: unlock %s (%p)", rwlock->name, rwlock);

	if ((ret = pthread_rwlock_unlock(&rwlock->rwlock)) == 0)
		return (0);
	WT_RET_MSG(session, ret, "pthread_rwlock_unlock: %s", rwlock->name);
}
/*
 * __track_dump --
 *	Dump the list of tracked objects.
 */
static int
__track_dump(WT_SESSION_IMPL *session, WT_PAGE *page, const char *tag)
{
	WT_PAGE_MODIFY *mod;
	WT_PAGE_TRACK *track;
	uint32_t i;

	mod = page->modify;

	if (mod->track_entries == 0)
		return (0);

	WT_VERBOSE_RET(session, reconcile, "\n");
	WT_VERBOSE_RET(session,
	    reconcile, "page %p tracking list at %s:", page, tag);
	for (track = mod->track, i = 0; i < mod->track_entries; ++track, ++i)
		if (F_ISSET(track, WT_TRK_OBJECT))
			WT_RET(__track_msg(session, page, "dump", track));
	WT_VERBOSE_RET(session, reconcile, "\n");

	return (0);
}
/*
 * __wt_writelock --
 *	Wait to get an exclusive lock.
 */
int
__wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
{
	WT_DECL_RET;

	WT_VERBOSE_RET(session, mutex,
	    "rwlock: writelock %s (%p)", rwlock->name, rwlock);
	WT_STAT_FAST_CONN_INCR(session, rwlock_write);

	if ((ret = pthread_rwlock_wrlock(&rwlock->rwlock)) == 0)
		return (0);
	WT_RET_MSG(session, ret, "pthread_rwlock_wrlock: %s", rwlock->name);
}
/*
 * __wt_munmap --
 *	Remove a memory mapping.
 */
int
__wt_munmap(WT_SESSION_IMPL *session, WT_FH *fh, void *map, size_t len)
{
	WT_VERBOSE_RET(session, fileops,
	    "%s: unmap %" PRIuMAX " bytes", fh->name, (uintmax_t)len);

	if (munmap(map, len) == 0)
		return (0);

	WT_RET_MSG(session, __wt_errno(),
	    "%s unmap error: failed to unmap %" PRIuMAX " bytes",
	    fh->name, (uintmax_t)len);
}
/*
 * __wt_fsync --
 *	Flush a file handle.
 */
int
__wt_fsync(WT_SESSION_IMPL *session, WT_FH *fh)
{
	WT_DECL_RET;

	WT_VERBOSE_RET(session, fileops, "%s: fsync", fh->name);

	WT_SYSCALL_RETRY(fsync(fh->fd), ret);
	if (ret != 0)
		WT_RET_MSG(session, ret, "%s fsync error", fh->name);
	return (0);
}
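/*
 * Illustrative sketch, not WiredTiger code: WT_SYSCALL_RETRY retries a system
 * call a bounded number of times when it fails with a transient error such as
 * EINTR.  A minimal standalone equivalent for fsync(), with a hypothetical
 * retry limit of 10, might look like this.
 */
#include <errno.h>
#include <unistd.h>

/*
 * demo_fsync_retry --
 *	Flush a file descriptor, retrying a few times if the call is
 * interrupted or the device is busy; returns 0 on success, else the errno.
 */
static int
demo_fsync_retry(int fd)
{
	int i;

	for (i = 0; i < 10; ++i) {
		if (fsync(fd) == 0)
			return (0);
		if (errno != EINTR && errno != EBUSY)
			break;
	}
	return (errno);
}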
/*
 * __wt_try_writelock --
 *	Try to get an exclusive lock, or fail immediately if unavailable.
 */
int
__wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
{
	WT_DECL_RET;

	WT_VERBOSE_RET(session, mutex,
	    "rwlock: try_writelock %s (%p)", rwlock->name, rwlock);
	WT_CSTAT_INCR(session, rwlock_write);

	if ((ret = pthread_rwlock_trywrlock(&rwlock->rwlock)) == 0 ||
	    ret == EBUSY)
		return (ret);
	WT_RET_MSG(session, ret, "pthread_rwlock_trywrlock: %s", rwlock->name);
}
/*
 * __wt_block_compact_skip --
 *	Return if compaction will shrink the file.
 */
int
__wt_block_compact_skip(
    WT_SESSION_IMPL *session, WT_BLOCK *block, int trigger, int *skipp)
{
	WT_EXT *ext;
	WT_EXTLIST *el;
	WT_FH *fh;
	off_t avail, half;
	int pct;

	fh = block->fh;
	*skipp = 1;

	/*
	 * We do compaction by copying blocks from the end of the file to the
	 * beginning of the file, and we need some metrics to decide if it's
	 * worth doing.  Ignore small files, and files where we are unlikely
	 * to recover the specified percentage of the file.  (The calculation
	 * is if at least N % of the file appears in the available list, and
	 * in the first half of the file.  In other words, don't bother with
	 * compaction unless we have an expectation of moving N % of the file
	 * from the last half of the file to the first half of the file.)
	 */
	if (fh->size <= 10 * 1024)
		return (0);

	__wt_spin_lock(session, &block->live_lock);

	avail = 0;
	half = fh->size / 2;
	el = &block->live.avail;
	WT_EXT_FOREACH(ext, el->off)
		if (ext->off < half)
			avail += ext->size;
	pct = (int)((avail * 100) / fh->size);

	__wt_spin_unlock(session, &block->live_lock);

	if (pct >= trigger)
		*skipp = 0;

	WT_VERBOSE_RET(session, block,
	    "%s: compaction %s, %d%% of the free space in the available "
	    "list appears in the first half of the file",
	    block->name, pct < trigger ? "skipped" : "proceeding", pct);

	return (0);
}
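/*
 * Illustrative sketch, not WiredTiger code: the skip decision above boils
 * down to "what percentage of the file is free space that lives in the first
 * half of the file".  A standalone version of that arithmetic, over a
 * hypothetical array of free extents instead of WiredTiger's extent list,
 * might look like this.
 */
#include <stddef.h>
#include <sys/types.h>

struct demo_extent {
	off_t off;				/* extent file offset */
	off_t size;				/* extent length in bytes */
};

/*
 * demo_compact_skip --
 *	Return nonzero if compaction should be skipped: the free space found
 * in the first half of the file is less than trigger percent of the file.
 */
static int
demo_compact_skip(const struct demo_extent *avail,
    size_t navail, off_t file_size, int trigger)
{
	off_t free_bytes, half;
	size_t i;
	int pct;

	/* Ignore small files: they aren't worth compacting. */
	if (file_size <= 10 * 1024)
		return (1);

	free_bytes = 0;
	half = file_size / 2;
	for (i = 0; i < navail; ++i)
		if (avail[i].off < half)
			free_bytes += avail[i].size;

	pct = (int)((free_bytes * 100) / file_size);
	return (pct < trigger);
}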
/*
 * __wt_rwlock_destroy --
 *	Destroy a read/write lock.
 */
int
__wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp)
{
	WT_DECL_RET;
	WT_RWLOCK *rwlock;

	rwlock = *rwlockp;		/* Clear our caller's reference. */
	*rwlockp = NULL;

	WT_VERBOSE_RET(session, mutex,
	    "rwlock: destroy %s (%p)", rwlock->name, rwlock);

	if ((ret = pthread_rwlock_destroy(&rwlock->rwlock)) == 0) {
		__wt_free(session, rwlock);
		return (0);
	}

	/* Deliberately leak memory on error. */
	WT_RET_MSG(session, ret, "pthread_rwlock_destroy: %s", rwlock->name);
}
/*
 * __wt_rename --
 *	Rename a file.
 */
int
__wt_rename(WT_SESSION_IMPL *session, const char *from, const char *to)
{
	WT_DECL_RET;
	const char *from_path, *to_path;

	WT_VERBOSE_RET(session, fileops, "rename %s to %s", from, to);

	WT_RET(__wt_filename(session, from, &from_path));
	WT_RET(__wt_filename(session, to, &to_path));

	WT_SYSCALL_RETRY(rename(from_path, to_path), ret);

	__wt_free(session, from_path);
	__wt_free(session, to_path);

	if (ret == 0)
		return (0);
	WT_RET_MSG(session, ret, "rename %s to %s", from, to);
}
/*
 * __wt_mmap --
 *	Map a file into memory.
 */
int
__wt_mmap(WT_SESSION_IMPL *session, WT_FH *fh, void *mapp, size_t *lenp)
{
	void *map;

	WT_VERBOSE_RET(session, fileops,
	    "%s: map %" PRIuMAX " bytes", fh->name, (uintmax_t)fh->file_size);

	if ((map = mmap(NULL, (size_t)fh->file_size,
	    PROT_READ,
#ifdef MAP_NOCORE
	    MAP_NOCORE |
#endif
	    MAP_PRIVATE,
	    fh->fd, (off_t)0)) == MAP_FAILED) {
		WT_RET_MSG(session, __wt_errno(),
		    "%s map error: failed to map %" PRIuMAX " bytes",
		    fh->name, (uintmax_t)fh->file_size);
	}

	*(void **)mapp = map;
	*lenp = (size_t)fh->file_size;
	return (0);
}
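/*
 * Illustrative sketch, not WiredTiger code: a minimal standalone read-only
 * file mapping using the same PROT_READ/MAP_PRIVATE flags as the function
 * above, with the file descriptor closed once the mapping exists.
 */
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stddef.h>
#include <unistd.h>

/*
 * demo_map_file --
 *	Map a file read-only; returns 0 and sets *mapp/*lenp on success,
 * else -1.
 */
static int
demo_map_file(const char *path, void **mapp, size_t *lenp)
{
	struct stat sb;
	void *map;
	int fd;

	if ((fd = open(path, O_RDONLY)) == -1)
		return (-1);
	if (fstat(fd, &sb) == -1) {
		(void)close(fd);
		return (-1);
	}
	map = mmap(NULL, (size_t)sb.st_size,
	    PROT_READ, MAP_PRIVATE, fd, (off_t)0);
	(void)close(fd);		/* The mapping outlives the fd. */
	if (map == MAP_FAILED)
		return (-1);

	*mapp = map;
	*lenp = (size_t)sb.st_size;
	return (0);
}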
/* * __wt_block_write_off -- * Write a buffer into a block, returning the block's addr/size and * checksum. */ int __wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, off_t *offsetp, uint32_t *sizep, uint32_t *cksump, int data_cksum, int locked) { WT_BLOCK_HEADER *blk; WT_DECL_RET; WT_FH *fh; off_t offset; uint32_t align_size; blk = WT_BLOCK_HEADER_REF(buf->mem); fh = block->fh; /* Buffers should be aligned for writing. */ if (!F_ISSET(buf, WT_ITEM_ALIGNED)) { WT_ASSERT(session, F_ISSET(buf, WT_ITEM_ALIGNED)); WT_RET_MSG(session, EINVAL, "direct I/O check: write buffer incorrectly allocated"); } /* * Align the size to an allocation unit. * * The buffer must be big enough for us to zero to the next allocsize * boundary, this is one of the reasons the btree layer must find out * from the block-manager layer the maximum size of the eventual write. */ align_size = (uint32_t)WT_ALIGN(buf->size, block->allocsize); if (align_size > buf->memsize) { WT_ASSERT(session, align_size <= buf->memsize); WT_RET_MSG(session, EINVAL, "buffer size check: write buffer incorrectly allocated"); } /* Zero out any unused bytes at the end of the buffer. */ memset((uint8_t *)buf->mem + buf->size, 0, align_size - buf->size); /* * Set the disk size so we don't have to incrementally read blocks * during salvage. */ blk->disk_size = align_size; /* * Update the block's checksum: if our caller specifies, checksum the * complete data, otherwise checksum the leading WT_BLOCK_COMPRESS_SKIP * bytes. The assumption is applications with good compression support * turn off checksums and assume corrupted blocks won't decompress * correctly. However, if compression failed to shrink the block, the * block wasn't compressed, in which case our caller will tell us to * checksum the data to detect corruption. If compression succeeded, * we still need to checksum the first WT_BLOCK_COMPRESS_SKIP bytes * because they're not compressed, both to give salvage a quick test * of whether a block is useful and to give us a test so we don't lose * the first WT_BLOCK_COMPRESS_SKIP bytes without noticing. */ blk->flags = 0; if (data_cksum) F_SET(blk, WT_BLOCK_DATA_CKSUM); blk->cksum = 0; blk->cksum = __wt_cksum( buf->mem, data_cksum ? align_size : WT_BLOCK_COMPRESS_SKIP); if (!locked) __wt_spin_lock(session, &block->live_lock); ret = __wt_block_alloc(session, block, &offset, (off_t)align_size); if (!locked) __wt_spin_unlock(session, &block->live_lock); WT_RET(ret); #if defined(HAVE_POSIX_FALLOCATE) || defined(HAVE_FTRUNCATE) /* * Extend the file in chunks. We aren't holding a lock and we'd prefer * to limit the number of threads extending the file at the same time, * so choose the one thread that's crossing the extended boundary. We * don't extend newly created files, and it's theoretically possible we * might wait so long our extension of the file is passed by another * thread writing single blocks, that's why there's a check in case the * extended file size becomes too small: if the file size catches up, * every thread will try to extend it. 
*/ if (fh->extend_len != 0 && (fh->extend_size <= fh->size || (offset + fh->extend_len <= fh->extend_size && offset + fh->extend_len + align_size >= fh->extend_size))) { fh->extend_size = offset + fh->extend_len * 2; #if defined(HAVE_POSIX_FALLOCATE) if ((ret = posix_fallocate(fh->fd, offset, fh->extend_len * 2)) != 0) WT_RET_MSG( session, ret, "%s: posix_fallocate", fh->name); #elif defined(HAVE_FTRUNCATE) if ((ret = ftruncate(fh->fd, fh->extend_size)) != 0) WT_RET_MSG(session, ret, "%s: ftruncate", fh->name); #endif } #endif if ((ret = __wt_write(session, fh, offset, align_size, buf->mem)) != 0) { if (!locked) __wt_spin_lock(session, &block->live_lock); WT_TRET( __wt_block_off_free(session, block, offset, align_size)); if (!locked) __wt_spin_unlock(session, &block->live_lock); WT_RET(ret); } #ifdef HAVE_SYNC_FILE_RANGE /* * Optionally schedule writes for dirty pages in the system buffer * cache. */ if (block->os_cache_dirty_max != 0 && (block->os_cache_dirty += align_size) > block->os_cache_dirty_max) { block->os_cache_dirty = 0; if ((ret = sync_file_range(fh->fd, (off64_t)0, (off64_t)0, SYNC_FILE_RANGE_WRITE)) != 0) WT_RET_MSG( session, ret, "%s: sync_file_range", block->name); } #endif #ifdef HAVE_POSIX_FADVISE /* Optionally discard blocks from the system buffer cache. */ if (block->os_cache_max != 0 && (block->os_cache += align_size) > block->os_cache_max) { block->os_cache = 0; if ((ret = posix_fadvise(fh->fd, (off_t)0, (off_t)0, POSIX_FADV_DONTNEED)) != 0) WT_RET_MSG( session, ret, "%s: posix_fadvise", block->name); } #endif WT_CSTAT_INCR(session, block_write); WT_CSTAT_INCRV(session, block_byte_write, align_size); WT_VERBOSE_RET(session, write, "off %" PRIuMAX ", size %" PRIu32 ", cksum %" PRIu32, (uintmax_t)offset, align_size, blk->cksum); *offsetp = offset; *sizep = align_size; *cksump = blk->cksum; return (ret); }
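/*
 * Illustrative sketch, not WiredTiger code: __wt_block_write_off above
 * extends the file in large chunks so that most writes never change the file
 * size, and only the thread whose write crosses the pre-extended boundary
 * pays for the extension.  A standalone version of that idea, with
 * hypothetical demo_ field names mirroring fh->extend_size/extend_len, might
 * look like this.
 */
#include <fcntl.h>
#include <sys/types.h>

struct demo_fh {
	int fd;					/* file descriptor */
	off_t size;				/* current file size */
	off_t extend_size;			/* size we've pre-extended to */
	off_t extend_len;			/* pre-extension chunk size */
};

/*
 * demo_extend --
 *	If a write of len bytes at offset crosses the pre-extended boundary
 * (or the file has caught up with it), extend the file by another chunk;
 * returns 0 on success, else an errno value.
 */
static int
demo_extend(struct demo_fh *fh, off_t offset, off_t len)
{
	if (fh->extend_len == 0)
		return (0);

	/* Only extend when we've caught up or this write crosses the edge. */
	if (fh->extend_size > fh->size &&
	    (offset + fh->extend_len > fh->extend_size ||
	    offset + fh->extend_len + len < fh->extend_size))
		return (0);

	fh->extend_size = offset + fh->extend_len * 2;

	/*
	 * posix_fallocate() returns an errno value directly; ftruncate() is
	 * the usual fallback on systems without it.
	 */
	return (posix_fallocate(fh->fd, offset, fh->extend_len * 2));
}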
/* * __verify_tree -- * Verify a tree, recursively descending through it in depth-first fashion. * The page argument was physically verified (so we know it's correctly formed), * and the in-memory version built. Our job is to check logical relationships * in the page and in the tree. */ static int __verify_tree(WT_SESSION_IMPL *session, WT_PAGE *page, WT_VSTUFF *vs) { WT_BM *bm; WT_CELL *cell; WT_CELL_UNPACK *unpack, _unpack; WT_COL *cip; WT_DECL_RET; WT_REF *ref; uint64_t recno; uint32_t entry, i; int found, lno; bm = S2BT(session)->bm; unpack = &_unpack; WT_VERBOSE_RET(session, verify, "%s %s", __wt_page_addr_string(session, vs->tmp1, page), __wt_page_type_string(page->type)); #ifdef HAVE_DIAGNOSTIC if (vs->dump_address) WT_RET(__wt_msg(session, "%s %s", __wt_page_addr_string(session, vs->tmp1, page), __wt_page_type_string(page->type))); #endif /* * The page's physical structure was verified when it was read into * memory by the read server thread, and then the in-memory version * of the page was built. Now we make sure the page and tree are * logically consistent. * * !!! * The problem: (1) the read server has to build the in-memory version * of the page because the read server is the thread that flags when * any thread can access the page in the tree; (2) we can't build the * in-memory version of the page until the physical structure is known * to be OK, so the read server has to verify at least the physical * structure of the page; (3) doing complete page verification requires * reading additional pages (for example, overflow keys imply reading * overflow pages in order to test the key's order in the page); (4) * the read server cannot read additional pages because it will hang * waiting on itself. For this reason, we split page verification * into a physical verification, which allows the in-memory version * of the page to be built, and then a subsequent logical verification * which happens here. * * Report progress every 10 pages. */ if (++vs->fcnt % 10 == 0) WT_RET(__wt_progress(session, NULL, vs->fcnt)); #ifdef HAVE_DIAGNOSTIC /* Optionally dump the page in debugging mode. */ if (vs->dump_blocks && page->dsk != NULL) WT_RET(__wt_debug_disk(session, page->dsk, NULL)); if (vs->dump_pages) WT_RET(__wt_debug_page(session, page, NULL)); #endif /* * Column-store key order checks: check the page's record number and * then update the total record count. */ switch (page->type) { case WT_PAGE_COL_FIX: recno = page->u.col_fix.recno; goto recno_chk; case WT_PAGE_COL_INT: recno = page->u.intl.recno; goto recno_chk; case WT_PAGE_COL_VAR: recno = page->u.col_var.recno; recno_chk: if (recno != vs->record_total + 1) WT_RET_MSG(session, WT_ERROR, "page at %s has a starting record of %" PRIu64 " when the expected starting record is %" PRIu64, __wt_page_addr_string(session, vs->tmp1, page), recno, vs->record_total + 1); break; } switch (page->type) { case WT_PAGE_COL_FIX: vs->record_total += page->entries; break; case WT_PAGE_COL_VAR: recno = 0; WT_COL_FOREACH(page, cip, i) if ((cell = WT_COL_PTR(page, cip)) == NULL) ++recno; else { __wt_cell_unpack(cell, unpack); recno += __wt_cell_rle(unpack); } vs->record_total += recno; break; } /* * Row-store leaf page key order check: it's a depth-first traversal, * the first key on this page should be larger than any key previously * seen. */ switch (page->type) { case WT_PAGE_ROW_LEAF: WT_RET(__verify_row_leaf_key_order(session, page, vs)); break; } /* * Check overflow pages. 
We check overflow cells separately from other * tests that walk the page as it's simpler, and I don't care much how * fast table verify runs. * * Object if a leaf-no-overflow address cell references a page that has * overflow keys, but don't object if a standard address cell references * a page without overflow keys. The leaf-no-overflow address cell is * an optimization for trees without few, if any, overflow items, and * may not be set by reconciliation in all possible cases. */ if (WT_PAGE_IS_ROOT(page)) lno = 0; else { __wt_cell_unpack(page->ref->addr, unpack); lno = unpack->raw == WT_CELL_ADDR_LNO ? 1 : 0; } switch (page->type) { case WT_PAGE_COL_FIX: break; case WT_PAGE_COL_VAR: case WT_PAGE_ROW_INT: case WT_PAGE_ROW_LEAF: WT_RET(__verify_overflow_cell(session, page, &found, vs)); if (found && lno) WT_RET_MSG(session, WT_ERROR, "page at %s referenced in its parent by a cell of " "type %s illegally contains overflow items", __wt_page_addr_string(session, vs->tmp1, page), __wt_cell_type_string(WT_CELL_ADDR_LNO)); break; default: if (lno) WT_RET_MSG(session, WT_ERROR, "page at %s is of type %s and is illegally " "referenced in its parent by a cell of type %s", __wt_page_addr_string(session, vs->tmp1, page), __wt_page_type_string(page->type), __wt_cell_type_string(WT_CELL_ADDR_LNO)); break; } /* Check tree connections and recursively descend the tree. */ switch (page->type) { case WT_PAGE_COL_INT: /* For each entry in an internal page, verify the subtree. */ entry = 0; WT_REF_FOREACH(page, ref, i) { /* * It's a depth-first traversal: this entry's starting * record number should be 1 more than the total records * reviewed to this point. */ ++entry; if (ref->u.recno != vs->record_total + 1) { __wt_cell_unpack(ref->addr, unpack); WT_RET_MSG(session, WT_ERROR, "the starting record number in entry %" PRIu32 " of the column internal page at " "%s is %" PRIu64 " and the expected " "starting record number is %" PRIu64, entry, __wt_page_addr_string( session, vs->tmp1, page), ref->u.recno, vs->record_total + 1); } /* Verify the subtree. */ WT_RET(__wt_page_in(session, page, ref)); ret = __verify_tree(session, ref->page, vs); WT_TRET(__wt_page_release(session, ref->page)); WT_RET(ret); __wt_cell_unpack(ref->addr, unpack); WT_RET(bm->verify_addr( bm, session, unpack->data, unpack->size)); } break; case WT_PAGE_ROW_INT: /* For each entry in an internal page, verify the subtree. */ entry = 0; WT_REF_FOREACH(page, ref, i) { /* * It's a depth-first traversal: this entry's starting * key should be larger than the largest key previously * reviewed. * * The 0th key of any internal page is magic, and we * can't test against it. */ ++entry; if (entry != 1) WT_RET(__verify_row_int_key_order( session, page, ref, entry, vs)); /* Verify the subtree. */ WT_RET(__wt_page_in(session, page, ref)); ret = __verify_tree(session, ref->page, vs); WT_TRET(__wt_page_release(session, ref->page)); WT_RET(ret); __wt_cell_unpack(ref->addr, unpack); WT_RET(bm->verify_addr( bm, session, unpack->data, unpack->size)); }
/* * __rec_review -- * Get exclusive access to the page and review the page and its subtree * for conditions that would block its eviction. * * The ref and page arguments may appear to be redundant, because usually * ref->page == page and page->ref == ref. However, we need both because * (a) there are cases where ref == NULL (e.g., for root page or during * salvage), and (b) we can't safely look at page->ref until we have a * hazard pointer. */ static int __rec_review(WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE *page, int exclusive, int merge, int top) { WT_BTREE *btree; WT_DECL_RET; WT_PAGE_MODIFY *mod; WT_PAGE *t; uint32_t i; btree = session->btree; /* * Get exclusive access to the page if our caller doesn't have the tree * locked down. */ if (!exclusive) WT_RET(__hazard_exclusive(session, ref, top)); /* * Recurse through the page's subtree: this happens first because we * have to write pages in depth-first order, otherwise we'll dirty * pages after we've written them. */ if (page->type == WT_PAGE_COL_INT || page->type == WT_PAGE_ROW_INT) WT_REF_FOREACH(page, ref, i) switch (ref->state) { case WT_REF_DISK: /* On-disk */ case WT_REF_DELETED: /* On-disk, deleted */ break; case WT_REF_MEM: /* In-memory */ WT_RET(__rec_review(session, ref, ref->page, exclusive, merge, 0)); break; case WT_REF_EVICT_WALK: /* Walk point */ case WT_REF_EVICT_FORCE: /* Forced evict */ case WT_REF_LOCKED: /* Being evicted */ case WT_REF_READING: /* Being read */ return (EBUSY); } /* * If the file is being checkpointed, we cannot evict dirty pages, * because that may free a page that appears on an internal page in the * checkpoint. Don't rely on new updates being skipped by the * transaction used for transaction reads: (1) there are paths that * dirty pages for artificial reasons; (2) internal pages aren't * transactional; and (3) if an update was skipped during the * checkpoint (leaving the page dirty), then rolled back, we could * still successfully overwrite a page and corrupt the checkpoint. * * Further, even for clean pages, the checkpoint's reconciliation of an * internal page might race with us as we evict a child in the page's * subtree. * * One half of that test is in the reconciliation code: the checkpoint * thread waits for eviction-locked pages to settle before determining * their status. The other half of the test is here: after acquiring * the exclusive eviction lock on a page, confirm no page in the page's * stack of pages from the root is being reconciled in a checkpoint. * This ensures we either see the checkpoint-walk state here, or the * reconciliation of the internal page sees our exclusive lock on the * child page and waits until we're finished evicting the child page * (or give up if eviction isn't possible). * * We must check the full stack (we might be attempting to evict a leaf * page multiple levels beneath the internal page being reconciled as * part of the checkpoint, and all of the intermediate nodes are being * merged into the internal page). * * There's no simple test for knowing if a page in our page stack is * involved in a checkpoint. The internal page's checkpoint-walk flag * is the best test, but it's not set anywhere for the root page, it's * not a complete test. * * Quit for any page that's not a simple, in-memory page. (Almost the * same as checking for the checkpoint-walk flag. 
I don't think there * are code paths that change the page's status from checkpoint-walk, * but these races are hard enough I'm not going to proceed if there's * anything other than a vanilla, in-memory tree stack.) Climb until * we find a page which can't be merged into its parent, and failing if * we never find such a page. */ if (btree->checkpointing && !merge && __wt_page_is_modified(page)) { ckpt: WT_CSTAT_INCR(session, cache_eviction_checkpoint); WT_DSTAT_INCR(session, cache_eviction_checkpoint); return (EBUSY); } if (btree->checkpointing && top) for (t = page->parent;; t = t->parent) { if (t == NULL || t->ref == NULL) /* root */ goto ckpt; if (t->ref->state != WT_REF_MEM) /* scary */ goto ckpt; if (t->modify == NULL || /* not merged */ !F_ISSET(t->modify, WT_PM_REC_EMPTY | WT_PM_REC_SPLIT | WT_PM_REC_SPLIT_MERGE)) break; } /* * If we are merging internal pages, we just need exclusive access, we * don't need to write everything. */ if (merge) return (0); /* * Fail if any page in the top-level page's subtree won't be merged into * its parent, the page that cannot be merged must be evicted first. * The test is necessary but should not fire much: the eviction code is * biased for leaf pages, an internal page shouldn't be selected for * eviction until its children have been evicted. * * We have to write dirty pages to know their final state, a page marked * empty may have had records added since reconciliation, a page marked * split may have had records deleted and no longer need to split. * Split-merge pages are the exception: they can never be change into * anything other than a split-merge page and are merged regardless of * being clean or dirty. * * Writing the page is expensive, do a cheap test first: if it doesn't * appear a subtree page can be merged, quit. It's possible the page * has been emptied since it was last reconciled, and writing it before * testing might be worthwhile, but it's more probable we're attempting * to evict an internal page with live children, and that's a waste of * time. */ mod = page->modify; if (!top && (mod == NULL || !F_ISSET(mod, WT_PM_REC_EMPTY | WT_PM_REC_SPLIT | WT_PM_REC_SPLIT_MERGE))) return (EBUSY); /* * If the page is dirty and can possibly change state, write it so we * know the final state. */ if (__wt_page_is_modified(page) && !F_ISSET(mod, WT_PM_REC_SPLIT_MERGE)) { ret = __wt_rec_write(session, page, NULL, WT_EVICTION_SERVER_LOCKED | WT_SKIP_UPDATE_QUIT); /* * Update the page's modification reference, reconciliation * might have changed it. */ mod = page->modify; /* If there are unwritten changes on the page, give up. */ if (ret == EBUSY) { WT_VERBOSE_RET(session, evict, "eviction failed, reconciled page not clean"); /* * We may be able to discard any "update" memory the * page no longer needs. */ switch (page->type) { case WT_PAGE_COL_FIX: case WT_PAGE_COL_VAR: __wt_col_leaf_obsolete(session, page); break; case WT_PAGE_ROW_LEAF: __wt_row_leaf_obsolete(session, page); break; } } WT_RET(ret); WT_ASSERT(session, __wt_page_is_modified(page) == 0); } /* * Repeat the test: fail if any page in the top-level page's subtree * won't be merged into its parent. */ if (!top && (mod == NULL || !F_ISSET(mod, WT_PM_REC_EMPTY | WT_PM_REC_SPLIT | WT_PM_REC_SPLIT_MERGE))) return (EBUSY); return (0); }
/* * __rec_review -- * Get exclusive access to the page and review the page and its subtree * for conditions that would block its eviction. * * The ref and page arguments may appear to be redundant, because usually * ref->page == page and page->ref == ref. However, we need both because * (a) there are cases where ref == NULL (e.g., for root page or during * salvage), and (b) we can't safely look at page->ref until we have a * hazard reference. */ static int __rec_review(WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE *page, uint32_t flags, int top) { WT_DECL_RET; WT_PAGE_MODIFY *mod; WT_TXN *txn; uint32_t i; txn = &session->txn; /* * Get exclusive access to the page if our caller doesn't have the tree * locked down. */ if (!LF_ISSET(WT_REC_SINGLE)) WT_RET(__hazard_exclusive(session, ref, top)); /* * Recurse through the page's subtree: this happens first because we * have to write pages in depth-first order, otherwise we'll dirty * pages after we've written them. */ if (page->type == WT_PAGE_COL_INT || page->type == WT_PAGE_ROW_INT) WT_REF_FOREACH(page, ref, i) switch (ref->state) { case WT_REF_DISK: /* On-disk */ case WT_REF_DELETED: /* On-disk, deleted */ break; case WT_REF_MEM: /* In-memory */ WT_RET(__rec_review( session, ref, ref->page, flags, 0)); break; case WT_REF_EVICT_WALK: /* Walk point */ case WT_REF_LOCKED: /* Being evicted */ case WT_REF_READING: /* Being read */ return (EBUSY); } /* * Check if this page can be evicted: * * Fail if the top-level page is a page expected to be removed from the * tree as part of eviction (an empty page or a split-merge page). Note * "split" pages are NOT included in this test, because a split page can * be separately evicted, at which point it's replaced in its parent by * a reference to a split-merge page. That's a normal part of the leaf * page life-cycle if it grows too large and must be pushed out of the * cache. There is also an exception for empty pages, the root page may * be empty when evicted, but that only happens when the tree is closed. * * Fail if any page in the top-level page's subtree can't be merged into * its parent. You can't evict a page that references such in-memory * pages, they must be evicted first. The test is necessary but should * not fire much: the LRU-based eviction code is biased for leaf pages, * an internal page shouldn't be selected for LRU-based eviction until * its children have been evicted. Empty, split and split-merge pages * are all included in this test, they can all be merged into a parent. * * We have to write dirty pages to know their final state, a page marked * empty may have had records added since reconciliation, a page marked * split may have had records deleted and no longer need to split. * Split-merge pages are the exception: they can never be change into * anything other than a split-merge page and are merged regardless of * being clean or dirty. * * Writing the page is expensive, do a cheap test first: if it doesn't * appear a subtree page can be merged, quit. It's possible the page * has been emptied since it was last reconciled, and writing it before * testing might be worthwhile, but it's more probable we're attempting * to evict an internal page with live children, and that's a waste of * time. * * We don't do a cheap test for the top-level page: we're not called * to evict split-merge pages, which means the only interesting case * is an empty page. If the eviction thread picked an "empty" page * for eviction, it must have had reason, probably the empty page got * really, really full. 
*/ mod = page->modify; if (!top && (mod == NULL || !F_ISSET(mod, WT_PM_REC_EMPTY | WT_PM_REC_SPLIT | WT_PM_REC_SPLIT_MERGE))) return (EBUSY); /* If the page is dirty, write it so we know the final state. */ if (__wt_page_is_modified(page) && !F_ISSET(mod, WT_PM_REC_SPLIT_MERGE)) { ret = __wt_rec_write(session, page, NULL, flags); /* If there are unwritten changes on the page, give up. */ if (ret == 0 && !LF_ISSET(WT_REC_SINGLE) && __wt_page_is_modified(page)) ret = EBUSY; if (ret == EBUSY) { WT_VERBOSE_RET(session, evict, "page %p written but not clean", page); if (F_ISSET(txn, TXN_RUNNING) && ++txn->eviction_fails >= 100) { txn->eviction_fails = 0; ret = WT_DEADLOCK; WT_STAT_INCR( S2C(session)->stats, txn_fail_cache); } /* * If there aren't multiple cursors active, there * are no consistency issues: try to bump our snapshot. */ if (session->ncursors <= 1) { __wt_txn_read_last(session); __wt_txn_read_first(session); } switch (page->type) { case WT_PAGE_COL_FIX: case WT_PAGE_COL_VAR: __wt_col_leaf_obsolete(session, page); break; case WT_PAGE_ROW_LEAF: __wt_row_leaf_obsolete(session, page); break; } } WT_RET(ret); txn->eviction_fails = 0; } /* * Repeat the eviction tests. * * Fail if the top-level page should be merged into its parent, and it's * not the root page. * * Fail if a page in the top-level page's subtree can't be merged into * its parent. */ if (top) { /* * We never get a top-level split-merge page to evict, they are * ignored by the eviction thread. Check out of sheer paranoia. */ if (mod != NULL) { if (F_ISSET(mod, WT_PM_REC_SPLIT_MERGE)) return (EBUSY); if (F_ISSET(mod, WT_PM_REC_EMPTY) && !WT_PAGE_IS_ROOT(page)) return (EBUSY); } } else if (mod == NULL || !F_ISSET(mod, WT_PM_REC_EMPTY | WT_PM_REC_SPLIT | WT_PM_REC_SPLIT_MERGE)) return (EBUSY); return (0); }
/* * __wt_block_read_off -- * Read an addr/size pair referenced block into a buffer. */ int __wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, off_t offset, uint32_t size, uint32_t cksum) { WT_BLOCK_HEADER *blk; WT_DECL_ITEM(tmp); WT_DECL_RET; WT_PAGE_HEADER *dsk; size_t result_len; uint32_t page_cksum; WT_VERBOSE_RET(session, read, "off %" PRIuMAX ", size %" PRIu32 ", cksum %" PRIu32, (uintmax_t)offset, size, cksum); #ifdef HAVE_DIAGNOSTIC /* * In diagnostic mode, verify the block we're about to read isn't on * either the available or discard lists. * * Don't check during salvage, it's possible we're reading an already * freed overflow page. */ if (!F_ISSET(session, WT_SESSION_SALVAGE_QUIET_ERR)) WT_RET( __wt_block_misplaced(session, block, "read", offset, size)); #endif /* * If we're compressing the file blocks, place the initial read into a * scratch buffer, we're going to have to re-allocate more memory for * decompression. Else check the caller's buffer size and grow it as * necessary, there will only be one buffer. */ if (block->compressor == NULL) { F_SET(buf, WT_ITEM_ALIGNED); WT_RET(__wt_buf_init(session, buf, size)); buf->size = size; dsk = buf->mem; } else { WT_RET(__wt_scr_alloc(session, size, &tmp)); tmp->size = size; dsk = tmp->mem; } /* Read. */ WT_ERR(__wt_read(session, block->fh, offset, size, dsk)); blk = WT_BLOCK_HEADER_REF(dsk); /* Validate the checksum. */ if (block->checksum && cksum != WT_BLOCK_CHECKSUM_NOT_SET && blk->cksum != WT_BLOCK_CHECKSUM_NOT_SET) { blk->cksum = 0; page_cksum = __wt_cksum(dsk, size); if (page_cksum == WT_BLOCK_CHECKSUM_NOT_SET) ++page_cksum; if (cksum != page_cksum) { if (!F_ISSET(session, WT_SESSION_SALVAGE_QUIET_ERR)) __wt_errx(session, "read checksum error [%" PRIu32 "B @ %" PRIuMAX ", %" PRIu32 " != %" PRIu32 "]", size, (uintmax_t)offset, cksum, page_cksum); WT_ERR(WT_ERROR); } } /* * If the in-memory block size is larger than the on-disk block size, * the block is compressed. Size the user's buffer, copy the skipped * bytes of the original image into place, then decompress. * * If the in-memory block size is less than or equal to the on-disk * block size, the block is not compressed. */ if (blk->disk_size < dsk->size) { if (block->compressor == NULL) WT_ERR(__wt_illegal_value(session, block->name)); WT_ERR(__wt_buf_init(session, buf, dsk->size)); buf->size = dsk->size; /* * Note the source length is NOT the number of compressed bytes, * it's the length of the block we just read (minus the skipped * bytes). We don't store the number of compressed bytes: some * compression engines need that length stored externally, they * don't have markers in the stream to signal the end of the * compressed bytes. Those engines must store the compressed * byte length somehow, see the snappy compression extension for * an example. */ memcpy(buf->mem, tmp->mem, WT_BLOCK_COMPRESS_SKIP); WT_ERR(block->compressor->decompress( block->compressor, &session->iface, (uint8_t *)tmp->mem + WT_BLOCK_COMPRESS_SKIP, tmp->size - WT_BLOCK_COMPRESS_SKIP, (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP, dsk->size - WT_BLOCK_COMPRESS_SKIP, &result_len)); if (result_len != dsk->size - WT_BLOCK_COMPRESS_SKIP) WT_ERR(__wt_illegal_value(session, block->name)); } else if (block->compressor == NULL) buf->size = dsk->size; else /* * We guessed wrong: there was a compressor, but this * block was not compressed, and now the page is in the * wrong buffer and the buffer may be of the wrong size. 
* This should be rare, why configure a compressor that * doesn't work? Allocate a buffer of the right size * (we used a scratch buffer which might be large), and * copy the data into place. */ WT_ERR( __wt_buf_set(session, buf, tmp->data, dsk->size)); WT_BSTAT_INCR(session, page_read); WT_CSTAT_INCR(session, block_read); err: __wt_scr_free(&tmp); return (ret); }
/* * __wt_open -- * Open a file handle. */ int __wt_open(WT_SESSION_IMPL *session, const char *name, int ok_create, int exclusive, int dio_type, WT_FH **fhp) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_FH *fh, *tfh; mode_t mode; int direct_io, f, fd, matched; const char *path; conn = S2C(session); fh = NULL; fd = -1; path = NULL; WT_VERBOSE_RET(session, fileops, "%s: open", name); /* Increment the reference count if we already have the file open. */ matched = 0; __wt_spin_lock(session, &conn->fh_lock); TAILQ_FOREACH(tfh, &conn->fhqh, q) if (strcmp(name, tfh->name) == 0) { ++tfh->refcnt; *fhp = tfh; matched = 1; break; } __wt_spin_unlock(session, &conn->fh_lock); if (matched) return (0); WT_RET(__wt_filename(session, name, &path)); f = O_RDWR; #ifdef O_BINARY /* Windows clones: we always want to treat the file as a binary. */ f |= O_BINARY; #endif #ifdef O_CLOEXEC /* * Security: * The application may spawn a new process, and we don't want another * process to have access to our file handles. */ f |= O_CLOEXEC; #endif #ifdef O_NOATIME /* Avoid updating metadata for read-only workloads. */ if (dio_type == WT_FILE_TYPE_DATA) f |= O_NOATIME; #endif if (ok_create) { f |= O_CREAT; if (exclusive) f |= O_EXCL; mode = 0666; } else mode = 0; direct_io = 0; #ifdef O_DIRECT if (dio_type && FLD_ISSET(conn->direct_io, dio_type)) { f |= O_DIRECT; direct_io = 1; } #endif if (dio_type == WT_FILE_TYPE_LOG && FLD_ISSET(conn->txn_logsync, WT_LOG_DSYNC)) #ifdef O_DSYNC f |= O_DSYNC; #elif defined(O_SYNC) f |= O_SYNC; #else WT_ERR_MSG(session, ENOTSUP, "Unsupported log sync mode requested"); #endif WT_SYSCALL_RETRY(((fd = open(path, f, mode)) == -1 ? 1 : 0), ret); if (ret != 0) WT_ERR_MSG(session, ret, direct_io ? "%s: open failed with direct I/O configured, some " "filesystem types do not support direct I/O" : "%s", path); #if defined(HAVE_FCNTL) && defined(FD_CLOEXEC) && !defined(O_CLOEXEC) /* * Security: * The application may spawn a new process, and we don't want another * process to have access to our file handles. There's an obvious * race here, so we prefer the flag to open if available. */ if ((f = fcntl(fd, F_GETFD)) == -1 || fcntl(fd, F_SETFD, f | FD_CLOEXEC) == -1) WT_ERR_MSG(session, __wt_errno(), "%s: fcntl", name); #endif #if defined(HAVE_POSIX_FADVISE) /* Disable read-ahead on trees: it slows down random read workloads. */ if (dio_type == WT_FILE_TYPE_DATA) WT_ERR(posix_fadvise(fd, 0, 0, POSIX_FADV_RANDOM)); #endif if (F_ISSET(conn, WT_CONN_CKPT_SYNC)) WT_ERR(__open_directory_sync(session)); WT_ERR(__wt_calloc(session, 1, sizeof(WT_FH), &fh)); WT_ERR(__wt_strdup(session, name, &fh->name)); fh->fd = fd; fh->refcnt = 1; fh->direct_io = direct_io; /* Set the file's size. */ WT_ERR(__wt_filesize(session, fh, &fh->size)); /* Configure file extension. */ if (dio_type == WT_FILE_TYPE_DATA) fh->extend_len = conn->data_extend_len; /* * Repeat the check for a match, but then link onto the database's list * of files. */ matched = 0; __wt_spin_lock(session, &conn->fh_lock); TAILQ_FOREACH(tfh, &conn->fhqh, q) if (strcmp(name, tfh->name) == 0) { ++tfh->refcnt; *fhp = tfh; matched = 1; break; } if (!matched) { TAILQ_INSERT_TAIL(&conn->fhqh, fh, q); WT_STAT_FAST_CONN_INCR(session, file_open); *fhp = fh; } __wt_spin_unlock(session, &conn->fh_lock); if (matched) { err: if (fh != NULL) { __wt_free(session, fh->name); __wt_free(session, fh); } if (fd != -1) (void)close(fd); } __wt_free(session, path); return (ret); }
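/*
 * Illustrative sketch, not WiredTiger code: when O_CLOEXEC isn't available at
 * open() time, the close-on-exec flag can still be set afterwards with
 * fcntl(), as the function above does (accepting the small race between the
 * two calls).  A standalone version of that fallback might look like this.
 */
#include <fcntl.h>

/*
 * demo_set_cloexec --
 *	Mark a file descriptor close-on-exec so spawned processes don't
 * inherit it; returns -1 on error.
 */
static int
demo_set_cloexec(int fd)
{
	int flags;

	if ((flags = fcntl(fd, F_GETFD)) == -1)
		return (-1);
	return (fcntl(fd, F_SETFD, flags | FD_CLOEXEC));
}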
/* * __wt_hazard_set -- * Set a hazard pointer. */ int __wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref, int *busyp #ifdef HAVE_DIAGNOSTIC , const char *file, int line #endif ) { WT_BTREE *btree; WT_HAZARD *hp; int restarts = 0; btree = S2BT(session); *busyp = 0; /* If a file can never be evicted, hazard pointers aren't required. */ if (F_ISSET(btree, WT_BTREE_NO_HAZARD)) return (0); /* * Do the dance: * * The memory location which makes a page "real" is the WT_REF's state * of WT_REF_MEM, which can be set to WT_REF_LOCKED at any time by the * page eviction server. * * Add the WT_REF reference to the session's hazard list and flush the * write, then see if the page's state is still valid. If so, we can * use the page because the page eviction server will see our hazard * pointer before it discards the page (the eviction server sets the * state to WT_REF_LOCKED, then flushes memory and checks the hazard * pointers). * * For sessions with many active hazard pointers, skip most of the * active slots: there may be a free slot in there, but checking is * expensive. Most hazard pointers are released quickly: optimize * for that case. */ for (hp = session->hazard + session->nhazard;; ++hp) { /* Expand the number of hazard pointers if available.*/ if (hp >= session->hazard + session->hazard_size) { if (session->hazard_size >= S2C(session)->hazard_max) break; /* Restart the search. */ if (session->nhazard < session->hazard_size && restarts++ == 0) { hp = session->hazard; continue; } WT_PUBLISH(session->hazard_size, WT_MIN(session->hazard_size + WT_HAZARD_INCR, S2C(session)->hazard_max)); } if (hp->page != NULL) continue; hp->page = ref->page; #ifdef HAVE_DIAGNOSTIC hp->file = file; hp->line = line; #endif /* Publish the hazard pointer before reading page's state. */ WT_FULL_BARRIER(); /* * Check if the page state is still valid, where valid means a * state of WT_REF_MEM or WT_REF_EVICT_WALK and the pointer is * unchanged. (The pointer can change, it means the page was * evicted between the time we set our hazard pointer and the * publication. It would theoretically be possible for the * page to be evicted and a different page read into the same * memory, so the pointer hasn't changed but the contents have. * That's OK, we found this page using the tree's key space, * whatever page we find here is the page for us to use.) */ if (ref->page == hp->page && (ref->state == WT_REF_MEM || ref->state == WT_REF_EVICT_WALK)) { WT_VERBOSE_RET(session, hazard, "session %p hazard %p: set", session, ref->page); ++session->nhazard; return (0); } /* * The page isn't available, it's being considered for eviction * (or being evicted, for all we know). If the eviction server * sees our hazard pointer before evicting the page, it will * return the page to use, no harm done, if it doesn't, it will * go ahead and complete the eviction. * * We don't bother publishing this update: the worst case is we * prevent some random page from being evicted. */ hp->page = NULL; *busyp = 1; return (0); } __wt_errx(session, "session %p: hazard pointer table full", session); #ifdef HAVE_DIAGNOSTIC __hazard_dump(session); #endif return (ENOMEM); }
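/*
 * Illustrative sketch, not WiredTiger code: the heart of the function above
 * is "publish the hazard pointer, then re-check that the page is still
 * valid", with a full memory barrier between the two steps.  A minimal
 * standalone version of that ordering, using C11 atomics and hypothetical
 * demo_ types in place of WT_REF and WT_HAZARD, might look like this.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

enum demo_state { DEMO_MEM, DEMO_LOCKED };

struct demo_ref {
	void *_Atomic page;			/* current page address */
	_Atomic int state;			/* page state */
};

struct demo_hazard {
	void *_Atomic page;			/* published hazard pointer */
};

/*
 * demo_hazard_set --
 *	Publish a hazard pointer for ref's page and confirm the page is
 * still in memory; returns true if the page is safe to use.
 */
static bool
demo_hazard_set(struct demo_hazard *hp, struct demo_ref *ref)
{
	void *page;

	page = atomic_load(&ref->page);

	/* Publish the hazard pointer, then order it before the re-check. */
	atomic_store(&hp->page, page);
	atomic_thread_fence(memory_order_seq_cst);

	/* Re-check: an evictor may have locked the page in the meantime. */
	if (atomic_load(&ref->page) == page &&
	    atomic_load(&ref->state) == DEMO_MEM)
		return (true);

	/* Not safe: withdraw the hazard pointer and let the caller retry. */
	atomic_store(&hp->page, NULL);
	return (false);
}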
/*
 * __wt_block_read_off --
 *	Read an addr/size pair referenced block into a buffer.
 */
int
__wt_block_read_off(WT_SESSION_IMPL *session,
    WT_BLOCK *block, WT_ITEM *buf, off_t offset, uint32_t size, uint32_t cksum)
{
	WT_BLOCK_HEADER *blk;
	uint32_t alloc_size, page_cksum;

	WT_VERBOSE_RET(session, read,
	    "off %" PRIuMAX ", size %" PRIu32 ", cksum %" PRIu32,
	    (uintmax_t)offset, size, cksum);

#ifdef HAVE_DIAGNOSTIC
	/*
	 * In diagnostic mode, verify the block we're about to read isn't on
	 * either the available or discard lists.
	 *
	 * Don't check during salvage, it's possible we're reading an already
	 * freed overflow page.
	 */
	if (!F_ISSET(session, WT_SESSION_SALVAGE_QUIET_ERR))
		WT_RET(
		    __wt_block_misplaced(session, block, "read", offset, size));
#endif

	/*
	 * Grow the buffer as necessary and read the block.  Buffers should be
	 * aligned for reading, but there are lots of buffers (for example,
	 * file cursors have two buffers each, key and value), and it's
	 * difficult to be sure we've found all of them.  If the buffer isn't
	 * aligned, it's an easy fix: set the flag and guarantee we reallocate
	 * it.  (Most of the time on reads, the buffer memory has not yet been
	 * allocated, so we're not adding any additional processing time.)
	 */
	if (F_ISSET(buf, WT_ITEM_ALIGNED))
		alloc_size = size;
	else {
		F_SET(buf, WT_ITEM_ALIGNED);
		alloc_size = (uint32_t)WT_MAX(size, buf->memsize + 10);
	}
	WT_RET(__wt_buf_init(session, buf, alloc_size));
	WT_RET(__wt_read(session, block->fh, offset, size, buf->mem));
	buf->size = size;

	blk = WT_BLOCK_HEADER_REF(buf->mem);
	blk->cksum = 0;
	page_cksum = __wt_cksum(buf->mem,
	    F_ISSET(blk, WT_BLOCK_DATA_CKSUM) ? size : WT_BLOCK_COMPRESS_SKIP);
	if (cksum != page_cksum) {
		if (!F_ISSET(session, WT_SESSION_SALVAGE_QUIET_ERR))
			__wt_errx(session,
			    "read checksum error [%" PRIu32 "B @ %" PRIuMAX
			    ", %" PRIu32 " != %" PRIu32 "]",
			    size, (uintmax_t)offset, cksum, page_cksum);
		return (WT_ERROR);
	}

	WT_CSTAT_INCR(session, block_read);
	WT_CSTAT_INCRV(session, block_byte_read, size);

	return (0);
}
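/*
 * Illustrative sketch, not WiredTiger code: the checksum stored in a block
 * header was computed while that field held zero, so verification has to
 * zero the field again before recomputing, as the function above does.  A
 * standalone version of the idea, with a hypothetical header layout and a
 * stand-in checksum function, might look like this.
 */
#include <stddef.h>
#include <stdint.h>

struct demo_block_header {
	uint32_t disk_size;			/* on-disk block size */
	uint32_t cksum;				/* block checksum */
};

/* Stand-in for a real checksum routine (WiredTiger uses __wt_cksum). */
extern uint32_t demo_cksum(const void *data, size_t len);

/*
 * demo_verify_block --
 *	Verify a block read from disk against the checksum recorded in its
 * address cookie; returns 0 on success, -1 on checksum mismatch.
 */
static int
demo_verify_block(void *block, uint32_t size,
    size_t header_offset, uint32_t expected_cksum)
{
	struct demo_block_header *blk;
	uint32_t page_cksum;

	blk = (struct demo_block_header *)((uint8_t *)block + header_offset);

	/* The stored checksum was computed with the field zeroed. */
	blk->cksum = 0;
	page_cksum = demo_cksum(block, size);

	return (page_cksum == expected_cksum ? 0 : -1);
}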
/*
 * __wt_block_salvage_next --
 *	Return the next block from the file.
 */
int
__wt_block_salvage_next(
    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf,
    uint8_t *addr, uint32_t *addr_sizep, uint64_t *write_genp, int *eofp)
{
	WT_BLOCK_HEADER *blk;
	WT_FH *fh;
	off_t max, offset;
	uint32_t allocsize, cksum, size;
	uint8_t *endp;

	*eofp = 0;

	offset = block->slvg_off;
	fh = block->fh;
	allocsize = block->allocsize;
	WT_RET(__wt_buf_initsize(session, buf, allocsize));

	/* Read through the file, looking for pages with valid checksums. */
	for (max = fh->file_size;;) {
		if (offset >= max) {			/* Check eof. */
			*eofp = 1;
			return (0);
		}

		/*
		 * Read the start of a possible page (an allocation-size
		 * block), and get a page length from it.
		 */
		WT_RET(__wt_read(session, fh, offset, allocsize, buf->mem));
		blk = WT_BLOCK_HEADER_REF(buf->mem);

		/*
		 * The page can't be more than the min/max page size, or past
		 * the end of the file.
		 */
		size = blk->disk_size;
		cksum = blk->cksum;
		if (size == 0 ||
		    size % allocsize != 0 ||
		    size > WT_BTREE_PAGE_SIZE_MAX ||
		    offset + (off_t)size > max)
			goto skip;

		/*
		 * The page size isn't insane, read the entire page: reading
		 * the page validates the checksum and then decompresses the
		 * page as needed.  If reading the page fails, it's probably
		 * corruption, we ignore this block.
		 */
		if (__wt_block_read_off(
		    session, block, buf, offset, size, cksum)) {
skip:			WT_VERBOSE_RET(session, salvage,
			    "skipping %" PRIu32 "B at file offset %" PRIuMAX,
			    allocsize, (uintmax_t)offset);

			/*
			 * Free the block and make sure we don't return it
			 * more than once.
			 */
			WT_RET(__wt_block_off_free(
			    session, block, offset, (off_t)allocsize));
			block->slvg_off = offset += allocsize;
			continue;
		}

		/*
		 * Valid block, return to our caller.
		 *
		 * The buffer may have grown: make sure we read from the full
		 * page image.
		 */
		blk = WT_BLOCK_HEADER_REF(buf->mem);
		break;
	}

	/*
	 * Track the largest write-generation we've seen in the file so future
	 * writes, done after salvage completes, are preferred to these blocks.
	 */
	*write_genp = blk->write_gen;
	if (block->live.write_gen < blk->write_gen)
		block->live.write_gen = blk->write_gen;

	/* Re-create the address cookie that should reference this block. */
	endp = addr;
	WT_RET(__wt_block_addr_to_buffer(block, &endp, offset, size, cksum));
	*addr_sizep = WT_PTRDIFF32(endp, addr);

	/* We're successfully returning the page, move past it. */
	block->slvg_off = offset + size;

	return (0);
}
/* * __wt_rec_evict -- * Reconciliation plus eviction. */ int __wt_rec_evict(WT_SESSION_IMPL *session, WT_PAGE *page, int exclusive) { WT_DECL_RET; WT_PAGE_MODIFY *mod; int merge; WT_VERBOSE_RET(session, evict, "page %p (%s)", page, __wt_page_type_string(page->type)); WT_ASSERT(session, session->excl_next == 0); /* * If we get a split-merge page during normal eviction, try to collapse * it. During close, it will be merged into its parent. */ mod = page->modify; merge = __wt_btree_mergeable(page); if (merge && exclusive) return (EBUSY); WT_ASSERT(session, merge || mod == NULL || !F_ISSET(mod, WT_PM_REC_SPLIT_MERGE)); /* * Get exclusive access to the page and review the page and its subtree * for conditions that would block our eviction of the page. If the * check fails (for example, we find a child page that can't be merged), * we're done. We have to make this check for clean pages, too: while * unlikely eviction would choose an internal page with children, it's * not disallowed anywhere. * * Note that page->ref may be NULL in some cases (e.g., for root pages * or during salvage). That's OK if exclusive is set: we won't check * hazard pointers in that case. */ WT_ERR(__rec_review(session, page->ref, page, exclusive, merge, 1)); /* Try to merge internal pages. */ if (merge) WT_ERR(__wt_merge_tree(session, page)); /* * Update the page's modification reference, reconciliation might have * changed it. */ mod = page->modify; /* Count evictions of internal pages during normal operation. */ if (!exclusive && !merge && (page->type == WT_PAGE_COL_INT || page->type == WT_PAGE_ROW_INT)) { WT_CSTAT_INCR(session, cache_eviction_internal); WT_DSTAT_INCR(session, cache_eviction_internal); } /* * Update the parent and discard the page. */ if (mod == NULL || !F_ISSET(mod, WT_PM_REC_MASK)) { WT_ASSERT(session, exclusive || page->ref->state == WT_REF_LOCKED); if (WT_PAGE_IS_ROOT(page)) __rec_root_update(session); else __rec_page_clean_update(session, page); /* Discard the page. */ __rec_discard_page(session, page, exclusive); WT_CSTAT_INCR(session, cache_eviction_clean); WT_DSTAT_INCR(session, cache_eviction_clean); } else { if (WT_PAGE_IS_ROOT(page)) __rec_root_update(session); else WT_ERR(__rec_page_dirty_update(session, page)); /* Discard the tree rooted in this page. */ __rec_discard_tree(session, page, exclusive); WT_CSTAT_INCR(session, cache_eviction_dirty); WT_DSTAT_INCR(session, cache_eviction_dirty); } if (0) { err: /* * If unable to evict this page, release exclusive reference(s) * we've acquired. */ __rec_excl_clear(session); WT_CSTAT_INCR(session, cache_eviction_fail); WT_DSTAT_INCR(session, cache_eviction_fail); } session->excl_next = 0; return (ret); }
/* * __wt_rec_evict -- * Reconciliation plus eviction. */ int __wt_rec_evict(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; int single; conn = S2C(session); WT_VERBOSE_RET(session, evict, "page %p (%s)", page, __wt_page_type_string(page->type)); WT_ASSERT(session, session->excl_next == 0); single = LF_ISSET(WT_REC_SINGLE) ? 1 : 0; /* * Get exclusive access to the page and review the page and its subtree * for conditions that would block our eviction of the page. If the * check fails (for example, we find a child page that can't be merged), * we're done. We have to make this check for clean pages, too: while * unlikely eviction would choose an internal page with children, it's * not disallowed anywhere. * * Note that page->ref may be NULL in some cases (e.g., for root pages * or during salvage). That's OK if WT_REC_SINGLE is set: we won't * check hazard references in that case. */ WT_ERR(__rec_review(session, page->ref, page, flags, 1)); /* Count evictions of internal pages during normal operation. */ if (!single && (page->type == WT_PAGE_COL_INT || page->type == WT_PAGE_ROW_INT)) WT_STAT_INCR(conn->stats, cache_evict_internal); /* Update the parent and discard the page. */ if (page->modify == NULL || !F_ISSET(page->modify, WT_PM_REC_MASK)) { WT_STAT_INCR(conn->stats, cache_evict_unmodified); WT_ASSERT(session, single || page->ref->state == WT_REF_LOCKED); if (WT_PAGE_IS_ROOT(page)) __rec_root_update(session); else __rec_page_clean_update(session, page); /* Discard the page. */ __rec_discard_page(session, page, single); } else { WT_STAT_INCR(conn->stats, cache_evict_modified); if (WT_PAGE_IS_ROOT(page)) __rec_root_update(session); else WT_ERR(__rec_page_dirty_update(session, page)); /* Discard the tree rooted in this page. */ __rec_discard_tree(session, page, single); } if (0) { err: /* * If unable to evict this page, release exclusive reference(s) * we've acquired. */ __rec_excl_clear(session); } session->excl_next = 0; return (ret); }