/*
 * __wt_merge_tree --
 *	Attempt to collapse a stack of split-merge pages in memory into a
 *	shallow tree.  If enough keys are found, create a real internal node
 *	that can be evicted (and, if necessary, split further).
 *
 *	This code is designed to deal with workloads that otherwise create
 *	arbitrarily deep (and slow) trees in memory.
 */
int
__wt_merge_tree(WT_SESSION_IMPL *session, WT_PAGE *top)
{
    WT_DECL_RET;
    WT_PAGE *lchild, *newtop, *rchild;
    WT_REF *newref;
    WT_VISIT_STATE visit_state;
    uint32_t refcnt, split;
    int promote;
    u_int levels;
    uint8_t page_type;

    WT_CLEAR(visit_state);
    visit_state.session = session;
    lchild = newtop = rchild = NULL;
    page_type = top->type;

    WT_ASSERT(session, __wt_btree_mergeable(top));
    WT_ASSERT(session, top->ref->state == WT_REF_LOCKED);

    /*
     * Walk the subtree, count the references at the bottom level and
     * calculate the maximum depth.
     */
    WT_RET(__merge_walk(session, top, 1, __merge_count, &visit_state));

    /* If there aren't enough useful levels, give up. */
    if (visit_state.maxdepth < WT_MERGE_STACK_MIN)
        return (EBUSY);

    /*
     * Don't allow split merges to generate arbitrarily large pages.
     * Ideally we would choose a size based on the internal_page_max
     * setting for the btree, but we don't have the correct btree handle
     * available.
     */
    if (visit_state.refcnt > WT_MERGE_MAX_REFS)
        return (EBUSY);

    /*
     * Now we either collapse the internal pages into one split-merge page,
     * or if there are "enough" keys, we split into two equal internal
     * pages, each of which can be evicted independently.
     *
     * We set a flag (WT_PM_REC_SPLIT_MERGE) on the created page if it
     * isn't big enough to justify the cost of evicting it.  If splits
     * continue, it will be merged again until it gets over this limit.
     */
    promote = 0;
    refcnt = (uint32_t)visit_state.refcnt;
    if (refcnt >= WT_MERGE_FULL_PAGE && visit_state.seen_live) {
        /*
         * In the normal case where there are live children spread
         * through the subtree, create two child pages.
         *
         * Handle the case where the only live child is first / last
         * specially: put the live child into the top-level page.
         *
         * Set SPLIT_MERGE on the internal pages if there are any live
         * children: they can't be evicted, so there is no point
         * permanently deepening the tree.
         */
        if (visit_state.first_live == visit_state.last_live &&
            (visit_state.first_live == 0 ||
            visit_state.first_live == refcnt - 1))
            split = (visit_state.first_live == 0) ? 1 : refcnt - 1;
        else
            split = (refcnt + 1) / 2;

        /* Only promote if we can create a real page. */
        if (split == 1 || split == refcnt - 1)
            promote = 1;
        else if (split >= WT_MERGE_FULL_PAGE &&
            visit_state.first_live >= split)
            promote = 1;
        else if (refcnt - split >= WT_MERGE_FULL_PAGE &&
            visit_state.last_live < split)
            promote = 1;
    }

    if (promote) {
        /* Create a new top-level split-merge page with two entries. */
        WT_ERR(__merge_new_page(session, page_type, 2, 1, &newtop));

        visit_state.split = split;

        /* Left split. */
        if (split == 1)
            visit_state.first = newtop;
        else {
            WT_ERR(__merge_new_page(session, page_type, split,
                visit_state.first_live < split, &lchild));
            visit_state.first = lchild;
        }

        /* Right split. */
        if (split == refcnt - 1) {
            visit_state.second = newtop;
            visit_state.second_ref = &newtop->u.intl.t[1];
        } else {
            WT_ERR(__merge_new_page(session, page_type,
                refcnt - split, visit_state.last_live >= split,
                &rchild));
            visit_state.second = rchild;
            visit_state.second_ref =
                &visit_state.second->u.intl.t[0];
        }
    } else {
        /*
         * Create a new split-merge page for small merges, or if the
         * page above is a split-merge page.  When we do a big enough
         * merge, we create a real page at the top and don't consider
         * it as a merge candidate again.  Over time with an insert
         * workload the tree will grow deeper, but that's inevitable,
         * and this keeps individual merges small.
         */
        WT_ERR(__merge_new_page(session, page_type, refcnt,
            refcnt < WT_MERGE_FULL_PAGE ||
            __wt_btree_mergeable(top->parent), &newtop));

        visit_state.first = newtop;
    }

    /*
     * Copy the references into the new tree, but don't update anything in
     * the locked tree in case there is an error and we need to back out.
     * We do this in a separate pass so that we can figure out the key for
     * the split point: that allocates memory and so it could still fail.
     */
    visit_state.page = visit_state.first;
    visit_state.ref = visit_state.page->u.intl.t;
    visit_state.refcnt = 0;
    WT_ERR(__merge_walk(session, top, 0, __merge_copy_ref, &visit_state));

    if (promote) {
        /* Promote keys into the top-level page. */
        if (lchild != NULL) {
            newref = &newtop->u.intl.t[0];
            WT_LINK_PAGE(newtop, newref, lchild);
            newref->state = WT_REF_MEM;
            WT_ERR(__merge_promote_key(session, newref));
        }

        if (rchild != NULL) {
            newref = &newtop->u.intl.t[1];
            WT_LINK_PAGE(newtop, newref, rchild);
            newref->state = WT_REF_MEM;
            WT_ERR(__merge_promote_key(session, newref));
        }
    }

    /*
     * We have copied everything into place and allocated all of the memory
     * we need.  Now link all pages into the new tree and unlock them.
     *
     * The only way this could fail is if a reference state has been
     * changed by another thread since they were locked.  Panic in that
     * case: that should never happen.
     */
    visit_state.page = visit_state.first;
    visit_state.ref = visit_state.page->u.intl.t;
    visit_state.refcnt = 0;
    ret = __merge_walk(session, top, 0, __merge_switch_page, &visit_state);
    if (ret != 0)
        WT_ERR(__wt_illegal_value(session, "__wt_merge_tree"));

    newtop->u.intl.recno = top->u.intl.recno;
    newtop->parent = top->parent;
    newtop->ref = top->ref;

#ifdef HAVE_DIAGNOSTIC
    /*
     * Before swapping in the new tree, walk the pages we are discarding,
     * check that everything looks right.
     */
    __merge_check_discard(session, top);
#endif

    /*
     * Set up the new top-level page as a split so that it will be swapped
     * into place by our caller.
     */
    top->modify->flags = WT_PM_REC_SPLIT;
    top->modify->u.split = newtop;

    WT_VERBOSE_ERR(session, evict,
        "Successfully %s %" PRIu32
        " split-merge pages containing %" PRIu32 " keys\n",
        promote ? "promoted" : "merged", visit_state.maxdepth, refcnt);

    /* Evict new child pages as soon as possible. */
    if (lchild != NULL && !F_ISSET(lchild->modify, WT_PM_REC_SPLIT_MERGE))
        lchild->read_gen = WT_READ_GEN_OLDEST;
    if (rchild != NULL && !F_ISSET(rchild->modify, WT_PM_REC_SPLIT_MERGE))
        rchild->read_gen = WT_READ_GEN_OLDEST;

    /* Update statistics. */
    WT_CSTAT_INCR(session, cache_eviction_merge);
    WT_DSTAT_INCR(session, cache_eviction_merge);

    /* How many levels did we remove? */
    levels = visit_state.maxdepth - (promote ? 2 : 1);
    WT_CSTAT_INCRV(session, cache_eviction_merge_levels, levels);
    WT_DSTAT_INCRV(session, cache_eviction_merge_levels, levels);

    return (0);

err:    WT_VERBOSE_TRET(session, evict,
        "Failed to merge %" PRIu32
        " split-merge pages containing %" PRIu32 " keys\n",
        visit_state.maxdepth, refcnt);

    WT_CSTAT_INCR(session, cache_eviction_merge_fail);
    WT_DSTAT_INCR(session, cache_eviction_merge_fail);

    if (newtop != NULL)
        __wt_page_out(session, &newtop);
    if (lchild != NULL)
        __wt_page_out(session, &lchild);
    if (rchild != NULL)
        __wt_page_out(session, &rchild);
    return (ret);
}
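/*
 * Illustrative sketch, not part of the original source: the split-point
 * arithmetic from __wt_merge_tree pulled out as a pure function so it can be
 * read (or unit tested) in isolation.  The function name is hypothetical;
 * only the arithmetic mirrors the code above.
 */
static uint32_t
__example_merge_split_point(uint32_t refcnt,
    uint32_t first_live, uint32_t last_live)
{
    /*
     * If the only live child is the first or last reference, split
     * immediately next to it so the live child ends up in the top-level
     * page; otherwise split the references roughly in half.
     */
    if (first_live == last_live &&
        (first_live == 0 || first_live == refcnt - 1))
        return (first_live == 0 ? 1 : refcnt - 1);
    return ((refcnt + 1) / 2);
}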
/*
 * __wt_block_write_off --
 *	Write a buffer into a block, returning the block's addr/size and
 *	checksum.
 */
int
__wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
    WT_ITEM *buf, off_t *offsetp, uint32_t *sizep, uint32_t *cksump,
    int data_cksum, int locked)
{
    WT_BLOCK_HEADER *blk;
    WT_DECL_RET;
    WT_FH *fh;
    off_t offset;
    uint32_t align_size;

    blk = WT_BLOCK_HEADER_REF(buf->mem);
    fh = block->fh;

    /* Buffers should be aligned for writing. */
    if (!F_ISSET(buf, WT_ITEM_ALIGNED)) {
        WT_ASSERT(session, F_ISSET(buf, WT_ITEM_ALIGNED));
        WT_RET_MSG(session, EINVAL,
            "direct I/O check: write buffer incorrectly allocated");
    }

    /*
     * Align the size to an allocation unit.
     *
     * The buffer must be big enough for us to zero to the next allocsize
     * boundary, this is one of the reasons the btree layer must find out
     * from the block-manager layer the maximum size of the eventual write.
     */
    align_size = (uint32_t)WT_ALIGN(buf->size, block->allocsize);
    if (align_size > buf->memsize) {
        WT_ASSERT(session, align_size <= buf->memsize);
        WT_RET_MSG(session, EINVAL,
            "buffer size check: write buffer incorrectly allocated");
    }

    /* Zero out any unused bytes at the end of the buffer. */
    memset((uint8_t *)buf->mem + buf->size, 0, align_size - buf->size);

    /*
     * Set the disk size so we don't have to incrementally read blocks
     * during salvage.
     */
    blk->disk_size = align_size;

    /*
     * Update the block's checksum: if our caller specifies, checksum the
     * complete data, otherwise checksum the leading WT_BLOCK_COMPRESS_SKIP
     * bytes.  The assumption is applications with good compression support
     * turn off checksums and assume corrupted blocks won't decompress
     * correctly.  However, if compression failed to shrink the block, the
     * block wasn't compressed, in which case our caller will tell us to
     * checksum the data to detect corruption.  If compression succeeded,
     * we still need to checksum the first WT_BLOCK_COMPRESS_SKIP bytes
     * because they're not compressed, both to give salvage a quick test
     * of whether a block is useful and to give us a test so we don't lose
     * the first WT_BLOCK_COMPRESS_SKIP bytes without noticing.
     */
    blk->flags = 0;
    if (data_cksum)
        F_SET(blk, WT_BLOCK_DATA_CKSUM);
    blk->cksum = 0;
    blk->cksum = __wt_cksum(
        buf->mem, data_cksum ? align_size : WT_BLOCK_COMPRESS_SKIP);

    if (!locked)
        __wt_spin_lock(session, &block->live_lock);
    ret = __wt_block_alloc(session, block, &offset, (off_t)align_size);
    if (!locked)
        __wt_spin_unlock(session, &block->live_lock);
    WT_RET(ret);

#if defined(HAVE_POSIX_FALLOCATE) || defined(HAVE_FTRUNCATE)
    /*
     * Extend the file in chunks.  We aren't holding a lock and we'd prefer
     * to limit the number of threads extending the file at the same time,
     * so choose the one thread that's crossing the extended boundary.  We
     * don't extend newly created files, and it's theoretically possible we
     * might wait so long our extension of the file is passed by another
     * thread writing single blocks, that's why there's a check in case the
     * extended file size becomes too small: if the file size catches up,
     * every thread will try to extend it.
     */
    if (fh->extend_len != 0 &&
        (fh->extend_size <= fh->size ||
        (offset + fh->extend_len <= fh->extend_size &&
        offset + fh->extend_len + align_size >= fh->extend_size))) {
        fh->extend_size = offset + fh->extend_len * 2;
#if defined(HAVE_POSIX_FALLOCATE)
        if ((ret =
            posix_fallocate(fh->fd, offset, fh->extend_len * 2)) != 0)
            WT_RET_MSG(
                session, ret, "%s: posix_fallocate", fh->name);
#elif defined(HAVE_FTRUNCATE)
        if ((ret = ftruncate(fh->fd, fh->extend_size)) != 0)
            WT_RET_MSG(session, ret, "%s: ftruncate", fh->name);
#endif
    }
#endif

    if ((ret =
        __wt_write(session, fh, offset, align_size, buf->mem)) != 0) {
        if (!locked)
            __wt_spin_lock(session, &block->live_lock);
        WT_TRET(
            __wt_block_off_free(session, block, offset, align_size));
        if (!locked)
            __wt_spin_unlock(session, &block->live_lock);
        WT_RET(ret);
    }

#ifdef HAVE_SYNC_FILE_RANGE
    /*
     * Optionally schedule writes for dirty pages in the system buffer
     * cache.
     */
    if (block->os_cache_dirty_max != 0 &&
        (block->os_cache_dirty += align_size) > block->os_cache_dirty_max) {
        block->os_cache_dirty = 0;
        if ((ret = sync_file_range(fh->fd,
            (off64_t)0, (off64_t)0, SYNC_FILE_RANGE_WRITE)) != 0)
            WT_RET_MSG(
                session, ret, "%s: sync_file_range", block->name);
    }
#endif
#ifdef HAVE_POSIX_FADVISE
    /* Optionally discard blocks from the system buffer cache. */
    if (block->os_cache_max != 0 &&
        (block->os_cache += align_size) > block->os_cache_max) {
        block->os_cache = 0;
        if ((ret = posix_fadvise(fh->fd,
            (off_t)0, (off_t)0, POSIX_FADV_DONTNEED)) != 0)
            WT_RET_MSG(
                session, ret, "%s: posix_fadvise", block->name);
    }
#endif

    WT_CSTAT_INCR(session, block_write);
    WT_CSTAT_INCRV(session, block_byte_write, align_size);

    WT_VERBOSE_RET(session, write,
        "off %" PRIuMAX ", size %" PRIu32 ", cksum %" PRIu32,
        (uintmax_t)offset, align_size, blk->cksum);

    *offsetp = offset;
    *sizep = align_size;
    *cksump = blk->cksum;

    return (ret);
}
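/*
 * Illustrative sketch, not part of the original source: how a write size is
 * rounded up to the block's allocation unit before the buffer tail is zeroed.
 * WT_ALIGN is assumed to round up to a multiple of a power-of-two allocation
 * size; the helper name is hypothetical.
 */
static uint32_t
__example_align_write_size(uint32_t size, uint32_t allocsize)
{
    /* Round up to the next multiple of allocsize (allocsize is 2^N). */
    return ((size + (allocsize - 1)) & ~(allocsize - 1));
    /* E.g. size 3100, allocsize 4096 -> 4096; size 5000 -> 8192. */
}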
/*
 * __wt_bt_read --
 *	Read a cookie referenced block into a buffer.
 */
int
__wt_bt_read(WT_SESSION_IMPL *session,
    WT_ITEM *buf, const uint8_t *addr, uint32_t addr_size)
{
    WT_BM *bm;
    WT_BTREE *btree;
    WT_DECL_ITEM(tmp);
    WT_DECL_RET;
    WT_PAGE_HEADER *dsk;
    size_t result_len;

    btree = session->btree;
    bm = btree->bm;

    /*
     * If anticipating a compressed block, read into a scratch buffer and
     * decompress into the caller's buffer.  Else, read directly into the
     * caller's buffer.
     */
    if (btree->compressor == NULL) {
        WT_RET(bm->read(bm, session, buf, addr, addr_size));
        dsk = buf->mem;
    } else {
        WT_RET(__wt_scr_alloc(session, 0, &tmp));
        WT_ERR(bm->read(bm, session, tmp, addr, addr_size));
        dsk = tmp->mem;
    }

    /*
     * If the block is compressed, copy the skipped bytes of the original
     * image into place, then decompress.
     */
    if (F_ISSET(dsk, WT_PAGE_COMPRESSED)) {
        if (btree->compressor == NULL ||
            btree->compressor->decompress == NULL)
            WT_ERR_MSG(session, WT_ERROR,
                "read compressed block where no compression "
                "engine configured");

        /*
         * We're allocating the exact number of bytes we're expecting
         * from decompression.
         */
        WT_ERR(__wt_buf_init(session, buf, dsk->mem_size));
        buf->size = dsk->mem_size;

        /*
         * Note the source length is NOT the number of compressed
         * bytes, it's the length of the block we just read (minus the
         * skipped bytes).  We don't store the number of compressed
         * bytes: some compression engines need that length stored
         * externally, they don't have markers in the stream to signal
         * the end of the compressed bytes.  Those engines must store
         * the compressed byte length somehow, see the snappy
         * compression extension for an example.
         */
        memcpy(buf->mem, tmp->mem, WT_BLOCK_COMPRESS_SKIP);
        WT_ERR(btree->compressor->decompress(
            btree->compressor, &session->iface,
            (uint8_t *)tmp->mem + WT_BLOCK_COMPRESS_SKIP,
            tmp->size - WT_BLOCK_COMPRESS_SKIP,
            (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP,
            dsk->mem_size - WT_BLOCK_COMPRESS_SKIP,
            &result_len));

        /*
         * If checksums were turned off because we're depending on the
         * decompression to fail on any corrupted data, we'll end up
         * here after corruption happens.  If we're salvaging the file,
         * it's OK, otherwise it's really, really bad.
         */
        if (result_len != dsk->mem_size - WT_BLOCK_COMPRESS_SKIP)
            WT_ERR(
                F_ISSET(session, WT_SESSION_SALVAGE_QUIET_ERR) ?
                WT_ERROR :
                __wt_illegal_value(session, btree->name));
    } else if (btree->compressor == NULL)
        buf->size = dsk->mem_size;
    else
        /*
         * We guessed wrong: there was a compressor, but this block
         * was not compressed, and now the page is in the wrong buffer
         * and the buffer may be of the wrong size.  This should be
         * rare, but happens with small blocks that aren't worth
         * compressing.
         */
        WT_ERR(__wt_buf_set(
            session, buf, tmp->data, dsk->mem_size));

    /* If the handle is a verify handle, verify the physical page. */
    if (F_ISSET(btree, WT_BTREE_VERIFY)) {
        if (tmp == NULL)
            WT_ERR(__wt_scr_alloc(session, 0, &tmp));
        WT_ERR(bm->addr_string(bm, session, tmp, addr, addr_size));
        WT_ERR(__wt_verify_dsk(session, (const char *)tmp->data, buf));
    }

    WT_CSTAT_INCR(session, cache_read);
    WT_DSTAT_INCR(session, cache_read);
    if (F_ISSET(dsk, WT_PAGE_COMPRESSED))
        WT_DSTAT_INCR(session, compress_read);
    WT_CSTAT_INCRV(session, cache_bytes_read, addr_size);
    WT_DSTAT_INCRV(session, cache_bytes_read, addr_size);

err:    __wt_scr_free(&tmp);
    return (ret);
}
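/*
 * Illustrative sketch, not part of the original source: the buffer layout the
 * decompress call in __wt_bt_read relies on.  The first WT_BLOCK_COMPRESS_SKIP
 * bytes of a block (the page and block headers) are stored uncompressed, so
 * they are copied verbatim and only the remainder is handed to the
 * decompression engine.  The function and parameter names are hypothetical.
 */
static void
__example_split_compressed_block(uint8_t *raw, size_t raw_len,
    uint8_t *out, size_t mem_size, size_t skip,
    uint8_t **srcp, size_t *src_lenp, uint8_t **dstp, size_t *dst_lenp)
{
    /* Headers travel uncompressed: copy them straight across. */
    memcpy(out, raw, skip);

    *srcp = raw + skip;          /* compressed payload ... */
    *src_lenp = raw_len - skip;  /* ... and its on-disk length */
    *dstp = out + skip;          /* decompressed payload destination */
    *dst_lenp = mem_size - skip; /* expected in-memory payload size */
}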
/*
 * __wt_bt_write --
 *	Write a buffer into a block, returning the block's addr/size and
 *	checksum.
 */
int
__wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf,
    uint8_t *addr, uint32_t *addr_size, int checkpoint, int compressed)
{
    WT_BM *bm;
    WT_BTREE *btree;
    WT_ITEM *ip;
    WT_DECL_ITEM(tmp);
    WT_DECL_RET;
    WT_PAGE_HEADER *dsk;
    size_t len, src_len, dst_len, result_len, size;
    int data_cksum, compression_failed;
    uint8_t *src, *dst;

    btree = session->btree;
    bm = btree->bm;

    /* Checkpoint calls are different than standard calls. */
    WT_ASSERT(session,
        (checkpoint == 0 && addr != NULL && addr_size != NULL) ||
        (checkpoint == 1 && addr == NULL && addr_size == NULL));

#ifdef HAVE_DIAGNOSTIC
    /*
     * We're passed a table's disk image.  Decompress if necessary and
     * verify the image.  Always check the in-memory length for accuracy.
     */
    dsk = buf->mem;
    if (compressed) {
        WT_ERR(__wt_scr_alloc(session, dsk->mem_size, &tmp));

        memcpy(tmp->mem, buf->data, WT_BLOCK_COMPRESS_SKIP);
        WT_ERR(btree->compressor->decompress(
            btree->compressor, &session->iface,
            (uint8_t *)buf->data + WT_BLOCK_COMPRESS_SKIP,
            buf->size - WT_BLOCK_COMPRESS_SKIP,
            (uint8_t *)tmp->data + WT_BLOCK_COMPRESS_SKIP,
            tmp->memsize - WT_BLOCK_COMPRESS_SKIP,
            &result_len));
        WT_ASSERT(session,
            dsk->mem_size == result_len + WT_BLOCK_COMPRESS_SKIP);
        tmp->size = (uint32_t)result_len + WT_BLOCK_COMPRESS_SKIP;
        ip = tmp;
    } else {
        WT_ASSERT(session, dsk->mem_size == buf->size);
        ip = buf;
    }
    WT_ERR(__wt_verify_dsk(session, "[write-check]", ip));
    __wt_scr_free(&tmp);
#endif

    /*
     * Optionally stream-compress the data, but don't compress blocks that
     * are already as small as they're going to get.
     */
    if (buf->size <= btree->allocsize ||
        btree->compressor == NULL ||
        btree->compressor->compress == NULL || compressed) {
        ip = buf;
        WT_DSTAT_INCR(session, compress_write_too_small);
    } else {
        /* Skip the header bytes of the source data. */
        src = (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP;
        src_len = buf->size - WT_BLOCK_COMPRESS_SKIP;

        /*
         * Compute the size needed for the destination buffer.  We only
         * allocate enough memory for a copy of the original by default,
         * if any compressed version is bigger than the original, we
         * won't use it.  However, some compression engines (snappy is
         * one example), may need more memory because they don't stop
         * just because there's no more memory into which to compress.
         */
        if (btree->compressor->pre_size == NULL)
            len = src_len;
        else
            WT_ERR(btree->compressor->pre_size(btree->compressor,
                &session->iface, src, src_len, &len));
        size = len + WT_BLOCK_COMPRESS_SKIP;
        WT_ERR(bm->write_size(bm, session, &size));
        WT_ERR(__wt_scr_alloc(session, size, &tmp));

        /* Skip the header bytes of the destination data. */
        dst = (uint8_t *)tmp->mem + WT_BLOCK_COMPRESS_SKIP;
        dst_len = len;

        /*
         * If compression fails, fall back to the original version.  This
         * isn't unexpected: if compression doesn't work for some chunk
         * of bytes for some reason (noting there's likely additional
         * format/header information which compressed output requires),
         * it just means the uncompressed version is as good as it gets,
         * and that's what we use.
         */
        compression_failed = 0;
        WT_ERR(btree->compressor->compress(btree->compressor,
            &session->iface,
            src, src_len,
            dst, dst_len,
            &result_len, &compression_failed));
        if (compression_failed) {
            ip = buf;
            WT_DSTAT_INCR(session, compress_write_fail);
        } else {
            compressed = 1;
            WT_DSTAT_INCR(session, compress_write);

            /*
             * Copy in the skipped header bytes, set the final data
             * size.
             */
            memcpy(tmp->mem, buf->mem, WT_BLOCK_COMPRESS_SKIP);
            tmp->size =
                (uint32_t)result_len + WT_BLOCK_COMPRESS_SKIP;
            ip = tmp;
        }
    }
    dsk = ip->mem;

    /* If the buffer is compressed, set the flag. */
    if (compressed)
        F_SET(dsk, WT_PAGE_COMPRESSED);

    /*
     * We increment the block's write generation so it's easy to identify
     * newer versions of blocks during salvage.  (It's common in WiredTiger,
     * at least for the default block manager, for multiple blocks to be
     * internally consistent with identical first and last keys, so we need
     * a way to know the most recent state of the block.  We could check
     * which leaf is referenced by a valid internal page, but that implies
     * salvaging internal pages, which I don't want to do, and it's not
     * as good anyway, because the internal page may not have been written
     * after the leaf page was updated.  So, write generations it is.
     *
     * Nothing is locked at this point but two versions of a page with the
     * same generation is pretty unlikely, and if we did, they're going to
     * be roughly identical for the purposes of salvage, anyway.
     */
    dsk->write_gen = ++btree->write_gen;

    /*
     * Checksum the data if the buffer isn't compressed or checksums are
     * configured.
     */
    switch (btree->checksum) {
    case CKSUM_ON:
        data_cksum = 1;
        break;
    case CKSUM_OFF:
        data_cksum = 0;
        break;
    case CKSUM_UNCOMPRESSED:
    default:
        data_cksum = !compressed;
        break;
    }

    /* Call the block manager to write the block. */
    WT_ERR(checkpoint ?
        bm->checkpoint(bm, session, ip, btree->ckpt, data_cksum) :
        bm->write(bm, session, ip, addr, addr_size, data_cksum));

    WT_CSTAT_INCR(session, cache_write);
    WT_DSTAT_INCR(session, cache_write);
    WT_CSTAT_INCRV(session, cache_bytes_write, ip->size);
    WT_DSTAT_INCRV(session, cache_bytes_write, ip->size);

err:    __wt_scr_free(&tmp);
    return (ret);
}
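/*
 * Illustrative sketch, not part of the original source: the checksum-policy
 * decision from __wt_bt_write written as a pure function.  The enum values
 * match the switch above; the function name is hypothetical.
 */
static int
__example_data_cksum(int checksum_config, int compressed)
{
    switch (checksum_config) {
    case CKSUM_ON:              /* always checksum the full block */
        return (1);
    case CKSUM_OFF:             /* never checksum the full block */
        return (0);
    case CKSUM_UNCOMPRESSED:    /* rely on decompression failure to */
    default:                    /* catch corruption of compressed blocks */
        return (!compressed);
    }
}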
/*
 * __wt_block_read_off --
 *	Read an addr/size pair referenced block into a buffer.
 */
int
__wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
    WT_ITEM *buf, off_t offset, uint32_t size, uint32_t cksum)
{
    WT_BLOCK_HEADER *blk;
    uint32_t alloc_size, page_cksum;

    WT_VERBOSE_RET(session, read,
        "off %" PRIuMAX ", size %" PRIu32 ", cksum %" PRIu32,
        (uintmax_t)offset, size, cksum);

#ifdef HAVE_DIAGNOSTIC
    /*
     * In diagnostic mode, verify the block we're about to read isn't on
     * either the available or discard lists.
     *
     * Don't check during salvage, it's possible we're reading an already
     * freed overflow page.
     */
    if (!F_ISSET(session, WT_SESSION_SALVAGE_QUIET_ERR))
        WT_RET(
            __wt_block_misplaced(session, block, "read", offset, size));
#endif

    /*
     * Grow the buffer as necessary and read the block.  Buffers should be
     * aligned for reading, but there are lots of buffers (for example,
     * file cursors have two buffers each, key and value), and it's
     * difficult to be sure we've found all of them.  If the buffer isn't
     * aligned, it's an easy fix: set the flag and guarantee we reallocate
     * it.  (Most of the time on reads, the buffer memory has not yet been
     * allocated, so we're not adding any additional processing time.)
     */
    if (F_ISSET(buf, WT_ITEM_ALIGNED))
        alloc_size = size;
    else {
        F_SET(buf, WT_ITEM_ALIGNED);
        alloc_size = (uint32_t)WT_MAX(size, buf->memsize + 10);
    }
    WT_RET(__wt_buf_init(session, buf, alloc_size));
    WT_RET(__wt_read(session, block->fh, offset, size, buf->mem));
    buf->size = size;

    blk = WT_BLOCK_HEADER_REF(buf->mem);
    blk->cksum = 0;
    page_cksum = __wt_cksum(buf->mem,
        F_ISSET(blk, WT_BLOCK_DATA_CKSUM) ? size : WT_BLOCK_COMPRESS_SKIP);
    if (cksum != page_cksum) {
        if (!F_ISSET(session, WT_SESSION_SALVAGE_QUIET_ERR))
            __wt_errx(session,
                "read checksum error [%" PRIu32 "B @ %" PRIuMAX
                ", %" PRIu32 " != %" PRIu32 "]",
                size, (uintmax_t)offset, cksum, page_cksum);
        return (WT_ERROR);
    }

    WT_CSTAT_INCR(session, block_read);
    WT_CSTAT_INCRV(session, block_byte_read, size);

    return (0);
}
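/*
 * Illustrative sketch, not part of the original source: why the on-page
 * checksum field is zeroed before the checksum is recomputed above.  The
 * write path computes the checksum with the field set to zero, so the read
 * path must restore that state before comparing.  The function and parameter
 * names are hypothetical; __wt_cksum is assumed to checksum the given length.
 */
static int
__example_verify_block_cksum(WT_BLOCK_HEADER *blk, void *mem,
    uint32_t check_len, uint32_t expected_cksum)
{
    uint32_t computed;

    blk->cksum = 0;                 /* match the state at write time */
    computed = __wt_cksum(mem, check_len);
    return (computed == expected_cksum);
}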
/*
 * __wt_bm_read --
 *	Map or read address cookie referenced block into a buffer.
 */
int
__wt_bm_read(WT_BM *bm, WT_SESSION_IMPL *session,
    WT_ITEM *buf, const uint8_t *addr, uint32_t addr_size)
{
    WT_BLOCK *block;
    off_t offset;
    uint32_t size, cksum;
    int mapped;

    WT_UNUSED(addr_size);
    block = bm->block;

    /* Crack the cookie. */
    WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum));

    /*
     * Clear buffers previously used for mapped memory, we may be forced
     * to read into this buffer.
     */
    if (F_ISSET(buf, WT_ITEM_MAPPED))
        __wt_buf_free(session, buf);

    /*
     * If we're going to be able to return mapped memory and the buffer
     * has allocated memory, discard it.
     */
    mapped = bm->map != NULL && offset + size <= (off_t)bm->maplen;
    if (buf->mem != NULL && mapped)
        __wt_buf_free(session, buf);

    /* Map the block if it's possible. */
    if (mapped) {
        buf->mem = (uint8_t *)bm->map + offset;
        buf->memsize = size;
        buf->data = buf->mem;
        buf->size = size;
        F_SET(buf, WT_ITEM_MAPPED);

        WT_RET(__wt_mmap_preload(session, buf->mem, buf->size));

        WT_CSTAT_INCR(session, block_map_read);
        WT_CSTAT_INCRV(session, block_byte_map_read, size);
        return (0);
    }

    /* Read the block. */
    WT_RET(__wt_block_read_off(session, block, buf, offset, size, cksum));

#ifdef HAVE_POSIX_FADVISE
    /* Optionally discard blocks from the system's buffer cache. */
    if (block->os_cache_max != 0 &&
        (block->os_cache += size) > block->os_cache_max) {
        WT_DECL_RET;

        block->os_cache = 0;
        if ((ret = posix_fadvise(block->fh->fd,
            (off_t)0, (off_t)0, POSIX_FADV_DONTNEED)) != 0)
            WT_RET_MSG(
                session, ret, "%s: posix_fadvise", block->name);
    }
#endif
    return (0);
}
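/*
 * Illustrative sketch, not part of the original source: the test in
 * __wt_bm_read deciding whether a block can be returned directly from the
 * memory-mapped file image instead of being read through a system call.
 * The function name is hypothetical.
 */
static int
__example_block_is_mapped(const void *map, size_t maplen,
    off_t offset, uint32_t size)
{
    /* The whole block must fall inside the mapped region. */
    return (map != NULL && offset + size <= (off_t)maplen);
}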