/*
 * __wt_block_compact_page_skip --
 *	Return if writing a particular page will shrink the file.
 */
int
__wt_block_compact_page_skip(WT_SESSION_IMPL *session,
    WT_BLOCK *block, const uint8_t *addr, uint32_t addr_size, int *skipp)
{
	WT_FH *fh;
	off_t offset;
	uint32_t size, cksum;

	WT_UNUSED(addr_size);

	/*
	 * Paranoia: skip on error.  A skip value of 1 means "don't rewrite";
	 * the previous default of 0 meant "rewrite", contradicting the
	 * stated intent (and the default used by later versions of this
	 * function), so if cracking the cookie failed, a caller ignoring the
	 * error would have rewritten a page it shouldn't.
	 */
	*skipp = 1;

	fh = block->fh;

	/* Crack the cookie. */
	WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum));

	/*
	 * If this block appears in the last half of the file, rewrite it.
	 *
	 * It's unclear we need to lock: the chances of a smashed read are
	 * close to non-existent and the worst thing that can happen is we
	 * rewrite a block we didn't want to rewrite.  On the other hand,
	 * compaction is not expected to be a common operation in WiredTiger,
	 * we shouldn't be here a lot.
	 */
	__wt_spin_lock(session, &block->live_lock);
	*skipp = offset > fh->size / 2 ? 0 : 1;
	__wt_spin_unlock(session, &block->live_lock);

	return (0);
}
/*
 * __wt_block_salvage_valid --
 *	Let salvage know if a block is valid.
 */
int
__wt_block_salvage_valid(WT_SESSION_IMPL *session,
    WT_BLOCK *block, uint8_t *addr, size_t addr_size, bool valid)
{
	wt_off_t offset;
	uint32_t checksum, size;

	WT_UNUSED(addr_size);

	/* Crack the cookie to find the block's location and length. */
	WT_RET(
	    __wt_block_buffer_to_addr(block, addr, &offset, &size, &checksum));

	/* If the upper layer took the block, simply move past it. */
	if (valid) {
		block->slvg_off = offset + size;
		return (0);
	}

	/*
	 * The upper layer rejected the block: free a single allocation-size
	 * chunk and move past it.
	 */
	WT_RET(__wt_block_off_free(
	    session, block, offset, (wt_off_t)block->allocsize));
	block->slvg_off = offset + block->allocsize;
	return (0);
}
/*
 * __wt_block_addr_invalid --
 *	Return an error code if an address cookie is invalid.
 */
int
__wt_block_addr_invalid(WT_SESSION_IMPL *session,
    WT_BLOCK *block, const uint8_t *addr, size_t addr_size, bool live)
{
	wt_off_t offset;
	uint32_t checksum, size;

	WT_UNUSED(session);
	WT_UNUSED(addr_size);
	WT_UNUSED(live);

	/* Crack the cookie. */
	WT_RET(
	    __wt_block_buffer_to_addr(block, addr, &offset, &size, &checksum));

#ifdef HAVE_DIAGNOSTIC
	/*
	 * In diagnostic mode, verify the address isn't on the available list,
	 * or for live systems, the discard list.
	 */
	WT_RET(__wt_block_misplaced(
	    session, block, "addr-valid", offset, size, live));
#endif

	/* The cookie is invalid if the block extends past the end of file. */
	if (offset + size > block->size)
		return (EINVAL);
	return (0);
}
/*
 * __wt_bm_read --
 *	Map or read address cookie referenced block into a buffer.
 */
int
__wt_bm_read(WT_BM *bm, WT_SESSION_IMPL *session,
    WT_ITEM *buf, const uint8_t *addr, size_t addr_size)
{
	WT_BLOCK *block;
	wt_off_t offset;
	uint32_t checksum, size;

	WT_UNUSED(addr_size);

	block = bm->block;

	/* Crack the cookie. */
	WT_RET(
	    __wt_block_buffer_to_addr(block, addr, &offset, &size, &checksum));

	/*
	 * If the file is mapped and the block lies inside the mapped region,
	 * return memory from the map rather than reading from disk.
	 */
	if (bm->map != NULL && offset + size <= (wt_off_t)bm->maplen) {
		buf->data = (uint8_t *)bm->map + offset;
		buf->size = size;
		WT_RET(__wt_mmap_preload(session, buf->data, buf->size));

		WT_STAT_FAST_CONN_INCR(session, block_map_read);
		WT_STAT_FAST_CONN_INCRV(session, block_byte_map_read, size);
		return (0);
	}

#ifdef HAVE_DIAGNOSTIC
	/*
	 * In diagnostic mode, verify the block we're about to read isn't on
	 * the available list, or for live systems, the discard list.
	 */
	WT_RET(__wt_block_misplaced(
	    session, block, "read", offset, size, bm->is_live));
#endif

	/* Read the block from the file. */
	WT_RET(
	    __wt_block_read_off(session, block, buf, offset, size, checksum));

#ifdef HAVE_POSIX_FADVISE
	/*
	 * Optionally discard blocks from the system's buffer cache once the
	 * configured threshold of bytes has been read.
	 */
	if (block->os_cache_max != 0) {
		block->os_cache += size;
		if (block->os_cache > block->os_cache_max) {
			WT_DECL_RET;

			block->os_cache = 0;
			/*
			 * Ignore EINVAL - some file systems don't support
			 * the flag.
			 */
			if ((ret = posix_fadvise(block->fh->fd,
			    (wt_off_t)0, (wt_off_t)0,
			    POSIX_FADV_DONTNEED)) != 0 && ret != EINVAL)
				WT_RET_MSG(session, ret,
				    "%s: posix_fadvise", block->name);
		}
	}
#endif
	return (0);
}
/*
 * __wt_bm_read --
 *	Map or read address cookie referenced block into a buffer.
 */
int
__wt_bm_read(WT_BM *bm, WT_SESSION_IMPL *session,
    WT_ITEM *buf, const uint8_t *addr, size_t addr_size)
{
	WT_BLOCK *block;
	WT_DECL_RET;
	WT_FILE_HANDLE *handle;
	wt_off_t offset;
	uint32_t checksum, size;
	bool mapped;

	WT_UNUSED(addr_size);

	block = bm->block;

	/* Crack the cookie. */
	WT_RET(
	    __wt_block_buffer_to_addr(block, addr, &offset, &size, &checksum));

	/*
	 * Prefer returning mapped memory: possible when the block falls
	 * inside the mapped region and the file handle supports preloading
	 * mapped pages.
	 */
	handle = block->fh->handle;
	mapped = bm->map != NULL && offset + size <= (wt_off_t)bm->maplen;
	if (mapped && handle->fh_map_preload != NULL) {
		buf->data = (uint8_t *)bm->map + offset;
		buf->size = size;
		ret = handle->fh_map_preload(handle, (WT_SESSION *)session,
		    buf->data, buf->size, bm->mapped_cookie);

		WT_STAT_CONN_INCR(session, block_map_read);
		WT_STAT_CONN_INCRV(session, block_byte_map_read, size);
		return (ret);
	}

#ifdef HAVE_DIAGNOSTIC
	/*
	 * In diagnostic mode, verify the block we're about to read isn't on
	 * the available list, or for live systems, the discard list.
	 */
	WT_RET(__wt_block_misplaced(session,
	    block, "read", offset, size, bm->is_live, __func__, __LINE__));
#endif

	/* Read the block, throttling against the configured read capacity. */
	__wt_capacity_throttle(session, size, WT_THROTTLE_READ);
	WT_RET(
	    __wt_block_read_off(session, block, buf, offset, size, checksum));

	/* Optionally discard blocks from the system's buffer cache. */
	return (__wt_block_discard(session, block, (size_t)size));
}
/*
 * __wt_block_addr_valid --
 *	Return if an address cookie is valid.
 */
int
__wt_block_addr_valid(WT_SESSION_IMPL *session,
    WT_BLOCK *block, const uint8_t *addr, size_t addr_size)
{
	off_t offset;
	uint32_t checksum, size;

	WT_UNUSED(session);
	WT_UNUSED(addr_size);

	/* Crack the cookie. */
	WT_RET(
	    __wt_block_buffer_to_addr(block, addr, &offset, &size, &checksum));

	/* All we care about is if it's past the end of the file. */
	if (offset + size > block->fh->size)
		return (0);
	return (1);
}
/*
 * __wt_block_addr_string --
 *	Return a printable string representation of an address cookie.
 */
int
__wt_block_addr_string(WT_SESSION_IMPL *session,
    WT_BLOCK *block, WT_ITEM *buf, const uint8_t *addr, size_t addr_size)
{
	off_t offset;
	uint32_t checksum, size;

	WT_UNUSED(addr_size);

	/* Crack the cookie. */
	WT_RET(
	    __wt_block_buffer_to_addr(block, addr, &offset, &size, &checksum));

	/* Printable representation: [start-end, size, checksum]. */
	return (__wt_buf_fmt(session, buf,
	    "[%" PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]",
	    (uintmax_t)offset, (uintmax_t)offset + size, size, checksum));
}
/*
 * __wt_bm_preload --
 *	Pre-load a page.
 */
int
__wt_bm_preload(
    WT_BM *bm, WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size)
{
	WT_BLOCK *block;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	wt_off_t offset;
	uint32_t checksum, size;

	WT_UNUSED(addr_size);

	block = bm->block;

	/*
	 * Turn off pre-load when direct I/O is configured for the file, the
	 * kernel cache isn't interesting.
	 */
	if (block->fh->direct_io)
		return (0);

	WT_STAT_FAST_CONN_INCR(session, block_preload);

	/* Crack the cookie. */
	WT_RET(
	    __wt_block_buffer_to_addr(block, addr, &offset, &size, &checksum));

	/* A mapped block is preloaded through the mapping. */
	if (bm->map != NULL && offset + size <= (wt_off_t)bm->maplen)
		return (__wt_mmap_preload(
		    session, (uint8_t *)bm->map + offset, size));

#ifdef HAVE_POSIX_FADVISE
	/* Next best: ask the kernel to fault the range in. */
	if (posix_fadvise(block->fh->fd,
	    (wt_off_t)offset, (wt_off_t)size, POSIX_FADV_WILLNEED) == 0)
		return (0);
#endif
	/* Fall back to reading the block into a scratch buffer. */
	WT_RET(__wt_scr_alloc(session, size, &tmp));
	ret = __wt_block_read_off(session, block, tmp, offset, size, checksum);
	__wt_scr_free(session, &tmp);
	return (ret);
}
/*
 * __wt_block_salvage_valid --
 *	Inform salvage a block is valid.
 */
int
__wt_block_salvage_valid(WT_SESSION_IMPL *session,
    WT_BLOCK *block, uint8_t *addr, uint32_t addr_size)
{
	off_t offset;
	uint32_t checksum, size;

	WT_UNUSED(session);
	WT_UNUSED(addr_size);

	/*
	 * The upper layer accepted a block we gave it: crack the cookie and
	 * move the salvage offset past the block.
	 */
	WT_RET(
	    __wt_block_buffer_to_addr(block, addr, &offset, &size, &checksum));
	block->slvg_off = offset + size;
	return (0);
}
/*
 * __wt_block_read --
 *	Read filesystem cookie referenced block into a buffer.
 */
int
__wt_block_read(WT_SESSION_IMPL *session,
    WT_BLOCK *block, WT_ITEM *buf, const uint8_t *addr, uint32_t addr_size)
{
	off_t offset;
	uint32_t checksum, size;

	/* Crack the cookie. */
	WT_RET(
	    __wt_block_buffer_to_addr(block, addr, &offset, &size, &checksum));

	/* Read the block. */
	WT_RET(
	    __wt_block_read_off(session, block, buf, offset, size, checksum));

	/* Optionally verify the page. */
	if (block->verify)
		WT_RET(__wt_block_verify(
		    session, block, buf, addr, addr_size, offset, size));

	return (0);
}
/*
 * __wt_bm_preload --
 *	Pre-load a page.
 */
int
__wt_bm_preload(WT_BM *bm,
    WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size)
{
	WT_BLOCK *block;
	WT_DECL_RET;
	off_t offset;
	uint32_t checksum, size;
	int mapped;

	WT_UNUSED(addr_size);

	block = bm->block;

	/*
	 * Seed ret with an error so that, when posix_fadvise isn't compiled
	 * in, we take the read fallback below.
	 */
	ret = EINVAL;

	/* Crack the cookie. */
	WT_RET(
	    __wt_block_buffer_to_addr(block, addr, &offset, &size, &checksum));

	/* Check for a mapped block. */
	mapped = bm->map != NULL && offset + size <= (off_t)bm->maplen;
	if (mapped)
		/* Mapped blocks are preloaded through the mapping. */
		WT_RET(__wt_mmap_preload(
		    session, (uint8_t *)bm->map + offset, size));
	else {
#ifdef HAVE_POSIX_FADVISE
		/* Ask the kernel to fault the range in. */
		ret = posix_fadvise(block->fh->fd,
		    (off_t)offset, (off_t)size, POSIX_FADV_WILLNEED);
#endif
		if (ret != 0) {
			WT_DECL_ITEM(tmp);

			/* Fall back to reading into a scratch buffer. */
			WT_RET(__wt_scr_alloc(session, size, &tmp));
			ret = __wt_block_read_off(
			    session, block, tmp, offset, size, checksum);
			__wt_scr_free(&tmp);
			WT_RET(ret);
		}
	}

	WT_STAT_FAST_CONN_INCR(session, block_preload);

	return (0);
}
/*
 * __wt_bm_corrupt --
 *	Report a block has been corrupted, external API.
 */
int
__wt_bm_corrupt(
    WT_BM *bm, WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size)
{
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	wt_off_t offset;
	uint32_t checksum, size;

	/* Read the corrupted block into a scratch buffer. */
	WT_RET(__wt_scr_alloc(session, 0, &tmp));
	WT_ERR(__wt_bm_read(bm, session, tmp, addr, addr_size));

	/* Crack the cookie, dump the block. */
	WT_ERR(__wt_block_buffer_to_addr(
	    bm->block, addr, &offset, &size, &checksum));
	WT_ERR(__wt_bm_corrupt_dump(session, tmp, offset, size, checksum));

err:	__wt_scr_free(session, &tmp);
	return (ret);
}
/*
 * __wt_bm_preload --
 *	Pre-load a page.
 */
int
__wt_bm_preload(
    WT_BM *bm, WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size)
{
	WT_BLOCK *block;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_FILE_HANDLE *handle;
	wt_off_t offset;
	uint32_t checksum, size;
	bool mapped;

	WT_UNUSED(addr_size);

	block = bm->block;

	WT_STAT_FAST_CONN_INCR(session, block_preload);

	/* Crack the cookie. */
	WT_RET(
	    __wt_block_buffer_to_addr(block, addr, &offset, &size, &checksum));

	/*
	 * Preload through the mapping when the block is mapped and the
	 * handle supports it, otherwise advise the file system of the
	 * upcoming read if that's supported.
	 */
	handle = block->fh->handle;
	mapped = bm->map != NULL && offset + size <= (wt_off_t)bm->maplen;
	if (mapped) {
		if (handle->fh_map_preload != NULL)
			ret = handle->fh_map_preload(
			    handle, (WT_SESSION *)session,
			    (uint8_t *)bm->map + offset,
			    size, bm->mapped_cookie);
	} else if (handle->fh_advise != NULL)
		ret = handle->fh_advise(handle, (WT_SESSION *)session,
		    (wt_off_t)offset, (wt_off_t)size, WT_FILE_HANDLE_WILLNEED);

	/* If preload isn't supported, do it the slow way. */
	if (ret == EBUSY || ret == ENOTSUP) {
		WT_RET(__wt_scr_alloc(session, 0, &tmp));
		ret = __wt_bm_read(bm, session, tmp, addr, addr_size);
		__wt_scr_free(session, &tmp);
	}
	return (ret);
}
/*
 * __wt_block_compact_page_skip --
 *	Return if writing a particular page will shrink the file.
 */
int
__wt_block_compact_page_skip(WT_SESSION_IMPL *session,
    WT_BLOCK *block, const uint8_t *addr, size_t addr_size, int *skipp)
{
	WT_DECL_RET;
	WT_EXT *ext;
	WT_EXTLIST *el;
	WT_FH *fh;
	off_t ninety, offset;
	uint32_t size, cksum;

	WT_UNUSED(addr_size);

	*skipp = 1;				/* Return a default skip. */

	fh = block->fh;

	/* Crack the cookie. */
	WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum));

	__wt_spin_lock(session, &block->live_lock);

	/*
	 * If this block is in the last 10% of the file and there's a block on
	 * the available list that's in the first 90% of the file, rewrite the
	 * block.  Checking the available list is necessary (otherwise writing
	 * the block would extend the file), but there's an obvious race if the
	 * file is sufficiently busy.
	 */
	ninety = fh->size - fh->size / 10;
	if (offset > ninety) {
		el = &block->live.avail;
		WT_EXT_FOREACH(ext, el->off)
			if (ext->off < ninety && ext->size >= size) {
				*skipp = 0;
				break;
			}
	}

	/*
	 * The original was truncated here: it never released live_lock or
	 * returned, which would both fail to compile and deadlock any later
	 * caller acquiring the lock.  Release the lock and return the result
	 * (ret is 0 from WT_DECL_RET unless set above).
	 */
	__wt_spin_unlock(session, &block->live_lock);

	return (ret);
}
/*
 * __wt_bm_read --
 *	Map or read address cookie referenced block into a buffer.
 */
int
__wt_bm_read(WT_BM *bm, WT_SESSION_IMPL *session,
    WT_ITEM *buf, const uint8_t *addr, uint32_t addr_size)
{
	WT_BLOCK *block;
	off_t offset;
	uint32_t size, cksum;
	int mapped;

	WT_UNUSED(addr_size);

	block = bm->block;

	/* Crack the cookie. */
	WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum));

	/*
	 * Clear buffers previously used for mapped memory, we may be forced
	 * to read into this buffer.
	 */
	if (F_ISSET(buf, WT_ITEM_MAPPED))
		__wt_buf_free(session, buf);

	/*
	 * If we're going to be able to return mapped memory and the buffer
	 * has allocated memory, discard it.
	 */
	mapped = bm->map != NULL && offset + size <= (off_t)bm->maplen;
	if (buf->mem != NULL && mapped)
		__wt_buf_free(session, buf);

	/* Map the block if it's possible. */
	if (mapped) {
		buf->mem = (uint8_t *)bm->map + offset;
		buf->memsize = size;
		buf->data = buf->mem;
		buf->size = size;
		F_SET(buf, WT_ITEM_MAPPED);
		WT_RET(__wt_mmap_preload(session, buf->mem, buf->size));

		WT_CSTAT_INCR(session, block_map_read);
		WT_CSTAT_INCRV(session, block_byte_map_read, size);
		return (0);
	}

	/* Read the block. */
	WT_RET(__wt_block_read_off(session, block, buf, offset, size, cksum));

#ifdef HAVE_POSIX_FADVISE
	/* Optionally discard blocks from the system's buffer cache. */
	if (block->os_cache_max != 0 &&
	    (block->os_cache += size) > block->os_cache_max) {
		WT_DECL_RET;

		block->os_cache = 0;
		/*
		 * Ignore EINVAL - some file systems don't support the flag,
		 * and failing the read because the cache-discard hint isn't
		 * supported would be wrong (the sibling version of this
		 * function already tolerates EINVAL here).
		 */
		if ((ret = posix_fadvise(block->fh->fd,
		    (off_t)0, (off_t)0, POSIX_FADV_DONTNEED)) != 0 &&
		    ret != EINVAL)
			WT_RET_MSG(
			    session, ret, "%s: posix_fadvise", block->name);
	}
#endif
	return (0);
}