/*
 * __wt_bm_read --
 *     Map or read address cookie referenced block into a buffer.
 */
int
__wt_bm_read(WT_BM *bm, WT_SESSION_IMPL *session,
    WT_ITEM *buf, const uint8_t *addr, size_t addr_size)
{
    WT_BLOCK *block;
    WT_DECL_RET;
    WT_FILE_HANDLE *handle;
    wt_off_t offset;
    uint32_t checksum, size;
    bool mapped;

    WT_UNUSED(addr_size);
    block = bm->block;

    /* Crack the cookie. */
    WT_RET(
        __wt_block_buffer_to_addr(block, addr, &offset, &size, &checksum));

    /*
     * Map the block if it's possible.
     */
    handle = block->fh->handle;
    mapped = bm->map != NULL && offset + size <= (wt_off_t)bm->maplen;
    if (mapped && handle->fh_map_preload != NULL) {
        buf->data = (uint8_t *)bm->map + offset;
        buf->size = size;
        ret = handle->fh_map_preload(handle, (WT_SESSION *)session,
            buf->data, buf->size, bm->mapped_cookie);

        WT_STAT_CONN_INCR(session, block_map_read);
        WT_STAT_CONN_INCRV(session, block_byte_map_read, size);
        return (ret);
    }

#ifdef HAVE_DIAGNOSTIC
    /*
     * In diagnostic mode, verify the block we're about to read isn't on
     * the available list, or for live systems, the discard list.
     */
    WT_RET(__wt_block_misplaced(session, block,
        "read", offset, size, bm->is_live, __func__, __LINE__));
#endif

    /* Read the block. */
    __wt_capacity_throttle(session, size, WT_THROTTLE_READ);
    WT_RET(
        __wt_block_read_off(session, block, buf, offset, size, checksum));

    /* Optionally discard blocks from the system's buffer cache. */
    WT_RET(__wt_block_discard(session, block, (size_t)size));

    return (0);
}
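/*
 * Illustration (not part of the original source): "cracking the cookie"
 * above means unpacking an opaque address cookie into the (offset, size,
 * checksum) triple that locates a block on disk. The sketch below decodes
 * a toy cookie laid out as three LEB128-style unsigned varints, with the
 * offset and size carried in allocation-size units so the packed integers
 * stay small. The real __wt_block_buffer_to_addr uses WiredTiger's own
 * integer packing, so treat this layout as an assumption made for
 * demonstration, not the actual on-disk format.
 */
#include <stddef.h>
#include <stdint.h>

static int
demo_unpack_uint(const uint8_t **pp, const uint8_t *end, uint64_t *valp)
{
    uint64_t v;
    int shift;

    v = 0;
    for (shift = 0; *pp < end; shift += 7) {
        uint8_t b = *(*pp)++;
        v |= (uint64_t)(b & 0x7f) << shift;
        if ((b & 0x80) == 0) {         /* High bit clear: last byte. */
            *valp = v;
            return (0);
        }
    }
    return (-1); /* Truncated cookie. */
}

static int
demo_cookie_to_addr(const uint8_t *addr, size_t addr_size, uint32_t allocsize,
    int64_t *offsetp, uint32_t *sizep, uint32_t *checksump)
{
    const uint8_t *p = addr, *end = addr + addr_size;
    uint64_t c, o, s;

    if (demo_unpack_uint(&p, end, &o) != 0 ||
        demo_unpack_uint(&p, end, &s) != 0 ||
        demo_unpack_uint(&p, end, &c) != 0)
        return (-1);

    /* Offset and size travel in allocation-size units to stay small. */
    *offsetp = (int64_t)o * allocsize;
    *sizep = (uint32_t)s * allocsize;
    *checksump = (uint32_t)c;
    return (0);
}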
/*
 * __block_write_off --
 *     Write a buffer into a block, returning the block's offset, size and
 * checksum.
 */
static int
__block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf,
    wt_off_t *offsetp, uint32_t *sizep, uint32_t *checksump,
    bool data_checksum, bool checkpoint_io, bool caller_locked)
{
    WT_BLOCK_HEADER *blk;
    WT_DECL_RET;
    WT_FH *fh;
    wt_off_t offset;
    size_t align_size;
    uint32_t checksum;
    bool local_locked;

    *offsetp = 0;   /* -Werror=maybe-uninitialized */
    *sizep = 0;     /* -Werror=maybe-uninitialized */
    *checksump = 0; /* -Werror=maybe-uninitialized */

    fh = block->fh;

    /*
     * Clear the block header to ensure all of it is initialized, even the
     * unused fields.
     */
    blk = WT_BLOCK_HEADER_REF(buf->mem);
    memset(blk, 0, sizeof(*blk));

    /* Buffers should be aligned for writing. */
    if (!F_ISSET(buf, WT_ITEM_ALIGNED)) {
        WT_ASSERT(session, F_ISSET(buf, WT_ITEM_ALIGNED));
        WT_RET_MSG(session, EINVAL,
            "direct I/O check: write buffer incorrectly allocated");
    }

    /*
     * Align the size to an allocation unit.
     *
     * The buffer must be big enough for us to zero to the next allocsize
     * boundary, this is one of the reasons the btree layer must find out
     * from the block-manager layer the maximum size of the eventual write.
     */
    align_size = WT_ALIGN(buf->size, block->allocsize);
    if (align_size > buf->memsize) {
        WT_ASSERT(session, align_size <= buf->memsize);
        WT_RET_MSG(session, EINVAL,
            "buffer size check: write buffer incorrectly allocated");
    }
    if (align_size > UINT32_MAX) {
        WT_ASSERT(session, align_size <= UINT32_MAX);
        WT_RET_MSG(session, EINVAL,
            "buffer size check: write buffer too large to write");
    }

    /* Zero out any unused bytes at the end of the buffer. */
    memset((uint8_t *)buf->mem + buf->size, 0, align_size - buf->size);

    /*
     * Set the disk size so we don't have to incrementally read blocks
     * during salvage.
     */
    blk->disk_size = WT_STORE_SIZE(align_size);

    /*
     * Update the block's checksum: if our caller specifies, checksum the
     * complete data, otherwise checksum the leading WT_BLOCK_COMPRESS_SKIP
     * bytes. The assumption is applications with good compression support
     * turn off checksums and assume corrupted blocks won't decompress
     * correctly. However, if compression failed to shrink the block, the
     * block wasn't compressed, in which case our caller will tell us to
     * checksum the data to detect corruption. If compression succeeded,
     * we still need to checksum the first WT_BLOCK_COMPRESS_SKIP bytes
     * because they're not compressed, both to give salvage a quick test
     * of whether a block is useful and to give us a test so we don't lose
     * the first WT_BLOCK_COMPRESS_SKIP bytes without noticing.
     *
     * Checksum a little-endian version of the header, and write everything
     * in little-endian format. The checksum is (potentially) returned in a
     * big-endian format, swap it into place in a separate step.
     */
    blk->flags = 0;
    if (data_checksum)
        F_SET(blk, WT_BLOCK_DATA_CKSUM);
    blk->checksum = 0;
    __wt_block_header_byteswap(blk);
    blk->checksum = checksum = __wt_checksum(
        buf->mem, data_checksum ? align_size : WT_BLOCK_COMPRESS_SKIP);
#ifdef WORDS_BIGENDIAN
    blk->checksum = __wt_bswap32(blk->checksum);
#endif

    /* Pre-allocate some number of extension structures. */
    WT_RET(__wt_block_ext_prealloc(session, 5));

    /*
     * Acquire a lock, if we don't already hold one.
     * Allocate space for the write, and optionally extend the file (note
     * the block-extend function may release the lock).
     * Release any locally acquired lock.
     */
    local_locked = false;
    if (!caller_locked) {
        __wt_spin_lock(session, &block->live_lock);
        local_locked = true;
    }
    ret = __wt_block_alloc(session, block, &offset, (wt_off_t)align_size);
    if (ret == 0)
        ret = __wt_block_extend(
            session, block, fh, offset, align_size, &local_locked);
    if (local_locked)
        __wt_spin_unlock(session, &block->live_lock);
    WT_RET(ret);

    /* Write the block. */
    if ((ret =
        __wt_write(session, fh, offset, align_size, buf->mem)) != 0) {
        if (!caller_locked)
            __wt_spin_lock(session, &block->live_lock);
        WT_TRET(__wt_block_off_free(
            session, block, offset, (wt_off_t)align_size));
        if (!caller_locked)
            __wt_spin_unlock(session, &block->live_lock);
        WT_RET(ret);
    }

    /*
     * Optionally schedule writes for dirty pages in the system buffer
     * cache, but only if the current session can wait.
     */
    if (block->os_cache_dirty_max != 0 &&
        fh->written > block->os_cache_dirty_max &&
        __wt_session_can_wait(session)) {
        fh->written = 0;
        if ((ret = __wt_fsync(session, fh, false)) != 0) {
            /*
             * Ignore ENOTSUP, but don't try again.
             */
            if (ret != ENOTSUP)
                return (ret);
            block->os_cache_dirty_max = 0;
        }
    }

    /* Optionally discard blocks from the buffer cache. */
    WT_RET(__wt_block_discard(session, block, align_size));

    WT_STAT_CONN_INCR(session, block_write);
    WT_STAT_CONN_INCRV(session, block_byte_write, align_size);
    if (checkpoint_io)
        WT_STAT_CONN_INCRV(
            session, block_byte_write_checkpoint, align_size);

    __wt_verbose(session, WT_VERB_WRITE,
        "off %" PRIuMAX ", size %" PRIuMAX ", checksum %#" PRIx32,
        (uintmax_t)offset, (uintmax_t)align_size, checksum);

    *offsetp = offset;
    *sizep = WT_STORE_SIZE(align_size);
    *checksump = checksum;

    return (0);
}
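/*
 * Illustration (not part of the original source): the alignment and
 * checksum policy above, reduced to its core. The payload is padded with
 * zeroes to the next allocation-size boundary so the disk image is
 * deterministic, then either the whole aligned block or only the leading
 * never-compressed prefix is checksummed. DEMO_COMPRESS_SKIP stands in
 * for WT_BLOCK_COMPRESS_SKIP and demo_fnv1a32() stands in for
 * __wt_checksum() (a CRC32C in WiredTiger); both are assumptions made to
 * keep the sketch self-contained.
 */
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

#define DEMO_COMPRESS_SKIP 64

/* Round a size up to the next multiple of the allocation unit. */
static size_t
demo_align(size_t size, size_t allocsize)
{
    return ((size + allocsize - 1) / allocsize * allocsize);
}

/* FNV-1a, a placeholder for the real hardware-accelerated checksum. */
static uint32_t
demo_fnv1a32(const void *data, size_t len)
{
    const uint8_t *p = data;
    uint32_t hash = 2166136261u;

    while (len-- > 0) {
        hash ^= *p++;
        hash *= 16777619u;
    }
    return (hash);
}

/*
 * The caller must guarantee the buffer has capacity for the aligned size,
 * which is why the btree layer asks the block manager up front for the
 * maximum size of the eventual write.
 */
static uint32_t
demo_block_checksum(uint8_t *mem, size_t payload, size_t allocsize,
    bool data_checksum)
{
    size_t align_size = demo_align(payload, allocsize);

    /* Zero the slop so the checksum and the disk image agree. */
    memset(mem + payload, 0, align_size - payload);
    return (demo_fnv1a32(
        mem, data_checksum ? align_size : DEMO_COMPRESS_SKIP));
}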
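/*
 * Illustration (not part of the original source): the dirty-byte flush
 * throttle above, isolated. Bytes written since the last flush are
 * counted, and a flush is scheduled once a threshold is crossed; ENOTSUP
 * from the file system disables the feature rather than retrying on
 * every write. The struct and names are hypothetical, and the real code
 * additionally checks __wt_session_can_wait() before flushing.
 */
#include <errno.h>
#include <stdint.h>

struct demo_file {
    uint64_t written;   /* Bytes written since the last flush. */
    uint64_t dirty_max; /* Flush threshold; 0 disables the feature. */
    int (*flush)(struct demo_file *); /* Non-blocking flush request. */
};

/* Call after each successful write of nbytes. */
static int
demo_maybe_flush(struct demo_file *f, uint64_t nbytes)
{
    int ret;

    f->written += nbytes;
    if (f->dirty_max == 0 || f->written <= f->dirty_max)
        return (0);

    f->written = 0;
    if ((ret = f->flush(f)) != 0) {
        if (ret != ENOTSUP)
            return (ret);
        f->dirty_max = 0; /* Not supported: don't try again. */
    }
    return (0);
}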