Beispiel #1
0
/*
 * __wt_block_read_off_blind --
 *	Read the block at an offset, try to figure out what it looks like,
 * debugging only.
 */
int
__wt_block_read_off_blind(
    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_off_t offset)
{
	WT_BLOCK_HEADER *blk;
	uint32_t cksum, size;

	/*
	 * Make sure the buffer is large enough for the header and read the
	 * the first allocation-size block.
	 */
	WT_RET(__wt_buf_init(session, buf, block->allocsize));
	WT_RET(__wt_read(
	    session, block->fh, offset, (size_t)block->allocsize, buf->mem));
	blk = WT_BLOCK_HEADER_REF(buf->mem);
	__wt_block_header_byteswap(blk);

	/*
	 * Copy out the size and checksum (we're about to re-use the buffer),
	 * and if the size isn't insane, read the rest of the block.
	 */
	size = blk->disk_size;
	cksum = blk->cksum;
	if (__wt_block_offset_invalid(block, offset, size))
		WT_RET_MSG(session, EINVAL,
		    "block at offset %" PRIuMAX " cannot be a valid block, no "
		    "read attempted",
		    (uintmax_t)offset);
	return (__wt_block_read_off(session, block, buf, offset, size, cksum));
}
Beispiel #2
0
/*
 * __wt_block_read_off_blind --
 *	Read the block at an offset, return the size and checksum, debugging
 * only.
 */
int
__wt_block_read_off_blind(WT_SESSION_IMPL *session,
    WT_BLOCK *block, wt_off_t offset, uint32_t *sizep, uint32_t *checksump)
{
	WT_BLOCK_HEADER *blk;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;

	*sizep = 0;
	*checksump = 0;

	/*
	 * Make sure the buffer is large enough for the header and read the
	 * the first allocation-size block.
	 */
	WT_RET(__wt_scr_alloc(session, block->allocsize, &tmp));
	WT_ERR(__wt_read(
	    session, block->fh, offset, (size_t)block->allocsize, tmp->mem));
	blk = WT_BLOCK_HEADER_REF(tmp->mem);
	__wt_block_header_byteswap(blk);

	*sizep = blk->disk_size;
	*checksump = blk->checksum;

err:	__wt_scr_free(session, &tmp);
	return (ret);
}
Beispiel #3
0
/*
 * __wt_block_salvage_next --
 *	Return the address for the next potential block from the file.
 */
int
__wt_block_salvage_next(WT_SESSION_IMPL *session,
    WT_BLOCK *block, uint8_t *addr, size_t *addr_sizep, bool *eofp)
{
	WT_BLOCK_HEADER *blk;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_FH *fh;
	wt_off_t max, offset;
	uint32_t allocsize, checksum, size;
	uint8_t *endp;

	*eofp = 0;

	fh = block->fh;
	allocsize = block->allocsize;
	WT_ERR(__wt_scr_alloc(session, allocsize, &tmp));

	/* Read through the file, looking for pages. */
	for (max = block->size;;) {
		offset = block->slvg_off;
		if (offset >= max) {			/* Check eof. */
			*eofp = 1;
			goto done;
		}

		/*
		 * Read the start of a possible page (an allocation-size block),
		 * and get a page length from it.  Move to the next allocation
		 * sized boundary, we'll never consider this one again.
		 */
		WT_ERR(__wt_read(
		    session, fh, offset, (size_t)allocsize, tmp->mem));
		blk = WT_BLOCK_HEADER_REF(tmp->mem);
		__wt_block_header_byteswap(blk);
		size = blk->disk_size;
		checksum = blk->checksum;

		/*
		 * Check the block size: if it's not insane, read the block.
		 * Reading the block validates any checksum; if reading the
		 * block succeeds, return its address as a possible page,
		 * otherwise, move past it.
		 */
		if (!__wt_block_offset_invalid(block, offset, size) &&
		    __wt_block_read_off(
		    session, block, tmp, offset, size, checksum) == 0)
			break;

		/* Free the allocation-size block. */
		__wt_verbose(session, WT_VERB_SALVAGE,
		    "skipping %" PRIu32 "B at file offset %" PRIuMAX,
		    allocsize, (uintmax_t)offset);
		WT_ERR(__wt_block_off_free(
		    session, block, offset, (wt_off_t)allocsize));
		block->slvg_off += allocsize;
	}

	/* Re-create the address cookie that should reference this block. */
	endp = addr;
	WT_ERR(__wt_block_addr_to_buffer(block, &endp, offset, size, checksum));
	*addr_sizep = WT_PTRDIFF(endp, addr);

done:
err:	__wt_scr_free(session, &tmp);
	return (ret);
}
Beispiel #4
0
/*
 * __wt_block_write_off --
 *	Write a buffer into a block, returning the block's offset, size and
 * checksum.
 */
int
__wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
    WT_ITEM *buf, wt_off_t *offsetp, uint32_t *sizep, uint32_t *cksump,
    bool data_cksum, bool caller_locked)
{
	WT_BLOCK_HEADER *blk;
	WT_DECL_RET;
	WT_FH *fh;
	size_t align_size;
	wt_off_t offset;
	uint32_t cksum;
	bool local_locked;

	fh = block->fh;

	/*
	 * Clear the block header to ensure all of it is initialized, even the
	 * unused fields.
	 */
	blk = WT_BLOCK_HEADER_REF(buf->mem);
	memset(blk, 0, sizeof(*blk));

	/*
	 * Swap the page-header as needed; this doesn't belong here, but it's
	 * the best place to catch all callers.
	 */
	__wt_page_header_byteswap(buf->mem);

	/* Buffers should be aligned for writing. */
	if (!F_ISSET(buf, WT_ITEM_ALIGNED)) {
		WT_ASSERT(session, F_ISSET(buf, WT_ITEM_ALIGNED));
		WT_RET_MSG(session, EINVAL,
		    "direct I/O check: write buffer incorrectly allocated");
	}

	/*
	 * Align the size to an allocation unit.
	 *
	 * The buffer must be big enough for us to zero to the next allocsize
	 * boundary, this is one of the reasons the btree layer must find out
	 * from the block-manager layer the maximum size of the eventual write.
	 */
	align_size = WT_ALIGN(buf->size, block->allocsize);
	if (align_size > buf->memsize) {
		WT_ASSERT(session, align_size <= buf->memsize);
		WT_RET_MSG(session, EINVAL,
		    "buffer size check: write buffer incorrectly allocated");
	}
	if (align_size > UINT32_MAX) {
		WT_ASSERT(session, align_size <= UINT32_MAX);
		WT_RET_MSG(session, EINVAL,
		    "buffer size check: write buffer too large to write");
	}

	/* Zero out any unused bytes at the end of the buffer. */
	memset((uint8_t *)buf->mem + buf->size, 0, align_size - buf->size);

	/*
	 * Set the disk size so we don't have to incrementally read blocks
	 * during salvage.
	 */
	blk->disk_size = WT_STORE_SIZE(align_size);

	/*
	 * Update the block's checksum: if our caller specifies, checksum the
	 * complete data, otherwise checksum the leading WT_BLOCK_COMPRESS_SKIP
	 * bytes.  The assumption is applications with good compression support
	 * turn off checksums and assume corrupted blocks won't decompress
	 * correctly.  However, if compression failed to shrink the block, the
	 * block wasn't compressed, in which case our caller will tell us to
	 * checksum the data to detect corruption. If compression succeeded,
	 * we still need to checksum the first WT_BLOCK_COMPRESS_SKIP bytes
	 * because they're not compressed, both to give salvage a quick test
	 * of whether a block is useful and to give us a test so we don't lose
	 * the first WT_BLOCK_COMPRESS_SKIP bytes without noticing.
	 *
	 * Checksum a little-endian version of the header, and write everything
	 * in little-endian format. The checksum is (potentially) returned in a
	 * big-endian format, swap it into place in a separate step.
	 */
	blk->flags = 0;
	if (data_cksum)
		F_SET(blk, WT_BLOCK_DATA_CKSUM);
	blk->cksum = 0;
	__wt_block_header_byteswap(blk);
	blk->cksum = cksum = __wt_cksum(
	    buf->mem, data_cksum ? align_size : WT_BLOCK_COMPRESS_SKIP);
#ifdef WORDS_BIGENDIAN
	blk->cksum = __wt_bswap32(blk->cksum);
#endif

	/* Pre-allocate some number of extension structures. */
	WT_RET(__wt_block_ext_prealloc(session, 5));

	/*
	 * Acquire a lock, if we don't already hold one.
	 * Allocate space for the write, and optionally extend the file (note
	 * the block-extend function may release the lock).
	 * Release any locally acquired lock.
	 */
	local_locked = false;
	if (!caller_locked) {
		__wt_spin_lock(session, &block->live_lock);
		local_locked = true;
	}
	ret = __wt_block_alloc(session, block, &offset, (wt_off_t)align_size);
	if (ret == 0)
		ret = __wt_block_extend(
		    session, block, fh, offset, align_size, &local_locked);
	if (local_locked)
		__wt_spin_unlock(session, &block->live_lock);
	WT_RET(ret);

	/* Write the block. */
	if ((ret =
	    __wt_write(session, fh, offset, align_size, buf->mem)) != 0) {
		if (!caller_locked)
			__wt_spin_lock(session, &block->live_lock);
		WT_TRET(__wt_block_off_free(
		    session, block, offset, (wt_off_t)align_size));
		if (!caller_locked)
			__wt_spin_unlock(session, &block->live_lock);
		WT_RET(ret);
	}

#ifdef HAVE_SYNC_FILE_RANGE
	/*
	 * Optionally schedule writes for dirty pages in the system buffer
	 * cache, but only if the current session can wait.
	 */
	if (block->os_cache_dirty_max != 0 &&
	    (block->os_cache_dirty += align_size) > block->os_cache_dirty_max &&
	    __wt_session_can_wait(session)) {
		block->os_cache_dirty = 0;
		WT_RET(__wt_fsync_async(session, fh));
	}
#endif
#ifdef HAVE_POSIX_FADVISE
	/* Optionally discard blocks from the system buffer cache. */
	if (block->os_cache_max != 0 &&
	    (block->os_cache += align_size) > block->os_cache_max) {
		block->os_cache = 0;
		if ((ret = posix_fadvise(fh->fd,
		    (wt_off_t)0, (wt_off_t)0, POSIX_FADV_DONTNEED)) != 0)
			WT_RET_MSG(
			    session, ret, "%s: posix_fadvise", block->name);
	}
#endif
	WT_STAT_FAST_CONN_INCR(session, block_write);
	WT_STAT_FAST_CONN_INCRV(session, block_byte_write, align_size);

	WT_RET(__wt_verbose(session, WT_VERB_WRITE,
	    "off %" PRIuMAX ", size %" PRIuMAX ", cksum %" PRIu32,
	    (uintmax_t)offset, (uintmax_t)align_size, cksum));

	*offsetp = offset;
	*sizep = WT_STORE_SIZE(align_size);
	*cksump = cksum;

	return (0);
}