コード例 #1
0
ファイル: log_sys.c プロジェクト: DINKIN/mongo
/*
 * __wt_log_system_record --
 *	Write a system log record for the previous LSN.
 */
int
__wt_log_system_record(
    WT_SESSION_IMPL *session, WT_FH *log_fh, WT_LSN *lsn)
{
	WT_DECL_ITEM(logrec_buf);
	WT_DECL_RET;
	WT_LOG *log;
	WT_LOG_RECORD *logrec;
	WT_LOGSLOT tmp;
	WT_MYSLOT myslot;
	const char *fmt = WT_UNCHECKED_STRING(I);
	uint32_t rectype = WT_LOGREC_SYSTEM;
	size_t recsize;

	log = S2C(session)->log;
	WT_RET(__wt_logrec_alloc(session, log->allocsize, &logrec_buf));
	memset((uint8_t *)logrec_buf->mem, 0, log->allocsize);

	WT_ERR(__wt_struct_size(session, &recsize, fmt, rectype));
	WT_ERR(__wt_struct_pack(session,
	    (uint8_t *)logrec_buf->data + logrec_buf->size, recsize, fmt,
	    rectype));
	logrec_buf->size += recsize;
	WT_ERR(__wt_logop_prev_lsn_pack(session, logrec_buf, lsn));
	WT_ASSERT(session, logrec_buf->size <= log->allocsize);

	logrec = (WT_LOG_RECORD *)logrec_buf->mem;

	/*
	 * We know system records are this size.  And we have to adjust
	 * the size now because we're not going through the normal log
	 * write path and the packing functions needed the correct offset
	 * earlier.
	 */
	logrec_buf->size = logrec->len = log->allocsize;

	/* We do not compress nor encrypt this record. */
	logrec->checksum = 0;
	logrec->flags = 0;
	__wt_log_record_byteswap(logrec);
	logrec->checksum = __wt_checksum(logrec, log->allocsize);
#ifdef WORDS_BIGENDIAN
	logrec->checksum = __wt_bswap32(logrec->checksum);
#endif
	WT_CLEAR(tmp);
	memset(&myslot, 0, sizeof(myslot));
	myslot.slot = &tmp;
	__wt_log_slot_activate(session, &tmp);
	/*
	 * Override the file handle to the one we're using.
	 */
	tmp.slot_fh = log_fh;
	WT_ERR(__wt_log_fill(session, &myslot, true, logrec_buf, NULL));
err:	__wt_logrec_free(session, &logrec_buf);
	return (ret);
}
コード例 #2
0
ファイル: salvage.c プロジェクト: Machyne/mongo
/*
 * copy --
 *	Copy the created page to the end of the salvage file.
 */
void
copy(u_int gen, u_int recno)
{
	FILE *ifp, *ofp;
	WT_PAGE_HEADER *dsk;
	WT_BLOCK_HEADER *blk;
	char buf[PSIZE];

	CHECK((ifp = fopen(LOAD, "r")) != NULL);

	/*
	 * If the salvage file doesn't exist, then we're creating it:
	 * copy the first sector (the file description).
	 * Otherwise, we are appending to an existing file.
	 */
	if (file_exists(SLVG))
		CHECK((ofp = fopen(SLVG, "a")) != NULL);
	else {
		CHECK((ofp = fopen(SLVG, "w")) != NULL);
		CHECK(fread(buf, 1, PSIZE, ifp) == PSIZE);
		CHECK(fwrite(buf, 1, PSIZE, ofp) == PSIZE);
	}

	/*
	 * If there's data, copy/update the first formatted page.
	 */
	if (gen != 0) {
		CHECK(fseek(ifp, (long)PSIZE, SEEK_SET) == 0);
		CHECK(fread(buf, 1, PSIZE, ifp) == PSIZE);
		dsk = (void *)buf;
		if (page_type != WT_PAGE_ROW_LEAF)
			dsk->recno = recno;
		dsk->write_gen = gen;
		blk = WT_BLOCK_HEADER_REF(buf);
		blk->checksum = 0;
		blk->checksum = __wt_checksum(dsk, PSIZE);
		CHECK(fwrite(buf, 1, PSIZE, ofp) == PSIZE);
	}

	CHECK(fclose(ifp) == 0);
	CHECK(fclose(ofp) == 0);
}
コード例 #3
0
ファイル: block_read.c プロジェクト: DINKIN/mongo
/*
 * __wt_block_read_off --
 *	Read an addr/size pair referenced block into a buffer.
 */
int
__wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
    WT_ITEM *buf, wt_off_t offset, uint32_t size, uint32_t checksum)
{
	WT_BLOCK_HEADER *blk, swap;
	size_t bufsize;
	uint32_t page_checksum;

	__wt_verbose(session, WT_VERB_READ,
	    "off %" PRIuMAX ", size %" PRIu32 ", checksum %" PRIu32,
	    (uintmax_t)offset, size, checksum);

	WT_STAT_CONN_INCR(session, block_read);
	WT_STAT_CONN_INCRV(session, block_byte_read, size);

	/*
	 * Grow the buffer as necessary and read the block.  Buffers should be
	 * aligned for reading, but there are lots of buffers (for example, file
	 * cursors have two buffers each, key and value), and it's difficult to
	 * be sure we've found all of them.  If the buffer isn't aligned, it's
	 * an easy fix: set the flag and guarantee we reallocate it.  (Most of
	 * the time on reads, the buffer memory has not yet been allocated, so
	 * we're not adding any additional processing time.)
	 */
	if (F_ISSET(buf, WT_ITEM_ALIGNED))
		bufsize = size;
	else {
		F_SET(buf, WT_ITEM_ALIGNED);
		bufsize = WT_MAX(size, buf->memsize + 10);
	}
	WT_RET(__wt_buf_init(session, buf, bufsize));
	WT_RET(__wt_read(session, block->fh, offset, size, buf->mem));
	buf->size = size;

	/*
	 * We incrementally read through the structure before doing a checksum,
	 * do little- to big-endian handling early on, and then select from the
	 * original or swapped structure as needed.
	 */
	blk = WT_BLOCK_HEADER_REF(buf->mem);
	__wt_block_header_byteswap_copy(blk, &swap);
	if (swap.checksum == checksum) {
		blk->checksum = 0;
		page_checksum = __wt_checksum(buf->mem,
		    F_ISSET(&swap, WT_BLOCK_DATA_CKSUM) ?
		    size : WT_BLOCK_COMPRESS_SKIP);
		if (page_checksum == checksum) {
			/*
			 * Swap the page-header as needed; this doesn't belong
			 * here, but it's the best place to catch all callers.
			 */
			__wt_page_header_byteswap(buf->mem);
			return (0);
		}

		if (!F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE))
			__wt_errx(session,
			    "read checksum error for %" PRIu32 "B block at "
			    "offset %" PRIuMAX ": calculated block checksum "
			    "of %" PRIu32 " doesn't match expected checksum "
			    "of %" PRIu32,
			    size, (uintmax_t)offset, page_checksum, checksum);
	} else
		if (!F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE))
			__wt_errx(session,
			    "read checksum error for %" PRIu32 "B block at "
			    "offset %" PRIuMAX ": block header checksum "
			    "of %" PRIu32 " doesn't match expected checksum "
			    "of %" PRIu32,
			    size, (uintmax_t)offset, swap.checksum, checksum);

	/* Panic if a checksum fails during an ordinary read. */
	return (block->verify ||
	    F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE) ?
	    WT_ERROR : __wt_illegal_value(session, block->name));
}
コード例 #4
0
ファイル: block_write.c プロジェクト: mongodb/mongo
/*
 * __block_write_off --
 *	Write a buffer into a block, returning the block's offset, size and
 * checksum.
 */
static int
__block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
    WT_ITEM *buf, wt_off_t *offsetp, uint32_t *sizep, uint32_t *checksump,
    bool data_checksum, bool checkpoint_io, bool caller_locked)
{
	WT_BLOCK_HEADER *blk;
	WT_DECL_RET;
	WT_FH *fh;
	wt_off_t offset;
	size_t align_size;
	uint32_t checksum;
	bool local_locked;

	*offsetp = 0;			/* -Werror=maybe-uninitialized */
	*sizep = 0;			/* -Werror=maybe-uninitialized */
	*checksump = 0;			/* -Werror=maybe-uninitialized */

	fh = block->fh;

	/*
	 * Clear the block header to ensure all of it is initialized, even the
	 * unused fields.
	 */
	blk = WT_BLOCK_HEADER_REF(buf->mem);
	memset(blk, 0, sizeof(*blk));

	/* Buffers should be aligned for writing. */
	if (!F_ISSET(buf, WT_ITEM_ALIGNED)) {
		WT_ASSERT(session, F_ISSET(buf, WT_ITEM_ALIGNED));
		WT_RET_MSG(session, EINVAL,
		    "direct I/O check: write buffer incorrectly allocated");
	}

	/*
	 * Align the size to an allocation unit.
	 *
	 * The buffer must be big enough for us to zero to the next allocsize
	 * boundary, this is one of the reasons the btree layer must find out
	 * from the block-manager layer the maximum size of the eventual write.
	 */
	align_size = WT_ALIGN(buf->size, block->allocsize);
	if (align_size > buf->memsize) {
		WT_ASSERT(session, align_size <= buf->memsize);
		WT_RET_MSG(session, EINVAL,
		    "buffer size check: write buffer incorrectly allocated");
	}
	if (align_size > UINT32_MAX) {
		WT_ASSERT(session, align_size <= UINT32_MAX);
		WT_RET_MSG(session, EINVAL,
		    "buffer size check: write buffer too large to write");
	}

	/* Zero out any unused bytes at the end of the buffer. */
	memset((uint8_t *)buf->mem + buf->size, 0, align_size - buf->size);

	/*
	 * Set the disk size so we don't have to incrementally read blocks
	 * during salvage.
	 */
	blk->disk_size = WT_STORE_SIZE(align_size);

	/*
	 * Update the block's checksum: if our caller specifies, checksum the
	 * complete data, otherwise checksum the leading WT_BLOCK_COMPRESS_SKIP
	 * bytes.  The assumption is applications with good compression support
	 * turn off checksums and assume corrupted blocks won't decompress
	 * correctly.  However, if compression failed to shrink the block, the
	 * block wasn't compressed, in which case our caller will tell us to
	 * checksum the data to detect corruption. If compression succeeded,
	 * we still need to checksum the first WT_BLOCK_COMPRESS_SKIP bytes
	 * because they're not compressed, both to give salvage a quick test
	 * of whether a block is useful and to give us a test so we don't lose
	 * the first WT_BLOCK_COMPRESS_SKIP bytes without noticing.
	 *
	 * Checksum a little-endian version of the header, and write everything
	 * in little-endian format. The checksum is (potentially) returned in a
	 * big-endian format, swap it into place in a separate step.
	 */
	blk->flags = 0;
	if (data_checksum)
		F_SET(blk, WT_BLOCK_DATA_CKSUM);
	blk->checksum = 0;
	__wt_block_header_byteswap(blk);
	blk->checksum = checksum = __wt_checksum(
	    buf->mem, data_checksum ? align_size : WT_BLOCK_COMPRESS_SKIP);
#ifdef WORDS_BIGENDIAN
	blk->checksum = __wt_bswap32(blk->checksum);
#endif

	/* Pre-allocate some number of extension structures. */
	WT_RET(__wt_block_ext_prealloc(session, 5));

	/*
	 * Acquire a lock, if we don't already hold one.
	 * Allocate space for the write, and optionally extend the file (note
	 * the block-extend function may release the lock).
	 * Release any locally acquired lock.
	 */
	local_locked = false;
	if (!caller_locked) {
		__wt_spin_lock(session, &block->live_lock);
		local_locked = true;
	}
	ret = __wt_block_alloc(session, block, &offset, (wt_off_t)align_size);
	if (ret == 0)
		ret = __wt_block_extend(
		    session, block, fh, offset, align_size, &local_locked);
	if (local_locked)
		__wt_spin_unlock(session, &block->live_lock);
	WT_RET(ret);

	/* Write the block. */
	if ((ret =
	    __wt_write(session, fh, offset, align_size, buf->mem)) != 0) {
		if (!caller_locked)
			__wt_spin_lock(session, &block->live_lock);
		WT_TRET(__wt_block_off_free(
		    session, block, offset, (wt_off_t)align_size));
		if (!caller_locked)
			__wt_spin_unlock(session, &block->live_lock);
		WT_RET(ret);
	}

	/*
	 * Optionally schedule writes for dirty pages in the system buffer
	 * cache, but only if the current session can wait.
	 */
	if (block->os_cache_dirty_max != 0 &&
	    fh->written > block->os_cache_dirty_max &&
	    __wt_session_can_wait(session)) {
		fh->written = 0;
		if ((ret = __wt_fsync(session, fh, false)) != 0) {
			 /*
			  * Ignore ENOTSUP, but don't try again.
			  */
			if (ret != ENOTSUP)
				return (ret);
			block->os_cache_dirty_max = 0;
		}
	}

	/* Optionally discard blocks from the buffer cache. */
	WT_RET(__wt_block_discard(session, block, align_size));

	WT_STAT_CONN_INCR(session, block_write);
	WT_STAT_CONN_INCRV(session, block_byte_write, align_size);
	if (checkpoint_io)
		WT_STAT_CONN_INCRV(
		    session, block_byte_write_checkpoint, align_size);

	__wt_verbose(session, WT_VERB_WRITE,
	    "off %" PRIuMAX ", size %" PRIuMAX ", checksum %#" PRIx32,
	    (uintmax_t)offset, (uintmax_t)align_size, checksum);

	*offsetp = offset;
	*sizep = WT_STORE_SIZE(align_size);
	*checksump = checksum;

	return (0);
}