Exemplo n.º 1
0
/*
 * __log_acquire --
 *	Reserve recsize bytes of log space for a slot and fill in the slot's
 *	LSN range and file handle.  The caller holds the log slot lock; the
 *	function may be re-entered from __wt_log_newfile on a log file switch.
 *
 *	Returns 0 on success, otherwise the error reported by
 *	__wt_log_newfile or __wt_fallocate.
 */
static int
__log_acquire(WT_SESSION_IMPL *session, uint64_t recsize, WT_LOGSLOT *slot)
{
	WT_CONNECTION_IMPL *connection = S2C(session);
	WT_LOG *wtlog = connection->log;

	/*
	 * The release LSN is the point where the previous allocation ended.
	 * Recording it before any file switch means a switch waits on the
	 * correct LSN from writes that are still outstanding.
	 */
	slot->slot_release_lsn = wtlog->alloc_lsn;

	/* Move to a new log file when the record won't fit in this one. */
	if (!__log_size_fit(session, &wtlog->alloc_lsn, recsize)) {
		WT_RET(__wt_log_newfile(session, 0));
		if (wtlog->log_close_fh != NULL) {
			F_SET(slot, SLOT_CLOSEFH);
		}
	}

	/*
	 * The slot's start position is taken only now, after any new log
	 * file has been created.
	 */
	slot->slot_start_lsn = wtlog->alloc_lsn;
	slot->slot_start_offset = wtlog->alloc_lsn.offset;

	/* The first real record in a log file triggers pre-allocation. */
	if (wtlog->alloc_lsn.offset == LOG_FIRST_RECORD) {
		WT_RET(__wt_fallocate(session, wtlog->log_fh,
		    LOG_FIRST_RECORD, connection->log_file_max));
	}

	/* Consume the space and publish the remaining slot information. */
	wtlog->alloc_lsn.offset += (off_t)recsize;
	slot->slot_end_lsn = wtlog->alloc_lsn;
	slot->slot_fh = wtlog->log_fh;
	slot->slot_error = 0;

	return (0);
}
Exemplo n.º 2
0
/*
 * __wt_block_extend --
 *	Extend the file.
 *
 *	session: session handle, passed through to the lock, fallocate and
 *	    ftruncate calls.
 *	block: block whose live_lock may be released or re-acquired here.
 *	fh: file handle; its extend_len, extend_size, size,
 *	    fallocate_available and fallocate_requires_locking fields drive
 *	    the decisions below, and extend_size is updated.
 *	offset, align_size: position and size used to decide whether this
 *	    call crosses the extension boundary (presumably those of the block
 *	    about to be written -- confirm against the caller).
 *	release_lockp: in/out.  On entry, true if this function is allowed to
 *	    release live_lock.  Cleared if the lock is released here, and set
 *	    again if the lock is re-acquired for the ftruncate fallback.
 *
 *	Returns 0 if no extension was needed or the extension succeeded
 *	(EBUSY from __wt_ftruncate is treated as success); otherwise the
 *	error from __wt_fallocate (other than ENOTSUP) or __wt_ftruncate.
 */
static inline int
__wt_block_extend(WT_SESSION_IMPL *session, WT_BLOCK *block,
    WT_FH *fh, wt_off_t offset, size_t align_size, bool *release_lockp)
{
	WT_DECL_RET;
	bool locked;

	/*
	 * The locking in this function is messy: by definition, the live system
	 * is locked when we're called, but that lock may have been acquired by
	 * our caller or our caller's caller. If our caller's lock, release_lock
	 * comes in set, indicating this function can unlock it before returning
	 * (either before extending the file or afterward, depending on the call
	 * used). If it is our caller's caller, then release_lock comes in not
	 * set, indicating it cannot be released here.
	 *
	 * If we unlock here, we clear release_lock. But if we then find out we
	 * need a lock after all, we re-acquire the lock and set release_lock so
	 * our caller knows to release it.
	 */
	/* Tracks whether live_lock is currently held (by anyone) on our path. */
	locked = true;

	/* If not configured to extend the file, we're done. */
	if (fh->extend_len == 0)
		return (0);

	/*
	 * Extend the file in chunks.  We want to limit the number of threads
	 * extending the file at the same time, so choose the one thread that's
	 * crossing the extended boundary.  We don't extend newly created files,
	 * and it's theoretically possible we might wait so long our extension
	 * of the file is passed by another thread writing single blocks, that's
	 * why there's a check in case the extended file size becomes too small:
	 * if the file size catches up, every thread tries to extend it.
	 *
	 * In other words: skip the extension when the extended size is still
	 * ahead of the file size and this write is either already past the
	 * extended boundary or still too far short of it.
	 */
	if (fh->extend_size > fh->size &&
	    (offset > fh->extend_size ||
	    offset + fh->extend_len + (wt_off_t)align_size < fh->extend_size))
		return (0);

	/*
	 * File extension may require locking: some variants of the system call
	 * used to extend the file initialize the extended space. If a writing
	 * thread races with the extending thread, the extending thread might
	 * overwrite already written data, and that would be very, very bad.
	 *
	 * Some variants of the system call to extend the file fail at run-time
	 * based on the filesystem type, fall back to ftruncate in that case,
	 * and remember that ftruncate requires locking.
	 */
	if (fh->fallocate_available != WT_FALLOCATE_NOT_AVAILABLE) {
		/*
		 * Release any locally acquired lock if not needed to extend the
		 * file, extending the file may require updating on-disk file's
		 * metadata, which can be slow. (It may be a bad idea to
		 * configure for file extension on systems that require locking
		 * over the extend call.)
		 */
		if (!fh->fallocate_requires_locking && *release_lockp) {
			*release_lockp = locked = false;
			__wt_spin_unlock(session, &block->live_lock);
		}

		/*
		 * Extend the file: there's a race between setting the value of
		 * extend_size and doing the extension, but it should err on the
		 * side of extend_size being smaller than the actual file size,
		 * and that's OK, we simply may do another extension sooner than
		 * otherwise.
		 */
		fh->extend_size = fh->size + fh->extend_len * 2;
		if ((ret = __wt_fallocate(
		    session, fh, fh->size, fh->extend_len * 2)) == 0)
			return (0);
		/* Only ENOTSUP falls through to the ftruncate path below. */
		if (ret != ENOTSUP)
			return (ret);
	}

	/*
	 * We may have a caller lock or a locally acquired lock, but we need a
	 * lock to call ftruncate.
	 */
	if (!locked) {
		__wt_spin_lock(session, &block->live_lock);
		/* Tell the caller it now owns a lock it must release. */
		*release_lockp = true;
	}

	/*
	 * The underlying truncate call initializes allocated space, reset the
	 * extend length after locking so we don't overwrite already-written
	 * blocks.
	 */
	fh->extend_size = fh->size + fh->extend_len * 2;

	/*
	 * The truncate might fail if there's a mapped file (in other words, if
	 * there's an open checkpoint on the file), that's OK.
	 */
	if ((ret = __wt_ftruncate(session, fh, fh->extend_size)) == EBUSY)
		ret = 0;
	return (ret);
}
Exemplo n.º 3
0
/*
 * __wt_block_write_off --
 *	Checksum a buffer, allocate file space for it and write it out,
 * handing back the written block's offset, size and checksum.
 *
 *	data_cksum selects a checksum over the whole aligned buffer rather
 * than only the leading WT_BLOCK_COMPRESS_SKIP bytes.  caller_locked tells
 * us the caller already holds the block's live_lock.
 */
int
__wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
    WT_ITEM *buf, wt_off_t *offsetp, uint32_t *sizep, uint32_t *cksump,
    int data_cksum, int caller_locked)
{
	WT_BLOCK_HEADER *blk;
	WT_DECL_RET;
	WT_FH *fh;
	size_t align_size, cksum_len;
	wt_off_t offset;
	uint8_t *pad;
	int extend, holding_lock, use_truncate;

	blk = WT_BLOCK_HEADER_REF(buf->mem);
	fh = block->fh;
	extend = holding_lock = use_truncate = 0;

	/* Only aligned buffers may be written. */
	if (!F_ISSET(buf, WT_ITEM_ALIGNED)) {
		WT_ASSERT(session, F_ISSET(buf, WT_ITEM_ALIGNED));
		WT_RET_MSG(session, EINVAL,
		    "direct I/O check: write buffer incorrectly allocated");
	}

	/*
	 * Round the write up to a whole number of allocation units.  The
	 * buffer has to have room for the padding we zero below, and the
	 * rounded size has to fit the 32-bit on-disk size field.
	 */
	align_size = WT_ALIGN(buf->size, block->allocsize);
	if (buf->memsize < align_size) {
		WT_ASSERT(session, align_size <= buf->memsize);
		WT_RET_MSG(session, EINVAL,
		    "buffer size check: write buffer incorrectly allocated");
	}
	if (UINT32_MAX < align_size) {
		WT_ASSERT(session, align_size <= UINT32_MAX);
		WT_RET_MSG(session, EINVAL,
		    "buffer size check: write buffer too large to write");
	}

	/* Clear the padding between the data and the aligned end. */
	pad = (uint8_t *)buf->mem + buf->size;
	memset(pad, 0, align_size - buf->size);

	/* Record the on-disk size so salvage can read the block in one go. */
	blk->disk_size = WT_STORE_SIZE(align_size);

	/*
	 * Checksum the block.  When the caller asks for it, cover all of the
	 * data; otherwise cover just the first WT_BLOCK_COMPRESS_SKIP bytes,
	 * which are never compressed.  The checksum field is zeroed before
	 * the checksum is computed over the buffer.
	 */
	blk->flags = 0;
	blk->cksum = 0;
	if (data_cksum) {
		F_SET(blk, WT_BLOCK_DATA_CKSUM);
		cksum_len = align_size;
	} else
		cksum_len = WT_BLOCK_COMPRESS_SKIP;
	blk->cksum = __wt_cksum(buf->mem, cksum_len);

	/* Take the live lock ourselves unless the caller already has it. */
	if (!caller_locked) {
		WT_RET(__wt_block_ext_prealloc(session, 5));
		__wt_spin_lock(session, &block->live_lock);
		holding_lock = 1;
	}
	ret = __wt_block_alloc(session, block, &offset, (wt_off_t)align_size);

	/*
	 * Decide whether this thread extends the file.  Extension happens in
	 * chunks and is done by the one thread whose write crosses the
	 * extended boundary; if the file size has caught up with the extended
	 * size, any thread extends it.
	 */
	if (ret == 0 && fh->extend_len != 0) {
		if (fh->extend_size <= fh->size)
			extend = 1;
		else if (offset + fh->extend_len <= fh->extend_size &&
		    offset + fh->extend_len +
		    (wt_off_t)align_size >= fh->extend_size)
			extend = 1;
	}

	if (extend) {
		fh->extend_size = offset + fh->extend_len * 2;

		use_truncate =
		    fh->fallocate_available == WT_FALLOCATE_NOT_AVAILABLE;
		if (!use_truncate) {
			/*
			 * Extending can be slow, so drop a lock we took
			 * ourselves when the extend call doesn't need it.
			 */
			if (holding_lock && !fh->fallocate_requires_locking) {
				__wt_spin_unlock(session, &block->live_lock);
				holding_lock = 0;
			}

			/*
			 * Try the extend call; when it isn't supported, fall
			 * back to ftruncate.
			 */
			ret = __wt_fallocate(
			    session, fh, offset, fh->extend_len * 2);
			if (ret == ENOTSUP) {
				ret = 0;
				use_truncate = 1;
			}
		}

		if (use_truncate) {
			/* ftruncate must run with the live lock held. */
			if (!caller_locked && !holding_lock) {
				__wt_spin_lock(session, &block->live_lock);
				holding_lock = 1;
			}

			/*
			 * A busy failure (a mapped file, i.e. an open
			 * checkpoint) is not an error.
			 */
			ret = __wt_ftruncate(
			    session, fh, offset + fh->extend_len * 2);
			if (ret == EBUSY)
				ret = 0;
		}
	}

	/* Drop the lock if it's ours, then report any failure so far. */
	if (holding_lock) {
		__wt_spin_unlock(session, &block->live_lock);
		holding_lock = 0;
	}
	WT_RET(ret);

	/* Write the block; on failure give the allocated space back. */
	ret = __wt_write(session, fh, offset, align_size, buf->mem);
	if (ret != 0) {
		if (!caller_locked)
			__wt_spin_lock(session, &block->live_lock);
		WT_TRET(__wt_block_off_free(
		    session, block, offset, (wt_off_t)align_size));
		if (!caller_locked)
			__wt_spin_unlock(session, &block->live_lock);
		WT_RET(ret);
	}

#ifdef HAVE_SYNC_FILE_RANGE
	/*
	 * If configured, schedule an asynchronous flush once enough dirty
	 * data has accumulated, provided this session is allowed to wait.
	 */
	if (block->os_cache_dirty_max != 0) {
		if ((block->os_cache_dirty += align_size) >
		    block->os_cache_dirty_max &&
		    __wt_session_can_wait(session)) {
			block->os_cache_dirty = 0;
			WT_RET(__wt_fsync_async(session, fh));
		}
	}
#endif
#ifdef HAVE_POSIX_FADVISE
	/* If configured, ask the system to drop the file's cached blocks. */
	if (block->os_cache_max != 0) {
		if ((block->os_cache += align_size) > block->os_cache_max) {
			block->os_cache = 0;
			ret = posix_fadvise(fh->fd,
			    (wt_off_t)0, (wt_off_t)0, POSIX_FADV_DONTNEED);
			if (ret != 0)
				WT_RET_MSG(session,
				    ret, "%s: posix_fadvise", block->name);
		}
	}
#endif
	WT_STAT_FAST_CONN_INCR(session, block_write);
	WT_STAT_FAST_CONN_INCRV(session, block_byte_write, align_size);

	WT_RET(__wt_verbose(session, WT_VERB_WRITE,
	    "off %" PRIuMAX ", size %" PRIuMAX ", cksum %" PRIu32,
	    (uintmax_t)offset, (uintmax_t)align_size, blk->cksum));

	/* Hand the block's address cookie components back to the caller. */
	*cksump = blk->cksum;
	*sizep = WT_STORE_SIZE(align_size);
	*offsetp = offset;

	return (ret);
}
Exemplo n.º 4
0
/*
 * Write the buffer's data into the file backing the block, returning the
 * written block's offset, size and checksum.
 *
 *	session: session handle passed to all lower-level calls.
 *	block: block whose file handle (block->fh) is written and whose
 *	    live_lock protects allocation and file extension.
 *	buf: buffer to write; must be flagged WT_ITEM_ALIGNED and have
 *	    memsize large enough for the size rounded up to block->allocsize.
 *	offsetp, sizep, cksump: outputs, set only on success.
 *	data_cksum: non-zero to checksum the whole aligned buffer, zero to
 *	    checksum only the leading WT_BLOCK_COMPRESS_SKIP bytes.
 *	caller_locked: non-zero if the caller already holds block->live_lock.
 *
 * Returns 0 on success; EINVAL for a misallocated or oversized buffer;
 * otherwise the error from the failing allocation, extension, write, sync,
 * posix_fadvise or verbose call.
 */
int __wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_off_t *offsetp, 
						uint32_t *sizep, uint32_t *cksump, int data_cksum, int caller_locked)
{
	WT_BLOCK_HEADER *blk;
	WT_DECL_RET;
	WT_FH *fh;
	size_t align_size;
	wt_off_t offset;
	int local_locked;

	blk = WT_BLOCK_HEADER_REF(buf->mem);
	fh = block->fh;
	local_locked = 0;

	/* The buffer must be aligned: unaligned buffers are rejected. */
	if(!F_ISSET(buf, WT_ITEM_ALIGNED)){
		WT_ASSERT(session, F_ISSET(buf, WT_ITEM_ALIGNED));
		WT_RET_MSG(session, EINVAL, "direct I/O check: write buffer incorrectly allocated");
	}

	/*
	 * Round buf->size up to the block allocation size.  If the rounded
	 * size exceeds buf->memsize we cannot write: zeroing the padding
	 * below would run past the end of the buffer.
	 */
	align_size = WT_ALIGN(buf->size, block->allocsize);
	if (align_size > buf->memsize) {
		WT_ASSERT(session, align_size <= buf->memsize);
		WT_RET_MSG(session, EINVAL, "buffer size check: write buffer incorrectly allocated");
	}
	/* The aligned size must fit in 32 bits. */
	if (align_size > UINT32_MAX) {
		WT_ASSERT(session, align_size <= UINT32_MAX);
		WT_RET_MSG(session, EINVAL, "buffer size check: write buffer too large to write");
	}

	/* Zero the padding between the end of the data and the aligned size. */
	memset((uint8_t*)buf->mem + buf->size, 0, align_size - buf->size);

	/* Fill in the block header: stored size and flags. */
	blk->disk_size = WT_STORE_SIZE(align_size);
	blk->flags = 0;
	if(data_cksum)
		F_SET(blk, WT_BLOCK_DATA_CKSUM);

	/*
	 * Compute the checksum.  The header is located inside buf->mem, so
	 * the checksum field is cleared first; otherwise whatever bytes were
	 * left in that field could be folded into the computed checksum.
	 */
	blk->cksum = 0;
	blk->cksum = __wt_cksum(buf->mem, data_cksum ? align_size : WT_BLOCK_COMPRESS_SKIP);

	/* Take the live lock ourselves unless the caller already holds it. */
	if (!caller_locked) {
		WT_RET(__wt_block_ext_prealloc(session, 5));
		__wt_spin_lock(session, &block->live_lock);
		local_locked = 1;
	}

	ret = __wt_block_alloc(session, block, &offset, (wt_off_t)align_size);
	/*
	 * Decide whether the file should be extended: either the file size
	 * has caught up with the extended size, or this write crosses the
	 * extended boundary.
	 */
	if(ret == 0 && fh->extend_len != 0 && (fh->extend_size <= fh->size ||
		(offset + fh->extend_len <= fh->extend_size && offset + fh->extend_len + (wt_off_t)align_size >= fh->extend_size))){
			/* New extended size: offset plus twice the extend length. */
			fh->extend_size = offset + fh->extend_len * 2;
			if (fh->fallocate_available != WT_FALLOCATE_NOT_AVAILABLE) {
				/*
				 * Extending the file can take a long time; release a
				 * locally acquired spin lock first when the extend call
				 * does not require it, so other threads don't spin.
				 */
				if (!fh->fallocate_requires_locking && local_locked) {
					__wt_spin_unlock(session, &block->live_lock);
					local_locked = 0;
				}

				/* Extend the file; fall back to ftruncate if unsupported. */
				if ((ret = __wt_fallocate(session,fh, offset, fh->extend_len * 2)) == ENOTSUP) {
					ret = 0;
					goto extend_truncate;
				}
			}
			else{
extend_truncate:
				/* ftruncate needs the lock: re-acquire it if we dropped it. */
				if (!caller_locked && local_locked == 0) {
					__wt_spin_lock(session, &block->live_lock);
					local_locked = 1;
				}
				/* Set the file size directly; EBUSY is not treated as an error. */
				if ((ret = __wt_ftruncate(session, fh, offset + fh->extend_len * 2)) == EBUSY)
					ret = 0;
			}
	}

	/* Release any locally acquired lock before reporting errors. */
	if(local_locked){
		__wt_spin_unlock(session, &block->live_lock);
		local_locked = 0;
	}

	WT_RET(ret);
	/* Write the block's data. */
	ret =__wt_write(session, fh, offset, align_size, buf->mem);
	if (ret != 0) {
		if (!caller_locked)
			__wt_spin_lock(session, &block->live_lock);
		/* The write failed: free the range that was allocated for it. */
		WT_TRET(__wt_block_off_free(session, block, offset, (wt_off_t)align_size));
		if (!caller_locked)
			__wt_spin_unlock(session, &block->live_lock);

		WT_RET(ret);
	}

#ifdef HAVE_SYNC_FILE_RANGE
	/*
	 * Too much dirty data has accumulated: schedule an asynchronous
	 * flush, but only if this session is allowed to wait.
	 */
	if (block->os_cache_dirty_max != 0 && (block->os_cache_dirty += align_size) > block->os_cache_dirty_max && __wt_session_can_wait(session)) {
			block->os_cache_dirty = 0;
			WT_RET(__wt_fsync_async(session, fh));
	}
#endif

#ifdef HAVE_POSIX_FADVISE
	/* Ask the system to discard this file's data from its page cache. */
	if (block->os_cache_max != 0 && (block->os_cache += align_size) > block->os_cache_max) {
		block->os_cache = 0;
		if ((ret = posix_fadvise(fh->fd, (wt_off_t)0, (wt_off_t)0, POSIX_FADV_DONTNEED)) != 0)
			WT_RET_MSG( session, ret, "%s: posix_fadvise", block->name);
	}
#endif

	WT_STAT_FAST_CONN_INCR(session, block_write);
	WT_STAT_FAST_CONN_INCRV(session, block_byte_write, align_size);

	WT_RET(__wt_verbose(session, WT_VERB_WRITE, "off %" PRIuMAX ", size %" PRIuMAX ", cksum %" PRIu32, 
							(uintmax_t)offset, (uintmax_t)align_size, blk->cksum));

	/* Report where and how the block was written. */
	*offsetp = offset;
	*sizep = WT_STORE_SIZE(align_size);
	*cksump = blk->cksum;

	return ret;
}