/*
 * __wt_log_slot_free --
 *	Free a slot back into the pool.
 */
int
__wt_log_slot_free(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
{
	WT_DECL_RET;

	ret = 0;
	/*
	 * Grow the buffer if needed before returning it to the pool.
	 */
	if (F_ISSET(slot, WT_SLOT_BUF_GROW)) {
		WT_STAT_FAST_CONN_INCR(session, log_buffer_grow);
		WT_STAT_FAST_CONN_INCRV(session,
		    log_buffer_size, slot->slot_buf.memsize);
		WT_ERR(__wt_buf_grow(session,
		    &slot->slot_buf, slot->slot_buf.memsize * 2));
	}
err:
	/*
	 * No matter if there is an error, we always want to free
	 * the slot back to the pool.
	 */
	/*
	 * Make sure flags don't get retained between uses.
	 * We have to reset them here because multiple threads may
	 * change the flags when joining the slot.
	 */
	slot->flags = WT_SLOT_INIT_FLAGS;
	slot->slot_state = WT_LOG_SLOT_FREE;
	return (ret);
}
/*
 * __log_fill --
 *	Copy a thread's log records into the assigned slot.
 */
static int
__log_fill(WT_SESSION_IMPL *session,
    WT_MYSLOT *myslot, int direct, WT_ITEM *record, WT_LSN *lsnp)
{
	WT_DECL_RET;
	WT_LOG_RECORD *logrec;

	logrec = (WT_LOG_RECORD *)record->mem;
	/*
	 * Call __wt_write.  For now the offset is the real byte offset.
	 * If the offset becomes a unit of LOG_ALIGN this is where we would
	 * multiply by LOG_ALIGN to get the real file byte offset for write().
	 */
	if (direct)
		WT_ERR(__wt_write(session, myslot->slot->slot_fh,
		    myslot->offset + myslot->slot->slot_start_offset,
		    (size_t)logrec->len, (void *)logrec));
	else
		memcpy((char *)myslot->slot->slot_buf.mem + myslot->offset,
		    logrec, logrec->len);

	WT_STAT_FAST_CONN_INCRV(session, log_bytes_written, logrec->len);
	if (lsnp != NULL) {
		*lsnp = myslot->slot->slot_start_lsn;
		lsnp->offset += (off_t)myslot->offset;
	}
err:
	if (ret != 0 && myslot->slot->slot_error == 0)
		myslot->slot->slot_error = ret;
	return (ret);
}
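/*
 * Illustration (not WiredTiger code): as in __log_fill above, a record's LSN
 * is derived by adding the record's offset within its slot to the slot's
 * starting LSN.  A minimal sketch with simplified, hypothetical stand-in
 * types:
 */
#include <stdint.h>
#include <stdio.h>

struct lsn {				/* Hypothetical, simplified LSN */
	uint32_t file;			/* Log file number */
	uint64_t offset;		/* Byte offset within the file */
};

/* Compute a record's LSN from its slot's start LSN and slot offset. */
static struct lsn
record_lsn(struct lsn slot_start, uint64_t slot_offset)
{
	struct lsn lsn = slot_start;
	lsn.offset += slot_offset;
	return (lsn);
}

int
main(void)
{
	struct lsn start = { 3, 8192 };
	struct lsn lsn = record_lsn(start, 512);
	printf("record LSN = %u/%llu\n",
	    lsn.file, (unsigned long long)lsn.offset);
	return (0);
}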
/*
 * __wt_bm_read --
 *	Map or read address cookie referenced block into a buffer.
 */
int
__wt_bm_read(WT_BM *bm, WT_SESSION_IMPL *session,
    WT_ITEM *buf, const uint8_t *addr, size_t addr_size)
{
	WT_BLOCK *block;
	wt_off_t offset;
	uint32_t cksum, size;
	bool mapped;

	WT_UNUSED(addr_size);
	block = bm->block;

	/* Crack the cookie. */
	WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum));

	/*
	 * Map the block if it's possible.
	 */
	mapped = bm->map != NULL && offset + size <= (wt_off_t)bm->maplen;
	if (mapped) {
		buf->data = (uint8_t *)bm->map + offset;
		buf->size = size;
		WT_RET(__wt_mmap_preload(session, buf->data, buf->size));

		WT_STAT_FAST_CONN_INCR(session, block_map_read);
		WT_STAT_FAST_CONN_INCRV(session, block_byte_map_read, size);
		return (0);
	}

#ifdef HAVE_DIAGNOSTIC
	/*
	 * In diagnostic mode, verify the block we're about to read isn't on
	 * the available list, or for live systems, the discard list.
	 */
	WT_RET(__wt_block_misplaced(
	    session, block, "read", offset, size, bm->is_live));
#endif

	/* Read the block. */
	WT_RET(__wt_block_read_off(session, block, buf, offset, size, cksum));

#ifdef HAVE_POSIX_FADVISE
	/* Optionally discard blocks from the system's buffer cache. */
	if (block->os_cache_max != 0 &&
	    (block->os_cache += size) > block->os_cache_max) {
		WT_DECL_RET;

		block->os_cache = 0;
		/* Ignore EINVAL - some file systems don't support the flag. */
		if ((ret = posix_fadvise(block->fh->fd,
		    (wt_off_t)0, (wt_off_t)0, POSIX_FADV_DONTNEED)) != 0 &&
		    ret != EINVAL)
			WT_RET_MSG(
			    session, ret, "%s: posix_fadvise", block->name);
	}
#endif
	return (0);
}
/*
 * __wt_block_read_off --
 *	Read an addr/size pair referenced block into a buffer.
 */
int
__wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
    WT_ITEM *buf, wt_off_t offset, uint32_t size, uint32_t cksum)
{
	WT_BLOCK_HEADER *blk;
	size_t bufsize;
	uint32_t page_cksum;

	WT_RET(__wt_verbose(session, WT_VERB_READ,
	    "off %" PRIuMAX ", size %" PRIu32 ", cksum %" PRIu32,
	    (uintmax_t)offset, size, cksum));

	WT_STAT_FAST_CONN_INCR(session, block_read);
	WT_STAT_FAST_CONN_INCRV(session, block_byte_read, size);

	/*
	 * Grow the buffer as necessary and read the block.  Buffers should be
	 * aligned for reading, but there are lots of buffers (for example,
	 * file cursors have two buffers each, key and value), and it's
	 * difficult to be sure we've found all of them.  If the buffer isn't
	 * aligned, it's an easy fix: set the flag and guarantee we reallocate
	 * it.  (Most of the time on reads, the buffer memory has not yet been
	 * allocated, so we're not adding any additional processing time.)
	 */
	if (F_ISSET(buf, WT_ITEM_ALIGNED))
		bufsize = size;
	else {
		F_SET(buf, WT_ITEM_ALIGNED);
		bufsize = WT_MAX(size, buf->memsize + 10);
	}
	WT_RET(__wt_buf_init(session, buf, bufsize));
	WT_RET(__wt_read(session, block->fh, offset, size, buf->mem));
	buf->size = size;

	blk = WT_BLOCK_HEADER_REF(buf->mem);
	page_cksum = blk->cksum;
	if (page_cksum == cksum) {
		blk->cksum = 0;
		page_cksum = __wt_cksum(buf->mem,
		    F_ISSET(blk, WT_BLOCK_DATA_CKSUM) ?
		    size : WT_BLOCK_COMPRESS_SKIP);
		if (page_cksum == cksum)
			return (0);
	}

	if (!F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK))
		__wt_errx(session,
		    "read checksum error [%" PRIu32 "B @ %" PRIuMAX ", %"
		    PRIu32 " != %" PRIu32 "]",
		    size, (uintmax_t)offset, cksum, page_cksum);

	/* Panic if a checksum fails during an ordinary read. */
	return (block->verify ||
	    F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK) ?
	    WT_ERROR : __wt_illegal_value(session, block->name));
}
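/*
 * Illustration (not WiredTiger code): the checksum round-trip used above.
 * At write time the header's cksum field is zeroed before checksumming, so
 * at read time the verifier must zero the field again before recomputing.
 * A minimal sketch using a stand-in checksum (FNV-1a) instead of __wt_cksum,
 * with a hypothetical simplified header:
 */
#include <stdint.h>
#include <stdio.h>

struct block_header {			/* Hypothetical, simplified header */
	uint32_t disk_size;
	uint32_t cksum;
};

static uint32_t
fnv1a(const void *p, size_t len)	/* Stand-in for __wt_cksum */
{
	const uint8_t *b = p;
	uint32_t h = 2166136261u;
	for (size_t i = 0; i < len; i++)
		h = (h ^ b[i]) * 16777619u;
	return (h);
}

int
main(void)
{
	uint8_t block[128] = { 0 };
	struct block_header *blk = (struct block_header *)block;

	/* Write side: checksum computed with the cksum field zeroed. */
	blk->disk_size = sizeof(block);
	blk->cksum = 0;
	blk->cksum = fnv1a(block, sizeof(block));

	/* Read side: save, zero, recompute, compare. */
	uint32_t expect = blk->cksum;
	blk->cksum = 0;
	uint32_t actual = fnv1a(block, sizeof(block));
	printf("checksum %s\n", actual == expect ? "ok" : "MISMATCH");
	return (0);
}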
/*
 * __wt_log_slot_grow_buffers --
 *	Increase the buffer size of all available slots in the buffer pool.
 *	Go to some lengths to include active (but unused) slots to handle
 *	the case where all log write record sizes exceed the size of the
 *	active buffer.
 */
int
__wt_log_slot_grow_buffers(WT_SESSION_IMPL *session, size_t newsize)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_LOG *log;
	WT_LOGSLOT *slot;
	int64_t orig_state;
	uint64_t old_size, total_growth;
	int i;

	conn = S2C(session);
	log = conn->log;
	total_growth = 0;
	WT_STAT_FAST_CONN_INCR(session, log_buffer_grow);
	/*
	 * Take the log slot lock to prevent other threads growing buffers
	 * at the same time.  Could tighten the scope of this lock, or have
	 * a separate lock if there is contention.
	 */
	__wt_spin_lock(session, &log->log_slot_lock);
	for (i = 0; i < SLOT_POOL; i++) {
		slot = &log->slot_pool[i];
		/* Avoid atomic operations if they won't succeed. */
		if (slot->slot_state != WT_LOG_SLOT_FREE &&
		    slot->slot_state != WT_LOG_SLOT_READY)
			continue;
		/* Don't keep growing unrelated buffers. */
		if (slot->slot_buf.memsize > (10 * newsize) &&
		    !F_ISSET(slot, SLOT_BUF_GROW))
			continue;
		orig_state = WT_ATOMIC_CAS_VAL8(slot->slot_state,
		    WT_LOG_SLOT_FREE, WT_LOG_SLOT_PENDING);
		if (orig_state != WT_LOG_SLOT_FREE) {
			orig_state = WT_ATOMIC_CAS_VAL8(slot->slot_state,
			    WT_LOG_SLOT_READY, WT_LOG_SLOT_PENDING);
			if (orig_state != WT_LOG_SLOT_READY)
				continue;
		}

		/* We have a slot - now go ahead and grow the buffer. */
		old_size = slot->slot_buf.memsize;
		F_CLR(slot, SLOT_BUF_GROW);
		WT_ERR(__wt_buf_grow(session, &slot->slot_buf,
		    WT_MAX(slot->slot_buf.memsize * 2, newsize)));
		slot->slot_state = orig_state;
		total_growth += slot->slot_buf.memsize - old_size;
	}
err:	__wt_spin_unlock(session, &log->log_slot_lock);
	WT_STAT_FAST_CONN_INCRV(session, log_buffer_size, total_growth);
	return (ret);
}
/*
 * __wt_log_slot_init --
 *	Initialize the slot array.
 */
int
__wt_log_slot_init(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_LOG *log;
	WT_LOGSLOT *slot;
	int32_t i;

	conn = S2C(session);
	log = conn->log;
	WT_CACHE_LINE_ALIGNMENT_VERIFY(session, log->slot_pool);
	for (i = 0; i < WT_SLOT_POOL; i++)
		log->slot_pool[i].slot_state = WT_LOG_SLOT_FREE;

	/*
	 * Allocate memory for buffers now that the arrays are set up.
	 * Separate this from the loop above to make error handling simpler.
	 */
	/*
	 * !!! If the buffer size is too close to the log file size, we will
	 * switch log files very aggressively.  Scale back the buffer for
	 * small log file sizes.
	 */
	log->slot_buf_size = (uint32_t)WT_MIN(
	    (size_t)conn->log_file_max / 10, WT_LOG_SLOT_BUF_SIZE);
	for (i = 0; i < WT_SLOT_POOL; i++) {
		WT_ERR(__wt_buf_init(session,
		    &log->slot_pool[i].slot_buf, log->slot_buf_size));
		F_SET(&log->slot_pool[i], WT_SLOT_INIT_FLAGS);
	}
	WT_STAT_FAST_CONN_INCRV(session,
	    log_buffer_size, log->slot_buf_size * WT_SLOT_POOL);
	/*
	 * Set up the available slot from the pool the first time.
	 */
	slot = &log->slot_pool[0];
	/*
	 * We cannot initialize the release LSN in the activate function
	 * because that function can be called after a log file switch.
	 */
	slot->slot_release_lsn = log->alloc_lsn;
	__wt_log_slot_activate(session, slot);
	log->active_slot = slot;

	if (0) {
err:		while (--i >= 0)
			__wt_buf_free(session, &log->slot_pool[i].slot_buf);
	}
	return (ret);
}
/*
 * __wt_bm_read --
 *	Map or read address cookie referenced block into a buffer.
 */
int
__wt_bm_read(WT_BM *bm, WT_SESSION_IMPL *session,
    WT_ITEM *buf, const uint8_t *addr, size_t addr_size)
{
	WT_BLOCK *block;
	WT_DECL_RET;
	WT_FILE_HANDLE *handle;
	wt_off_t offset;
	uint32_t cksum, size;
	bool mapped;

	WT_UNUSED(addr_size);
	block = bm->block;

	/* Crack the cookie. */
	WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum));

	/*
	 * Map the block if it's possible.
	 */
	handle = block->fh->handle;
	mapped = bm->map != NULL && offset + size <= (wt_off_t)bm->maplen;
	if (mapped && handle->fh_map_preload != NULL) {
		buf->data = (uint8_t *)bm->map + offset;
		buf->size = size;
		ret = handle->fh_map_preload(handle, (WT_SESSION *)session,
		    buf->data, buf->size, bm->mapped_cookie);

		WT_STAT_FAST_CONN_INCR(session, block_map_read);
		WT_STAT_FAST_CONN_INCRV(session, block_byte_map_read, size);
		return (ret);
	}

#ifdef HAVE_DIAGNOSTIC
	/*
	 * In diagnostic mode, verify the block we're about to read isn't on
	 * the available list, or for live systems, the discard list.
	 */
	WT_RET(__wt_block_misplaced(
	    session, block, "read", offset, size, bm->is_live));
#endif
	/* Read the block. */
	WT_RET(__wt_block_read_off(session, block, buf, offset, size, cksum));

	/* Optionally discard blocks from the system's buffer cache. */
	WT_RET(__wt_block_discard(session, block, (size_t)size));

	return (0);
}
/*
 * __wt_log_slot_init --
 *	Initialize the slot array.
 */
int
__wt_log_slot_init(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_LOG *log;
	WT_LOGSLOT *slot;
	int32_t i;

	conn = S2C(session);
	log = conn->log;
	for (i = 0; i < WT_SLOT_POOL; i++) {
		log->slot_pool[i].slot_state = WT_LOG_SLOT_FREE;
		log->slot_pool[i].slot_index = WT_SLOT_INVALID_INDEX;
	}

	/*
	 * Set up the available slots from the pool the first time.
	 */
	for (i = 0; i < WT_SLOT_ACTIVE; i++) {
		slot = &log->slot_pool[i];
		slot->slot_index = (uint32_t)i;
		slot->slot_state = WT_LOG_SLOT_READY;
		log->slot_array[i] = slot;
	}

	/*
	 * Allocate memory for buffers now that the arrays are set up.  Split
	 * this out to make error handling simpler.
	 *
	 * Cap the slot buffer to the log file size.
	 */
	log->slot_buf_size =
	    WT_MIN((size_t)conn->log_file_max, WT_LOG_SLOT_BUF_SIZE);
	for (i = 0; i < WT_SLOT_POOL; i++) {
		WT_ERR(__wt_buf_init(session,
		    &log->slot_pool[i].slot_buf, log->slot_buf_size));
		F_SET(&log->slot_pool[i], WT_SLOT_INIT_FLAGS);
	}
	WT_STAT_FAST_CONN_INCRV(session,
	    log_buffer_size, log->slot_buf_size * WT_SLOT_POOL);
	if (0) {
err:		while (--i >= 0)
			__wt_buf_free(session, &log->slot_pool[i].slot_buf);
	}
	return (ret);
}
/*
 * __wt_log_slot_close --
 *	Close a slot and do not allow any other threads to join this slot.
 *	Remove this from the active slot array and move a new slot from
 *	the pool into its place.  Set up the size of this group;
 *	must be called with the logging spinlock held.
 */
int
__wt_log_slot_close(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
{
	WT_CONNECTION_IMPL *conn;
	WT_LOG *log;
	WT_LOGSLOT *newslot;
	int64_t old_state;

	conn = S2C(session);
	log = conn->log;
	/*
	 * Find an unused slot in the pool.
	 */
	WT_RET(__log_slot_find_free(session, &newslot));

	/*
	 * Swap out the slot we're going to use and put a free one in the
	 * slot array in its place so that threads can use it right away.
	 */
	WT_STAT_FAST_CONN_INCR(session, log_slot_closes);
	newslot->slot_state = WT_LOG_SLOT_READY;
	newslot->slot_index = slot->slot_index;
	log->slot_array[newslot->slot_index] = newslot;
	old_state = WT_ATOMIC_STORE8(slot->slot_state, WT_LOG_SLOT_PENDING);
	slot->slot_group_size = (uint64_t)(old_state - WT_LOG_SLOT_READY);
	/*
	 * Note that this statistic may be much bigger than in reality,
	 * especially when compared with the total bytes written in
	 * __log_fill.  The reason is that this size reflects any
	 * rounding up that is needed and the total bytes in __log_fill
	 * is the amount of user bytes.
	 */
	WT_STAT_FAST_CONN_INCRV(session,
	    log_slot_consolidated, (uint64_t)slot->slot_group_size);
	return (0);
}
/*
 * __wt_log_write --
 *	Write a record into the log.
 */
int
__wt_log_write(WT_SESSION_IMPL *session,
    WT_ITEM *record, WT_LSN *lsnp, uint32_t flags)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_LOG *log;
	WT_LOG_RECORD *logrec;
	WT_LSN lsn;
	WT_MYSLOT myslot;
	uint32_t rdup_len;
	int locked;

	conn = S2C(session);
	log = conn->log;
	locked = 0;
	INIT_LSN(&lsn);
	myslot.slot = NULL;
	/*
	 * Assume the WT_ITEM the user passed is a WT_LOG_RECORD, which has
	 * a header at the beginning for us to fill in.
	 *
	 * If using direct_io, the caller should pass us an aligned record.
	 * But we need to make sure it is big enough and zero-filled so
	 * that we can write the full amount.  Do this whether or not
	 * direct_io is in use because it makes the reading code cleaner.
	 */
	WT_STAT_FAST_CONN_INCRV(session, log_bytes_user, record->size);
	rdup_len = __wt_rduppo2((uint32_t)record->size, log->allocsize);
	WT_ERR(__wt_buf_grow(session, record, rdup_len));
	WT_ASSERT(session, record->data == record->mem);
	/*
	 * If the caller's record only partially fills the necessary
	 * space, we need to zero-fill the remainder.
	 */
	if (record->size != rdup_len) {
		memset((uint8_t *)record->mem + record->size, 0,
		    rdup_len - record->size);
		record->size = rdup_len;
	}
	logrec = (WT_LOG_RECORD *)record->mem;
	logrec->len = (uint32_t)record->size;
	logrec->checksum = 0;
	logrec->checksum = __wt_cksum(logrec, record->size);

	WT_STAT_FAST_CONN_INCR(session, log_writes);

	if (!F_ISSET(log, WT_LOG_FORCE_CONSOLIDATE)) {
		ret = __log_direct_write(session, record, lsnp, flags);
		if (ret == 0)
			return (0);
		if (ret != EAGAIN)
			WT_ERR(ret);
		/*
		 * An EAGAIN return means we failed to get the try lock -
		 * fall through to the consolidation code in that case.
		 */
	}

	/*
	 * As soon as we see contention for the log slot, disable direct
	 * log writes.  We get better performance by forcing writes through
	 * the consolidation code.  This is because individual writes flood
	 * the I/O system faster than they contend on the log slot lock.
	 */
	F_SET(log, WT_LOG_FORCE_CONSOLIDATE);
	if ((ret = __wt_log_slot_join(
	    session, rdup_len, flags, &myslot)) == ENOMEM) {
		/*
		 * If we couldn't find a consolidated slot for this record
		 * write the record directly.
		 */
		while ((ret = __log_direct_write(
		    session, record, lsnp, flags)) == EAGAIN)
			;
		WT_ERR(ret);
		/*
		 * Increase the buffer size of any slots we can get access
		 * to, so future consolidations are likely to succeed.
		 */
		WT_ERR(__wt_log_slot_grow_buffers(session, 4 * rdup_len));
		return (0);
	}
	WT_ERR(ret);
	if (myslot.offset == 0) {
		__wt_spin_lock(session, &log->log_slot_lock);
		locked = 1;
		WT_ERR(__wt_log_slot_close(session, myslot.slot));
		WT_ERR(__log_acquire(
		    session, myslot.slot->slot_group_size, myslot.slot));
		__wt_spin_unlock(session, &log->log_slot_lock);
		locked = 0;
		WT_ERR(__wt_log_slot_notify(session, myslot.slot));
	} else
		WT_ERR(__wt_log_slot_wait(session, myslot.slot));
	WT_ERR(__log_fill(session, &myslot, 0, record, &lsn));
	if (__wt_log_slot_release(myslot.slot, rdup_len) == WT_LOG_SLOT_DONE) {
		WT_ERR(__log_release(session, myslot.slot));
		WT_ERR(__wt_log_slot_free(myslot.slot));
	} else if (LF_ISSET(WT_LOG_FSYNC)) {
		/* Wait for our writes to reach disk. */
		while (LOG_CMP(&log->sync_lsn, &lsn) <= 0 &&
		    myslot.slot->slot_error == 0)
			(void)__wt_cond_wait(
			    session, log->log_sync_cond, 10000);
	}
err:
	if (locked)
		__wt_spin_unlock(session, &log->log_slot_lock);
	if (ret == 0 && lsnp != NULL)
		*lsnp = lsn;
	/*
	 * If we're synchronous and some thread had an error, we don't know
	 * if our write made it out to the file or not.  The error could be
	 * before or after us.  So, if anyone got an error, we report it.
	 * If we're not synchronous, only report if our own operation got
	 * an error.
	 */
	if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC) && ret == 0 &&
	    myslot.slot != NULL)
		ret = myslot.slot->slot_error;
	return (ret);
}
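/*
 * Illustration (not WiredTiger code): __wt_log_write above rounds the record
 * to the log's allocation size (via __wt_rduppo2) before zero-filling the
 * tail.  A minimal sketch of power-of-two round-up, assuming the alignment
 * is a power of two:
 */
#include <stdint.h>

/* Round n up to the next multiple of align; align must be 2^k. */
static uint32_t
round_up_pow2(uint32_t n, uint32_t align)
{
	return ((n + (align - 1)) & ~(align - 1));
}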
/*
 * __log_release --
 *	Release a log slot.
 */
static int
__log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_FH *close_fh;
	WT_LOG *log;
	WT_LSN sync_lsn;
	size_t write_size;
	WT_DECL_SPINLOCK_ID(id);	/* Must appear last */

	conn = S2C(session);
	log = conn->log;
	/*
	 * If we're going to have to close our log file, make a local copy
	 * of the file handle structure.
	 */
	close_fh = NULL;
	if (F_ISSET(slot, SLOT_CLOSEFH)) {
		close_fh = log->log_close_fh;
		log->log_close_fh = NULL;
		F_CLR(slot, SLOT_CLOSEFH);
	}

	/* Write the buffered records. */
	if (F_ISSET(slot, SLOT_BUFFERED)) {
		write_size = (size_t)
		    (slot->slot_end_lsn.offset - slot->slot_start_offset);
		WT_ERR(__wt_write(session, slot->slot_fh,
		    slot->slot_start_offset, write_size, slot->slot_buf.mem));
	}

	/*
	 * Wait for earlier groups to finish, otherwise there could be holes
	 * in the log file.
	 */
	while (LOG_CMP(&log->write_lsn, &slot->slot_release_lsn) != 0)
		__wt_yield();
	log->write_lsn = slot->slot_end_lsn;
	/*
	 * Try to consolidate calls to fsync to wait less.  Acquire a spin
	 * lock so that threads finishing writing to the log will wait while
	 * the current fsync completes and advance log->write_lsn.
	 */
	while (F_ISSET(slot, SLOT_SYNC) &&
	    LOG_CMP(&log->sync_lsn, &slot->slot_end_lsn) < 0) {
		if (__wt_spin_trylock(session, &log->log_sync_lock, &id) != 0) {
			(void)__wt_cond_wait(
			    session, log->log_sync_cond, 10000);
			continue;
		}
		/*
		 * Record the current end of log after we grabbed the lock.
		 * That is how far our fsync call will guarantee.
		 */
		sync_lsn = log->write_lsn;
		if (LOG_CMP(&log->sync_lsn, &slot->slot_end_lsn) < 0) {
			WT_STAT_FAST_CONN_INCR(session, log_sync);
			ret = __wt_fsync(session, log->log_fh);
			if (ret == 0) {
				F_CLR(slot, SLOT_SYNC);
				log->sync_lsn = sync_lsn;
				ret = __wt_cond_signal(
				    session, log->log_sync_cond);
			}
		}
		__wt_spin_unlock(session, &log->log_sync_lock);
		WT_ERR(ret);
	}
	if (F_ISSET(slot, SLOT_BUF_GROW)) {
		WT_STAT_FAST_CONN_INCR(session, log_buffer_grow);
		F_CLR(slot, SLOT_BUF_GROW);
		WT_STAT_FAST_CONN_INCRV(session,
		    log_buffer_size, slot->slot_buf.memsize);
		WT_ERR(__wt_buf_grow(session,
		    &slot->slot_buf, slot->slot_buf.memsize * 2));
	}
	/*
	 * If we have a file to close, close it now.
	 */
	if (close_fh)
		WT_ERR(__wt_close(session, close_fh));

err:	if (ret != 0 && slot->slot_error == 0)
		slot->slot_error = ret;
	return (ret);
}
/*
 * __log_slot_close --
 *	Close out the slot the caller is using.  The slot may already be
 *	closed or freed by another thread.
 */
static int
__log_slot_close(
    WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *releasep, bool forced)
{
	WT_CONNECTION_IMPL *conn;
	WT_LOG *log;
	int64_t end_offset, new_state, old_state;

	WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT));
	WT_ASSERT(session, releasep != NULL);
	conn = S2C(session);
	log = conn->log;
	*releasep = 0;
	if (slot == NULL)
		return (WT_NOTFOUND);
retry:
	old_state = slot->slot_state;
	/*
	 * If this close is coming from a forced close and a thread is in
	 * the middle of using the slot, return EBUSY.  The caller can
	 * decide if retrying is necessary or not.
	 */
	if (forced && WT_LOG_SLOT_INPROGRESS(old_state))
		return (EBUSY);
	/*
	 * If someone else is switching out this slot we lost.  Nothing to
	 * do but return.  Return WT_NOTFOUND anytime the given slot was
	 * processed by another closing thread.  Only return 0 when we
	 * actually closed the slot.
	 */
	if (WT_LOG_SLOT_CLOSED(old_state))
		return (WT_NOTFOUND);
	/*
	 * If someone completely processed this slot, we're done.
	 */
	if (FLD64_ISSET((uint64_t)slot->slot_state, WT_LOG_SLOT_RESERVED))
		return (WT_NOTFOUND);
	new_state = (old_state | WT_LOG_SLOT_CLOSE);
	/*
	 * Close this slot.  If we lose the race retry.
	 */
	if (!__wt_atomic_casiv64(&slot->slot_state, old_state, new_state))
		goto retry;
	/*
	 * We own the slot now.  No one else can join.
	 * Set the end LSN.
	 */
	WT_STAT_FAST_CONN_INCR(session, log_slot_closes);
	if (WT_LOG_SLOT_DONE(new_state))
		*releasep = 1;
	slot->slot_end_lsn = slot->slot_start_lsn;
	end_offset =
	    WT_LOG_SLOT_JOINED_BUFFERED(old_state) + slot->slot_unbuffered;
	slot->slot_end_lsn.offset += (wt_off_t)end_offset;
	WT_STAT_FAST_CONN_INCRV(session, log_slot_consolidated, end_offset);
	/*
	 * XXX Would like to change so one piece of code advances the LSN.
	 */
	log->alloc_lsn = slot->slot_end_lsn;
	WT_ASSERT(session, log->alloc_lsn.file >= log->write_lsn.file);
	return (0);
}
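/*
 * Illustration (not WiredTiger code): closing a slot by atomically setting a
 * "close" bit with compare-and-swap, retrying on contention, the same shape
 * as __log_slot_close above.  A minimal sketch using C11 atomics; the state
 * layout and bit value are hypothetical.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define SLOT_CLOSE_BIT	((int64_t)1 << 62)	/* Hypothetical flag bit */

/* Set the close bit exactly once; return false if already closed. */
static bool
slot_close(_Atomic int64_t *state)
{
	int64_t old_state, new_state;

	for (;;) {
		old_state = atomic_load(state);
		if (old_state & SLOT_CLOSE_BIT)
			return (false);		/* Another thread closed it */
		new_state = old_state | SLOT_CLOSE_BIT;
		/* A joining thread may change the state under us: retry. */
		if (atomic_compare_exchange_weak(state, &old_state, new_state))
			return (true);
	}
}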
/*
 * __wt_block_read_off --
 *	Read an addr/size pair referenced block into a buffer.
 */
int
__wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
    WT_ITEM *buf, wt_off_t offset, uint32_t size, uint32_t cksum)
{
	WT_BLOCK_HEADER *blk, swap;
	size_t bufsize;
	uint32_t page_cksum;

	__wt_verbose(session, WT_VERB_READ,
	    "off %" PRIuMAX ", size %" PRIu32 ", cksum %" PRIu32,
	    (uintmax_t)offset, size, cksum);

	WT_STAT_FAST_CONN_INCR(session, block_read);
	WT_STAT_FAST_CONN_INCRV(session, block_byte_read, size);

	/*
	 * Grow the buffer as necessary and read the block.  Buffers should be
	 * aligned for reading, but there are lots of buffers (for example,
	 * file cursors have two buffers each, key and value), and it's
	 * difficult to be sure we've found all of them.  If the buffer isn't
	 * aligned, it's an easy fix: set the flag and guarantee we reallocate
	 * it.  (Most of the time on reads, the buffer memory has not yet been
	 * allocated, so we're not adding any additional processing time.)
	 */
	if (F_ISSET(buf, WT_ITEM_ALIGNED))
		bufsize = size;
	else {
		F_SET(buf, WT_ITEM_ALIGNED);
		bufsize = WT_MAX(size, buf->memsize + 10);
	}
	WT_RET(__wt_buf_init(session, buf, bufsize));
	WT_RET(__wt_read(session, block->fh, offset, size, buf->mem));
	buf->size = size;

	/*
	 * We incrementally read through the structure before doing a
	 * checksum, do little- to big-endian handling early on, and then
	 * select from the original or swapped structure as needed.
	 */
	blk = WT_BLOCK_HEADER_REF(buf->mem);
	__wt_block_header_byteswap_copy(blk, &swap);
	if (swap.cksum == cksum) {
		blk->cksum = 0;
		page_cksum = __wt_cksum(buf->mem,
		    F_ISSET(&swap, WT_BLOCK_DATA_CKSUM) ?
		    size : WT_BLOCK_COMPRESS_SKIP);
		if (page_cksum == cksum) {
			/*
			 * Swap the page-header as needed; this doesn't belong
			 * here, but it's the best place to catch all callers.
			 */
			__wt_page_header_byteswap(buf->mem);
			return (0);
		}
		if (!F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE))
			__wt_errx(session,
			    "read checksum error for %" PRIu32 "B block at "
			    "offset %" PRIuMAX ": calculated block checksum "
			    "of %" PRIu32 " doesn't match expected checksum "
			    "of %" PRIu32,
			    size, (uintmax_t)offset, page_cksum, cksum);
	} else if (!F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE))
		__wt_errx(session,
		    "read checksum error for %" PRIu32 "B block at "
		    "offset %" PRIuMAX ": block header checksum "
		    "of %" PRIu32 " doesn't match expected checksum "
		    "of %" PRIu32,
		    size, (uintmax_t)offset, swap.cksum, cksum);

	/* Panic if a checksum fails during an ordinary read. */
	return (block->verify ||
	    F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE) ?
	    WT_ERROR : __wt_illegal_value(session, block->name));
}
/*
 * __wt_block_write_off --
 *	Write a buffer into a block, returning the block's offset, size and
 *	checksum.
 */
int
__wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
    WT_ITEM *buf, wt_off_t *offsetp, uint32_t *sizep, uint32_t *cksump,
    int data_cksum, int caller_locked)
{
	WT_BLOCK_HEADER *blk;
	WT_DECL_RET;
	WT_FH *fh;
	size_t align_size;
	wt_off_t offset;
	int local_locked;

	blk = WT_BLOCK_HEADER_REF(buf->mem);
	fh = block->fh;
	local_locked = 0;

	/* Buffers should be aligned for writing. */
	if (!F_ISSET(buf, WT_ITEM_ALIGNED)) {
		WT_ASSERT(session, F_ISSET(buf, WT_ITEM_ALIGNED));
		WT_RET_MSG(session, EINVAL,
		    "direct I/O check: write buffer incorrectly allocated");
	}

	/*
	 * Align the size to an allocation unit.
	 *
	 * The buffer must be big enough for us to zero to the next allocsize
	 * boundary, this is one of the reasons the btree layer must find out
	 * from the block-manager layer the maximum size of the eventual
	 * write.
	 */
	align_size = WT_ALIGN(buf->size, block->allocsize);
	if (align_size > buf->memsize) {
		WT_ASSERT(session, align_size <= buf->memsize);
		WT_RET_MSG(session, EINVAL,
		    "buffer size check: write buffer incorrectly allocated");
	}
	if (align_size > UINT32_MAX) {
		WT_ASSERT(session, align_size <= UINT32_MAX);
		WT_RET_MSG(session, EINVAL,
		    "buffer size check: write buffer too large to write");
	}

	/* Zero out any unused bytes at the end of the buffer. */
	memset((uint8_t *)buf->mem + buf->size, 0, align_size - buf->size);

	/*
	 * Set the disk size so we don't have to incrementally read blocks
	 * during salvage.
	 */
	blk->disk_size = WT_STORE_SIZE(align_size);

	/*
	 * Update the block's checksum: if our caller specifies, checksum the
	 * complete data, otherwise checksum the leading WT_BLOCK_COMPRESS_SKIP
	 * bytes.  The assumption is applications with good compression
	 * support turn off checksums and assume corrupted blocks won't
	 * decompress correctly.  However, if compression failed to shrink the
	 * block, the block wasn't compressed, in which case our caller will
	 * tell us to checksum the data to detect corruption.  If compression
	 * succeeded, we still need to checksum the first
	 * WT_BLOCK_COMPRESS_SKIP bytes because they're not compressed, both
	 * to give salvage a quick test of whether a block is useful and to
	 * give us a test so we don't lose the first WT_BLOCK_COMPRESS_SKIP
	 * bytes without noticing.
	 */
	blk->flags = 0;
	if (data_cksum)
		F_SET(blk, WT_BLOCK_DATA_CKSUM);
	blk->cksum = 0;
	blk->cksum = __wt_cksum(
	    buf->mem, data_cksum ? align_size : WT_BLOCK_COMPRESS_SKIP);

	if (!caller_locked) {
		WT_RET(__wt_block_ext_prealloc(session, 5));
		__wt_spin_lock(session, &block->live_lock);
		local_locked = 1;
	}
	ret = __wt_block_alloc(session, block, &offset, (wt_off_t)align_size);

	/*
	 * Extend the file in chunks.  We want to limit the number of threads
	 * extending the file at the same time, so choose the one thread
	 * that's crossing the extended boundary.  We don't extend newly
	 * created files, and it's theoretically possible we might wait so
	 * long our extension of the file is passed by another thread writing
	 * single blocks, that's why there's a check in case the extended
	 * file size becomes too small: if the file size catches up, every
	 * thread tries to extend it.
	 *
	 * File extension may require locking: some variants of the system
	 * call used to extend the file initialize the extended space.  If a
	 * writing thread races with the extending thread, the extending
	 * thread might overwrite already written data, and that would be
	 * very, very bad.
	 *
	 * Some variants of the system call to extend the file fail at
	 * run-time based on the filesystem type, fall back to ftruncate in
	 * that case, and remember that ftruncate requires locking.
	 */
	if (ret == 0 &&
	    fh->extend_len != 0 &&
	    (fh->extend_size <= fh->size ||
	    (offset + fh->extend_len <= fh->extend_size &&
	    offset +
	    fh->extend_len + (wt_off_t)align_size >= fh->extend_size))) {
		fh->extend_size = offset + fh->extend_len * 2;
		if (fh->fallocate_available != WT_FALLOCATE_NOT_AVAILABLE) {
			/*
			 * Release any locally acquired lock if it's not
			 * needed to extend the file, extending the file might
			 * require updating file metadata, which can be slow.
			 * (It may be a bad idea to configure for file
			 * extension on systems that require locking over the
			 * extend call.)
			 */
			if (!fh->fallocate_requires_locking && local_locked) {
				__wt_spin_unlock(session, &block->live_lock);
				local_locked = 0;
			}

			/* Extend the file. */
			if ((ret = __wt_fallocate(session,
			    fh, offset, fh->extend_len * 2)) == ENOTSUP) {
				ret = 0;
				goto extend_truncate;
			}
		} else {
extend_truncate:
			/*
			 * We may have a caller lock or a locally acquired
			 * lock, but we need a lock to call ftruncate.
			 */
			if (!caller_locked && local_locked == 0) {
				__wt_spin_lock(session, &block->live_lock);
				local_locked = 1;
			}
			/*
			 * The truncate might fail if there's a file mapping
			 * (if there's an open checkpoint on the file), that's
			 * OK.
			 */
			if ((ret = __wt_ftruncate(session,
			    fh, offset + fh->extend_len * 2)) == EBUSY)
				ret = 0;
		}
	}
	/* Release any locally acquired lock. */
	if (local_locked) {
		__wt_spin_unlock(session, &block->live_lock);
		local_locked = 0;
	}
	WT_RET(ret);

	/* Write the block. */
	if ((ret =
	    __wt_write(session, fh, offset, align_size, buf->mem)) != 0) {
		if (!caller_locked)
			__wt_spin_lock(session, &block->live_lock);
		WT_TRET(__wt_block_off_free(
		    session, block, offset, (wt_off_t)align_size));
		if (!caller_locked)
			__wt_spin_unlock(session, &block->live_lock);
		WT_RET(ret);
	}

#ifdef HAVE_SYNC_FILE_RANGE
	/*
	 * Optionally schedule writes for dirty pages in the system buffer
	 * cache, but only if the current session can wait.
	 */
	if (block->os_cache_dirty_max != 0 &&
	    (block->os_cache_dirty += align_size) > block->os_cache_dirty_max &&
	    __wt_session_can_wait(session)) {
		block->os_cache_dirty = 0;
		WT_RET(__wt_fsync_async(session, fh));
	}
#endif
#ifdef HAVE_POSIX_FADVISE
	/* Optionally discard blocks from the system buffer cache. */
	if (block->os_cache_max != 0 &&
	    (block->os_cache += align_size) > block->os_cache_max) {
		block->os_cache = 0;
		if ((ret = posix_fadvise(fh->fd,
		    (wt_off_t)0, (wt_off_t)0, POSIX_FADV_DONTNEED)) != 0)
			WT_RET_MSG(
			    session, ret, "%s: posix_fadvise", block->name);
	}
#endif
	WT_STAT_FAST_CONN_INCR(session, block_write);
	WT_STAT_FAST_CONN_INCRV(session, block_byte_write, align_size);

	WT_RET(__wt_verbose(session, WT_VERB_WRITE,
	    "off %" PRIuMAX ", size %" PRIuMAX ", cksum %" PRIu32,
	    (uintmax_t)offset, (uintmax_t)align_size, blk->cksum));

	*offsetp = offset;
	*sizep = WT_STORE_SIZE(align_size);
	*cksump = blk->cksum;

	return (ret);
}
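/*
 * Illustration (not WiredTiger code): extending a file in large chunks with
 * posix_fallocate(), falling back to ftruncate() when the filesystem doesn't
 * support preallocation, the same shape as the extension logic above.  A
 * minimal POSIX sketch; the chunk size is arbitrary.
 */
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

#define EXTEND_CHUNK	(16 * 1024 * 1024)	/* 16MB, arbitrary */

/* Preallocate space past "offset", falling back to a truncate. */
static int
extend_file(int fd, off_t offset)
{
	int ret;

	/* posix_fallocate returns an error number, it doesn't set errno. */
	ret = posix_fallocate(fd, offset, (off_t)EXTEND_CHUNK);
	if (ret == EINVAL || ret == ENOTSUP)
		/*
		 * Preallocation unsupported here: grow the file size instead
		 * (ftruncate doesn't reserve blocks, but still avoids
		 * repeated metadata updates on append).
		 */
		ret = ftruncate(fd, offset + EXTEND_CHUNK) == -1 ? errno : 0;
	return (ret);
}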
/*
 * __wt_bt_write --
 *	Write a buffer into a block, returning the block's addr/size and
 *	checksum.
 */
int
__wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf,
    uint8_t *addr, size_t *addr_sizep, bool checkpoint, bool compressed)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_ITEM *ip;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_PAGE_HEADER *dsk;
	size_t dst_len, len, result_len, size, src_len;
	int compression_failed;		/* Extension API, so not a bool. */
	uint8_t *dst, *src;
	bool data_cksum;

	btree = S2BT(session);
	bm = btree->bm;

	/* Checkpoint calls are different than standard calls. */
	WT_ASSERT(session,
	    (!checkpoint && addr != NULL && addr_sizep != NULL) ||
	    (checkpoint && addr == NULL && addr_sizep == NULL));

#ifdef HAVE_DIAGNOSTIC
	/*
	 * We're passed a table's disk image.  Decompress if necessary and
	 * verify the image.  Always check the in-memory length for accuracy.
	 */
	dsk = buf->mem;
	if (compressed) {
		WT_ERR(__wt_scr_alloc(session, dsk->mem_size, &tmp));

		memcpy(tmp->mem, buf->data, WT_BLOCK_COMPRESS_SKIP);
		WT_ERR(btree->compressor->decompress(
		    btree->compressor, &session->iface,
		    (uint8_t *)buf->data + WT_BLOCK_COMPRESS_SKIP,
		    buf->size - WT_BLOCK_COMPRESS_SKIP,
		    (uint8_t *)tmp->data + WT_BLOCK_COMPRESS_SKIP,
		    tmp->memsize - WT_BLOCK_COMPRESS_SKIP,
		    &result_len));
		WT_ASSERT(session,
		    dsk->mem_size == result_len + WT_BLOCK_COMPRESS_SKIP);
		tmp->size = (uint32_t)result_len + WT_BLOCK_COMPRESS_SKIP;
		ip = tmp;
	} else {
		WT_ASSERT(session, dsk->mem_size == buf->size);
		ip = buf;
	}
	WT_ERR(__wt_verify_dsk(session, "[write-check]", ip));
	__wt_scr_free(session, &tmp);
#endif

	/*
	 * Optionally stream-compress the data, but don't compress blocks
	 * that are already as small as they're going to get.
	 */
	if (btree->compressor == NULL ||
	    btree->compressor->compress == NULL || compressed)
		ip = buf;
	else if (buf->size <= btree->allocsize) {
		ip = buf;
		WT_STAT_FAST_DATA_INCR(session, compress_write_too_small);
	} else {
		/* Skip the header bytes of the source data. */
		src = (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP;
		src_len = buf->size - WT_BLOCK_COMPRESS_SKIP;

		/*
		 * Compute the size needed for the destination buffer.  We
		 * only allocate enough memory for a copy of the original by
		 * default, if any compressed version is bigger than the
		 * original, we won't use it.  However, some compression
		 * engines (snappy is one example), may need more memory
		 * because they don't stop just because there's no more
		 * memory into which to compress.
		 */
		if (btree->compressor->pre_size == NULL)
			len = src_len;
		else
			WT_ERR(btree->compressor->pre_size(btree->compressor,
			    &session->iface, src, src_len, &len));
		size = len + WT_BLOCK_COMPRESS_SKIP;
		WT_ERR(bm->write_size(bm, session, &size));
		WT_ERR(__wt_scr_alloc(session, size, &tmp));

		/* Skip the header bytes of the destination data. */
		dst = (uint8_t *)tmp->mem + WT_BLOCK_COMPRESS_SKIP;
		dst_len = len;

		compression_failed = 0;
		WT_ERR(btree->compressor->compress(btree->compressor,
		    &session->iface,
		    src, src_len,
		    dst, dst_len,
		    &result_len, &compression_failed));
		result_len += WT_BLOCK_COMPRESS_SKIP;

		/*
		 * If compression fails, or doesn't gain us at least one unit
		 * of allocation, fall back to the original version.  This
		 * isn't unexpected: if compression doesn't work for some
		 * chunk of data for some reason (noting likely additional
		 * format/header information which compressed output
		 * requires), it just means the uncompressed version is as
		 * good as it gets, and that's what we use.
		 */
		if (compression_failed ||
		    buf->size / btree->allocsize <=
		    result_len / btree->allocsize) {
			ip = buf;
			WT_STAT_FAST_DATA_INCR(session, compress_write_fail);
		} else {
			compressed = true;
			WT_STAT_FAST_DATA_INCR(session, compress_write);

			/*
			 * Copy in the skipped header bytes, set the final
			 * data size.
			 */
			memcpy(tmp->mem, buf->mem, WT_BLOCK_COMPRESS_SKIP);
			tmp->size = result_len;
			ip = tmp;
		}
	}
	dsk = ip->mem;

	/* If the buffer is compressed, set the flag. */
	if (compressed)
		F_SET(dsk, WT_PAGE_COMPRESSED);

	/*
	 * We increment the block's write generation so it's easy to identify
	 * newer versions of blocks during salvage.  (It's common in
	 * WiredTiger, at least for the default block manager, for multiple
	 * blocks to be internally consistent with identical first and last
	 * keys, so we need a way to know the most recent state of the block.
	 * We could check which leaf is referenced by a valid internal page,
	 * but that implies salvaging internal pages, which I don't want to
	 * do, and it's not as good anyway, because the internal page may not
	 * have been written after the leaf page was updated.  So, write
	 * generations it is.
	 *
	 * Nothing is locked at this point but two versions of a page with
	 * the same generation is pretty unlikely, and if we did, they're
	 * going to be roughly identical for the purposes of salvage, anyway.
	 */
	dsk->write_gen = ++btree->write_gen;

	/*
	 * Checksum the data if the buffer isn't compressed or checksums are
	 * configured.
	 */
	switch (btree->checksum) {
	case CKSUM_ON:
		data_cksum = true;
		break;
	case CKSUM_OFF:
		data_cksum = false;
		break;
	case CKSUM_UNCOMPRESSED:
	default:
		data_cksum = !compressed;
		break;
	}

	/* Call the block manager to write the block. */
	WT_ERR(checkpoint ?
	    bm->checkpoint(bm, session, ip, btree->ckpt, data_cksum) :
	    bm->write(bm, session, ip, addr, addr_sizep, data_cksum));

	WT_STAT_FAST_CONN_INCR(session, cache_write);
	WT_STAT_FAST_DATA_INCR(session, cache_write);
	WT_STAT_FAST_CONN_INCRV(session, cache_bytes_write, dsk->mem_size);
	WT_STAT_FAST_DATA_INCRV(session, cache_bytes_write, dsk->mem_size);

err:	__wt_scr_free(session, &tmp);
	return (ret);
}
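/*
 * Illustration (not WiredTiger code): the acceptance test above keeps the
 * compressed image only if it saves at least one allocation unit, since
 * blocks are rounded up to allocsize on disk anyway.  A minimal sketch of
 * that arithmetic:
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Return true if the compressed size actually saves disk allocation. */
static bool
compression_worthwhile(size_t orig, size_t comp, size_t allocsize)
{
	/* Integer division rounds both sizes down to allocation units. */
	return (comp / allocsize < orig / allocsize);
}

int
main(void)
{
	/* 7000 -> 4500 bytes with 4KB units: same unit count, skip. */
	printf("%d\n", compression_worthwhile(7000, 4500, 4096));	/* 0 */
	/* 9000 -> 4000 bytes: two units down to zero, worthwhile. */
	printf("%d\n", compression_worthwhile(9000, 4000, 4096));	/* 1 */
	return (0);
}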
/*
 * __wt_bt_read --
 *	Read a cookie referenced block into a buffer.
 */
int
__wt_bt_read(WT_SESSION_IMPL *session,
    WT_ITEM *buf, const uint8_t *addr, size_t addr_size)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_DECL_ITEM(etmp);
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_ENCRYPTOR *encryptor;
	WT_ITEM *ip;
	const WT_PAGE_HEADER *dsk;
	const char *fail_msg;
	size_t result_len;

	btree = S2BT(session);
	bm = btree->bm;
	fail_msg = NULL;			/* -Wuninitialized */

	/*
	 * If anticipating a compressed or encrypted block, read into a
	 * scratch buffer and decompress into the caller's buffer.  Else,
	 * read directly into the caller's buffer.
	 */
	if (btree->compressor == NULL && btree->kencryptor == NULL) {
		WT_RET(bm->read(bm, session, buf, addr, addr_size));
		dsk = buf->data;
		ip = NULL;
	} else {
		WT_RET(__wt_scr_alloc(session, 0, &tmp));
		WT_ERR(bm->read(bm, session, tmp, addr, addr_size));
		dsk = tmp->data;
		ip = tmp;
	}

	/*
	 * If the block is encrypted, copy the skipped bytes of the original
	 * image into place, then decrypt.
	 */
	if (F_ISSET(dsk, WT_PAGE_ENCRYPTED)) {
		if (btree->kencryptor == NULL ||
		    (encryptor = btree->kencryptor->encryptor) == NULL ||
		    encryptor->decrypt == NULL) {
			fail_msg =
			    "encrypted block in file for which no encryption "
			    "configured";
			goto corrupt;
		}

		WT_ERR(__wt_scr_alloc(session, 0, &etmp));
		if ((ret = __wt_decrypt(session,
		    encryptor, WT_BLOCK_ENCRYPT_SKIP, ip, etmp)) != 0) {
			fail_msg = "block decryption failed";
			goto corrupt;
		}

		ip = etmp;
		dsk = ip->data;
	} else if (btree->kencryptor != NULL) {
		fail_msg =
		    "unencrypted block in file for which encryption "
		    "configured";
		goto corrupt;
	}

	if (F_ISSET(dsk, WT_PAGE_COMPRESSED)) {
		if (btree->compressor == NULL ||
		    btree->compressor->decompress == NULL) {
			fail_msg =
			    "compressed block in file for which no "
			    "compression configured";
			goto corrupt;
		}

		/*
		 * Size the buffer based on the in-memory bytes we're
		 * expecting from decompression.
		 */
		WT_ERR(__wt_buf_initsize(session, buf, dsk->mem_size));

		/*
		 * Note the source length is NOT the number of compressed
		 * bytes, it's the length of the block we just read (minus
		 * the skipped bytes).  We don't store the number of
		 * compressed bytes: some compression engines need that
		 * length stored externally, they don't have markers in the
		 * stream to signal the end of the compressed bytes.  Those
		 * engines must store the compressed byte length somehow, see
		 * the snappy compression extension for an example.
		 */
		memcpy(buf->mem, ip->data, WT_BLOCK_COMPRESS_SKIP);
		ret = btree->compressor->decompress(
		    btree->compressor, &session->iface,
		    (uint8_t *)ip->data + WT_BLOCK_COMPRESS_SKIP,
		    tmp->size - WT_BLOCK_COMPRESS_SKIP,
		    (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP,
		    dsk->mem_size - WT_BLOCK_COMPRESS_SKIP,
		    &result_len);

		/*
		 * If checksums were turned off because we're depending on
		 * the decompression to fail on any corrupted data, we'll end
		 * up here after corruption happens.  If we're salvaging the
		 * file, it's OK, otherwise it's really, really bad.
		 */
		if (ret != 0 ||
		    result_len != dsk->mem_size - WT_BLOCK_COMPRESS_SKIP) {
			fail_msg = "block decompression failed";
			goto corrupt;
		}
	} else
		/*
		 * If we uncompressed above, the page is in the correct
		 * buffer.  If we get here the data may be in the wrong
		 * buffer and the buffer may be the wrong size.  If needed,
		 * get the page into the destination buffer.
		 */
		if (ip != NULL)
			WT_ERR(__wt_buf_set(
			    session, buf, ip->data, dsk->mem_size));

	/* If the handle is a verify handle, verify the physical page. */
	if (F_ISSET(btree, WT_BTREE_VERIFY)) {
		if (tmp == NULL)
			WT_ERR(__wt_scr_alloc(session, 0, &tmp));
		WT_ERR(bm->addr_string(bm, session, tmp, addr, addr_size));
		WT_ERR(__wt_verify_dsk(session, tmp->data, buf));
	}

	WT_STAT_FAST_CONN_INCR(session, cache_read);
	WT_STAT_FAST_DATA_INCR(session, cache_read);
	if (F_ISSET(dsk, WT_PAGE_COMPRESSED))
		WT_STAT_FAST_DATA_INCR(session, compress_read);
	WT_STAT_FAST_CONN_INCRV(session, cache_bytes_read, dsk->mem_size);
	WT_STAT_FAST_DATA_INCRV(session, cache_bytes_read, dsk->mem_size);

	if (0) {
corrupt:	if (ret == 0)
			ret = WT_ERROR;
		if (!F_ISSET(btree, WT_BTREE_VERIFY) &&
		    !F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE)) {
			__wt_err(session, ret, "%s", fail_msg);
			ret = __wt_illegal_value(session,
			    btree->dhandle->name);
		}
	}

err:	__wt_scr_free(session, &tmp);
	__wt_scr_free(session, &etmp);
	return (ret);
}
/*
 * __wt_lsm_merge --
 *	Merge a set of chunks of an LSM tree.
 */
int
__wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id)
{
	WT_BLOOM *bloom;
	WT_CURSOR *dest, *src;
	WT_DECL_RET;
	WT_ITEM key, value;
	WT_LSM_CHUNK *chunk;
	uint32_t generation;
	uint64_t insert_count, record_count;
	u_int dest_id, end_chunk, i, nchunks, start_chunk, start_id, verb;
	int tret;
	bool created_chunk, create_bloom, locked, in_sync;
	const char *cfg[3];
	const char *drop_cfg[] =
	    { WT_CONFIG_BASE(session, WT_SESSION_drop), "force", NULL };

	bloom = NULL;
	chunk = NULL;
	dest = src = NULL;
	start_id = 0;
	created_chunk = create_bloom = locked = in_sync = false;

	/* Fast path if it's obvious no merges could be done. */
	if (lsm_tree->nchunks < lsm_tree->merge_min &&
	    lsm_tree->merge_aggressiveness < WT_LSM_AGGRESSIVE_THRESHOLD)
		return (WT_NOTFOUND);

	/*
	 * Use the lsm_tree lock to read the chunks (so no switches occur),
	 * but avoid holding it while the merge is in progress: that may take
	 * a long time.
	 */
	WT_RET(__wt_lsm_tree_writelock(session, lsm_tree));
	locked = true;

	WT_ERR(__lsm_merge_span(session,
	    lsm_tree, id, &start_chunk, &end_chunk, &record_count));
	nchunks = (end_chunk + 1) - start_chunk;

	WT_ASSERT(session, nchunks > 0);
	start_id = lsm_tree->chunk[start_chunk]->id;

	/* Find the merge generation. */
	for (generation = 0, i = 0; i < nchunks; i++)
		generation = WT_MAX(generation,
		    lsm_tree->chunk[start_chunk + i]->generation + 1);

	WT_ERR(__wt_lsm_tree_writeunlock(session, lsm_tree));
	locked = false;

	/* Allocate an ID for the merge. */
	dest_id = __wt_atomic_add32(&lsm_tree->last, 1);

	/*
	 * We only want to do the chunk loop if we're running with verbose,
	 * so we wrap these statements in the conditional.  Avoid the loop
	 * in the normal path.
	 */
	if (WT_VERBOSE_ISSET(session, WT_VERB_LSM)) {
		WT_ERR(__wt_verbose(session, WT_VERB_LSM,
		    "Merging %s chunks %u-%u into %u (%" PRIu64 " records)"
		    ", generation %" PRIu32,
		    lsm_tree->name,
		    start_chunk, end_chunk, dest_id, record_count,
		    generation));
		for (verb = start_chunk; verb <= end_chunk; verb++)
			WT_ERR(__wt_verbose(session, WT_VERB_LSM,
			    "Merging %s: Chunk[%u] id %u, gen: %" PRIu32
			    ", size: %" PRIu64 ", records: %" PRIu64,
			    lsm_tree->name, verb, lsm_tree->chunk[verb]->id,
			    lsm_tree->chunk[verb]->generation,
			    lsm_tree->chunk[verb]->size,
			    lsm_tree->chunk[verb]->count));
	}

	WT_ERR(__wt_calloc_one(session, &chunk));
	created_chunk = true;
	chunk->id = dest_id;

	if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_MERGED) &&
	    (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST) ||
	    start_chunk > 0) && record_count > 0)
		create_bloom = true;

	/*
	 * Special setup for the merge cursor:
	 * first, reset to open the dependent cursors;
	 * then restrict the cursor to a specific number of chunks;
	 * then set MERGE so the cursor doesn't track updates to the tree.
	 */
	WT_ERR(__wt_open_cursor(session, lsm_tree->name, NULL, NULL, &src));
	F_SET(src, WT_CURSTD_RAW);
	WT_ERR(__wt_clsm_init_merge(src, start_chunk, start_id, nchunks));

	WT_WITH_SCHEMA_LOCK(session,
	    ret = __wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));
	WT_ERR(ret);
	if (create_bloom) {
		WT_ERR(__wt_lsm_tree_setup_bloom(session, lsm_tree, chunk));

		WT_ERR(__wt_bloom_create(session, chunk->bloom_uri,
		    lsm_tree->bloom_config, record_count,
		    lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count,
		    &bloom));
	}

	/* Discard pages we read as soon as we're done with them. */
	F_SET(session, WT_SESSION_NO_CACHE);

	cfg[0] = WT_CONFIG_BASE(session, WT_SESSION_open_cursor);
	cfg[1] = "bulk,raw,skip_sort_check";
	cfg[2] = NULL;
	WT_ERR(__wt_open_cursor(session, chunk->uri, NULL, cfg, &dest));

#define	LSM_MERGE_CHECK_INTERVAL	WT_THOUSAND
	for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) {
		if (insert_count % LSM_MERGE_CHECK_INTERVAL == 0) {
			if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE))
				WT_ERR(EINTR);

			WT_STAT_FAST_CONN_INCRV(session,
			    lsm_rows_merged, LSM_MERGE_CHECK_INTERVAL);
			++lsm_tree->merge_progressing;
		}

		WT_ERR(src->get_key(src, &key));
		dest->set_key(dest, &key);
		WT_ERR(src->get_value(src, &value));
		dest->set_value(dest, &value);
		WT_ERR(dest->insert(dest));
		if (create_bloom)
			WT_ERR(__wt_bloom_insert(bloom, &key));
	}
	WT_ERR_NOTFOUND_OK(ret);

	WT_STAT_FAST_CONN_INCRV(session,
	    lsm_rows_merged, insert_count % LSM_MERGE_CHECK_INTERVAL);
	++lsm_tree->merge_progressing;
	WT_ERR(__wt_verbose(session, WT_VERB_LSM,
	    "Bloom size for %" PRIu64 " has %" PRIu64 " items inserted.",
	    record_count, insert_count));

	/*
	 * Closing and syncing the files can take a while.  Set the
	 * merge_syncing field so that compact knows it is still in
	 * progress.
	 */
	(void)__wt_atomic_add32(&lsm_tree->merge_syncing, 1);
	in_sync = true;
	/*
	 * We've successfully created the new chunk.  Now install it.  We
	 * need to ensure that the NO_CACHE flag is cleared and the bloom
	 * filter is closed (even if a step fails), so track errors but don't
	 * return until we've cleaned up.
	 */
	WT_TRET(src->close(src));
	WT_TRET(dest->close(dest));
	src = dest = NULL;

	F_CLR(session, WT_SESSION_NO_CACHE);

	/*
	 * We're doing advisory reads to fault the new trees into cache.
	 * Don't block if the cache is full: our next unit of work may be to
	 * discard some trees to free space.
	 */
	F_SET(session, WT_SESSION_NO_EVICTION);

	if (create_bloom) {
		if (ret == 0)
			WT_TRET(__wt_bloom_finalize(bloom));

		/*
		 * Read in a key to make sure the Bloom filter's btree handle
		 * is open before it becomes visible to application threads.
		 * Otherwise application threads will stall while it is
		 * opened and internal pages are read into cache.
		 */
		if (ret == 0) {
			WT_CLEAR(key);
			WT_TRET_NOTFOUND_OK(__wt_bloom_get(bloom, &key));
		}

		WT_TRET(__wt_bloom_close(bloom));
		bloom = NULL;
	}
	WT_ERR(ret);

	/*
	 * Open a handle on the new chunk before application threads attempt
	 * to access it, opening it pre-loads internal pages into the file
	 * system cache.
	 */
	cfg[1] = "checkpoint=" WT_CHECKPOINT;
	WT_ERR(__wt_open_cursor(session, chunk->uri, NULL, cfg, &dest));
	WT_TRET(dest->close(dest));
	dest = NULL;
	++lsm_tree->merge_progressing;
	(void)__wt_atomic_sub32(&lsm_tree->merge_syncing, 1);
	in_sync = false;
	WT_ERR_NOTFOUND_OK(ret);

	WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk));
	WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
	locked = true;

	/*
	 * Check whether we raced with another merge, and adjust the chunk
	 * array offset as necessary.
	 */
	if (start_chunk >= lsm_tree->nchunks ||
	    lsm_tree->chunk[start_chunk]->id != start_id)
		for (start_chunk = 0;
		    start_chunk < lsm_tree->nchunks;
		    start_chunk++)
			if (lsm_tree->chunk[start_chunk]->id == start_id)
				break;

	/*
	 * It is safe to error out here - since the update can only fail
	 * prior to making updates to the tree.
	 */
	WT_ERR(__wt_lsm_merge_update_tree(
	    session, lsm_tree, start_chunk, nchunks, chunk));

	if (create_bloom)
		F_SET(chunk, WT_LSM_CHUNK_BLOOM);
	chunk->count = insert_count;
	chunk->generation = generation;
	F_SET(chunk, WT_LSM_CHUNK_ONDISK);

	/*
	 * We have no current way of continuing if the metadata update fails,
	 * so we will panic in that case.  Put some effort into cleaning up
	 * after ourselves here - so things have a chance of shutting down.
	 *
	 * Any errors that happened after the tree was locked are
	 * fatal - we can't guarantee the state of the tree.
	 */
	if ((ret = __wt_lsm_meta_write(session, lsm_tree)) != 0)
		WT_PANIC_ERR(session, ret, "Failed finalizing LSM merge");

	lsm_tree->dsk_gen++;

	/* Update the throttling while holding the tree lock. */
	__wt_lsm_tree_throttle(session, lsm_tree, true);

	/* Schedule a pass to discard old chunks. */
	WT_ERR(__wt_lsm_manager_push_entry(
	    session, WT_LSM_WORK_DROP, 0, lsm_tree));

err:	if (locked)
		WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
	if (in_sync)
		(void)__wt_atomic_sub32(&lsm_tree->merge_syncing, 1);
	if (src != NULL)
		WT_TRET(src->close(src));
	if (dest != NULL)
		WT_TRET(dest->close(dest));
	if (bloom != NULL)
		WT_TRET(__wt_bloom_close(bloom));
	if (ret != 0 && created_chunk) {
		/* Drop the newly-created files on error. */
		if (chunk->uri != NULL) {
			WT_WITH_SCHEMA_LOCK(session, tret =
			    __wt_schema_drop(session, chunk->uri, drop_cfg));
			WT_TRET(tret);
		}
		if (create_bloom && chunk->bloom_uri != NULL) {
			WT_WITH_SCHEMA_LOCK(session, tret = __wt_schema_drop(
			    session, chunk->bloom_uri, drop_cfg));
			WT_TRET(tret);
		}
		__wt_free(session, chunk->bloom_uri);
		__wt_free(session, chunk->uri);
		__wt_free(session, chunk);

		if (ret == EINTR)
			WT_TRET(__wt_verbose(session, WT_VERB_LSM,
			    "Merge aborted due to close"));
		else
			WT_TRET(__wt_verbose(session, WT_VERB_LSM,
			    "Merge failed with %s",
			    __wt_strerror(session, ret, NULL, 0)));
	}
	F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
	return (ret);
}
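/*
 * Illustration (not WiredTiger code): the merge loop above amortizes its
 * cancellation and statistics work by only checking every N records.  A
 * minimal sketch of the pattern with a hypothetical cancellation flag and
 * stubbed per-record work:
 */
#include <errno.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define CHECK_INTERVAL	1000	/* Same idea as LSM_MERGE_CHECK_INTERVAL */

static atomic_bool tree_active = true;	/* Hypothetical cancellation flag */

static int
process_record(uint64_t i)		/* Hypothetical per-record work */
{
	(void)i;
	return (0);
}

static int
merge_loop(uint64_t nrecords)
{
	uint64_t i;
	int ret;

	for (i = 0; i < nrecords; i++) {
		/* Cheap modulus test keeps the check off the hot path. */
		if (i % CHECK_INTERVAL == 0 && !atomic_load(&tree_active))
			return (EINTR);	/* Tree shutting down, abort merge */
		if ((ret = process_record(i)) != 0)
			return (ret);
	}
	return (0);
}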
/*
 * __wt_log_slot_close --
 *	Close a slot and do not allow any other threads to join this slot.
 *	Remove this from the active slot array and move a new slot from
 *	the pool into its place.  Set up the size of this group;
 *	must be called with the logging spinlock held.
 */
int
__wt_log_slot_close(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
{
	WT_CONNECTION_IMPL *conn;
	WT_LOG *log;
	WT_LOGSLOT *newslot;
	int64_t old_state;
	int32_t yields;
	uint32_t pool_i, switch_fails;

	conn = S2C(session);
	log = conn->log;
	switch_fails = 0;
retry:
	/*
	 * Find an unused slot in the pool.
	 */
	pool_i = log->pool_index;
	newslot = &log->slot_pool[pool_i];
	if (++log->pool_index >= SLOT_POOL)
		log->pool_index = 0;
	if (newslot->slot_state != WT_LOG_SLOT_FREE) {
		WT_STAT_FAST_CONN_INCR(session, log_slot_switch_fails);
		/*
		 * If it takes a number of attempts to find an available slot
		 * it's likely all slots are waiting to be released.  This
		 * churn is used to change how long we pause before closing
		 * the slot - which leads to more consolidation and less
		 * churn.
		 */
		if (++switch_fails % SLOT_POOL == 0 &&
		    switch_fails != 0 && slot->slot_churn < 5)
			++slot->slot_churn;
		__wt_yield();
		goto retry;
	} else if (slot->slot_churn > 0) {
		--slot->slot_churn;
		WT_ASSERT(session, slot->slot_churn >= 0);
	}

	/* Pause to allow other threads a chance to consolidate. */
	for (yields = slot->slot_churn; yields >= 0; yields--)
		__wt_yield();

	/*
	 * Swap out the slot we're going to use and put a free one in the
	 * slot array in its place so that threads can use it right away.
	 */
	WT_STAT_FAST_CONN_INCR(session, log_slot_closes);
	newslot->slot_state = WT_LOG_SLOT_READY;
	newslot->slot_index = slot->slot_index;
	log->slot_array[newslot->slot_index] = &log->slot_pool[pool_i];
	old_state = WT_ATOMIC_STORE8(slot->slot_state, WT_LOG_SLOT_PENDING);
	slot->slot_group_size = (uint64_t)(old_state - WT_LOG_SLOT_READY);
	/*
	 * Note that this statistic may be much bigger than in reality,
	 * especially when compared with the total bytes written in
	 * __log_fill.  The reason is that this size reflects any
	 * rounding up that is needed and the total bytes in __log_fill
	 * is the amount of user bytes.
	 */
	WT_STAT_FAST_CONN_INCRV(session,
	    log_slot_consolidated, (uint64_t)slot->slot_group_size);
	return (0);
}
/*
 * __wt_page_in_func --
 *	Acquire a hazard pointer to a page; if the page is not in-memory,
 *	read it from the disk and build an in-memory version.
 */
int
__wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
#ifdef HAVE_DIAGNOSTIC
    , const char *file, int line
#endif
    )
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_PAGE *page;
	u_int sleep_cnt, wait_cnt;
	bool busy, cache_work, evict_soon, stalled;
	int force_attempts;

	btree = S2BT(session);

	/*
	 * Ignore reads of pages already known to be in cache, otherwise the
	 * eviction server can dominate these statistics.
	 */
	if (!LF_ISSET(WT_READ_CACHE)) {
		WT_STAT_FAST_CONN_INCR(session, cache_pages_requested);
		WT_STAT_FAST_DATA_INCR(session, cache_pages_requested);
	}

	for (evict_soon = stalled = false,
	    force_attempts = 0, sleep_cnt = wait_cnt = 0;;) {
		switch (ref->state) {
		case WT_REF_DELETED:
			if (LF_ISSET(WT_READ_NO_EMPTY) &&
			    __wt_delete_page_skip(session, ref, false))
				return (WT_NOTFOUND);
			/* FALLTHROUGH */
		case WT_REF_DISK:
			if (LF_ISSET(WT_READ_CACHE))
				return (WT_NOTFOUND);

			/*
			 * The page isn't in memory, read it.  If this thread
			 * is allowed to do eviction work, check for space in
			 * the cache.
			 */
			if (!LF_ISSET(WT_READ_NO_EVICT))
				WT_RET(__wt_cache_eviction_check(
				    session, 1, NULL));
			WT_RET(__page_read(session, ref));

			/*
			 * If configured to not trash the cache, leave the
			 * page generation unset, we'll set it before
			 * returning to the oldest read generation, so the
			 * page is forcibly evicted as soon as possible.  We
			 * don't do that set here because we don't want to
			 * evict the page before we "acquire" it.
			 */
			evict_soon = LF_ISSET(WT_READ_WONT_NEED) ||
			    F_ISSET(session, WT_SESSION_NO_CACHE);
			continue;
		case WT_REF_READING:
			if (LF_ISSET(WT_READ_CACHE))
				return (WT_NOTFOUND);
			if (LF_ISSET(WT_READ_NO_WAIT))
				return (WT_NOTFOUND);

			/* Waiting on another thread's read, stall. */
			WT_STAT_FAST_CONN_INCR(session, page_read_blocked);
			stalled = true;
			break;
		case WT_REF_LOCKED:
			if (LF_ISSET(WT_READ_NO_WAIT))
				return (WT_NOTFOUND);

			/* Waiting on eviction, stall. */
			WT_STAT_FAST_CONN_INCR(session, page_locked_blocked);
			stalled = true;
			break;
		case WT_REF_SPLIT:
			return (WT_RESTART);
		case WT_REF_MEM:
			/*
			 * The page is in memory.
			 *
			 * Get a hazard pointer if one is required.  We cannot
			 * be evicting if no hazard pointer is required, we're
			 * done.
			 */
			if (F_ISSET(btree, WT_BTREE_IN_MEMORY))
				goto skip_evict;

			/*
			 * The expected reason we can't get a hazard pointer
			 * is because the page is being evicted, yield, try
			 * again.
			 */
#ifdef HAVE_DIAGNOSTIC
			WT_RET(
			    __wt_hazard_set(session, ref, &busy, file, line));
#else
			WT_RET(__wt_hazard_set(session, ref, &busy));
#endif
			if (busy) {
				WT_STAT_FAST_CONN_INCR(
				    session, page_busy_blocked);
				break;
			}

			/*
			 * If eviction is configured for this file, check to
			 * see if the page qualifies for forced eviction and
			 * update the page's generation number.  If eviction
			 * isn't being done on this file, we're done.
			 * In-memory split of large pages is allowed while
			 * no_eviction is set on btree, whereas reconciliation
			 * is not allowed.
			 */
			if (LF_ISSET(WT_READ_NO_EVICT) ||
			    F_ISSET(session, WT_SESSION_NO_EVICTION) ||
			    (F_ISSET(btree, WT_BTREE_NO_EVICTION) &&
			    !F_ISSET(btree, WT_BTREE_NO_RECONCILE)))
				goto skip_evict;

			/*
			 * Forcibly evict pages that are too big.
			 */
			if (force_attempts < 10 &&
			    __evict_force_check(session, ref)) {
				++force_attempts;
				ret = __wt_page_release_evict(session, ref);
				/* If forced eviction fails, stall. */
				if (ret == EBUSY) {
					ret = 0;
					WT_STAT_FAST_CONN_INCR(session,
					    page_forcible_evict_blocked);
					stalled = true;
					break;
				}
				WT_RET(ret);

				/*
				 * The result of a successful forced eviction
				 * is a page-state transition (potentially to
				 * an in-memory page we can use, or a restart
				 * return for our caller), continue the outer
				 * page-acquisition loop.
				 */
				continue;
			}

			/*
			 * If we read the page and are configured to not
			 * trash the cache, and no other thread has already
			 * used the page, set the oldest read generation so
			 * the page is forcibly evicted as soon as possible.
			 *
			 * Otherwise, if we read the page, or, if configured
			 * to update the page's read generation and the page
			 * isn't already flagged for forced eviction, update
			 * the page read generation.
			 */
			page = ref->page;
			if (page->read_gen == WT_READGEN_NOTSET) {
				if (evict_soon)
					__wt_page_evict_soon(session, ref);
				else
					__wt_cache_read_gen_new(
					    session, page);
			} else if (!LF_ISSET(WT_READ_NO_GEN))
				__wt_cache_read_gen_bump(session, page);
skip_evict:
			/*
			 * Check if we need an autocommit transaction.
			 * Starting a transaction can trigger eviction, so
			 * skip it if eviction isn't permitted.
			 */
			return (LF_ISSET(WT_READ_NO_EVICT) ? 0 :
			    __wt_txn_autocommit_check(session));
		WT_ILLEGAL_VALUE(session);
		}

		/*
		 * We failed to get the page -- yield before retrying, and if
		 * we've yielded enough times, start sleeping so we don't
		 * burn CPU to no purpose.
		 */
		if (stalled)
			wait_cnt += WT_THOUSAND;
		else if (++wait_cnt < WT_THOUSAND) {
			__wt_yield();
			continue;
		}

		/*
		 * If stalling and this thread is allowed to do eviction
		 * work, check if the cache needs help.  If we do work for
		 * the cache, substitute that for a sleep.
		 */
		if (!LF_ISSET(WT_READ_NO_EVICT)) {
			WT_RET(__wt_cache_eviction_check(
			    session, 1, &cache_work));
			if (cache_work)
				continue;
		}
		sleep_cnt = WT_MIN(sleep_cnt + WT_THOUSAND, 10000);
		WT_STAT_FAST_CONN_INCRV(session, page_sleep, sleep_cnt);
		__wt_sleep(0, sleep_cnt);
	}
}
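/*
 * Illustration (not WiredTiger code): the retry policy above yields first
 * and only starts sleeping (with a capped, growing delay) once spinning has
 * gone on long enough.  A minimal POSIX sketch of that backoff shape; the
 * limits are hypothetical:
 */
#include <sched.h>
#include <time.h>

#define YIELD_LIMIT	1000		/* Spin this many times first */
#define SLEEP_STEP_US	1000		/* Grow the sleep by 1ms... */
#define SLEEP_MAX_US	10000		/* ...up to a 10ms cap */

/* One backoff step; "attempts" counts retries so far. */
static void
backoff(unsigned attempts, unsigned *sleep_us)
{
	struct timespec ts;

	if (attempts < YIELD_LIMIT) {
		(void)sched_yield();	/* Cheap: give up the CPU slice */
		return;
	}
	/* Past the yield budget: sleep, growing toward the cap. */
	*sleep_us = *sleep_us + SLEEP_STEP_US > SLEEP_MAX_US ?
	    SLEEP_MAX_US : *sleep_us + SLEEP_STEP_US;
	ts.tv_sec = 0;
	ts.tv_nsec = (long)*sleep_us * 1000;
	(void)nanosleep(&ts, NULL);
}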
/* * __wt_merge_tree -- * Attempt to collapse a stack of split-merge pages in memory into a * shallow tree. If enough keys are found, create a real internal node * that can be evicted (and, if necessary, split further). * * This code is designed to deal with workloads that otherwise create * arbitrarily deep (and slow) trees in memory. */ int __wt_merge_tree(WT_SESSION_IMPL *session, WT_PAGE *top) { WT_DECL_RET; WT_PAGE *lchild, *newtop, *rchild; WT_REF *newref; WT_VISIT_STATE visit_state; uint32_t refcnt, split; int promote; u_int levels; uint8_t page_type; WT_CLEAR(visit_state); visit_state.session = session; lchild = newtop = rchild = NULL; page_type = top->type; WT_ASSERT(session, __wt_btree_mergeable(top)); WT_ASSERT(session, top->ref->state == WT_REF_LOCKED); /* * Walk the subtree, count the references at the bottom level and * calculate the maximum depth. */ WT_RET(__merge_walk(session, top, 1, __merge_count, &visit_state)); /* If there aren't enough useful levels, give up. */ if (visit_state.maxdepth < WT_MERGE_STACK_MIN) return (EBUSY); /* Pages cannot grow larger than 2**32, but that should never happen. */ if (visit_state.refcnt > UINT32_MAX) return (ENOMEM); /* * Now we either collapse the internal pages into one split-merge page, * or if there are "enough" keys, we split into two equal internal * pages, each of which can be evicted independently. * * We set a flag (WT_PM_REC_SPLIT_MERGE) on the created page if it * isn't big enough to justify the cost of evicting it. If splits * continue, it will be merged again until it gets over this limit. */ promote = 0; refcnt = (uint32_t)visit_state.refcnt; if (refcnt >= WT_MERGE_FULL_PAGE && visit_state.seen_live) { /* * In the normal case where there are live children spread * through the subtree, create two child pages. * * Handle the case where the live children are all near the * beginning / end specially: put the last live child into the * top-level page, to avoid getting much deeper during * append-only workloads. * * Set SPLIT_MERGE on the internal pages if there are any live * children: they can't be evicted, so there is no point * permanently deepening the tree. */ if (visit_state.last_live <= refcnt / 10) split = 1; else if (visit_state.first_live >= (9 * refcnt) / 10) split = refcnt - 1; else split = (refcnt + 1) / 2; /* Only promote if we can create a real page. */ if (split == 1 || split == refcnt - 1) promote = 1; else if (split >= WT_MERGE_FULL_PAGE && visit_state.first_live >= split) promote = 1; else if (refcnt - split >= WT_MERGE_FULL_PAGE && visit_state.last_live < split) promote = 1; } if (promote) { /* Create a new top-level split-merge page with two entries. */ WT_ERR(__wt_btree_new_modified_page( session, page_type, 2, 1, &newtop)); visit_state.split = split; /* Left split. */ if (split == 1) visit_state.first = newtop; else { WT_ERR(__wt_btree_new_modified_page( session, page_type, split, split < WT_MERGE_FULL_PAGE, &lchild)); visit_state.first = lchild; } /* Right split. */ if (split == refcnt - 1) { visit_state.second = newtop; visit_state.second_ref = &newtop->u.intl.t[1]; } else { WT_ERR(__wt_btree_new_modified_page( session, page_type, refcnt - split, refcnt - split < WT_MERGE_FULL_PAGE, &rchild)); visit_state.second = rchild; visit_state.second_ref = &visit_state.second->u.intl.t[0]; } } else { /* * Create a new split-merge page for small merges. When we do * a big enough merge, we create a real page at the top and * don't consider it as a merge candidate again. 
Over time * with an insert workload the tree will grow deeper, but * that's inevitable, and this keeps individual merges small. */ WT_ERR(__wt_btree_new_modified_page( session, page_type, refcnt, refcnt < WT_MERGE_FULL_PAGE, &newtop)); visit_state.first = newtop; } /* * Copy the references into the new tree, but don't update anything in * the locked tree in case there is an error and we need to back out. * We do this in a separate pass so that we can figure out the key for * the split point: that allocates memory and so it could still fail. */ visit_state.page = visit_state.first; visit_state.ref = visit_state.page->u.intl.t; visit_state.refcnt = 0; WT_ERR(__merge_walk(session, top, 0, __merge_copy_ref, &visit_state)); if (promote) { /* Promote keys into the top-level page. */ if (lchild != NULL) { newref = &newtop->u.intl.t[0]; WT_LINK_PAGE(newtop, newref, lchild); newref->state = WT_REF_MEM; WT_ERR(__merge_promote_key(session, newref)); } if (rchild != NULL) { newref = &newtop->u.intl.t[1]; WT_LINK_PAGE(newtop, newref, rchild); newref->state = WT_REF_MEM; WT_ERR(__merge_promote_key(session, newref)); } } /* * We have copied everything into place and allocated all of the memory * we need. Now link all pages into the new tree and unlock them. * * The only way this could fail is if a reference state has been * changed by another thread since they were locked. Panic in that * case: that should never happen. */ visit_state.page = visit_state.first; visit_state.ref = visit_state.page->u.intl.t; visit_state.refcnt = 0; ret = __merge_walk(session, top, 0, __merge_switch_page, &visit_state); if (ret != 0) WT_ERR(__wt_illegal_value(session, "__wt_merge_tree")); newtop->u.intl.recno = top->u.intl.recno; newtop->parent = top->parent; newtop->ref = top->ref; #ifdef HAVE_DIAGNOSTIC /* * Before swapping in the new tree, walk the pages we are discarding, * check that everything looks right. */ __merge_check_discard(session, top); #endif /* * Set up the new top-level page as a split so that it will be swapped * into place by our caller. */ top->modify->flags = WT_PM_REC_SPLIT; top->modify->u.split = newtop; WT_VERBOSE_ERR(session, evict, "Successfully %s %" PRIu32 " split-merge pages containing %" PRIu32 " keys\n", promote ? "promoted" : "merged", visit_state.maxdepth, refcnt); /* Evict new child pages as soon as possible. */ if (lchild != NULL && !F_ISSET(lchild->modify, WT_PM_REC_SPLIT_MERGE)) lchild->read_gen = WT_READ_GEN_OLDEST; if (rchild != NULL && !F_ISSET(rchild->modify, WT_PM_REC_SPLIT_MERGE)) rchild->read_gen = WT_READ_GEN_OLDEST; /* Update statistics. */ WT_STAT_FAST_CONN_INCR(session, cache_eviction_merge); WT_STAT_FAST_DATA_INCR(session, cache_eviction_merge); /* How many levels did we remove? */ levels = visit_state.maxdepth - (promote ? 2 : 1); WT_STAT_FAST_CONN_INCRV(session, cache_eviction_merge_levels, levels); WT_STAT_FAST_DATA_INCRV(session, cache_eviction_merge_levels, levels); return (0); err: WT_VERBOSE_TRET(session, evict, "Failed to merge %" PRIu32 " split-merge pages containing %" PRIu32 " keys\n", visit_state.maxdepth, refcnt); WT_STAT_FAST_CONN_INCR(session, cache_eviction_merge_fail); WT_STAT_FAST_DATA_INCR(session, cache_eviction_merge_fail); if (newtop != NULL) __wt_page_out(session, &newtop); if (lchild != NULL) __wt_page_out(session, &lchild); if (rchild != NULL) __wt_page_out(session, &rchild); return (ret); }
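The split-point choice above is the heart of the merge: it decides whether the collapsed subtree becomes one split-merge page or two independently evictable halves. A minimal standalone sketch of just that heuristic, assuming refcnt, first_live and last_live were already gathered by the __merge_count pass and that the caller has already checked refcnt >= WT_MERGE_FULL_PAGE with live children seen; the helper name and the MERGE_FULL_PAGE value are illustrative, not WiredTiger's.

#include <stdbool.h>
#include <stdint.h>

#define MERGE_FULL_PAGE	100	/* stands in for WT_MERGE_FULL_PAGE */

/*
 * merge_split_point --
 *	Hypothetical helper mirroring the conditions in __wt_merge_tree.
 */
static uint32_t
merge_split_point(
    uint32_t refcnt, uint32_t first_live, uint32_t last_live, bool *promotep)
{
	uint32_t split;

	if (last_live <= refcnt / 10)		/* live children near the start */
		split = 1;
	else if (first_live >= (uint32_t)((uint64_t)9 * refcnt / 10))
		split = refcnt - 1;		/* live children near the end */
	else
		split = (refcnt + 1) / 2;	/* spread out: split evenly */

	/* Promote only if at least one side can become a real, evictable page. */
	*promotep = split == 1 || split == refcnt - 1 ||
	    (split >= MERGE_FULL_PAGE && first_live >= split) ||
	    (refcnt - split >= MERGE_FULL_PAGE && last_live < split);
	return (split);
}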
/* * __wt_page_in_func -- * Acquire a hazard pointer to a page; if the page is not in-memory, * read it from the disk and build an in-memory version. */ int __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags #ifdef HAVE_DIAGNOSTIC , const char *file, int line #endif ) { WT_DECL_RET; WT_PAGE *page; u_int sleep_cnt, wait_cnt; int busy, force_attempts, oldgen; for (force_attempts = oldgen = 0, wait_cnt = 0;;) { switch (ref->state) { case WT_REF_DISK: case WT_REF_DELETED: if (LF_ISSET(WT_READ_CACHE)) return (WT_NOTFOUND); /* * The page isn't in memory, attempt to read it. * Make sure there is space in the cache. */ WT_RET(__wt_cache_full_check(session)); WT_RET(__wt_cache_read(session, ref)); oldgen = LF_ISSET(WT_READ_WONT_NEED) || F_ISSET(session, WT_SESSION_NO_CACHE); continue; case WT_REF_READING: if (LF_ISSET(WT_READ_CACHE)) return (WT_NOTFOUND); if (LF_ISSET(WT_READ_NO_WAIT)) return (WT_NOTFOUND); WT_STAT_FAST_CONN_INCR(session, page_read_blocked); break; case WT_REF_LOCKED: if (LF_ISSET(WT_READ_NO_WAIT)) return (WT_NOTFOUND); WT_STAT_FAST_CONN_INCR(session, page_locked_blocked); break; case WT_REF_SPLIT: return (WT_RESTART); case WT_REF_MEM: /* * The page is in memory: get a hazard pointer, update * the page's LRU and return. The expected reason we * can't get a hazard pointer is because the page is * being evicted; yield and try again. */ #ifdef HAVE_DIAGNOSTIC WT_RET( __wt_hazard_set(session, ref, &busy, file, line)); #else WT_RET(__wt_hazard_set(session, ref, &busy)); #endif if (busy) { WT_STAT_FAST_CONN_INCR( session, page_busy_blocked); break; } page = ref->page; WT_ASSERT(session, page != NULL); /* * Forcibly evict pages that are too big. */ if (force_attempts < 10 && __evict_force_check(session, page, flags)) { ++force_attempts; ret = __wt_page_release_evict(session, ref); /* If forced eviction fails, stall. */ if (ret == EBUSY) { ret = 0; wait_cnt += 1000; WT_STAT_FAST_CONN_INCR(session, page_forcible_evict_blocked); break; } else WT_RET(ret); /* * The result of a successful forced eviction * is a page-state transition (potentially to * an in-memory page we can use, or a restart * return for our caller), continue the outer * page-acquisition loop. */ continue; } /* Check if we need an autocommit transaction. */ if ((ret = __wt_txn_autocommit_check(session)) != 0) { WT_TRET(__wt_hazard_clear(session, page)); return (ret); } /* * If we read the page and we are configured to not * trash the cache, set the oldest read generation so * the page is forcibly evicted as soon as possible. * * Otherwise, update the page's read generation. */ if (oldgen && page->read_gen == WT_READGEN_NOTSET) __wt_page_evict_soon(page); else if (!LF_ISSET(WT_READ_NO_GEN) && page->read_gen != WT_READGEN_OLDEST && page->read_gen < __wt_cache_read_gen(session)) page->read_gen = __wt_cache_read_gen_set(session); return (0); WT_ILLEGAL_VALUE(session); } /* * We failed to get the page -- yield before retrying, and if * we've yielded enough times, start sleeping so we don't burn * CPU to no purpose. */ if (++wait_cnt < 1000) __wt_yield(); else { sleep_cnt = WT_MIN(wait_cnt, 10000); wait_cnt *= 2; WT_STAT_FAST_CONN_INCRV(session, page_sleep, sleep_cnt); __wt_sleep(0, sleep_cnt); } } }
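The read-generation policy at the end of the function is a compact rule: a freshly read page the caller won't need again is marked for immediate eviction (__wt_page_evict_soon amounts to setting the oldest generation), and otherwise the generation only ever moves forward. A minimal sketch of that rule with illustrative constants standing in for WT_READGEN_NOTSET and WT_READGEN_OLDEST; the helper is hypothetical and only mirrors the conditions above.

#include <stdbool.h>
#include <stdint.h>

#define READGEN_NOTSET	0	/* stands in for WT_READGEN_NOTSET */
#define READGEN_OLDEST	1	/* stands in for WT_READGEN_OLDEST */

/*
 * update_read_gen --
 *	Hypothetical helper mirroring the read-generation update above.
 */
static void
update_read_gen(
    uint64_t *page_genp, uint64_t cache_gen, bool oldgen, bool no_gen_update)
{
	if (oldgen && *page_genp == READGEN_NOTSET)
		*page_genp = READGEN_OLDEST;	/* evict as soon as possible */
	else if (!no_gen_update &&
	    *page_genp != READGEN_OLDEST && *page_genp < cache_gen)
		*page_genp = cache_gen;		/* bump, never move backward */
}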
/* * __wt_page_in_func -- * Acquire a hazard pointer to a page; if the page is not in-memory, * read it from the disk and build an in-memory version. */ int __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags #ifdef HAVE_DIAGNOSTIC , const char *file, int line #endif ) { WT_BTREE *btree; WT_DECL_RET; WT_PAGE *page; u_int sleep_cnt, wait_cnt; int busy, cache_work, force_attempts, oldgen, stalled; btree = S2BT(session); stalled = 0; for (force_attempts = oldgen = 0, sleep_cnt = wait_cnt = 0;;) { switch (ref->state) { case WT_REF_DISK: case WT_REF_DELETED: if (LF_ISSET(WT_READ_CACHE)) return (WT_NOTFOUND); /* * The page isn't in memory, read it. If this thread is * allowed to do eviction work, check for space in the * cache. */ if (!LF_ISSET(WT_READ_NO_EVICT)) WT_RET(__wt_cache_eviction_check( session, 1, NULL)); WT_RET(__page_read(session, ref)); oldgen = LF_ISSET(WT_READ_WONT_NEED) || F_ISSET(session, WT_SESSION_NO_CACHE); continue; case WT_REF_READING: if (LF_ISSET(WT_READ_CACHE)) return (WT_NOTFOUND); if (LF_ISSET(WT_READ_NO_WAIT)) return (WT_NOTFOUND); /* Waiting on another thread's read, stall. */ WT_STAT_FAST_CONN_INCR(session, page_read_blocked); stalled = 1; break; case WT_REF_LOCKED: if (LF_ISSET(WT_READ_NO_WAIT)) return (WT_NOTFOUND); /* Waiting on eviction, stall. */ WT_STAT_FAST_CONN_INCR(session, page_locked_blocked); stalled = 1; break; case WT_REF_SPLIT: return (WT_RESTART); case WT_REF_MEM: /* * The page is in memory. * * Get a hazard pointer if one is required. We cannot * be evicting if no hazard pointer is required, we're * done. */ if (F_ISSET(btree, WT_BTREE_IN_MEMORY)) goto skip_evict; /* * The expected reason we can't get a hazard pointer is * because the page is being evicted, yield, try again. */ #ifdef HAVE_DIAGNOSTIC WT_RET( __wt_hazard_set(session, ref, &busy, file, line)); #else WT_RET(__wt_hazard_set(session, ref, &busy)); #endif if (busy) { WT_STAT_FAST_CONN_INCR( session, page_busy_blocked); break; } /* * If eviction is configured for this file, check to see * if the page qualifies for forced eviction and update * the page's generation number. If eviction isn't being * done on this file, we're done. */ if (LF_ISSET(WT_READ_NO_EVICT) || F_ISSET(session, WT_SESSION_NO_EVICTION) || F_ISSET(btree, WT_BTREE_NO_EVICTION)) goto skip_evict; /* * Forcibly evict pages that are too big. */ page = ref->page; if (force_attempts < 10 && __evict_force_check(session, page)) { ++force_attempts; ret = __wt_page_release_evict(session, ref); /* If forced eviction fails, stall. */ if (ret == EBUSY) { ret = 0; WT_STAT_FAST_CONN_INCR(session, page_forcible_evict_blocked); stalled = 1; break; } WT_RET(ret); /* * The result of a successful forced eviction * is a page-state transition (potentially to * an in-memory page we can use, or a restart * return for our caller), continue the outer * page-acquisition loop. */ continue; } /* * If we read the page and we are configured to not * trash the cache, set the oldest read generation so * the page is forcibly evicted as soon as possible. * * Otherwise, update the page's read generation. */ if (oldgen && page->read_gen == WT_READGEN_NOTSET) __wt_page_evict_soon(page); else if (!LF_ISSET(WT_READ_NO_GEN) && page->read_gen != WT_READGEN_OLDEST && page->read_gen < __wt_cache_read_gen(session)) page->read_gen = __wt_cache_read_gen_bump(session); skip_evict: /* * Check if we need an autocommit transaction. * Starting a transaction can trigger eviction, so skip * it if eviction isn't permitted. 
*/ return (LF_ISSET(WT_READ_NO_EVICT) ? 0 : __wt_txn_autocommit_check(session)); WT_ILLEGAL_VALUE(session); } /* * We failed to get the page -- yield before retrying, and if * we've yielded enough times, start sleeping so we don't burn * CPU to no purpose. */ if (stalled) wait_cnt += 1000; else if (++wait_cnt < 1000) { __wt_yield(); continue; } /* * If stalling and this thread is allowed to do eviction work, * check if the cache needs help. If we do work for the cache, * substitute that for a sleep. */ if (!LF_ISSET(WT_READ_NO_EVICT)) { WT_RET( __wt_cache_eviction_check(session, 1, &cache_work)); if (cache_work) continue; } sleep_cnt = WT_MIN(sleep_cnt + 1000, 10000); WT_STAT_FAST_CONN_INCRV(session, page_sleep, sleep_cnt); __wt_sleep(0, sleep_cnt); } }
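The forced-eviction path in this version is bounded: at most 10 attempts per acquisition, and an EBUSY result is converted into a stall rather than an error so the caller falls through to the backoff path. A hypothetical sketch of that retry contract, with the eviction call abstracted behind a function pointer; none of these names are WiredTiger's.

#include <errno.h>
#include <stdbool.h>

#define FORCE_ATTEMPT_MAX	10	/* matches the bound used above */

/*
 * force_evict --
 *	Hypothetical helper: attempt forced eviction of an oversized page.
 *	Returns 0 with *stalledp set when the page was busy, so the caller
 *	stalls and retries instead of failing.
 */
static int
force_evict(int (*evict)(void *), void *ref, int *attemptsp, bool *stalledp)
{
	int ret;

	if (*attemptsp >= FORCE_ATTEMPT_MAX)
		return (0);	/* give up forcing; use the page as-is */
	++*attemptsp;
	if ((ret = evict(ref)) == EBUSY) {
		*stalledp = true;
		return (0);
	}
	return (ret);
}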
/* * __wt_block_write_off -- * Write a buffer into a block, returning the block's offset, size and * checksum. */ int __wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_off_t *offsetp, uint32_t *sizep, uint32_t *cksump, bool data_cksum, bool caller_locked) { WT_BLOCK_HEADER *blk; WT_DECL_RET; WT_FH *fh; size_t align_size; wt_off_t offset; uint32_t cksum; bool local_locked; fh = block->fh; /* * Clear the block header to ensure all of it is initialized, even the * unused fields. */ blk = WT_BLOCK_HEADER_REF(buf->mem); memset(blk, 0, sizeof(*blk)); /* * Swap the page-header as needed; this doesn't belong here, but it's * the best place to catch all callers. */ __wt_page_header_byteswap(buf->mem); /* Buffers should be aligned for writing. */ if (!F_ISSET(buf, WT_ITEM_ALIGNED)) { WT_ASSERT(session, F_ISSET(buf, WT_ITEM_ALIGNED)); WT_RET_MSG(session, EINVAL, "direct I/O check: write buffer incorrectly allocated"); } /* * Align the size to an allocation unit. * * The buffer must be big enough for us to zero to the next allocsize * boundary, this is one of the reasons the btree layer must find out * from the block-manager layer the maximum size of the eventual write. */ align_size = WT_ALIGN(buf->size, block->allocsize); if (align_size > buf->memsize) { WT_ASSERT(session, align_size <= buf->memsize); WT_RET_MSG(session, EINVAL, "buffer size check: write buffer incorrectly allocated"); } if (align_size > UINT32_MAX) { WT_ASSERT(session, align_size <= UINT32_MAX); WT_RET_MSG(session, EINVAL, "buffer size check: write buffer too large to write"); } /* Zero out any unused bytes at the end of the buffer. */ memset((uint8_t *)buf->mem + buf->size, 0, align_size - buf->size); /* * Set the disk size so we don't have to incrementally read blocks * during salvage. */ blk->disk_size = WT_STORE_SIZE(align_size); /* * Update the block's checksum: if our caller specifies, checksum the * complete data, otherwise checksum the leading WT_BLOCK_COMPRESS_SKIP * bytes. The assumption is applications with good compression support * turn off checksums and assume corrupted blocks won't decompress * correctly. However, if compression failed to shrink the block, the * block wasn't compressed, in which case our caller will tell us to * checksum the data to detect corruption. If compression succeeded, * we still need to checksum the first WT_BLOCK_COMPRESS_SKIP bytes * because they're not compressed, both to give salvage a quick test * of whether a block is useful and to give us a test so we don't lose * the first WT_BLOCK_COMPRESS_SKIP bytes without noticing. * * Checksum a little-endian version of the header, and write everything * in little-endian format. The checksum is (potentially) returned in a * big-endian format, swap it into place in a separate step. */ blk->flags = 0; if (data_cksum) F_SET(blk, WT_BLOCK_DATA_CKSUM); blk->cksum = 0; __wt_block_header_byteswap(blk); blk->cksum = cksum = __wt_cksum( buf->mem, data_cksum ? align_size : WT_BLOCK_COMPRESS_SKIP); #ifdef WORDS_BIGENDIAN blk->cksum = __wt_bswap32(blk->cksum); #endif /* Pre-allocate some number of extension structures. */ WT_RET(__wt_block_ext_prealloc(session, 5)); /* * Acquire a lock, if we don't already hold one. * Allocate space for the write, and optionally extend the file (note * the block-extend function may release the lock). * Release any locally acquired lock. 
*/ local_locked = false; if (!caller_locked) { __wt_spin_lock(session, &block->live_lock); local_locked = true; } ret = __wt_block_alloc(session, block, &offset, (wt_off_t)align_size); if (ret == 0) ret = __wt_block_extend( session, block, fh, offset, align_size, &local_locked); if (local_locked) __wt_spin_unlock(session, &block->live_lock); WT_RET(ret); /* Write the block. */ if ((ret = __wt_write(session, fh, offset, align_size, buf->mem)) != 0) { if (!caller_locked) __wt_spin_lock(session, &block->live_lock); WT_TRET(__wt_block_off_free( session, block, offset, (wt_off_t)align_size)); if (!caller_locked) __wt_spin_unlock(session, &block->live_lock); WT_RET(ret); } #ifdef HAVE_SYNC_FILE_RANGE /* * Optionally schedule writes for dirty pages in the system buffer * cache, but only if the current session can wait. */ if (block->os_cache_dirty_max != 0 && (block->os_cache_dirty += align_size) > block->os_cache_dirty_max && __wt_session_can_wait(session)) { block->os_cache_dirty = 0; WT_RET(__wt_fsync_async(session, fh)); } #endif #ifdef HAVE_POSIX_FADVISE /* Optionally discard blocks from the system buffer cache. */ if (block->os_cache_max != 0 && (block->os_cache += align_size) > block->os_cache_max) { block->os_cache = 0; if ((ret = posix_fadvise(fh->fd, (wt_off_t)0, (wt_off_t)0, POSIX_FADV_DONTNEED)) != 0) WT_RET_MSG( session, ret, "%s: posix_fadvise", block->name); } #endif WT_STAT_FAST_CONN_INCR(session, block_write); WT_STAT_FAST_CONN_INCRV(session, block_byte_write, align_size); WT_RET(__wt_verbose(session, WT_VERB_WRITE, "off %" PRIuMAX ", size %" PRIuMAX ", cksum %" PRIu32, (uintmax_t)offset, (uintmax_t)align_size, cksum)); *offsetp = offset; *sizep = WT_STORE_SIZE(align_size); *cksump = cksum; return (0); }
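The alignment step near the top is the invariant the two EINVAL checks protect: the payload is rounded up to the allocation unit and the padding is zeroed so only deterministic bytes reach the disk. A minimal sketch of that step; the round-up macro matches WT_ALIGN's usual definition for power-of-two units, the helper name is hypothetical, and the caller must guarantee the buffer really holds align_size bytes.

#include <stdint.h>
#include <string.h>

/* Round n up to a multiple of v; v must be a power of two. */
#define ALIGN_UP(n, v) \
	((((uintmax_t)(n)) + ((v) - 1)) & ~(((uintmax_t)(v)) - 1))

/*
 * pad_to_allocsize --
 *	Hypothetical helper: zero the alignment padding and return the
 *	aligned size to be written.
 */
static size_t
pad_to_allocsize(void *mem, size_t size, size_t allocsize)
{
	size_t align_size;

	align_size = (size_t)ALIGN_UP(size, allocsize);
	memset((uint8_t *)mem + size, 0, align_size - size);
	return (align_size);
}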
/* * __wt_bt_read -- * Read a cookie referenced block into a buffer. */ int __wt_bt_read(WT_SESSION_IMPL *session, WT_ITEM *buf, const uint8_t *addr, size_t addr_size) { WT_BM *bm; WT_BTREE *btree; WT_DECL_ITEM(tmp); WT_DECL_RET; const WT_PAGE_HEADER *dsk; size_t result_len; btree = S2BT(session); bm = btree->bm; /* * If anticipating a compressed block, read into a scratch buffer and * decompress into the caller's buffer. Else, read directly into the * caller's buffer. */ if (btree->compressor == NULL) { WT_RET(bm->read(bm, session, buf, addr, addr_size)); dsk = buf->data; } else { WT_RET(__wt_scr_alloc(session, 0, &tmp)); WT_ERR(bm->read(bm, session, tmp, addr, addr_size)); dsk = tmp->data; } /* * If the block is compressed, copy the skipped bytes of the original * image into place, then decompress. */ if (F_ISSET(dsk, WT_PAGE_COMPRESSED)) { if (btree->compressor == NULL || btree->compressor->decompress == NULL) WT_ERR_MSG(session, WT_ERROR, "read compressed block where no compression engine " "configured"); /* * We're allocating the exact number of bytes we're expecting * from decompression. */ WT_ERR(__wt_buf_initsize(session, buf, dsk->mem_size)); /* * Note the source length is NOT the number of compressed bytes, * it's the length of the block we just read (minus the skipped * bytes). We don't store the number of compressed bytes: some * compression engines need that length stored externally, they * don't have markers in the stream to signal the end of the * compressed bytes. Those engines must store the compressed * byte length somehow, see the snappy compression extension for * an example. */ memcpy(buf->mem, tmp->data, WT_BLOCK_COMPRESS_SKIP); ret = btree->compressor->decompress( btree->compressor, &session->iface, (uint8_t *)tmp->data + WT_BLOCK_COMPRESS_SKIP, tmp->size - WT_BLOCK_COMPRESS_SKIP, (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP, dsk->mem_size - WT_BLOCK_COMPRESS_SKIP, &result_len); /* * If checksums were turned off because we're depending on the * decompression to fail on any corrupted data, we'll end up * here after corruption happens. If we're salvaging the file, * it's OK, otherwise it's really, really bad. */ if (ret != 0 || result_len != dsk->mem_size - WT_BLOCK_COMPRESS_SKIP) WT_ERR( F_ISSET(btree, WT_BTREE_VERIFY) || F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK) ? WT_ERROR : __wt_illegal_value(session, btree->dhandle->name)); } else if (btree->compressor == NULL) buf->size = dsk->mem_size; else /* * We guessed wrong: there was a compressor, but this * block was not compressed, and now the page is in the * wrong buffer and the buffer may be of the wrong size. * This should be rare, but happens with small blocks * that aren't worth compressing. */ WT_ERR(__wt_buf_set( session, buf, tmp->data, dsk->mem_size)); /* If the handle is a verify handle, verify the physical page. */ if (F_ISSET(btree, WT_BTREE_VERIFY)) { if (tmp == NULL) WT_ERR(__wt_scr_alloc(session, 0, &tmp)); WT_ERR(bm->addr_string(bm, session, tmp, addr, addr_size)); WT_ERR(__wt_verify_dsk(session, (const char *)tmp->data, buf)); } WT_STAT_FAST_CONN_INCR(session, cache_read); WT_STAT_FAST_DATA_INCR(session, cache_read); if (F_ISSET(dsk, WT_PAGE_COMPRESSED)) WT_STAT_FAST_DATA_INCR(session, compress_read); WT_STAT_FAST_CONN_INCRV(session, cache_bytes_read, dsk->mem_size); WT_STAT_FAST_DATA_INCRV(session, cache_bytes_read, dsk->mem_size); err: __wt_scr_free(session, &tmp); return (ret); }
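The comment about engines that lack an end-of-stream marker deserves a concrete picture: such an engine must persist the compressed byte count itself, and a common convention is a small length prefix ahead of the compressed bytes (the snappy extension stores its length in a comparable way). The exact layout below is illustrative only, not WiredTiger's format.

#include <stdint.h>
#include <string.h>

/* Encode: prepend the compressed byte count for the decompressor. */
static size_t
put_length_prefix(uint8_t *dst, const uint8_t *src, size_t src_len)
{
	uint32_t len32 = (uint32_t)src_len;

	memcpy(dst, &len32, sizeof(len32));	/* unaligned-safe copy */
	memcpy(dst + sizeof(len32), src, src_len);
	return (sizeof(len32) + src_len);
}

/* Decode: recover the compressed byte count stored by the encoder. */
static const uint8_t *
get_length_prefix(const uint8_t *src, size_t *lenp)
{
	uint32_t len32;

	memcpy(&len32, src, sizeof(len32));
	*lenp = len32;
	return (src + sizeof(len32));
}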
/*
 * __wt_block_write_off --
 *	Write the buffer's data into the file backing the block, computing
 *	the checksum and size.
 */
int
__wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf,
    wt_off_t *offsetp, uint32_t *sizep, uint32_t *cksump,
    int data_cksum, int caller_locked)
{
	WT_BLOCK_HEADER *blk;
	WT_DECL_RET;
	WT_FH *fh;
	size_t align_size;
	wt_off_t offset;
	int local_locked;

	blk = WT_BLOCK_HEADER_REF(buf->mem);
	fh = block->fh;
	local_locked = 0;

	/*
	 * The buffer must be allocated in aligned mode: this write goes to
	 * disk and direct I/O requires aligned buffers.
	 */
	if (!F_ISSET(buf, WT_ITEM_ALIGNED)) {
		WT_ASSERT(session, F_ISSET(buf, WT_ITEM_ALIGNED));
		WT_RET_MSG(session, EINVAL,
		    "direct I/O check: write buffer incorrectly allocated");
	}

	/*
	 * Align buf->size to the block allocation unit. The aligned size may
	 * exceed the allocated buf->memsize; writing in that case could
	 * overrun the buffer, so fail instead.
	 */
	align_size = WT_ALIGN(buf->size, block->allocsize);
	if (align_size > buf->memsize) {
		WT_ASSERT(session, align_size <= buf->memsize);
		WT_RET_MSG(session, EINVAL,
		    "buffer size check: write buffer incorrectly allocated");
	}
	/* Sizes over 4GB can't be stored in the 32-bit size field. */
	if (align_size > UINT32_MAX) {
		WT_ASSERT(session, align_size <= UINT32_MAX);
		WT_RET_MSG(session, EINVAL,
		    "buffer size check: write buffer too large to write");
	}

	/* Zero the padding bytes added by the alignment. */
	memset((uint8_t *)buf->mem + buf->size, 0, align_size - buf->size);

	/* Set up the block header and record the stored data length. */
	blk->disk_size = WT_STORE_SIZE(align_size);
	blk->flags = 0;
	if (data_cksum)
		F_SET(blk, WT_BLOCK_DATA_CKSUM);
	/* Checksum the buffer. */
	blk->cksum = __wt_cksum(
	    buf->mem, data_cksum ? align_size : WT_BLOCK_COMPRESS_SKIP);

	if (!caller_locked) {
		WT_RET(__wt_block_ext_prealloc(session, 5));
		__wt_spin_lock(session, &block->live_lock);
		local_locked = 1;
	}
	ret = __wt_block_alloc(session, block, &offset, (wt_off_t)align_size);

	/*
	 * Check whether the file needs to grow: without extending it, the
	 * allocated block might not fit in the file.
	 */
	if (ret == 0 && fh->extend_len != 0 &&
	    (fh->extend_size <= fh->size ||
	    (offset + fh->extend_len <= fh->extend_size &&
	    offset + fh->extend_len + (wt_off_t)align_size >=
	    fh->extend_size))) {
		/* Grow extend_size to offset plus twice the extend length. */
		fh->extend_size = offset + fh->extend_len * 2;
		if (fh->fallocate_available != WT_FALLOCATE_NOT_AVAILABLE) {
			/*
			 * Release the block->live_lock spinlock first:
			 * resizing the file can take a while, and spinning
			 * on the lock that long wastes CPU.
			 */
			if (!fh->fallocate_requires_locking && local_locked) {
				__wt_spin_unlock(session, &block->live_lock);
				local_locked = 0;
			}
			/* Preallocate the additional file space. */
			if ((ret = __wt_fallocate(session, fh,
			    offset, fh->extend_len * 2)) == ENOTSUP) {
				ret = 0;
				goto extend_truncate;
			}
		} else {
extend_truncate:	if (!caller_locked && local_locked == 0) {
				__wt_spin_lock(session, &block->live_lock);
				local_locked = 1;
			}
			/*
			 * Grow the file size directly; this is slower than
			 * __wt_fallocate.
			 */
			if ((ret = __wt_ftruncate(session, fh,
			    offset + fh->extend_len * 2)) == EBUSY)
				ret = 0;
		}
	}
	if (local_locked) {
		__wt_spin_unlock(session, &block->live_lock);
		local_locked = 0;
	}
	WT_RET(ret);

	/* Write the block's data. */
	ret = __wt_write(session, fh, offset, align_size, buf->mem);
	if (ret != 0) {
		if (!caller_locked)
			__wt_spin_lock(session, &block->live_lock);
		/* The write failed: return the extent to the avail list. */
		WT_TRET(__wt_block_off_free(
		    session, block, offset, (wt_off_t)align_size));
		if (!caller_locked)
			__wt_spin_unlock(session, &block->live_lock);
		WT_RET(ret);
	}

#ifdef HAVE_SYNC_FILE_RANGE
	/*
	 * Too many dirty pages in the system buffer cache: schedule an
	 * asynchronous flush.
	 */
	if (block->os_cache_dirty_max != 0 &&
	    (block->os_cache_dirty += align_size) > block->os_cache_dirty_max &&
	    __wt_session_can_wait(session)) {
		block->os_cache_dirty = 0;
		WT_RET(__wt_fsync_async(session, fh));
	}
#endif
#ifdef HAVE_POSIX_FADVISE
	/*
	 * Discard this file's data from the system page cache; the call can
	 * itself trigger I/O, much like a synchronous sync.
	 */
	if (block->os_cache_max != 0 &&
	    (block->os_cache += align_size) > block->os_cache_max) {
		block->os_cache = 0;
		if ((ret = posix_fadvise(fh->fd,
		    (wt_off_t)0, (wt_off_t)0, POSIX_FADV_DONTNEED)) != 0)
			WT_RET_MSG(
			    session, ret, "%s: posix_fadvise", block->name);
	}
#endif
	WT_STAT_FAST_CONN_INCR(session, block_write);
	WT_STAT_FAST_CONN_INCRV(session, block_byte_write, align_size);

	WT_RET(__wt_verbose(session, WT_VERB_WRITE,
	    "off %" PRIuMAX ", size %" PRIuMAX ", cksum %" PRIu32,
	    (uintmax_t)offset, (uintmax_t)align_size, blk->cksum));

	*offsetp = offset;
	*sizep = WT_STORE_SIZE(align_size);
	*cksump = blk->cksum;

	return (ret);
}
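The inlined extension logic in this revision reduces to: try to preallocate space ahead of the write, and fall back to growing the file size directly when preallocation isn't available (the newer revision above moves all of this behind __wt_block_extend). A simplified POSIX sketch of that fallback, with a hypothetical helper name and none of the locking:

#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

/*
 * extend_file --
 *	Hypothetical helper: reserve space with posix_fallocate, falling
 *	back to ftruncate when preallocation isn't supported.
 */
static int
extend_file(int fd, off_t offset, off_t extend_len)
{
	int ret;

	/* Treat ENOTSUP/EINVAL as "not supported on this file system". */
	ret = posix_fallocate(fd, offset, extend_len);
	if (ret == 0)
		return (0);
	if (ret != ENOTSUP && ret != EINVAL)
		return (ret);

	/* Slower fallback: grow the file size directly. */
	return (ftruncate(fd, offset + extend_len) == 0 ? 0 : errno);
}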