/*
 * __wt_bm_read --
 *     Map or read address cookie referenced block into a buffer.
 */
int
__wt_bm_read(WT_BM *bm, WT_SESSION_IMPL *session,
    WT_ITEM *buf, const uint8_t *addr, size_t addr_size)
{
    WT_BLOCK *block;
    WT_DECL_RET;
    WT_FILE_HANDLE *handle;
    wt_off_t offset;
    uint32_t checksum, size;
    bool mapped;

    WT_UNUSED(addr_size);
    block = bm->block;

    /* Crack the cookie. */
    WT_RET(
        __wt_block_buffer_to_addr(block, addr, &offset, &size, &checksum));

    /* Map the block if it's possible. */
    handle = block->fh->handle;
    mapped = bm->map != NULL && offset + size <= (wt_off_t)bm->maplen;
    if (mapped && handle->fh_map_preload != NULL) {
        buf->data = (uint8_t *)bm->map + offset;
        buf->size = size;
        ret = handle->fh_map_preload(handle, (WT_SESSION *)session,
            buf->data, buf->size, bm->mapped_cookie);

        WT_STAT_CONN_INCR(session, block_map_read);
        WT_STAT_CONN_INCRV(session, block_byte_map_read, size);
        return (ret);
    }

#ifdef HAVE_DIAGNOSTIC
    /*
     * In diagnostic mode, verify the block we're about to read isn't on
     * the available list, or for live systems, the discard list.
     */
    WT_RET(__wt_block_misplaced(session,
        block, "read", offset, size, bm->is_live, __func__, __LINE__));
#endif

    /* Read the block. */
    __wt_capacity_throttle(session, size, WT_THROTTLE_READ);
    WT_RET(
        __wt_block_read_off(session, block, buf, offset, size, checksum));

    /* Optionally discard blocks from the system's buffer cache. */
    WT_RET(__wt_block_discard(session, block, (size_t)size));

    return (0);
}
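
/*
 * Illustration (not WiredTiger source): the mapped-read fast path above
 * amounts to "return a pointer into the mapping when the region fits,
 * otherwise fall back to a file read".  A minimal, self-contained POSIX
 * sketch of that decision, with hypothetical names:
 */
#include <stddef.h>
#include <stdint.h>
#include <sys/types.h>
#include <unistd.h>

/* Get (offset, size): zero-copy from the mapping if possible, else pread. */
static int
region_get(int fd, const void *map, size_t maplen,
    off_t offset, size_t size, void *fallback, const void **datap)
{
    if (map != NULL && (size_t)offset + size <= maplen) {
        *datap = (const uint8_t *)map + offset;    /* zero-copy */
        return (0);
    }
    if (pread(fd, fallback, size, offset) != (ssize_t)size)
        return (-1);                               /* short read or error */
    *datap = fallback;
    return (0);
}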
/*
 * __ref_descend_prev --
 *     Descend the tree one level, during a previous-cursor walk.
 */
static inline void
__ref_descend_prev(
    WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_INDEX **pindexp)
{
    WT_PAGE_INDEX *pindex;
    uint64_t yield_count;

    /*
     * We're passed a child page into which we're descending, and on which
     * we have a hazard pointer.
     */
    for (yield_count = 0;; yield_count++, __wt_yield()) {
        /*
         * There's a split race when a cursor moving backwards through
         * the tree descends the tree.  If we're splitting an internal
         * page into its parent, we move the WT_REF structures and
         * update the parent's page index before updating the split
         * page's page index, and it's not an atomic update.  A thread
         * can read the parent page's replacement page index and then
         * read the split page's original index.
         *
         * This can create a race for previous-cursor movements.
         *
         * For example, imagine an internal page with 3 child pages,
         * with the namespaces a-f, g-h and i-j; the first child page
         * splits.  The parent starts out with the following page-index:
         *
         *	| ... | a | g | i | ... |
         *
         * The split page starts out with the following page-index:
         *
         *	| a | b | c | d | e | f |
         *
         * The first step is to move the c-f ranges into a new subtree,
         * so, for example we might have two new internal pages 'c' and
         * 'e', where the new 'c' page references the c-d namespace and
         * the new 'e' page references the e-f namespace.  The top of
         * the subtree references the parent page, but until the
         * parent's page index is updated, any threads in the subtree
         * won't be able to ascend out of the subtree.  However, once
         * the parent page's page index is updated to this:
         *
         *	| ... | a | c | e | g | i | ... |
         *
         * threads in the subtree can ascend into the parent.  Imagine a
         * cursor in the c-d part of the namespace that ascends to the
         * parent's 'c' slot.  It would then decrement to the slot
         * before the 'c' slot, the 'a' slot.
         *
         * The previous-cursor movement selects the last slot in the 'a'
         * page; if the split page's page-index hasn't been updated yet,
         * it will select the 'f' slot, which is incorrect.  Once the
         * split page's page index is updated to this:
         *
         *	| a | b |
         *
         * the previous-cursor movement will select the 'b' slot, which
         * is correct.
         *
         * This function takes an argument which is the internal page
         * from which we're descending.  If the last slot on the page no
         * longer points to the current page as its "home", the page is
         * being split and part of its namespace moved.  We have the
         * correct page and we don't have to move, all we have to do is
         * wait until the split page's page index is updated.
         */
        WT_INTL_INDEX_GET(session, ref->page, pindex);
        if (pindex->index[pindex->entries - 1]->home == ref->page)
            break;
    }
    *pindexp = pindex;

    WT_STAT_CONN_INCRV(session, tree_descend_blocked, yield_count);
}
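
/*
 * Illustration (not WiredTiger source): the wait above is a publish/wait
 * pattern; the splitting thread updates the child's index last, and
 * readers spin until the child's final entry points "home" again.  A
 * simplified C11 sketch with hypothetical types (the real code also
 * re-reads the index on every iteration):
 */
#include <sched.h>
#include <stdatomic.h>
#include <stddef.h>

struct page;

struct ref {
    struct page *_Atomic home;    /* page that currently owns this entry */
};

struct page {
    struct ref **index;           /* array of child references */
    size_t entries;
};

/* Spin until any in-flight split of the page's index has been published. */
static void
wait_for_index(struct page *page)
{
    while (atomic_load_explicit(&page->index[page->entries - 1]->home,
        memory_order_acquire) != page)
        sched_yield();            /* give the splitting thread a chance */
}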
/*
 * __ref_index_slot --
 *     Return the page's index and slot for a reference.
 */
static inline void
__ref_index_slot(WT_SESSION_IMPL *session,
    WT_REF *ref, WT_PAGE_INDEX **pindexp, uint32_t *slotp)
{
    WT_PAGE_INDEX *pindex;
    WT_REF **start, **stop, **p, **t;
    uint64_t sleep_count, yield_count;
    uint32_t entries, slot;

    /*
     * If we don't find our reference, the page split and our home
     * pointer references the wrong page.  When internal pages split,
     * their WT_REF structure home values are updated; yield and wait
     * for that to happen.
     */
    for (sleep_count = yield_count = 0;;) {
        /*
         * Copy the parent page's index value: the page can split at
         * any time, but the index's value is always valid, even if
         * it's not up-to-date.
         */
        WT_INTL_INDEX_GET(session, ref->home, pindex);
        entries = pindex->entries;

        /*
         * Use the page's reference hint: it should be correct unless
         * there was a split or delete in the parent before our slot.
         * If the hint is wrong, it can be either too big or too small,
         * but often only by a small amount.  Search up and down the
         * index starting from the hint.
         *
         * It's not an error for the reference hint to be wrong, it
         * just means the first retrieval (which sets the hint for
         * subsequent retrievals), is slower.
         */
        slot = ref->pindex_hint;
        if (slot >= entries)
            slot = entries - 1;
        if (pindex->index[slot] == ref)
            goto found;
        for (start = &pindex->index[0],
            stop = &pindex->index[entries - 1],
            p = t = &pindex->index[slot];
            p > start || t < stop;) {
            if (p > start && *--p == ref) {
                slot = (uint32_t)(p - start);
                goto found;
            }
            if (t < stop && *++t == ref) {
                slot = (uint32_t)(t - start);
                goto found;
            }
        }

        /*
         * We failed to get the page index and slot reference, yield
         * before retrying, and if we've yielded enough times, start
         * sleeping so we don't burn CPU to no purpose.
         */
        __wt_ref_state_yield_sleep(&yield_count, &sleep_count);
        WT_STAT_CONN_INCRV(session,
            page_index_slot_ref_blocked, sleep_count);
    }

found:
    WT_ASSERT(session, pindex->index[slot] == ref);
    *pindexp = pindex;
    *slotp = slot;
}
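
/*
 * Illustration (not WiredTiger source): the hint-based lookup above is a
 * bidirectional fan-out search.  The same pattern, reduced to a
 * self-contained function over a plain pointer array:
 */
#include <stddef.h>

/*
 * Search an array for a pointer, starting from a cached hint and fanning
 * out in both directions; returns the slot, or (size_t)-1 if not found.
 */
static size_t
hinted_search(void **array, size_t entries, void *want, size_t hint)
{
    void **start, **stop, **p, **t;

    if (hint >= entries)
        hint = entries - 1;
    if (array[hint] == want)            /* hint is usually right */
        return (hint);

    start = &array[0];
    stop = &array[entries - 1];
    for (p = t = &array[hint]; p > start || t < stop;) {
        if (p > start && *--p == want)  /* step down */
            return ((size_t)(p - start));
        if (t < stop && *++t == want)   /* step up */
            return ((size_t)(t - start));
    }
    return ((size_t)-1);
}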
/*
 * __wt_block_read_off --
 *     Read an addr/size pair referenced block into a buffer.
 */
int
__wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
    WT_ITEM *buf, wt_off_t offset, uint32_t size, uint32_t checksum)
{
    WT_BLOCK_HEADER *blk, swap;
    size_t bufsize;
    uint32_t page_checksum;

    __wt_verbose(session, WT_VERB_READ,
        "off %" PRIuMAX ", size %" PRIu32 ", checksum %" PRIu32,
        (uintmax_t)offset, size, checksum);

    WT_STAT_CONN_INCR(session, block_read);
    WT_STAT_CONN_INCRV(session, block_byte_read, size);

    /*
     * Grow the buffer as necessary and read the block.  Buffers should be
     * aligned for reading, but there are lots of buffers (for example,
     * file cursors have two buffers each, key and value), and it's
     * difficult to be sure we've found all of them.  If the buffer isn't
     * aligned, it's an easy fix: set the flag and guarantee we reallocate
     * it.  (Most of the time on reads, the buffer memory has not yet been
     * allocated, so we're not adding any additional processing time.)
     */
    if (F_ISSET(buf, WT_ITEM_ALIGNED))
        bufsize = size;
    else {
        F_SET(buf, WT_ITEM_ALIGNED);
        bufsize = WT_MAX(size, buf->memsize + 10);
    }
    WT_RET(__wt_buf_init(session, buf, bufsize));
    WT_RET(__wt_read(session, block->fh, offset, size, buf->mem));
    buf->size = size;

    /*
     * We incrementally read through the structure before doing a checksum,
     * do little- to big-endian handling early on, and then select from the
     * original or swapped structure as needed.
     */
    blk = WT_BLOCK_HEADER_REF(buf->mem);
    __wt_block_header_byteswap_copy(blk, &swap);
    if (swap.checksum == checksum) {
        blk->checksum = 0;
        page_checksum = __wt_checksum(buf->mem,
            F_ISSET(&swap, WT_BLOCK_DATA_CKSUM) ?
            size : WT_BLOCK_COMPRESS_SKIP);
        if (page_checksum == checksum) {
            /*
             * Swap the page-header as needed; this doesn't belong
             * here, but it's the best place to catch all callers.
             */
            __wt_page_header_byteswap(buf->mem);
            return (0);
        }

        if (!F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE))
            __wt_errx(session,
                "read checksum error for %" PRIu32 "B block at "
                "offset %" PRIuMAX ": calculated block checksum "
                "of %" PRIu32 " doesn't match expected checksum "
                "of %" PRIu32,
                size, (uintmax_t)offset, page_checksum, checksum);
    } else if (!F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE))
        __wt_errx(session,
            "read checksum error for %" PRIu32 "B block at "
            "offset %" PRIuMAX ": block header checksum "
            "of %" PRIu32 " doesn't match expected checksum "
            "of %" PRIu32,
            size, (uintmax_t)offset, swap.checksum, checksum);

    /* Panic if a checksum fails during an ordinary read. */
    return (block->verify ||
        F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE) ?
        WT_ERROR : __wt_illegal_value(session, block->name));
}
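
/*
 * Illustration (not WiredTiger source): the header stores a checksum that
 * was computed with the checksum field itself zeroed, so the reader zeroes
 * the field, recomputes, and compares.  A self-contained sketch, with a
 * toy checksum standing in for the real CRC:
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct header {
    uint32_t disk_size;
    uint32_t checksum;    /* zeroed while checksumming */
};

/* Toy 32-bit checksum; the real code uses a hardware-assisted CRC. */
static uint32_t
sum32(const void *p, size_t len)
{
    const uint8_t *b = p;
    uint32_t s = 0;

    while (len-- > 0)
        s = s * 31 + *b++;
    return (s);
}

static bool
block_verify(void *mem, size_t len)
{
    struct header *hdr = mem;    /* header leads the block image */
    uint32_t saved = hdr->checksum;
    bool ok;

    hdr->checksum = 0;           /* match what the writer checksummed */
    ok = sum32(mem, len) == saved;
    hdr->checksum = saved;       /* restore for later callers */
    return (ok);
}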
/*
 * __wt_lsm_merge --
 *     Merge a set of chunks of an LSM tree.
 */
int
__wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id)
{
    WT_BLOOM *bloom;
    WT_CURSOR *dest, *src;
    WT_DECL_RET;
    WT_ITEM key, value;
    WT_LSM_CHUNK *chunk;
    uint32_t generation;
    uint64_t insert_count, record_count;
    u_int dest_id, end_chunk, i, nchunks, start_chunk, start_id, verb;
    int tret;
    bool created_chunk, create_bloom, locked, in_sync;
    const char *cfg[3];
    const char *drop_cfg[] = {
        WT_CONFIG_BASE(session, WT_SESSION_drop), "force", NULL };

    bloom = NULL;
    chunk = NULL;
    dest = src = NULL;
    created_chunk = create_bloom = locked = in_sync = false;

    /* Fast path if it's obvious no merges could be done. */
    if (lsm_tree->nchunks < lsm_tree->merge_min &&
        lsm_tree->merge_aggressiveness < WT_LSM_AGGRESSIVE_THRESHOLD)
        return (WT_NOTFOUND);

    /*
     * Use the lsm_tree lock to read the chunks (so no switches occur), but
     * avoid holding it while the merge is in progress: that may take a
     * long time.
     */
    __wt_lsm_tree_writelock(session, lsm_tree);
    locked = true;

    WT_ERR(__lsm_merge_span(session,
        lsm_tree, id, &start_chunk, &end_chunk, &record_count));
    nchunks = (end_chunk + 1) - start_chunk;

    WT_ASSERT(session, nchunks > 0);
    start_id = lsm_tree->chunk[start_chunk]->id;

    /* Find the merge generation. */
    for (generation = 0, i = 0; i < nchunks; i++)
        generation = WT_MAX(generation,
            lsm_tree->chunk[start_chunk + i]->generation + 1);

    __wt_lsm_tree_writeunlock(session, lsm_tree);
    locked = false;

    /* Allocate an ID for the merge. */
    dest_id = __wt_atomic_add32(&lsm_tree->last, 1);

    /*
     * We only want to do the chunk loop if we're running with verbose,
     * so we wrap these statements in the conditional.  Avoid the loop
     * in the normal path.
     */
    if (WT_VERBOSE_ISSET(session, WT_VERB_LSM)) {
        __wt_verbose(session, WT_VERB_LSM,
            "Merging %s chunks %u-%u into %u (%" PRIu64 " records)"
            ", generation %" PRIu32,
            lsm_tree->name,
            start_chunk, end_chunk, dest_id, record_count, generation);
        for (verb = start_chunk; verb < end_chunk + 1; verb++)
            __wt_verbose(session, WT_VERB_LSM,
                "Merging %s: Chunk[%u] id %" PRIu32
                ", gen: %" PRIu32
                ", size: %" PRIu64 ", records: %" PRIu64,
                lsm_tree->name, verb, lsm_tree->chunk[verb]->id,
                lsm_tree->chunk[verb]->generation,
                lsm_tree->chunk[verb]->size,
                lsm_tree->chunk[verb]->count);
    }

    WT_ERR(__wt_calloc_one(session, &chunk));
    created_chunk = true;
    chunk->id = dest_id;

    if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_MERGED) &&
        (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST) ||
        start_chunk > 0) && record_count > 0)
        create_bloom = true;

    /*
     * Special setup for the merge cursor:
     * first, reset to open the dependent cursors;
     * then restrict the cursor to a specific number of chunks;
     * then set MERGE so the cursor doesn't track updates to the tree.
     */
    WT_ERR(__wt_open_cursor(session, lsm_tree->name, NULL, NULL, &src));
    F_SET(src, WT_CURSTD_RAW);
    WT_ERR(__wt_clsm_init_merge(src, start_chunk, start_id, nchunks));

    WT_WITH_SCHEMA_LOCK(session, ret,
        ret = __wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));
    WT_ERR(ret);
    if (create_bloom) {
        WT_ERR(__wt_lsm_tree_setup_bloom(session, lsm_tree, chunk));

        WT_ERR(__wt_bloom_create(session, chunk->bloom_uri,
            lsm_tree->bloom_config, record_count,
            lsm_tree->bloom_bit_count,
            lsm_tree->bloom_hash_count, &bloom));
    }

    /* Discard pages we read as soon as we're done with them. */
    F_SET(session, WT_SESSION_NO_CACHE);

    cfg[0] = WT_CONFIG_BASE(session, WT_SESSION_open_cursor);
    cfg[1] = "bulk,raw,skip_sort_check";
    cfg[2] = NULL;
    WT_ERR(__wt_open_cursor(session, chunk->uri, NULL, cfg, &dest));

#define LSM_MERGE_CHECK_INTERVAL    WT_THOUSAND
    for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) {
        if (insert_count % LSM_MERGE_CHECK_INTERVAL == 0) {
            if (!lsm_tree->active)
                WT_ERR(EINTR);

            WT_STAT_CONN_INCRV(session,
                lsm_rows_merged, LSM_MERGE_CHECK_INTERVAL);
            ++lsm_tree->merge_progressing;
        }

        WT_ERR(src->get_key(src, &key));
        dest->set_key(dest, &key);
        WT_ERR(src->get_value(src, &value));
        dest->set_value(dest, &value);
        WT_ERR(dest->insert(dest));
        if (create_bloom)
            __wt_bloom_insert(bloom, &key);
    }
    WT_ERR_NOTFOUND_OK(ret);

    WT_STAT_CONN_INCRV(session,
        lsm_rows_merged, insert_count % LSM_MERGE_CHECK_INTERVAL);
    ++lsm_tree->merge_progressing;
    __wt_verbose(session, WT_VERB_LSM,
        "Bloom size for %" PRIu64 " has %" PRIu64 " items inserted",
        record_count, insert_count);

    /*
     * Closing and syncing the files can take a while.  Set the
     * merge_syncing field so that compact knows it is still in
     * progress.
     */
    (void)__wt_atomic_add32(&lsm_tree->merge_syncing, 1);
    in_sync = true;

    /*
     * We've successfully created the new chunk.  Now install it.  We need
     * to ensure that the NO_CACHE flag is cleared and the bloom filter
     * is closed (even if a step fails), so track errors but don't return
     * until we've cleaned up.
     */
    WT_TRET(src->close(src));
    WT_TRET(dest->close(dest));
    src = dest = NULL;

    F_CLR(session, WT_SESSION_NO_CACHE);

    /*
     * We're doing advisory reads to fault the new trees into cache.
     * Don't block if the cache is full: our next unit of work may be to
     * discard some trees to free space.
     */
    F_SET(session, WT_SESSION_NO_EVICTION);

    if (create_bloom) {
        if (ret == 0)
            WT_TRET(__wt_bloom_finalize(bloom));

        /*
         * Read in a key to make sure the Bloom filter's btree handle
         * is open before it becomes visible to application threads.
         * Otherwise application threads will stall while it is opened
         * and internal pages are read into cache.
         */
        if (ret == 0) {
            WT_CLEAR(key);
            WT_TRET_NOTFOUND_OK(__wt_bloom_get(bloom, &key));
        }

        WT_TRET(__wt_bloom_close(bloom));
        bloom = NULL;
    }
    WT_ERR(ret);

    /*
     * Open a handle on the new chunk before application threads attempt
     * to access it, opening it pre-loads internal pages into the file
     * system cache.
     */
    cfg[1] = "checkpoint=" WT_CHECKPOINT;
    WT_ERR(__wt_open_cursor(session, chunk->uri, NULL, cfg, &dest));
    WT_TRET(dest->close(dest));
    dest = NULL;
    ++lsm_tree->merge_progressing;
    (void)__wt_atomic_sub32(&lsm_tree->merge_syncing, 1);
    in_sync = false;
    WT_ERR_NOTFOUND_OK(ret);

    WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk));

    __wt_lsm_tree_writelock(session, lsm_tree);
    locked = true;

    /*
     * Check whether we raced with another merge, and adjust the chunk
     * array offset as necessary.
     */
    if (start_chunk >= lsm_tree->nchunks ||
        lsm_tree->chunk[start_chunk]->id != start_id)
        for (start_chunk = 0;
            start_chunk < lsm_tree->nchunks;
            start_chunk++)
            if (lsm_tree->chunk[start_chunk]->id == start_id)
                break;

    /*
     * It is safe to error out here, since the update can only fail
     * prior to making updates to the tree.
     */
    WT_ERR(__wt_lsm_merge_update_tree(
        session, lsm_tree, start_chunk, nchunks, chunk));

    if (create_bloom)
        F_SET(chunk, WT_LSM_CHUNK_BLOOM);
    chunk->count = insert_count;
    chunk->generation = generation;
    F_SET(chunk, WT_LSM_CHUNK_ONDISK);

    /*
     * We have no current way of continuing if the metadata update fails,
     * so we will panic in that case.  Put some effort into cleaning up
     * after ourselves here, so things have a chance of shutting down.
     *
     * Any errors that happened after the tree was locked are fatal:
     * we can't guarantee the state of the tree.
     */
    if ((ret = __wt_lsm_meta_write(session, lsm_tree)) != 0)
        WT_PANIC_ERR(session, ret, "Failed finalizing LSM merge");

    lsm_tree->dsk_gen++;

    /* Update the throttling while holding the tree lock. */
    __wt_lsm_tree_throttle(session, lsm_tree, true);

    /* Schedule a pass to discard old chunks. */
    WT_ERR(__wt_lsm_manager_push_entry(
        session, WT_LSM_WORK_DROP, 0, lsm_tree));

err:
    if (locked)
        __wt_lsm_tree_writeunlock(session, lsm_tree);
    if (in_sync)
        (void)__wt_atomic_sub32(&lsm_tree->merge_syncing, 1);
    if (src != NULL)
        WT_TRET(src->close(src));
    if (dest != NULL)
        WT_TRET(dest->close(dest));
    if (bloom != NULL)
        WT_TRET(__wt_bloom_close(bloom));
    if (ret != 0 && created_chunk) {
        /* Drop the newly-created files on error. */
        if (chunk->uri != NULL) {
            WT_WITH_SCHEMA_LOCK(session, tret,
                tret = __wt_schema_drop(
                session, chunk->uri, drop_cfg));
            WT_TRET(tret);
        }
        if (create_bloom && chunk->bloom_uri != NULL) {
            WT_WITH_SCHEMA_LOCK(session, tret,
                tret = __wt_schema_drop(
                session, chunk->bloom_uri, drop_cfg));
            WT_TRET(tret);
        }
        __wt_free(session, chunk->bloom_uri);
        __wt_free(session, chunk->uri);
        __wt_free(session, chunk);

        if (ret == EINTR)
            __wt_verbose(session, WT_VERB_LSM,
                "Merge aborted due to close");
        else
            __wt_verbose(session, WT_VERB_LSM,
                "Merge failed with %s",
                __wt_strerror(session, ret, NULL, 0));
    }
    F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
    return (ret);
}
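
/*
 * Illustration (not WiredTiger internals): the merge loop above is the
 * classic "read a sorted source, bulk-load a destination" pattern.  A
 * minimal sketch of the same shape against the public WiredTiger API,
 * assuming string key/value formats ("key_format=S,value_format=S");
 * merge_into_bulk and its URIs are hypothetical names:
 */
#include <wiredtiger.h>

static int
merge_into_bulk(WT_SESSION *session, const char *src_uri, const char *dest_uri)
{
    WT_CURSOR *src, *dest;
    const char *key, *value;
    int ret;

    if ((ret = session->open_cursor(
        session, src_uri, NULL, NULL, &src)) != 0)
        return (ret);
    /* Bulk cursors require an empty destination and sorted input. */
    if ((ret = session->open_cursor(
        session, dest_uri, NULL, "bulk", &dest)) != 0) {
        (void)src->close(src);
        return (ret);
    }

    while ((ret = src->next(src)) == 0) {
        if ((ret = src->get_key(src, &key)) != 0 ||
            (ret = src->get_value(src, &value)) != 0)
            break;
        dest->set_key(dest, key);
        dest->set_value(dest, value);
        if ((ret = dest->insert(dest)) != 0)
            break;
    }
    if (ret == WT_NOTFOUND)    /* normal end of the source */
        ret = 0;

    (void)dest->close(dest);
    (void)src->close(src);
    return (ret);
}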
/*
 * __wt_delete_page_rollback --
 *     Abort pages that were deleted without being instantiated.
 */
void
__wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
{
    WT_UPDATE **upd;
    uint64_t sleep_count, yield_count;

    /*
     * If the page is still "deleted", it's as we left it, reset the state
     * to on-disk and we're done.  Otherwise, we expect the page is either
     * instantiated or being instantiated.  Loop because it's possible for
     * the page to return to the deleted state if instantiation fails.
     */
    for (sleep_count = yield_count = 0;;) {
        switch (ref->state) {
        case WT_REF_DISK:
        case WT_REF_LOOKASIDE:
        case WT_REF_READING:
            WT_ASSERT(session, 0);    /* Impossible, assert */
            break;
        case WT_REF_DELETED:
            /*
             * If the page is still "deleted", it's as we left it,
             * reset the state.
             */
            if (__wt_atomic_casv32(
                &ref->state, WT_REF_DELETED, WT_REF_DISK))
                return;
            break;
        case WT_REF_LOCKED:
            /*
             * A possible state, the page is being instantiated.
             */
            break;
        case WT_REF_MEM:
        case WT_REF_SPLIT:
            /*
             * We can't use the normal read path to get a copy of
             * the page because the session may have closed the
             * cursor, we no longer have the reference to the tree
             * required for a hazard pointer.  We're safe because
             * with unresolved transactions, the page isn't going
             * anywhere.
             *
             * The page is in an in-memory state, walk the list of
             * update structures and abort them.
             */
            for (upd =
                ref->page_del->update_list; *upd != NULL; ++upd)
                (*upd)->txnid = WT_TXN_ABORTED;

            /*
             * Discard the memory, the transaction can't abort
             * twice.
             */
            __wt_free(session, ref->page_del->update_list);
            __wt_free(session, ref->page_del);
            return;
        }

        /*
         * We wait for the change in page state, yield before retrying,
         * and if we've yielded enough times, start sleeping so we
         * don't burn CPU to no purpose.
         */
        __wt_ref_state_yield_sleep(&yield_count, &sleep_count);
        WT_STAT_CONN_INCRV(session,
            page_del_rollback_blocked, sleep_count);
    }
}
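
/*
 * Illustration (not WiredTiger source): the loop above is a CAS-driven
 * state machine that either undoes the delete or waits out a racing
 * instantiation.  A simplified C11 sketch with hypothetical state names:
 */
#include <sched.h>
#include <stdatomic.h>
#include <stdint.h>

enum { REF_DISK, REF_DELETED, REF_LOCKED, REF_MEM };

static void
rollback_deleted(_Atomic uint32_t *state)
{
    uint32_t expected;

    for (;;) {
        expected = REF_DELETED;
        if (atomic_compare_exchange_strong(state, &expected, REF_DISK))
            return;        /* still deleted: back to on-disk, done */
        if (expected != REF_LOCKED)
            return;        /* settled (e.g. instantiated in memory) */
        sched_yield();     /* instantiation in flight, wait it out */
    }
}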
/*
 * __log_slot_close --
 *     Close out the slot the caller is using.  The slot may already be
 *     closed or freed by another thread.
 */
static int
__log_slot_close(
    WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *releasep, bool forced)
{
    WT_CONNECTION_IMPL *conn;
    WT_LOG *log;
    int64_t end_offset, new_state, old_state;
#ifdef HAVE_DIAGNOSTIC
    uint64_t time_start, time_stop;
    int count;
#endif

    *releasep = false;

    WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT));
    conn = S2C(session);
    log = conn->log;
    if (slot == NULL)
        return (WT_NOTFOUND);
retry:
    old_state = slot->slot_state;
    /*
     * If this close is coming from a forced close and a thread is in
     * the middle of using the slot, return EBUSY.  The caller can
     * decide if retrying is necessary or not.
     */
    if (forced && WT_LOG_SLOT_INPROGRESS(old_state))
        return (__wt_set_return(session, EBUSY));
    /*
     * If someone else is switching out this slot we lost.  Nothing to
     * do but return.  Return WT_NOTFOUND anytime the given slot was
     * processed by another closing thread.  Only return 0 when we
     * actually closed the slot.
     */
    if (WT_LOG_SLOT_CLOSED(old_state)) {
        WT_STAT_CONN_INCR(session, log_slot_close_race);
        return (WT_NOTFOUND);
    }
    /*
     * If someone completely processed this slot, we're done.
     */
    if (FLD_LOG_SLOT_ISSET(
        (uint64_t)slot->slot_state, WT_LOG_SLOT_RESERVED)) {
        WT_STAT_CONN_INCR(session, log_slot_close_race);
        return (WT_NOTFOUND);
    }
    new_state = (old_state | WT_LOG_SLOT_CLOSE);
    /*
     * Close this slot.  If we lose the race retry.
     */
    if (!__wt_atomic_casiv64(&slot->slot_state, old_state, new_state))
        goto retry;
    /*
     * We own the slot now.  No one else can join.
     * Set the end LSN.
     */
    WT_STAT_CONN_INCR(session, log_slot_closes);
    if (WT_LOG_SLOT_DONE(new_state))
        *releasep = true;
    slot->slot_end_lsn = slot->slot_start_lsn;
    /*
     * A thread setting the unbuffered flag sets the unbuffered size after
     * setting the flag.  There could be a delay between a thread setting
     * the flag, a thread closing the slot, and the original thread setting
     * that value.  If the state is unbuffered, wait for the unbuffered
     * size to be set.
     */
#ifdef HAVE_DIAGNOSTIC
    count = 0;
    time_start = __wt_clock(session);
#endif
    if (WT_LOG_SLOT_UNBUFFERED_ISSET(old_state)) {
        while (slot->slot_unbuffered == 0) {
            WT_STAT_CONN_INCR(session, log_slot_close_unbuf);
            __wt_yield();
#ifdef HAVE_DIAGNOSTIC
            ++count;
            if (count > WT_MILLION) {
                time_stop = __wt_clock(session);
                if (WT_CLOCKDIFF_SEC(
                    time_stop, time_start) > 10) {
                    __wt_errx(session,
                        "SLOT_CLOSE: Slot %" PRIu32
                        " Timeout unbuffered, state 0x%"
                        PRIx64 " unbuffered %" PRId64,
                        (uint32_t)
                        (slot - &log->slot_pool[0]),
                        (uint64_t)slot->slot_state,
                        slot->slot_unbuffered);
                    __log_slot_dump(session);
                    __wt_abort(session);
                }
                count = 0;
            }
#endif
        }
    }

    end_offset =
        WT_LOG_SLOT_JOINED_BUFFERED(old_state) + slot->slot_unbuffered;
    slot->slot_end_lsn.l.offset += (uint32_t)end_offset;
    WT_STAT_CONN_INCRV(session, log_slot_consolidated, end_offset);
    /*
     * XXX Would like to change so one piece of code advances the LSN.
     */
    log->alloc_lsn = slot->slot_end_lsn;
    WT_ASSERT(session, log->alloc_lsn.l.file >= log->write_lsn.l.file);
    return (0);
}
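
/*
 * Illustration (not WiredTiger source): the slot state is a single 64-bit
 * word, so related counters (and flag bits, omitted here) can be updated
 * together by one atomic CAS.  A hypothetical packing with "joined" in the
 * high 32 bits and "released" in the low 32 bits:
 */
#include <stdint.h>

#define SLOT_JOINED(state)   ((int32_t)((uint64_t)(state) >> 32))
#define SLOT_RELEASED(state) ((int32_t)((uint64_t)(state) & 0xffffffffu))
#define SLOT_PACK(joined, released)                                     \
    ((int64_t)(((uint64_t)(uint32_t)(joined) << 32) |                   \
    (uint64_t)(uint32_t)(released)))

/* A writer joining "size" bytes proposes this old-state -> new-state move. */
static int64_t
slot_join_state(int64_t old_state, uint32_t size)
{
    return (SLOT_PACK(SLOT_JOINED(old_state) + (int32_t)size,
        SLOT_RELEASED(old_state)));
}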
/*
 * __wt_log_slot_join --
 *     Join a consolidated logging slot.
 */
void
__wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize,
    uint32_t flags, WT_MYSLOT *myslot)
{
    WT_CONNECTION_IMPL *conn;
    WT_LOG *log;
    WT_LOGSLOT *slot;
    uint64_t time_start, time_stop, usecs;
    int64_t flag_state, new_state, old_state, released;
    int32_t join_offset, new_join, wait_cnt;
    bool closed, diag_yield, raced, slept, unbuffered, yielded;

    conn = S2C(session);
    log = conn->log;
    time_start = time_stop = 0;

    WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOCKED_SLOT));
    WT_ASSERT(session, mysize != 0);

    /*
     * There should almost always be a slot open.
     */
    unbuffered = yielded = false;
    closed = raced = slept = false;
    wait_cnt = 0;
#ifdef HAVE_DIAGNOSTIC
    diag_yield = (++log->write_calls % 7) == 0;
    if ((log->write_calls % WT_THOUSAND) == 0 ||
        mysize > WT_LOG_SLOT_BUF_MAX) {
#else
    diag_yield = false;
    if (mysize > WT_LOG_SLOT_BUF_MAX) {
#endif
        unbuffered = true;
        F_SET(myslot, WT_MYSLOT_UNBUFFERED);
    }
    for (;;) {
        WT_BARRIER();
        slot = log->active_slot;
        old_state = slot->slot_state;
        if (WT_LOG_SLOT_OPEN(old_state)) {
            /*
             * Try to join our size into the existing size and
             * atomically write it back into the state.
             */
            flag_state = WT_LOG_SLOT_FLAGS(old_state);
            released = WT_LOG_SLOT_RELEASED(old_state);
            join_offset = WT_LOG_SLOT_JOINED(old_state);
            if (unbuffered)
                new_join = join_offset + WT_LOG_SLOT_UNBUFFERED;
            else
                new_join = join_offset + (int32_t)mysize;
            new_state = (int64_t)WT_LOG_SLOT_JOIN_REL(
                (int64_t)new_join, (int64_t)released,
                (int64_t)flag_state);

            /*
             * Braces used due to potential empty body warning.
             */
            if (diag_yield) {
                WT_DIAGNOSTIC_YIELD;
            }
            /*
             * Attempt to swap our size into the state.
             */
            if (__wt_atomic_casiv64(
                &slot->slot_state, old_state, new_state))
                break;
            WT_STAT_CONN_INCR(session, log_slot_races);
            raced = true;
        } else {
            WT_STAT_CONN_INCR(session, log_slot_active_closed);
            closed = true;
            ++wait_cnt;
        }
        if (!yielded)
            time_start = __wt_clock(session);
        yielded = true;
        /*
         * The slot is no longer open or we lost the race to
         * update it.  Yield and try again.
         */
        if (wait_cnt < WT_THOUSAND)
            __wt_yield();
        else {
            __wt_sleep(0, WT_THOUSAND);
            slept = true;
        }
    }
    /*
     * We joined this slot.  Fill in our information to return to
     * the caller.
     */
    if (!yielded)
        WT_STAT_CONN_INCR(session, log_slot_immediate);
    else {
        WT_STAT_CONN_INCR(session, log_slot_yield);
        time_stop = __wt_clock(session);
        usecs = WT_CLOCKDIFF_US(time_stop, time_start);
        WT_STAT_CONN_INCRV(session, log_slot_yield_duration, usecs);
        if (closed)
            WT_STAT_CONN_INCR(session, log_slot_yield_close);
        if (raced)
            WT_STAT_CONN_INCR(session, log_slot_yield_race);
        if (slept)
            WT_STAT_CONN_INCR(session, log_slot_yield_sleep);
    }
    if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC))
        F_SET(slot, WT_SLOT_SYNC_DIR);
    if (LF_ISSET(WT_LOG_FLUSH))
        F_SET(slot, WT_SLOT_FLUSH);
    if (LF_ISSET(WT_LOG_FSYNC))
        F_SET(slot, WT_SLOT_SYNC);
    if (F_ISSET(myslot, WT_MYSLOT_UNBUFFERED)) {
        WT_ASSERT(session, slot->slot_unbuffered == 0);
        WT_STAT_CONN_INCR(session, log_slot_unbuffered);
        slot->slot_unbuffered = (int64_t)mysize;
    }
    myslot->slot = slot;
    myslot->offset = join_offset;
    myslot->end_offset = (wt_off_t)((uint64_t)join_offset + mysize);
}

/*
 * __wt_log_slot_release --
 *     Each thread in a consolidated group releases its portion to
 *     signal it has completed copying its piece of the log into
 *     the memory buffer.
 */
int64_t
__wt_log_slot_release(WT_MYSLOT *myslot, int64_t size)
{
    WT_LOGSLOT *slot;
    wt_off_t cur_offset, my_start;
    int64_t my_size, rel_size;

    slot = myslot->slot;
    my_start = slot->slot_start_offset + myslot->offset;
    /*
     * We maintain the last starting offset within this slot.
     * This is used to know the offset of the last record that
     * was written rather than the beginning record of the slot.
     */
    while ((cur_offset = slot->slot_last_offset) < my_start) {
        /*
         * Set our offset if we are larger.
         */
        if (__wt_atomic_casiv64(
            &slot->slot_last_offset, cur_offset, my_start))
            break;
        /*
         * If we raced another thread updating this, try again.
         */
        WT_BARRIER();
    }
    /*
     * Add my size into the state and return the new size.
     */
    rel_size = size;
    if (F_ISSET(myslot, WT_MYSLOT_UNBUFFERED))
        rel_size = WT_LOG_SLOT_UNBUFFERED;
    my_size = (int64_t)WT_LOG_SLOT_JOIN_REL((int64_t)0, rel_size, 0);
    return (__wt_atomic_addiv64(&slot->slot_state, my_size));
}
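
/*
 * Illustration (not WiredTiger source): the slot_last_offset loop above is
 * an "atomic maximum".  A generic C11 sketch of the same pattern:
 */
#include <stdatomic.h>
#include <stdint.h>

/* Raise *max to at least val; safe against concurrent writers. */
static void
atomic_store_max(_Atomic int64_t *max, int64_t val)
{
    int64_t cur;

    while ((cur = atomic_load(max)) < val)
        if (atomic_compare_exchange_weak(max, &cur, val))
            break;    /* our value won; otherwise re-check the new max */
}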
/*
 * __log_file_server --
 *     The log file server thread.  This worker thread manages
 *     log file operations such as closing and syncing.
 */
static WT_THREAD_RET
__log_file_server(void *arg)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_FH *close_fh;
    WT_LOG *log;
    WT_LSN close_end_lsn, min_lsn;
    WT_SESSION_IMPL *session;
    uint64_t yield_count;
    uint32_t filenum;
    bool locked;

    session = arg;
    conn = S2C(session);
    log = conn->log;
    locked = false;
    yield_count = 0;
    while (F_ISSET(conn, WT_CONN_SERVER_LOG)) {
        /*
         * If there is a log file to close, make sure any outstanding
         * write operations have completed, then fsync and close it.
         */
        if ((close_fh = log->log_close_fh) != NULL) {
            WT_ERR(__wt_log_extract_lognum(session, close_fh->name,
                &filenum));
            /*
             * The closing file handle should have a correct close
             * LSN.
             */
            WT_ASSERT(session,
                log->log_close_lsn.l.file == filenum);

            if (__wt_log_cmp(
                &log->write_lsn, &log->log_close_lsn) >= 0) {
                /*
                 * We've copied the file handle, clear out the
                 * one in the log structure to allow it to be
                 * set again.  Copy the LSN before clearing
                 * the file handle.
                 * Use a barrier to make sure the compiler does
                 * not reorder the following two statements.
                 */
                close_end_lsn = log->log_close_lsn;
                WT_FULL_BARRIER();
                log->log_close_fh = NULL;
                /*
                 * Set the close_end_lsn to the LSN immediately
                 * after ours.  That is, the beginning of the
                 * next log file.  We need to know the LSN
                 * file number of our own close in case earlier
                 * calls are still in progress and the next one
                 * to move the sync_lsn into the next file for
                 * later syncs.
                 */
                WT_ERR(__wt_fsync(session, close_fh, true));

                /*
                 * We want to have the file size reflect actual
                 * data with minimal pre-allocated zeroed space.
                 * We can't truncate the file during hot backup,
                 * or the underlying file system may not support
                 * truncate: both are OK, it's just more work
                 * during cursor traversal.
                 */
                if (!conn->hot_backup) {
                    __wt_readlock(
                        session, &conn->hot_backup_lock);
                    if (!conn->hot_backup)
                        WT_ERR_ERROR_OK(
                            __wt_ftruncate(session,
                            close_fh,
                            close_end_lsn.l.offset),
                            ENOTSUP);
                    __wt_readunlock(
                        session, &conn->hot_backup_lock);
                }
                WT_SET_LSN(&close_end_lsn,
                    close_end_lsn.l.file + 1, 0);
                __wt_spin_lock(session, &log->log_sync_lock);
                locked = true;
                WT_ERR(__wt_close(session, &close_fh));
                WT_ASSERT(session, __wt_log_cmp(
                    &close_end_lsn, &log->sync_lsn) >= 0);
                log->sync_lsn = close_end_lsn;
                __wt_cond_signal(session, log->log_sync_cond);
                locked = false;
                __wt_spin_unlock(session, &log->log_sync_lock);
            }
        }

        /*
         * If a later thread asked for a background sync, do it now.
         */
        if (__wt_log_cmp(&log->bg_sync_lsn, &log->sync_lsn) > 0) {
            /*
             * Save the latest write LSN which is the minimum
             * we will have written to disk.
             */
            min_lsn = log->write_lsn;
            /*
             * We have to wait until the LSN we asked for is
             * written.  If it isn't, signal the wrlsn thread
             * to get it written.
             *
             * We also have to wait for the written LSN and the
             * sync LSN to be in the same file so that we know we
             * have synchronized all earlier log files.
             */
            if (__wt_log_cmp(&log->bg_sync_lsn, &min_lsn) <= 0) {
                /*
                 * If the sync file is behind either the one
                 * wanted for a background sync or the write LSN
                 * has moved to another file continue to let
                 * this worker thread process that older file
                 * immediately.
                 */
                if ((log->sync_lsn.l.file <
                    log->bg_sync_lsn.l.file) ||
                    (log->sync_lsn.l.file < min_lsn.l.file))
                    continue;
                WT_ERR(__wt_fsync(session, log->log_fh, true));
                __wt_spin_lock(session, &log->log_sync_lock);
                locked = true;
                /*
                 * The sync LSN could have advanced while we
                 * were writing to disk.
                 */
                if (__wt_log_cmp(
                    &log->sync_lsn, &min_lsn) <= 0) {
                    WT_ASSERT(session, min_lsn.l.file ==
                        log->sync_lsn.l.file);
                    log->sync_lsn = min_lsn;
                    __wt_cond_signal(
                        session, log->log_sync_cond);
                }
                locked = false;
                __wt_spin_unlock(session, &log->log_sync_lock);
            } else {
                __wt_cond_signal(session, conn->log_wrlsn_cond);
                /*
                 * We do not want to wait potentially a second
                 * to process this.  Yield to give the wrlsn
                 * thread a chance to run and try again in
                 * this case.
                 */
                yield_count++;
                __wt_yield();
                continue;
            }
        }

        /* Wait until the next event. */
        __wt_cond_wait(session, conn->log_file_cond, 100000, NULL);
    }

    if (0) {
err:        WT_PANIC_MSG(session, ret, "log close server error");
    }
    WT_STAT_CONN_INCRV(session, log_server_sync_blocked, yield_count);
    if (locked)
        __wt_spin_unlock(session, &log->log_sync_lock);
    return (WT_THREAD_RET_VALUE);
}
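
/*
 * Illustration (not WiredTiger source): the close-handle handoff above
 * copies the LSN out and only then clears the shared pointer, with a
 * barrier between so no thread ever sees the slot freed before the
 * payload was captured.  A C11 sketch with hypothetical names:
 */
#include <stdatomic.h>
#include <stdint.h>

struct handoff {
    uint64_t close_lsn;            /* payload set by the producer */
    _Atomic int close_pending;     /* nonzero while a request is queued */
};

static uint64_t
consume_close_request(struct handoff *h)
{
    uint64_t lsn = h->close_lsn;   /* copy the payload first... */
    atomic_store_explicit(&h->close_pending, 0,
        memory_order_release);     /* ...then publish the slot as free */
    return (lsn);
}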
/*
 * __block_write_off --
 *     Write a buffer into a block, returning the block's offset, size and
 *     checksum.
 */
static int
__block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
    WT_ITEM *buf, wt_off_t *offsetp, uint32_t *sizep, uint32_t *checksump,
    bool data_checksum, bool checkpoint_io, bool caller_locked)
{
    WT_BLOCK_HEADER *blk;
    WT_DECL_RET;
    WT_FH *fh;
    wt_off_t offset;
    size_t align_size;
    uint32_t checksum;
    bool local_locked;

    *offsetp = 0;    /* -Werror=maybe-uninitialized */
    *sizep = 0;      /* -Werror=maybe-uninitialized */
    *checksump = 0;  /* -Werror=maybe-uninitialized */

    fh = block->fh;

    /*
     * Clear the block header to ensure all of it is initialized, even the
     * unused fields.
     */
    blk = WT_BLOCK_HEADER_REF(buf->mem);
    memset(blk, 0, sizeof(*blk));

    /* Buffers should be aligned for writing. */
    if (!F_ISSET(buf, WT_ITEM_ALIGNED)) {
        WT_ASSERT(session, F_ISSET(buf, WT_ITEM_ALIGNED));
        WT_RET_MSG(session, EINVAL,
            "direct I/O check: write buffer incorrectly allocated");
    }

    /*
     * Align the size to an allocation unit.
     *
     * The buffer must be big enough for us to zero to the next allocsize
     * boundary, this is one of the reasons the btree layer must find out
     * from the block-manager layer the maximum size of the eventual write.
     */
    align_size = WT_ALIGN(buf->size, block->allocsize);
    if (align_size > buf->memsize) {
        WT_ASSERT(session, align_size <= buf->memsize);
        WT_RET_MSG(session, EINVAL,
            "buffer size check: write buffer incorrectly allocated");
    }
    if (align_size > UINT32_MAX) {
        WT_ASSERT(session, align_size <= UINT32_MAX);
        WT_RET_MSG(session, EINVAL,
            "buffer size check: write buffer too large to write");
    }

    /* Zero out any unused bytes at the end of the buffer. */
    memset((uint8_t *)buf->mem + buf->size, 0, align_size - buf->size);

    /*
     * Set the disk size so we don't have to incrementally read blocks
     * during salvage.
     */
    blk->disk_size = WT_STORE_SIZE(align_size);

    /*
     * Update the block's checksum: if our caller specifies, checksum the
     * complete data, otherwise checksum the leading WT_BLOCK_COMPRESS_SKIP
     * bytes.  The assumption is applications with good compression support
     * turn off checksums and assume corrupted blocks won't decompress
     * correctly.  However, if compression failed to shrink the block, the
     * block wasn't compressed, in which case our caller will tell us to
     * checksum the data to detect corruption.  If compression succeeded,
     * we still need to checksum the first WT_BLOCK_COMPRESS_SKIP bytes
     * because they're not compressed, both to give salvage a quick test
     * of whether a block is useful and to give us a test so we don't lose
     * the first WT_BLOCK_COMPRESS_SKIP bytes without noticing.
     *
     * Checksum a little-endian version of the header, and write everything
     * in little-endian format.  The checksum is (potentially) returned in
     * a big-endian format, swap it into place in a separate step.
     */
    blk->flags = 0;
    if (data_checksum)
        F_SET(blk, WT_BLOCK_DATA_CKSUM);
    blk->checksum = 0;
    __wt_block_header_byteswap(blk);
    blk->checksum = checksum = __wt_checksum(
        buf->mem, data_checksum ? align_size : WT_BLOCK_COMPRESS_SKIP);
#ifdef WORDS_BIGENDIAN
    blk->checksum = __wt_bswap32(blk->checksum);
#endif

    /* Pre-allocate some number of extension structures. */
    WT_RET(__wt_block_ext_prealloc(session, 5));

    /*
     * Acquire a lock, if we don't already hold one.
     * Allocate space for the write, and optionally extend the file (note
     * the block-extend function may release the lock).
     * Release any locally acquired lock.
     */
    local_locked = false;
    if (!caller_locked) {
        __wt_spin_lock(session, &block->live_lock);
        local_locked = true;
    }
    ret = __wt_block_alloc(session, block, &offset, (wt_off_t)align_size);
    if (ret == 0)
        ret = __wt_block_extend(
            session, block, fh, offset, align_size, &local_locked);
    if (local_locked)
        __wt_spin_unlock(session, &block->live_lock);
    WT_RET(ret);

    /* Write the block. */
    if ((ret =
        __wt_write(session, fh, offset, align_size, buf->mem)) != 0) {
        if (!caller_locked)
            __wt_spin_lock(session, &block->live_lock);
        WT_TRET(__wt_block_off_free(
            session, block, offset, (wt_off_t)align_size));
        if (!caller_locked)
            __wt_spin_unlock(session, &block->live_lock);
        WT_RET(ret);
    }

    /*
     * Optionally schedule writes for dirty pages in the system buffer
     * cache, but only if the current session can wait.
     */
    if (block->os_cache_dirty_max != 0 &&
        fh->written > block->os_cache_dirty_max &&
        __wt_session_can_wait(session)) {
        fh->written = 0;
        if ((ret = __wt_fsync(session, fh, false)) != 0) {
            /*
             * Ignore ENOTSUP, but don't try again.
             */
            if (ret != ENOTSUP)
                return (ret);
            block->os_cache_dirty_max = 0;
        }
    }

    /* Optionally discard blocks from the buffer cache. */
    WT_RET(__wt_block_discard(session, block, align_size));

    WT_STAT_CONN_INCR(session, block_write);
    WT_STAT_CONN_INCRV(session, block_byte_write, align_size);
    if (checkpoint_io)
        WT_STAT_CONN_INCRV(
            session, block_byte_write_checkpoint, align_size);

    __wt_verbose(session, WT_VERB_WRITE,
        "off %" PRIuMAX ", size %" PRIuMAX ", checksum %#" PRIx32,
        (uintmax_t)offset, (uintmax_t)align_size, checksum);

    *offsetp = offset;
    *sizep = WT_STORE_SIZE(align_size);
    *checksump = checksum;

    return (0);
}
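
/*
 * Illustration (not WiredTiger source): the size alignment and tail
 * zeroing above, reduced to their arithmetic.  With a 4KB allocation
 * unit, a 6000-byte page image is written as 8192 bytes with the last
 * 2192 bytes zeroed so the padding is deterministic:
 */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Round len up to a multiple of allocsize (a power of two). */
#define ALIGN_UP(len, allocsize)                                        \
    (((len) + (allocsize) - 1) & ~((size_t)(allocsize) - 1))

static size_t
pad_to_alloc(uint8_t *buf, size_t len, size_t allocsize)
{
    size_t aligned = ALIGN_UP(len, allocsize);

    memset(buf + len, 0, aligned - len);    /* deterministic padding */
    return (aligned);
}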