/*
 * __wt_ovfl_read --
 *	Read an overflow item into the caller's buffer.
 *
 *	When no page is supplied the item is read directly, without locking.
 *	Otherwise the btree's overflow lock is read-held while the on-page
 *	cell type is examined: a WT_CELL_VALUE_OVFL_RM cell is resolved via
 *	__wt_ovfl_txnc_search on the page, any other cell via __ovfl_read.
 *
 *	Returns the first error from taking the lock, reading or searching
 *	for the item, or releasing the lock (as merged by WT_TRET).
 */
int
__wt_ovfl_read(WT_SESSION_IMPL *session,
    WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store)
{
	WT_DECL_RET;

	/*
	 * No page: nothing to lock, no cache to search, and removed overflow
	 * cells (WT_CELL_VALUE_OVFL_RM) are of no interest.
	 */
	if (page == NULL)
		return (
		    __ovfl_read(session, unpack->data, unpack->size, store));

	/*
	 * Reconciliation may have deleted the overflow value while a reader
	 * that still needs it was running; in that case the on-page cell type
	 * was reset to WT_CELL_VALUE_OVFL_RM and the value has to be looked up
	 * in the cache of such values hanging off the page.
	 *
	 * The cell type must be tested while holding the overflow lock.
	 */
	WT_RET(__wt_readlock(session, S2BT(session)->ovfl_lock));
	if (__wt_cell_type_raw(unpack->cell) == WT_CELL_VALUE_OVFL_RM)
		ret = __wt_ovfl_txnc_search(
		    page, unpack->data, unpack->size, store);
	else
		ret = __ovfl_read(
		    session, unpack->data, unpack->size, store);
	WT_TRET(__wt_readunlock(session, S2BT(session)->ovfl_lock));

	return (ret);
}
/*
 * __txn_rollback_to_stable_check --
 *	Sanity check a rollback-to-stable request.
 *
 *	Fails with EINVAL if no stable timestamp has been set, or if
 *	__wt_txn_are_any_active reports active transactions. Returns 0 when
 *	the request may proceed.
 */
static int
__txn_rollback_to_stable_check(WT_SESSION_IMPL *session)
{
	WT_TXN_GLOBAL *txn_global;
	bool have_active, no_stable;

	txn_global = &S2C(session)->txn_global;

	/* Sample the stable timestamp while holding the global read lock. */
	__wt_readlock(session, &txn_global->rwlock);
	no_stable = __wt_timestamp_iszero(&txn_global->stable_timestamp);
	__wt_readunlock(session, &txn_global->rwlock);

	if (no_stable)
		WT_RET_MSG(session, EINVAL,
		    "rollback_to_stable requires a "
		    "stable timestamp");

	/*
	 * Be helpful and detect active transactions. Checking each session's
	 * transaction-running flag would be preferable, but that means
	 * reaching into every open session, which isn't appropriate here.
	 */
	WT_RET(__wt_txn_are_any_active(session, &have_active));
	if (have_active)
		WT_RET_MSG(session, EINVAL,
		    "rollback_to_stable illegal with active transactions");

	return (0);
}
/*
 * __wt_lsm_tree_readlock --
 *	Take a shared (read) lock on an LSM tree.
 *
 *	On success the session is flagged WT_SESSION_NO_CACHE_CHECK and
 *	WT_SESSION_NO_SCHEMA_LOCK and 0 is returned. On failure the lock
 *	error is returned and the session flags are left untouched.
 */
int
__wt_lsm_tree_readlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
	int ret;

	if ((ret = __wt_readlock(session, lsm_tree->rwlock)) != 0)
		return (ret);

	/*
	 * Diagnostic: guard against deadlocks with the schema lock. If an
	 * operation needs the schema lock, it should already be held by now.
	 */
	F_SET(session, WT_SESSION_NO_CACHE_CHECK | WT_SESSION_NO_SCHEMA_LOCK);

	return (0);
}
/*
 * __wt_lsm_tree_lock --
 *	Lock an LSM tree for reading or writing.
 *
 *	If "exclusive" is non-zero the tree's lock is taken for writing,
 *	otherwise for reading. Returns 0 on success or the error from the
 *	underlying lock call.
 *
 *	The session is flagged WT_SESSION_NO_SCHEMA_LOCK only once the lock
 *	has actually been acquired: a failed lock attempt must not leave the
 *	session marked as if it were holding the tree lock. This matches the
 *	ordering used by __wt_lsm_tree_readlock.
 */
int
__wt_lsm_tree_lock(
    WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, int exclusive)
{
	int ret;

	ret = exclusive ?
	    __wt_writelock(session, lsm_tree->rwlock) :
	    __wt_readlock(session, lsm_tree->rwlock);
	if (ret != 0)
		return (ret);

	/*
	 * Diagnostic: avoid deadlocks with the schema lock: if we need it for
	 * an operation, we should already have it.
	 */
	F_SET(session, WT_SESSION_NO_SCHEMA_LOCK);

	return (0);
}
/*
 * __wt_ovfl_read --
 *	Read an overflow item into the caller's buffer.
 *
 *	*decoded is cleared on entry and set to true only when the value was
 *	taken from the page's list of removed overflow items instead of being
 *	read with __ovfl_read. In that case "store" is pointed at the tracked
 *	data and size rather than filled by a read.
 *
 *	Returns 0 or the error from __ovfl_read.
 */
int
__wt_ovfl_read(WT_SESSION_IMPL *session,
    WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store, bool *decoded)
{
	WT_DECL_RET;
	WT_OVFL_TRACK *track;
	size_t slot;

	*decoded = false;

	/*
	 * No page: nothing to lock, no cache to search, and removed overflow
	 * cells (WT_CELL_VALUE_OVFL_RM) are of no interest.
	 */
	if (page == NULL)
		return (
		    __ovfl_read(session, unpack->data, unpack->size, store));

	/*
	 * Reconciliation may have deleted the overflow value while a reader
	 * that still needs it was running; in that case the on-page cell type
	 * was reset to WT_CELL_VALUE_OVFL_RM and a page is passed in so the
	 * on-page cell can be checked.
	 *
	 * The cell type must be tested while holding the overflow lock.
	 */
	__wt_readlock(session, &S2BT(session)->ovfl_lock);
	if (__wt_cell_type_raw(unpack->cell) != WT_CELL_VALUE_OVFL_RM)
		ret = __ovfl_read(
		    session, unpack->data, unpack->size, store);
	else {
		/* Find the tracked entry that was saved for this cell. */
		track = page->modify->ovfl_track;
		for (slot = 0; slot < track->remove_next; ++slot) {
			if (track->remove[slot].cell != unpack->cell)
				continue;
			store->data = track->remove[slot].data;
			store->size = track->remove[slot].size;
			break;
		}

		/* The removed value is expected to always be present. */
		WT_ASSERT(session, slot < track->remove_next);
		*decoded = true;
	}
	__wt_readunlock(session, &S2BT(session)->ovfl_lock);

	return (ret);
}
/*
 * __log_server --
 *	The log server thread.
 *
 *	Runs until WT_CONN_LOG_SERVER_RUN is cleared on the connection or an
 *	error occurs. Each pass forces out buffered log writes; at most once
 *	per WT_THOUSAND milliseconds (or whenever the condition variable was
 *	signalled) it also performs log pre-allocation and log archiving.
 *
 *	"arg" is the thread's WT_SESSION_IMPL. Errors are reported through
 *	__wt_err and terminate the loop; the thread always returns
 *	WT_THREAD_RET_VALUE.
 */
static WT_THREAD_RET
__log_server(void *arg)
{
	struct timespec start, now;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_LOG *log;
	WT_SESSION_IMPL *session;
	uint64_t timediff;
	bool did_work, locked, signalled;

	session = arg;
	conn = S2C(session);
	log = conn->log;
	/* "locked" tracks whether the hot backup read lock is currently held. */
	locked = signalled = false;

	/*
	 * Set this to the number of milliseconds we want to run archive and
	 * pre-allocation. Start it so that we run on the first time through.
	 */
	timediff = WT_THOUSAND;

	/*
	 * The log server thread does a variety of work. It forces out any
	 * buffered log writes. It pre-allocates log files and it performs
	 * log archiving. The reason the wrlsn thread does not force out
	 * the buffered writes is because we want to process and move the
	 * write_lsn forward as quickly as possible. The same reason applies
	 * to why the log file server thread does not force out the writes.
	 * That thread does fsync calls which can take a long time and we
	 * don't want log records sitting in the buffer over the time it
	 * takes to sync out an earlier file.
	 */
	did_work = true;
	while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) {
		/*
		 * Slots depend on future activity. Force out buffered
		 * writes in case we are idle. This cannot be part of the
		 * wrlsn thread because of interaction advancing the write_lsn
		 * and a buffer may need to wait for the write_lsn to advance
		 * in the case of a synchronous buffer. We end up with a hang.
		 */
		WT_ERR_BUSY_OK(__wt_log_force_write(session, 0, &did_work));

		/*
		 * We don't want to archive or pre-allocate files as often as
		 * we want to force out log buffers. Only do it once per second
		 * or if the condition was signalled.
		 */
		if (timediff >= WT_THOUSAND || signalled) {

			/*
			 * Perform log pre-allocation.
			 */
			if (conn->log_prealloc > 0) {
				/*
				 * Log file pre-allocation is disabled when a
				 * hot backup cursor is open because we have
				 * agreed not to rename or remove any files in
				 * the database directory.
				 */
				WT_ERR(__wt_readlock(
				    session, conn->hot_backup_lock));
				locked = true;
				if (!conn->hot_backup)
					WT_ERR(__log_prealloc_once(session));
				WT_ERR(__wt_readunlock(
				    session, conn->hot_backup_lock));
				locked = false;
			}

			/*
			 * Perform the archive. The archive lock is only tried,
			 * never waited for: if it is busy, archiving is skipped
			 * for this pass and a verbose message is logged.
			 */
			if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ARCHIVE)) {
				if (__wt_try_writelock(
				    session, log->log_archive_lock) == 0) {
					/*
					 * Release the lock before acting on
					 * any archive error.
					 */
					ret = __log_archive_once(session, 0);
					WT_TRET(__wt_writeunlock(
					    session, log->log_archive_lock));
					WT_ERR(ret);
				} else
					WT_ERR(
					    __wt_verbose(session, WT_VERB_LOG,
					    "log_archive: Blocked due to open "
					    "log cursor holding archive lock"));
			}
		}

		/*
		 * Wait until the next event, measuring how long we waited so
		 * the once-per-second work above can be scheduled.
		 */
		WT_ERR(__wt_epoch(session, &start));
		WT_ERR(__wt_cond_auto_wait_signal(session,
		    conn->log_cond, did_work, &signalled));
		WT_ERR(__wt_epoch(session, &now));
		timediff = WT_TIMEDIFF_MS(now, start);
	}

	/* The error path is only reachable through the "err" label. */
	if (0) {
err:		__wt_err(session, ret, "log server error");
		/* Drop the hot backup lock if the failure happened under it. */
		if (locked)
			WT_TRET(__wt_readunlock(
			    session, conn->hot_backup_lock));
	}
	return (WT_THREAD_RET_VALUE);
}
/*
 * __wt_curlog_open --
 *	Open and initialize a log cursor.
 *
 *	Fails with EINVAL if logging is not enabled on the connection. On
 *	success the new cursor has been initialized through __wt_cursor_init
 *	(which is handed cursorp) and the log archive lock is read-held. On
 *	failure everything allocated here is released and *cursorp is set to
 *	NULL.
 */
int
__wt_curlog_open(WT_SESSION_IMPL *session,
    const char *uri, const char *cfg[], WT_CURSOR **cursorp)
{
	WT_CONNECTION_IMPL *conn;
	WT_CURSOR_STATIC_INIT(iface,
	    __wt_cursor_get_key,		/* get-key */
	    __wt_cursor_get_value,		/* get-value */
	    __wt_cursor_set_key,		/* set-key */
	    __wt_cursor_set_value,		/* set-value */
	    __curlog_compare,			/* compare */
	    __wt_cursor_equals,			/* equals */
	    __curlog_next,			/* next */
	    __wt_cursor_notsup,			/* prev */
	    __curlog_reset,			/* reset */
	    __curlog_search,			/* search */
	    __wt_cursor_search_near_notsup,	/* search-near */
	    __wt_cursor_notsup,			/* insert */
	    __wt_cursor_notsup,			/* update */
	    __wt_cursor_notsup,			/* remove */
	    __wt_cursor_reconfigure_notsup,	/* reconfigure */
	    __curlog_close);			/* close */
	WT_CURSOR *cursor;
	WT_CURSOR_LOG *logc;
	WT_DECL_RET;
	WT_LOG *log;

	/* The generic cursor must be the first member of the log cursor. */
	WT_STATIC_ASSERT(offsetof(WT_CURSOR_LOG, iface) == 0);

	conn = S2C(session);
	if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))
		WT_RET_MSG(session, EINVAL,
		    "Cannot open a log cursor without logging enabled");
	log = conn->log;

	/* Allocate the cursor and install the method table. */
	logc = NULL;
	WT_RET(__wt_calloc_one(session, &logc));
	cursor = &logc->iface;
	*cursor = iface;
	cursor->session = &session->iface;

	/* Allocate the LSNs and the scratch buffers the cursor works with. */
	WT_ERR(__wt_calloc_one(session, &logc->cur_lsn));
	WT_ERR(__wt_calloc_one(session, &logc->next_lsn));
	WT_ERR(__wt_scr_alloc(session, 0, &logc->logrec));
	WT_ERR(__wt_scr_alloc(session, 0, &logc->opkey));
	WT_ERR(__wt_scr_alloc(session, 0, &logc->opvalue));

	cursor->key_format = WT_LOGC_KEY_FORMAT;
	cursor->value_format = WT_LOGC_VALUE_FORMAT;

	WT_INIT_LSN(logc->cur_lsn);
	WT_INIT_LSN(logc->next_lsn);

	WT_ERR(__wt_cursor_init(cursor, uri, NULL, cfg, cursorp));

	/*
	 * The application may want to read a record it has only just written.
	 * Log records can be buffered, so push any of them out now.
	 */
	WT_ERR(__wt_log_force_write(session, 1));

	/* Log cursors block archiving. */
	WT_ERR(__wt_readlock(session, log->log_archive_lock));

	return (0);

err:	if (!F_ISSET(cursor, WT_CURSTD_OPEN)) {
		/* Never fully opened: release the pieces by hand. */
		__wt_free(session, logc->cur_lsn);
		__wt_free(session, logc->next_lsn);
		__wt_scr_free(session, &logc->logrec);
		__wt_scr_free(session, &logc->opkey);
		__wt_scr_free(session, &logc->opvalue);
		/*
		 * NOTE: the error path cannot be reached with the archive
		 * read lock held, so there is nothing to unlock here. That
		 * must be revisited if the code above changes.
		 */
		__wt_free(session, logc);
	} else
		WT_TRET(cursor->close(cursor));

	*cursorp = NULL;
	return (ret);
}
/*
 * __wt_txn_update_oldest --
 *	Sweep the running transactions to update the oldest ID required.
 *
 *	"flags" may include WT_TXN_OLDEST_STRICT (update even if the IDs have
 *	moved by fewer than 100) and WT_TXN_OLDEST_WAIT (block for the scan
 *	lock instead of trying it). Without WT_TXN_OLDEST_WAIT, a busy scan
 *	lock is not an error: the function gives up and returns 0.
 *
 *	The scan is done first under a read lock; only if an update looks
 *	necessary is the write lock taken and the scan repeated before the
 *	global oldest_id and last_running values are advanced.
 */
int
__wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_SESSION_IMPL *oldest_session;
	WT_TXN_GLOBAL *txn_global;
	uint64_t current_id, last_running, oldest_id;
	uint64_t prev_last_running, prev_oldest_id;
	bool strict, wait;

	conn = S2C(session);
	txn_global = &conn->txn_global;
	strict = LF_ISSET(WT_TXN_OLDEST_STRICT);
	wait = LF_ISSET(WT_TXN_OLDEST_WAIT);

	/* Snapshot the global state before taking any lock. */
	current_id = last_running = txn_global->current;
	prev_last_running = txn_global->last_running;
	prev_oldest_id = txn_global->oldest_id;

	/*
	 * For pure read-only workloads, or if the update isn't forced and the
	 * oldest ID isn't too far behind, avoid scanning.
	 */
	if (prev_oldest_id == current_id ||
	    (!strict && WT_TXNID_LT(current_id, prev_oldest_id + 100)))
		return (0);

	/* First do a read-only scan. */
	if (wait)
		__wt_readlock(session, txn_global->scan_rwlock);
	else if ((ret =
	    __wt_try_readlock(session, txn_global->scan_rwlock)) != 0)
		return (ret == EBUSY ? 0 : ret);
	__txn_oldest_scan(session, &oldest_id, &last_running, &oldest_session);
	__wt_readunlock(session, txn_global->scan_rwlock);

	/*
	 * If the state hasn't changed (or hasn't moved far enough for
	 * non-forced updates), give up.
	 */
	if ((oldest_id == prev_oldest_id ||
	    (!strict && WT_TXNID_LT(oldest_id, prev_oldest_id + 100))) &&
	    ((last_running == prev_last_running) ||
	    (!strict && WT_TXNID_LT(last_running, prev_last_running + 100))))
		return (0);

	/* It looks like an update is necessary, wait for exclusive access. */
	if (wait)
		__wt_writelock(session, txn_global->scan_rwlock);
	else if ((ret =
	    __wt_try_writelock(session, txn_global->scan_rwlock)) != 0)
		return (ret == EBUSY ? 0 : ret);

	/*
	 * If the oldest ID has been updated while we waited, don't bother
	 * scanning.
	 */
	if (WT_TXNID_LE(oldest_id, txn_global->oldest_id) &&
	    WT_TXNID_LE(last_running, txn_global->last_running))
		goto done;

	/*
	 * Re-scan now that we have exclusive access. This is necessary because
	 * threads get transaction snapshots with read locks, and we have to be
	 * sure that there isn't a thread that has got a snapshot locally but
	 * not yet published its snap_min.
	 */
	__txn_oldest_scan(session, &oldest_id, &last_running, &oldest_session);

#ifdef HAVE_DIAGNOSTIC
	{
	/*
	 * Make sure the ID doesn't move past any named snapshots.
	 *
	 * Don't include the read/assignment in the assert statement. Coverity
	 * complains if there are assignments only done in diagnostic builds,
	 * and when the read is from a volatile.
	 */
	uint64_t id = txn_global->nsnap_oldest_id;
	WT_ASSERT(session,
	    id == WT_TXN_NONE || !WT_TXNID_LT(id, oldest_id));
	}
#endif
	/* Update the oldest ID; the global values only ever move forward. */
	if (WT_TXNID_LT(txn_global->oldest_id, oldest_id))
		txn_global->oldest_id = oldest_id;
	if (WT_TXNID_LT(txn_global->last_running, last_running)) {
		txn_global->last_running = last_running;

#ifdef HAVE_VERBOSE
		/*
		 * Output a verbose message about long-running transactions
		 * (the oldest ID is more than 10000 behind the current ID),
		 * but only when some progress is being made.
		 */
		if (WT_VERBOSE_ISSET(session, WT_VERB_TRANSACTION) &&
		    current_id - oldest_id > 10000 && oldest_session != NULL) {
			__wt_verbose(session, WT_VERB_TRANSACTION,
			    "old snapshot %" PRIu64
			    " pinned in session %" PRIu32 " [%s]"
			    " with snap_min %" PRIu64 "\n",
			    oldest_id, oldest_session->id,
			    oldest_session->lastop,
			    oldest_session->txn.snap_min);
		}
#endif
	}

	/* Only reached with the scan write lock held. */
done:	__wt_writeunlock(session, txn_global->scan_rwlock);
	return (ret);
}
/*
 * __conn_dhandle_open_lock --
 *	Spin on the current data handle until either (a) it is open, read
 *	locked; or (b) it is closed, write locked.  If exclusive access is
 *	requested and cannot be granted immediately because the handle is
 *	in use, fail with EBUSY.
 *
 *	On a successful return the caller holds either the handle's read lock
 *	(handle open, shared access requested) or its write lock, in which
 *	case WT_DHANDLE_EXCLUSIVE has been set on the handle.
 *
 * Here is a brief summary of how different operations synchronize using
 * either the schema lock, handle locks or handle flags:
 *
 * open -- holds the schema lock, one thread gets the handle exclusive,
 *	reverts to a shared handle lock and drops the schema lock
 *	once the handle is open;
 * bulk load -- sets bulk and exclusive;
 * salvage, truncate, update, verify -- hold the schema lock, set a
 *	"special" flag;
 * sweep -- gets a write lock on the handle, doesn't set exclusive
 *
 * The schema lock prevents a lot of potential conflicts: we should never
 * see handles being salvaged or verified because those operations hold the
 * schema lock.  However, it is possible to see a handle that is being
 * bulk loaded, or that the sweep server is closing.
 *
 * The principle here is that application operations can cause other
 * application operations to fail (so attempting to open a cursor on a
 * file while it is being bulk-loaded will fail), but internal or
 * database-wide operations should not prevent application-initiated
 * operations.  For example, attempting to verify a file should not fail
 * because the sweep server happens to be in the process of closing that
 * file.
 */
static int
__conn_dhandle_open_lock(
    WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle, uint32_t flags)
{
	WT_BTREE *btree;
	WT_DECL_RET;
	int is_open, lock_busy, want_exclusive;

	btree = dhandle->handle;
	/* Set once an attempt to get the write lock has failed with EBUSY. */
	lock_busy = 0;
	want_exclusive = LF_ISSET(WT_DHANDLE_EXCLUSIVE) ? 1 : 0;

	/*
	 * Check that the handle is open.  We've already incremented
	 * the reference count, so once the handle is open it won't be
	 * closed by another thread.
	 *
	 * If we can see the WT_DHANDLE_OPEN flag set while holding a
	 * lock on the handle, then it's really open and we can start
	 * using it.  Alternatively, if we can get an exclusive lock
	 * and WT_DHANDLE_OPEN is still not set, we need to do the open.
	 */
	for (;;) {
		/*
		 * If the handle is already open for a special operation,
		 * give up.
		 */
		if (F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS))
			return (EBUSY);

		/*
		 * If the handle is open, get a read lock and recheck.
		 *
		 * Wait for a read lock if we want exclusive access and failed
		 * to get it: the sweep server may be closing this handle, and
		 * we need to wait for it to complete.  If we want exclusive
		 * access and find the handle open once we get the read lock,
		 * give up: some other thread has it locked for real.
		 */
		if (F_ISSET(dhandle, WT_DHANDLE_OPEN) &&
		    (!want_exclusive || lock_busy)) {
			WT_RET(__wt_readlock(session, dhandle->rwlock));
			is_open = F_ISSET(dhandle, WT_DHANDLE_OPEN) ? 1 : 0;
			/* Shared access to an open handle: keep the lock. */
			if (is_open && !want_exclusive)
				return (0);
			WT_RET(__wt_readunlock(session, dhandle->rwlock));
		} else
			is_open = 0;

		/*
		 * It isn't open or we want it exclusive: try to get an
		 * exclusive lock.  There is some subtlety here: if we race
		 * with another thread that successfully opens the file, we
		 * don't want to block waiting to get exclusive access.
		 */
		if ((ret = __wt_try_writelock(session, dhandle->rwlock)) == 0) {
			/*
			 * If it was opened while we waited, drop the write
			 * lock and get a read lock instead.
			 */
			if (F_ISSET(dhandle, WT_DHANDLE_OPEN) &&
			    !want_exclusive) {
				lock_busy = 0;
				WT_RET(
				    __wt_writeunlock(session, dhandle->rwlock));
				continue;
			}

			/* We have an exclusive lock, we're done. */
			F_SET(dhandle, WT_DHANDLE_EXCLUSIVE);
			return (0);
		} else if (ret != EBUSY || (is_open && want_exclusive))
			/*
			 * A real error, or (with EBUSY) the handle was seen
			 * open under the read lock and exclusive access was
			 * requested: return it to the caller.
			 */
			return (ret);
		else
			lock_busy = 1;

		/* Give other threads a chance to make progress. */
		__wt_yield();
	}
}
/*
 * __wt_txn_get_snapshot --
 *	Build a snapshot of concurrent transaction IDs for this session.
 *
 *	The global transaction lock is read-held while the IDs are collected
 *	into txn->snapshot and the session's pinned ID is published; the
 *	collected IDs are then handed to __txn_sort_snapshot.
 */
void
__wt_txn_get_snapshot(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_TXN *txn;
	WT_TXN_GLOBAL *txn_global;
	WT_TXN_STATE *s, *txn_state;
	uint64_t current_id, id;
	uint64_t prev_oldest_id, pinned_id;
	uint32_t i, n, session_cnt;

	conn = S2C(session);
	txn = &session->txn;
	txn_global = &conn->txn_global;
	txn_state = WT_SESSION_TXN_STATE(session);
	n = 0;

	/* The state table is about to be scanned: wait for the lock. */
	__wt_readlock(session, &txn_global->rwlock);

	current_id = pinned_id = txn_global->current;
	prev_oldest_id = txn_global->oldest_id;

	/*
	 * If a checkpoint transaction is running, include it: uncommitted
	 * changes the checkpoint has written to the metadata must be ignored.
	 * The checkpoint's changes needn't stay pinned, so its ID is left out
	 * of the published pinned ID.
	 */
	if ((id = txn_global->checkpoint_state.id) != WT_TXN_NONE) {
		txn->snapshot[n++] = id;
		txn_state->metadata_pinned = id;
	}

	if (prev_oldest_id == current_id) {
		/* Pure read-only workload: no need to scan. */
		txn_state->pinned_id = current_id;
		/* Check that the oldest ID has not moved in the meantime. */
		WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id);
	} else {
		/* Walk the array of concurrent transactions. */
		WT_ORDERED_READ(session_cnt, conn->session_cnt);
		for (i = 0; i < session_cnt; i++) {
			s = &txn_global->states[i];

			/*
			 * Collect every concurrent transaction ID, except:
			 * - our own: we always read our own updates;
			 * - slots that have no ID;
			 * - IDs older than the oldest ID we saw. That can
			 *   happen in a race with a thread allocating an ID;
			 *   the ID will not be used because that thread keeps
			 *   spinning until it gets a valid one.
			 */
			if (s == txn_state)
				continue;
			if ((id = s->id) == WT_TXN_NONE)
				continue;
			if (!WT_TXNID_LE(prev_oldest_id, id))
				continue;

			txn->snapshot[n++] = id;
			if (WT_TXNID_LT(id, pinned_id))
				pinned_id = id;
		}

		/*
		 * A new snapshot was built: publish the pinned ID for this
		 * session.
		 */
		WT_ASSERT(session, WT_TXNID_LE(prev_oldest_id, pinned_id));
		WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id);
		txn_state->pinned_id = pinned_id;
	}

	__wt_readunlock(session, &txn_global->rwlock);
	__txn_sort_snapshot(session, n, current_id);
}
/*
 * __txn_rollback_to_stable_btree --
 *	Per-handle rollback-to-stable worker: either skip the tree or walk it
 *	to roll back updates newer than the stable timestamp.
 *
 *	"cfg" is unused. Returns EINVAL for trees that are not row-store,
 *	otherwise 0 or the error from taking exclusive eviction access or
 *	from the tree walk.
 */
static int
__txn_rollback_to_stable_btree(
    WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_DECL_RET;
	WT_DECL_TIMESTAMP(rollback_timestamp)
	WT_BTREE *btree;
	WT_TXN_GLOBAL *txn_global;

	WT_UNUSED(cfg);

	txn_global = &S2C(session)->txn_global;
	btree = S2BT(session);

	/*
	 * Commits in immediately durable files are never wiped. This mostly
	 * exists for the semantics MongoDB requires of its oplog: updates
	 * made to the oplog must not be aborted. Rolling back updates in any
	 * table whose records are logged would also be unsafe, because
	 * recovery after a crash would bring those updates back and leave
	 * things inconsistent.
	 */
	if (__wt_btree_immediately_durable(session)) {
		/*
		 * Record the btree ID in the bitstring so lookaside entries
		 * for this btree can be excluded.
		 */
		__bit_set(
		    S2C(session)->stable_rollback_bitstring, btree->id);
		return (0);
	}

	/* Checkpoint handles and empty trees never need any work. */
	if (session->dhandle->checkpoint != NULL || btree->root.page == NULL)
		return (0);

	if (btree->type != BTREE_ROW)
		WT_RET_MSG(session, EINVAL, "rollback_to_stable "
		    "is only supported for row store btrees");

	/*
	 * Take a private copy of the stable timestamp so it doesn't have to
	 * be locked on every access. The stable timestamp isn't supposed to
	 * change during rollback, but reading it unlocked would still break
	 * the locking protocol.
	 */
	__wt_readlock(session, &txn_global->rwlock);
	__wt_timestamp_set(&rollback_timestamp, &txn_global->stable_timestamp);
	__wt_readunlock(session, &txn_global->rwlock);

	/*
	 * Get the eviction server out of the file so it can't interfere.
	 * This shouldn't strictly be needed, but it makes the possible tree
	 * states easier to reason about.
	 */
	WT_RET(__wt_evict_file_exclusive_on(session));
	ret = __txn_rollback_to_stable_btree_walk(
	    session, &rollback_timestamp);
	__wt_evict_file_exclusive_off(session);

	return (ret);
}
/*
 * __txn_rollback_to_stable_lookaside_fixup --
 *	Walk the lookaside file and remove every entry whose timestamp is
 *	newer than the stable timestamp, skipping files marked in the
 *	connection's stable-rollback bitstring.
 *
 *	Returns 0 when the walk reaches the end of the file, otherwise the
 *	first error from the cursor operations or from closing the cursor.
 */
static int
__txn_rollback_to_stable_lookaside_fixup(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_DECL_TIMESTAMP(rollback_timestamp)
	WT_ITEM las_addr, las_key, las_timestamp;
	WT_TXN_GLOBAL *txn_global;
	uint64_t las_counter, las_txnid, remove_cnt;
	uint32_t las_id, session_flags;

	conn = S2C(session);
	txn_global = &S2C(session)->txn_global;
	cursor = NULL;
	remove_cnt = 0;
	session_flags = 0;		/* [-Werror=maybe-uninitialized] */
	WT_CLEAR(las_timestamp);

	/*
	 * Take a private copy of the stable timestamp so it doesn't have to
	 * be locked on every access. The stable timestamp isn't supposed to
	 * change during rollback, but reading it unlocked would still break
	 * the locking protocol.
	 */
	__wt_readlock(session, &txn_global->rwlock);
	__wt_timestamp_set(&rollback_timestamp, &txn_global->stable_timestamp);
	__wt_readunlock(session, &txn_global->rwlock);

	__wt_las_cursor(session, &cursor, &session_flags);

	/* Pages read here should be discarded as soon as we're done. */
	F_SET(session, WT_SESSION_NO_CACHE);

	/* Walk the file. */
	while ((ret = cursor->next(cursor)) == 0) {
		WT_ERR(cursor->get_key(cursor, &las_id, &las_addr,
		    &las_counter, &las_txnid, &las_timestamp, &las_key));

		/* Durable tables are identified by file ID and skipped. */
		if (__bit_test(conn->stable_rollback_bitstring, las_id))
			continue;

		/*
		 * An entry without a timestamp carries a timestamp of zero;
		 * it never compares newer than the stable timestamp and so
		 * is never removed.
		 */
		if (__wt_timestamp_cmp(
		    &rollback_timestamp, las_timestamp.data) >= 0)
			continue;

		WT_ERR(cursor->remove(cursor));
		++remove_cnt;
	}
	WT_ERR_NOTFOUND_OK(ret);

err:	WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));

	/*
	 * Racing removals can make us over-count. Underflow wouldn't be
	 * fatal, but guard against it so the record count doesn't skew low
	 * over time.
	 */
	if (remove_cnt > conn->las_record_cnt)
		conn->las_record_cnt = 0;
	else if (remove_cnt > 0)
		(void)__wt_atomic_sub64(&conn->las_record_cnt, remove_cnt);

	F_CLR(session, WT_SESSION_NO_CACHE);

	return (ret);
}
/* * __log_file_server -- * The log file server thread. This worker thread manages * log file operations such as closing and syncing. */ static WT_THREAD_RET __log_file_server(void *arg) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_FH *close_fh; WT_LOG *log; WT_LSN close_end_lsn, min_lsn; WT_SESSION_IMPL *session; uint32_t filenum; bool locked; session = arg; conn = S2C(session); log = conn->log; locked = false; while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) { /* * If there is a log file to close, make sure any outstanding * write operations have completed, then fsync and close it. */ if ((close_fh = log->log_close_fh) != NULL) { WT_ERR(__wt_log_extract_lognum(session, close_fh->name, &filenum)); /* * We update the close file handle before updating the * close LSN when changing files. It is possible we * could see mismatched settings. If we do, yield * until it is set. This should rarely happen. */ while (log->log_close_lsn.l.file < filenum) __wt_yield(); if (__wt_log_cmp( &log->write_lsn, &log->log_close_lsn) >= 0) { /* * We've copied the file handle, clear out the * one in the log structure to allow it to be * set again. Copy the LSN before clearing * the file handle. * Use a barrier to make sure the compiler does * not reorder the following two statements. */ close_end_lsn = log->log_close_lsn; WT_FULL_BARRIER(); log->log_close_fh = NULL; /* * Set the close_end_lsn to the LSN immediately * after ours. That is, the beginning of the * next log file. We need to know the LSN * file number of our own close in case earlier * calls are still in progress and the next one * to move the sync_lsn into the next file for * later syncs. */ WT_ERR(__wt_fsync(session, close_fh, true)); /* * We want to have the file size reflect actual * data with minimal pre-allocated zeroed space. * We can't truncate the file during hot backup, * or the underlying file system may not support * truncate: both are OK, it's just more work * during cursor traversal. 
*/ if (!conn->hot_backup) { __wt_readlock( session, conn->hot_backup_lock); if (!conn->hot_backup) WT_ERR_ERROR_OK( __wt_ftruncate(session, close_fh, close_end_lsn.l.offset), ENOTSUP); __wt_readunlock( session, conn->hot_backup_lock); } WT_SET_LSN(&close_end_lsn, close_end_lsn.l.file + 1, 0); __wt_spin_lock(session, &log->log_sync_lock); locked = true; WT_ERR(__wt_close(session, &close_fh)); WT_ASSERT(session, __wt_log_cmp( &close_end_lsn, &log->sync_lsn) >= 0); log->sync_lsn = close_end_lsn; __wt_cond_signal(session, log->log_sync_cond); locked = false; __wt_spin_unlock(session, &log->log_sync_lock); } } /* * If a later thread asked for a background sync, do it now. */ if (__wt_log_cmp(&log->bg_sync_lsn, &log->sync_lsn) > 0) { /* * Save the latest write LSN which is the minimum * we will have written to disk. */ min_lsn = log->write_lsn; /* * We have to wait until the LSN we asked for is * written. If it isn't signal the wrlsn thread * to get it written. * * We also have to wait for the written LSN and the * sync LSN to be in the same file so that we know we * have synchronized all earlier log files. */ if (__wt_log_cmp(&log->bg_sync_lsn, &min_lsn) <= 0) { /* * If the sync file is behind either the one * wanted for a background sync or the write LSN * has moved to another file continue to let * this worker thread process that older file * immediately. */ if ((log->sync_lsn.l.file < log->bg_sync_lsn.l.file) || (log->sync_lsn.l.file < min_lsn.l.file)) continue; WT_ERR(__wt_fsync(session, log->log_fh, true)); __wt_spin_lock(session, &log->log_sync_lock); locked = true; /* * The sync LSN could have advanced while we * were writing to disk. 
*/ if (__wt_log_cmp( &log->sync_lsn, &min_lsn) <= 0) { WT_ASSERT(session, min_lsn.l.file == log->sync_lsn.l.file); log->sync_lsn = min_lsn; __wt_cond_signal( session, log->log_sync_cond); } locked = false; __wt_spin_unlock(session, &log->log_sync_lock); } else { __wt_cond_auto_signal( session, conn->log_wrlsn_cond); /* * We do not want to wait potentially a second * to process this. Yield to give the wrlsn * thread a chance to run and try again in * this case. */ __wt_yield(); continue; } } /* Wait until the next event. */ __wt_cond_wait(session, conn->log_file_cond, WT_MILLION / 10); } if (0) { err: __wt_err(session, ret, "log close server error"); } if (locked) __wt_spin_unlock(session, &log->log_sync_lock); return (WT_THREAD_RET_VALUE); }
/*
 * __wt_txn_commit --
 *	Commit the current transaction.
 *
 *	"cfg" supplies the optional "commit_timestamp" and "sync" settings.
 *	Any failure before the updates are processed jumps to the error path,
 *	which rolls the transaction back and returns the error. Once update
 *	processing starts nothing can fail, and 0 is returned.
 */
int
__wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_CONFIG_ITEM cval;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_TXN *txn;
	WT_TXN_GLOBAL *txn_global;
	WT_TXN_OP *op;
	u_int i;
	bool locked, readonly;
#ifdef HAVE_TIMESTAMPS
	wt_timestamp_t prev_commit_timestamp, ts;
	bool update_timestamp;
#endif

	txn = &session->txn;
	conn = S2C(session);
	txn_global = &conn->txn_global;
	/* "locked" tracks whether the visibility read lock is held. */
	locked = false;

	WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING));
	WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR) ||
	    txn->mod_count == 0);

	/* Remember this now: mod_count is reset to zero further down. */
	readonly = txn->mod_count == 0;
	/*
	 * Look for a commit timestamp.
	 */
	WT_ERR(
	    __wt_config_gets_def(session, cfg, "commit_timestamp", 0, &cval));
	if (cval.len != 0) {
#ifdef HAVE_TIMESTAMPS
		WT_ERR(__wt_txn_parse_timestamp(session, "commit", &ts, &cval));
		WT_ERR(__wt_timestamp_validate(session,
		    "commit", &ts, &cval, true, true, true));
		__wt_timestamp_set(&txn->commit_timestamp, &ts);
		__wt_txn_set_commit_timestamp(session);
#else
		WT_ERR_MSG(session, EINVAL, "commit_timestamp requires a "
		    "version of WiredTiger built with timestamp support");
#endif
	}

#ifdef HAVE_TIMESTAMPS
	/*
	 * Debugging checks on timestamps, if user requested them. Both checks
	 * only apply to transactions that made modifications.
	 */
	if (F_ISSET(txn, WT_TXN_TS_COMMIT_ALWAYS) &&
	    !F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) &&
	    txn->mod_count != 0)
		WT_ERR_MSG(session, EINVAL, "commit_timestamp required and "
		    "none set on this transaction");
	if (F_ISSET(txn, WT_TXN_TS_COMMIT_NEVER) &&
	    F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) &&
	    txn->mod_count != 0)
		WT_ERR_MSG(session, EINVAL, "no commit_timestamp required and "
		    "timestamp set on this transaction");
#endif

	/*
	 * The default sync setting is inherited from the connection, but can
	 * be overridden by an explicit "sync" setting for this transaction.
	 */
	WT_ERR(__wt_config_gets_def(session, cfg, "sync", 0, &cval));

	/*
	 * If the user chose the default setting, check whether sync is enabled
	 * for this transaction (either inherited or via begin_transaction).
	 * If sync is disabled, clear the field to avoid the log write being
	 * flushed.
	 *
	 * Otherwise check for specific settings.  We don't need to check for
	 * "on" because that is the default inherited from the connection.  If
	 * the user set anything in begin_transaction, we only override with an
	 * explicit setting.
	 */
	if (cval.len == 0) {
		if (!FLD_ISSET(txn->txn_logsync, WT_LOG_SYNC_ENABLED) &&
		    !F_ISSET(txn, WT_TXN_SYNC_SET))
			txn->txn_logsync = 0;
	} else {
		/*
		 * If the caller already set sync on begin_transaction then
		 * they should not be using sync on commit_transaction.
		 * Flag that as an error.
		 */
		if (F_ISSET(txn, WT_TXN_SYNC_SET))
			WT_ERR_MSG(session, EINVAL,
			    "Sync already set during begin_transaction");
		if (WT_STRING_MATCH("background", cval.str, cval.len))
			txn->txn_logsync = WT_LOG_BACKGROUND;
		else if (WT_STRING_MATCH("off", cval.str, cval.len))
			txn->txn_logsync = 0;
		/*
		 * We don't need to check for "on" here because that is the
		 * default to inherit from the connection setting.
		 */
	}

	/* Commit notification. */
	if (txn->notify != NULL)
		WT_ERR(txn->notify->notify(txn->notify,
		    (WT_SESSION *)session, txn->id, 1));

	/*
	 * We are about to release the snapshot: copy values into any
	 * positioned cursors so they don't point to updates that could be
	 * freed once we don't have a snapshot.
	 */
	if (session->ncursors > 0) {
		WT_DIAGNOSTIC_YIELD;
		WT_ERR(__wt_session_copy_values(session));
	}

	/* If we are logging, write a commit log record. */
	if (txn->logrec != NULL &&
	    FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) &&
	    !F_ISSET(session, WT_SESSION_NO_LOGGING)) {
		/*
		 * We are about to block on I/O writing the log.
		 * Release our snapshot in case it is keeping data pinned.
		 * This is particularly important for checkpoints.
		 */
		__wt_txn_release_snapshot(session);
		/*
		 * We hold the visibility lock for reading from the time
		 * we write our log record until the time we release our
		 * transaction so that the LSN any checkpoint gets will
		 * always reflect visible data.
		 */
		__wt_readlock(session, &txn_global->visibility_rwlock);
		locked = true;
		WT_ERR(__wt_txn_log_commit(session, cfg));
	}

	/* Note: we're going to commit: nothing can fail after this point. */

	/* Process and free updates. */
	for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++) {
		switch (op->type) {
		case WT_TXN_OP_BASIC:
		case WT_TXN_OP_BASIC_TS:
		case WT_TXN_OP_INMEM:
			/*
			 * Switch reserved operations to abort to
			 * simplify obsolete update list truncation.
			 */
			if (op->u.upd->type == WT_UPDATE_RESERVED) {
				op->u.upd->txnid = WT_TXN_ABORTED;
				break;
			}

			/*
			 * Writes to the lookaside file can be evicted as soon
			 * as they commit.
			 */
			if (conn->cache->las_fileid != 0 &&
			    op->fileid == conn->cache->las_fileid) {
				op->u.upd->txnid = WT_TXN_NONE;
				break;
			}

#ifdef HAVE_TIMESTAMPS
			/*
			 * Stamp the update with the commit timestamp;
			 * WT_TXN_OP_BASIC_TS operations are left alone.
			 */
			if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) &&
			    op->type != WT_TXN_OP_BASIC_TS) {
				WT_ASSERT(session,
				    op->fileid != WT_METAFILE_ID);
				__wt_timestamp_set(&op->u.upd->timestamp,
				    &txn->commit_timestamp);
			}
#endif
			break;

		case WT_TXN_OP_REF:
#ifdef HAVE_TIMESTAMPS
			if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT))
				__wt_timestamp_set(
				    &op->u.ref->page_del->timestamp,
				    &txn->commit_timestamp);
#endif
			break;

		case WT_TXN_OP_TRUNCATE_COL:
		case WT_TXN_OP_TRUNCATE_ROW:
			/* Other operations don't need timestamps. */
			break;
		}

		__wt_txn_op_free(session, op);
	}
	txn->mod_count = 0;

#ifdef HAVE_TIMESTAMPS
	/*
	 * Track the largest commit timestamp we have seen.
	 *
	 * We don't actually clear the local commit timestamp, just the flag.
	 * That said, we can't update the global commit timestamp until this
	 * transaction is visible, which happens when we release it.
	 */
	update_timestamp = F_ISSET(txn, WT_TXN_HAS_TS_COMMIT);
#endif

	__wt_txn_release(session);
	if (locked)
		__wt_readunlock(session, &txn_global->visibility_rwlock);

#ifdef HAVE_TIMESTAMPS
	/* First check if we've already committed something in the future. */
	if (update_timestamp) {
		WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock,
		    __wt_timestamp_set(
			&prev_commit_timestamp, &txn_global->commit_timestamp));
		update_timestamp = __wt_timestamp_cmp(
		    &txn->commit_timestamp, &prev_commit_timestamp) > 0;
	}

	/*
	 * If it looks like we need to move the global commit timestamp,
	 * re-check and update it: with 8-byte timestamps by retrying a
	 * compare-and-swap, otherwise under the global write lock.
	 */
	if (update_timestamp) {
#if WT_TIMESTAMP_SIZE == 8
		/*
		 * Retry until the swap succeeds or another thread has
		 * published a timestamp at least as large as ours.
		 */
		while (__wt_timestamp_cmp(
		    &txn->commit_timestamp, &prev_commit_timestamp) > 0) {
			if (__wt_atomic_cas64(
			    &txn_global->commit_timestamp.val,
			    prev_commit_timestamp.val,
			    txn->commit_timestamp.val)) {
				txn_global->has_commit_timestamp = true;
				break;
			}
			__wt_timestamp_set(
			    &prev_commit_timestamp,
			    &txn_global->commit_timestamp);
		}
#else
		__wt_writelock(session, &txn_global->rwlock);
		if (__wt_timestamp_cmp(&txn->commit_timestamp,
		    &txn_global->commit_timestamp) > 0) {
			__wt_timestamp_set(&txn_global->commit_timestamp,
			    &txn->commit_timestamp);
			txn_global->has_commit_timestamp = true;
		}
		__wt_writeunlock(session, &txn_global->rwlock);
#endif
	}
#endif

	/*
	 * We're between transactions, if we need to block for eviction, it's
	 * a good time to do so.  Note that we must ignore any error return
	 * because the user's data is committed.
	 */
	if (!readonly)
		(void)__wt_cache_eviction_check(session, false, false, NULL);
	return (0);

err:	/*
	 * If anything went wrong, roll back.
	 *
	 * !!!
	 * Nothing can fail after this point.
	 */
	if (locked)
		__wt_readunlock(session, &txn_global->visibility_rwlock);
	WT_TRET(__wt_txn_rollback(session, cfg));
	return (ret);
}
/*
 * __wt_txn_config --
 *	Configure a transaction.
 *
 *	Applies the "isolation", "sync", "snapshot" and "read_timestamp"
 *	(together with "round_to_oldest") settings found in cfg to the
 *	session's transaction.
 *
 *	Returns 0 on success. Errors from the configuration lookups, the
 *	named-snapshot lookup and the timestamp parsing/formatting helpers are
 *	passed through; EINVAL is returned when the read timestamp is older
 *	than the oldest timestamp and rounding was not requested, or when
 *	read_timestamp is used in a build without timestamp support.
 */
int
__wt_txn_config(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_CONFIG_ITEM cval;
	WT_TXN *txn;

	txn = &session->txn;

	/* Anything other than the two named levels means read-uncommitted. */
	WT_RET(__wt_config_gets_def(session, cfg, "isolation", 0, &cval));
	if (cval.len != 0)
		txn->isolation =
		    WT_STRING_MATCH("snapshot", cval.str, cval.len) ?
		    WT_ISO_SNAPSHOT :
		    WT_STRING_MATCH("read-committed", cval.str, cval.len) ?
		    WT_ISO_READ_COMMITTED : WT_ISO_READ_UNCOMMITTED;

	/*
	 * The default sync setting is inherited from the connection, but can
	 * be overridden by an explicit "sync" setting for this transaction.
	 *
	 * We want to distinguish between inheriting implicitly and explicitly.
	 */
	F_CLR(txn, WT_TXN_SYNC_SET);
	WT_RET(__wt_config_gets_def(
	    session, cfg, "sync", (int)UINT_MAX, &cval));
	if (cval.val == 0 || cval.val == 1)
		/*
		 * This is an explicit setting of sync.  Set the flag so
		 * that we know not to overwrite it in commit_transaction.
		 */
		F_SET(txn, WT_TXN_SYNC_SET);

	/*
	 * If sync is turned off explicitly, clear the transaction's sync field.
	 */
	if (cval.val == 0)
		txn->txn_logsync = 0;

	WT_RET(__wt_config_gets_def(session, cfg, "snapshot", 0, &cval));
	if (cval.len > 0)
		/*
		 * The layering here isn't ideal - the named snapshot get
		 * function does both validation and setup.  Otherwise we'd
		 * need to walk the list of named snapshots twice during
		 * transaction open.
		 */
		WT_RET(__wt_txn_named_snapshot_get(session, &cval));

	WT_RET(__wt_config_gets_def(session, cfg, "read_timestamp", 0, &cval));
	if (cval.len > 0) {
#ifdef HAVE_TIMESTAMPS
		WT_DECL_RET;
		wt_timestamp_t ts;
		WT_TXN_GLOBAL *txn_global;
		char timestamp_buf[2 * WT_TIMESTAMP_SIZE + 1];
		bool round_to_oldest;

		txn_global = &S2C(session)->txn_global;
		WT_RET(__wt_txn_parse_timestamp(session, "read", &ts, &cval));

		/*
		 * Read the configuration here to reduce the span of the
		 * critical section.
		 */
		WT_RET(__wt_config_gets_def(session,
		    cfg, "round_to_oldest", 0, &cval));
		round_to_oldest = cval.val != 0;

		/*
		 * This code is not using the timestamp validate function to
		 * avoid a race between checking and setting transaction
		 * timestamp.
		 */
		__wt_readlock(session, &txn_global->rwlock);
		if (__wt_timestamp_cmp(
		    &ts, &txn_global->oldest_timestamp) < 0) {
			/*
			 * The read lock is held here, so a failure must not
			 * return directly: release the lock first.
			 */
			ret = __wt_timestamp_to_hex_string(
			    session, timestamp_buf, &ts);
			if (ret != 0) {
				__wt_readunlock(session, &txn_global->rwlock);
				return (ret);
			}

			/*
			 * If given read timestamp is earlier than oldest
			 * timestamp then round the read timestamp to
			 * oldest timestamp.
			 */
			if (round_to_oldest)
				__wt_timestamp_set(&txn->read_timestamp,
				    &txn_global->oldest_timestamp);
			else {
				__wt_readunlock(session, &txn_global->rwlock);
				WT_RET_MSG(session, EINVAL, "read timestamp "
				    "%s older than oldest timestamp",
				    timestamp_buf);
			}
		} else {
			__wt_timestamp_set(&txn->read_timestamp, &ts);
			/*
			 * Reset to avoid a verbose message as read
			 * timestamp is not rounded to oldest timestamp.
			 */
			round_to_oldest = false;
		}

		__wt_txn_set_read_timestamp(session);
		__wt_readunlock(session, &txn_global->rwlock);

		/* A read timestamp forces snapshot isolation. */
		txn->isolation = WT_ISO_SNAPSHOT;
		if (round_to_oldest) {
			/*
			 * This message is generated here to reduce the span of
			 * critical section.
			 */
			__wt_verbose(session, WT_VERB_TIMESTAMP, "Read "
			    "timestamp %s : Rounded to oldest timestamp",
			    timestamp_buf);
		}
#else
		WT_RET_MSG(session, EINVAL, "read_timestamp requires a "
		    "version of WiredTiger built with timestamp support");
#endif
	}

	return (0);
}
/*
 * __wt_curlog_open --
 *	Create and initialize a cursor over the log.
 *
 *	Allocates the cursor and its LSN/scratch buffers, registers it through
 *	__wt_cursor_init (which receives cursorp) and, when the connection has
 *	a log, forces out buffered log records and takes the archive lock for
 *	reading.  On any failure after the cursor has been allocated, the
 *	cursor is closed, *cursorp is set to NULL and the error is returned.
 */
int
__wt_curlog_open(WT_SESSION_IMPL *session,
    const char *uri, const char *cfg[], WT_CURSOR **cursorp)
{
	WT_CONNECTION_IMPL *conn;
	WT_CURSOR_STATIC_INIT(iface,
	    __wt_cursor_get_key,		/* get-key */
	    __wt_cursor_get_value,		/* get-value */
	    __wt_cursor_set_key,		/* set-key */
	    __wt_cursor_set_value,		/* set-value */
	    __curlog_compare,			/* compare */
	    __wt_cursor_equals,			/* equals */
	    __curlog_next,			/* next */
	    __wt_cursor_notsup,			/* prev */
	    __curlog_reset,			/* reset */
	    __curlog_search,			/* search */
	    __wt_cursor_search_near_notsup,	/* search-near */
	    __wt_cursor_notsup,			/* insert */
	    __wt_cursor_modify_notsup,		/* modify */
	    __wt_cursor_notsup,			/* update */
	    __wt_cursor_notsup,			/* remove */
	    __wt_cursor_notsup,			/* reserve */
	    __wt_cursor_reconfigure_notsup,	/* reconfigure */
	    __wt_cursor_notsup,			/* cache */
	    __wt_cursor_reopen_notsup,		/* reopen */
	    __curlog_close);			/* close */
	WT_CURSOR *cursor;
	WT_CURSOR_LOG *logc;
	WT_DECL_RET;
	WT_LOG *log;

	/* The generic cursor must be the first member of the log cursor. */
	WT_STATIC_ASSERT(offsetof(WT_CURSOR_LOG, iface) == 0);

	conn = S2C(session);
	log = conn->log;

	/* Build the cursor shell from the static method table. */
	WT_RET(__wt_calloc_one(session, &logc));
	cursor = (WT_CURSOR *)logc;
	*cursor = iface;
	cursor->session = (WT_SESSION *)session;
	cursor->key_format = WT_LOGC_KEY_FORMAT;
	cursor->value_format = WT_LOGC_VALUE_FORMAT;

	/* Per-cursor LSNs and scratch buffers. */
	WT_ERR(__wt_calloc_one(session, &logc->cur_lsn));
	WT_ERR(__wt_calloc_one(session, &logc->next_lsn));
	WT_ERR(__wt_scr_alloc(session, 0, &logc->logrec));
	WT_ERR(__wt_scr_alloc(session, 0, &logc->opkey));
	WT_ERR(__wt_scr_alloc(session, 0, &logc->opvalue));
	WT_INIT_LSN(logc->cur_lsn);
	WT_INIT_LSN(logc->next_lsn);

	WT_ERR(__wt_cursor_init(cursor, uri, NULL, cfg, cursorp));

	if (log != NULL) {
		/*
		 * A caller may want to read back a record it has only just
		 * written; records can be buffered, so push them out first.
		 */
		WT_ERR(__wt_log_force_write(session, 1, NULL));

		/* An open log cursor holds off archiving. */
		__wt_readlock(session, &log->log_archive_lock);
		F_SET(logc, WT_CURLOG_ARCHIVE_LOCK);
		(void)__wt_atomic_add32(&conn->log_cursors, 1);
	}

	return (ret);

err:	/* Tear down whatever was built and report no cursor to the caller. */
	WT_TRET(__curlog_close(cursor));
	*cursorp = NULL;
	return (ret);
}
/*
 * __log_archive_once --
 *	Perform one iteration of log archiving.  Must be called with the
 *	log archive lock held.
 *
 *	backup_file is the last full log file copied by a backup, or 0 when
 *	not called on behalf of a backup cursor.  Log files numbered below
 *	the computed minimum are removed, unless a hot backup is in progress
 *	and we are not the backup.  On success the log's first LSN is moved
 *	to the start of the minimum log file.
 *
 *	Returns 0 on success or the first error encountered; the hot backup
 *	lock and the directory listing are always released before returning.
 */
static int
__log_archive_once(WT_SESSION_IMPL *session, uint32_t backup_file)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_LOG *log;
	uint32_t lognum, min_lognum;
	u_int i, locked, logcount;
	char **logfiles;

	conn = S2C(session);
	log = conn->log;
	locked = 0;
	logcount = 0;
	logfiles = NULL;

	/*
	 * If we're coming from a backup cursor we want the smaller of
	 * the last full log file copied in backup or the checkpoint LSN.
	 * Otherwise we want the minimum of the last log file written to
	 * disk and the checkpoint LSN.
	 */
	if (backup_file != 0)
		min_lognum = WT_MIN(log->ckpt_lsn.file, backup_file);
	else
		min_lognum = WT_MIN(log->ckpt_lsn.file, log->sync_lsn.file);
	WT_RET(__wt_verbose(session, WT_VERB_LOG,
	    "log_archive: archive to log number %" PRIu32, min_lognum));

	/*
	 * Main archive code.  Get the list of all log files and
	 * remove any earlier than the minimum log number.
	 */
	WT_RET(__wt_dirlist(session, conn->log_path,
	    WT_LOG_FILENAME, WT_DIRLIST_INCLUDE, &logfiles, &logcount));

	/*
	 * We can only archive files if a hot backup is not in progress or
	 * if we are the backup.
	 *
	 * The file list is already allocated at this point, so a failure to
	 * get the lock has to go through the cleanup path rather than return
	 * directly, otherwise the list is leaked.
	 */
	WT_ERR(__wt_readlock(session, conn->hot_backup_lock));
	locked = 1;
	if (conn->hot_backup == 0 || backup_file != 0) {
		for (i = 0; i < logcount; i++) {
			WT_ERR(__wt_log_extract_lognum(
			    session, logfiles[i], &lognum));
			if (lognum < min_lognum)
				WT_ERR(__wt_log_remove(
				    session, WT_LOG_FILENAME, lognum));
		}
	}
	WT_ERR(__wt_readunlock(session, conn->hot_backup_lock));
	locked = 0;
	__wt_log_files_free(session, logfiles, logcount);
	logfiles = NULL;
	logcount = 0;

	/*
	 * Indicate what is our new earliest LSN.  It is the start
	 * of the log file containing the last checkpoint.
	 */
	log->first_lsn.file = min_lognum;
	log->first_lsn.offset = 0;

	/* The error message is only reached by jumping to the label. */
	if (0)
err:		__wt_err(session, ret, "log archive server error");
	if (locked)
		WT_TRET(__wt_readunlock(session, conn->hot_backup_lock));
	if (logfiles != NULL)
		__wt_log_files_free(session, logfiles, logcount);
	return (ret);
}
/*
 * __wt_lsm_stat_init --
 *	Initialize a LSM statistics structure.
 *
 *	Fills cst's data-source statistics (allocating them on first use) by
 *	starting from the LSM tree's own statistics and then adding in the
 *	statistics of every chunk and, where present, selected cache
 *	statistics of the chunk's bloom filter.  If WT_STATISTICS_CLEAR is
 *	set in flags, the tree's statistics are cleared after being copied.
 *
 *	Returns 0 on success or the first error encountered.  The tree lock,
 *	the scratch buffer and any statistics cursor still open are released
 *	on every exit path.
 */
int
__wt_lsm_stat_init(WT_SESSION_IMPL *session,
    WT_LSM_TREE *lsm_tree, WT_CURSOR_STAT *cst, uint32_t flags)
{
	WT_CURSOR *stat_cursor;
	WT_DECL_ITEM(uribuf);
	WT_DECL_RET;
	WT_DSRC_STATS *stats;
	WT_LSM_CHUNK *chunk;
	const char *cfg[] = API_CONF_DEFAULTS(
	    session, open_cursor, "statistics_fast=on");
	const char *disk_cfg[] = API_CONF_DEFAULTS(session, open_cursor,
	    "checkpoint=WiredTigerCheckpoint,statistics_fast=on");
	const char *desc, *pvalue;
	uint64_t value;
	u_int i;
	int locked, stat_key;

	WT_UNUSED(flags);
	locked = 0;
	/* Non-NULL only while a statistics cursor is open. */
	stat_cursor = NULL;
	WT_ERR(__wt_scr_alloc(session, 0, &uribuf));

	/* Clear the statistics we are about to recalculate. */
	if (cst->stats != NULL)
		stats = (WT_DSRC_STATS *)cst->stats;
	else {
		WT_ERR(__wt_calloc_def(session, 1, &stats));
		__wt_stat_init_dsrc_stats(stats);
		cst->stats_first = cst->stats = (WT_STATS *)stats;
		cst->stats_count = sizeof(*stats) / sizeof(WT_STATS);
	}
	*stats = lsm_tree->stats;

	if (LF_ISSET(WT_STATISTICS_CLEAR))
		__wt_stat_clear_dsrc_stats(&lsm_tree->stats);

	/* Hold the LSM lock so that we can safely walk through the chunks. */
	WT_ERR(__wt_readlock(session, lsm_tree->rwlock));
	locked = 1;

	/* Set the stats for this run. */
	WT_STAT_SET(stats, lsm_chunk_count, lsm_tree->nchunks);
	for (i = 0; i < lsm_tree->nchunks; i++) {
		chunk = lsm_tree->chunk[i];
		if (chunk->generation >
		    (uint32_t)WT_STAT(stats, lsm_generation_max))
			WT_STAT_SET(stats,
			    lsm_generation_max, chunk->generation);

		/*
		 * LSM chunk reads happen from a checkpoint, so get the
		 * statistics for a checkpoint if one exists.
		 */
		WT_ERR(__wt_buf_fmt(
		    session, uribuf, "statistics:%s", chunk->uri));
		ret = __wt_curstat_open(session, uribuf->data,
		    F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) ? disk_cfg : cfg,
		    &stat_cursor);
		/*
		 * XXX kludge: we may have an empty chunk where no checkpoint
		 * was written.  If so, try to open the ordinary handle on that
		 * chunk instead.
		 */
		if (ret == WT_NOTFOUND && F_ISSET(chunk, WT_LSM_CHUNK_ONDISK))
			ret = __wt_curstat_open(
			    session, uribuf->data, cfg, &stat_cursor);
		/* A failed open must not leave a handle for cleanup. */
		if (ret != 0)
			stat_cursor = NULL;
		WT_ERR(ret);

		/* Add every statistic of the chunk into the totals. */
		while ((ret = stat_cursor->next(stat_cursor)) == 0) {
			WT_ERR(stat_cursor->get_key(stat_cursor, &stat_key));
			WT_ERR(stat_cursor->get_value(
			    stat_cursor, &desc, &pvalue, &value));
			WT_STAT_INCRKV(stats, stat_key, value);
		}
		WT_ERR_NOTFOUND_OK(ret);

		/*
		 * Forget the handle before checking the result so the error
		 * path never closes the same cursor a second time.
		 */
		ret = stat_cursor->close(stat_cursor);
		stat_cursor = NULL;
		WT_ERR(ret);

		if (!F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
			continue;

		/* Top up with the bloom filter statistics. */
		WT_STAT_INCR(stats, bloom_count);
		WT_STAT_INCRV(stats, bloom_size,
		    (chunk->count * lsm_tree->bloom_bit_count) / 8);

		WT_ERR(__wt_buf_fmt(
		    session, uribuf, "statistics:%s", chunk->bloom_uri));
		ret = __wt_curstat_open(
		    session, uribuf->data, cfg, &stat_cursor);
		if (ret != 0)
			stat_cursor = NULL;
		WT_ERR(ret);

		stat_cursor->set_key(
		    stat_cursor, WT_STAT_DSRC_CACHE_EVICTION_CLEAN);
		WT_ERR(stat_cursor->search(stat_cursor));
		WT_ERR(stat_cursor->get_value(
		    stat_cursor, &desc, &pvalue, &value));
		WT_STAT_INCRV(stats, cache_eviction_clean, value);
		WT_STAT_INCRV(stats, bloom_page_evict, value);

		stat_cursor->set_key(
		    stat_cursor, WT_STAT_DSRC_CACHE_EVICTION_DIRTY);
		WT_ERR(stat_cursor->search(stat_cursor));
		WT_ERR(stat_cursor->get_value(
		    stat_cursor, &desc, &pvalue, &value));
		WT_STAT_INCRV(stats, cache_eviction_dirty, value);
		WT_STAT_INCRV(stats, bloom_page_evict, value);

		stat_cursor->set_key(
		    stat_cursor, WT_STAT_DSRC_CACHE_EVICTION_FAIL);
		WT_ERR(stat_cursor->search(stat_cursor));
		WT_ERR(stat_cursor->get_value(
		    stat_cursor, &desc, &pvalue, &value));
		WT_STAT_INCRV(stats, cache_eviction_fail, value);

		stat_cursor->set_key(stat_cursor, WT_STAT_DSRC_CACHE_READ);
		WT_ERR(stat_cursor->search(stat_cursor));
		WT_ERR(stat_cursor->get_value(
		    stat_cursor, &desc, &pvalue, &value));
		WT_STAT_INCRV(stats, cache_read, value);
		WT_STAT_INCRV(stats, bloom_page_read, value);

		stat_cursor->set_key(stat_cursor, WT_STAT_DSRC_CACHE_WRITE);
		WT_ERR(stat_cursor->search(stat_cursor));
		WT_ERR(stat_cursor->get_value(
		    stat_cursor, &desc, &pvalue, &value));
		WT_STAT_INCRV(stats, cache_write, value);

		ret = stat_cursor->close(stat_cursor);
		stat_cursor = NULL;
		WT_ERR(ret);
	}

err:	/* Close a cursor left open by a failure part-way through a chunk. */
	if (stat_cursor != NULL)
		WT_TRET(stat_cursor->close(stat_cursor));
	if (locked)
		WT_TRET(__wt_rwunlock(session, lsm_tree->rwlock));
	__wt_scr_free(&uribuf);

	return (ret);
}