/*
 * __backup_stop --
 *    Stop a backup.
 */
static int
__backup_stop(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    int slot;

    conn = S2C(session);

    /*
     * Detach the name list from the connection under the hot-backup lock
     * before freeing it, so no reader can see a freed list.
     */
    __wt_writelock(session, &conn->hot_backup_lock);
    conn->hot_backup_list = NULL;
    __wt_writeunlock(session, &conn->hot_backup_lock);

    /* Free each held btree name, then the list itself. */
    if (cb->list != NULL) {
        for (slot = 0; cb->list[slot] != NULL; ++slot)
            __wt_free(session, cb->list[slot]);
        __wt_free(session, cb->list);
    }

    /* Remove any backup specific file. */
    WT_TRET(__wt_backup_file_remove(session));

    /* Checkpoint deletion can proceed, as can the next hot backup. */
    __wt_writelock(session, &conn->hot_backup_lock);
    conn->hot_backup = false;
    __wt_writeunlock(session, &conn->hot_backup_lock);

    return (ret);
}
/*
 * __wt_log_truncate_files --
 *    Truncate log files via archive once. Requires that the server is not
 *    currently running.
 */
int
__wt_log_truncate_files(
    WT_SESSION_IMPL *session, WT_CURSOR *cursor, const char *cfg[])
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_LOG *log;
    uint32_t backup_file;

    WT_UNUSED(cfg);
    conn = S2C(session);
    log = conn->log;

    /*
     * Refuse to archive manually while the archive server thread may be
     * archiving concurrently.
     */
    if (F_ISSET(conn, WT_CONN_SERVER_RUN) &&
        FLD_ISSET(conn->log_flags, WT_CONN_LOG_ARCHIVE))
        WT_RET_MSG(session, EINVAL,
            "Attempt to archive manually while a server is running");

    /* An open backup cursor limits how far archiving may proceed. */
    backup_file = 0;
    if (cursor != NULL)
        backup_file = WT_CURSOR_BACKUP_ID(cursor);
    WT_ASSERT(session, backup_file <= log->alloc_lsn.file);
    WT_RET(__wt_verbose(session, WT_VERB_LOG,
        "log_truncate_files: Archive once up to %" PRIu32, backup_file));

    /*
     * Run a single archive pass while holding the archive lock.  Combine
     * the unlock's result with WT_TRET so a failed unlock neither discards
     * the archive pass's error nor leads to unlocking twice (the previous
     * error path used WT_RET on the unlock, losing "ret").
     */
    WT_RET(__wt_writelock(session, log->log_archive_lock));
    ret = __log_archive_once(session, backup_file);
    WT_TRET(__wt_writeunlock(session, log->log_archive_lock));
    return (ret);
}
/*
 * __wt_conn_btree_get --
 *    Get an open btree file handle, otherwise open a new one.
 */
int
__wt_conn_btree_get(WT_SESSION_IMPL *session,
    const char *name, const char *ckpt, const char *cfg[], uint32_t flags)
{
    WT_DATA_HANDLE *dhandle;
    WT_DECL_RET;

    /*
     * If the caller already holds a reference to a handle, take the
     * handle-open lock on it directly; otherwise look the handle up
     * (which sets session->dhandle) while holding the connection's
     * data-handle list lock.
     */
    if (LF_ISSET(WT_DHANDLE_HAVE_REF))
        WT_RET(
            __conn_dhandle_open_lock(session, session->dhandle, flags));
    else {
        WT_WITH_DHANDLE_LOCK(session,
            ret = __conn_dhandle_get(session, name, ckpt, flags));
        WT_RET(ret);
    }
    dhandle = session->dhandle;

    /*
     * Open the underlying btree unless the caller only wanted the lock,
     * or the handle is already open and no special flags are requested.
     * On open failure, drop the exclusive state and release the handle's
     * write lock acquired above.
     */
    if (!LF_ISSET(WT_DHANDLE_LOCK_ONLY) &&
        (!F_ISSET(dhandle, WT_DHANDLE_OPEN) ||
        LF_ISSET(WT_BTREE_SPECIAL_FLAGS)))
        if ((ret = __conn_btree_open(session, cfg, flags)) != 0) {
            F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE);
            WT_TRET(__wt_writeunlock(session, dhandle->rwlock));
        }

    /*
     * On success, the handle's exclusive state must match what the caller
     * asked for.
     */
    WT_ASSERT(session, ret != 0 ||
        LF_ISSET(WT_DHANDLE_EXCLUSIVE) ==
        F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE));

    return (ret);
}
/*
 * __wt_log_truncate_files --
 *    Truncate log files via archive once. Requires that the server is not
 *    currently running.
 */
int
__wt_log_truncate_files(WT_SESSION_IMPL *session, WT_CURSOR *cursor, bool force)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_LOG *log;
    uint32_t max_file;

    conn = S2C(session);

    /* Nothing to do when logging is not enabled. */
    if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))
        return (0);

    /*
     * Unless forced, manual archiving is disallowed while the log server
     * may be archiving concurrently.
     */
    if (!force && F_ISSET(conn, WT_CONN_SERVER_LOG) &&
        FLD_ISSET(conn->log_flags, WT_CONN_LOG_ARCHIVE))
        WT_RET_MSG(session, EINVAL,
            "Attempt to archive manually while a server is running");

    log = conn->log;

    /*
     * An open backup cursor bounds how far archiving may proceed; a
     * backup cursor and the force flag are mutually exclusive.
     */
    max_file = 0;
    if (cursor != NULL) {
        WT_ASSERT(session, !force);
        max_file = WT_CURSOR_BACKUP_ID(cursor);
    }
    WT_ASSERT(session, max_file <= log->alloc_lsn.l.file);
    __wt_verbose(session, WT_VERB_LOG,
        "log_truncate_files: Archive once up to %" PRIu32, max_file);

    /* Run one archive pass while holding the archive lock. */
    __wt_writelock(session, &log->log_archive_lock);
    ret = __log_archive_once(session, max_file);
    __wt_writeunlock(session, &log->log_archive_lock);
    return (ret);
}
/* * __log_server -- * The log server thread. */ static WT_THREAD_RET __log_server(void *arg) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_LOG *log; WT_SESSION_IMPL *session; u_int locked; session = arg; conn = S2C(session); log = conn->log; locked = 0; while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) { /* * Perform log pre-allocation. */ if (conn->log_prealloc > 0) WT_ERR(__log_prealloc_once(session)); /* * Perform the archive. */ if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ARCHIVE)) { if (__wt_try_writelock( session, log->log_archive_lock) == 0) { locked = 1; WT_ERR(__log_archive_once(session, 0)); WT_ERR( __wt_writeunlock( session, log->log_archive_lock)); locked = 0; } else WT_ERR(__wt_verbose(session, WT_VERB_LOG, "log_archive: Blocked due to open log " "cursor holding archive lock")); } /* Wait until the next event. */ WT_ERR(__wt_cond_wait(session, conn->log_cond, WT_MILLION)); } if (0) { err: __wt_err(session, ret, "log server error"); } if (locked) (void)__wt_writeunlock(session, log->log_archive_lock); return (WT_THREAD_RET_VALUE); }
/*
 * __wt_lsm_tree_writeunlock --
 *    Release an exclusive lock on an LSM tree.
 */
int
__wt_lsm_tree_writeunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
    WT_DECL_RET;

    /* Clear the session flags set while the lock was held. */
    F_CLR(session, WT_SESSION_NO_CACHE_CHECK | WT_SESSION_NO_SCHEMA_LOCK);

    /* Failing to release the lock is unrecoverable: panic. */
    ret = __wt_writeunlock(session, lsm_tree->rwlock);
    if (ret != 0)
        WT_PANIC_RET(session, ret, "Unlocking an LSM tree");
    return (0);
}
/*
 * __wt_thread_group_resize --
 *    Resize an array of utility threads taking the lock.
 */
int
__wt_thread_group_resize(
    WT_SESSION_IMPL *session, WT_THREAD_GROUP *group,
    uint32_t new_min, uint32_t new_max, uint32_t flags)
{
    WT_DECL_RET;

    __wt_verbose(session, WT_VERB_THREAD_GROUP,
        "Resize thread group: %p, from min: %" PRIu32 " -> %" PRIu32
        " from max: %" PRIu32 " -> %" PRIu32,
        (void *)group, group->min, new_min, group->max, new_max);

    /* Serialize with other operations on this thread group. */
    __wt_writelock(session, group->lock);
    ret = __thread_group_resize(session, group, new_min, new_max, flags);
    __wt_writeunlock(session, group->lock);

    return (ret);
}
/*
 * __wt_ovfl_discard --
 *    Discard an on-page overflow value, and reset the page's cell.
 */
int
__wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell)
{
    WT_BM *bm;
    WT_BTREE *btree;
    WT_CELL_UNPACK *unpack, _unpack;
    WT_DECL_RET;

    btree = S2BT(session);
    bm = btree->bm;
    unpack = &_unpack;

    /* Unpack the cell to learn its type and the backing block address. */
    __wt_cell_unpack(cell, unpack);

    /*
     * Finally remove overflow key/value objects, called when
     * reconciliation finishes after successfully writing a page.
     *
     * Keys must have already been instantiated and value objects must
     * have already been cached (if they might potentially still be read
     * by any running transaction).
     *
     * Acquire the overflow lock to avoid racing with a thread reading
     * the backing overflow blocks.
     */
    WT_RET(__wt_writelock(session, btree->ovfl_lock));

    /* Mark the cell's overflow object as removed. */
    switch (unpack->raw) {
    case WT_CELL_KEY_OVFL:
        __wt_cell_type_reset(session,
            unpack->cell, WT_CELL_KEY_OVFL, WT_CELL_KEY_OVFL_RM);
        break;
    case WT_CELL_VALUE_OVFL:
        __wt_cell_type_reset(session,
            unpack->cell, WT_CELL_VALUE_OVFL, WT_CELL_VALUE_OVFL_RM);
        break;
    /*
     * NOTE(review): if WT_ILLEGAL_VALUE expands to a default case that
     * returns immediately, that path would return with ovfl_lock still
     * held -- confirm the macro's expansion.
     */
    WT_ILLEGAL_VALUE(session);
    }
    WT_TRET(__wt_writeunlock(session, btree->ovfl_lock));

    /* Free the backing disk blocks. */
    WT_TRET(bm->free(bm, session, unpack->data, unpack->size));

    return (ret);
}
/*
 * __backup_stop --
 *    Stop a backup.
 */
static int
__backup_stop(WT_SESSION_IMPL *session)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;

    conn = S2C(session);

    /* Discard any backup-specific file, remembering any failure. */
    WT_TRET(__wt_backup_file_remove(session));

    /*
     * Clear the in-progress flag under the hot-backup lock: checkpoint
     * deletion can proceed, as can the next hot backup.
     */
    WT_TRET(__wt_writelock(session, conn->hot_backup_lock));
    conn->hot_backup = false;
    WT_TRET(__wt_writeunlock(session, conn->hot_backup_lock));

    return (ret);
}
/*
 * __wt_thread_group_start_one --
 *    Start a new thread if possible.
 */
int
__wt_thread_group_start_one(
    WT_SESSION_IMPL *session, WT_THREAD_GROUP *group, bool wait)
{
    WT_DECL_RET;

    /* Cheap unlocked check: already at capacity, nothing to do. */
    if (group->current_threads >= group->max)
        return (0);

    /* Take the group lock, or give up if the caller won't wait. */
    if (!wait) {
        if (__wt_try_writelock(session, group->lock) != 0)
            return (0);
    } else
        __wt_writelock(session, group->lock);

    /* Recheck the bounds now that we hold the lock */
    if (group->current_threads < group->max)
        ret = __thread_group_grow(
            session, group, group->current_threads + 1);
    __wt_writeunlock(session, group->lock);

    return (ret);
}
/*
 * __wt_ovfl_track_wrapup_err --
 *    Resolve the page's overflow tracking on reconciliation error.
 */
int
__wt_ovfl_track_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    WT_DECL_RET;
    WT_OVFL_TRACK *track;

    /* Nothing to do if the page never tracked overflow objects. */
    if (page->modify == NULL || page->modify->ovfl_track == NULL)
        return (0);

    track = page->modify->ovfl_track;
    if (track->discard != NULL)
        WT_RET(__ovfl_discard_wrapup_err(session, page));

    if (track->ovfl_reuse[0] != NULL)
        WT_RET(__ovfl_reuse_wrapup_err(session, page));

    if (track->ovfl_txnc[0] != NULL) {
        /*
         * Hold the btree's overflow lock across the wrapup so we
         * don't race with readers of the backing overflow blocks.
         */
        WT_RET(__wt_writelock(session, S2BT(session)->ovfl_lock));
        ret = __ovfl_txnc_wrapup(session, page);
        WT_TRET(__wt_writeunlock(session, S2BT(session)->ovfl_lock));
    }

    /*
     * Return the wrapup/unlock result: the previous version returned 0
     * unconditionally, silently discarding any error collected in "ret".
     */
    return (ret);
}
/*
 * __wt_thread_group_create --
 *    Create a new thread group, assumes incoming group structure is
 *    zero initialized.
 */
int
__wt_thread_group_create(
    WT_SESSION_IMPL *session, WT_THREAD_GROUP *group, const char *name,
    uint32_t min, uint32_t max, uint32_t flags,
    int (*run_func)(WT_SESSION_IMPL *session, WT_THREAD *context))
{
    WT_DECL_RET;
    bool cond_alloced;	/* Track whether wait_cond needs cleanup. */

    /* Check that the structure is initialized as expected */
    WT_ASSERT(session, group->alloc == 0);

    cond_alloced = false;

    __wt_verbose(session, WT_VERB_THREAD_GROUP,
        "Creating thread group: %p", (void *)group);

    /* Allocate the group's lock and condition variable. */
    WT_RET(__wt_rwlock_alloc(session, &group->lock, "Thread group"));
    WT_ERR(__wt_cond_alloc(
        session, "Thread group cond", false, &group->wait_cond));
    cond_alloced = true;

    /* Initialize the group under its own lock and start the threads. */
    __wt_writelock(session, group->lock);
    group->run_func = run_func;
    group->name = name;
    WT_TRET(__thread_group_resize(session, group, min, max, flags));
    __wt_writeunlock(session, group->lock);

    /*
     * Cleanup on error to avoid leaking resources.  Note the success path
     * deliberately falls through this label; the "ret != 0" guard keeps
     * it a no-op in that case.
     */
err:    if (ret != 0) {
        if (cond_alloced)
            WT_TRET(__wt_cond_destroy(session, &group->wait_cond));
        __wt_rwlock_destroy(session, &group->lock);
    }
    return (ret);
}
/*
 * __sweep --
 *    Close unused dhandles on the connection dhandle list.
 */
static int
__sweep(WT_SESSION_IMPL *session)
{
    WT_CONNECTION_IMPL *conn;
    WT_DATA_HANDLE *dhandle, *dhandle_next;
    WT_DECL_RET;
    time_t now;

    conn = S2C(session);

    /* Don't discard handles that have been open recently. */
    WT_RET(__wt_seconds(session, &now));

    /* Walk the list; grab the next pointer first, entries may go away. */
    dhandle = SLIST_FIRST(&conn->dhlh);
    for (; dhandle != NULL; dhandle = dhandle_next) {
        dhandle_next = SLIST_NEXT(dhandle, l);
        /* Skip handles still referenced or not quiescent long enough. */
        if (dhandle->session_ref != 0 ||
            now - dhandle->timeofdeath <= WT_DHANDLE_SWEEP_WAIT)
            continue;

        /*
         * We have a candidate for closing; if it's open, flush dirty
         * leaf pages, then acquire an exclusive lock on the handle
         * and close it. We might be blocking opens for a long time
         * (over disk I/O), but the handle was quiescent for awhile.
         *
         * The close can fail if an update cannot be written (updates
         * in a no-longer-referenced file might not yet be globally
         * visible if sessions have disjoint sets of files open). If
         * the handle is busy, skip it, we'll retry the close the next
         * time, after the transaction state has progressed.
         */
        if (F_ISSET(dhandle, WT_DHANDLE_OPEN)) {
            WT_WITH_DHANDLE(session, dhandle,
                ret = __wt_cache_op(
                session, NULL, WT_SYNC_WRITE_LEAVES));
            WT_RET(ret);

            /* Re-check that this looks like a good candidate. */
            if (dhandle->session_ref != 0 ||
                now - dhandle->timeofdeath <= WT_DHANDLE_SWEEP_WAIT)
                continue;

            /*
             * We don't set WT_DHANDLE_EXCLUSIVE deliberately, we
             * want opens to block on us rather than returning an
             * EBUSY error to the application.
             */
            ret = __wt_try_writelock(session, dhandle->rwlock);
            if (ret == EBUSY) {
                ret = 0;
                continue;
            }
            WT_RET(ret);

            /* Close the handle; a busy handle is retried later. */
            WT_WITH_DHANDLE(session, dhandle,
                ret = __wt_conn_btree_sync_and_close(session, 0));
            if (ret == EBUSY)
                ret = 0;

            WT_TRET(__wt_writeunlock(session, dhandle->rwlock));
            WT_RET(ret);
        }

        /*
         * Attempt to discard the handle (the called function checks the
         * handle-open flag after acquiring appropriate locks, which is
         * why we don't do any special handling of EBUSY returns above,
         * that path never cleared the handle-open flag.
         */
        WT_WITH_DHANDLE(session, dhandle,
            ret = __wt_conn_dhandle_discard_single(session, 0));
        if (ret == EBUSY)
            ret = 0;
        WT_RET(ret);
    }
    return (0);
}
/*
 * __wt_txn_update_oldest --
 *    Sweep the running transactions to update the oldest ID required.
 */
int
__wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_SESSION_IMPL *oldest_session;
    WT_TXN_GLOBAL *txn_global;
    uint64_t current_id, last_running, oldest_id;
    uint64_t prev_last_running, prev_oldest_id;
    bool strict, wait;	/* Caller's update policy, from flags. */

    conn = S2C(session);
    txn_global = &conn->txn_global;
    strict = LF_ISSET(WT_TXN_OLDEST_STRICT);
    wait = LF_ISSET(WT_TXN_OLDEST_WAIT);

    current_id = last_running = txn_global->current;
    prev_last_running = txn_global->last_running;
    prev_oldest_id = txn_global->oldest_id;

    /*
     * For pure read-only workloads, or if the update isn't forced and the
     * oldest ID isn't too far behind, avoid scanning.
     */
    if (prev_oldest_id == current_id ||
        (!strict && WT_TXNID_LT(current_id, prev_oldest_id + 100)))
        return (0);

    /* First do a read-only scan. */
    if (wait)
        __wt_readlock(session, txn_global->scan_rwlock);
    else if ((ret =
        __wt_try_readlock(session, txn_global->scan_rwlock)) != 0)
        return (ret == EBUSY ? 0 : ret);
    __txn_oldest_scan(session, &oldest_id, &last_running, &oldest_session);
    __wt_readunlock(session, txn_global->scan_rwlock);

    /*
     * If the state hasn't changed (or hasn't moved far enough for
     * non-forced updates), give up.
     */
    if ((oldest_id == prev_oldest_id ||
        (!strict && WT_TXNID_LT(oldest_id, prev_oldest_id + 100))) &&
        ((last_running == prev_last_running) ||
        (!strict && WT_TXNID_LT(last_running, prev_last_running + 100))))
        return (0);

    /* It looks like an update is necessary, wait for exclusive access. */
    if (wait)
        __wt_writelock(session, txn_global->scan_rwlock);
    else if ((ret =
        __wt_try_writelock(session, txn_global->scan_rwlock)) != 0)
        return (ret == EBUSY ? 0 : ret);

    /*
     * If the oldest ID has been updated while we waited, don't bother
     * scanning.
     */
    if (WT_TXNID_LE(oldest_id, txn_global->oldest_id) &&
        WT_TXNID_LE(last_running, txn_global->last_running))
        goto done;

    /*
     * Re-scan now that we have exclusive access. This is necessary because
     * threads get transaction snapshots with read locks, and we have to be
     * sure that there isn't a thread that has got a snapshot locally but
     * not yet published its snap_min.
     */
    __txn_oldest_scan(session, &oldest_id, &last_running, &oldest_session);

#ifdef HAVE_DIAGNOSTIC
    {
    /*
     * Make sure the ID doesn't move past any named snapshots.
     *
     * Don't include the read/assignment in the assert statement. Coverity
     * complains if there are assignments only done in diagnostic builds,
     * and when the read is from a volatile.
     */
    uint64_t id = txn_global->nsnap_oldest_id;
    WT_ASSERT(session, id == WT_TXN_NONE || !WT_TXNID_LT(id, oldest_id));
    }
#endif
    /* Update the oldest ID. */
    if (WT_TXNID_LT(txn_global->oldest_id, oldest_id))
        txn_global->oldest_id = oldest_id;
    if (WT_TXNID_LT(txn_global->last_running, last_running)) {
        txn_global->last_running = last_running;

#ifdef HAVE_VERBOSE
        /* Output a verbose message about long-running transactions,
         * but only when some progress is being made. */
        if (WT_VERBOSE_ISSET(session, WT_VERB_TRANSACTION) &&
            current_id - oldest_id > 10000 && oldest_session != NULL) {
            __wt_verbose(session, WT_VERB_TRANSACTION,
                "old snapshot %" PRIu64
                " pinned in session %" PRIu32 " [%s]"
                " with snap_min %" PRIu64 "\n",
                oldest_id, oldest_session->id,
                oldest_session->lastop,
                oldest_session->txn.snap_min);
        }
#endif
    }

done:    __wt_writeunlock(session, txn_global->scan_rwlock);
    return (ret);
}
/* * __wt_txn_checkpoint_log -- * Write a log record for a checkpoint operation. */ int __wt_txn_checkpoint_log( WT_SESSION_IMPL *session, bool full, uint32_t flags, WT_LSN *lsnp) { WT_CONNECTION_IMPL *conn; WT_DECL_ITEM(logrec); WT_DECL_RET; WT_ITEM *ckpt_snapshot, empty; WT_LSN *ckpt_lsn; WT_TXN *txn; WT_TXN_GLOBAL *txn_global; uint8_t *end, *p; size_t recsize; uint32_t i, rectype; const char *fmt; conn = S2C(session); txn_global = &conn->txn_global; txn = &session->txn; ckpt_lsn = &txn->ckpt_lsn; /* * If this is a file sync, log it unless there is a full checkpoint in * progress. */ if (!full) { if (txn->full_ckpt) { if (lsnp != NULL) *lsnp = *ckpt_lsn; return (0); } return (__txn_log_file_sync(session, flags, lsnp)); } switch (flags) { case WT_TXN_LOG_CKPT_PREPARE: txn->full_ckpt = true; if (conn->compat_major >= WT_LOG_V2) { /* * Write the system log record containing a checkpoint * start operation. */ rectype = WT_LOGREC_SYSTEM; fmt = WT_UNCHECKED_STRING(I); WT_ERR(__wt_struct_size( session, &recsize, fmt, rectype)); WT_ERR(__wt_logrec_alloc(session, recsize, &logrec)); WT_ERR(__wt_struct_pack(session, (uint8_t *)logrec->data + logrec->size, recsize, fmt, rectype)); logrec->size += (uint32_t)recsize; WT_ERR(__wt_logop_checkpoint_start_pack( session, logrec)); WT_ERR(__wt_log_write(session, logrec, ckpt_lsn, 0)); } else { WT_ERR(__wt_log_printf(session, "CHECKPOINT: Starting record")); WT_ERR(__wt_log_flush_lsn(session, ckpt_lsn, true)); } /* * We take and immediately release the visibility lock. * Acquiring the write lock guarantees that any transaction * that has written to the log has also made its transaction * visible at this time. */ __wt_writelock(session, &txn_global->visibility_rwlock); __wt_writeunlock(session, &txn_global->visibility_rwlock); /* * We need to make sure that the log records in the checkpoint * LSN are on disk. In particular to make sure that the * current log file exists. 
*/ WT_ERR(__wt_log_force_sync(session, ckpt_lsn)); break; case WT_TXN_LOG_CKPT_START: /* Take a copy of the transaction snapshot. */ txn->ckpt_nsnapshot = txn->snapshot_count; recsize = (size_t)txn->ckpt_nsnapshot * WT_INTPACK64_MAXSIZE; WT_ERR(__wt_scr_alloc(session, recsize, &txn->ckpt_snapshot)); p = txn->ckpt_snapshot->mem; end = p + recsize; for (i = 0; i < txn->snapshot_count; i++) WT_ERR(__wt_vpack_uint( &p, WT_PTRDIFF(end, p), txn->snapshot[i])); break; case WT_TXN_LOG_CKPT_STOP: /* * During a clean connection close, we get here without the * prepare or start steps. In that case, log the current LSN * as the checkpoint LSN. */ if (!txn->full_ckpt) { txn->ckpt_nsnapshot = 0; WT_CLEAR(empty); ckpt_snapshot = ∅ WT_ERR(__wt_log_flush_lsn(session, ckpt_lsn, true)); } else ckpt_snapshot = txn->ckpt_snapshot; /* Write the checkpoint log record. */ rectype = WT_LOGREC_CHECKPOINT; fmt = WT_UNCHECKED_STRING(IIIIu); WT_ERR(__wt_struct_size(session, &recsize, fmt, rectype, ckpt_lsn->l.file, ckpt_lsn->l.offset, txn->ckpt_nsnapshot, ckpt_snapshot)); WT_ERR(__wt_logrec_alloc(session, recsize, &logrec)); WT_ERR(__wt_struct_pack(session, (uint8_t *)logrec->data + logrec->size, recsize, fmt, rectype, ckpt_lsn->l.file, ckpt_lsn->l.offset, txn->ckpt_nsnapshot, ckpt_snapshot)); logrec->size += (uint32_t)recsize; WT_ERR(__wt_log_write(session, logrec, lsnp, F_ISSET(conn, WT_CONN_CKPT_SYNC) ? WT_LOG_FSYNC : 0)); /* * If this full checkpoint completed successfully and there is * no hot backup in progress and this is not an unclean * recovery, tell the logging subsystem the checkpoint LSN so * that it can archive. Do not update the logging checkpoint * LSN if this is during a clean connection close, only during * a full checkpoint. A clean close may not update any * metadata LSN and we do not want to archive in that case. 
*/ if (!conn->hot_backup && (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_DIRTY) || FLD_ISSET(conn->log_flags, WT_CONN_LOG_FORCE_DOWNGRADE)) && txn->full_ckpt) __wt_log_ckpt(session, ckpt_lsn); /* FALLTHROUGH */ case WT_TXN_LOG_CKPT_CLEANUP: /* Cleanup any allocated resources */ WT_INIT_LSN(ckpt_lsn); txn->ckpt_nsnapshot = 0; __wt_scr_free(session, &txn->ckpt_snapshot); txn->full_ckpt = false; break; WT_ILLEGAL_VALUE_ERR(session); } err: __wt_logrec_free(session, &logrec); return (ret); }
/*
 * __sweep --
 *    Close unused dhandles on the connection dhandle list.
 */
static int
__sweep(WT_SESSION_IMPL *session)
{
    WT_CONNECTION_IMPL *conn;
    WT_DATA_HANDLE *dhandle, *dhandle_next;
    WT_DECL_RET;
    time_t now;
    int locked;	/* Non-zero while we hold dhandle->rwlock. */

    conn = S2C(session);

    /* Don't discard handles that have been open recently. */
    WT_RET(__wt_seconds(session, &now));

    WT_STAT_FAST_CONN_INCR(session, dh_conn_sweeps);
    dhandle = SLIST_FIRST(&conn->dhlh);
    for (; dhandle != NULL; dhandle = dhandle_next) {
        dhandle_next = SLIST_NEXT(dhandle, l);
        /* The metadata handle is never swept. */
        if (WT_IS_METADATA(dhandle))
            continue;
        /* Skip handles in use or not quiescent long enough. */
        if (dhandle->session_inuse != 0 ||
            now <= dhandle->timeofdeath + WT_DHANDLE_SWEEP_WAIT)
            continue;
        /* First time we see an idle handle, just stamp it. */
        if (dhandle->timeofdeath == 0) {
            dhandle->timeofdeath = now;
            WT_STAT_FAST_CONN_INCR(session, dh_conn_tod);
            continue;
        }

        /*
         * We have a candidate for closing; if it's open, acquire an
         * exclusive lock on the handle and close it. We might be
         * blocking opens for a long time (over disk I/O), but the
         * handle was quiescent for awhile.
         *
         * The close can fail if an update cannot be written (updates
         * in a no-longer-referenced file might not yet be globally
         * visible if sessions have disjoint sets of files open). If
         * the handle is busy, skip it, we'll retry the close the next
         * time, after the transaction state has progressed.
         *
         * We don't set WT_DHANDLE_EXCLUSIVE deliberately, we want
         * opens to block on us rather than returning an EBUSY error to
         * the application.
         */
        if ((ret =
            __wt_try_writelock(session, dhandle->rwlock)) == EBUSY)
            continue;
        WT_RET(ret);
        locked = 1;

        /* If the handle is open, try to close it. */
        if (F_ISSET(dhandle, WT_DHANDLE_OPEN)) {
            WT_WITH_DHANDLE(session, dhandle,
                ret = __wt_conn_btree_sync_and_close(session, 0));
            if (ret != 0)
                goto unlock;

            /* We closed the btree handle, bump the statistic. */
            WT_STAT_FAST_CONN_INCR(session, dh_conn_handles);
        }

        /*
         * If there are no longer any references to the handle in any
         * sessions, attempt to discard it. The called function
         * re-checks that the handle is not in use, which is why we
         * don't do any special handling of EBUSY returns above.
         */
        if (dhandle->session_inuse == 0 && dhandle->session_ref == 0) {
            WT_WITH_DHANDLE(session, dhandle,
                ret = __wt_conn_dhandle_discard_single(session, 0));
            if (ret != 0)
                goto unlock;

            /* If the handle was discarded, it isn't locked. */
            locked = 0;
        } else
            WT_STAT_FAST_CONN_INCR(session, dh_conn_ref);

unlock:        if (locked)
            WT_TRET(__wt_writeunlock(session, dhandle->rwlock));

        WT_RET_BUSY_OK(ret);
    }
    return (0);
}
/*
 * __wt_txn_commit --
 *    Commit the current transaction.
 */
int
__wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
{
    WT_CONFIG_ITEM cval;
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_TXN *txn;
    WT_TXN_GLOBAL *txn_global;
    WT_TXN_OP *op;
    u_int i;
    bool locked, readonly;	/* locked: hold visibility read lock. */
#ifdef HAVE_TIMESTAMPS
    wt_timestamp_t prev_commit_timestamp, ts;
    bool update_timestamp;
#endif

    txn = &session->txn;
    conn = S2C(session);
    txn_global = &conn->txn_global;
    locked = false;

    WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING));
    WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR) || txn->mod_count == 0);

    /* A transaction with no modifications is read-only. */
    readonly = txn->mod_count == 0;
    /*
     * Look for a commit timestamp.
     */
    WT_ERR(
        __wt_config_gets_def(session, cfg, "commit_timestamp", 0, &cval));
    if (cval.len != 0) {
#ifdef HAVE_TIMESTAMPS
        WT_ERR(__wt_txn_parse_timestamp(session, "commit", &ts, &cval));
        WT_ERR(__wt_timestamp_validate(session,
            "commit", &ts, &cval, true, true, true));
        __wt_timestamp_set(&txn->commit_timestamp, &ts);
        __wt_txn_set_commit_timestamp(session);
#else
        WT_ERR_MSG(session, EINVAL, "commit_timestamp requires a "
            "version of WiredTiger built with timestamp support");
#endif
    }

#ifdef HAVE_TIMESTAMPS
    /*
     * Debugging checks on timestamps, if user requested them.
     */
    if (F_ISSET(txn, WT_TXN_TS_COMMIT_ALWAYS) &&
        !F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) &&
        txn->mod_count != 0)
        WT_ERR_MSG(session, EINVAL, "commit_timestamp required and "
            "none set on this transaction");
    if (F_ISSET(txn, WT_TXN_TS_COMMIT_NEVER) &&
        F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) &&
        txn->mod_count != 0)
        WT_ERR_MSG(session, EINVAL, "no commit_timestamp required and "
            "timestamp set on this transaction");
#endif
    /*
     * The default sync setting is inherited from the connection, but can
     * be overridden by an explicit "sync" setting for this transaction.
     */
    WT_ERR(__wt_config_gets_def(session, cfg, "sync", 0, &cval));

    /*
     * If the user chose the default setting, check whether sync is enabled
     * for this transaction (either inherited or via begin_transaction).
     * If sync is disabled, clear the field to avoid the log write being
     * flushed.
     *
     * Otherwise check for specific settings.  We don't need to check for
     * "on" because that is the default inherited from the connection.  If
     * the user set anything in begin_transaction, we only override with an
     * explicit setting.
     */
    if (cval.len == 0) {
        if (!FLD_ISSET(txn->txn_logsync, WT_LOG_SYNC_ENABLED) &&
            !F_ISSET(txn, WT_TXN_SYNC_SET))
            txn->txn_logsync = 0;
    } else {
        /*
         * If the caller already set sync on begin_transaction then
         * they should not be using sync on commit_transaction.
         * Flag that as an error.
         */
        if (F_ISSET(txn, WT_TXN_SYNC_SET))
            WT_ERR_MSG(session, EINVAL,
                "Sync already set during begin_transaction");
        if (WT_STRING_MATCH("background", cval.str, cval.len))
            txn->txn_logsync = WT_LOG_BACKGROUND;
        else if (WT_STRING_MATCH("off", cval.str, cval.len))
            txn->txn_logsync = 0;
        /*
         * We don't need to check for "on" here because that is the
         * default to inherit from the connection setting.
         */
    }

    /* Commit notification. */
    if (txn->notify != NULL)
        WT_ERR(txn->notify->notify(txn->notify,
            (WT_SESSION *)session, txn->id, 1));

    /*
     * We are about to release the snapshot: copy values into any
     * positioned cursors so they don't point to updates that could be
     * freed once we don't have a snapshot.
     */
    if (session->ncursors > 0) {
        WT_DIAGNOSTIC_YIELD;
        WT_ERR(__wt_session_copy_values(session));
    }

    /* If we are logging, write a commit log record. */
    if (txn->logrec != NULL &&
        FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) &&
        !F_ISSET(session, WT_SESSION_NO_LOGGING)) {
        /*
         * We are about to block on I/O writing the log.
         * Release our snapshot in case it is keeping data pinned.
         * This is particularly important for checkpoints.
         */
        __wt_txn_release_snapshot(session);

        /*
         * We hold the visibility lock for reading from the time
         * we write our log record until the time we release our
         * transaction so that the LSN any checkpoint gets will
         * always reflect visible data.
         */
        __wt_readlock(session, &txn_global->visibility_rwlock);
        locked = true;
        WT_ERR(__wt_txn_log_commit(session, cfg));
    }

    /* Note: we're going to commit: nothing can fail after this point. */

    /* Process and free updates. */
    for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++) {
        switch (op->type) {
        case WT_TXN_OP_BASIC:
        case WT_TXN_OP_BASIC_TS:
        case WT_TXN_OP_INMEM:
            /*
             * Switch reserved operations to abort to
             * simplify obsolete update list truncation.
             */
            if (op->u.upd->type == WT_UPDATE_RESERVED) {
                op->u.upd->txnid = WT_TXN_ABORTED;
                break;
            }

            /*
             * Writes to the lookaside file can be evicted as soon
             * as they commit.
             */
            if (conn->cache->las_fileid != 0 &&
                op->fileid == conn->cache->las_fileid) {
                op->u.upd->txnid = WT_TXN_NONE;
                break;
            }

#ifdef HAVE_TIMESTAMPS
            /* Stamp the update unless it carries its own. */
            if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) &&
                op->type != WT_TXN_OP_BASIC_TS) {
                WT_ASSERT(session,
                    op->fileid != WT_METAFILE_ID);
                __wt_timestamp_set(&op->u.upd->timestamp,
                    &txn->commit_timestamp);
            }
#endif
            break;
        case WT_TXN_OP_REF:
#ifdef HAVE_TIMESTAMPS
            if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT))
                __wt_timestamp_set(
                    &op->u.ref->page_del->timestamp,
                    &txn->commit_timestamp);
#endif
            break;
        case WT_TXN_OP_TRUNCATE_COL:
        case WT_TXN_OP_TRUNCATE_ROW:
            /* Other operations don't need timestamps. */
            break;
        }

        __wt_txn_op_free(session, op);
    }
    txn->mod_count = 0;

#ifdef HAVE_TIMESTAMPS
    /*
     * Track the largest commit timestamp we have seen.
     *
     * We don't actually clear the local commit timestamp, just the flag.
     * That said, we can't update the global commit timestamp until this
     * transaction is visible, which happens when we release it.
     */
    update_timestamp = F_ISSET(txn, WT_TXN_HAS_TS_COMMIT);
#endif

    __wt_txn_release(session);
    if (locked)
        __wt_readunlock(session, &txn_global->visibility_rwlock);

#ifdef HAVE_TIMESTAMPS
    /* First check if we've already committed something in the future. */
    if (update_timestamp) {
        WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock,
            __wt_timestamp_set(
            &prev_commit_timestamp, &txn_global->commit_timestamp));
        update_timestamp = __wt_timestamp_cmp(
            &txn->commit_timestamp, &prev_commit_timestamp) > 0;
    }

    /*
     * If it looks like we need to move the global commit timestamp,
     * write lock and re-check.
     */
    if (update_timestamp) {
#if WT_TIMESTAMP_SIZE == 8
        /* 8-byte timestamps can be advanced with a CAS loop. */
        while (__wt_timestamp_cmp(
            &txn->commit_timestamp, &prev_commit_timestamp) > 0) {
            if (__wt_atomic_cas64(
                &txn_global->commit_timestamp.val,
                prev_commit_timestamp.val,
                txn->commit_timestamp.val)) {
                txn_global->has_commit_timestamp = true;
                break;
            }
            __wt_timestamp_set(
                &prev_commit_timestamp,
                &txn_global->commit_timestamp);
        }
#else
        /* Larger timestamps need the write lock to update. */
        __wt_writelock(session, &txn_global->rwlock);
        if (__wt_timestamp_cmp(&txn->commit_timestamp,
            &txn_global->commit_timestamp) > 0) {
            __wt_timestamp_set(&txn_global->commit_timestamp,
                &txn->commit_timestamp);
            txn_global->has_commit_timestamp = true;
        }
        __wt_writeunlock(session, &txn_global->rwlock);
#endif
    }
#endif

    /*
     * We're between transactions, if we need to block for eviction, it's
     * a good time to do so.  Note that we must ignore any error return
     * because the user's data is committed.
     */
    if (!readonly)
        (void)__wt_cache_eviction_check(session, false, false, NULL);
    return (0);

err:    /*
     * If anything went wrong, roll back.
     *
     * !!!
     * Nothing can fail after this point.
     */
    if (locked)
        __wt_readunlock(session, &txn_global->visibility_rwlock);
    WT_TRET(__wt_txn_rollback(session, cfg));
    return (ret);
}
/*
 * __backup_start --
 *    Start a backup.
 */
static int
__backup_start(
    WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb, const char *cfg[])
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_FSTREAM *srcfs;
    const char *dest;	/* Final name for the temporary backup file. */
    bool exist, log_only, target_list;

    conn = S2C(session);
    srcfs = NULL;
    dest = NULL;

    cb->next = 0;
    cb->list = NULL;
    cb->list_next = 0;

    /* Backup cursors are not supported for in-memory configurations. */
    WT_RET(__wt_inmem_unsupported_op(session, "backup cursor"));

    /*
     * Single thread hot backups: we're holding the schema lock, so we
     * know we'll serialize with other attempts to start a hot backup.
     */
    if (conn->hot_backup)
        WT_RET_MSG(
            session, EINVAL, "there is already a backup cursor open");

    /*
     * The hot backup copy is done outside of WiredTiger, which means file
     * blocks can't be freed and re-allocated until the backup completes.
     * The checkpoint code checks the backup flag, and if a backup cursor
     * is open checkpoints aren't discarded.  We release the lock as soon
     * as we've set the flag, we don't want to block checkpoints, we just
     * want to make sure no checkpoints are deleted.  The checkpoint code
     * holds the lock until it's finished the checkpoint, otherwise we
     * could start a hot backup that would race with an already-started
     * checkpoint.
     *
     * We are holding the checkpoint and schema locks so schema operations
     * will not see the backup file list until it is complete and valid.
     */
    __wt_writelock(session, &conn->hot_backup_lock);
    conn->hot_backup = true;
    conn->hot_backup_list = NULL;
    __wt_writeunlock(session, &conn->hot_backup_lock);

    /* We're the lock holder, we own cleanup. */
    F_SET(cb, WT_CURBACKUP_LOCKER);

    /*
     * Create a temporary backup file.  This must be opened before
     * generating the list of targets in backup_uri.  This file will
     * later be renamed to the correct name depending on whether or not
     * we're doing an incremental backup.  We need a temp file so that if
     * we fail or crash while filling it, the existence of a partial file
     * doesn't confuse restarting in the source database.
     */
    WT_ERR(__wt_fopen(session, WT_BACKUP_TMP,
        WT_FS_OPEN_CREATE, WT_STREAM_WRITE, &cb->bfs));

    /*
     * If a list of targets was specified, work our way through them.
     * Else, generate a list of all database objects.
     *
     * Include log files if doing a full backup, and copy them before
     * copying data files to avoid rolling the metadata forward across
     * a checkpoint that completes during the backup.
     */
    target_list = false;
    WT_ERR(__backup_uri(session, cfg, &target_list, &log_only));

    if (!target_list) {
        WT_ERR(__backup_log_append(session, cb, true));
        WT_ERR(__backup_all(session));
    }

    /* Add the hot backup and standard WiredTiger files to the list. */
    if (log_only) {
        /*
         * We also open an incremental backup source file so that we
         * can detect a crash with an incremental backup existing in
         * the source directory versus an improper destination.
         */
        dest = WT_INCREMENTAL_BACKUP;
        WT_ERR(__wt_fopen(session, WT_INCREMENTAL_SRC,
            WT_FS_OPEN_CREATE, WT_STREAM_WRITE, &srcfs));
        WT_ERR(__backup_list_append(
            session, cb, WT_INCREMENTAL_BACKUP));
    } else {
        dest = WT_METADATA_BACKUP;
        WT_ERR(__backup_list_append(session, cb, WT_METADATA_BACKUP));
        WT_ERR(__wt_fs_exist(session, WT_BASECONFIG, &exist));
        if (exist)
            WT_ERR(__backup_list_append(
                session, cb, WT_BASECONFIG));
        WT_ERR(__wt_fs_exist(session, WT_USERCONFIG, &exist));
        if (exist)
            WT_ERR(__backup_list_append(
                session, cb, WT_USERCONFIG));
        WT_ERR(__backup_list_append(session, cb, WT_WIREDTIGER));
    }

err:    /* Close the hot backup file. */
    WT_TRET(__wt_fclose(session, &cb->bfs));
    if (srcfs != NULL)
        WT_TRET(__wt_fclose(session, &srcfs));
    if (ret == 0) {
        /* Success: rename the temp file and publish the list. */
        WT_ASSERT(session, dest != NULL);
        WT_TRET(__wt_fs_rename(session, WT_BACKUP_TMP, dest, false));
        __wt_writelock(session, &conn->hot_backup_lock);
        conn->hot_backup_list = cb->list;
        __wt_writeunlock(session, &conn->hot_backup_lock);
    }

    return (ret);
}
/*
 * __log_server --
 *	The log server thread: forces out buffered log writes and, roughly
 *	once per second, pre-allocates log files and archives old ones.
 */
static WT_THREAD_RET
__log_server(void *arg)
{
	struct timespec start, now;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_LOG *log;
	WT_SESSION_IMPL *session;
	uint64_t timediff;		/* ms elapsed across the last wait */
	bool did_work, locked, signalled;

	session = arg;
	conn = S2C(session);
	log = conn->log;
	locked = signalled = false;

	/*
	 * Set this to the number of milliseconds we want to run archive and
	 * pre-allocation.  Start it so that we run on the first time through.
	 */
	timediff = WT_THOUSAND;

	/*
	 * The log server thread does a variety of work.  It forces out any
	 * buffered log writes.  It pre-allocates log files and it performs
	 * log archiving.  The reason the wrlsn thread does not force out
	 * the buffered writes is because we want to process and move the
	 * write_lsn forward as quickly as possible.  The same reason applies
	 * to why the log file server thread does not force out the writes.
	 * That thread does fsync calls which can take a long time and we
	 * don't want log records sitting in the buffer over the time it
	 * takes to sync out an earlier file.
	 */
	did_work = true;
	while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) {
		/*
		 * Slots depend on future activity.  Force out buffered
		 * writes in case we are idle.  This cannot be part of the
		 * wrlsn thread because of interaction advancing the write_lsn
		 * and a buffer may need to wait for the write_lsn to advance
		 * in the case of a synchronous buffer.  We end up with a hang.
		 */
		WT_ERR_BUSY_OK(__wt_log_force_write(session, 0, &did_work));

		/*
		 * We don't want to archive or pre-allocate files as often as
		 * we want to force out log buffers.  Only do it once per second
		 * or if the condition was signalled.
		 */
		if (timediff >= WT_THOUSAND || signalled) {
			/*
			 * Perform log pre-allocation.
			 */
			if (conn->log_prealloc > 0) {
				/*
				 * Log file pre-allocation is disabled when a
				 * hot backup cursor is open because we have
				 * agreed not to rename or remove any files in
				 * the database directory.
				 */
				WT_ERR(__wt_readlock(
				    session, conn->hot_backup_lock));
				locked = true;
				if (!conn->hot_backup)
					WT_ERR(__log_prealloc_once(session));
				WT_ERR(__wt_readunlock(
				    session, conn->hot_backup_lock));
				locked = false;
			}

			/*
			 * Perform the archive.  A failure to get the archive
			 * lock means a log cursor holds it; skip this round.
			 */
			if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ARCHIVE)) {
				if (__wt_try_writelock(
				    session, log->log_archive_lock) == 0) {
					ret = __log_archive_once(session, 0);
					WT_TRET(__wt_writeunlock(
					    session, log->log_archive_lock));
					WT_ERR(ret);
				} else
					WT_ERR(
					    __wt_verbose(session, WT_VERB_LOG,
					    "log_archive: Blocked due to open "
					    "log cursor holding archive lock"));
			}
		}

		/* Wait until the next event. */
		WT_ERR(__wt_epoch(session, &start));
		WT_ERR(__wt_cond_auto_wait_signal(session, conn->log_cond,
		    did_work, &signalled));
		WT_ERR(__wt_epoch(session, &now));
		timediff = WT_TIMEDIFF_MS(now, start);
	}

	if (0) {
err:		__wt_err(session, ret, "log server error");
		/* Drop the hot-backup read lock if an error path held it. */
		if (locked)
			WT_TRET(__wt_readunlock(
			    session, conn->hot_backup_lock));
	}
	return (WT_THREAD_RET_VALUE);
}
/*
 * __backup_start --
 *	Start a backup: single-thread hot backups and build the list of
 *	files the application must copy.
 */
static int
__backup_start(
    WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb, const char *cfg[])
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	bool exist, log_only, target_list;

	conn = S2C(session);

	cb->next = 0;
	cb->list = NULL;
	cb->list_next = 0;

	/*
	 * Single thread hot backups: we're holding the schema lock, so we
	 * know we'll serialize with other attempts to start a hot backup.
	 */
	if (conn->hot_backup)
		WT_RET_MSG(
		    session, EINVAL, "there is already a backup cursor open");

	/*
	 * The hot backup copy is done outside of WiredTiger, which means file
	 * blocks can't be freed and re-allocated until the backup completes.
	 * The checkpoint code checks the backup flag, and if a backup cursor
	 * is open checkpoints aren't discarded.  We release the lock as soon
	 * as we've set the flag, we don't want to block checkpoints, we just
	 * want to make sure no checkpoints are deleted.  The checkpoint code
	 * holds the lock until it's finished the checkpoint, otherwise we
	 * could start a hot backup that would race with an already-started
	 * checkpoint.
	 */
	WT_RET(__wt_writelock(session, conn->hot_backup_lock));
	conn->hot_backup = true;
	WT_ERR(__wt_writeunlock(session, conn->hot_backup_lock));

	/* Create the hot backup file. */
	WT_ERR(__backup_file_create(session, cb, false));

	/* Add log files if logging is enabled. */

	/*
	 * If a list of targets was specified, work our way through them.
	 * Else, generate a list of all database objects.
	 *
	 * Include log files if doing a full backup, and copy them before
	 * copying data files to avoid rolling the metadata forward across
	 * a checkpoint that completes during the backup.
	 */
	target_list = false;
	WT_ERR(__backup_uri(session, cfg, &target_list, &log_only));

	if (!target_list) {
		WT_ERR(__backup_log_append(session, cb, true));
		WT_ERR(__backup_all(session));
	}

	/* Add the hot backup and standard WiredTiger files to the list. */
	if (log_only) {
		/*
		 * Close any hot backup file.
		 * We're about to open the incremental backup file.
		 */
		WT_TRET(__wt_fclose(&cb->bfp, WT_FHANDLE_WRITE));
		WT_ERR(__backup_file_create(session, cb, log_only));
		WT_ERR(__backup_list_append(
		    session, cb, WT_INCREMENTAL_BACKUP));
	} else {
		WT_ERR(__backup_list_append(session, cb, WT_METADATA_BACKUP));
		WT_ERR(__wt_exist(session, WT_BASECONFIG, &exist));
		if (exist)
			WT_ERR(__backup_list_append(
			    session, cb, WT_BASECONFIG));
		WT_ERR(__wt_exist(session, WT_USERCONFIG, &exist));
		if (exist)
			WT_ERR(__backup_list_append(
			    session, cb, WT_USERCONFIG));
		WT_ERR(__backup_list_append(session, cb, WT_WIREDTIGER));
	}

err:	/* Close the hot backup file. */
	WT_TRET(__wt_fclose(&cb->bfp, WT_FHANDLE_WRITE));
	/* On failure, release handles and clear the hot-backup state. */
	if (ret != 0) {
		WT_TRET(__backup_cleanup_handles(session, cb));
		WT_TRET(__backup_stop(session));
	}

	return (ret);
}
/*
 * __conn_btree_open --
 *	Open the current btree handle.  Expects the caller to hold the schema
 *	lock and the handle exclusively; may downgrade to a read lock once
 *	the handle is open unless exclusive access was requested.
 */
static int
__conn_btree_open(
    WT_SESSION_IMPL *session, const char *cfg[], uint32_t flags)
{
	WT_BTREE *btree;
	WT_DATA_HANDLE *dhandle;
	WT_DECL_RET;

	dhandle = session->dhandle;
	btree = S2BT(session);

	/* Preconditions: schema lock, exclusive handle, not lock-only. */
	WT_ASSERT(session,
	    F_ISSET(session, WT_SESSION_SCHEMA_LOCKED) &&
	    F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE) &&
	    !LF_ISSET(WT_DHANDLE_LOCK_ONLY));

	/*
	 * If the handle is already open, it has to be closed so it can be
	 * reopened with a new configuration.  We don't need to check again:
	 * this function isn't called if the handle is already open in the
	 * required mode.
	 *
	 * This call can return EBUSY if there's an update in the object
	 * that's not yet globally visible.  That's not a problem because it
	 * can only happen when we're switching from a normal handle to a
	 * "special" one, so we're returning EBUSY to an attempt to verify or
	 * do other special operations.  The reverse won't happen because when
	 * the handle from a verify or other special operation is closed,
	 * there won't be updates in the tree that can block the close.
	 */
	if (F_ISSET(dhandle, WT_DHANDLE_OPEN))
		WT_RET(__wt_conn_btree_sync_and_close(session, 0));

	/* Discard any previous configuration, set up the new configuration. */
	__conn_btree_config_clear(session);
	WT_RET(__conn_btree_config_set(session));

	/* Set any special flags on the handle. */
	F_SET(btree, LF_ISSET(WT_BTREE_SPECIAL_FLAGS));

	/*
	 * Loop: after downgrading the lock another thread may have closed
	 * the handle, in which case we re-open it.
	 */
	do {
		WT_ERR(__wt_btree_open(session, cfg));
		F_SET(dhandle, WT_DHANDLE_OPEN);

		/*
		 * Checkpoint handles are read only, so eviction calculations
		 * based on the number of btrees are better to ignore them.
		 */
		if (dhandle->checkpoint == NULL)
			++S2C(session)->open_btree_count;

		/* Drop back to a readlock if that is all that was needed. */
		if (!LF_ISSET(WT_DHANDLE_EXCLUSIVE)) {
			F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE);
			WT_ERR(__wt_writeunlock(session, dhandle->rwlock));
			WT_ERR(
			    __conn_dhandle_open_lock(session, dhandle, flags));
		}
	} while (!F_ISSET(dhandle, WT_DHANDLE_OPEN));

	if (0) {
err:		F_CLR(btree, WT_BTREE_SPECIAL_FLAGS);

		/* If the open failed, close the handle. */
		if (F_ISSET(dhandle, WT_DHANDLE_OPEN))
			WT_TRET(__wt_conn_btree_sync_and_close(session, 0));
	}

	return (ret);
}
/*
 * __conn_dhandle_open_lock --
 *	Spin on the current data handle until either (a) it is open, read
 *	locked; or (b) it is closed, write locked.  If exclusive access is
 *	requested and cannot be granted immediately because the handle is
 *	in use, fail with EBUSY.
 *
 *	Here is a brief summary of how different operations synchronize using
 *	either the schema lock, handle locks or handle flags:
 *
 *	open -- holds the schema lock, one thread gets the handle exclusive,
 *		reverts to a shared handle lock and drops the schema lock
 *		once the handle is open;
 *	bulk load -- sets bulk and exclusive;
 *	salvage, truncate, update, verify -- hold the schema lock, set a
 *		"special" flag;
 *	sweep -- gets a write lock on the handle, doesn't set exclusive
 *
 *	The schema lock prevents a lot of potential conflicts: we should never
 *	see handles being salvaged or verified because those operation hold
 *	the schema lock.  However, it is possible to see a handle that is
 *	being bulk loaded, or that the sweep server is closing.
 *
 *	The principle here is that application operations can cause other
 *	application operations to fail (so attempting to open a cursor on a
 *	file while it is being bulk-loaded will fail), but internal or
 *	database-wide operations should not prevent application-initiated
 *	operations.  For example, attempting to verify a file should not fail
 *	because the sweep server happens to be in the process of closing that
 *	file.
 */
static int
__conn_dhandle_open_lock(
    WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle, uint32_t flags)
{
	WT_BTREE *btree;
	WT_DECL_RET;
	int is_open, lock_busy, want_exclusive;

	btree = dhandle->handle;
	lock_busy = 0;
	want_exclusive = LF_ISSET(WT_DHANDLE_EXCLUSIVE) ? 1 : 0;

	/*
	 * Check that the handle is open.  We've already incremented
	 * the reference count, so once the handle is open it won't be
	 * closed by another thread.
	 *
	 * If we can see the WT_DHANDLE_OPEN flag set while holding a
	 * lock on the handle, then it's really open and we can start
	 * using it.  Alternatively, if we can get an exclusive lock
	 * and WT_DHANDLE_OPEN is still not set, we need to do the open.
	 */
	for (;;) {
		/*
		 * If the handle is already open for a special operation,
		 * give up.
		 */
		if (F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS))
			return (EBUSY);

		/*
		 * If the handle is open, get a read lock and recheck.
		 *
		 * Wait for a read lock if we want exclusive access and failed
		 * to get it: the sweep server may be closing this handle, and
		 * we need to wait for it to complete.  If we want exclusive
		 * access and find the handle open once we get the read lock,
		 * give up: some other thread has it locked for real.
		 */
		if (F_ISSET(dhandle, WT_DHANDLE_OPEN) &&
		    (!want_exclusive || lock_busy)) {
			WT_RET(__wt_readlock(session, dhandle->rwlock));
			/* Recheck under the lock: the flag is now stable. */
			is_open = F_ISSET(dhandle, WT_DHANDLE_OPEN) ? 1 : 0;
			if (is_open && !want_exclusive)
				return (0);
			WT_RET(__wt_readunlock(session, dhandle->rwlock));
		} else
			is_open = 0;

		/*
		 * It isn't open or we want it exclusive: try to get an
		 * exclusive lock.  There is some subtlety here: if we race
		 * with another thread that successfully opens the file, we
		 * don't want to block waiting to get exclusive access.
		 */
		if ((ret =
		    __wt_try_writelock(session, dhandle->rwlock)) == 0) {
			/*
			 * If it was opened while we waited, drop the write
			 * lock and get a read lock instead.
			 */
			if (F_ISSET(dhandle, WT_DHANDLE_OPEN) &&
			    !want_exclusive) {
				lock_busy = 0;
				WT_RET(
				    __wt_writeunlock(session, dhandle->rwlock));
				continue;
			}

			/* We have an exclusive lock, we're done. */
			F_SET(dhandle, WT_DHANDLE_EXCLUSIVE);
			return (0);
		} else if (ret != EBUSY || (is_open && want_exclusive))
			return (ret);
		else
			lock_busy = 1;

		/* Give other threads a chance to make progress. */
		__wt_yield();
	}
}
/*
 * __log_server --
 *	The log server thread: forces out buffered log writes many times per
 *	second and, roughly once per second, pre-allocates log files and
 *	archives old ones.
 */
static WT_THREAD_RET
__log_server(void *arg)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_LOG *log;
	WT_SESSION_IMPL *session;
	int freq_per_sec, signalled;	/* countdown ticks; wakeup flag */

	session = arg;
	conn = S2C(session);
	log = conn->log;
	signalled = 0;

	/*
	 * Set this to the number of times per second we want to force out the
	 * log slot buffer.
	 */
#define	WT_FORCE_PER_SECOND	20
	freq_per_sec = WT_FORCE_PER_SECOND;

	/*
	 * The log server thread does a variety of work.  It forces out any
	 * buffered log writes.  It pre-allocates log files and it performs
	 * log archiving.  The reason the wrlsn thread does not force out
	 * the buffered writes is because we want to process and move the
	 * write_lsn forward as quickly as possible.  The same reason applies
	 * to why the log file server thread does not force out the writes.
	 * That thread does fsync calls which can take a long time and we
	 * don't want log records sitting in the buffer over the time it
	 * takes to sync out an earlier file.
	 */
	while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) {
		/*
		 * Slots depend on future activity.  Force out buffered
		 * writes in case we are idle.  This cannot be part of the
		 * wrlsn thread because of interaction advancing the write_lsn
		 * and a buffer may need to wait for the write_lsn to advance
		 * in the case of a synchronous buffer.  We end up with a hang.
		 */
		WT_ERR_BUSY_OK(__wt_log_force_write(session, 0));

		/*
		 * We don't want to archive or pre-allocate files as often as
		 * we want to force out log buffers.  Only do it once per second
		 * or if the condition was signalled.
		 */
		if (--freq_per_sec <= 0 || signalled != 0) {
			freq_per_sec = WT_FORCE_PER_SECOND;

			/*
			 * Perform log pre-allocation.
			 *
			 * NOTE(review): unlike later revisions of this
			 * thread, this version does not check for an open
			 * hot-backup cursor before pre-allocating -- confirm
			 * whether that interaction matters at this revision.
			 */
			if (conn->log_prealloc > 0)
				WT_ERR(__log_prealloc_once(session));

			/*
			 * Perform the archive.  A failed try-lock means a
			 * log cursor holds the archive lock; skip this round.
			 */
			if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ARCHIVE)) {
				if (__wt_try_writelock(
				    session, log->log_archive_lock) == 0) {
					ret = __log_archive_once(session, 0);
					WT_TRET(__wt_writeunlock(
					    session, log->log_archive_lock));
					WT_ERR(ret);
				} else
					WT_ERR(
					    __wt_verbose(session, WT_VERB_LOG,
					    "log_archive: Blocked due to open "
					    "log cursor holding archive lock"));
			}
		}

		/* Wait until the next event. */
		WT_ERR(__wt_cond_wait_signal(session, conn->log_cond,
		    WT_MILLION / WT_FORCE_PER_SECOND, &signalled));
	}

	if (0) {
err:		__wt_err(session, ret, "log server error");
	}
	return (WT_THREAD_RET_VALUE);
}
/*
 * __compact_rewrite --
 *	Return if a page needs to be re-written.
 */
static int
__compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
{
	WT_BM *bm;
	WT_DECL_RET;
	WT_MULTI *multi;
	WT_PAGE *page;
	WT_PAGE_MODIFY *mod;
	size_t addr_size;
	uint32_t i;
	const uint8_t *addr;
	bool serialize;

	*skipp = true;					/* Default skip. */

	bm = S2BT(session)->bm;
	page = ref->page;
	mod = page->modify;

	/*
	 * Ignore the root: it may not have a replacement address, and besides,
	 * if anything else gets written, so will it.
	 */
	if (__wt_ref_is_root(ref))
		return (0);

	/* Ignore currently dirty pages, they will be written regardless. */
	if (__wt_page_is_modified(page))
		return (0);

	/*
	 * A clean page with no reconciliation result: ask the block manager
	 * about the original address.  Empty pages (no address) are merged
	 * into the parent and never need rewriting.
	 */
	if (mod == NULL || mod->rec_result == 0) {
		__wt_ref_info(ref, &addr, &addr_size, NULL);
		if (addr == NULL)
			return (0);
		return (bm->compact_page_skip(
		    bm, session, addr, addr_size, skipp));
	}

	/*
	 * The page's modification information can change underfoot if the
	 * page is being reconciled, serialize with reconciliation for the
	 * replacement and multi-block cases.
	 */
	serialize = mod->rec_result == WT_PM_REC_REPLACE ||
	    mod->rec_result == WT_PM_REC_MULTIBLOCK;
	if (serialize)
		__wt_writelock(session, &page->page_lock);

	switch (mod->rec_result) {
	case WT_PM_REC_REPLACE:
		/* Single replacement block: test its address. */
		ret = bm->compact_page_skip(bm, session,
		    mod->mod_replace.addr, mod->mod_replace.size, skipp);
		break;
	case WT_PM_REC_MULTIBLOCK:
		/*
		 * Multiple replacement blocks: test each on-disk address,
		 * stopping at the first error or the first block worth
		 * rewriting.  In-memory images have no address to test.
		 */
		for (i = 0, multi = mod->mod_multi;
		    i < mod->mod_multi_entries; ++i, ++multi) {
			if (multi->disk_image != NULL)
				continue;
			ret = bm->compact_page_skip(bm, session,
			    multi->addr.addr, multi->addr.size, skipp);
			if (ret != 0 || !*skipp)
				break;
		}
		break;
	}

	if (serialize)
		__wt_writeunlock(session, &page->page_lock);

	return (ret);
}