/*
 * __backup_stop --
 *     Stop a backup.
 */
static int
__backup_stop(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    int i;

    conn = S2C(session);

    /* Release all btree names held by the backup. */
    __wt_writelock(session, &conn->hot_backup_lock);
    conn->hot_backup_list = NULL;
    __wt_writeunlock(session, &conn->hot_backup_lock);
    if (cb->list != NULL) {
        for (i = 0; cb->list[i] != NULL; ++i)
            __wt_free(session, cb->list[i]);
        __wt_free(session, cb->list);
    }

    /* Remove any backup specific file. */
    WT_TRET(__wt_backup_file_remove(session));

    /* Checkpoint deletion can proceed, as can the next hot backup. */
    __wt_writelock(session, &conn->hot_backup_lock);
    conn->hot_backup = false;
    __wt_writeunlock(session, &conn->hot_backup_lock);

    return (ret);
}
/*
 * __wt_log_truncate_files --
 *     Truncate log files via archive once. Requires that the server is not
 *     currently running.
 */
int
__wt_log_truncate_files(
    WT_SESSION_IMPL *session, WT_CURSOR *cursor, const char *cfg[])
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_LOG *log;
    uint32_t backup_file, locked;

    WT_UNUSED(cfg);
    conn = S2C(session);
    log = conn->log;
    if (F_ISSET(conn, WT_CONN_SERVER_RUN) &&
        FLD_ISSET(conn->log_flags, WT_CONN_LOG_ARCHIVE))
        WT_RET_MSG(session, EINVAL,
            "Attempt to archive manually while a server is running");

    backup_file = 0;
    if (cursor != NULL)
        backup_file = WT_CURSOR_BACKUP_ID(cursor);
    WT_ASSERT(session, backup_file <= log->alloc_lsn.file);
    WT_RET(__wt_verbose(session, WT_VERB_LOG,
        "log_truncate_files: Archive once up to %" PRIu32, backup_file));
    WT_RET(__wt_writelock(session, log->log_archive_lock));
    locked = 1;
    WT_ERR(__log_archive_once(session, backup_file));
    WT_ERR(__wt_writeunlock(session, log->log_archive_lock));
    locked = 0;
err:
    if (locked)
        WT_RET(__wt_writeunlock(session, log->log_archive_lock));
    return (ret);
}
/*
 * __wt_log_truncate_files --
 *     Truncate log files via archive once. Requires that the server is not
 *     currently running.
 */
int
__wt_log_truncate_files(WT_SESSION_IMPL *session, WT_CURSOR *cursor, bool force)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_LOG *log;
    uint32_t backup_file;

    conn = S2C(session);
    if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))
        return (0);
    if (!force && F_ISSET(conn, WT_CONN_SERVER_LOG) &&
        FLD_ISSET(conn->log_flags, WT_CONN_LOG_ARCHIVE))
        WT_RET_MSG(session, EINVAL,
            "Attempt to archive manually while a server is running");

    log = conn->log;
    backup_file = 0;
    if (cursor != NULL) {
        WT_ASSERT(session, force == false);
        backup_file = WT_CURSOR_BACKUP_ID(cursor);
    }
    WT_ASSERT(session, backup_file <= log->alloc_lsn.l.file);
    __wt_verbose(session, WT_VERB_LOG,
        "log_truncate_files: Archive once up to %" PRIu32, backup_file);

    __wt_writelock(session, &log->log_archive_lock);
    ret = __log_archive_once(session, backup_file);
    __wt_writeunlock(session, &log->log_archive_lock);
    return (ret);
}
/*
 * __wt_lsm_tree_writelock --
 *     Acquire an exclusive lock on an LSM tree.
 */
int
__wt_lsm_tree_writelock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
    WT_RET(__wt_writelock(session, lsm_tree->rwlock));

    /*
     * Diagnostic: avoid deadlocks with the schema lock: if we need it for
     * an operation, we should already have it.
     */
    F_SET(session, WT_SESSION_NO_CACHE_CHECK | WT_SESSION_NO_SCHEMA_LOCK);
    return (0);
}
/*
 * __wt_lsm_tree_lock --
 *     Lock an LSM tree for reading or writing.
 */
int
__wt_lsm_tree_lock(
    WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, int exclusive)
{
    /*
     * Diagnostic: avoid deadlocks with the schema lock: if we need it for
     * an operation, we should already have it.
     */
    F_SET(session, WT_SESSION_NO_SCHEMA_LOCK);

    if (exclusive)
        return (__wt_writelock(session, lsm_tree->rwlock));
    else
        return (__wt_readlock(session, lsm_tree->rwlock));
}
/*
 * __wt_ovfl_discard --
 *     Discard an on-page overflow value, and reset the page's cell.
 */
int
__wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell)
{
    WT_BM *bm;
    WT_BTREE *btree;
    WT_CELL_UNPACK *unpack, _unpack;
    WT_DECL_RET;

    btree = S2BT(session);
    bm = btree->bm;
    unpack = &_unpack;

    __wt_cell_unpack(cell, unpack);

    /*
     * Finally remove overflow key/value objects, called when reconciliation
     * finishes after successfully writing a page.
     *
     * Keys must have already been instantiated and value objects must have
     * already been cached (if they might potentially still be read by any
     * running transaction).
     *
     * Acquire the overflow lock to avoid racing with a thread reading the
     * backing overflow blocks.
     */
    WT_RET(__wt_writelock(session, btree->ovfl_lock));

    switch (unpack->raw) {
    case WT_CELL_KEY_OVFL:
        __wt_cell_type_reset(session,
            unpack->cell, WT_CELL_KEY_OVFL, WT_CELL_KEY_OVFL_RM);
        break;
    case WT_CELL_VALUE_OVFL:
        __wt_cell_type_reset(session,
            unpack->cell, WT_CELL_VALUE_OVFL, WT_CELL_VALUE_OVFL_RM);
        break;
    WT_ILLEGAL_VALUE(session);
    }

    WT_TRET(__wt_writeunlock(session, btree->ovfl_lock));

    /* Free the backing disk blocks. */
    WT_TRET(bm->free(bm, session, unpack->data, unpack->size));

    return (ret);
}
/*
 * __wt_thread_group_resize --
 *     Resize an array of utility threads taking the lock.
 */
int
__wt_thread_group_resize(
    WT_SESSION_IMPL *session, WT_THREAD_GROUP *group,
    uint32_t new_min, uint32_t new_max, uint32_t flags)
{
    WT_DECL_RET;

    __wt_verbose(session, WT_VERB_THREAD_GROUP,
        "Resize thread group: %p, from min: %" PRIu32 " -> %" PRIu32
        " from max: %" PRIu32 " -> %" PRIu32,
        (void *)group, group->min, new_min, group->max, new_max);

    __wt_writelock(session, group->lock);
    WT_TRET(__thread_group_resize(session, group, new_min, new_max, flags));
    __wt_writeunlock(session, group->lock);

    return (ret);
}
/*
 * __backup_stop --
 *     Stop a backup.
 */
static int
__backup_stop(WT_SESSION_IMPL *session)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;

    conn = S2C(session);

    /* Remove any backup specific file. */
    ret = __wt_backup_file_remove(session);

    /* Checkpoint deletion can proceed, as can the next hot backup. */
    WT_TRET(__wt_writelock(session, conn->hot_backup_lock));
    conn->hot_backup = false;
    WT_TRET(__wt_writeunlock(session, conn->hot_backup_lock));

    return (ret);
}
/*
 * __wt_thread_group_start_one --
 *     Start a new thread if possible.
 */
int
__wt_thread_group_start_one(
    WT_SESSION_IMPL *session, WT_THREAD_GROUP *group, bool wait)
{
    WT_DECL_RET;

    if (group->current_threads >= group->max)
        return (0);

    if (wait)
        __wt_writelock(session, group->lock);
    else if (__wt_try_writelock(session, group->lock) != 0)
        return (0);

    /* Recheck the bounds now that we hold the lock. */
    if (group->current_threads < group->max)
        WT_TRET(__thread_group_grow(
            session, group, group->current_threads + 1));
    __wt_writeunlock(session, group->lock);

    return (ret);
}
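/*
 * Illustrative sketch (not WiredTiger code): the function above uses a cheap
 * unlocked bounds check, then a blocking or try write lock, then a recheck
 * under the lock before acting.  A minimal standalone version of that idiom
 * using POSIX rwlocks; all names below are hypothetical, and the lock is
 * assumed to have been set up with pthread_rwlock_init().
 */
#include <pthread.h>
#include <stdbool.h>

struct group {
    pthread_rwlock_t lock;      /* protects current and max */
    unsigned current, max;
};

bool
group_grow_one(struct group *g, bool wait)
{
    bool grew = false;

    if (g->current >= g->max)           /* cheap, racy pre-check */
        return (false);

    if (wait)
        (void)pthread_rwlock_wrlock(&g->lock);
    else if (pthread_rwlock_trywrlock(&g->lock) != 0)
        return (false);                  /* someone else holds it, give up */

    if (g->current < g->max) {           /* recheck under the lock */
        ++g->current;
        grew = true;
    }
    (void)pthread_rwlock_unlock(&g->lock);
    return (grew);
}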
/*
 * __wt_ovfl_track_wrapup_err --
 *     Resolve the page's overflow tracking on reconciliation error.
 */
int
__wt_ovfl_track_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    WT_DECL_RET;
    WT_OVFL_TRACK *track;

    if (page->modify == NULL || page->modify->ovfl_track == NULL)
        return (0);

    track = page->modify->ovfl_track;
    if (track->discard != NULL)
        WT_RET(__ovfl_discard_wrapup_err(session, page));

    if (track->ovfl_reuse[0] != NULL)
        WT_RET(__ovfl_reuse_wrapup_err(session, page));

    if (track->ovfl_txnc[0] != NULL) {
        WT_RET(__wt_writelock(session, S2BT(session)->ovfl_lock));
        ret = __ovfl_txnc_wrapup(session, page);
        WT_TRET(__wt_writeunlock(session, S2BT(session)->ovfl_lock));
    }

    return (ret);
}
/*
 * __wt_thread_group_create --
 *     Create a new thread group, assumes incoming group structure is
 *     zero initialized.
 */
int
__wt_thread_group_create(
    WT_SESSION_IMPL *session, WT_THREAD_GROUP *group, const char *name,
    uint32_t min, uint32_t max, uint32_t flags,
    int (*run_func)(WT_SESSION_IMPL *session, WT_THREAD *context))
{
    WT_DECL_RET;
    bool cond_alloced;

    /* Check that the structure is initialized as expected. */
    WT_ASSERT(session, group->alloc == 0);

    cond_alloced = false;

    __wt_verbose(session, WT_VERB_THREAD_GROUP,
        "Creating thread group: %p", (void *)group);

    WT_RET(__wt_rwlock_alloc(session, &group->lock, "Thread group"));
    WT_ERR(__wt_cond_alloc(
        session, "Thread group cond", false, &group->wait_cond));
    cond_alloced = true;

    __wt_writelock(session, group->lock);
    group->run_func = run_func;
    group->name = name;
    WT_TRET(__thread_group_resize(session, group, min, max, flags));
    __wt_writeunlock(session, group->lock);

    /* Cleanup on error to avoid leaking resources. */
err:
    if (ret != 0) {
        if (cond_alloced)
            WT_TRET(__wt_cond_destroy(session, &group->wait_cond));
        __wt_rwlock_destroy(session, &group->lock);
    }
    return (ret);
}
/*
 * __snapshot_worker --
 *     Snapshot the tree.
 */
static int
__snapshot_worker(
    WT_SESSION_IMPL *session, const char *name, int discard, snapshot_op op)
{
    WT_BTREE *btree;
    WT_DECL_RET;
    WT_SNAPSHOT *deleted, *snap, *snapbase;
    int force, matched, tracked;

    btree = session->btree;
    matched = tracked = 0;
    snap = snapbase = NULL;

    /* Snapshots are single-threaded. */
    __wt_writelock(session, btree->snaplock);

    /* Set the name to the default, if we aren't provided one. */
    if (op == SNAPSHOT && name == NULL) {
        force = 0;
        name = WT_INTERNAL_SNAPSHOT;
    } else
        force = 1;

    /*
     * Get the list of snapshots for this file.  If there's no reference,
     * this file is dead.  Discard it from the cache without bothering to
     * write any dirty pages.
     */
    if ((ret =
        __wt_meta_snaplist_get(session, btree->name, &snapbase)) != 0) {
        if (ret == WT_NOTFOUND)
            ret = __wt_bt_cache_flush(
                session, NULL, WT_SYNC_DISCARD_NOWRITE, 0);
        goto err;
    }

    switch (op) {
    case SNAPSHOT:
        /*
         * Create a new, possibly named, snapshot.  Review existing
         * snapshots, deleting default snapshots and snapshots with
         * matching names, add the new snapshot entry at the end of
         * the list.
         */
        WT_SNAPSHOT_FOREACH(snapbase, snap)
            if (strcmp(snap->name, name) == 0 ||
                strcmp(snap->name, WT_INTERNAL_SNAPSHOT) == 0)
                F_SET(snap, WT_SNAP_DELETE);

        WT_ERR(__wt_strdup(session, name, &snap->name));
        F_SET(snap, WT_SNAP_ADD);
        break;
    case SNAPSHOT_DROP:
        /*
         * Drop all snapshots with matching names.
         * Drop all snapshots with the default name.
         * Add a new snapshot with the default name.
         */
        WT_SNAPSHOT_FOREACH(snapbase, snap) {
            /*
             * There should be only one snapshot with a matching
             * name, but it doesn't hurt to check the rest.
             */
            if (strcmp(snap->name, name) == 0)
                matched = 1;
            else if (strcmp(snap->name, WT_INTERNAL_SNAPSHOT) != 0)
                continue;
            F_SET(snap, WT_SNAP_DELETE);
        }
        if (!matched)
            goto nomatch;

        WT_ERR(__wt_strdup(session, WT_INTERNAL_SNAPSHOT, &snap->name));
        F_SET(snap, WT_SNAP_ADD);
        break;
    case SNAPSHOT_DROP_ALL:
        /*
         * Drop all snapshots.
         * Add a new snapshot with the default name.
         */
        WT_SNAPSHOT_FOREACH(snapbase, snap)
            F_SET(snap, WT_SNAP_DELETE);

        WT_ERR(__wt_strdup(session, WT_INTERNAL_SNAPSHOT, &snap->name));
        F_SET(snap, WT_SNAP_ADD);
        break;
    case SNAPSHOT_DROP_FROM:
        /*
         * Drop all snapshots after, and including, the named snapshot.
         * Drop all snapshots with the default name.
         * Add a new snapshot with the default name.
         */
        WT_SNAPSHOT_FOREACH(snapbase, snap) {
            if (strcmp(snap->name, name) == 0)
                matched = 1;
            if (matched ||
                strcmp(snap->name, WT_INTERNAL_SNAPSHOT) == 0)
                F_SET(snap, WT_SNAP_DELETE);
        }
        if (!matched)
            goto nomatch;

        WT_ERR(__wt_strdup(session, WT_INTERNAL_SNAPSHOT, &snap->name));
        F_SET(snap, WT_SNAP_ADD);
        break;
    case SNAPSHOT_DROP_TO:
        /*
         * Drop all snapshots before, and including, the named snapshot.
         * Drop all snapshots with the default name.
         * Add a new snapshot with the default name.
         */
        WT_SNAPSHOT_FOREACH(snapbase, snap) {
            if (!matched ||
                strcmp(snap->name, WT_INTERNAL_SNAPSHOT) == 0)
                F_SET(snap, WT_SNAP_DELETE);
            if (strcmp(snap->name, name) == 0)
                matched = 1;
        }
        if (!matched)
nomatch:    WT_ERR_MSG(session, EINVAL,
                "no snapshot named %s was found", name);

        WT_ERR(__wt_strdup(session, WT_INTERNAL_SNAPSHOT, &snap->name));
        F_SET(snap, WT_SNAP_ADD);
        break;
    }

    /*
     * Lock the snapshots that will be deleted.
     *
     * Snapshots are only locked when tracking is enabled, which covers
     * sync and drop operations, but not close.  The reasoning is that
     * there should be no access to a snapshot during close, because any
     * thread accessing a snapshot will also have the current file handle
     * open.
     */
    if (WT_META_TRACKING(session))
        WT_SNAPSHOT_FOREACH(snapbase, deleted)
            if (F_ISSET(deleted, WT_SNAP_DELETE))
                WT_ERR(__wt_session_lock_snapshot(session,
                    deleted->name, WT_BTREE_EXCLUSIVE));

    WT_ERR(__wt_bt_cache_flush(
        session, snapbase, discard ? WT_SYNC_DISCARD : WT_SYNC, force));

    /* If there was a snapshot, update the metadata. */
    if (snap->raw.data == NULL) {
        if (force)
            WT_ERR_MSG(session, EINVAL,
                "cache flush failed to create a snapshot");
    } else {
        WT_ERR(__wt_meta_snaplist_set(session, btree->name, snapbase));
        /*
         * If tracking is enabled, defer making pages available until
         * the end of the transaction.  The exception is if the handle
         * is being discarded: in that case, it will be gone by the
         * time we try to apply or unroll the meta tracking event.
         */
        if (WT_META_TRACKING(session) && !discard) {
            WT_ERR(__wt_meta_track_checkpoint(session));
            tracked = 1;
        } else
            WT_ERR(__wt_bm_snapshot_resolve(session, snapbase));
    }

err:
    __wt_meta_snaplist_free(session, snapbase);
    if (!tracked)
        __wt_rwunlock(session, btree->snaplock);
    return (ret);
}
/*
 * __wt_txn_update_oldest --
 *     Sweep the running transactions to update the oldest ID required.
 */
int
__wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_SESSION_IMPL *oldest_session;
    WT_TXN_GLOBAL *txn_global;
    uint64_t current_id, last_running, oldest_id;
    uint64_t prev_last_running, prev_oldest_id;
    bool strict, wait;

    conn = S2C(session);
    txn_global = &conn->txn_global;
    strict = LF_ISSET(WT_TXN_OLDEST_STRICT);
    wait = LF_ISSET(WT_TXN_OLDEST_WAIT);

    current_id = last_running = txn_global->current;
    prev_last_running = txn_global->last_running;
    prev_oldest_id = txn_global->oldest_id;

    /*
     * For pure read-only workloads, or if the update isn't forced and the
     * oldest ID isn't too far behind, avoid scanning.
     */
    if (prev_oldest_id == current_id ||
        (!strict && WT_TXNID_LT(current_id, prev_oldest_id + 100)))
        return (0);

    /* First do a read-only scan. */
    if (wait)
        __wt_readlock(session, txn_global->scan_rwlock);
    else if ((ret =
        __wt_try_readlock(session, txn_global->scan_rwlock)) != 0)
        return (ret == EBUSY ? 0 : ret);
    __txn_oldest_scan(session, &oldest_id, &last_running, &oldest_session);
    __wt_readunlock(session, txn_global->scan_rwlock);

    /*
     * If the state hasn't changed (or hasn't moved far enough for
     * non-forced updates), give up.
     */
    if ((oldest_id == prev_oldest_id ||
        (!strict && WT_TXNID_LT(oldest_id, prev_oldest_id + 100))) &&
        ((last_running == prev_last_running) ||
        (!strict && WT_TXNID_LT(last_running, prev_last_running + 100))))
        return (0);

    /* It looks like an update is necessary, wait for exclusive access. */
    if (wait)
        __wt_writelock(session, txn_global->scan_rwlock);
    else if ((ret =
        __wt_try_writelock(session, txn_global->scan_rwlock)) != 0)
        return (ret == EBUSY ? 0 : ret);

    /*
     * If the oldest ID has been updated while we waited, don't bother
     * scanning.
     */
    if (WT_TXNID_LE(oldest_id, txn_global->oldest_id) &&
        WT_TXNID_LE(last_running, txn_global->last_running))
        goto done;

    /*
     * Re-scan now that we have exclusive access.  This is necessary because
     * threads get transaction snapshots with read locks, and we have to be
     * sure that there isn't a thread that has got a snapshot locally but
     * not yet published its snap_min.
     */
    __txn_oldest_scan(session, &oldest_id, &last_running, &oldest_session);

#ifdef HAVE_DIAGNOSTIC
    {
    /*
     * Make sure the ID doesn't move past any named snapshots.
     *
     * Don't include the read/assignment in the assert statement.  Coverity
     * complains if there are assignments only done in diagnostic builds,
     * and when the read is from a volatile.
     */
    uint64_t id = txn_global->nsnap_oldest_id;
    WT_ASSERT(session,
        id == WT_TXN_NONE || !WT_TXNID_LT(id, oldest_id));
    }
#endif
    /* Update the oldest ID. */
    if (WT_TXNID_LT(txn_global->oldest_id, oldest_id))
        txn_global->oldest_id = oldest_id;
    if (WT_TXNID_LT(txn_global->last_running, last_running)) {
        txn_global->last_running = last_running;

#ifdef HAVE_VERBOSE
        /*
         * Output a verbose message about long-running transactions,
         * but only when some progress is being made.
         */
        if (WT_VERBOSE_ISSET(session, WT_VERB_TRANSACTION) &&
            current_id - oldest_id > 10000 && oldest_session != NULL) {
            __wt_verbose(session, WT_VERB_TRANSACTION,
                "old snapshot %" PRIu64
                " pinned in session %" PRIu32 " [%s]"
                " with snap_min %" PRIu64 "\n",
                oldest_id, oldest_session->id,
                oldest_session->lastop,
                oldest_session->txn.snap_min);
        }
#endif
    }

done:
    __wt_writeunlock(session, txn_global->scan_rwlock);
    return (ret);
}
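/*
 * Illustrative sketch (not WiredTiger code): __wt_txn_update_oldest scans
 * under the read lock as a hint, and only escalates to the write lock,
 * rechecks, and publishes when that hint suggests progress is possible.
 * A minimal standalone version of that shape with POSIX rwlocks; all names
 * below are hypothetical.
 */
#include <pthread.h>
#include <stdint.h>

static pthread_rwlock_t scan_lock = PTHREAD_RWLOCK_INITIALIZER;
static uint64_t oldest_published;   /* watermark protected by scan_lock */

void
update_oldest(uint64_t (*scan)(void))
{
    uint64_t candidate;

    /* Cheap pass: scan under the shared lock. */
    (void)pthread_rwlock_rdlock(&scan_lock);
    candidate = scan();
    (void)pthread_rwlock_unlock(&scan_lock);

    if (candidate <= oldest_published)   /* no progress, skip the write lock */
        return;

    /* Escalate: re-scan with exclusive access, then publish. */
    (void)pthread_rwlock_wrlock(&scan_lock);
    candidate = scan();
    if (candidate > oldest_published)
        oldest_published = candidate;
    (void)pthread_rwlock_unlock(&scan_lock);
}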
/*
 * __wt_txn_checkpoint_log --
 *     Write a log record for a checkpoint operation.
 */
int
__wt_txn_checkpoint_log(
    WT_SESSION_IMPL *session, bool full, uint32_t flags, WT_LSN *lsnp)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_ITEM(logrec);
    WT_DECL_RET;
    WT_ITEM *ckpt_snapshot, empty;
    WT_LSN *ckpt_lsn;
    WT_TXN *txn;
    WT_TXN_GLOBAL *txn_global;
    uint8_t *end, *p;
    size_t recsize;
    uint32_t i, rectype;
    const char *fmt;

    conn = S2C(session);
    txn_global = &conn->txn_global;
    txn = &session->txn;
    ckpt_lsn = &txn->ckpt_lsn;

    /*
     * If this is a file sync, log it unless there is a full checkpoint in
     * progress.
     */
    if (!full) {
        if (txn->full_ckpt) {
            if (lsnp != NULL)
                *lsnp = *ckpt_lsn;
            return (0);
        }
        return (__txn_log_file_sync(session, flags, lsnp));
    }

    switch (flags) {
    case WT_TXN_LOG_CKPT_PREPARE:
        txn->full_ckpt = true;

        if (conn->compat_major >= WT_LOG_V2) {
            /*
             * Write the system log record containing a checkpoint
             * start operation.
             */
            rectype = WT_LOGREC_SYSTEM;
            fmt = WT_UNCHECKED_STRING(I);

            WT_ERR(__wt_struct_size(
                session, &recsize, fmt, rectype));
            WT_ERR(__wt_logrec_alloc(session, recsize, &logrec));

            WT_ERR(__wt_struct_pack(session,
                (uint8_t *)logrec->data + logrec->size, recsize,
                fmt, rectype));
            logrec->size += (uint32_t)recsize;
            WT_ERR(__wt_logop_checkpoint_start_pack(
                session, logrec));
            WT_ERR(__wt_log_write(session, logrec, ckpt_lsn, 0));
        } else {
            WT_ERR(__wt_log_printf(session,
                "CHECKPOINT: Starting record"));
            WT_ERR(__wt_log_flush_lsn(session, ckpt_lsn, true));
        }

        /*
         * We take and immediately release the visibility lock.
         * Acquiring the write lock guarantees that any transaction
         * that has written to the log has also made its transaction
         * visible at this time.
         */
        __wt_writelock(session, &txn_global->visibility_rwlock);
        __wt_writeunlock(session, &txn_global->visibility_rwlock);

        /*
         * We need to make sure that the log records in the checkpoint
         * LSN are on disk.  In particular to make sure that the
         * current log file exists.
         */
        WT_ERR(__wt_log_force_sync(session, ckpt_lsn));
        break;
    case WT_TXN_LOG_CKPT_START:
        /* Take a copy of the transaction snapshot. */
        txn->ckpt_nsnapshot = txn->snapshot_count;
        recsize = (size_t)txn->ckpt_nsnapshot * WT_INTPACK64_MAXSIZE;
        WT_ERR(__wt_scr_alloc(session, recsize, &txn->ckpt_snapshot));
        p = txn->ckpt_snapshot->mem;
        end = p + recsize;
        for (i = 0; i < txn->snapshot_count; i++)
            WT_ERR(__wt_vpack_uint(
                &p, WT_PTRDIFF(end, p), txn->snapshot[i]));
        break;
    case WT_TXN_LOG_CKPT_STOP:
        /*
         * During a clean connection close, we get here without the
         * prepare or start steps.  In that case, log the current LSN
         * as the checkpoint LSN.
         */
        if (!txn->full_ckpt) {
            txn->ckpt_nsnapshot = 0;
            WT_CLEAR(empty);
            ckpt_snapshot = &empty;
            WT_ERR(__wt_log_flush_lsn(session, ckpt_lsn, true));
        } else
            ckpt_snapshot = txn->ckpt_snapshot;

        /* Write the checkpoint log record. */
        rectype = WT_LOGREC_CHECKPOINT;
        fmt = WT_UNCHECKED_STRING(IIIIu);
        WT_ERR(__wt_struct_size(session, &recsize, fmt,
            rectype, ckpt_lsn->l.file, ckpt_lsn->l.offset,
            txn->ckpt_nsnapshot, ckpt_snapshot));
        WT_ERR(__wt_logrec_alloc(session, recsize, &logrec));

        WT_ERR(__wt_struct_pack(session,
            (uint8_t *)logrec->data + logrec->size, recsize, fmt,
            rectype, ckpt_lsn->l.file, ckpt_lsn->l.offset,
            txn->ckpt_nsnapshot, ckpt_snapshot));
        logrec->size += (uint32_t)recsize;
        WT_ERR(__wt_log_write(session, logrec, lsnp,
            F_ISSET(conn, WT_CONN_CKPT_SYNC) ? WT_LOG_FSYNC : 0));

        /*
         * If this full checkpoint completed successfully and there is
         * no hot backup in progress and this is not an unclean
         * recovery, tell the logging subsystem the checkpoint LSN so
         * that it can archive.  Do not update the logging checkpoint
         * LSN if this is during a clean connection close, only during
         * a full checkpoint.  A clean close may not update any
         * metadata LSN and we do not want to archive in that case.
         */
        if (!conn->hot_backup &&
            (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_DIRTY) ||
            FLD_ISSET(conn->log_flags, WT_CONN_LOG_FORCE_DOWNGRADE)) &&
            txn->full_ckpt)
            __wt_log_ckpt(session, ckpt_lsn);

        /* FALLTHROUGH */
    case WT_TXN_LOG_CKPT_CLEANUP:
        /* Cleanup any allocated resources. */
        WT_INIT_LSN(ckpt_lsn);
        txn->ckpt_nsnapshot = 0;
        __wt_scr_free(session, &txn->ckpt_snapshot);
        txn->full_ckpt = false;
        break;
    WT_ILLEGAL_VALUE_ERR(session);
    }

err:
    __wt_logrec_free(session, &logrec);
    return (ret);
}
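/*
 * Illustrative sketch (not WiredTiger code): the prepare step above takes
 * and immediately releases the visibility write lock purely as a barrier.
 * Once the write lock is granted, every earlier holder of the read lock has
 * finished its publish step.  A minimal standalone version of that idiom
 * with POSIX rwlocks; all names below are hypothetical.
 */
#include <pthread.h>

static pthread_rwlock_t visibility_lock = PTHREAD_RWLOCK_INITIALIZER;

/*
 * Publisher side: hold the lock shared across the publish step so a
 * barrier can tell when the step is complete.
 */
void
publish_under_read_lock(void (*publish)(void))
{
    (void)pthread_rwlock_rdlock(&visibility_lock);
    publish();
    (void)pthread_rwlock_unlock(&visibility_lock);
}

/*
 * Barrier side: the write lock cannot be granted until every reader inside
 * publish_under_read_lock() has released the lock, so acquiring and
 * immediately dropping it proves all earlier publishes have completed.
 */
void
wait_for_publishers(void)
{
    (void)pthread_rwlock_wrlock(&visibility_lock);
    (void)pthread_rwlock_unlock(&visibility_lock);
}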
/*
 * __conn_dhandle_get --
 *     Allocate a new data handle, lock it exclusively, and return it linked
 *     into the connection's list.
 */
static int
__conn_dhandle_get(WT_SESSION_IMPL *session,
    const char *name, const char *ckpt, uint32_t flags)
{
    WT_BTREE *btree;
    WT_CONNECTION_IMPL *conn;
    WT_DATA_HANDLE *dhandle;
    WT_DECL_RET;
    uint32_t bucket;

    conn = S2C(session);

    /*
     * We have the handle lock, check whether we can find the handle we
     * are looking for.  If we do, and we can lock it in the state we
     * want, this session will take ownership and we are done.
     */
    ret = __wt_conn_dhandle_find(session, name, ckpt, flags);
    if (ret == 0) {
        dhandle = session->dhandle;
        WT_RET(__conn_dhandle_open_lock(session, dhandle, flags));
        return (0);
    }
    WT_RET_NOTFOUND_OK(ret);

    /*
     * If no handle was found, allocate the data handle and a btree handle,
     * then initialize the data handle.  Exclusively lock the data handle
     * before inserting it in the list.
     */
    WT_RET(__wt_calloc_one(session, &dhandle));

    WT_ERR(__wt_rwlock_alloc(session, &dhandle->rwlock, "data handle"));

    dhandle->name_hash = __wt_hash_city64(name, strlen(name));
    WT_ERR(__wt_strdup(session, name, &dhandle->name));
    if (ckpt != NULL)
        WT_ERR(__wt_strdup(session, ckpt, &dhandle->checkpoint));

    WT_ERR(__wt_calloc_one(session, &btree));
    dhandle->handle = btree;
    btree->dhandle = dhandle;

    WT_ERR(__wt_spin_init(
        session, &dhandle->close_lock, "data handle close"));

    F_SET(dhandle, WT_DHANDLE_EXCLUSIVE);
    WT_ERR(__wt_writelock(session, dhandle->rwlock));

    /*
     * Prepend the handle to the connection list, assuming we're likely to
     * need new files again soon, until they are cached by all sessions.
     * Find the right hash bucket to insert into as well.
     */
    WT_ASSERT(session, F_ISSET(session, WT_SESSION_HANDLE_LIST_LOCKED));
    bucket = dhandle->name_hash % WT_HASH_ARRAY_SIZE;
    WT_CONN_DHANDLE_INSERT(conn, dhandle, bucket);

    session->dhandle = dhandle;
    return (0);

err:
    WT_TRET(__wt_rwlock_destroy(session, &dhandle->rwlock));
    __wt_free(session, dhandle->name);
    __wt_free(session, dhandle->checkpoint);
    __wt_free(session, dhandle->handle);    /* btree free */
    __wt_spin_destroy(session, &dhandle->close_lock);
    __wt_overwrite_and_free(session, dhandle);

    return (ret);
}
/*
 * __wt_txn_commit --
 *     Commit the current transaction.
 */
int
__wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
{
    WT_CONFIG_ITEM cval;
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_TXN *txn;
    WT_TXN_GLOBAL *txn_global;
    WT_TXN_OP *op;
    u_int i;
    bool locked, readonly;
#ifdef HAVE_TIMESTAMPS
    wt_timestamp_t prev_commit_timestamp, ts;
    bool update_timestamp;
#endif

    txn = &session->txn;
    conn = S2C(session);
    txn_global = &conn->txn_global;
    locked = false;

    WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING));
    WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR) || txn->mod_count == 0);

    readonly = txn->mod_count == 0;
    /*
     * Look for a commit timestamp.
     */
    WT_ERR(
        __wt_config_gets_def(session, cfg, "commit_timestamp", 0, &cval));
    if (cval.len != 0) {
#ifdef HAVE_TIMESTAMPS
        WT_ERR(__wt_txn_parse_timestamp(session, "commit", &ts, &cval));
        WT_ERR(__wt_timestamp_validate(session,
            "commit", &ts, &cval, true, true, true));
        __wt_timestamp_set(&txn->commit_timestamp, &ts);
        __wt_txn_set_commit_timestamp(session);
#else
        WT_ERR_MSG(session, EINVAL, "commit_timestamp requires a "
            "version of WiredTiger built with timestamp support");
#endif
    }

#ifdef HAVE_TIMESTAMPS
    /*
     * Debugging checks on timestamps, if user requested them.
     */
    if (F_ISSET(txn, WT_TXN_TS_COMMIT_ALWAYS) &&
        !F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) &&
        txn->mod_count != 0)
        WT_ERR_MSG(session, EINVAL, "commit_timestamp required and "
            "none set on this transaction");
    if (F_ISSET(txn, WT_TXN_TS_COMMIT_NEVER) &&
        F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) &&
        txn->mod_count != 0)
        WT_ERR_MSG(session, EINVAL, "no commit_timestamp required and "
            "timestamp set on this transaction");
#endif
    /*
     * The default sync setting is inherited from the connection, but can
     * be overridden by an explicit "sync" setting for this transaction.
     */
    WT_ERR(__wt_config_gets_def(session, cfg, "sync", 0, &cval));

    /*
     * If the user chose the default setting, check whether sync is enabled
     * for this transaction (either inherited or via begin_transaction).
     * If sync is disabled, clear the field to avoid the log write being
     * flushed.
     *
     * Otherwise check for specific settings.  We don't need to check for
     * "on" because that is the default inherited from the connection.  If
     * the user set anything in begin_transaction, we only override with an
     * explicit setting.
     */
    if (cval.len == 0) {
        if (!FLD_ISSET(txn->txn_logsync, WT_LOG_SYNC_ENABLED) &&
            !F_ISSET(txn, WT_TXN_SYNC_SET))
            txn->txn_logsync = 0;
    } else {
        /*
         * If the caller already set sync on begin_transaction then
         * they should not be using sync on commit_transaction.
         * Flag that as an error.
         */
        if (F_ISSET(txn, WT_TXN_SYNC_SET))
            WT_ERR_MSG(session, EINVAL,
                "Sync already set during begin_transaction");
        if (WT_STRING_MATCH("background", cval.str, cval.len))
            txn->txn_logsync = WT_LOG_BACKGROUND;
        else if (WT_STRING_MATCH("off", cval.str, cval.len))
            txn->txn_logsync = 0;
        /*
         * We don't need to check for "on" here because that is the
         * default to inherit from the connection setting.
         */
    }

    /* Commit notification. */
    if (txn->notify != NULL)
        WT_ERR(txn->notify->notify(txn->notify,
            (WT_SESSION *)session, txn->id, 1));

    /*
     * We are about to release the snapshot: copy values into any
     * positioned cursors so they don't point to updates that could be
     * freed once we don't have a snapshot.
     */
    if (session->ncursors > 0) {
        WT_DIAGNOSTIC_YIELD;
        WT_ERR(__wt_session_copy_values(session));
    }

    /* If we are logging, write a commit log record. */
    if (txn->logrec != NULL &&
        FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) &&
        !F_ISSET(session, WT_SESSION_NO_LOGGING)) {
        /*
         * We are about to block on I/O writing the log.
         * Release our snapshot in case it is keeping data pinned.
         * This is particularly important for checkpoints.
         */
        __wt_txn_release_snapshot(session);

        /*
         * We hold the visibility lock for reading from the time
         * we write our log record until the time we release our
         * transaction so that the LSN any checkpoint gets will
         * always reflect visible data.
         */
        __wt_readlock(session, &txn_global->visibility_rwlock);
        locked = true;
        WT_ERR(__wt_txn_log_commit(session, cfg));
    }

    /* Note: we're going to commit: nothing can fail after this point. */

    /* Process and free updates. */
    for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++) {
        switch (op->type) {
        case WT_TXN_OP_BASIC:
        case WT_TXN_OP_BASIC_TS:
        case WT_TXN_OP_INMEM:
            /*
             * Switch reserved operations to abort to
             * simplify obsolete update list truncation.
             */
            if (op->u.upd->type == WT_UPDATE_RESERVED) {
                op->u.upd->txnid = WT_TXN_ABORTED;
                break;
            }

            /*
             * Writes to the lookaside file can be evicted as soon
             * as they commit.
             */
            if (conn->cache->las_fileid != 0 &&
                op->fileid == conn->cache->las_fileid) {
                op->u.upd->txnid = WT_TXN_NONE;
                break;
            }

#ifdef HAVE_TIMESTAMPS
            if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) &&
                op->type != WT_TXN_OP_BASIC_TS) {
                WT_ASSERT(session,
                    op->fileid != WT_METAFILE_ID);
                __wt_timestamp_set(&op->u.upd->timestamp,
                    &txn->commit_timestamp);
            }
#endif
            break;

        case WT_TXN_OP_REF:
#ifdef HAVE_TIMESTAMPS
            if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT))
                __wt_timestamp_set(
                    &op->u.ref->page_del->timestamp,
                    &txn->commit_timestamp);
#endif
            break;

        case WT_TXN_OP_TRUNCATE_COL:
        case WT_TXN_OP_TRUNCATE_ROW:
            /* Other operations don't need timestamps. */
            break;
        }

        __wt_txn_op_free(session, op);
    }
    txn->mod_count = 0;

#ifdef HAVE_TIMESTAMPS
    /*
     * Track the largest commit timestamp we have seen.
     *
     * We don't actually clear the local commit timestamp, just the flag.
     * That said, we can't update the global commit timestamp until this
     * transaction is visible, which happens when we release it.
     */
    update_timestamp = F_ISSET(txn, WT_TXN_HAS_TS_COMMIT);
#endif

    __wt_txn_release(session);
    if (locked)
        __wt_readunlock(session, &txn_global->visibility_rwlock);

#ifdef HAVE_TIMESTAMPS
    /* First check if we've already committed something in the future. */
    if (update_timestamp) {
        WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock,
            __wt_timestamp_set(
                &prev_commit_timestamp, &txn_global->commit_timestamp));
        update_timestamp = __wt_timestamp_cmp(
            &txn->commit_timestamp, &prev_commit_timestamp) > 0;
    }

    /*
     * If it looks like we need to move the global commit timestamp,
     * write lock and re-check.
     */
    if (update_timestamp) {
#if WT_TIMESTAMP_SIZE == 8
        while (__wt_timestamp_cmp(
            &txn->commit_timestamp, &prev_commit_timestamp) > 0) {
            if (__wt_atomic_cas64(
                &txn_global->commit_timestamp.val,
                prev_commit_timestamp.val,
                txn->commit_timestamp.val)) {
                txn_global->has_commit_timestamp = true;
                break;
            }
            __wt_timestamp_set(
                &prev_commit_timestamp,
                &txn_global->commit_timestamp);
        }
#else
        __wt_writelock(session, &txn_global->rwlock);
        if (__wt_timestamp_cmp(&txn->commit_timestamp,
            &txn_global->commit_timestamp) > 0) {
            __wt_timestamp_set(&txn_global->commit_timestamp,
                &txn->commit_timestamp);
            txn_global->has_commit_timestamp = true;
        }
        __wt_writeunlock(session, &txn_global->rwlock);
#endif
    }
#endif

    /*
     * We're between transactions, if we need to block for eviction, it's
     * a good time to do so.  Note that we must ignore any error return
     * because the user's data is committed.
     */
    if (!readonly)
        (void)__wt_cache_eviction_check(session, false, false, NULL);

    return (0);

err:
    /*
     * If anything went wrong, roll back.
     *
     * !!!
     * Nothing can fail after this point.
     */
    if (locked)
        __wt_readunlock(session, &txn_global->visibility_rwlock);
    WT_TRET(__wt_txn_rollback(session, cfg));
    return (ret);
}
/*
 * __backup_start --
 *     Start a backup.
 */
static int
__backup_start(
    WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb, const char *cfg[])
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_FSTREAM *srcfs;
    const char *dest;
    bool exist, log_only, target_list;

    conn = S2C(session);
    srcfs = NULL;
    dest = NULL;

    cb->next = 0;
    cb->list = NULL;
    cb->list_next = 0;

    WT_RET(__wt_inmem_unsupported_op(session, "backup cursor"));

    /*
     * Single thread hot backups: we're holding the schema lock, so we
     * know we'll serialize with other attempts to start a hot backup.
     */
    if (conn->hot_backup)
        WT_RET_MSG(
            session, EINVAL, "there is already a backup cursor open");

    /*
     * The hot backup copy is done outside of WiredTiger, which means file
     * blocks can't be freed and re-allocated until the backup completes.
     * The checkpoint code checks the backup flag, and if a backup cursor
     * is open checkpoints aren't discarded.  We release the lock as soon
     * as we've set the flag, we don't want to block checkpoints, we just
     * want to make sure no checkpoints are deleted.  The checkpoint code
     * holds the lock until it's finished the checkpoint, otherwise we
     * could start a hot backup that would race with an already-started
     * checkpoint.
     *
     * We are holding the checkpoint and schema locks so schema operations
     * will not see the backup file list until it is complete and valid.
     */
    __wt_writelock(session, &conn->hot_backup_lock);
    conn->hot_backup = true;
    conn->hot_backup_list = NULL;
    __wt_writeunlock(session, &conn->hot_backup_lock);

    /* We're the lock holder, we own cleanup. */
    F_SET(cb, WT_CURBACKUP_LOCKER);

    /*
     * Create a temporary backup file.  This must be opened before
     * generating the list of targets in backup_uri.  This file will
     * later be renamed to the correct name depending on whether or not
     * we're doing an incremental backup.  We need a temp file so that if
     * we fail or crash while filling it, the existence of a partial file
     * doesn't confuse restarting in the source database.
     */
    WT_ERR(__wt_fopen(session, WT_BACKUP_TMP,
        WT_FS_OPEN_CREATE, WT_STREAM_WRITE, &cb->bfs));

    /*
     * If a list of targets was specified, work our way through them.
     * Else, generate a list of all database objects.
     *
     * Include log files if doing a full backup, and copy them before
     * copying data files to avoid rolling the metadata forward across
     * a checkpoint that completes during the backup.
     */
    target_list = false;
    WT_ERR(__backup_uri(session, cfg, &target_list, &log_only));

    if (!target_list) {
        WT_ERR(__backup_log_append(session, cb, true));
        WT_ERR(__backup_all(session));
    }

    /* Add the hot backup and standard WiredTiger files to the list. */
    if (log_only) {
        /*
         * We also open an incremental backup source file so that we
         * can detect a crash with an incremental backup existing in
         * the source directory versus an improper destination.
         */
        dest = WT_INCREMENTAL_BACKUP;
        WT_ERR(__wt_fopen(session, WT_INCREMENTAL_SRC,
            WT_FS_OPEN_CREATE, WT_STREAM_WRITE, &srcfs));
        WT_ERR(__backup_list_append(
            session, cb, WT_INCREMENTAL_BACKUP));
    } else {
        dest = WT_METADATA_BACKUP;
        WT_ERR(__backup_list_append(session, cb, WT_METADATA_BACKUP));
        WT_ERR(__wt_fs_exist(session, WT_BASECONFIG, &exist));
        if (exist)
            WT_ERR(__backup_list_append(
                session, cb, WT_BASECONFIG));
        WT_ERR(__wt_fs_exist(session, WT_USERCONFIG, &exist));
        if (exist)
            WT_ERR(__backup_list_append(
                session, cb, WT_USERCONFIG));
        WT_ERR(__backup_list_append(session, cb, WT_WIREDTIGER));
    }

err:
    /* Close the hot backup file. */
    WT_TRET(__wt_fclose(session, &cb->bfs));
    if (srcfs != NULL)
        WT_TRET(__wt_fclose(session, &srcfs));
    if (ret == 0) {
        WT_ASSERT(session, dest != NULL);
        WT_TRET(__wt_fs_rename(session, WT_BACKUP_TMP, dest, false));
        __wt_writelock(session, &conn->hot_backup_lock);
        conn->hot_backup_list = cb->list;
        __wt_writeunlock(session, &conn->hot_backup_lock);
    }

    return (ret);
}
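/*
 * Illustrative sketch (not WiredTiger code): the backup functions above
 * publish the "backup in progress" flag inside a short write-locked
 * critical section, while the comments describe the checkpoint side holding
 * the same lock across the work that must not race with a starting backup.
 * A minimal standalone version of that protocol with POSIX rwlocks; all
 * names below are hypothetical.
 */
#include <pthread.h>
#include <stdbool.h>

static pthread_rwlock_t backup_lock = PTHREAD_RWLOCK_INITIALIZER;
static bool backup_in_progress;

/*
 * Backup side: publish the flag, then release immediately; the goal is to
 * stop checkpoints from deleting blocks, not to block checkpoints entirely.
 */
void
backup_begin(void)
{
    (void)pthread_rwlock_wrlock(&backup_lock);
    backup_in_progress = true;
    (void)pthread_rwlock_unlock(&backup_lock);
}

void
backup_end(void)
{
    (void)pthread_rwlock_wrlock(&backup_lock);
    backup_in_progress = false;
    (void)pthread_rwlock_unlock(&backup_lock);
}

/*
 * Checkpoint side: hold the read lock across the whole discard step, so the
 * flag cannot flip to "in progress" while blocks are being freed.
 */
void
checkpoint_discard_blocks(void (*discard)(void))
{
    (void)pthread_rwlock_rdlock(&backup_lock);
    if (!backup_in_progress)
        discard();
    (void)pthread_rwlock_unlock(&backup_lock);
}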
/*
 * __backup_start --
 *     Start a backup.
 */
static int
__backup_start(
    WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb, const char *cfg[])
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    bool exist, log_only, target_list;

    conn = S2C(session);

    cb->next = 0;
    cb->list = NULL;
    cb->list_next = 0;

    /*
     * Single thread hot backups: we're holding the schema lock, so we
     * know we'll serialize with other attempts to start a hot backup.
     */
    if (conn->hot_backup)
        WT_RET_MSG(
            session, EINVAL, "there is already a backup cursor open");

    /*
     * The hot backup copy is done outside of WiredTiger, which means file
     * blocks can't be freed and re-allocated until the backup completes.
     * The checkpoint code checks the backup flag, and if a backup cursor
     * is open checkpoints aren't discarded.  We release the lock as soon
     * as we've set the flag, we don't want to block checkpoints, we just
     * want to make sure no checkpoints are deleted.  The checkpoint code
     * holds the lock until it's finished the checkpoint, otherwise we
     * could start a hot backup that would race with an already-started
     * checkpoint.
     */
    WT_RET(__wt_writelock(session, conn->hot_backup_lock));
    conn->hot_backup = true;
    WT_ERR(__wt_writeunlock(session, conn->hot_backup_lock));

    /* Create the hot backup file. */
    WT_ERR(__backup_file_create(session, cb, false));

    /* Add log files if logging is enabled. */

    /*
     * If a list of targets was specified, work our way through them.
     * Else, generate a list of all database objects.
     *
     * Include log files if doing a full backup, and copy them before
     * copying data files to avoid rolling the metadata forward across
     * a checkpoint that completes during the backup.
     */
    target_list = false;
    WT_ERR(__backup_uri(session, cfg, &target_list, &log_only));

    if (!target_list) {
        WT_ERR(__backup_log_append(session, cb, true));
        WT_ERR(__backup_all(session));
    }

    /* Add the hot backup and standard WiredTiger files to the list. */
    if (log_only) {
        /*
         * Close any hot backup file.
         * We're about to open the incremental backup file.
         */
        WT_TRET(__wt_fclose(&cb->bfp, WT_FHANDLE_WRITE));
        WT_ERR(__backup_file_create(session, cb, log_only));
        WT_ERR(__backup_list_append(
            session, cb, WT_INCREMENTAL_BACKUP));
    } else {
        WT_ERR(__backup_list_append(session, cb, WT_METADATA_BACKUP));
        WT_ERR(__wt_exist(session, WT_BASECONFIG, &exist));
        if (exist)
            WT_ERR(__backup_list_append(
                session, cb, WT_BASECONFIG));
        WT_ERR(__wt_exist(session, WT_USERCONFIG, &exist));
        if (exist)
            WT_ERR(__backup_list_append(
                session, cb, WT_USERCONFIG));
        WT_ERR(__backup_list_append(session, cb, WT_WIREDTIGER));
    }

err:
    /* Close the hot backup file. */
    WT_TRET(__wt_fclose(&cb->bfp, WT_FHANDLE_WRITE));
    if (ret != 0) {
        WT_TRET(__backup_cleanup_handles(session, cb));
        WT_TRET(__backup_stop(session));
    }

    return (ret);
}
/*
 * __compact_rewrite --
 *     Return if a page needs to be re-written.
 */
static int
__compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
{
    WT_BM *bm;
    WT_DECL_RET;
    WT_MULTI *multi;
    WT_PAGE *page;
    WT_PAGE_MODIFY *mod;
    size_t addr_size;
    uint32_t i;
    const uint8_t *addr;

    *skipp = true;    /* Default skip. */

    bm = S2BT(session)->bm;
    page = ref->page;
    mod = page->modify;

    /*
     * Ignore the root: it may not have a replacement address, and besides,
     * if anything else gets written, so will it.
     */
    if (__wt_ref_is_root(ref))
        return (0);

    /* Ignore currently dirty pages, they will be written regardless. */
    if (__wt_page_is_modified(page))
        return (0);

    /*
     * If the page is clean, test the original addresses.
     * If the page is a replacement, test the replacement addresses.
     * Ignore empty pages, they get merged into the parent.
     */
    if (mod == NULL || mod->rec_result == 0) {
        __wt_ref_info(ref, &addr, &addr_size, NULL);
        if (addr == NULL)
            return (0);
        return (
            bm->compact_page_skip(bm, session, addr, addr_size, skipp));
    }

    /*
     * The page's modification information can change underfoot if the page
     * is being reconciled, serialize with reconciliation.
     */
    if (mod->rec_result == WT_PM_REC_REPLACE ||
        mod->rec_result == WT_PM_REC_MULTIBLOCK)
        __wt_writelock(session, &page->page_lock);

    if (mod->rec_result == WT_PM_REC_REPLACE)
        ret = bm->compact_page_skip(bm, session,
            mod->mod_replace.addr, mod->mod_replace.size, skipp);

    if (mod->rec_result == WT_PM_REC_MULTIBLOCK)
        for (multi = mod->mod_multi,
            i = 0; i < mod->mod_multi_entries; ++multi, ++i) {
            if (multi->disk_image != NULL)
                continue;
            if ((ret = bm->compact_page_skip(bm, session,
                multi->addr.addr, multi->addr.size, skipp)) != 0)
                break;
            if (!*skipp)
                break;
        }

    if (mod->rec_result == WT_PM_REC_REPLACE ||
        mod->rec_result == WT_PM_REC_MULTIBLOCK)
        __wt_writeunlock(session, &page->page_lock);

    return (ret);
}