/**
 * Replaces the value stored under 'loc' with the 'len' bytes starting at 'data'.
 *
 * The record is looked up first and the lookup result is checked with invariantWTOK(). The
 * difference between the new and the previous length is passed to _increaseDataSize(), after
 * which cappedDeleteAsNeeded() is invoked. 'enforceQuota' and 'notifier' are not referenced
 * in this body.
 *
 * @return 'loc', wrapped in a StatusWith.
 */
StatusWith<RecordId> WiredTigerRecordStore::updateRecord(OperationContext* txn,
                                                         const RecordId& loc,
                                                         const char* data,
                                                         int len,
                                                         bool enforceQuota,
                                                         UpdateNotifier* notifier) {
    WiredTigerCursor cursorHolder(_uri, _tableId, true, txn);
    cursorHolder.assertInActiveTxn();

    WT_CURSOR* const cursor = cursorHolder.get();
    invariant(cursor);

    // Position on the existing record so that its current length can be read.
    cursor->set_key(cursor, _makeKey(loc));
    invariantWTOK(WT_OP_CHECK(cursor->search(cursor)));

    WT_ITEM previous;
    invariantWTOK(cursor->get_value(cursor, &previous));
    const int previousLength = previous.size;

    // Store the replacement value under the same key.
    cursor->set_key(cursor, _makeKey(loc));
    WiredTigerItem replacement(data, len);
    cursor->set_value(cursor, replacement.Get());
    invariantWTOK(WT_OP_CHECK(cursor->insert(cursor)));

    _increaseDataSize(txn, len - previousLength);
    cappedDeleteAsNeeded(txn, loc);

    return StatusWith<RecordId>(loc);
}
/**
 * Finds the id of the last record whose key is <= 'startingPosition'.
 *
 * @return boost::none when '_useOplogHack' is false, a default-constructed RecordId when
 *         nothing is at or before 'startingPosition', and the located id otherwise.
 */
boost::optional<RecordId> WiredTigerRecordStore::oplogStartHack(
    OperationContext* txn, const RecordId& startingPosition) const {
    if (!_useOplogHack) {
        return boost::none;
    }

    {
        WiredTigerRecoveryUnit* recoveryUnit = WiredTigerRecoveryUnit::get(txn);
        _oplogSetStartHack(recoveryUnit);
    }

    WiredTigerCursor cursorHolder(_uri, _tableId, true, txn);
    WT_CURSOR* cursor = cursorHolder.get();

    cursor->set_key(cursor, _makeKey(startingPosition));

    int exact;
    int rc = WT_OP_CHECK(cursor->search_near(cursor, &exact));
    if (rc == 0 && exact > 0) {
        // Landed one higher than startingPosition, so step back.
        rc = cursor->prev(cursor);
    }
    if (rc == WT_NOTFOUND) {
        // Nothing <= startingPosition.
        return RecordId();
    }
    invariantWTOK(rc);

    int64_t rawKey;
    invariantWTOK(cursor->get_key(cursor, &rawKey));
    return _fromKey(rawKey);
}
/**
 * Steps the cursor and returns the record it lands on.
 *
 * Returns an empty optional, and sets '_eof', when the underlying cursor reports
 * WT_NOTFOUND or when the record reached is rejected by isVisible(). On success
 * '_lastReturnedId' is updated to the returned id.
 */
boost::optional<Record> next() final {
    if (_eof) {
        return {};
    }

    WT_CURSOR* cursor = _cursor->get();

    // Whether the cursor still has to be stepped before its position can be returned.
    bool needsStep = true;

    const bool firstCallOnReverseCapped =
        _lastReturnedId.isNull() && !_forward && _rs._isCapped;
    if (firstCallOnReverseCapped) {
        // In this case we need to seek to the highest visible record.
        const RecordId seekPoint =
            _readUntilForOplog.isNull() ? _rs.lowestCappedHiddenRecord() : _readUntilForOplog;

        if (!seekPoint.isNull()) {
            cursor->set_key(cursor, _makeKey(seekPoint));

            int exact;
            const int seekRc = WT_OP_CHECK(cursor->search_near(cursor, &exact));
            if (seekRc == WT_NOTFOUND) {
                _eof = true;
                return {};
            }
            invariantWTOK(seekRc);

            // If we landed at or past the lowest hidden record, we must advance to be in
            // the visible range. When the seek point is no longer hidden, only landing
            // strictly past it requires a step.
            if (_rs.isCappedHidden(seekPoint)) {
                needsStep = (exact >= 0);
            } else {
                needsStep = (exact > 0);
            }
        }
    }

    if (needsStep) {
        // Nothing after the next line can throw WCEs.
        // Note that an unpositioned (or eof) WT_CURSOR returns the first/last entry in the
        // table when you call next/prev.
        const int stepRc = WT_OP_CHECK(_forward ? cursor->next(cursor) : cursor->prev(cursor));
        if (stepRc == WT_NOTFOUND) {
            _eof = true;
            return {};
        }
        invariantWTOK(stepRc);
    }

    int64_t rawKey;
    invariantWTOK(cursor->get_key(cursor, &rawKey));
    const RecordId id = _fromKey(rawKey);

    if (!isVisible(id)) {
        _eof = true;
        return {};
    }

    WT_ITEM payload;
    invariantWTOK(cursor->get_value(cursor, &payload));

    _lastReturnedId = id;
    return {{id, {static_cast<const char*>(payload.data), static_cast<int>(payload.size)}}};
}
void WiredTigerRecoveryUnit::_txnClose(bool commit) { invariant(_isActive(), toString(_state)); WT_SESSION* s = _session->getSession(); if (_timer) { const int transactionTime = _timer->millis(); // `serverGlobalParams.slowMs` can be set to values <= 0. In those cases, give logging a // break. if (transactionTime >= std::max(1, serverGlobalParams.slowMS)) { LOG(kSlowTransactionSeverity) << "Slow WT transaction. Lifetime of SnapshotId " << _mySnapshotId << " was " << transactionTime << "ms"; } } int wtRet; if (commit) { if (!_commitTimestamp.isNull()) { const std::string conf = "commit_timestamp=" + integerToHex(_commitTimestamp.asULL()); invariantWTOK(s->timestamp_transaction(s, conf.c_str())); _isTimestamped = true; } wtRet = s->commit_transaction(s, nullptr); LOG(3) << "WT commit_transaction for snapshot id " << _mySnapshotId; } else { wtRet = s->rollback_transaction(s, nullptr); invariant(!wtRet); LOG(3) << "WT rollback_transaction for snapshot id " << _mySnapshotId; } if (_isTimestamped) { if (!_orderedCommit) { // We only need to update oplog visibility where commits can be out-of-order with // respect to their assigned optime and such commits might otherwise be visible. // This should happen only on primary nodes. _oplogManager->triggerJournalFlush(); } _isTimestamped = false; } invariantWTOK(wtRet); invariant(!_lastTimestampSet || _commitTimestamp.isNull(), str::stream() << "Cannot have both a _lastTimestampSet and a " "_commitTimestamp. _lastTimestampSet: " << _lastTimestampSet->toString() << ". _commitTimestamp: " << _commitTimestamp.toString()); // We reset the _lastTimestampSet between transactions. Since it is legal for one // transaction on a RecoveryUnit to call setTimestamp() and another to call // setCommitTimestamp(). _lastTimestampSet = boost::none; _prepareTimestamp = Timestamp(); _mySnapshotId = nextSnapshotId.fetchAndAdd(1); _isOplogReader = false; _orderedCommit = true; // Default value is true; we assume all writes are ordered. }
void WiredTigerSessionCache::waitUntilDurable(bool forceCheckpoint) { const int shuttingDown = _shuttingDown.fetchAndAdd(1); ON_BLOCK_EXIT([this] { _shuttingDown.fetchAndSubtract(1); }); uassert(ErrorCodes::ShutdownInProgress, "Cannot wait for durability because a shutdown is in progress", !(shuttingDown & kShuttingDownMask)); // When forcing a checkpoint with journaling enabled, don't synchronize with other // waiters, as a log flush is much cheaper than a full checkpoint. if (forceCheckpoint && _engine->isDurable()) { UniqueWiredTigerSession session = getSession(); WT_SESSION* s = session->getSession(); { stdx::unique_lock<stdx::mutex> lk(_journalListenerMutex); JournalListener::Token token = _journalListener->getToken(); invariantWTOK(s->checkpoint(s, NULL)); _journalListener->onDurable(token); } LOG(4) << "created checkpoint (forced)"; return; } uint32_t start = _lastSyncTime.load(); // Do the remainder in a critical section that ensures only a single thread at a time // will attempt to synchronize. stdx::unique_lock<stdx::mutex> lk(_lastSyncMutex); uint32_t current = _lastSyncTime.loadRelaxed(); // synchronized with writes through mutex if (current != start) { // Someone else synced already since we read lastSyncTime, so we're done! return; } _lastSyncTime.store(current + 1); // Nobody has synched yet, so we have to sync ourselves. auto session = getSession(); WT_SESSION* s = session->getSession(); // This gets the token (OpTime) from the last write, before flushing (either the journal, or a // checkpoint), and then reports that token (OpTime) as a durable write. stdx::unique_lock<stdx::mutex> jlk(_journalListenerMutex); JournalListener::Token token = _journalListener->getToken(); // Use the journal when available, or a checkpoint otherwise. if (_engine->isDurable()) { invariantWTOK(s->log_flush(s, "sync=on")); LOG(4) << "flushed journal"; } else { invariantWTOK(s->checkpoint(s, NULL)); LOG(4) << "created checkpoint"; } _journalListener->onDurable(token); }
void WiredTigerRecoveryUnit::_txnClose(bool commit) { invariant(_active); WT_SESSION* s = _session->getSession(); if (commit) { invariantWTOK(s->commit_transaction(s, NULL)); LOG(3) << "WT commit_transaction for snapshot id " << _mySnapshotId; } else { invariantWTOK(s->rollback_transaction(s, NULL)); LOG(3) << "WT rollback_transaction for snapshot id " << _mySnapshotId; } _active = false; _mySnapshotId = nextSnapshotId.fetchAndAdd(1); }
bool restore() final { if (!_cursor) _cursor.emplace(_rs.getURI(), _rs.tableId(), true, _txn); // This will ensure an active session exists, so any restored cursors will bind to it invariant(WiredTigerRecoveryUnit::get(_txn)->getSession(_txn) == _cursor->getSession()); // If we've hit EOF, then this iterator is done and need not be restored. if (_eof) return true; if (_lastReturnedId.isNull()) return true; WT_CURSOR* c = _cursor->get(); c->set_key(c, _makeKey(_lastReturnedId)); int cmp; int ret = WT_OP_CHECK(c->search_near(c, &cmp)); if (ret == WT_NOTFOUND) { _eof = true; return !_rs._isCapped; } invariantWTOK(ret); if (cmp == 0) return true; // Landed right where we left off. if (_rs._isCapped) { // Doc was deleted either by cappedDeleteAsNeeded() or cappedTruncateAfter(). // It is important that we error out in this case so that consumers don't // silently get 'holes' when scanning capped collections. We don't make // this guarantee for normal collections so it is ok to skip ahead in that case. _eof = true; return false; } if (_forward && cmp > 0) { // We landed after where we were. Move back one so that next() will return this // document. ret = WT_OP_CHECK(c->prev(c)); } else if (!_forward && cmp < 0) { // Do the opposite for reverse cursors. ret = WT_OP_CHECK(c->next(c)); } if (ret != WT_NOTFOUND) invariantWTOK(ret); return true; }
void WiredTigerRecoveryUnit::_txnClose(bool commit) { invariant(_active); WT_SESSION* s = _session->getSession(); if (commit) { invariantWTOK(s->commit_transaction(s, NULL)); LOG(2) << "WT commit_transaction"; } else { invariantWTOK(s->rollback_transaction(s, NULL)); LOG(2) << "WT rollback_transaction"; } _active = false; _myTransactionCount++; _ticket.reset(NULL); }
/**
 * Calls next() on the underlying WT_CURSOR and returns the record it lands on, or an empty
 * optional when the cursor reports WT_NOTFOUND.
 */
boost::optional<Record> next() final {
    const int stepRc = WT_OP_CHECK(_cursor->next(_cursor));
    if (stepRc == WT_NOTFOUND) {
        return {};
    }
    invariantWTOK(stepRc);

    int64_t rawKey;
    invariantWTOK(_cursor->get_key(_cursor, &rawKey));
    const RecordId id = _fromKey(rawKey);

    WT_ITEM payload;
    invariantWTOK(_cursor->get_value(_cursor, &payload));

    return {{id, {static_cast<const char*>(payload.data), static_cast<int>(payload.size)}}};
}
void WiredTigerKVEngine::syncSizeInfo() const { if ( !_sizeStorer ) return; try { WiredTigerSession session( _conn, -1 ); WT_SESSION* s = session.getSession(); invariantWTOK( s->begin_transaction( s, "sync=true" ) ); _sizeStorer->storeInto( &session, _sizeStorerUri ); invariantWTOK( s->commit_transaction( s, NULL ) ); } catch ( const WriteConflictException& de ) { // ignore, it means someone else is doing it } }
void WiredTigerRecoveryUnit::_txnClose( bool commit ) { invariant( _active ); WT_SESSION *s = _session->getSession(); if ( commit ) { invariantWTOK( s->commit_transaction(s, NULL) ); LOG(2) << "WT commit_transaction"; if ( _syncing ) awaitCommitData.syncHappend(); } else { invariantWTOK( s->rollback_transaction(s, NULL) ); LOG(2) << "WT rollback_transaction"; } _active = false; }
void WiredTigerOperationStats::fetchStats(WT_SESSION* session, const std::string& uri, const std::string& config) { invariant(session); WT_CURSOR* c = nullptr; const char* cursorConfig = config.empty() ? nullptr : config.c_str(); int ret = session->open_cursor(session, uri.c_str(), nullptr, cursorConfig, &c); uassert(ErrorCodes::CursorNotFound, "Unable to open statistics cursor", ret == 0); invariant(c); ON_BLOCK_EXIT([&] { c->close(c); }); const char* desc; uint64_t value; uint64_t key; while (c->next(c) == 0 && c->get_key(c, &key) == 0) { fassert(51035, c->get_value(c, &desc, nullptr, &value) == 0); #if defined(__s390x__) _stats[key >> 32] = WiredTigerUtil::castStatisticsValue<long long>(value); #else _stats[key] = WiredTigerUtil::castStatisticsValue<long long>(value); #endif // __s390x__ } // Reset the statistics so that the next fetch gives the recent values. invariantWTOK(c->reset(c)); }
bool WiredTigerKVEngine::_drop( const StringData& ident ) { string uri = _uri( ident ); WiredTigerSession session( _conn, -1 ); int ret = session.getSession()->drop( session.getSession(), uri.c_str(), "force" ); LOG(1) << "WT drop of " << uri << " res " << ret; if ( ret == 0 ) { // yay, it worked return true; } if ( ret == EBUSY ) { // this is expected, queue it up { boost::mutex::scoped_lock lk( _identToDropMutex ); _identToDrop.insert( uri ); _epoch++; } _sessionCache->closeAll(); return false; } invariantWTOK( ret ); return false; }
/**
 * Closes the manager's WiredTiger session, if one is open, while holding '_mutex'.
 * Calling this again after the session has been closed is a no-op.
 */
void WiredTigerSnapshotManager::shutdown() {
    stdx::lock_guard<stdx::mutex> lock(_mutex);

    if (_session) {
        invariantWTOK(_session->close(_session, nullptr));
        _session = nullptr;
    }
}
/**
 * Removes every record from the store and subtracts the current record count and data size
 * from the tracked totals. Returns Status::OK() in all cases, including an empty store.
 */
Status WiredTigerRecordStore::truncate(OperationContext* txn) {
    WiredTigerCursor firstRecordHolder(_uri, _tableId, true, txn);
    WT_CURSOR* firstRecord = firstRecordHolder.get();

    const int stepRc = WT_OP_CHECK(firstRecord->next(firstRecord));
    if (stepRc == WT_NOTFOUND) {
        // Empty collections don't have anything to truncate.
        return Status::OK();
    }
    invariantWTOK(stepRc);

    // Truncate from the first record onwards; no stop cursor is supplied.
    WT_SESSION* wtSession = WiredTigerRecoveryUnit::get(txn)->getSession(txn)->getSession();
    invariantWTOK(WT_OP_CHECK(wtSession->truncate(wtSession, nullptr, firstRecord, nullptr, nullptr)));

    _changeNumRecords(txn, -numRecords(txn));
    _increaseDataSize(txn, -dataSize(txn));

    return Status::OK();
}
void WiredTigerRecoveryUnit::_txnOpen() { invariant( !_active ); WT_SESSION *s = _session->getSession(); _syncing = _syncing || awaitCommitData.numWaitingForSync.load() > 0; invariantWTOK( s->begin_transaction(s, _syncing ? "sync=true" : NULL) ); LOG(2) << "WT begin_transaction"; _timer.reset(); _active = true; }
/**
 * Positions the cursor exactly on 'id' and returns that record.
 *
 * When the key is not found, '_eof' is set and an empty optional is returned. On success
 * '_lastReturnedId' becomes 'id' and '_eof' is cleared.
 */
boost::optional<Record> seekExact(const RecordId& id) final {
    WT_CURSOR* cursor = _cursor->get();
    cursor->set_key(cursor, _makeKey(id));

    // Nothing after the next line can throw WCEs.
    const int searchRc = WT_OP_CHECK(cursor->search(cursor));
    if (searchRc == WT_NOTFOUND) {
        _eof = true;
        return {};
    }
    invariantWTOK(searchRc);

    WT_ITEM payload;
    invariantWTOK(cursor->get_value(cursor, &payload));

    _lastReturnedId = id;
    _eof = false;
    return {{id, {static_cast<const char*>(payload.data), static_cast<int>(payload.size)}}};
}
// Retrieve the value from a positioned cursor. RecordData WiredTigerRecordStore::_getData(const WiredTigerCursor& cursor) const { WT_ITEM value; int ret = cursor->get_value(cursor.get(), &value); invariantWTOK(ret); SharedBuffer data = SharedBuffer::allocate(value.size); memcpy(data.get(), value.data, value.size); return RecordData(data, value.size); }
/**
 * Opens a WiredTiger session on 'conn' configured with "isolation=snapshot".
 *
 * 'cache' and 'epoch' are stored as given; the cursor counters start at zero. Failure to
 * open the session is checked with invariantWTOK().
 */
WiredTigerSession::WiredTigerSession(WT_CONNECTION* conn,
                                     WiredTigerSessionCache* cache,
                                     int epoch)
    : _epoch(epoch),
      _cache(cache),
      _session(nullptr),
      _cursorGen(0),
      _cursorsCached(0),
      _cursorsOut(0) {
    // No event handler is supplied for the session.
    invariantWTOK(conn->open_session(conn, nullptr, "isolation=snapshot", &_session));
}
int WiredTigerKVEngine::flushAllFiles( bool sync ) { LOG(1) << "WiredTigerKVEngine::flushAllFiles"; syncSizeInfo(); WiredTigerSession session( _conn, -1 ); WT_SESSION* s = session.getSession(); invariantWTOK( s->checkpoint(s, NULL ) ); return 1; }
void WiredTigerSession::releaseCursor(uint64_t id, WT_CURSOR* cursor) { invariant(_session); invariant(cursor); _cursorsOut--; invariantWTOK(cursor->reset(cursor)); // Cursors are pushed to the front of the list and removed from the back _cursors.push_front(WiredTigerCachedCursor(id, _cursorGen++, cursor)); // A negative value for wiredTigercursorCacheSize means to use hybrid caching. std::uint32_t cacheSize = abs(kWiredTigerCursorCacheSize.load()); while (!_cursors.empty() && _cursorGen - _cursors.back()._gen > cacheSize) { cursor = _cursors.back()._cursor; _cursors.pop_back(); invariantWTOK(cursor->close(cursor)); } }
/**
 * Drops every named snapshot before the committed one, while holding '_mutex'. Does nothing
 * when no committed snapshot is set.
 */
void WiredTigerSnapshotManager::cleanupUnneededSnapshots() {
    stdx::lock_guard<stdx::mutex> lock(_mutex);

    if (_committedSnapshot) {
        const std::string dropConfig = str::stream() << "drop=(before="
                                                     << _committedSnapshot->asU64() << ')';
        invariantWTOK(_session->snapshot(_session, dropConfig.c_str()));
    }
}
/**
 * Closes every non-null cursor held in the cursor cache and then empties the cache.
 */
void WiredTigerSession::closeAllCursors() {
    invariant(_session);

    for (auto& cached : _cursors) {
        WT_CURSOR* cursor = cached._cursor;
        if (!cursor) {
            continue;
        }
        invariantWTOK(cursor->close(cursor));
    }

    _cursors.clear();
}
/**
 * Acquires a ticket for 'opCtx' and begins a WiredTiger transaction on this unit's session.
 * The transaction is opened with "sync=true" when '_syncing' is already set or when
 * waitUntilDurableData reports waiters.
 */
void WiredTigerRecoveryUnit::_txnOpen(OperationContext* opCtx) {
    invariant(!_active);
    _getTicket(opCtx);

    WT_SESSION* wtSession = _session->getSession();

    // Once set, '_syncing' stays set; otherwise it follows the number of sync waiters.
    if (!_syncing) {
        _syncing = waitUntilDurableData.numWaitingForSync.load() > 0;
    }

    const char* beginConfig = _syncing ? "sync=true" : NULL;
    invariantWTOK(wtSession->begin_transaction(wtSession, beginConfig));
    LOG(2) << "WT begin_transaction";

    _timer.reset();
    _active = true;
}
/**
 * Begins a transaction on 'session' that reads as of 'pointInTime', by passing a
 * "read_timestamp=<hex>" configuration string to begin_transaction().
 */
void WiredTigerSnapshotManager::beginTransactionAtTimestamp(SnapshotName pointInTime,
                                                            WT_SESSION* session) const {
    // Room for the "read_timestamp=" prefix (15 characters), up to 16 hexadecimal digits for
    // a 64-bit value, and the trailing null.
    char beginConfig[15 + (8 * 2) + 1];

    const auto written = std::snprintf(beginConfig,
                                       sizeof(beginConfig),
                                       "read_timestamp=%llx",
                                       static_cast<unsigned long long>(pointInTime.asU64()));
    // The formatted string must have fit, including its terminator.
    invariant(static_cast<std::size_t>(written) < sizeof(beginConfig));

    invariantWTOK(session->begin_transaction(session, beginConfig));
}
bool restore() final { // We can't use the CursorCache since this cursor needs a special config string. WT_SESSION* session = WiredTigerRecoveryUnit::get(_txn)->getSession(_txn)->getSession(); if (!_cursor) { invariantWTOK( session->open_cursor(session, _rs->_uri.c_str(), NULL, "next_random", &_cursor)); invariant(_cursor); } return true; }
/**
 * Returns the data stored under 'loc'.
 *
 * Raises massert 28556 when the record does not exist; any other lookup failure is checked
 * with invariantWTOK(). The returned RecordData is produced by _getData().
 */
RecordData WiredTigerRecordStore::dataFor(OperationContext* txn, const RecordId& loc) const {
    WiredTigerCursor cursorHolder(_uri, _tableId, true, txn);
    WT_CURSOR* cursor = cursorHolder.get();
    invariant(cursor);

    cursor->set_key(cursor, _makeKey(loc));
    const int searchRc = WT_OP_CHECK(cursor->search(cursor));
    massert(28556, "Didn't find RecordId in WiredTigerRecordStore", searchRc != WT_NOTFOUND);
    invariantWTOK(searchRc);

    return _getData(cursorHolder);
}
/**
 * Removes the record stored under 'loc'.
 *
 * The record is looked up first so that its length can be read; the tracked record count is
 * then decreased by one and the tracked data size by that length. All WiredTiger results
 * are checked with invariantWTOK().
 */
void WiredTigerRecordStore::deleteRecord(OperationContext* txn, const RecordId& loc) {
    WiredTigerCursor cursorHolder(_uri, _tableId, true, txn);
    cursorHolder.assertInActiveTxn();
    WT_CURSOR* cursor = cursorHolder.get();

    cursor->set_key(cursor, _makeKey(loc));
    invariantWTOK(WT_OP_CHECK(cursor->search(cursor)));

    // Capture the length before the record is removed.
    WT_ITEM previous;
    invariantWTOK(cursor->get_value(cursor, &previous));
    const int previousLength = previous.size;

    invariantWTOK(WT_OP_CHECK(cursor->remove(cursor)));

    _changeNumRecords(txn, -1);
    _increaseDataSize(txn, -previousLength);
}
/**
 * Runs WiredTiger's compact on this record store's uri with the "timeout=0" configuration,
 * using a session taken from the session cache and released afterwards.
 *
 * 'adaptor', 'options' and 'stats' are not referenced in this body.
 *
 * @return always Status::OK(); a failing compact call is checked with invariantWTOK().
 */
Status WiredTigerRecordStore::compact(OperationContext* txn,
                                      RecordStoreCompactAdaptor* adaptor,
                                      const CompactOptions* options,
                                      CompactStats* stats) {
    WiredTigerSessionCache* sessionCache = WiredTigerRecoveryUnit::get(txn)->getSessionCache();
    WiredTigerSession* borrowed = sessionCache->getSession();

    WT_SESSION* wtSession = borrowed->getSession();
    invariantWTOK(wtSession->compact(wtSession, getURI().c_str(), "timeout=0"));

    sessionCache->releaseSession(borrowed);
    return Status::OK();
}
void WiredTigerRecoveryUnit::prepareUnitOfWork() { invariant(_inUnitOfWork(), toString(_state)); invariant(!_prepareTimestamp.isNull()); auto session = getSession(); WT_SESSION* s = session->getSession(); LOG(1) << "preparing transaction at time: " << _prepareTimestamp; const std::string conf = "prepare_timestamp=" + integerToHex(_prepareTimestamp.asULL()); // Prepare the transaction. invariantWTOK(s->prepare_transaction(s, conf.c_str())); }