void TerarkDbRecordStore::deleteRecord(OperationContext* txn, const RecordId& id) {
    auto& td = m_table->getMyThreadData();
    // RecordId is 1-based while TerarkDB row ids are 0-based, hence the -1.
    m_table->m_tab->removeRow(id.repr() - 1, &*td.m_dbCtx);
}
void putRecordId(void* dest, RecordId loc) {
    const RecordIdRepr repr = loc.repr();
    // memcpy handles a potentially unaligned destination buffer.
    memcpy(dest, &repr, sizeof(repr));
}
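// Not in the original source: a minimal sketch of the reverse helper, assuming the
// buffer was written by putRecordId above. The name getRecordId is illustrative.
RecordId getRecordId(const void* src) {
    RecordIdRepr repr;
    // memcpy again, since src may also be unaligned.
    memcpy(&repr, src, sizeof(repr));
    return RecordId(repr);
}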
int64_t WiredTigerRecordStore::_makeKey(const RecordId& loc) { return loc.repr(); }
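// For context, a minimal sketch (hypothetical helper, not from the original source)
// of how this int64 key would be handed to a WiredTiger cursor. It assumes the
// record store's key format is "q" (signed 64-bit), so set_key takes the value
// directly; _makeKey above simply returns loc.repr().
void positionAtRecord(WT_CURSOR* c, const RecordId& loc) {
    c->set_key(c, static_cast<int64_t>(loc.repr()));
    invariantWTOK(c->search(c));  // an exact match positions the cursor on the record
}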
PlanStage::StageState IDHackStage::work(WorkingSetID* out) {
    ++_commonStats.works;

    // Adds the amount of time taken by work() to executionTimeMillis.
    ScopedTimer timer(&_commonStats.executionTimeMillis);

    if (_done) {
        return PlanStage::IS_EOF;
    }

    if (WorkingSet::INVALID_ID != _idBeingPagedIn) {
        invariant(_recordCursor);
        WorkingSetID id = _idBeingPagedIn;
        _idBeingPagedIn = WorkingSet::INVALID_ID;
        invariant(WorkingSetCommon::fetchIfUnfetched(_txn, _workingSet, id, _recordCursor));

        WorkingSetMember* member = _workingSet->get(id);
        return advance(id, member, out);
    }

    WorkingSetID id = WorkingSet::INVALID_ID;
    try {
        // Use the index catalog to get the id index.
        const IndexCatalog* catalog = _collection->getIndexCatalog();

        // Find the index we use.
        IndexDescriptor* idDesc = catalog->findIdIndex(_txn);
        if (NULL == idDesc) {
            _done = true;
            return PlanStage::IS_EOF;
        }

        // Look up the key by going directly to the index.
        RecordId loc = catalog->getIndex(idDesc)->findSingle(_txn, _key);

        // Key not found.
        if (loc.isNull()) {
            _done = true;
            return PlanStage::IS_EOF;
        }

        ++_specificStats.keysExamined;
        ++_specificStats.docsExamined;

        // Create a new WSM for the result document.
        id = _workingSet->allocate();
        WorkingSetMember* member = _workingSet->get(id);
        member->loc = loc;
        _workingSet->transitionToLocAndIdx(id);

        if (!_recordCursor)
            _recordCursor = _collection->getCursor(_txn);

        // We may need to request a yield while we fetch the document.
        if (auto fetcher = _recordCursor->fetcherForId(loc)) {
            // There's something to fetch. Hand the fetcher off to the WSM, and pass up a
            // fetch request.
            _idBeingPagedIn = id;
            member->setFetcher(fetcher.release());
            *out = id;
            _commonStats.needYield++;
            return NEED_YIELD;
        }

        // The doc was already in memory, so we go ahead and return it.
        if (!WorkingSetCommon::fetch(_txn, _workingSet, id, _recordCursor)) {
            // _id is immutable so the index would return the only record that could
            // possibly match the query.
            _workingSet->free(id);
            _commonStats.isEOF = true;
            _done = true;
            return IS_EOF;
        }

        return advance(id, member, out);
    } catch (const WriteConflictException& wce) {
        // Restart at the beginning on retry.
        _recordCursor.reset();
        if (id != WorkingSet::INVALID_ID)
            _workingSet->free(id);

        *out = WorkingSet::INVALID_ID;
        _commonStats.needYield++;
        return NEED_YIELD;
    }
}
Status MMAPV1DatabaseCatalogEntry::_renameSingleNamespace(OperationContext* txn,
                                                          StringData fromNS,
                                                          StringData toNS,
                                                          bool stayTemp) {
    // some sanity checking
    NamespaceDetails* fromDetails = _namespaceIndex.details(fromNS);
    if (!fromDetails)
        return Status(ErrorCodes::BadValue, "from namespace doesn't exist");

    if (_namespaceIndex.details(toNS))
        return Status(ErrorCodes::BadValue, "to namespace already exists");

    // at this point, we haven't done anything destructive yet

    // ----
    // actually start moving
    // ----

    // this could throw, but if it does we're ok
    _namespaceIndex.add_ns(txn, toNS, fromDetails);
    NamespaceDetails* toDetails = _namespaceIndex.details(toNS);

    try {
        toDetails->copyingFrom(txn, toNS, _namespaceIndex, fromDetails);  // fixes extraOffset
    } catch (DBException&) {
        // could end up here if .ns is full - if so try to clean up / roll back a little
        _namespaceIndex.kill_ns(txn, toNS);
        throw;
    }

    // at this point, the .ns metadata has been moved
    _namespaceIndex.kill_ns(txn, fromNS);
    fromDetails = NULL;

    // fix system.namespaces
    BSONObj newSpec;
    RecordId oldSpecLocation = getCollectionCatalogEntry(fromNS)->getNamespacesRecordId();
    invariant(!oldSpecLocation.isNull());
    {
        BSONObj oldSpec =
            _getNamespaceRecordStore()->dataFor(txn, oldSpecLocation).releaseToBson();
        invariant(!oldSpec.isEmpty());

        BSONObjBuilder b;
        BSONObjIterator i(oldSpec.getObjectField("options"));
        while (i.more()) {
            BSONElement e = i.next();
            if (strcmp(e.fieldName(), "create") != 0) {
                if (stayTemp || (strcmp(e.fieldName(), "temp") != 0))
                    b.append(e);
            } else {
                b << "create" << toNS;
            }
        }
        newSpec = b.obj();
    }

    RecordId rid =
        _addNamespaceToNamespaceCollection(txn, toNS, newSpec.isEmpty() ? 0 : &newSpec);

    _getNamespaceRecordStore()->deleteRecord(txn, oldSpecLocation);

    Entry*& entry = _collections[toNS.toString()];
    invariant(entry == NULL);
    txn->recoveryUnit()->registerChange(new EntryInsertion(toNS, this));
    entry = new Entry();
    _removeFromCache(txn->recoveryUnit(), fromNS);
    _insertInCache(txn, toNS, rid, entry);

    return Status::OK();
}
RecordId WiredTigerRecordStore::_nextId() {
    invariant(!_useOplogHack);
    RecordId out = RecordId(_nextIdNum.fetchAndAdd(1));
    invariant(out.isNormal());
    return out;
}
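// A minimal sketch of how a counter like _nextIdNum is typically seeded at startup
// (seedNextIdNum is a hypothetical helper; it assumes a backward cursor yields the
// record with the largest RecordId first), so freshly allocated ids never collide
// with existing records:
void WiredTigerRecordStore::seedNextIdNum(OperationContext* txn) {
    int64_t nextId = 1;  // RecordIds start at 1; 0 is the null RecordId
    auto cursor = getCursor(txn, /*forward=*/false);
    if (auto record = cursor->next())
        nextId = record->id.repr() + 1;
    _nextIdNum.store(nextId);
}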
PlanStage::StageState IDHackStage::doWork(WorkingSetID* out) {
    if (_done) {
        return PlanStage::IS_EOF;
    }

    if (WorkingSet::INVALID_ID != _idBeingPagedIn) {
        invariant(_recordCursor);
        WorkingSetID id = _idBeingPagedIn;
        _idBeingPagedIn = WorkingSet::INVALID_ID;
        invariant(WorkingSetCommon::fetchIfUnfetched(getOpCtx(), _workingSet, id, _recordCursor));

        WorkingSetMember* member = _workingSet->get(id);
        return advance(id, member, out);
    }

    WorkingSetID id = WorkingSet::INVALID_ID;
    try {
        // Look up the key by going directly to the index.
        RecordId recordId = _accessMethod->findSingle(getOpCtx(), _key);

        // Key not found.
        if (recordId.isNull()) {
            _done = true;
            return PlanStage::IS_EOF;
        }

        ++_specificStats.keysExamined;
        ++_specificStats.docsExamined;

        // Create a new WSM for the result document.
        id = _workingSet->allocate();
        WorkingSetMember* member = _workingSet->get(id);
        member->recordId = recordId;
        _workingSet->transitionToRecordIdAndIdx(id);

        if (!_recordCursor)
            _recordCursor = _collection->getCursor(getOpCtx());

        // We may need to request a yield while we fetch the document.
        if (auto fetcher = _recordCursor->fetcherForId(recordId)) {
            // There's something to fetch. Hand the fetcher off to the WSM, and pass up a
            // fetch request.
            _idBeingPagedIn = id;
            member->setFetcher(fetcher.release());
            *out = id;
            return NEED_YIELD;
        }

        // The doc was already in memory, so we go ahead and return it.
        if (!WorkingSetCommon::fetch(getOpCtx(), _workingSet, id, _recordCursor)) {
            // _id is immutable so the index would return the only record that could
            // possibly match the query.
            _workingSet->free(id);
            _commonStats.isEOF = true;
            _done = true;
            return IS_EOF;
        }

        return advance(id, member, out);
    } catch (const WriteConflictException& wce) {
        // Restart at the beginning on retry.
        _recordCursor.reset();
        if (id != WorkingSet::INVALID_ID)
            _workingSet->free(id);

        *out = WorkingSet::INVALID_ID;
        return NEED_YIELD;
    }
}
TEST(KeyStringTest, RecordIds) {
    for (int i = 0; i < 63; i++) {
        const RecordId rid = RecordId(1ll << i);

        {  // Test encoding / decoding of single RecordIds
            const KeyString ks(rid);
            ASSERT_GTE(ks.getSize(), 2u);
            ASSERT_LTE(ks.getSize(), 10u);

            ASSERT_EQ(KeyString::decodeRecordIdAtEnd(ks.getBuffer(), ks.getSize()), rid);

            {
                BufReader reader(ks.getBuffer(), ks.getSize());
                ASSERT_EQ(KeyString::decodeRecordId(&reader), rid);
                ASSERT(reader.atEof());
            }

            if (rid.isNormal()) {
                ASSERT_GT(ks, KeyString(RecordId()));
                ASSERT_GT(ks, KeyString(RecordId::min()));
                ASSERT_LT(ks, KeyString(RecordId::max()));

                ASSERT_GT(ks, KeyString(RecordId(rid.repr() - 1)));
                ASSERT_LT(ks, KeyString(RecordId(rid.repr() + 1)));
            }
        }

        for (int j = 0; j < 63; j++) {
            RecordId other = RecordId(1ll << j);

            if (rid == other)
                ASSERT_EQ(KeyString(rid), KeyString(other));
            if (rid < other)
                ASSERT_LT(KeyString(rid), KeyString(other));
            if (rid > other)
                ASSERT_GT(KeyString(rid), KeyString(other));

            {  // Test concatenating RecordIds like in a unique index.
                KeyString ks;
                ks.appendRecordId(RecordId::max());  // uses all bytes
                ks.appendRecordId(rid);
                ks.appendRecordId(RecordId(0xDEADBEEF));  // uses some extra bytes
                ks.appendRecordId(rid);
                ks.appendRecordId(RecordId(1));  // uses no extra bytes
                ks.appendRecordId(rid);
                ks.appendRecordId(other);

                ASSERT_EQ(KeyString::decodeRecordIdAtEnd(ks.getBuffer(), ks.getSize()), other);

                // forward scan
                BufReader reader(ks.getBuffer(), ks.getSize());
                ASSERT_EQ(KeyString::decodeRecordId(&reader), RecordId::max());
                ASSERT_EQ(KeyString::decodeRecordId(&reader), rid);
                ASSERT_EQ(KeyString::decodeRecordId(&reader), RecordId(0xDEADBEEF));
                ASSERT_EQ(KeyString::decodeRecordId(&reader), rid);
                ASSERT_EQ(KeyString::decodeRecordId(&reader), RecordId(1));
                ASSERT_EQ(KeyString::decodeRecordId(&reader), rid);
                ASSERT_EQ(KeyString::decodeRecordId(&reader), other);
                ASSERT(reader.atEof());
            }
        }
    }
}
RecordId EphemeralForTestRecordStore::allocateLoc() {
    RecordId out = RecordId(_data->nextId++);
    invariant(out.isNormal());
    return out;
}
PlanStage::StageState CollectionScan::work(WorkingSetID* out) {
    ++_commonStats.works;

    // Adds the amount of time taken by work() to executionTimeMillis.
    ScopedTimer timer(&_commonStats.executionTimeMillis);

    if (_isDead) {
        return PlanStage::DEAD;
    }

    // Do some init if we haven't already.
    if (NULL == _iter) {
        if (_params.collection == NULL) {
            _isDead = true;
            return PlanStage::DEAD;
        }

        try {
            if (_lastSeenLoc.isNull()) {
                _iter.reset(
                    _params.collection->getIterator(_txn, _params.start, _params.direction));
            } else {
                invariant(_params.tailable);

                _iter.reset(
                    _params.collection->getIterator(_txn, _lastSeenLoc, _params.direction));

                // Advance _iter past where we were last time. If it returns something else,
                // mark us as dead since we want to signal an error rather than silently
                // dropping data from the stream. This is related to the _lastSeenLoc handling
                // in invalidate.
                if (_iter->getNext() != _lastSeenLoc) {
                    _isDead = true;
                    return PlanStage::DEAD;
                }
            }
        } catch (const WriteConflictException& wce) {
            // Leave us in a state to try again next time.
            _iter.reset();
            *out = WorkingSet::INVALID_ID;
            return PlanStage::NEED_YIELD;
        }

        ++_commonStats.needTime;
        return PlanStage::NEED_TIME;
    }

    // Should we try getNext() on the underlying _iter?
    if (isEOF())
        return PlanStage::IS_EOF;

    const RecordId curr = _iter->curr();
    if (curr.isNull()) {
        // We just hit EOF
        if (_params.tailable)
            _iter.reset();  // pick up where we left off on the next call to work()
        return PlanStage::IS_EOF;
    }

    _lastSeenLoc = curr;

    // See if the record we're about to access is in memory. If not, pass a fetch request up.
    // Note that curr() does not touch the record. This way, we are able to yield before
    // fetching the record.
    {
        std::auto_ptr<RecordFetcher> fetcher(_params.collection->documentNeedsFetch(_txn, curr));
        if (NULL != fetcher.get()) {
            WorkingSetMember* member = _workingSet->get(_wsidForFetch);
            member->loc = curr;
            // Pass the RecordFetcher off to the WSM.
            member->setFetcher(fetcher.release());
            *out = _wsidForFetch;
            _commonStats.needYield++;
            return NEED_YIELD;
        }
    }

    // Do this before advancing because it is more efficient while the iterator is still on this
    // document.
    const Snapshotted<BSONObj> obj = Snapshotted<BSONObj>(
        _txn->recoveryUnit()->getSnapshotId(), _iter->dataFor(curr).releaseToBson());

    // Advance the iterator.
    try {
        invariant(_iter->getNext() == curr);
    } catch (const WriteConflictException& wce) {
        // If getNext throws, it leaves us on the original document.
        invariant(_iter->curr() == curr);
        *out = WorkingSet::INVALID_ID;
        return PlanStage::NEED_YIELD;
    }

    WorkingSetID id = _workingSet->allocate();
    WorkingSetMember* member = _workingSet->get(id);
    member->loc = curr;
    member->obj = obj;
    member->state = WorkingSetMember::LOC_AND_UNOWNED_OBJ;

    return returnIfMatches(member, id, out);
}
RecordId MobileRecordStore::_nextId() {
    RecordId out = RecordId(_nextIdNum.fetchAndAdd(1));
    invariant(out.isNormal());
    return out;
}
void syncFixUp(OperationContext* txn,
               FixUpInfo& fixUpInfo,
               OplogReader* oplogreader,
               ReplicationCoordinator* replCoord) {
    DBClientConnection* them = oplogreader->conn();

    // fetch all first so we needn't handle interruption in a fancy way

    unsigned long long totalSize = 0;

    list<pair<DocID, BSONObj> > goodVersions;

    BSONObj newMinValid;

    // fetch all the goodVersions of each document from current primary
    DocID doc;
    unsigned long long numFetched = 0;
    try {
        for (set<DocID>::iterator it = fixUpInfo.toRefetch.begin();
             it != fixUpInfo.toRefetch.end();
             it++) {
            doc = *it;
            verify(!doc._id.eoo());

            {
                // TODO : slow. lots of round trips.
                numFetched++;
                BSONObj good =
                    them->findOne(doc.ns, doc._id.wrap(), NULL, QueryOption_SlaveOk).getOwned();
                totalSize += good.objsize();
                uassert(13410, "replSet too much data to roll back", totalSize < 300 * 1024 * 1024);

                // note good might be eoo, indicating we should delete it
                goodVersions.push_back(pair<DocID, BSONObj>(doc, good));
            }
        }
        newMinValid = oplogreader->getLastOp(rsOplogName);
        if (newMinValid.isEmpty()) {
            error() << "rollback error newMinValid empty?";
            return;
        }
    } catch (DBException& e) {
        LOG(1) << "rollback re-get objects: " << e.toString();
        error() << "rollback couldn't re-get ns:" << doc.ns << " _id:" << doc._id << ' '
                << numFetched << '/' << fixUpInfo.toRefetch.size();
        throw e;
    }

    log() << "rollback 3.5";
    if (fixUpInfo.rbid != getRBID(oplogreader->conn())) {
        // our source rolled back itself, so the data we received isn't necessarily consistent.
        warning() << "rollback rbid on source changed during rollback, cancelling this attempt";
        return;
    }

    // update them
    log() << "rollback 4 n:" << goodVersions.size();

    bool warn = false;

    invariant(!fixUpInfo.commonPointOurDiskloc.isNull());
    invariant(txn->lockState()->isW());

    // we have items we are writing that aren't from a point-in-time, thus best not to come
    // online until we get to that point in freshness.
    Timestamp minValid = newMinValid["ts"].timestamp();
    log() << "minvalid=" << minValid.toStringLong();
    setMinValid(txn, minValid);

    // any full collection resyncs required?
    if (!fixUpInfo.collectionsToResync.empty()) {
        for (set<string>::iterator it = fixUpInfo.collectionsToResync.begin();
             it != fixUpInfo.collectionsToResync.end();
             it++) {
            string ns = *it;
            log() << "rollback 4.1 coll resync " << ns;

            const NamespaceString nss(ns);

            Database* db = dbHolder().openDb(txn, nss.db().toString());
            invariant(db);

            {
                WriteUnitOfWork wunit(txn);
                db->dropCollection(txn, ns);
                wunit.commit();
            }

            {
                string errmsg;

                // This comes as a GlobalWrite lock, so there is no DB to be acquired after
                // resume, so we can skip the DB stability checks. Also
                // copyCollectionFromRemote will acquire its own database pointer, under the
                // appropriate locks, so just releasing and acquiring the lock is safe.
                invariant(txn->lockState()->isW());
                Lock::TempRelease release(txn->lockState());

                bool ok = copyCollectionFromRemote(txn, them->getServerAddress(), ns, errmsg);
                uassert(15909,
                        str::stream() << "replSet rollback error resyncing collection " << ns
                                      << ' ' << errmsg,
                        ok);
            }
        }

        // we did more reading from primary, so check it again for a rollback (which would mess
        // us up), and make minValid newer.
        log() << "rollback 4.2";

        string err;
        try {
            newMinValid = oplogreader->getLastOp(rsOplogName);
            if (newMinValid.isEmpty()) {
                err = "can't get minvalid from sync source";
            } else {
                Timestamp minValid = newMinValid["ts"].timestamp();
                log() << "minvalid=" << minValid.toStringLong();
                setMinValid(txn, minValid);
            }
        } catch (DBException& e) {
            err = "can't get/set minvalid: ";
            err += e.what();
        }
        if (fixUpInfo.rbid != getRBID(oplogreader->conn())) {
            // our source rolled back itself, so the data we received isn't necessarily
            // consistent. however, we've now done writes, thus we have a problem.
            err += "rbid at primary changed during resync/rollback";
        }
        if (!err.empty()) {
            severe() << "rolling back : " << err << ". A full resync will be necessary.";
            // TODO: reset minvalid so that we are permanently in fatal state
            // TODO: don't be fatal, but rather, get all the data first.
            throw RSFatalException();
        }
        log() << "rollback 4.3";
    }

    map<string, shared_ptr<Helpers::RemoveSaver> > removeSavers;

    log() << "rollback 4.6";
    // drop collections to drop before doing individual fixups - that might make things faster
    // below actually if there were subsequent inserts to rollback
    for (set<string>::iterator it = fixUpInfo.toDrop.begin(); it != fixUpInfo.toDrop.end(); it++) {
        log() << "rollback drop: " << *it;

        Database* db = dbHolder().get(txn, nsToDatabaseSubstring(*it));
        if (db) {
            WriteUnitOfWork wunit(txn);

            shared_ptr<Helpers::RemoveSaver>& removeSaver = removeSavers[*it];
            if (!removeSaver)
                removeSaver.reset(new Helpers::RemoveSaver("rollback", "", *it));

            // perform a collection scan and write all documents in the collection to disk
            boost::scoped_ptr<PlanExecutor> exec(
                InternalPlanner::collectionScan(txn, *it, db->getCollection(*it)));
            BSONObj curObj;
            PlanExecutor::ExecState execState;
            while (PlanExecutor::ADVANCED == (execState = exec->getNext(&curObj, NULL))) {
                removeSaver->goingToDelete(curObj);
            }
            if (execState != PlanExecutor::IS_EOF) {
                if (execState == PlanExecutor::FAILURE &&
                    WorkingSetCommon::isValidStatusMemberObject(curObj)) {
                    Status errorStatus = WorkingSetCommon::getMemberObjectStatus(curObj);
                    severe() << "rolling back createCollection on " << *it << " failed with "
                             << errorStatus << ". A full resync is necessary.";
                } else {
                    severe() << "rolling back createCollection on " << *it
                             << " failed. A full resync is necessary.";
                }
                throw RSFatalException();
            }

            db->dropCollection(txn, *it);
            wunit.commit();
        }
    }

    log() << "rollback 4.7";
    OldClientContext ctx(txn, rsOplogName);
    Collection* oplogCollection = ctx.db()->getCollection(rsOplogName);
    uassert(13423,
            str::stream() << "replSet error in rollback can't find " << rsOplogName,
            oplogCollection);

    unsigned deletes = 0, updates = 0;
    time_t lastProgressUpdate = time(0);
    time_t progressUpdateGap = 10;
    for (list<pair<DocID, BSONObj> >::iterator it = goodVersions.begin();
         it != goodVersions.end();
         it++) {
        time_t now = time(0);
        if (now - lastProgressUpdate > progressUpdateGap) {
            log() << deletes << " delete and " << updates
                  << " update operations processed out of " << goodVersions.size()
                  << " total operations";
            lastProgressUpdate = now;
        }
        const DocID& doc = it->first;
        BSONObj pattern = doc._id.wrap();  // { _id : ... }
        try {
            verify(doc.ns && *doc.ns);
            if (fixUpInfo.collectionsToResync.count(doc.ns)) {
                // we just synced this entire collection
                continue;
            }

            // keep an archive of items rolled back
            shared_ptr<Helpers::RemoveSaver>& removeSaver = removeSavers[doc.ns];
            if (!removeSaver)
                removeSaver.reset(new Helpers::RemoveSaver("rollback", "", doc.ns));

            // todo: lots of overhead in context, this can be faster
            OldClientContext ctx(txn, doc.ns);

            // Add the doc to our rollback file
            BSONObj obj;
            Collection* collection = ctx.db()->getCollection(doc.ns);

            // Do not log an error when undoing an insert on a no-longer-existent collection.
            // It is likely that the collection was dropped as part of rolling back a
            // createCollection command and, regardless, the document no longer exists.
            if (collection) {
                bool found = Helpers::findOne(txn, collection, pattern, obj, false);
                if (found) {
                    removeSaver->goingToDelete(obj);
                } else {
                    error() << "rollback cannot find object: " << pattern << " in namespace "
                            << doc.ns;
                }
            }

            if (it->second.isEmpty()) {
                // wasn't on the primary; delete.
                // TODO 1.6 : can't delete from a capped collection. need to handle that here.
                deletes++;

                if (collection) {
                    if (collection->isCapped()) {
                        // can't delete from a capped collection - so we truncate instead. if
                        // this item must go, so must all successors!!!
                        try {
                            // TODO: IIRC cappedTruncateAfter does not handle completely empty.
                            // this will be crazily slow if there is no _id index.
                            long long start = Listener::getElapsedTimeMillis();
                            RecordId loc = Helpers::findOne(txn, collection, pattern, false);
                            if (Listener::getElapsedTimeMillis() - start > 200)
                                warning() << "roll back slow no _id index for " << doc.ns
                                          << " perhaps?";
                            // would be faster but requires index:
                            // RecordId loc = Helpers::findById(nsd, pattern);
                            if (!loc.isNull()) {
                                try {
                                    collection->temp_cappedTruncateAfter(txn, loc, true);
                                } catch (DBException& e) {
                                    if (e.getCode() == 13415) {
                                        // hack: need to just make cappedTruncate do this...
                                        MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
                                            WriteUnitOfWork wunit(txn);
                                            uassertStatusOK(collection->truncate(txn));
                                            wunit.commit();
                                        }
                                        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(
                                            txn, "truncate", collection->ns().ns());
                                    } else {
                                        throw e;
                                    }
                                }
                            }
                        } catch (DBException& e) {
                            error() << "rolling back capped collection rec " << doc.ns << ' '
                                    << e.toString();
                        }
                    } else {
void pass(int p) {
    OperationContextImpl txn;
    create();
    ASSERT_EQUALS( 2, nExtents() );

    BSONObj b = bigObj();

    int N = MinExtentSize / b.objsize() * nExtents() + 5;
    int T = N - 4;

    RecordId truncAt;
    //RecordId l[ 8 ];
    for ( int i = 0; i < N; ++i ) {
        BSONObj bb = bigObj();
        StatusWith<RecordId> status = collection()->insertDocument( &txn, bb, true );
        ASSERT( status.isOK() );
        RecordId a = status.getValue();
        if( T == i )
            truncAt = a;
        ASSERT( !a.isNull() );
        /*ASSERT_EQUALS( i < 2 ? i + 1 : 3 + i % 2, nRecords() );
        if ( i > 3 )
            ASSERT( l[ i ] == l[ i - 4 ] );*/
    }
    ASSERT( nRecords() < N );

    RecordId last, first;
    {
        auto_ptr<Runner> runner(InternalPlanner::collectionScan(
            &txn, ns(), collection(), InternalPlanner::BACKWARD));
        runner->getNext(NULL, &last);
        ASSERT( !last.isNull() );
    }
    {
        auto_ptr<Runner> runner(InternalPlanner::collectionScan(
            &txn, ns(), collection(), InternalPlanner::FORWARD));
        runner->getNext(NULL, &first);
        ASSERT( !first.isNull() );
        ASSERT( first != last );
    }

    collection()->temp_cappedTruncateAfter(&txn, truncAt, false);
    ASSERT_EQUALS( collection()->numRecords() , 28u );

    {
        RecordId loc;
        auto_ptr<Runner> runner(InternalPlanner::collectionScan(
            &txn, ns(), collection(), InternalPlanner::FORWARD));
        runner->getNext(NULL, &loc);
        ASSERT( first == loc );
    }
    {
        auto_ptr<Runner> runner(InternalPlanner::collectionScan(
            &txn, ns(), collection(), InternalPlanner::BACKWARD));
        RecordId loc;
        runner->getNext(NULL, &loc);
        ASSERT( last != loc );
        ASSERT( !last.isNull() );
    }

    // Too big
    BSONObjBuilder bob;
    bob.appendOID("_id", 0, true);
    bob.append( "a", string( MinExtentSize + 300, 'a' ) );
    BSONObj bigger = bob.done();
    StatusWith<RecordId> status = collection()->insertDocument( &txn, bigger, true );
    ASSERT( !status.isOK() );
    ASSERT_EQUALS( 0, nRecords() );
}