PlanStage::StageState MultiIteratorStage::work(WorkingSetID* out) {
    if (_collection == NULL)
        return PlanStage::DEAD;

    // The RecordId we're about to look at might not be in memory. In this case
    // we request a yield while we fetch the document.
    if (!_iterators.empty()) {
        RecordId curr = _iterators.back()->curr();
        if (!curr.isNull()) {
            std::auto_ptr<RecordFetcher> fetcher(_collection->documentNeedsFetch(_txn, curr));
            if (NULL != fetcher.get()) {
                WorkingSetMember* member = _ws->get(_wsidForFetch);
                member->loc = curr;
                // Pass the RecordFetcher off to the WSM on which we're performing the fetch.
                member->setFetcher(fetcher.release());
                *out = _wsidForFetch;
                return NEED_FETCH;
            }
        }
    }

    RecordId next = _advance();
    if (next.isNull())
        return PlanStage::IS_EOF;

    *out = _ws->allocate();
    WorkingSetMember* member = _ws->get(*out);
    member->loc = next;
    member->obj = _collection->docFor(_txn, next);
    member->state = WorkingSetMember::LOC_AND_OBJ;
    return PlanStage::ADVANCED;
}

PlanStage::StageState OplogStart::workExtentHopping(WorkingSetID* out) {
    if (_done || _subIterators.empty()) {
        return PlanStage::IS_EOF;
    }

    // we work from the back to the front since the back has the newest data.
    const RecordId loc = _subIterators.back()->curr();
    if (loc.isNull())
        return PlanStage::NEED_TIME;

    // TODO: should we ever try and return NEED_FETCH here?
    const BSONObj obj = _subIterators.back()->dataFor(loc).releaseToBson();
    if (!_filter->matchesBSON(obj)) {
        _done = true;
        WorkingSetID id = _workingSet->allocate();
        WorkingSetMember* member = _workingSet->get(id);
        member->loc = loc;
        member->obj = Snapshotted<BSONObj>(_txn->recoveryUnit()->getSnapshotId(), obj);
        member->state = WorkingSetMember::LOC_AND_UNOWNED_OBJ;
        *out = id;
        return PlanStage::ADVANCED;
    }

    _subIterators.popAndDeleteBack();
    return PlanStage::NEED_TIME;
}

bool Helpers::findById(OperationContext* opCtx,
                       Database* database,
                       StringData ns,
                       BSONObj query,
                       BSONObj& result,
                       bool* nsFound,
                       bool* indexFound) {
    invariant(database);

    Collection* collection = database->getCollection(opCtx, ns);
    if (!collection) {
        return false;
    }

    if (nsFound)
        *nsFound = true;

    IndexCatalog* catalog = collection->getIndexCatalog();
    const IndexDescriptor* desc = catalog->findIdIndex(opCtx);
    if (!desc)
        return false;

    if (indexFound)
        *indexFound = true;

    RecordId loc = catalog->getIndex(desc)->findSingle(opCtx, query["_id"].wrap());
    if (loc.isNull())
        return false;
    result = collection->docFor(opCtx, loc).value();
    return true;
}

boost::optional<Record> next() final {
    if (_eof)
        return {};

    WT_CURSOR* c = _cursor->get();

    bool mustAdvance = true;
    if (_lastReturnedId.isNull() && !_forward && _rs._isCapped) {
        // In this case we need to seek to the highest visible record.
        const RecordId reverseCappedInitialSeekPoint =
            _readUntilForOplog.isNull() ? _rs.lowestCappedHiddenRecord() : _readUntilForOplog;

        if (!reverseCappedInitialSeekPoint.isNull()) {
            c->set_key(c, _makeKey(reverseCappedInitialSeekPoint));
            int cmp;
            int seekRet = WT_OP_CHECK(c->search_near(c, &cmp));
            if (seekRet == WT_NOTFOUND) {
                _eof = true;
                return {};
            }
            invariantWTOK(seekRet);

            // If we landed at or past the lowest hidden record, we must advance to be in
            // the visible range.
            mustAdvance = _rs.isCappedHidden(reverseCappedInitialSeekPoint)
                ? (cmp >= 0)
                : (cmp > 0);  // No longer hidden.
        }
    }

    if (mustAdvance) {
        // Nothing after the next line can throw WCEs.
        // Note that an unpositioned (or eof) WT_CURSOR returns the first/last entry in the
        // table when you call next/prev.
        int advanceRet = WT_OP_CHECK(_forward ? c->next(c) : c->prev(c));
        if (advanceRet == WT_NOTFOUND) {
            _eof = true;
            return {};
        }
        invariantWTOK(advanceRet);
    }

    int64_t key;
    invariantWTOK(c->get_key(c, &key));
    const RecordId id = _fromKey(key);

    if (!isVisible(id)) {
        _eof = true;
        return {};
    }

    WT_ITEM value;
    invariantWTOK(c->get_value(c, &value));

    _lastReturnedId = id;
    return {{id, {static_cast<const char*>(value.data), static_cast<int>(value.size)}}};
}

int nExtents() const {
    int count = 0;
    for (RecordId extLoc = nsd()->firstExtent(); !extLoc.isNull();
         extLoc = extentManager()->getExtent(extLoc)->xnext) {
        ++count;
    }
    return count;
}

RecordId MultiIteratorStage::_advance() {
    while (!_iterators.empty()) {
        RecordId out = _iterators.back()->getNext();
        if (!out.isNull())
            return out;
        _iterators.popAndDeleteBack();
    }

    // All iterators are exhausted; a null RecordId signals EOF to the caller.
    return RecordId();
}

/* fetch a single object from collection ns that matches query
   set your db SavedContext first
*/
bool Helpers::findOne(OperationContext* txn,
                      Collection* collection,
                      const BSONObj& query,
                      BSONObj& result,
                      bool requireIndex) {
    RecordId loc = findOne(txn, collection, query, requireIndex);
    if (loc.isNull())
        return false;
    result = collection->docFor(txn, loc).value();
    return true;
}

void ReplicationRecoveryImpl::_truncateOplogTo(OperationContext* opCtx,
                                               Timestamp truncateTimestamp) {
    Timer timer;
    const NamespaceString oplogNss(NamespaceString::kRsOplogNamespace);
    AutoGetDb autoDb(opCtx, oplogNss.db(), MODE_IX);
    Lock::CollectionLock oplogCollectionLock(opCtx->lockState(), oplogNss.ns(), MODE_X);
    Collection* oplogCollection = autoDb.getDb()->getCollection(opCtx, oplogNss);
    if (!oplogCollection) {
        fassertFailedWithStatusNoTrace(
            34418,
            Status(ErrorCodes::NamespaceNotFound,
                   str::stream() << "Can't find " << NamespaceString::kRsOplogNamespace.ns()));
    }

    // Scan through oplog in reverse, from latest entry to first, to find the truncateTimestamp.
    RecordId oldestIDToDelete;  // Non-null if there is something to delete.
    auto oplogRs = oplogCollection->getRecordStore();
    auto oplogReverseCursor = oplogRs->getCursor(opCtx, /*forward=*/false);
    size_t count = 0;
    while (auto next = oplogReverseCursor->next()) {
        const BSONObj entry = next->data.releaseToBson();
        const RecordId id = next->id;
        count++;

        const auto tsElem = entry["ts"];
        if (count == 1) {
            if (tsElem.eoo())
                LOG(2) << "Oplog tail entry: " << redact(entry);
            else
                LOG(2) << "Oplog tail entry ts field: " << tsElem;
        }

        if (tsElem.timestamp() < truncateTimestamp) {
            // If count == 1, that means that we have nothing to delete because everything in the
            // oplog is < truncateTimestamp.
            if (count != 1) {
                invariant(!oldestIDToDelete.isNull());
                oplogCollection->cappedTruncateAfter(opCtx, oldestIDToDelete, /*inclusive=*/true);
            }
            log() << "Replication recovery oplog truncation finished in: " << timer.millis()
                  << "ms";
            return;
        }

        oldestIDToDelete = id;
    }

    severe() << "Reached end of oplog looking for oplog entry before "
             << truncateTimestamp.toBSON() << " but couldn't find any after looking through "
             << count << " entries.";
    fassertFailedNoTrace(40296);
}

PlanStage::StageState IDHackStage::doWork(WorkingSetID* out) {
    if (_done) {
        return PlanStage::IS_EOF;
    }

    WorkingSetID id = WorkingSet::INVALID_ID;
    try {
        // Look up the key by going directly to the index.
        RecordId recordId = indexAccessMethod()->findSingle(getOpCtx(), _key);

        // Key not found.
        if (recordId.isNull()) {
            _done = true;
            return PlanStage::IS_EOF;
        }

        ++_specificStats.keysExamined;
        ++_specificStats.docsExamined;

        // Create a new WSM for the result document.
        id = _workingSet->allocate();
        WorkingSetMember* member = _workingSet->get(id);
        member->recordId = recordId;
        _workingSet->transitionToRecordIdAndIdx(id);

        if (!_recordCursor)
            _recordCursor = collection()->getCursor(getOpCtx());

        // Find the document associated with 'id' in the collection's record store.
        if (!WorkingSetCommon::fetch(getOpCtx(), _workingSet, id, _recordCursor)) {
            // We didn't find a document with RecordId 'id'.
            _workingSet->free(id);
            _commonStats.isEOF = true;
            _done = true;
            return IS_EOF;
        }

        return advance(id, member, out);
    } catch (const WriteConflictException&) {
        // Restart at the beginning on retry.
        _recordCursor.reset();
        if (id != WorkingSet::INVALID_ID)
            _workingSet->free(id);

        *out = WorkingSet::INVALID_ID;
        return NEED_YIELD;
    }
}

bool TerarkDbRecordStore::findRecord(OperationContext* txn,
                                     const RecordId& id,
                                     RecordData* out) const {
    if (id.isNull())
        return false;

    // Convert the 1-based RecordId into Terark's 0-based row index.
    llong recIdx = id.repr() - 1;
    CompositeTable* tab = m_table->m_tab.get();
    auto& td = m_table->getMyThreadData();
    tab->getValue(recIdx, &td.m_buf, &*td.m_dbCtx);
    SharedBuffer bson = td.m_coder.decode(&tab->rowSchema(), td.m_buf);

    //  size_t bufsize = sizeof(SharedBuffer::Holder) + bson.objsize();
    int bufsize = ConstDataView(bson.get()).read<LittleEndian<int>>();
    *out = RecordData(bson, bufsize);
    return true;
}

int nRecords() const {
    int count = 0;
    const Extent* ext;
    for (RecordId extLoc = nsd()->firstExtent(); !extLoc.isNull(); extLoc = ext->xnext) {
        ext = extentManager()->getExtent(extLoc);
        int fileNo = ext->firstRecord.a();
        if (fileNo == -1)
            continue;
        for (int recOfs = ext->firstRecord.getOfs(); recOfs != RecordId::NullOfs;
             recOfs = recordStore()->recordFor(RecordId(fileNo, recOfs))->nextOfs()) {
            ++count;
        }
    }
    ASSERT_EQUALS(count, nsd()->numRecords());
    return count;
}

InMemoryRecordIterator::InMemoryRecordIterator(OperationContext* txn,
                                               const InMemoryRecordStore::Records& records,
                                               const InMemoryRecordStore& rs,
                                               RecordId start,
                                               bool tailable)
    : _txn(txn),
      _tailable(tailable),
      _lastLoc(RecordId::min()),
      _killedByInvalidate(false),
      _records(records),
      _rs(rs) {
    if (start.isNull()) {
        _it = _records.begin();
    } else {
        _it = _records.find(start);
        invariant(_it != _records.end());
    }
}

CollectionOptions MMAPV1DatabaseCatalogEntry::getCollectionOptions(OperationContext* txn,
                                                                   RecordId rid) const {
    CollectionOptions options;

    if (rid.isNull()) {
        return options;
    }

    RecordStoreV1Base* rs = _getNamespaceRecordStore();
    invariant(rs);

    RecordData data;
    invariant(rs->findRecord(txn, rid, &data));

    if (data.releaseToBson()["options"].isABSONObj()) {
        Status status = options.parse(data.releaseToBson()["options"].Obj());
        fassert(18523, status);
    }
    return options;
}

InMemoryRecordReverseIterator::InMemoryRecordReverseIterator(
    OperationContext* txn,
    const InMemoryRecordStore::Records& records,
    const InMemoryRecordStore& rs,
    RecordId start)
    : _txn(txn), _killedByInvalidate(false), _records(records), _rs(rs) {
    if (start.isNull()) {
        _it = _records.rbegin();
    } else {
        // The reverse iterator will point to the preceding element, so we
        // increment the base iterator to make it point past the found element
        InMemoryRecordStore::Records::const_iterator baseIt(++_records.find(start));
        _it = InMemoryRecordStore::Records::const_reverse_iterator(baseIt);
        invariant(_it != _records.rend());
    }
}

void run() {
    create();
    nsd()->deletedListEntry(2) =
        nsd()->cappedListOfAllDeletedRecords().drec()->nextDeleted().drec()->nextDeleted();
    nsd()->cappedListOfAllDeletedRecords().drec()->nextDeleted().drec()->nextDeleted().writing() =
        RecordId();
    nsd()->cappedLastDelRecLastExtent().Null();

    NamespaceDetails* d = nsd();
    zero(&d->capExtent());
    zero(&d->capFirstNewRecord());

    // this has a side effect of calling NamespaceDetails::cappedCheckMigrate
    db()->namespaceIndex().details(ns());

    ASSERT(nsd()->firstExtent() == nsd()->capExtent());
    ASSERT(nsd()->capExtent().getOfs() != 0);
    ASSERT(!nsd()->capFirstNewRecord().isValid());
    int nDeleted = 0;
    for (RecordId i = nsd()->cappedListOfAllDeletedRecords(); !i.isNull();
         i = i.drec()->nextDeleted(), ++nDeleted)
        ;
    ASSERT_EQUALS(10, nDeleted);
    ASSERT(nsd()->cappedLastDelRecLastExtent().isNull());
}

PlanStage::StageState OplogStart::workExtentHopping(WorkingSetID* out) {
    if (_done || _subIterators.empty()) {
        return PlanStage::IS_EOF;
    }

    // we work from the back to the front since the back has the newest data.
    const RecordId loc = _subIterators.back()->getNext();
    _subIterators.popAndDeleteBack();

    // TODO: should we ever try and return NEED_FETCH here?
    if (!loc.isNull() && !_filter->matchesBSON(_collection->docFor(_txn, loc))) {
        _done = true;
        WorkingSetID id = _workingSet->allocate();
        WorkingSetMember* member = _workingSet->get(id);
        member->loc = loc;
        member->obj = _collection->docFor(_txn, member->loc);
        member->state = WorkingSetMember::LOC_AND_UNOWNED_OBJ;
        *out = id;
        return PlanStage::ADVANCED;
    }

    return PlanStage::NEED_TIME;
}

bool Helpers::findById(OperationContext* txn,
                       Database* database,
                       const char* ns,
                       BSONObj query,
                       BSONObj& result,
                       bool* nsFound,
                       bool* indexFound) {
    invariant(database);

    Collection* collection = database->getCollection(ns);
    if (!collection) {
        return false;
    }

    if (nsFound)
        *nsFound = true;

    IndexCatalog* catalog = collection->getIndexCatalog();
    const IndexDescriptor* desc = catalog->findIdIndex(txn);
    if (!desc)
        return false;

    if (indexFound)
        *indexFound = true;

    // See SERVER-12397.  This may not always be true.
    BtreeBasedAccessMethod* accessMethod =
        static_cast<BtreeBasedAccessMethod*>(catalog->getIndex(desc));

    RecordId loc = accessMethod->findSingle(txn, query["_id"].wrap());
    if (loc.isNull())
        return false;
    result = collection->docFor(txn, loc).value();
    return true;
}

PlanStage::StageState IDHackStage::doWork(WorkingSetID* out) {
    if (_done) {
        return PlanStage::IS_EOF;
    }

    if (WorkingSet::INVALID_ID != _idBeingPagedIn) {
        invariant(_recordCursor);
        WorkingSetID id = _idBeingPagedIn;
        _idBeingPagedIn = WorkingSet::INVALID_ID;

        invariant(WorkingSetCommon::fetchIfUnfetched(getOpCtx(), _workingSet, id, _recordCursor));

        WorkingSetMember* member = _workingSet->get(id);
        return advance(id, member, out);
    }

    WorkingSetID id = WorkingSet::INVALID_ID;
    try {
        // Look up the key by going directly to the index.
        RecordId recordId = _accessMethod->findSingle(getOpCtx(), _key);

        // Key not found.
        if (recordId.isNull()) {
            _done = true;
            return PlanStage::IS_EOF;
        }

        ++_specificStats.keysExamined;
        ++_specificStats.docsExamined;

        // Create a new WSM for the result document.
        id = _workingSet->allocate();
        WorkingSetMember* member = _workingSet->get(id);
        member->recordId = recordId;
        _workingSet->transitionToRecordIdAndIdx(id);

        if (!_recordCursor)
            _recordCursor = _collection->getCursor(getOpCtx());

        // We may need to request a yield while we fetch the document.
        if (auto fetcher = _recordCursor->fetcherForId(recordId)) {
            // There's something to fetch. Hand the fetcher off to the WSM, and pass up a
            // fetch request.
            _idBeingPagedIn = id;
            member->setFetcher(fetcher.release());
            *out = id;
            return NEED_YIELD;
        }

        // The doc was already in memory, so we go ahead and return it.
        if (!WorkingSetCommon::fetch(getOpCtx(), _workingSet, id, _recordCursor)) {
            // _id is immutable so the index would return the only record that could
            // possibly match the query.
            _workingSet->free(id);
            _commonStats.isEOF = true;
            _done = true;
            return IS_EOF;
        }

        return advance(id, member, out);
    } catch (const WriteConflictException& wce) {
        // Restart at the beginning on retry.
        _recordCursor.reset();
        if (id != WorkingSet::INVALID_ID)
            _workingSet->free(id);

        *out = WorkingSet::INVALID_ID;
        return NEED_YIELD;
    }
}

void syncFixUp(OperationContext* txn,
               FixUpInfo& fixUpInfo,
               OplogReader* oplogreader,
               ReplicationCoordinator* replCoord) {
    DBClientConnection* them = oplogreader->conn();

    // fetch all first so we needn't handle interruption in a fancy way
    unsigned long long totalSize = 0;

    list<pair<DocID, BSONObj> > goodVersions;

    BSONObj newMinValid;

    // fetch all the goodVersions of each document from current primary
    DocID doc;
    unsigned long long numFetched = 0;
    try {
        for (set<DocID>::iterator it = fixUpInfo.toRefetch.begin();
             it != fixUpInfo.toRefetch.end();
             it++) {
            doc = *it;

            verify(!doc._id.eoo());

            {
                // TODO : slow.  lots of round trips.
                numFetched++;
                BSONObj good =
                    them->findOne(doc.ns, doc._id.wrap(), NULL, QueryOption_SlaveOk).getOwned();
                totalSize += good.objsize();
                uassert(13410, "replSet too much data to roll back", totalSize < 300 * 1024 * 1024);

                // note good might be eoo, indicating we should delete it
                goodVersions.push_back(pair<DocID, BSONObj>(doc, good));
            }
        }
        newMinValid = oplogreader->getLastOp(rsOplogName);
        if (newMinValid.isEmpty()) {
            error() << "rollback error newMinValid empty?";
            return;
        }
    } catch (DBException& e) {
        LOG(1) << "rollback re-get objects: " << e.toString();
        error() << "rollback couldn't re-get ns:" << doc.ns << " _id:" << doc._id << ' '
                << numFetched << '/' << fixUpInfo.toRefetch.size();
        throw e;
    }

    log() << "rollback 3.5";
    if (fixUpInfo.rbid != getRBID(oplogreader->conn())) {
        // our source rolled back itself.  so the data we received isn't necessarily consistent.
        warning() << "rollback rbid on source changed during rollback, cancelling this attempt";
        return;
    }

    // update them
    log() << "rollback 4 n:" << goodVersions.size();

    bool warn = false;

    invariant(!fixUpInfo.commonPointOurDiskloc.isNull());
    invariant(txn->lockState()->isW());

    // we have items we are writing that aren't from a point-in-time.  thus best not to come
    // online until we get to that point in freshness.
    Timestamp minValid = newMinValid["ts"].timestamp();
    log() << "minvalid=" << minValid.toStringLong();
    setMinValid(txn, minValid);

    // any full collection resyncs required?
    if (!fixUpInfo.collectionsToResyncData.empty() ||
        !fixUpInfo.collectionsToResyncMetadata.empty()) {
        for (const string& ns : fixUpInfo.collectionsToResyncData) {
            log() << "rollback 4.1.1 coll resync " << ns;

            fixUpInfo.collectionsToResyncMetadata.erase(ns);

            const NamespaceString nss(ns);

            Database* db = dbHolder().openDb(txn, nss.db().toString());
            invariant(db);

            {
                WriteUnitOfWork wunit(txn);
                db->dropCollection(txn, ns);
                wunit.commit();
            }

            {
                string errmsg;

                // This comes as a GlobalWrite lock, so there is no DB to be acquired after
                // resume, so we can skip the DB stability checks. Also
                // copyCollectionFromRemote will acquire its own database pointer, under the
                // appropriate locks, so just releasing and acquiring the lock is safe.
                invariant(txn->lockState()->isW());
                Lock::TempRelease release(txn->lockState());

                bool ok = copyCollectionFromRemote(txn, them->getServerAddress(), ns, errmsg);
                uassert(15909,
                        str::stream() << "replSet rollback error resyncing collection " << ns
                                      << ' ' << errmsg,
                        ok);
            }
        }

        for (const string& ns : fixUpInfo.collectionsToResyncMetadata) {
            log() << "rollback 4.1.2 coll metadata resync " << ns;

            const NamespaceString nss(ns);
            auto db = dbHolder().openDb(txn, nss.db().toString());
            invariant(db);
            auto collection = db->getCollection(ns);
            invariant(collection);
            auto cce = collection->getCatalogEntry();

            const std::list<BSONObj> info =
                them->getCollectionInfos(nss.db().toString(), BSON("name" << nss.coll()));

            if (info.empty()) {
                // Collection dropped by "them" so we should drop it too.
                log() << ns << " not found on remote host, dropping";
                fixUpInfo.toDrop.insert(ns);
                continue;
            }

            invariant(info.size() == 1);

            CollectionOptions options;
            auto status = options.parse(info.front());
            if (!status.isOK()) {
                throw RSFatalException(str::stream() << "Failed to parse options " << info.front()
                                                     << ": " << status.toString());
            }

            WriteUnitOfWork wuow(txn);
            if (options.flagsSet || cce->getCollectionOptions(txn).flagsSet) {
                cce->updateFlags(txn, options.flags);
            }

            status = collection->setValidator(txn, options.validator);
            if (!status.isOK()) {
                throw RSFatalException(str::stream() << "Failed to set validator: "
                                                     << status.toString());
            }
            wuow.commit();
        }

        // we did more reading from primary, so check it again for a rollback (which would mess
        // us up), and make minValid newer.
        log() << "rollback 4.2";

        string err;
        try {
            newMinValid = oplogreader->getLastOp(rsOplogName);
            if (newMinValid.isEmpty()) {
                err = "can't get minvalid from sync source";
            } else {
                Timestamp minValid = newMinValid["ts"].timestamp();
                log() << "minvalid=" << minValid.toStringLong();
                setMinValid(txn, minValid);
            }
        } catch (DBException& e) {
            err = "can't get/set minvalid: ";
            err += e.what();
        }
        if (fixUpInfo.rbid != getRBID(oplogreader->conn())) {
            // our source rolled back itself.  so the data we received isn't necessarily
            // consistent.  however, we've now done writes.  thus we have a problem.
            err += "rbid at primary changed during resync/rollback";
        }
        if (!err.empty()) {
            severe() << "rolling back : " << err << ". A full resync will be necessary.";
            // TODO: reset minvalid so that we are permanently in fatal state
            // TODO: don't be fatal, but rather, get all the data first.
            throw RSFatalException();
        }

        log() << "rollback 4.3";
    }

    map<string, shared_ptr<Helpers::RemoveSaver> > removeSavers;

    log() << "rollback 4.6";
    // drop collections to drop before doing individual fixups - that might make things faster
    // below actually if there were subsequent inserts to rollback
    for (set<string>::iterator it = fixUpInfo.toDrop.begin(); it != fixUpInfo.toDrop.end(); it++) {
        log() << "rollback drop: " << *it;

        Database* db = dbHolder().get(txn, nsToDatabaseSubstring(*it));
        if (db) {
            WriteUnitOfWork wunit(txn);

            shared_ptr<Helpers::RemoveSaver>& removeSaver = removeSavers[*it];
            if (!removeSaver)
                removeSaver.reset(new Helpers::RemoveSaver("rollback", "", *it));

            // perform a collection scan and write all documents in the collection to disk
            boost::scoped_ptr<PlanExecutor> exec(
                InternalPlanner::collectionScan(txn, *it, db->getCollection(*it)));
            BSONObj curObj;
            PlanExecutor::ExecState execState;
            while (PlanExecutor::ADVANCED == (execState = exec->getNext(&curObj, NULL))) {
                removeSaver->goingToDelete(curObj);
            }
            if (execState != PlanExecutor::IS_EOF) {
                if (execState == PlanExecutor::FAILURE &&
                    WorkingSetCommon::isValidStatusMemberObject(curObj)) {
                    Status errorStatus = WorkingSetCommon::getMemberObjectStatus(curObj);
                    severe() << "rolling back createCollection on " << *it << " failed with "
                             << errorStatus << ". A full resync is necessary.";
                } else {
                    severe() << "rolling back createCollection on " << *it
                             << " failed. A full resync is necessary.";
                }
                throw RSFatalException();
            }

            db->dropCollection(txn, *it);
            wunit.commit();
        }
    }

    log() << "rollback 4.7";
    OldClientContext ctx(txn, rsOplogName);
    Collection* oplogCollection = ctx.db()->getCollection(rsOplogName);
    uassert(13423,
            str::stream() << "replSet error in rollback can't find " << rsOplogName,
            oplogCollection);

    unsigned deletes = 0, updates = 0;
    time_t lastProgressUpdate = time(0);
    time_t progressUpdateGap = 10;
    for (list<pair<DocID, BSONObj> >::iterator it = goodVersions.begin();
         it != goodVersions.end();
         it++) {
        time_t now = time(0);
        if (now - lastProgressUpdate > progressUpdateGap) {
            log() << deletes << " delete and " << updates
                  << " update operations processed out of " << goodVersions.size()
                  << " total operations";
            lastProgressUpdate = now;
        }
        const DocID& doc = it->first;
        BSONObj pattern = doc._id.wrap();  // { _id : ... }
        try {
            verify(doc.ns && *doc.ns);
            if (fixUpInfo.collectionsToResyncData.count(doc.ns)) {
                // we just synced this entire collection
                continue;
            }

            // keep an archive of items rolled back
            shared_ptr<Helpers::RemoveSaver>& removeSaver = removeSavers[doc.ns];
            if (!removeSaver)
                removeSaver.reset(new Helpers::RemoveSaver("rollback", "", doc.ns));

            // todo: lots of overhead in context, this can be faster
            OldClientContext ctx(txn, doc.ns);

            // Add the doc to our rollback file
            BSONObj obj;
            Collection* collection = ctx.db()->getCollection(doc.ns);

            // Do not log an error when undoing an insert on a no longer existent collection.
            // It is likely that the collection was dropped as part of rolling back a
            // createCollection command and regardless, the document no longer exists.
            if (collection) {
                bool found = Helpers::findOne(txn, collection, pattern, obj, false);
                if (found) {
                    removeSaver->goingToDelete(obj);
                } else {
                    error() << "rollback cannot find object: " << pattern << " in namespace "
                            << doc.ns;
                }
            }

            if (it->second.isEmpty()) {
                // wasn't on the primary; delete.
                // TODO 1.6 : can't delete from a capped collection.  need to handle that here.
                deletes++;

                if (collection) {
                    if (collection->isCapped()) {
                        // can't delete from a capped collection - so we truncate instead. if
                        // this item must go, so must all successors!!!
                        try {
                            // TODO: IIRC cappedTruncateAfter does not handle completely empty.
                            // this will be crazy slow if no _id index.
                            long long start = Listener::getElapsedTimeMillis();
                            RecordId loc = Helpers::findOne(txn, collection, pattern, false);
                            if (Listener::getElapsedTimeMillis() - start > 200)
                                warning() << "roll back slow no _id index for " << doc.ns
                                          << " perhaps?";
                            // would be faster but requires index:
                            // RecordId loc = Helpers::findById(nsd, pattern);
                            if (!loc.isNull()) {
                                try {
                                    collection->temp_cappedTruncateAfter(txn, loc, true);
                                } catch (DBException& e) {
                                    if (e.getCode() == 13415) {
                                        // hack: need to just make cappedTruncate do this...
                                        MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
                                            WriteUnitOfWork wunit(txn);
                                            uassertStatusOK(collection->truncate(txn));
                                            wunit.commit();
                                        }
                                        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(
                                            txn, "truncate", collection->ns().ns());
                                    } else {
                                        throw e;
                                    }
                                }
                            }
                        } catch (DBException& e) {
                            error() << "rolling back capped collection rec " << doc.ns << ' '
                                    << e.toString();
                        }
                    } else {

PlanStage::StageState IDHackStage::work(WorkingSetID* out) {
    ++_commonStats.works;

    // Adds the amount of time taken by work() to executionTimeMillis.
    ScopedTimer timer(&_commonStats.executionTimeMillis);

    if (_done) {
        return PlanStage::IS_EOF;
    }

    if (WorkingSet::INVALID_ID != _idBeingPagedIn) {
        invariant(_recordCursor);
        WorkingSetID id = _idBeingPagedIn;
        _idBeingPagedIn = WorkingSet::INVALID_ID;
        WorkingSetMember* member = _workingSet->get(id);

        invariant(WorkingSetCommon::fetchIfUnfetched(_txn, member, _recordCursor));

        return advance(id, member, out);
    }

    WorkingSetID id = WorkingSet::INVALID_ID;
    try {
        // Use the index catalog to get the id index.
        const IndexCatalog* catalog = _collection->getIndexCatalog();

        // Find the index we use.
        IndexDescriptor* idDesc = catalog->findIdIndex(_txn);
        if (NULL == idDesc) {
            _done = true;
            return PlanStage::IS_EOF;
        }

        // Look up the key by going directly to the index.
        RecordId loc = catalog->getIndex(idDesc)->findSingle(_txn, _key);

        // Key not found.
        if (loc.isNull()) {
            _done = true;
            return PlanStage::IS_EOF;
        }

        ++_specificStats.keysExamined;
        ++_specificStats.docsExamined;

        // Create a new WSM for the result document.
        id = _workingSet->allocate();
        WorkingSetMember* member = _workingSet->get(id);
        member->state = WorkingSetMember::LOC_AND_IDX;
        member->loc = loc;

        if (!_recordCursor)
            _recordCursor = _collection->getCursor(_txn);

        // We may need to request a yield while we fetch the document.
        if (auto fetcher = _recordCursor->fetcherForId(loc)) {
            // There's something to fetch. Hand the fetcher off to the WSM, and pass up a
            // fetch request.
            _idBeingPagedIn = id;
            member->setFetcher(fetcher.release());
            *out = id;
            _commonStats.needYield++;
            return NEED_YIELD;
        }

        // The doc was already in memory, so we go ahead and return it.
        if (!WorkingSetCommon::fetch(_txn, member, _recordCursor)) {
            // _id is immutable so the index would return the only record that could
            // possibly match the query.
            _workingSet->free(id);
            _commonStats.isEOF = true;
            _done = true;
            return IS_EOF;
        }

        return advance(id, member, out);
    } catch (const WriteConflictException& wce) {
        // Restart at the beginning on retry.
        _recordCursor.reset();
        if (id != WorkingSet::INVALID_ID)
            _workingSet->free(id);

        *out = WorkingSet::INVALID_ID;
        _commonStats.needYield++;
        return NEED_YIELD;
    }
}

PlanStage::StageState IDHackStage::work(WorkingSetID* out) {
    ++_commonStats.works;

    // Adds the amount of time taken by work() to executionTimeMillis.
    ScopedTimer timer(&_commonStats.executionTimeMillis);

    if (_done) {
        return PlanStage::IS_EOF;
    }

    if (WorkingSet::INVALID_ID != _idBeingPagedIn) {
        WorkingSetID id = _idBeingPagedIn;
        _idBeingPagedIn = WorkingSet::INVALID_ID;
        WorkingSetMember* member = _workingSet->get(id);
        WorkingSetCommon::completeFetch(_txn, member, _collection);
        return advance(id, member, out);
    }

    // Use the index catalog to get the id index.
    const IndexCatalog* catalog = _collection->getIndexCatalog();

    // Find the index we use.
    IndexDescriptor* idDesc = catalog->findIdIndex(_txn);
    if (NULL == idDesc) {
        _done = true;
        return PlanStage::IS_EOF;
    }

    // This may not be valid always.  See SERVER-12397.
    const BtreeBasedAccessMethod* accessMethod =
        static_cast<const BtreeBasedAccessMethod*>(catalog->getIndex(idDesc));

    // Look up the key by going directly to the Btree.
    RecordId loc = accessMethod->findSingle(_txn, _key);

    // Key not found.
    if (loc.isNull()) {
        _done = true;
        return PlanStage::IS_EOF;
    }

    ++_specificStats.keysExamined;
    ++_specificStats.docsExamined;

    // Create a new WSM for the result document.
    WorkingSetID id = _workingSet->allocate();
    WorkingSetMember* member = _workingSet->get(id);
    member->loc = loc;
    member->state = WorkingSetMember::LOC_AND_UNOWNED_OBJ;

    // We may need to request a yield while we fetch the document.
    std::auto_ptr<RecordFetcher> fetcher(_collection->documentNeedsFetch(_txn, loc));
    if (NULL != fetcher.get()) {
        // There's something to fetch. Hand the fetcher off to the WSM, and pass up a
        // fetch request.
        _idBeingPagedIn = id;
        member->setFetcher(fetcher.release());
        *out = id;
        _commonStats.needFetch++;
        return NEED_FETCH;
    }

    // The doc was already in memory, so we go ahead and return it.
    member->obj = _collection->docFor(_txn, member->loc);

    return advance(id, member, out);
}

Status MMAPV1DatabaseCatalogEntry::_renameSingleNamespace(OperationContext* txn,
                                                          StringData fromNS,
                                                          StringData toNS,
                                                          bool stayTemp) {
    // some sanity checking
    NamespaceDetails* fromDetails = _namespaceIndex.details(fromNS);
    if (!fromDetails)
        return Status(ErrorCodes::BadValue, "from namespace doesn't exist");

    if (_namespaceIndex.details(toNS))
        return Status(ErrorCodes::BadValue, "to namespace already exists");

    // at this point, we haven't done anything destructive yet

    // ----
    // actually start moving
    // ----

    // this could throw, but if it does we're ok
    _namespaceIndex.add_ns(txn, toNS, fromDetails);
    NamespaceDetails* toDetails = _namespaceIndex.details(toNS);

    try {
        toDetails->copyingFrom(txn, toNS, _namespaceIndex, fromDetails);  // fixes extraOffset
    } catch (DBException&) {
        // could end up here if .ns is full - if so try to clean up / roll back a little
        _namespaceIndex.kill_ns(txn, toNS);
        throw;
    }

    // at this point, code .ns stuff moved

    _namespaceIndex.kill_ns(txn, fromNS);
    fromDetails = NULL;

    // fix system.namespaces
    BSONObj newSpec;
    RecordId oldSpecLocation = getCollectionCatalogEntry(fromNS)->getNamespacesRecordId();
    invariant(!oldSpecLocation.isNull());
    {
        BSONObj oldSpec = _getNamespaceRecordStore()->dataFor(txn, oldSpecLocation).releaseToBson();
        invariant(!oldSpec.isEmpty());

        BSONObjBuilder b;
        BSONObjIterator i(oldSpec.getObjectField("options"));
        while (i.more()) {
            BSONElement e = i.next();
            if (strcmp(e.fieldName(), "create") != 0) {
                if (stayTemp || (strcmp(e.fieldName(), "temp") != 0))
                    b.append(e);
            } else {
                b << "create" << toNS;
            }
        }
        newSpec = b.obj();
    }

    RecordId rid = _addNamespaceToNamespaceCollection(txn, toNS, newSpec.isEmpty() ? 0 : &newSpec);

    _getNamespaceRecordStore()->deleteRecord(txn, oldSpecLocation);

    Entry*& entry = _collections[toNS.toString()];
    invariant(entry == NULL);
    txn->recoveryUnit()->registerChange(new EntryInsertion(toNS, this));
    entry = new Entry();
    _removeFromCache(txn->recoveryUnit(), fromNS);
    _insertInCache(txn, toNS, rid, entry);

    return Status::OK();
}

void pass(int p) {
    OperationContextImpl txn;
    create();
    ASSERT_EQUALS(2, nExtents());

    BSONObj b = bigObj();

    int N = MinExtentSize / b.objsize() * nExtents() + 5;
    int T = N - 4;

    RecordId truncAt;
    // RecordId l[ 8 ];
    for (int i = 0; i < N; ++i) {
        BSONObj bb = bigObj();
        StatusWith<RecordId> status = collection()->insertDocument(&txn, bb, true);
        ASSERT(status.isOK());
        RecordId a = status.getValue();
        if (T == i)
            truncAt = a;
        ASSERT(!a.isNull());
        /*ASSERT_EQUALS( i < 2 ? i + 1 : 3 + i % 2, nRecords() );
        if ( i > 3 )
            ASSERT( l[ i ] == l[ i - 4 ] );*/
    }
    ASSERT(nRecords() < N);

    RecordId last, first;
    {
        auto_ptr<Runner> runner(
            InternalPlanner::collectionScan(&txn, ns(), collection(), InternalPlanner::BACKWARD));
        runner->getNext(NULL, &last);
        ASSERT(!last.isNull());
    }
    {
        auto_ptr<Runner> runner(
            InternalPlanner::collectionScan(&txn, ns(), collection(), InternalPlanner::FORWARD));
        runner->getNext(NULL, &first);
        ASSERT(!first.isNull());
        ASSERT(first != last);
    }

    collection()->temp_cappedTruncateAfter(&txn, truncAt, false);
    ASSERT_EQUALS(collection()->numRecords(), 28u);

    {
        RecordId loc;
        auto_ptr<Runner> runner(
            InternalPlanner::collectionScan(&txn, ns(), collection(), InternalPlanner::FORWARD));
        runner->getNext(NULL, &loc);
        ASSERT(first == loc);
    }
    {
        auto_ptr<Runner> runner(
            InternalPlanner::collectionScan(&txn, ns(), collection(), InternalPlanner::BACKWARD));
        RecordId loc;
        runner->getNext(NULL, &loc);
        ASSERT(last != loc);
        ASSERT(!last.isNull());
    }

    // Too big
    BSONObjBuilder bob;
    bob.appendOID("_id", 0, true);
    bob.append("a", string(MinExtentSize + 300, 'a'));
    BSONObj bigger = bob.done();
    StatusWith<RecordId> status = collection()->insertDocument(&txn, bigger, true);
    ASSERT(!status.isOK());
    ASSERT_EQUALS(0, nRecords());
}

PlanStage::StageState CollectionScan::work(WorkingSetID* out) {
    ++_commonStats.works;

    // Adds the amount of time taken by work() to executionTimeMillis.
    ScopedTimer timer(&_commonStats.executionTimeMillis);

    if (_isDead) {
        return PlanStage::DEAD;
    }

    // Do some init if we haven't already.
    if (NULL == _iter) {
        if (_params.collection == NULL) {
            _isDead = true;
            return PlanStage::DEAD;
        }

        try {
            if (_lastSeenLoc.isNull()) {
                _iter.reset(
                    _params.collection->getIterator(_txn, _params.start, _params.direction));
            } else {
                invariant(_params.tailable);

                _iter.reset(
                    _params.collection->getIterator(_txn, _lastSeenLoc, _params.direction));

                // Advance _iter past where we were last time. If it returns something else,
                // mark us as dead since we want to signal an error rather than silently
                // dropping data from the stream. This is related to the _lastSeenLoc handling
                // in invalidate.
                if (_iter->getNext() != _lastSeenLoc) {
                    _isDead = true;
                    return PlanStage::DEAD;
                }
            }
        } catch (const WriteConflictException& wce) {
            // Leave us in a state to try again next time.
            _iter.reset();
            *out = WorkingSet::INVALID_ID;
            return PlanStage::NEED_YIELD;
        }

        ++_commonStats.needTime;
        return PlanStage::NEED_TIME;
    }

    // Should we try getNext() on the underlying _iter?
    if (isEOF())
        return PlanStage::IS_EOF;

    const RecordId curr = _iter->curr();
    if (curr.isNull()) {
        // We just hit EOF
        if (_params.tailable)
            _iter.reset();  // pick up where we left off on the next call to work()
        return PlanStage::IS_EOF;
    }

    _lastSeenLoc = curr;

    // See if the record we're about to access is in memory. If not, pass a fetch request up.
    // Note that curr() does not touch the record. This way, we are able to yield before
    // fetching the record.
    {
        std::auto_ptr<RecordFetcher> fetcher(_params.collection->documentNeedsFetch(_txn, curr));
        if (NULL != fetcher.get()) {
            WorkingSetMember* member = _workingSet->get(_wsidForFetch);
            member->loc = curr;
            // Pass the RecordFetcher off to the WSM.
            member->setFetcher(fetcher.release());
            *out = _wsidForFetch;
            _commonStats.needYield++;
            return NEED_YIELD;
        }
    }

    // Do this before advancing because it is more efficient while the iterator is still on this
    // document.
    const Snapshotted<BSONObj> obj = Snapshotted<BSONObj>(_txn->recoveryUnit()->getSnapshotId(),
                                                          _iter->dataFor(curr).releaseToBson());

    // Advance the iterator.
    try {
        invariant(_iter->getNext() == curr);
    } catch (const WriteConflictException& wce) {
        // If getNext throws, it leaves us on the original document.
        invariant(_iter->curr() == curr);
        *out = WorkingSet::INVALID_ID;
        return PlanStage::NEED_YIELD;
    }

    WorkingSetID id = _workingSet->allocate();
    WorkingSetMember* member = _workingSet->get(id);
    member->loc = curr;
    member->obj = obj;
    member->state = WorkingSetMember::LOC_AND_UNOWNED_OBJ;

    return returnIfMatches(member, id, out);
}