ClientCursor::ClientCursor(ClientCursorParams params,
                           CursorManager* cursorManager,
                           CursorId cursorId,
                           OperationContext* operationUsingCursor,
                           Date_t now)
    : _cursorid(cursorId),
      _nss(std::move(params.nss)),
      _authenticatedUsers(std::move(params.authenticatedUsers)),
      _lsid(operationUsingCursor->getLogicalSessionId()),
      _txnNumber(operationUsingCursor->getTxnNumber()),
      _readConcernLevel(params.readConcernLevel),
      _cursorManager(cursorManager),
      _originatingCommand(params.originatingCommandObj),
      _queryOptions(params.queryOptions),
      _exec(std::move(params.exec)),
      _operationUsingCursor(operationUsingCursor),
      _lastUseDate(now) {
    invariant(_cursorManager);
    invariant(_exec);
    invariant(_operationUsingCursor);

    cursorStatsOpen.increment();
    if (isNoTimeout()) {
        // Cursors normally time out after a period of inactivity to prevent excess memory use;
        // the no-timeout option prevents this cursor from being timed out.
        cursorStatsOpenNoTimeout.increment();
    }
}
void CursorCache::appendInfo(BSONObjBuilder& result) const {
    stdx::lock_guard<stdx::mutex> lk(_mutex);
    result.append("sharded", static_cast<int>(cursorStatsMultiTarget.get()));
    result.appendNumber("shardedEver", _shardedTotal);
    result.append("refs", static_cast<int>(cursorStatsSingleTarget.get()));
    result.append("totalOpen", static_cast<int>(cursorStatsTotalOpen.get()));
}
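// The cursorStats* counters read by CursorCache::appendInfo() above follow a pattern visible
// elsewhere in this section: a file-static Counter64 paired with a
// ServerStatusMetricField<Counter64> that publishes it under a serverStatus metric path, with
// call sites bumping it via increment() or decrement(). A minimal sketch of that wiring follows;
// the counter exampleCursorsOpened, the helper noteExampleCursorOpened(), and the metric path
// "cursor.example.opened" are hypothetical names chosen only to illustrate the pattern, not
// identifiers from the codebase.
static Counter64 exampleCursorsOpened;  // hypothetical counter
static ServerStatusMetricField<Counter64> displayExampleCursorsOpened(
    "cursor.example.opened", &exampleCursorsOpened);  // hypothetical metric path

void noteExampleCursorOpened() {
    // Counter64 exposes increment()/decrement()/get(), as used throughout these snippets.
    exampleCursorsOpened.increment();
}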
//------------------[ copy constructor ]---------------------------------
Counter64::Counter64(const Counter64& ctr64) : SnmpSyntax(ctr64) {
    smival.syntax = sNMP_SYNTAX_CNTR64;
    smival.value.hNumber.hipart = ctr64.high();
    smival.value.hNumber.lopart = ctr64.low();
}
ClientCursor::~ClientCursor() {
    // Cursors must be unpinned and deregistered from their cursor manager before being deleted.
    invariant(!_operationUsingCursor);
    invariant(_disposed);

    cursorStatsOpen.decrement();
    if (isNoTimeout()) {
        cursorStatsOpenNoTimeout.decrement();
    }
}
void CursorCache::removeRef(long long id) {
    verify(id);
    stdx::lock_guard<stdx::mutex> lk(_mutex);
    _refs.erase(id);
    _refsNS.erase(id);
    cursorStatsSingleTarget.decrement();
}
void ClientCursorPin::deleteUnderlying() {
    invariant(_cursor);
    invariant(_cursor->_operationUsingCursor);
    // Note the following subtleties of this method's implementation:
    // - We must unpin the cursor before destruction, since it is an error to delete a pinned
    //   cursor.
    // - In addition, we must deregister the cursor before unpinning, since it is an
    //   error to unpin a registered cursor without holding the cursor manager lock (note that
    //   we can't simply unpin with the cursor manager lock here, since we need to guarantee
    //   exclusive ownership of the cursor when we are deleting it).

    // Note it's not safe to dereference _cursor->_cursorManager unless we know we haven't been
    // killed. If we're not locked we assume we haven't been killed because we're working with the
    // global cursor manager which never kills cursors.
    dassert(_opCtx->lockState()->isCollectionLockedForMode(_cursor->_nss.ns(), MODE_IS) ||
            _cursor->_cursorManager->isGlobalManager());

    if (!_cursor->getExecutor()->isMarkedAsKilled()) {
        _cursor->_cursorManager->deregisterCursor(_cursor);
    }

    // Make sure the cursor is disposed and unpinned before being destroyed.
    _cursor->dispose(_opCtx);
    _cursor->_operationUsingCursor = nullptr;
    delete _cursor;

    cursorStatsOpenPinned.decrement();
    _cursor = nullptr;
}
Status waitForWriteConcern(OperationContext* txn,
                           const OpTime& replOpTime,
                           const WriteConcernOptions& writeConcern,
                           WriteConcernResult* result) {
    // We assume all options have been validated earlier; if they haven't, that is a programming
    // error.
    dassert(validateWriteConcern(writeConcern).isOK());

    // Next handle blocking on disk.
    Timer syncTimer;

    switch (writeConcern.syncMode) {
        case WriteConcernOptions::NONE:
            break;
        case WriteConcernOptions::FSYNC: {
            StorageEngine* storageEngine = getGlobalServiceContext()->getGlobalStorageEngine();
            if (!storageEngine->isDurable()) {
                result->fsyncFiles = storageEngine->flushAllFiles(true);
            } else {
                // We only need to commit the journal if we're durable.
                txn->recoveryUnit()->waitUntilDurable();
            }
            break;
        }
        case WriteConcernOptions::JOURNAL:
            txn->recoveryUnit()->waitUntilDurable();
            break;
    }

    result->syncMillis = syncTimer.millis();

    // Now wait for replication.

    if (replOpTime.isNull()) {
        // No write happened for this client yet.
        return Status::OK();
    }

    // Needed to avoid incrementing gleWtimeStats; see SERVER-9005.
    if (writeConcern.wNumNodes <= 1 && writeConcern.wMode.empty()) {
        // No replication check was requested.
        return Status::OK();
    }

    // Now we wait for replication.
    // Note that replica set stepdowns and gle mode changes are thrown as errors.
    repl::ReplicationCoordinator::StatusAndDuration replStatus =
        repl::getGlobalReplicationCoordinator()->awaitReplication(txn, replOpTime, writeConcern);
    if (replStatus.status == ErrorCodes::WriteConcernFailed) {
        gleWtimeouts.increment();
        result->err = "timeout";
        result->wTimedOut = true;
    }

    // Add stats.
    result->writtenTo = repl::getGlobalReplicationCoordinator()->getHostsWrittenTo(replOpTime);
    gleWtimeStats.recordMillis(durationCount<Milliseconds>(replStatus.duration));
    result->wTime = durationCount<Milliseconds>(replStatus.duration);

    return replStatus.status;
}
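// A hedged usage sketch for waitForWriteConcern() above. Only the function's signature and the
// WriteConcernOptions/WriteConcernResult fields visible in the snippet (syncMode, wNumNodes,
// wMode, err, wTimedOut) are taken from the source; the caller name, the assumption that
// WriteConcernOptions is default-constructible, and the way the caller obtains 'lastOpTime' are
// illustrative assumptions.
Status exampleWaitForReplicatedWrite(OperationContext* txn, const OpTime& lastOpTime) {
    WriteConcernOptions writeConcern;                      // assumed default-constructible
    writeConcern.syncMode = WriteConcernOptions::JOURNAL;  // block until the journal is durable
    writeConcern.wNumNodes = 2;                            // also wait for one secondary

    WriteConcernResult result;
    Status status = waitForWriteConcern(txn, lastOpTime, writeConcern, &result);
    if (result.wTimedOut) {
        // waitForWriteConcern() sets result.err to "timeout" when the replication wait times out.
    }
    return status;
}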
void CursorCache::removeRef(long long id) {
    verify(id);
    scoped_lock lk(_mutex);
    _refs.erase(id);
    _refsNS.erase(id);
    cursorStatsSingleTarget.decrement();
}
void CursorCache::storeRef(const std::string& server, long long id, const std::string& ns) {
    LOG(_myLogLevel) << "CursorCache::storeRef server: " << server << " id: " << id << endl;
    verify(id);
    stdx::lock_guard<stdx::mutex> lk(_mutex);
    _refs[id] = server;
    _refsNS[id] = ns;
    cursorStatsSingleTarget.increment();
}
OplogReader::OplogReader() {
    _tailingQueryOptions = QueryOption_SlaveOk;
    _tailingQueryOptions |= QueryOption_CursorTailable | QueryOption_OplogReplay;

    /* TODO: slaveOk maybe shouldn't use? */
    _tailingQueryOptions |= QueryOption_AwaitData;

    readersCreatedStats.increment();
}
Status Collection::recordStoreGoingToMove(OperationContext* txn,
                                          const RecordId& oldLocation,
                                          const char* oldBuffer,
                                          size_t oldSize) {
    moveCounter.increment();
    _cursorManager.invalidateDocument(txn, oldLocation, INVALIDATION_DELETION);
    _indexCatalog.unindexRecord(txn, BSONObj(oldBuffer), oldLocation, true);
    return Status::OK();
}
ClientCursorPin::ClientCursorPin(OperationContext* opCtx, ClientCursor* cursor)
    : _opCtx(opCtx), _cursor(cursor) {
    invariant(_cursor);
    invariant(_cursor->_operationUsingCursor);
    invariant(_cursor->_cursorManager);
    invariant(!_cursor->_disposed);

    // We keep track of the number of cursors currently pinned. The cursor can become unpinned
    // either by being released back to the cursor manager or by being deleted. A cursor may be
    // transferred to another pin object via move construction or move assignment, but in this
    // case it is still considered pinned.
    cursorStatsOpenPinned.increment();
}
void OpDebug::recordStats() {
    if (nreturned > 0)
        returnedCounter.increment(nreturned);
    if (ninserted > 0)
        insertedCounter.increment(ninserted);
    if (nMatched > 0)
        updatedCounter.increment(nMatched);
    if (ndeleted > 0)
        deletedCounter.increment(ndeleted);
    if (nscanned > 0)
        scannedCounter.increment(nscanned);
    if (nscannedObjects > 0)
        scannedObjectCounter.increment(nscannedObjects);

    if (idhack)
        idhackCounter.increment();
    if (scanAndOrder)
        scanAndOrderCounter.increment();
    if (fastmod)
        fastmodCounter.increment();
    if (writeConflicts)
        writeConflictsCounter.increment(writeConflicts);
}
void OpDebug::recordStats() {
    if (nreturned > 0)
        returnedCounter.increment(nreturned);
    if (ninserted > 0)
        insertedCounter.increment(ninserted);
    if (nupdated > 0)
        updatedCounter.increment(nupdated);
    if (ndeleted > 0)
        deletedCounter.increment(ndeleted);
    if (nscanned > 0)
        scannedCounter.increment(nscanned);

    if (idhack)
        idhackCounter.increment();
    if (scanAndOrder)
        scanAndOrderCounter.increment();
    if (fastmod)
        fastmodCounter.increment();
}
ShardedClientCursor::ShardedClientCursor(QueryMessage& q, ParallelSortClusteredCursor* cursor) {
    verify(cursor);
    _cursor = cursor;

    _skip = q.ntoskip;
    _ntoreturn = q.ntoreturn;

    _totalSent = 0;
    _done = false;

    _id = 0;

    if (q.queryOptions & QueryOption_NoCursorTimeout) {
        _lastAccessMillis = 0;
    } else
        _lastAccessMillis = Listener::getElapsedTimeMillis();

    cursorStatsMultiTarget.increment();
}
virtual void run() {
    Client::initThread(name().c_str());

    while (!inShutdown()) {
        sleepsecs(60);

        LOG(3) << "TTLMonitor thread awake" << endl;

        if (lockedForWriting()) {
            // Note: this is not perfect, as you can go into fsync+lock between
            // this check and actually doing the delete later.
            LOG(3) << " locked for writing" << endl;
            continue;
        }

        // If part of a replica set but not in a readable state (e.g. during initial sync), skip.
        if (theReplSet && !theReplSet->state().readable())
            continue;

        set<string> dbs;
        {
            Lock::DBRead lk("local");
            dbHolder().getAllShortNames(dbs);
        }

        ttlPasses.increment();

        for (set<string>::const_iterator i = dbs.begin(); i != dbs.end(); ++i) {
            string db = *i;
            try {
                doTTLForDB(db);
            } catch (DBException& e) {
                error() << "error processing ttl for db: " << db << " " << e << endl;
            }
        }
    }
}
/* Apply the oplog entry passed in param op.
   @return bool success (true) or failure (false)
*/
bool SyncTail::syncApply(OperationContext* txn, const BSONObj& op, bool convertUpdateToUpsert) {
    const char* ns = op.getStringField("ns");
    verify(ns);

    if ((*ns == '\0') || (*ns == '.')) {
        // This is ugly: such an entry is often a no-op, but we can't be 100% sure.
        if (*op.getStringField("op") != 'n') {
            error() << "replSet skipping bad op in oplog: " << op.toString() << rsLog;
        }
        return true;
    }

    bool isCommand(op["op"].valuestrsafe()[0] == 'c');

    boost::scoped_ptr<Lock::ScopedLock> lk;

    if (isCommand) {
        // A command may need a global write lock, so we conservatively go
        // ahead and grab one here. Suboptimal. :-(
        lk.reset(new Lock::GlobalWrite(txn->lockState()));
    } else {
        // DB-level lock for this operation.
        lk.reset(new Lock::DBWrite(txn->lockState(), ns));
    }

    Client::Context ctx(ns);
    ctx.getClient()->curop()->reset();

    // For non-initial-sync, we convert updates to upserts
    // to suppress errors when replaying oplog entries.
    bool ok = !applyOperation_inlock(txn, ctx.db(), op, true, convertUpdateToUpsert);
    opsAppliedStats.increment();
    txn->recoveryUnit()->commitIfNeeded();

    return ok;
}
void ClientCursorPin::release() {
    if (!_cursor)
        return;

    // Note it's not safe to dereference _cursor->_cursorManager unless we know we haven't been
    // killed. If we're not locked we assume we haven't been killed because we're working with the
    // global cursor manager which never kills cursors.
    dassert(_opCtx->lockState()->isCollectionLockedForMode(_cursor->_nss.ns(), MODE_IS) ||
            _cursor->_cursorManager->isGlobalManager());

    invariant(_cursor->_operationUsingCursor);

    if (_cursor->getExecutor()->isMarkedAsKilled()) {
        // The ClientCursor was killed while we had it. Therefore, it is our responsibility to
        // call dispose() and delete it.
        deleteUnderlying();
    } else {
        // Unpin the cursor under the collection cursor manager lock.
        _cursor->_cursorManager->unpin(_opCtx, _cursor);
        cursorStatsOpenPinned.decrement();
    }

    _cursor = nullptr;
}
namespace mongo { using std::endl; using std::vector; static Counter64 freelistAllocs; static Counter64 freelistBucketExhausted; static Counter64 freelistIterations; // TODO figure out what to do about these. static ServerStatusMetricField<Counter64> dFreelist1("storage.freelist.search.requests", &freelistAllocs); static ServerStatusMetricField<Counter64> dFreelist2("storage.freelist.search.bucketExhausted", &freelistBucketExhausted); static ServerStatusMetricField<Counter64> dFreelist3("storage.freelist.search.scanned", &freelistIterations); SimpleRecordStoreV1::SimpleRecordStoreV1(OperationContext* txn, StringData ns, RecordStoreV1MetaData* details, ExtentManager* em, bool isSystemIndexes) : RecordStoreV1Base(ns, details, em, isSystemIndexes) { invariant(!details->isCapped()); _normalCollection = NamespaceString::normal(ns); } SimpleRecordStoreV1::~SimpleRecordStoreV1() {} DiskLoc SimpleRecordStoreV1::_allocFromExistingExtents(OperationContext* txn, int lenToAllocRaw) { // Slowly drain the deletedListLegacyGrabBag by popping one record off and putting it in the // correct deleted list each time we try to allocate a new record. This ensures we won't // orphan any data when upgrading from old versions, without needing a long upgrade phase. // This is done before we try to allocate the new record so we can take advantage of the new // space immediately. { const DiskLoc head = _details->deletedListLegacyGrabBag(); if (!head.isNull()) { _details->setDeletedListLegacyGrabBag(txn, drec(head)->nextDeleted()); addDeletedRec(txn, head); } } // align size up to a multiple of 4 const int lenToAlloc = (lenToAllocRaw + (4 - 1)) & ~(4 - 1); freelistAllocs.increment(); DiskLoc loc; DeletedRecord* dr = NULL; { int myBucket; for (myBucket = bucket(lenToAlloc); myBucket < Buckets; myBucket++) { // Only look at the first entry in each bucket. This works because we are either // quantizing or allocating fixed-size blocks. const DiskLoc head = _details->deletedListEntry(myBucket); if (head.isNull()) continue; DeletedRecord* const candidate = drec(head); if (candidate->lengthWithHeaders() >= lenToAlloc) { loc = head; dr = candidate; break; } } if (!dr) return DiskLoc(); // no space // Unlink ourself from the deleted list _details->setDeletedListEntry(txn, myBucket, dr->nextDeleted()); *txn->recoveryUnit()->writing(&dr->nextDeleted()) = DiskLoc().setInvalid(); // defensive } invariant(dr->extentOfs() < loc.getOfs()); // Split the deleted record if it has at least as much left over space as our smallest // allocation size. Otherwise, just take the whole DeletedRecord. 
const int remainingLength = dr->lengthWithHeaders() - lenToAlloc; if (remainingLength >= bucketSizes[0]) { txn->recoveryUnit()->writingInt(dr->lengthWithHeaders()) = lenToAlloc; const DiskLoc newDelLoc = DiskLoc(loc.a(), loc.getOfs() + lenToAlloc); DeletedRecord* newDel = txn->recoveryUnit()->writing(drec(newDelLoc)); newDel->extentOfs() = dr->extentOfs(); newDel->lengthWithHeaders() = remainingLength; newDel->nextDeleted().Null(); addDeletedRec(txn, newDelLoc); } return loc; } StatusWith<DiskLoc> SimpleRecordStoreV1::allocRecord(OperationContext* txn, int lengthWithHeaders, bool enforceQuota) { if (lengthWithHeaders > MaxAllowedAllocation) { return StatusWith<DiskLoc>( ErrorCodes::InvalidLength, str::stream() << "Attempting to allocate a record larger than maximum size: " << lengthWithHeaders << " > 16.5MB"); } DiskLoc loc = _allocFromExistingExtents(txn, lengthWithHeaders); if (!loc.isNull()) return StatusWith<DiskLoc>(loc); LOG(1) << "allocating new extent"; increaseStorageSize( txn, _extentManager->followupSize(lengthWithHeaders, _details->lastExtentSize(txn)), enforceQuota); loc = _allocFromExistingExtents(txn, lengthWithHeaders); if (!loc.isNull()) { // got on first try return StatusWith<DiskLoc>(loc); } log() << "warning: alloc() failed after allocating new extent. " << "lengthWithHeaders: " << lengthWithHeaders << " last extent size:" << _details->lastExtentSize(txn) << "; trying again"; for (int z = 0; z < 10 && lengthWithHeaders > _details->lastExtentSize(txn); z++) { log() << "try #" << z << endl; increaseStorageSize( txn, _extentManager->followupSize(lengthWithHeaders, _details->lastExtentSize(txn)), enforceQuota); loc = _allocFromExistingExtents(txn, lengthWithHeaders); if (!loc.isNull()) return StatusWith<DiskLoc>(loc); } return StatusWith<DiskLoc>(ErrorCodes::InternalError, "cannot allocate space"); } Status SimpleRecordStoreV1::truncate(OperationContext* txn) { const DiskLoc firstExtLoc = _details->firstExtent(txn); if (firstExtLoc.isNull() || !firstExtLoc.isValid()) { // Already empty return Status::OK(); } // Free all extents except the first. Extent* firstExt = _extentManager->getExtent(firstExtLoc); if (!firstExt->xnext.isNull()) { const DiskLoc extNextLoc = firstExt->xnext; const DiskLoc oldLastExtLoc = _details->lastExtent(txn); Extent* const nextExt = _extentManager->getExtent(extNextLoc); // Unlink other extents; *txn->recoveryUnit()->writing(&nextExt->xprev) = DiskLoc(); *txn->recoveryUnit()->writing(&firstExt->xnext) = DiskLoc(); _details->setLastExtent(txn, firstExtLoc); _details->setLastExtentSize(txn, firstExt->length); _extentManager->freeExtents(txn, extNextLoc, oldLastExtLoc); } // Make the first (now only) extent a single large deleted record. *txn->recoveryUnit()->writing(&firstExt->firstRecord) = DiskLoc(); *txn->recoveryUnit()->writing(&firstExt->lastRecord) = DiskLoc(); _details->orphanDeletedList(txn); addDeletedRec(txn, _findFirstSpot(txn, firstExtLoc, firstExt)); // Make stats reflect that there are now no documents in this record store. 
_details->setStats(txn, 0, 0); return Status::OK(); } void SimpleRecordStoreV1::addDeletedRec(OperationContext* txn, const DiskLoc& dloc) { DeletedRecord* d = drec(dloc); int b = bucket(d->lengthWithHeaders()); *txn->recoveryUnit()->writing(&d->nextDeleted()) = _details->deletedListEntry(b); _details->setDeletedListEntry(txn, b, dloc); } std::unique_ptr<SeekableRecordCursor> SimpleRecordStoreV1::getCursor(OperationContext* txn, bool forward) const { return stdx::make_unique<SimpleRecordStoreV1Iterator>(txn, this, forward); } vector<std::unique_ptr<RecordCursor>> SimpleRecordStoreV1::getManyCursors( OperationContext* txn) const { vector<std::unique_ptr<RecordCursor>> cursors; const Extent* ext; for (DiskLoc extLoc = details()->firstExtent(txn); !extLoc.isNull(); extLoc = ext->xnext) { ext = _getExtent(txn, extLoc); if (ext->firstRecord.isNull()) continue; cursors.push_back( stdx::make_unique<RecordStoreV1Base::IntraExtentIterator>(txn, ext->firstRecord, this)); } return cursors; } class CompactDocWriter final : public DocWriter { public: /** * param allocationSize - allocation size WITH header */ CompactDocWriter(const MmapV1RecordHeader* rec, unsigned dataSize, size_t allocationSize) : _rec(rec), _dataSize(dataSize), _allocationSize(allocationSize) {} virtual ~CompactDocWriter() {} virtual void writeDocument(char* buf) const { memcpy(buf, _rec->data(), _dataSize); } virtual size_t documentSize() const { return _allocationSize - MmapV1RecordHeader::HeaderSize; } virtual bool addPadding() const { return false; } private: const MmapV1RecordHeader* _rec; size_t _dataSize; size_t _allocationSize; }; void SimpleRecordStoreV1::_compactExtent(OperationContext* txn, const DiskLoc extentLoc, int extentNumber, RecordStoreCompactAdaptor* adaptor, const CompactOptions* compactOptions, CompactStats* stats) { log() << "compact begin extent #" << extentNumber << " for namespace " << _ns << " " << extentLoc; unsigned oldObjSize = 0; // we'll report what the old padding was unsigned oldObjSizeWithPadding = 0; Extent* const sourceExtent = _extentManager->getExtent(extentLoc); sourceExtent->assertOk(); fassert(17437, sourceExtent->validates(extentLoc)); { // The next/prev MmapV1RecordHeader pointers within the Extent might not be in order so we // first page in the whole Extent sequentially. // TODO benchmark on slow storage to verify this is measurably faster. log() << "compact paging in len=" << sourceExtent->length / 1000000.0 << "MB" << endl; Timer t; size_t length = sourceExtent->length; touch_pages(reinterpret_cast<const char*>(sourceExtent), length); int ms = t.millis(); if (ms > 1000) log() << "compact end paging in " << ms << "ms " << sourceExtent->length / 1000000.0 / t.seconds() << "MB/sec" << endl; } { // Move each MmapV1RecordHeader out of this extent and insert it in to the "new" extents. log() << "compact copying records" << endl; long long totalNetSize = 0; long long nrecords = 0; DiskLoc nextSourceLoc = sourceExtent->firstRecord; while (!nextSourceLoc.isNull()) { txn->checkForInterrupt(); WriteUnitOfWork wunit(txn); MmapV1RecordHeader* recOld = recordFor(nextSourceLoc); RecordData oldData = recOld->toRecordData(); nextSourceLoc = getNextRecordInExtent(txn, nextSourceLoc); if (compactOptions->validateDocuments && !adaptor->isDataValid(oldData)) { // object is corrupt! log() << "compact removing corrupt document!"; stats->corruptDocuments++; } else { // How much data is in the record. Excludes padding and MmapV1RecordHeader headers. 
const unsigned rawDataSize = adaptor->dataSize(oldData); nrecords++; oldObjSize += rawDataSize; oldObjSizeWithPadding += recOld->netLength(); // Allocation sizes include the headers and possibly some padding. const unsigned minAllocationSize = rawDataSize + MmapV1RecordHeader::HeaderSize; unsigned allocationSize = minAllocationSize; switch (compactOptions->paddingMode) { case CompactOptions::NONE: // default padding if (shouldPadInserts()) { allocationSize = quantizeAllocationSpace(minAllocationSize); } break; case CompactOptions::PRESERVE: // keep original padding allocationSize = recOld->lengthWithHeaders(); break; case CompactOptions::MANUAL: // user specified how much padding to use allocationSize = compactOptions->computeRecordSize(minAllocationSize); if (allocationSize < minAllocationSize || allocationSize > BSONObjMaxUserSize / 2) { allocationSize = minAllocationSize; } break; } invariant(allocationSize >= minAllocationSize); // Copy the data to a new record. Because we orphaned the record freelist at the // start of the compact, this insert will allocate a record in a new extent. // See the comment in compact() for more details. CompactDocWriter writer(recOld, rawDataSize, allocationSize); StatusWith<RecordId> status = insertRecordWithDocWriter(txn, &writer); uassertStatusOK(status.getStatus()); const MmapV1RecordHeader* newRec = recordFor(DiskLoc::fromRecordId(status.getValue())); invariant(unsigned(newRec->netLength()) >= rawDataSize); totalNetSize += newRec->netLength(); // Tells the caller that the record has been moved, so it can do things such as // add it to indexes. adaptor->inserted(newRec->toRecordData(), status.getValue()); } // Remove the old record from the linked list of records withing the sourceExtent. // The old record is not added to the freelist as we will be freeing the whole // extent at the end. *txn->recoveryUnit()->writing(&sourceExtent->firstRecord) = nextSourceLoc; if (nextSourceLoc.isNull()) { // Just moved the last record out of the extent. Mark extent as empty. *txn->recoveryUnit()->writing(&sourceExtent->lastRecord) = DiskLoc(); } else { MmapV1RecordHeader* newFirstRecord = recordFor(nextSourceLoc); txn->recoveryUnit()->writingInt(newFirstRecord->prevOfs()) = DiskLoc::NullOfs; } // Adjust the stats to reflect the removal of the old record. The insert above // handled adjusting the stats for the new record. _details->incrementStats(txn, -(recOld->netLength()), -1); wunit.commit(); } // The extent must now be empty. invariant(sourceExtent->firstRecord.isNull()); invariant(sourceExtent->lastRecord.isNull()); // We are still the first extent, but we must not be the only extent. invariant(_details->firstExtent(txn) == extentLoc); invariant(_details->lastExtent(txn) != extentLoc); // Remove the newly emptied sourceExtent from the extent linked list and return it to // the extent manager. WriteUnitOfWork wunit(txn); const DiskLoc newFirst = sourceExtent->xnext; _details->setFirstExtent(txn, newFirst); *txn->recoveryUnit()->writing(&_extentManager->getExtent(newFirst)->xprev) = DiskLoc(); _extentManager->freeExtent(txn, extentLoc); wunit.commit(); { const double oldPadding = oldObjSize ? double(oldObjSizeWithPadding) / oldObjSize : 1.0; // defining 0/0 as 1 for this. 
log() << "compact finished extent #" << extentNumber << " containing " << nrecords << " documents (" << totalNetSize / (1024 * 1024.0) << "MB)" << " oldPadding: " << oldPadding; } } } Status SimpleRecordStoreV1::compact(OperationContext* txn, RecordStoreCompactAdaptor* adaptor, const CompactOptions* options, CompactStats* stats) { std::vector<DiskLoc> extents; for (DiskLoc extLocation = _details->firstExtent(txn); !extLocation.isNull(); extLocation = _extentManager->getExtent(extLocation)->xnext) { extents.push_back(extLocation); } log() << "compact " << extents.size() << " extents"; { WriteUnitOfWork wunit(txn); // Orphaning the deleted lists ensures that all inserts go to new extents rather than // the ones that existed before starting the compact. If we abort the operation before // completion, any free space in the old extents will be leaked and never reused unless // the collection is compacted again or dropped. This is considered an acceptable // failure mode as no data will be lost. log() << "compact orphan deleted lists" << endl; _details->orphanDeletedList(txn); // Start over from scratch with our extent sizing and growth _details->setLastExtentSize(txn, 0); // create a new extent so new records go there increaseStorageSize(txn, _details->lastExtentSize(txn), true); wunit.commit(); } stdx::unique_lock<Client> lk(*txn->getClient()); ProgressMeterHolder pm( *txn->setMessage_inlock("compact extent", "Extent Compacting Progress", extents.size())); lk.unlock(); // Go through all old extents and move each record to a new set of extents. int extentNumber = 0; for (std::vector<DiskLoc>::iterator it = extents.begin(); it != extents.end(); it++) { txn->checkForInterrupt(); invariant(_details->firstExtent(txn) == *it); // empties and removes the first extent _compactExtent(txn, *it, extentNumber++, adaptor, options, stats); invariant(_details->firstExtent(txn) != *it); pm.hit(); } invariant(_extentManager->getExtent(_details->firstExtent(txn))->xprev.isNull()); invariant(_extentManager->getExtent(_details->lastExtent(txn))->xnext.isNull()); // indexes will do their own progress meter pm.finished(); return Status::OK(); } }
namespace repl { const BSONObj reverseNaturalObj = BSON("$natural" << -1); // number of readers created; // this happens when the source source changes, a reconfig/network-error or the cursor dies static Counter64 readersCreatedStats; static ServerStatusMetricField<Counter64> displayReadersCreated("repl.network.readersCreated", &readersCreatedStats); bool replAuthenticate(DBClientBase* conn) { if (!getGlobalAuthorizationManager()->isAuthEnabled()) return true; if (!isInternalAuthSet()) return false; return conn->authenticateInternalUser(); } const Seconds OplogReader::kSocketTimeout(30); OplogReader::OplogReader() { _tailingQueryOptions = QueryOption_SlaveOk; _tailingQueryOptions |= QueryOption_CursorTailable | QueryOption_OplogReplay; /* TODO: slaveOk maybe shouldn't use? */ _tailingQueryOptions |= QueryOption_AwaitData; readersCreatedStats.increment(); } bool OplogReader::connect(const HostAndPort& host) { if (conn() == NULL || _host != host) { resetConnection(); _conn = shared_ptr<DBClientConnection>( new DBClientConnection(false, durationCount<Seconds>(kSocketTimeout))); string errmsg; if (!_conn->connect(host, errmsg) || (getGlobalAuthorizationManager()->isAuthEnabled() && !replAuthenticate(_conn.get()))) { resetConnection(); error() << errmsg << endl; return false; } _conn->port().tag |= executor::NetworkInterface::kMessagingPortKeepOpen; _host = host; } return true; } void OplogReader::tailCheck() { if (cursor.get() && cursor->isDead()) { log() << "old cursor isDead, will initiate a new one" << std::endl; resetCursor(); } } void OplogReader::query( const char* ns, Query query, int nToReturn, int nToSkip, const BSONObj* fields) { cursor.reset( _conn->query(ns, query, nToReturn, nToSkip, fields, QueryOption_SlaveOk).release()); } void OplogReader::tailingQuery(const char* ns, const BSONObj& query) { verify(!haveCursor()); LOG(2) << ns << ".find(" << query.toString() << ')' << endl; cursor.reset(_conn->query(ns, query, 0, 0, nullptr, _tailingQueryOptions).release()); } void OplogReader::tailingQueryGTE(const char* ns, Timestamp optime) { BSONObjBuilder gte; gte.append("$gte", optime); BSONObjBuilder query; query.append("ts", gte.done()); tailingQuery(ns, query.done()); } HostAndPort OplogReader::getHost() const { return _host; } void OplogReader::connectToSyncSource(OperationContext* txn, const OpTime& lastOpTimeFetched, ReplicationCoordinator* replCoord) { const Timestamp sentinelTimestamp(duration_cast<Seconds>(Milliseconds(curTimeMillis64())), 0); const OpTime sentinel(sentinelTimestamp, std::numeric_limits<long long>::max()); OpTime oldestOpTimeSeen = sentinel; invariant(conn() == NULL); while (true) { HostAndPort candidate = replCoord->chooseNewSyncSource(lastOpTimeFetched.getTimestamp()); if (candidate.empty()) { if (oldestOpTimeSeen == sentinel) { // If, in this invocation of connectToSyncSource(), we did not successfully // connect to any node ahead of us, // we apparently have no sync sources to connect to. // This situation is common; e.g. if there are no writes to the primary at // the moment. return; } // Connected to at least one member, but in all cases we were too stale to use them // as a sync source. 
error() << "too stale to catch up"; log() << "our last optime : " << lastOpTimeFetched; log() << "oldest available is " << oldestOpTimeSeen; log() << "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember"; setMinValid(txn, oldestOpTimeSeen); bool worked = replCoord->setFollowerMode(MemberState::RS_RECOVERING); if (!worked) { warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING) << ". Current state: " << replCoord->getMemberState(); } return; } if (!connect(candidate)) { LOG(2) << "can't connect to " << candidate.toString() << " to read operations"; resetConnection(); replCoord->blacklistSyncSource(candidate, Date_t::now() + Seconds(10)); continue; } // Read the first (oldest) op and confirm that it's not newer than our last // fetched op. Otherwise, we have fallen off the back of that source's oplog. BSONObj remoteOldestOp(findOne(rsOplogName.c_str(), Query())); OpTime remoteOldOpTime = fassertStatusOK(28776, OpTime::parseFromBSON(remoteOldestOp)); // remoteOldOpTime may come from a very old config, so we cannot compare their terms. if (!lastOpTimeFetched.isNull() && lastOpTimeFetched.getTimestamp() < remoteOldOpTime.getTimestamp()) { // We're too stale to use this sync source. resetConnection(); replCoord->blacklistSyncSource(candidate, Date_t::now() + Minutes(1)); if (oldestOpTimeSeen.getTimestamp() > remoteOldOpTime.getTimestamp()) { warning() << "we are too stale to use " << candidate.toString() << " as a sync source"; oldestOpTimeSeen = remoteOldOpTime; } continue; } // Got a valid sync source. return; } // while (true) } } // namespace repl
namespace mongo { // todo : move more here CurOp::CurOp( Client * client , CurOp * wrapped ) : _client(client), _wrapped(wrapped) { if ( _wrapped ) _client->_curOp = this; _start = 0; _active = false; _reset(); _op = 0; _opNum = _nextOpNum++; // These addresses should never be written to again. The zeroes are // placed here as a precaution because currentOp may be accessed // without the db mutex. memset(_ns, 0, sizeof(_ns)); } void CurOp::_reset() { _suppressFromCurop = false; _command = false; _dbprofile = 0; _end = 0; _maxTimeTracker.reset(); _message = ""; _progressMeter.finished(); _killPending.store(0); killCurrentOp.notifyAllWaiters(); _numYields = 0; _expectedLatencyMs = 0; _lockStat.reset(); } void CurOp::reset() { _reset(); _start = 0; _opNum = _nextOpNum++; _ns[0] = 0; _debug.reset(); _query.reset(); _active = true; // this should be last for ui clarity } CurOp* CurOp::getOp(const BSONObj& criteria) { // Regarding Matcher: This is not quite the right hammer to use here. // Future: use an actual property of CurOp to flag index builds // and use that to filter. // This will probably need refactoring once we change index builds // to be a real command instead of an insert into system.indexes Matcher matcher(criteria); Client& me = cc(); scoped_lock client_lock(Client::clientsMutex); for (std::set<Client*>::iterator it = Client::clients.begin(); it != Client::clients.end(); it++) { Client *client = *it; verify(client); CurOp* curop = client->curop(); if (client == &me || curop == NULL) { continue; } if ( !curop->active() ) continue; if ( curop->killPendingStrict() ) continue; BSONObj info = curop->description(); if (matcher.matches(info)) { return curop; } } return NULL; } void CurOp::reset( const HostAndPort& remote, int op ) { reset(); if( _remote != remote ) { // todo : _remote is not thread safe yet is used as such! _remote = remote; } _op = op; } ProgressMeter& CurOp::setMessage(const char * msg, std::string name, unsigned long long progressMeterTotal, int secondsBetween) { if ( progressMeterTotal ) { if ( _progressMeter.isActive() ) { cout << "about to assert, old _message: " << _message << " new message:" << msg << endl; verify( ! _progressMeter.isActive() ); } _progressMeter.reset( progressMeterTotal , secondsBetween ); _progressMeter.setName(name); } else { _progressMeter.finished(); } _message = msg; return _progressMeter; } CurOp::~CurOp() { killCurrentOp.notifyAllWaiters(); if ( _wrapped ) { scoped_lock bl(Client::clientsMutex); _client->_curOp = _wrapped; } _client = 0; } void CurOp::ensureStarted() { if ( _start == 0 ) _start = curTimeMicros64(); } void CurOp::enter( Client::Context * context ) { ensureStarted(); strncpy( _ns, context->ns(), Namespace::MaxNsLen); _ns[Namespace::MaxNsLen] = 0; _dbprofile = std::max( context->_db ? context->_db->getProfilingLevel() : 0 , _dbprofile ); } void CurOp::leave( Client::Context * context ) { } void CurOp::recordGlobalTime( long long micros ) const { if ( _client ) { const LockState& ls = _client->lockState(); verify( ls.threadState() ); Top::global.record( _ns , _op , ls.hasAnyWriteLock() ? 
1 : -1 , micros , _command ); } } BSONObj CurOp::info() { BSONObjBuilder b; b.append("opid", _opNum); bool a = _active && _start; b.append("active", a); if( a ) { b.append("secs_running", elapsedSeconds() ); } b.append( "op" , opToString( _op ) ); b.append("ns", _ns); if (_op == dbInsert) { _query.append(b, "insert"); } else { _query.append(b , "query"); } if( !_remote.empty() ) { b.append("client", _remote.toString()); } if ( _client ) { b.append( "desc" , _client->desc() ); if ( _client->_threadId.size() ) b.append( "threadId" , _client->_threadId ); if ( _client->_connectionId ) b.appendNumber( "connectionId" , _client->_connectionId ); _client->_ls.reportState(b); } if ( ! _message.empty() ) { if ( _progressMeter.isActive() ) { StringBuilder buf; buf << _message.toString() << " " << _progressMeter.toString(); b.append( "msg" , buf.str() ); BSONObjBuilder sub( b.subobjStart( "progress" ) ); sub.appendNumber( "done" , (long long)_progressMeter.done() ); sub.appendNumber( "total" , (long long)_progressMeter.total() ); sub.done(); } else { b.append( "msg" , _message.toString() ); } } if( killPending() ) b.append("killPending", true); b.append( "numYields" , _numYields ); b.append( "lockStats" , _lockStat.report() ); return b.obj(); } BSONObj CurOp::description() { BSONObjBuilder bob; bool a = _active && _start; bob.append("active", a); bob.append( "op" , opToString( _op ) ); bob.append("ns", _ns); if (_op == dbInsert) { _query.append(bob, "insert"); } else { _query.append(bob, "query"); } if( killPending() ) bob.append("killPending", true); return bob.obj(); } void CurOp::setKillWaiterFlags() { for (size_t i = 0; i < _notifyList.size(); ++i) *(_notifyList[i]) = true; _notifyList.clear(); } void CurOp::kill(bool* pNotifyFlag /* = NULL */) { _killPending.store(1); if (pNotifyFlag) { _notifyList.push_back(pNotifyFlag); } } void CurOp::setMaxTimeMicros(uint64_t maxTimeMicros) { if (maxTimeMicros == 0) { // 0 is "allow to run indefinitely". return; } // Note that calling startTime() will set CurOp::_start if it hasn't been set yet. 
_maxTimeTracker.setTimeLimit(startTime(), maxTimeMicros); } bool CurOp::maxTimeHasExpired() { return _maxTimeTracker.checkTimeLimit(); } uint64_t CurOp::getRemainingMaxTimeMicros() const { return _maxTimeTracker.getRemainingMicros(); } AtomicUInt CurOp::_nextOpNum; static Counter64 returnedCounter; static Counter64 insertedCounter; static Counter64 updatedCounter; static Counter64 deletedCounter; static Counter64 scannedCounter; static ServerStatusMetricField<Counter64> displayReturned( "document.returned", &returnedCounter ); static ServerStatusMetricField<Counter64> displayUpdated( "document.updated", &updatedCounter ); static ServerStatusMetricField<Counter64> displayInserted( "document.inserted", &insertedCounter ); static ServerStatusMetricField<Counter64> displayDeleted( "document.deleted", &deletedCounter ); static ServerStatusMetricField<Counter64> displayScanned( "queryExecutor.scanned", &scannedCounter ); static Counter64 idhackCounter; static Counter64 scanAndOrderCounter; static Counter64 fastmodCounter; static ServerStatusMetricField<Counter64> displayIdhack( "operation.idhack", &idhackCounter ); static ServerStatusMetricField<Counter64> displayScanAndOrder( "operation.scanAndOrder", &scanAndOrderCounter ); static ServerStatusMetricField<Counter64> displayFastMod( "operation.fastmod", &fastmodCounter ); void OpDebug::recordStats() { if ( nreturned > 0 ) returnedCounter.increment( nreturned ); if ( ninserted > 0 ) insertedCounter.increment( ninserted ); if ( nupdated > 0 ) updatedCounter.increment( nupdated ); if ( ndeleted > 0 ) deletedCounter.increment( ndeleted ); if ( nscanned > 0 ) scannedCounter.increment( nscanned ); if ( idhack ) idhackCounter.increment(); if ( scanAndOrder ) scanAndOrderCounter.increment(); if ( fastmod ) fastmodCounter.increment(); } CurOp::MaxTimeTracker::MaxTimeTracker() { reset(); } void CurOp::MaxTimeTracker::reset() { _enabled = false; _targetEpochMicros = 0; _approxTargetServerMillis = 0; } void CurOp::MaxTimeTracker::setTimeLimit(uint64_t startEpochMicros, uint64_t durationMicros) { dassert(durationMicros != 0); _enabled = true; _targetEpochMicros = startEpochMicros + durationMicros; uint64_t now = curTimeMicros64(); // If our accurate time source thinks time is not up yet, calculate the next target for // our approximate time source. if (_targetEpochMicros > now) { _approxTargetServerMillis = Listener::getElapsedTimeMillis() + static_cast<int64_t>((_targetEpochMicros - now) / 1000); } // Otherwise, set our approximate time source target such that it thinks time is already // up. else { _approxTargetServerMillis = Listener::getElapsedTimeMillis(); } } bool CurOp::MaxTimeTracker::checkTimeLimit() { if (!_enabled) { return false; } // Does our approximate time source think time is not up yet? If so, return early. if (_approxTargetServerMillis > Listener::getElapsedTimeMillis()) { return false; } uint64_t now = curTimeMicros64(); // Does our accurate time source think time is not up yet? If so, readjust the target for // our approximate time source and return early. if (_targetEpochMicros > now) { _approxTargetServerMillis = Listener::getElapsedTimeMillis() + static_cast<int64_t>((_targetEpochMicros - now) / 1000); return false; } // Otherwise, time is up. return true; } uint64_t CurOp::MaxTimeTracker::getRemainingMicros() const { if (!_enabled) { // 0 is "allow to run indefinitely". return 0; } // Does our accurate time source think time is up? If so, claim there is 1 microsecond // left for this operation. 
uint64_t now = curTimeMicros64(); if (_targetEpochMicros <= now) { return 1; } // Otherwise, calculate remaining time. return _targetEpochMicros - now; } }
DiskLoc SimpleRecordStoreV1::_allocFromExistingExtents( OperationContext* txn, int lenToAlloc ) { // align size up to a multiple of 4 lenToAlloc = (lenToAlloc + (4-1)) & ~(4-1); freelistAllocs.increment(); DiskLoc loc; { DiskLoc *prev = 0; DiskLoc *bestprev = 0; DiskLoc bestmatch; int bestmatchlen = INT_MAX; // sentinel meaning we haven't found a record big enough int b = bucket(lenToAlloc); DiskLoc cur = _details->deletedListEntry(b); int extra = 5; // look for a better fit, a little. int chain = 0; while ( 1 ) { { // defensive check int fileNumber = cur.a(); int fileOffset = cur.getOfs(); if (fileNumber < -1 || fileNumber >= 100000 || fileOffset < 0) { StringBuilder sb; sb << "Deleted record list corrupted in collection " << _ns << ", bucket " << b << ", link number " << chain << ", invalid link is " << cur.toString() << ", throwing Fatal Assertion"; log() << sb.str() << endl; fassertFailed(16469); } } if ( cur.isNull() ) { // move to next bucket. if we were doing "extra", just break if ( bestmatchlen < INT_MAX ) break; if ( chain > 0 ) { // if we looked at things in the right bucket, but they were not suitable freelistBucketExhausted.increment(); } b++; if ( b > MaxBucket ) { // out of space. alloc a new extent. freelistIterations.increment( 1 + chain ); return DiskLoc(); } cur = _details->deletedListEntry(b); prev = 0; continue; } DeletedRecord *r = drec(cur); if ( r->lengthWithHeaders() >= lenToAlloc && r->lengthWithHeaders() < bestmatchlen ) { bestmatchlen = r->lengthWithHeaders(); bestmatch = cur; bestprev = prev; if (r->lengthWithHeaders() == lenToAlloc) // exact match, stop searching break; } if ( bestmatchlen < INT_MAX && --extra <= 0 ) break; if ( ++chain > 30 && b <= MaxBucket ) { // too slow, force move to next bucket to grab a big chunk //b++; freelistIterations.increment( chain ); chain = 0; cur.Null(); } else { cur = r->nextDeleted(); prev = &r->nextDeleted(); } } // unlink ourself from the deleted list DeletedRecord *bmr = drec(bestmatch); if ( bestprev ) { *txn->recoveryUnit()->writing(bestprev) = bmr->nextDeleted(); } else { // should be the front of a free-list int myBucket = bucket(bmr->lengthWithHeaders()); invariant( _details->deletedListEntry(myBucket) == bestmatch ); _details->setDeletedListEntry(txn, myBucket, bmr->nextDeleted()); } *txn->recoveryUnit()->writing(&bmr->nextDeleted()) = DiskLoc().setInvalid(); // defensive. invariant(bmr->extentOfs() < bestmatch.getOfs()); freelistIterations.increment( 1 + chain ); loc = bestmatch; } if ( loc.isNull() ) return loc; // determine if we should chop up DeletedRecord *r = drec(loc); /* note we want to grab from the front so our next pointers on disk tend to go in a forward direction which is important for performance. */ int regionlen = r->lengthWithHeaders(); invariant( r->extentOfs() < loc.getOfs() ); int left = regionlen - lenToAlloc; if ( left < 24 || left < (lenToAlloc / 8) ) { // you get the whole thing. return loc; } // don't quantize: // - $ collections (indexes) as we already have those aligned the way we want SERVER-8425 if ( _normalCollection ) { // we quantize here so that it only impacts newly sized records // this prevents oddities with older records and space re-use SERVER-8435 lenToAlloc = std::min( r->lengthWithHeaders(), quantizeAllocationSpace( lenToAlloc ) ); left = regionlen - lenToAlloc; if ( left < 24 ) { // you get the whole thing. return loc; } } /* split off some for further use. 
*/ txn->recoveryUnit()->writingInt(r->lengthWithHeaders()) = lenToAlloc; DiskLoc newDelLoc = loc; newDelLoc.inc(lenToAlloc); DeletedRecord* newDel = drec(newDelLoc); DeletedRecord* newDelW = txn->recoveryUnit()->writing(newDel); newDelW->extentOfs() = r->extentOfs(); newDelW->lengthWithHeaders() = left; newDelW->nextDeleted().Null(); addDeletedRec( txn, newDelLoc ); return loc; }
namespace mongo { static Counter64 freelistAllocs; static Counter64 freelistBucketExhausted; static Counter64 freelistIterations; static ServerStatusMetricField<Counter64> dFreelist1( "storage.freelist.search.requests", &freelistAllocs ); static ServerStatusMetricField<Counter64> dFreelist2( "storage.freelist.search.bucketExhausted", &freelistBucketExhausted ); static ServerStatusMetricField<Counter64> dFreelist3( "storage.freelist.search.scanned", &freelistIterations ); SimpleRecordStoreV1::SimpleRecordStoreV1( OperationContext* txn, const StringData& ns, RecordStoreV1MetaData* details, ExtentManager* em, bool isSystemIndexes ) : RecordStoreV1Base( ns, details, em, isSystemIndexes ) { invariant( !details->isCapped() ); _normalCollection = NamespaceString::normal( ns ); if ( _details->paddingFactor() == 0 ) { warning() << "implicit updgrade of paddingFactor of very old collection" << endl; WriteUnitOfWork wunit(txn); _details->setPaddingFactor(txn, 1.0); wunit.commit(); } } SimpleRecordStoreV1::~SimpleRecordStoreV1() { } DiskLoc SimpleRecordStoreV1::_allocFromExistingExtents( OperationContext* txn, int lenToAlloc ) { // align size up to a multiple of 4 lenToAlloc = (lenToAlloc + (4-1)) & ~(4-1); freelistAllocs.increment(); DiskLoc loc; { DiskLoc *prev = 0; DiskLoc *bestprev = 0; DiskLoc bestmatch; int bestmatchlen = INT_MAX; // sentinel meaning we haven't found a record big enough int b = bucket(lenToAlloc); DiskLoc cur = _details->deletedListEntry(b); int extra = 5; // look for a better fit, a little. int chain = 0; while ( 1 ) { { // defensive check int fileNumber = cur.a(); int fileOffset = cur.getOfs(); if (fileNumber < -1 || fileNumber >= 100000 || fileOffset < 0) { StringBuilder sb; sb << "Deleted record list corrupted in collection " << _ns << ", bucket " << b << ", link number " << chain << ", invalid link is " << cur.toString() << ", throwing Fatal Assertion"; log() << sb.str() << endl; fassertFailed(16469); } } if ( cur.isNull() ) { // move to next bucket. if we were doing "extra", just break if ( bestmatchlen < INT_MAX ) break; if ( chain > 0 ) { // if we looked at things in the right bucket, but they were not suitable freelistBucketExhausted.increment(); } b++; if ( b > MaxBucket ) { // out of space. alloc a new extent. freelistIterations.increment( 1 + chain ); return DiskLoc(); } cur = _details->deletedListEntry(b); prev = 0; continue; } DeletedRecord *r = drec(cur); if ( r->lengthWithHeaders() >= lenToAlloc && r->lengthWithHeaders() < bestmatchlen ) { bestmatchlen = r->lengthWithHeaders(); bestmatch = cur; bestprev = prev; if (r->lengthWithHeaders() == lenToAlloc) // exact match, stop searching break; } if ( bestmatchlen < INT_MAX && --extra <= 0 ) break; if ( ++chain > 30 && b <= MaxBucket ) { // too slow, force move to next bucket to grab a big chunk //b++; freelistIterations.increment( chain ); chain = 0; cur.Null(); } else { cur = r->nextDeleted(); prev = &r->nextDeleted(); } } // unlink ourself from the deleted list DeletedRecord *bmr = drec(bestmatch); if ( bestprev ) { *txn->recoveryUnit()->writing(bestprev) = bmr->nextDeleted(); } else { // should be the front of a free-list int myBucket = bucket(bmr->lengthWithHeaders()); invariant( _details->deletedListEntry(myBucket) == bestmatch ); _details->setDeletedListEntry(txn, myBucket, bmr->nextDeleted()); } *txn->recoveryUnit()->writing(&bmr->nextDeleted()) = DiskLoc().setInvalid(); // defensive. 
invariant(bmr->extentOfs() < bestmatch.getOfs()); freelistIterations.increment( 1 + chain ); loc = bestmatch; } if ( loc.isNull() ) return loc; // determine if we should chop up DeletedRecord *r = drec(loc); /* note we want to grab from the front so our next pointers on disk tend to go in a forward direction which is important for performance. */ int regionlen = r->lengthWithHeaders(); invariant( r->extentOfs() < loc.getOfs() ); int left = regionlen - lenToAlloc; if ( left < 24 || left < (lenToAlloc / 8) ) { // you get the whole thing. return loc; } // don't quantize: // - $ collections (indexes) as we already have those aligned the way we want SERVER-8425 if ( _normalCollection ) { // we quantize here so that it only impacts newly sized records // this prevents oddities with older records and space re-use SERVER-8435 lenToAlloc = std::min( r->lengthWithHeaders(), quantizeAllocationSpace( lenToAlloc ) ); left = regionlen - lenToAlloc; if ( left < 24 ) { // you get the whole thing. return loc; } } /* split off some for further use. */ txn->recoveryUnit()->writingInt(r->lengthWithHeaders()) = lenToAlloc; DiskLoc newDelLoc = loc; newDelLoc.inc(lenToAlloc); DeletedRecord* newDel = drec(newDelLoc); DeletedRecord* newDelW = txn->recoveryUnit()->writing(newDel); newDelW->extentOfs() = r->extentOfs(); newDelW->lengthWithHeaders() = left; newDelW->nextDeleted().Null(); addDeletedRec( txn, newDelLoc ); return loc; } StatusWith<DiskLoc> SimpleRecordStoreV1::allocRecord( OperationContext* txn, int lengthWithHeaders, bool enforceQuota ) { DiskLoc loc = _allocFromExistingExtents( txn, lengthWithHeaders ); if ( !loc.isNull() ) return StatusWith<DiskLoc>( loc ); LOG(1) << "allocating new extent"; increaseStorageSize( txn, _extentManager->followupSize( lengthWithHeaders, _details->lastExtentSize(txn)), enforceQuota ); loc = _allocFromExistingExtents( txn, lengthWithHeaders ); if ( !loc.isNull() ) { // got on first try return StatusWith<DiskLoc>( loc ); } log() << "warning: alloc() failed after allocating new extent. " << "lengthWithHeaders: " << lengthWithHeaders << " last extent size:" << _details->lastExtentSize(txn) << "; trying again"; for ( int z = 0; z < 10 && lengthWithHeaders > _details->lastExtentSize(txn); z++ ) { log() << "try #" << z << endl; increaseStorageSize( txn, _extentManager->followupSize( lengthWithHeaders, _details->lastExtentSize(txn)), enforceQuota ); loc = _allocFromExistingExtents( txn, lengthWithHeaders ); if ( ! loc.isNull() ) return StatusWith<DiskLoc>( loc ); } return StatusWith<DiskLoc>( ErrorCodes::InternalError, "cannot allocate space" ); } Status SimpleRecordStoreV1::truncate(OperationContext* txn) { const DiskLoc firstExtLoc = _details->firstExtent(txn); if (firstExtLoc.isNull() || !firstExtLoc.isValid()) { // Already empty return Status::OK(); } // Free all extents except the first. Extent* firstExt = _extentManager->getExtent(firstExtLoc); if (!firstExt->xnext.isNull()) { const DiskLoc extNextLoc = firstExt->xnext; const DiskLoc oldLastExtLoc = _details->lastExtent(txn); Extent* const nextExt = _extentManager->getExtent(extNextLoc); // Unlink other extents; *txn->recoveryUnit()->writing(&nextExt->xprev) = DiskLoc(); *txn->recoveryUnit()->writing(&firstExt->xnext) = DiskLoc(); _details->setLastExtent(txn, firstExtLoc); _details->setLastExtentSize(txn, firstExt->length); _extentManager->freeExtents(txn, extNextLoc, oldLastExtLoc); } // Make the first (now only) extent a single large deleted record. 
*txn->recoveryUnit()->writing(&firstExt->firstRecord) = DiskLoc(); *txn->recoveryUnit()->writing(&firstExt->lastRecord) = DiskLoc(); _details->orphanDeletedList(txn); addDeletedRec(txn, _findFirstSpot(txn, firstExtLoc, firstExt)); // Make stats reflect that there are now no documents in this record store. _details->setStats(txn, 0, 0); return Status::OK(); } void SimpleRecordStoreV1::addDeletedRec( OperationContext* txn, const DiskLoc& dloc ) { DeletedRecord* d = drec( dloc ); DEBUGGING log() << "TEMP: add deleted rec " << dloc.toString() << ' ' << hex << d->extentOfs() << endl; int b = bucket(d->lengthWithHeaders()); *txn->recoveryUnit()->writing(&d->nextDeleted()) = _details->deletedListEntry(b); _details->setDeletedListEntry(txn, b, dloc); } RecordIterator* SimpleRecordStoreV1::getIterator( OperationContext* txn, const DiskLoc& start, bool tailable, const CollectionScanParams::Direction& dir) const { return new SimpleRecordStoreV1Iterator( txn, this, start, dir ); } vector<RecordIterator*> SimpleRecordStoreV1::getManyIterators( OperationContext* txn ) const { OwnedPointerVector<RecordIterator> iterators; const Extent* ext; for (DiskLoc extLoc = details()->firstExtent(txn); !extLoc.isNull(); extLoc = ext->xnext) { ext = _getExtent(txn, extLoc); if (ext->firstRecord.isNull()) continue; iterators.push_back( new RecordStoreV1Base::IntraExtentIterator(txn, ext->firstRecord, this)); } return iterators.release(); } class CompactDocWriter : public DocWriter { public: /** * param allocationSize - allocation size WITH header */ CompactDocWriter( const Record* rec, unsigned dataSize, size_t allocationSize ) : _rec( rec ), _dataSize( dataSize ), _allocationSize( allocationSize ) { } virtual ~CompactDocWriter() {} virtual void writeDocument( char* buf ) const { memcpy( buf, _rec->data(), _dataSize ); } virtual size_t documentSize() const { return _allocationSize - Record::HeaderSize; } virtual bool addPadding() const { return false; } private: const Record* _rec; size_t _dataSize; size_t _allocationSize; }; void SimpleRecordStoreV1::_compactExtent(OperationContext* txn, const DiskLoc extentLoc, int extentNumber, RecordStoreCompactAdaptor* adaptor, const CompactOptions* compactOptions, CompactStats* stats ) { log() << "compact begin extent #" << extentNumber << " for namespace " << _ns << " " << extentLoc; unsigned oldObjSize = 0; // we'll report what the old padding was unsigned oldObjSizeWithPadding = 0; Extent* const sourceExtent = _extentManager->getExtent( extentLoc ); sourceExtent->assertOk(); fassert( 17437, sourceExtent->validates(extentLoc) ); { // The next/prev Record pointers within the Extent might not be in order so we first // page in the whole Extent sequentially. // TODO benchmark on slow storage to verify this is measurably faster. log() << "compact paging in len=" << sourceExtent->length/1000000.0 << "MB" << endl; Timer t; size_t length = sourceExtent->length; touch_pages( reinterpret_cast<const char*>(sourceExtent), length ); int ms = t.millis(); if( ms > 1000 ) log() << "compact end paging in " << ms << "ms " << sourceExtent->length/1000000.0/t.seconds() << "MB/sec" << endl; } { // Move each Record out of this extent and insert it in to the "new" extents. 
log() << "compact copying records" << endl; long long totalNetSize = 0; long long nrecords = 0; DiskLoc nextSourceLoc = sourceExtent->firstRecord; while (!nextSourceLoc.isNull()) { txn->checkForInterrupt(); WriteUnitOfWork wunit(txn); Record* recOld = recordFor(nextSourceLoc); RecordData oldData = recOld->toRecordData(); nextSourceLoc = getNextRecordInExtent(txn, nextSourceLoc); if ( compactOptions->validateDocuments && !adaptor->isDataValid( oldData ) ) { // object is corrupt! log() << "compact removing corrupt document!"; stats->corruptDocuments++; } else { // How much data is in the record. Excludes padding and Record headers. const unsigned rawDataSize = adaptor->dataSize( oldData ); nrecords++; oldObjSize += rawDataSize; oldObjSizeWithPadding += recOld->netLength(); // Allocation sizes include the headers and possibly some padding. const unsigned minAllocationSize = rawDataSize + Record::HeaderSize; unsigned allocationSize = minAllocationSize; switch( compactOptions->paddingMode ) { case CompactOptions::NONE: // no padding, unless using powerOf2Sizes if ( _details->isUserFlagSet(Flag_UsePowerOf2Sizes) ) allocationSize = quantizePowerOf2AllocationSpace(minAllocationSize); else allocationSize = minAllocationSize; break; case CompactOptions::PRESERVE: // keep original padding allocationSize = recOld->lengthWithHeaders(); break; case CompactOptions::MANUAL: // user specified how much padding to use allocationSize = compactOptions->computeRecordSize(minAllocationSize); if (allocationSize < minAllocationSize || allocationSize > BSONObjMaxUserSize / 2 ) { allocationSize = minAllocationSize; } break; } invariant(allocationSize >= minAllocationSize); // Copy the data to a new record. Because we orphaned the record freelist at the // start of the compact, this insert will allocate a record in a new extent. // See the comment in compact() for more details. CompactDocWriter writer( recOld, rawDataSize, allocationSize ); StatusWith<DiskLoc> status = insertRecord( txn, &writer, false ); uassertStatusOK( status.getStatus() ); const Record* newRec = recordFor(status.getValue()); invariant(unsigned(newRec->netLength()) >= rawDataSize); totalNetSize += newRec->netLength(); // Tells the caller that the record has been moved, so it can do things such as // add it to indexes. adaptor->inserted(newRec->toRecordData(), status.getValue()); } // Remove the old record from the linked list of records withing the sourceExtent. // The old record is not added to the freelist as we will be freeing the whole // extent at the end. *txn->recoveryUnit()->writing(&sourceExtent->firstRecord) = nextSourceLoc; if (nextSourceLoc.isNull()) { // Just moved the last record out of the extent. Mark extent as empty. *txn->recoveryUnit()->writing(&sourceExtent->lastRecord) = DiskLoc(); } else { Record* newFirstRecord = recordFor(nextSourceLoc); txn->recoveryUnit()->writingInt(newFirstRecord->prevOfs()) = DiskLoc::NullOfs; } // Adjust the stats to reflect the removal of the old record. The insert above // handled adjusting the stats for the new record. _details->incrementStats(txn, -(recOld->netLength()), -1); wunit.commit(); } // The extent must now be empty. invariant(sourceExtent->firstRecord.isNull()); invariant(sourceExtent->lastRecord.isNull()); // We are still the first extent, but we must not be the only extent. invariant( _details->firstExtent(txn) == extentLoc ); invariant( _details->lastExtent(txn) != extentLoc ); // Remove the newly emptied sourceExtent from the extent linked list and return it to // the extent manager. 
WriteUnitOfWork wunit(txn); const DiskLoc newFirst = sourceExtent->xnext; _details->setFirstExtent( txn, newFirst ); *txn->recoveryUnit()->writing(&_extentManager->getExtent( newFirst )->xprev) = DiskLoc(); _extentManager->freeExtent( txn, extentLoc ); wunit.commit(); { const double oldPadding = oldObjSize ? double(oldObjSizeWithPadding) / oldObjSize : 1.0; // defining 0/0 as 1 for this. log() << "compact finished extent #" << extentNumber << " containing " << nrecords << " documents (" << totalNetSize / (1024*1024.0) << "MB)" << " oldPadding: " << oldPadding; } } } Status SimpleRecordStoreV1::compact( OperationContext* txn, RecordStoreCompactAdaptor* adaptor, const CompactOptions* options, CompactStats* stats ) { std::vector<DiskLoc> extents; for( DiskLoc extLocation = _details->firstExtent(txn); !extLocation.isNull(); extLocation = _extentManager->getExtent( extLocation )->xnext ) { extents.push_back( extLocation ); } log() << "compact " << extents.size() << " extents"; { WriteUnitOfWork wunit(txn); // Orphaning the deleted lists ensures that all inserts go to new extents rather than // the ones that existed before starting the compact. If we abort the operation before // completion, any free space in the old extents will be leaked and never reused unless // the collection is compacted again or dropped. This is considered an acceptable // failure mode as no data will be lost. log() << "compact orphan deleted lists" << endl; _details->orphanDeletedList(txn); // Start over from scratch with our extent sizing and growth _details->setLastExtentSize( txn, 0 ); // create a new extent so new records go there increaseStorageSize( txn, _details->lastExtentSize(txn), true ); wunit.commit(); } ProgressMeterHolder pm(*txn->setMessage("compact extent", "Extent Compacting Progress", extents.size())); // Go through all old extents and move each record to a new set of extents. int extentNumber = 0; for( std::vector<DiskLoc>::iterator it = extents.begin(); it != extents.end(); it++ ) { txn->checkForInterrupt(); invariant(_details->firstExtent(txn) == *it); // empties and removes the first extent _compactExtent(txn, *it, extentNumber++, adaptor, options, stats ); invariant(_details->firstExtent(txn) != *it); pm.hit(); } invariant( _extentManager->getExtent( _details->firstExtent(txn) )->xprev.isNull() ); invariant( _extentManager->getExtent( _details->lastExtent(txn) )->xnext.isNull() ); // indexes will do their own progress meter pm.finished(); return Status::OK(); } }
DiskLoc SimpleRecordStoreV1::_allocFromExistingExtents(OperationContext* txn, int lenToAllocRaw) { // Slowly drain the deletedListLegacyGrabBag by popping one record off and putting it in the // correct deleted list each time we try to allocate a new record. This ensures we won't // orphan any data when upgrading from old versions, without needing a long upgrade phase. // This is done before we try to allocate the new record so we can take advantage of the new // space immediately. { const DiskLoc head = _details->deletedListLegacyGrabBag(); if (!head.isNull()) { _details->setDeletedListLegacyGrabBag(txn, drec(head)->nextDeleted()); addDeletedRec(txn, head); } } // align size up to a multiple of 4 const int lenToAlloc = (lenToAllocRaw + (4 - 1)) & ~(4 - 1); freelistAllocs.increment(); DiskLoc loc; DeletedRecord* dr = NULL; { int myBucket; for (myBucket = bucket(lenToAlloc); myBucket < Buckets; myBucket++) { // Only look at the first entry in each bucket. This works because we are either // quantizing or allocating fixed-size blocks. const DiskLoc head = _details->deletedListEntry(myBucket); if (head.isNull()) continue; DeletedRecord* const candidate = drec(head); if (candidate->lengthWithHeaders() >= lenToAlloc) { loc = head; dr = candidate; break; } } if (!dr) return DiskLoc(); // no space // Unlink ourself from the deleted list _details->setDeletedListEntry(txn, myBucket, dr->nextDeleted()); *txn->recoveryUnit()->writing(&dr->nextDeleted()) = DiskLoc().setInvalid(); // defensive } invariant(dr->extentOfs() < loc.getOfs()); // Split the deleted record if it has at least as much left over space as our smallest // allocation size. Otherwise, just take the whole DeletedRecord. const int remainingLength = dr->lengthWithHeaders() - lenToAlloc; if (remainingLength >= bucketSizes[0]) { txn->recoveryUnit()->writingInt(dr->lengthWithHeaders()) = lenToAlloc; const DiskLoc newDelLoc = DiskLoc(loc.a(), loc.getOfs() + lenToAlloc); DeletedRecord* newDel = txn->recoveryUnit()->writing(drec(newDelLoc)); newDel->extentOfs() = dr->extentOfs(); newDel->lengthWithHeaders() = remainingLength; newDel->nextDeleted().Null(); addDeletedRec(txn, newDelLoc); } return loc; }
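// Two small calculations drive _allocFromExistingExtents above: the requested length is rounded
// up to a multiple of 4 before searching the deleted-record buckets, and a DeletedRecord is only
// split when the leftover space could itself live in the smallest bucket. A minimal sketch
// follows; kSmallestBucketSize stands in for bucketSizes[0] and its value here is an assumption,
// not the real table.
namespace freelist_sketch {
const int kSmallestBucketSize = 32;  // assumed value of bucketSizes[0]

int alignToFour(int lenToAllocRaw) {
    return (lenToAllocRaw + (4 - 1)) & ~(4 - 1);
}

// True when, after carving lenToAlloc out of the deleted record, the remainder is still big
// enough to go back on a freelist bucket rather than being absorbed as padding.
bool shouldSplitDeletedRecord(int deletedLengthWithHeaders, int lenToAlloc) {
    return (deletedLengthWithHeaders - lenToAlloc) >= kSmallestBucketSize;
}
// Example: alignToFour(61) == 64; a 256-byte deleted record would then be split, leaving a
// 192-byte remainder on the freelist.
}  // namespace freelist_sketch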
namespace mongo { using std::unique_ptr; using std::endl; using std::string; using std::stringstream; const int ShardedClientCursor::INIT_REPLY_BUFFER_SIZE = 32768; // Note: There is no counter for shardedEver from cursorInfo since it is deprecated static Counter64 cursorStatsMultiTarget; static Counter64 cursorStatsSingleTarget; // Simple class to report the sum total open cursors = sharded + refs class CursorStatsSum { public: operator long long() const { return get(); } long long get() const { return cursorStatsMultiTarget.get() + cursorStatsSingleTarget.get(); } }; static CursorStatsSum cursorStatsTotalOpen; // -------- ShardedCursor ----------- ShardedClientCursor::ShardedClientCursor(QueryMessage& q, ParallelSortClusteredCursor* cursor) { verify(cursor); _cursor = cursor; _skip = q.ntoskip; _ntoreturn = q.ntoreturn; _totalSent = 0; _done = false; _id = 0; if (q.queryOptions & QueryOption_NoCursorTimeout) { _lastAccessMillis = 0; } else _lastAccessMillis = Listener::getElapsedTimeMillis(); cursorStatsMultiTarget.increment(); } ShardedClientCursor::~ShardedClientCursor() { verify(_cursor); delete _cursor; _cursor = 0; cursorStatsMultiTarget.decrement(); } long long ShardedClientCursor::getId() { if (_id <= 0) { _id = cursorCache.genId(); verify(_id >= 0); } return _id; } int ShardedClientCursor::getTotalSent() const { return _totalSent; } void ShardedClientCursor::accessed() { if (_lastAccessMillis > 0) _lastAccessMillis = Listener::getElapsedTimeMillis(); } long long ShardedClientCursor::idleTime(long long now) { if (_lastAccessMillis == 0) return 0; return now - _lastAccessMillis; } bool ShardedClientCursor::sendNextBatch(int batchSize, BufBuilder& buffer, int& docCount) { uassert(10191, "cursor already done", !_done); int maxSize = 1024 * 1024; if (_totalSent > 0) maxSize *= 3; docCount = 0; // If batchSize is negative, it means that we should send up to -batchSize results // back to the client, and that we should only send a *single batch*. A batchSize of // 1 is also a special case which means "return up to 1 result in a single batch" (so // that +1 actually has the same meaning as -1). For all other values of batchSize, we // may have to return multiple batches. const bool sendMoreBatches = batchSize == 0 || batchSize > 1; batchSize = abs(batchSize); // Set the initial batch size to 101, just like mongod. if (batchSize == 0 && _totalSent == 0) batchSize = 101; // Set batch size to batchSize requested by the current operation unconditionally. This is // necessary because if the loop exited due to docCount == batchSize then setBatchSize(0) was // called, so the next _cursor->more() will be called with a batch size of 0 if the cursor // buffer was drained the previous run. Unconditionally setting the batch size ensures that // we don't ask for a batch size of zero as a side effect. _cursor->setBatchSize(batchSize); bool cursorHasMore = true; while ((cursorHasMore = _cursor->more())) { BSONObj o = _cursor->next(); buffer.appendBuf((void*)o.objdata(), o.objsize()); ++docCount; // Ensure that the next batch will never wind up requesting more docs from the shard // than are remaining to satisfy the initial batchSize. if (batchSize != 0) { if (docCount == batchSize) break; _cursor->setBatchSize(batchSize - docCount); } if (buffer.len() > maxSize) { break; } } // We need to request another batch if the following two conditions hold: // // 1. batchSize is zero or greater than 1 (see the comment above). This condition // is stored in 'sendMoreBatches'. // // 2.
The last call to _cursor->more() was true (i.e. we never explicitly got a false // value from _cursor->more()). This condition is stored in 'cursorHasMore'. If the server // hits EOF while executing a query or a getmore, it will pass a cursorId of 0 in the // query response to indicate that there are no more results. In this case, _cursor->more() // will be explicitly false, and we know for sure that we do not have to send more batches. // // On the other hand, if _cursor->more() is true there may or may not be more results. // Suppose that the mongod generates enough results to fill this batch. In this case it // does not know whether or not there are more, because determining that would require requesting an // extra result and seeing whether we get EOF. The mongod sends a valid cursorId to // indicate that there may be more. We do the same here: we indicate that there may be // more results to retrieve by setting 'hasMoreBatches' to true. bool hasMoreBatches = sendMoreBatches && cursorHasMore; LOG(5) << "\t hasMoreBatches: " << hasMoreBatches << " sendMoreBatches: " << sendMoreBatches << " cursorHasMore: " << cursorHasMore << " batchSize: " << batchSize << " num: " << docCount << " id:" << getId() << " totalSent: " << _totalSent << endl; _totalSent += docCount; _done = !hasMoreBatches; return hasMoreBatches; } // ---- CursorCache ----- unsigned getCCRandomSeed() { unique_ptr<SecureRandom> sr(SecureRandom::create()); return sr->nextInt64(); } CursorCache::CursorCache() : _random(getCCRandomSeed()), _shardedTotal(0) {} CursorCache::~CursorCache() { // TODO: delete old cursors? bool print = shouldLog(logger::LogSeverity::Debug(1)); if (_cursors.size() || _refs.size()) print = true; verify(_refs.size() == _refsNS.size()); if (print) log() << " CursorCache at shutdown - " << " sharded: " << _cursors.size() << " passthrough: " << _refs.size() << endl; } ShardedClientCursorPtr CursorCache::get(long long id) const { LOG(_myLogLevel) << "CursorCache::get id: " << id << endl; stdx::lock_guard<stdx::mutex> lk(_mutex); MapSharded::const_iterator i = _cursors.find(id); if (i == _cursors.end()) { return ShardedClientCursorPtr(); } i->second->accessed(); return i->second; } int CursorCache::getMaxTimeMS(long long id) const { verify(id); stdx::lock_guard<stdx::mutex> lk(_mutex); MapShardedInt::const_iterator i = _cursorsMaxTimeMS.find(id); return (i != _cursorsMaxTimeMS.end()) ? i->second : 0; } void CursorCache::store(ShardedClientCursorPtr cursor, int maxTimeMS) { LOG(_myLogLevel) << "CursorCache::store cursor " << " id: " << cursor->getId() << (maxTimeMS != kMaxTimeCursorNoTimeLimit ?
str::stream() << "maxTimeMS: " << maxTimeMS : string("")) << endl; verify(cursor->getId()); verify(maxTimeMS == kMaxTimeCursorTimeLimitExpired || maxTimeMS == kMaxTimeCursorNoTimeLimit || maxTimeMS > 0); stdx::lock_guard<stdx::mutex> lk(_mutex); _cursorsMaxTimeMS[cursor->getId()] = maxTimeMS; _cursors[cursor->getId()] = cursor; _shardedTotal++; } void CursorCache::updateMaxTimeMS(long long id, int maxTimeMS) { verify(id); verify(maxTimeMS == kMaxTimeCursorTimeLimitExpired || maxTimeMS == kMaxTimeCursorNoTimeLimit || maxTimeMS > 0); stdx::lock_guard<stdx::mutex> lk(_mutex); _cursorsMaxTimeMS[id] = maxTimeMS; } void CursorCache::remove(long long id) { verify(id); stdx::lock_guard<stdx::mutex> lk(_mutex); _cursorsMaxTimeMS.erase(id); _cursors.erase(id); } void CursorCache::removeRef(long long id) { verify(id); stdx::lock_guard<stdx::mutex> lk(_mutex); _refs.erase(id); _refsNS.erase(id); cursorStatsSingleTarget.decrement(); } void CursorCache::storeRef(const std::string& server, long long id, const std::string& ns) { LOG(_myLogLevel) << "CursorCache::storeRef server: " << server << " id: " << id << endl; verify(id); stdx::lock_guard<stdx::mutex> lk(_mutex); _refs[id] = server; _refsNS[id] = ns; cursorStatsSingleTarget.increment(); } string CursorCache::getRef(long long id) const { verify(id); stdx::lock_guard<stdx::mutex> lk(_mutex); MapNormal::const_iterator i = _refs.find(id); LOG(_myLogLevel) << "CursorCache::getRef id: " << id << " out: " << (i == _refs.end() ? " NONE " : i->second) << endl; if (i == _refs.end()) return ""; return i->second; } std::string CursorCache::getRefNS(long long id) const { verify(id); stdx::lock_guard<stdx::mutex> lk(_mutex); MapNormal::const_iterator i = _refsNS.find(id); LOG(_myLogLevel) << "CursorCache::getRefNs id: " << id << " out: " << (i == _refsNS.end() ? " NONE " : i->second) << std::endl; if (i == _refsNS.end()) return ""; return i->second; } long long CursorCache::genId() { while (true) { stdx::lock_guard<stdx::mutex> lk(_mutex); long long x = Listener::getElapsedTimeMillis() << 32; x |= _random.nextInt32(); if (x == 0) continue; if (x < 0) x *= -1; MapSharded::iterator i = _cursors.find(x); if (i != _cursors.end()) continue; MapNormal::iterator j = _refs.find(x); if (j != _refs.end()) continue; return x; } } void CursorCache::gotKillCursors(Message& m) { LastError::get(cc()).disable(); DbMessage dbmessage(m); int n = dbmessage.pullInt(); if (n > 2000) { (n < 30000 ? warning() : error()) << "receivedKillCursors, n=" << n << endl; } uassert(13286, "sent 0 cursors to kill", n >= 1); uassert(13287, "too many cursors to kill", n < 30000); massert(18632, str::stream() << "bad kill cursors size: " << m.dataSize(), m.dataSize() == 8 + (8 * n)); ConstDataCursor cursors(dbmessage.getArray(n)); ClientBasic* client = ClientBasic::getCurrent(); AuthorizationSession* authSession = AuthorizationSession::get(client); for (int i = 0; i < n; i++) { long long id = cursors.readAndAdvance<LittleEndian<int64_t>>(); LOG(_myLogLevel) << "CursorCache::gotKillCursors id: " << id << endl; if (!id) { warning() << " got cursor id of 0 to kill" << endl; continue; } string server; { stdx::lock_guard<stdx::mutex> lk(_mutex); MapSharded::iterator i = _cursors.find(id); if (i != _cursors.end()) { Status authorizationStatus = authSession->checkAuthForKillCursors(NamespaceString(i->second->getNS()), id); audit::logKillCursorsAuthzCheck( client, NamespaceString(i->second->getNS()), id, authorizationStatus.isOK() ? 
ErrorCodes::OK : ErrorCodes::Unauthorized); if (authorizationStatus.isOK()) { _cursorsMaxTimeMS.erase(i->second->getId()); _cursors.erase(i); } continue; } MapNormal::iterator refsIt = _refs.find(id); MapNormal::iterator refsNSIt = _refsNS.find(id); if (refsIt == _refs.end()) { warning() << "can't find cursor: " << id << endl; continue; } verify(refsNSIt != _refsNS.end()); Status authorizationStatus = authSession->checkAuthForKillCursors(NamespaceString(refsNSIt->second), id); audit::logKillCursorsAuthzCheck(client, NamespaceString(refsNSIt->second), id, authorizationStatus.isOK() ? ErrorCodes::OK : ErrorCodes::Unauthorized); if (!authorizationStatus.isOK()) { continue; } server = refsIt->second; _refs.erase(refsIt); _refsNS.erase(refsNSIt); cursorStatsSingleTarget.decrement(); } LOG(_myLogLevel) << "CursorCache::found gotKillCursors id: " << id << " server: " << server << endl; verify(server.size()); ScopedDbConnection conn(server); conn->killCursor(id); conn.done(); } } void CursorCache::appendInfo(BSONObjBuilder& result) const { stdx::lock_guard<stdx::mutex> lk(_mutex); result.append("sharded", static_cast<int>(cursorStatsMultiTarget.get())); result.appendNumber("shardedEver", _shardedTotal); result.append("refs", static_cast<int>(cursorStatsSingleTarget.get())); result.append("totalOpen", static_cast<int>(cursorStatsTotalOpen.get())); } void CursorCache::doTimeouts() { long long now = Listener::getElapsedTimeMillis(); stdx::lock_guard<stdx::mutex> lk(_mutex); for (MapSharded::iterator i = _cursors.begin(); i != _cursors.end(); ++i) { // Note: cursors with no timeout will always have an idleTime of 0 long long idleFor = i->second->idleTime(now); if (idleFor < ClusterCursorCleanupJob::cursorTimeoutMillis) { continue; } log() << "killing old cursor " << i->second->getId() << " idle for: " << idleFor << "ms" << endl; // TODO: make LOG(1) _cursorsMaxTimeMS.erase(i->second->getId()); _cursors.erase(i); i = _cursors.begin(); // possible 2nd entry will get skipped, will get on next pass if (i == _cursors.end()) break; } } CursorCache cursorCache; const int CursorCache::_myLogLevel = 3; class CursorTimeoutTask : public task::Task { public: virtual string name() const { return "cursorTimeout"; } virtual void doWork() { cursorCache.doTimeouts(); } }; void CursorCache::startTimeoutThread() { task::repeat(new CursorTimeoutTask, 4000); } } // namespace mongo
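// ShardedClientCursor::sendNextBatch above encodes several batchSize conventions in a few lines:
// a negative value or exactly 1 means "return a single batch only", 0 on the very first batch
// defaults to 101 documents (mirroring mongod), and any other value allows further batches. The
// helper below is a minimal illustrative sketch of that mapping; it is not part of the real class.
struct BatchPlan {
    bool sendMoreBatches;    // may the cursor return further batches after this one?
    int effectiveBatchSize;  // batch size actually requested from the shards
};

inline BatchPlan planBatch(int requestedBatchSize, int totalAlreadySent) {
    BatchPlan plan;
    plan.sendMoreBatches = (requestedBatchSize == 0 || requestedBatchSize > 1);
    int batchSize = requestedBatchSize < 0 ? -requestedBatchSize : requestedBatchSize;
    if (batchSize == 0 && totalAlreadySent == 0)
        batchSize = 101;  // initial default, as in mongod
    plan.effectiveBatchSize = batchSize;
    return plan;
}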
namespace mongo { namespace { const auto bannedExpressionsInValidators = std::set<StringData>{ "$geoNear", "$near", "$nearSphere", "$text", "$where", }; Status checkValidatorForBannedExpressions(const BSONObj& validator) { for (auto field : validator) { const auto name = field.fieldNameStringData(); if (name[0] == '$' && bannedExpressionsInValidators.count(name)) { return {ErrorCodes::InvalidOptions, str::stream() << name << " is not allowed in collection validators"}; } if (field.type() == Object || field.type() == Array) { auto status = checkValidatorForBannedExpressions(field.Obj()); if (!status.isOK()) return status; } } return Status::OK(); } } using std::unique_ptr; using std::endl; using std::string; using std::vector; using logger::LogComponent; std::string CompactOptions::toString() const { std::stringstream ss; ss << "paddingMode: "; switch (paddingMode) { case NONE: ss << "NONE"; break; case PRESERVE: ss << "PRESERVE"; break; case MANUAL: ss << "MANUAL (" << paddingBytes << " + ( doc * " << paddingFactor << ") )"; } ss << " validateDocuments: " << validateDocuments; return ss.str(); } // // CappedInsertNotifier // CappedInsertNotifier::CappedInsertNotifier() : _version(0), _dead(false) {} void CappedInsertNotifier::notifyAll() { stdx::lock_guard<stdx::mutex> lk(_mutex); ++_version; _notifier.notify_all(); } void CappedInsertNotifier::_wait(stdx::unique_lock<stdx::mutex>& lk, uint64_t prevVersion, Microseconds timeout) const { while (!_dead && prevVersion == _version) { if (timeout == Microseconds::max()) { _notifier.wait(lk); } else if (stdx::cv_status::timeout == _notifier.wait_for(lk, timeout)) { return; } } } void CappedInsertNotifier::wait(uint64_t prevVersion, Microseconds timeout) const { stdx::unique_lock<stdx::mutex> lk(_mutex); _wait(lk, prevVersion, timeout); } void CappedInsertNotifier::wait(Microseconds timeout) const { stdx::unique_lock<stdx::mutex> lk(_mutex); _wait(lk, _version, timeout); } void CappedInsertNotifier::wait() const { stdx::unique_lock<stdx::mutex> lk(_mutex); _wait(lk, _version, Microseconds::max()); } void CappedInsertNotifier::kill() { stdx::lock_guard<stdx::mutex> lk(_mutex); _dead = true; _notifier.notify_all(); } bool CappedInsertNotifier::isDead() { stdx::lock_guard<stdx::mutex> lk(_mutex); return _dead; } // ---- Collection::Collection(OperationContext* txn, StringData fullNS, CollectionCatalogEntry* details, RecordStore* recordStore, DatabaseCatalogEntry* dbce) : _ns(fullNS), _details(details), _recordStore(recordStore), _dbce(dbce), _needCappedLock(supportsDocLocking() && _recordStore->isCapped() && _ns.db() != "local"), _infoCache(this), _indexCatalog(this), _validatorDoc(_details->getCollectionOptions(txn).validator.getOwned()), _validator(uassertStatusOK(parseValidator(_validatorDoc))), _validationAction(uassertStatusOK( _parseValidationAction(_details->getCollectionOptions(txn).validationAction))), _validationLevel(uassertStatusOK( _parseValidationLevel(_details->getCollectionOptions(txn).validationLevel))), _cursorManager(fullNS), _cappedNotifier(_recordStore->isCapped() ? 
new CappedInsertNotifier() : nullptr), _mustTakeCappedLockOnInsert(isCapped() && !_ns.isSystemDotProfile() && !_ns.isOplog()) { _magic = 1357924; _indexCatalog.init(txn); if (isCapped()) _recordStore->setCappedCallback(this); _infoCache.init(txn); } Collection::~Collection() { verify(ok()); _magic = 0; if (_cappedNotifier) { _cappedNotifier->kill(); } } bool Collection::requiresIdIndex() const { if (_ns.ns().find('$') != string::npos) { // no indexes on indexes return false; } if (_ns.isSystem()) { StringData shortName = _ns.coll().substr(_ns.coll().find('.') + 1); if (shortName == "indexes" || shortName == "namespaces" || shortName == "profile") { return false; } } if (_ns.db() == "local") { if (_ns.coll().startsWith("oplog.")) return false; } if (!_ns.isSystem()) { // non system collections definitely have an _id index return true; } return true; } std::unique_ptr<SeekableRecordCursor> Collection::getCursor(OperationContext* txn, bool forward) const { dassert(txn->lockState()->isCollectionLockedForMode(ns().toString(), MODE_IS)); invariant(ok()); return _recordStore->getCursor(txn, forward); } vector<std::unique_ptr<RecordCursor>> Collection::getManyCursors(OperationContext* txn) const { dassert(txn->lockState()->isCollectionLockedForMode(ns().toString(), MODE_IS)); return _recordStore->getManyCursors(txn); } Snapshotted<BSONObj> Collection::docFor(OperationContext* txn, const RecordId& loc) const { return Snapshotted<BSONObj>(txn->recoveryUnit()->getSnapshotId(), _recordStore->dataFor(txn, loc).releaseToBson()); } bool Collection::findDoc(OperationContext* txn, const RecordId& loc, Snapshotted<BSONObj>* out) const { dassert(txn->lockState()->isCollectionLockedForMode(ns().toString(), MODE_IS)); RecordData rd; if (!_recordStore->findRecord(txn, loc, &rd)) return false; *out = Snapshotted<BSONObj>(txn->recoveryUnit()->getSnapshotId(), rd.releaseToBson()); return true; } Status Collection::checkValidation(OperationContext* txn, const BSONObj& document) const { if (!_validator) return Status::OK(); if (_validationLevel == OFF) return Status::OK(); if (documentValidationDisabled(txn)) return Status::OK(); if (_validator->matchesBSON(document)) return Status::OK(); if (_validationAction == WARN) { warning() << "Document would fail validation" << " collection: " << ns() << " doc: " << document; return Status::OK(); } return {ErrorCodes::DocumentValidationFailure, "Document failed validation"}; } StatusWithMatchExpression Collection::parseValidator(const BSONObj& validator) const { if (validator.isEmpty()) return {nullptr}; if (ns().isSystem()) { return {ErrorCodes::InvalidOptions, "Document validators not allowed on system collections."}; } if (ns().isOnInternalDb()) { return {ErrorCodes::InvalidOptions, str::stream() << "Document validators are not allowed on collections in" << " the " << ns().db() << " database"}; } { auto status = checkValidatorForBannedExpressions(validator); if (!status.isOK()) return status; } auto statusWithMatcher = MatchExpressionParser::parse(validator, ExtensionsCallbackDisallowExtensions()); if (!statusWithMatcher.isOK()) return statusWithMatcher.getStatus(); return statusWithMatcher; } Status Collection::insertDocument(OperationContext* txn, const DocWriter* doc, bool enforceQuota) { invariant(!_validator || documentValidationDisabled(txn)); dassert(txn->lockState()->isCollectionLockedForMode(ns().toString(), MODE_IX)); invariant(!_indexCatalog.haveAnyIndexes()); // eventually can implement, just not done if (_mustTakeCappedLockOnInsert) 
synchronizeOnCappedInFlightResource(txn->lockState(), _ns); StatusWith<RecordId> loc = _recordStore->insertRecord(txn, doc, _enforceQuota(enforceQuota)); if (!loc.isOK()) return loc.getStatus(); // we cannot call into the OpObserver here because the document being written is not present // fortunately, this is currently only used for adding entries to the oplog. txn->recoveryUnit()->onCommit([this]() { notifyCappedWaitersIfNeeded(); }); return loc.getStatus(); } Status Collection::insertDocuments(OperationContext* txn, const vector<BSONObj>::const_iterator begin, const vector<BSONObj>::const_iterator end, bool enforceQuota, bool fromMigrate) { // Should really be done in the collection object at creation and updated on index create. const bool hasIdIndex = _indexCatalog.findIdIndex(txn); for (auto it = begin; it != end; it++) { if (hasIdIndex && (*it)["_id"].eoo()) { return Status(ErrorCodes::InternalError, str::stream() << "Collection::insertDocument got " "document without _id for ns:" << _ns.ns()); } auto status = checkValidation(txn, *it); if (!status.isOK()) return status; } const SnapshotId sid = txn->recoveryUnit()->getSnapshotId(); if (_mustTakeCappedLockOnInsert) synchronizeOnCappedInFlightResource(txn->lockState(), _ns); Status status = _insertDocuments(txn, begin, end, enforceQuota); if (!status.isOK()) return status; invariant(sid == txn->recoveryUnit()->getSnapshotId()); getGlobalServiceContext()->getOpObserver()->onInserts(txn, ns(), begin, end, fromMigrate); txn->recoveryUnit()->onCommit([this]() { notifyCappedWaitersIfNeeded(); }); return Status::OK(); } Status Collection::insertDocument(OperationContext* txn, const BSONObj& docToInsert, bool enforceQuota, bool fromMigrate) { vector<BSONObj> docs; docs.push_back(docToInsert); return insertDocuments(txn, docs.begin(), docs.end(), enforceQuota, fromMigrate); } Status Collection::insertDocument(OperationContext* txn, const BSONObj& doc, MultiIndexBlock* indexBlock, bool enforceQuota) { { auto status = checkValidation(txn, doc); if (!status.isOK()) return status; } dassert(txn->lockState()->isCollectionLockedForMode(ns().toString(), MODE_IX)); if (_mustTakeCappedLockOnInsert) synchronizeOnCappedInFlightResource(txn->lockState(), _ns); StatusWith<RecordId> loc = _recordStore->insertRecord(txn, doc.objdata(), doc.objsize(), _enforceQuota(enforceQuota)); if (!loc.isOK()) return loc.getStatus(); Status status = indexBlock->insert(doc, loc.getValue()); if (!status.isOK()) return status; vector<BSONObj> docs; docs.push_back(doc); getGlobalServiceContext()->getOpObserver()->onInserts(txn, ns(), docs.begin(), docs.end()); txn->recoveryUnit()->onCommit([this]() { notifyCappedWaitersIfNeeded(); }); return loc.getStatus(); } Status Collection::_insertDocuments(OperationContext* txn, const vector<BSONObj>::const_iterator begin, const vector<BSONObj>::const_iterator end, bool enforceQuota) { dassert(txn->lockState()->isCollectionLockedForMode(ns().toString(), MODE_IX)); if (isCapped() && _indexCatalog.haveAnyIndexes() && std::distance(begin, end) > 1) { // We require that inserts to indexed capped collections be done one-at-a-time to avoid the // possibility that a later document causes an earlier document to be deleted before it can // be indexed. // TODO SERVER-21512 It would be better to handle this here by just doing single inserts. 
return {ErrorCodes::OperationCannotBeBatched, "Can't batch inserts into indexed capped collections"}; } if (_needCappedLock) { // X-lock the metadata resource for this capped collection until the end of the WUOW. This // prevents the primary from executing with more concurrency than secondaries. // See SERVER-21646. Lock::ResourceLock heldUntilEndOfWUOW{txn->lockState(), ResourceId(RESOURCE_METADATA, _ns.ns()), MODE_X}; } std::vector<Record> records; for (auto it = begin; it != end; it++) { Record record = {RecordId(), RecordData(it->objdata(), it->objsize())}; records.push_back(record); } Status status = _recordStore->insertRecords(txn, &records, _enforceQuota(enforceQuota)); if (!status.isOK()) return status; std::vector<BsonRecord> bsonRecords; int recordIndex = 0; for (auto it = begin; it != end; it++) { RecordId loc = records[recordIndex++].id; invariant(RecordId::min() < loc); invariant(loc < RecordId::max()); BsonRecord bsonRecord = {loc, &(*it)}; bsonRecords.push_back(bsonRecord); } return _indexCatalog.indexRecords(txn, bsonRecords); } void Collection::notifyCappedWaitersIfNeeded() { // If there is a notifier object and another thread is waiting on it, then we notify // waiters of this document insert. Waiters keep a shared_ptr to '_cappedNotifier', so // there are waiters if this Collection's shared_ptr is not unique (use_count > 1). if (_cappedNotifier && !_cappedNotifier.unique()) _cappedNotifier->notifyAll(); } Status Collection::aboutToDeleteCapped(OperationContext* txn, const RecordId& loc, RecordData data) { /* check if any cursors point to us. if so, advance them. */ _cursorManager.invalidateDocument(txn, loc, INVALIDATION_DELETION); BSONObj doc = data.releaseToBson(); _indexCatalog.unindexRecord(txn, doc, loc, false); return Status::OK(); } void Collection::deleteDocument( OperationContext* txn, const RecordId& loc, bool fromMigrate, bool cappedOK, bool noWarn) { if (isCapped() && !cappedOK) { log() << "failing remove on a capped ns " << _ns << endl; uasserted(10089, "cannot remove from a capped collection"); return; } Snapshotted<BSONObj> doc = docFor(txn, loc); auto opObserver = getGlobalServiceContext()->getOpObserver(); OpObserver::DeleteState deleteState = opObserver->aboutToDelete(txn, ns(), doc.value()); /* check if any cursors point to us. if so, advance them. */ _cursorManager.invalidateDocument(txn, loc, INVALIDATION_DELETION); _indexCatalog.unindexRecord(txn, doc.value(), loc, noWarn); _recordStore->deleteRecord(txn, loc); opObserver->onDelete(txn, ns(), std::move(deleteState), fromMigrate); } Counter64 moveCounter; ServerStatusMetricField<Counter64> moveCounterDisplay("record.moves", &moveCounter); StatusWith<RecordId> Collection::updateDocument(OperationContext* txn, const RecordId& oldLocation, const Snapshotted<BSONObj>& oldDoc, const BSONObj& newDoc, bool enforceQuota, bool indexesAffected, OpDebug* debug, oplogUpdateEntryArgs& args) { { auto status = checkValidation(txn, newDoc); if (!status.isOK()) { if (_validationLevel == STRICT_V) { return status; } // moderate means we have to check the old doc auto oldDocStatus = checkValidation(txn, oldDoc.value()); if (oldDocStatus.isOK()) { // transitioning from good -> bad is not ok return status; } // bad -> bad is ok in moderate mode } } dassert(txn->lockState()->isCollectionLockedForMode(ns().toString(), MODE_IX)); invariant(oldDoc.snapshotId() == txn->recoveryUnit()->getSnapshotId()); if (_needCappedLock) { // X-lock the metadata resource for this capped collection until the end of the WUOW.
This // prevents the primary from executing with more concurrency than secondaries. // See SERVER-21646. Lock::ResourceLock heldUntilEndOfWUOW{txn->lockState(), ResourceId(RESOURCE_METADATA, _ns.ns()), MODE_X}; } SnapshotId sid = txn->recoveryUnit()->getSnapshotId(); BSONElement oldId = oldDoc.value()["_id"]; if (!oldId.eoo() && (oldId != newDoc["_id"])) return StatusWith<RecordId>( ErrorCodes::InternalError, "in Collection::updateDocument _id mismatch", 13596); // The MMAPv1 storage engine implements capped collections in a way that does not allow records // to grow beyond their original size. If MMAPv1 is part of a replica set with storage engines that // do not have this limitation, replication could result in errors, so it is necessary to set a // uniform rule here. Similarly, it is not sufficient to disallow growing records, because this // happens when secondaries roll back an update that shrunk a record. Exactly replicating legacy // MMAPv1 behavior would require padding shrunk documents on all storage engines. Instead, forbid // all size changes. const auto oldSize = oldDoc.value().objsize(); if (_recordStore->isCapped() && oldSize != newDoc.objsize()) return {ErrorCodes::CannotGrowDocumentInCappedNamespace, str::stream() << "Cannot change the size of a document in a capped collection: " << oldSize << " != " << newDoc.objsize()}; // At the end of this step, we will have a map of UpdateTickets, one per index, which // represent the index updates needed to be done, based on the changes between oldDoc and // newDoc. OwnedPointerMap<IndexDescriptor*, UpdateTicket> updateTickets; if (indexesAffected) { IndexCatalog::IndexIterator ii = _indexCatalog.getIndexIterator(txn, true); while (ii.more()) { IndexDescriptor* descriptor = ii.next(); IndexCatalogEntry* entry = ii.catalogEntry(descriptor); IndexAccessMethod* iam = ii.accessMethod(descriptor); InsertDeleteOptions options; options.logIfError = false; options.dupsAllowed = !(KeyPattern::isIdKeyPattern(descriptor->keyPattern()) || descriptor->unique()) || repl::getGlobalReplicationCoordinator()->shouldIgnoreUniqueIndex(descriptor); UpdateTicket* updateTicket = new UpdateTicket(); updateTickets.mutableMap()[descriptor] = updateTicket; Status ret = iam->validateUpdate(txn, oldDoc.value(), newDoc, oldLocation, options, updateTicket, entry->getFilterExpression()); if (!ret.isOK()) { return StatusWith<RecordId>(ret); } } } // This can call back into Collection::recordStoreGoingToMove. If that happens, the old // object is removed from all indexes. StatusWith<RecordId> newLocation = _recordStore->updateRecord( txn, oldLocation, newDoc.objdata(), newDoc.objsize(), _enforceQuota(enforceQuota), this); if (!newLocation.isOK()) { return newLocation; } // At this point, the old object may or may not still be indexed, depending on whether it was // moved. If the object did move, we need to add the new location to all indexes. if (newLocation.getValue() != oldLocation) { if (debug) { if (debug->nmoved == -1) // default of -1 rather than 0 debug->nmoved = 1; else debug->nmoved += 1; } std::vector<BsonRecord> bsonRecords; BsonRecord bsonRecord = {newLocation.getValue(), &newDoc}; bsonRecords.push_back(bsonRecord); Status s = _indexCatalog.indexRecords(txn, bsonRecords); if (!s.isOK()) return StatusWith<RecordId>(s); invariant(sid == txn->recoveryUnit()->getSnapshotId()); args.ns = ns().ns(); getGlobalServiceContext()->getOpObserver()->onUpdate(txn, args); return newLocation; } // Object did not move. We update each index with each respective UpdateTicket.
if (debug) debug->keyUpdates = 0; if (indexesAffected) { IndexCatalog::IndexIterator ii = _indexCatalog.getIndexIterator(txn, true); while (ii.more()) { IndexDescriptor* descriptor = ii.next(); IndexAccessMethod* iam = ii.accessMethod(descriptor); int64_t updatedKeys; Status ret = iam->update(txn, *updateTickets.mutableMap()[descriptor], &updatedKeys); if (!ret.isOK()) return StatusWith<RecordId>(ret); if (debug) debug->keyUpdates += updatedKeys; } } invariant(sid == txn->recoveryUnit()->getSnapshotId()); args.ns = ns().ns(); getGlobalServiceContext()->getOpObserver()->onUpdate(txn, args); return newLocation; } Status Collection::recordStoreGoingToMove(OperationContext* txn, const RecordId& oldLocation, const char* oldBuffer, size_t oldSize) { moveCounter.increment(); _cursorManager.invalidateDocument(txn, oldLocation, INVALIDATION_DELETION); _indexCatalog.unindexRecord(txn, BSONObj(oldBuffer), oldLocation, true); return Status::OK(); } Status Collection::recordStoreGoingToUpdateInPlace(OperationContext* txn, const RecordId& loc) { // Broadcast the mutation so that query results stay correct. _cursorManager.invalidateDocument(txn, loc, INVALIDATION_MUTATION); return Status::OK(); } bool Collection::updateWithDamagesSupported() const { if (_validator) return false; return _recordStore->updateWithDamagesSupported(); } StatusWith<RecordData> Collection::updateDocumentWithDamages( OperationContext* txn, const RecordId& loc, const Snapshotted<RecordData>& oldRec, const char* damageSource, const mutablebson::DamageVector& damages, oplogUpdateEntryArgs& args) { dassert(txn->lockState()->isCollectionLockedForMode(ns().toString(), MODE_IX)); invariant(oldRec.snapshotId() == txn->recoveryUnit()->getSnapshotId()); invariant(updateWithDamagesSupported()); // Broadcast the mutation so that query results stay correct. 
_cursorManager.invalidateDocument(txn, loc, INVALIDATION_MUTATION); auto newRecStatus = _recordStore->updateWithDamages(txn, loc, oldRec.value(), damageSource, damages); if (newRecStatus.isOK()) { args.ns = ns().ns(); getGlobalServiceContext()->getOpObserver()->onUpdate(txn, args); } return newRecStatus; } bool Collection::_enforceQuota(bool userEnforeQuota) const { if (!userEnforeQuota) return false; if (!mmapv1GlobalOptions.quota) return false; if (_ns.db() == "local") return false; if (_ns.isSpecial()) return false; return true; } bool Collection::isCapped() const { return _cappedNotifier.get(); } std::shared_ptr<CappedInsertNotifier> Collection::getCappedInsertNotifier() const { invariant(isCapped()); return _cappedNotifier; } uint64_t Collection::numRecords(OperationContext* txn) const { return _recordStore->numRecords(txn); } uint64_t Collection::dataSize(OperationContext* txn) const { return _recordStore->dataSize(txn); } uint64_t Collection::getIndexSize(OperationContext* opCtx, BSONObjBuilder* details, int scale) { IndexCatalog* idxCatalog = getIndexCatalog(); IndexCatalog::IndexIterator ii = idxCatalog->getIndexIterator(opCtx, true); uint64_t totalSize = 0; while (ii.more()) { IndexDescriptor* d = ii.next(); IndexAccessMethod* iam = idxCatalog->getIndex(d); long long ds = iam->getSpaceUsedBytes(opCtx); totalSize += ds; if (details) { details->appendNumber(d->indexName(), ds / scale); } } return totalSize; } /** * order will be: * 1) store index specs * 2) drop indexes * 3) truncate record store * 4) re-write indexes */ Status Collection::truncate(OperationContext* txn) { dassert(txn->lockState()->isCollectionLockedForMode(ns().toString(), MODE_X)); BackgroundOperation::assertNoBgOpInProgForNs(ns()); invariant(_indexCatalog.numIndexesInProgress(txn) == 0); // 1) store index specs vector<BSONObj> indexSpecs; { IndexCatalog::IndexIterator ii = _indexCatalog.getIndexIterator(txn, false); while (ii.more()) { const IndexDescriptor* idx = ii.next(); indexSpecs.push_back(idx->infoObj().getOwned()); } } // 2) drop indexes Status status = _indexCatalog.dropAllIndexes(txn, true); if (!status.isOK()) return status; _cursorManager.invalidateAll(false, "collection truncated"); // 3) truncate record store status = _recordStore->truncate(txn); if (!status.isOK()) return status; // 4) re-create indexes for (size_t i = 0; i < indexSpecs.size(); i++) { status = _indexCatalog.createIndexOnEmptyCollection(txn, indexSpecs[i]); if (!status.isOK()) return status; } return Status::OK(); } void Collection::temp_cappedTruncateAfter(OperationContext* txn, RecordId end, bool inclusive) { dassert(txn->lockState()->isCollectionLockedForMode(ns().toString(), MODE_IX)); invariant(isCapped()); BackgroundOperation::assertNoBgOpInProgForNs(ns()); invariant(_indexCatalog.numIndexesInProgress(txn) == 0); _cursorManager.invalidateAll(false, "capped collection truncated"); _recordStore->temp_cappedTruncateAfter(txn, end, inclusive); } Status Collection::setValidator(OperationContext* txn, BSONObj validatorDoc) { invariant(txn->lockState()->isCollectionLockedForMode(ns().toString(), MODE_X)); // Make owned early so that the parsed match expression refers to the owned object. 
if (!validatorDoc.isOwned()) validatorDoc = validatorDoc.getOwned(); auto statusWithMatcher = parseValidator(validatorDoc); if (!statusWithMatcher.isOK()) return statusWithMatcher.getStatus(); _details->updateValidator(txn, validatorDoc, getValidationLevel(), getValidationAction()); _validator = std::move(statusWithMatcher.getValue()); _validatorDoc = std::move(validatorDoc); return Status::OK(); } StatusWith<Collection::ValidationLevel> Collection::_parseValidationLevel(StringData newLevel) { if (newLevel == "") { // default return STRICT_V; } else if (newLevel == "off") { return OFF; } else if (newLevel == "moderate") { return MODERATE; } else if (newLevel == "strict") { return STRICT_V; } else { return Status(ErrorCodes::BadValue, str::stream() << "invalid validation level: " << newLevel); } } StatusWith<Collection::ValidationAction> Collection::_parseValidationAction(StringData newAction) { if (newAction == "") { // default return ERROR_V; } else if (newAction == "warn") { return WARN; } else if (newAction == "error") { return ERROR_V; } else { return Status(ErrorCodes::BadValue, str::stream() << "invalid validation action: " << newAction); } } StringData Collection::getValidationLevel() const { switch (_validationLevel) { case STRICT_V: return "strict"; case OFF: return "off"; case MODERATE: return "moderate"; } MONGO_UNREACHABLE; } StringData Collection::getValidationAction() const { switch (_validationAction) { case ERROR_V: return "error"; case WARN: return "warn"; } MONGO_UNREACHABLE; } Status Collection::setValidationLevel(OperationContext* txn, StringData newLevel) { invariant(txn->lockState()->isCollectionLockedForMode(ns().toString(), MODE_X)); StatusWith<ValidationLevel> status = _parseValidationLevel(newLevel); if (!status.isOK()) { return status.getStatus(); } _validationLevel = status.getValue(); _details->updateValidator(txn, _validatorDoc, getValidationLevel(), getValidationAction()); return Status::OK(); } Status Collection::setValidationAction(OperationContext* txn, StringData newAction) { invariant(txn->lockState()->isCollectionLockedForMode(ns().toString(), MODE_X)); StatusWith<ValidationAction> status = _parseValidationAction(newAction); if (!status.isOK()) { return status.getStatus(); } _validationAction = status.getValue(); _details->updateValidator(txn, _validatorDoc, getValidationLevel(), getValidationAction()); return Status::OK(); } namespace { class MyValidateAdaptor : public ValidateAdaptor { public: virtual ~MyValidateAdaptor() {} virtual Status validate(const RecordData& record, size_t* dataSize) { BSONObj obj = record.toBson(); const Status status = validateBSON(obj.objdata(), obj.objsize()); if (status.isOK()) *dataSize = obj.objsize(); return Status::OK(); } }; void validateIndexKeyCount(OperationContext* txn, const IndexDescriptor& idx, int64_t numIdxKeys, int64_t numRecs, ValidateResults* results) { if (idx.isIdIndex() && numIdxKeys != numRecs) { string err = str::stream() << "number of _id index entries (" << numIdxKeys << ") does not match the number of documents (" << numRecs << ")"; results->errors.push_back(err); results->valid = false; return; // Avoid failing the next two checks, they just add redundant/confusing messages } if (!idx.isMultikey(txn) && numIdxKeys > numRecs) { string err = str::stream() << "index " << idx.indexName() << " is not multi-key, but has more entries (" << numIdxKeys << ") than documents (" << numRecs << ")"; results->errors.push_back(err); results->valid = false; } // If an access method name is given, the index may be 
a full text, geo or special // index plugin with different semantics. if (!idx.isSparse() && !idx.isPartial() && idx.getAccessMethodName() == "" && numIdxKeys < numRecs) { string err = str::stream() << "index " << idx.indexName() << " is not sparse or partial, but has fewer entries (" << numIdxKeys << ") than documents (" << numRecs << ")"; results->errors.push_back(err); results->valid = false; } } } // namespace Status Collection::validate(OperationContext* txn, bool full, bool scanData, ValidateResults* results, BSONObjBuilder* output) { dassert(txn->lockState()->isCollectionLockedForMode(ns().toString(), MODE_IS)); MyValidateAdaptor adaptor; Status status = _recordStore->validate(txn, full, scanData, &adaptor, results, output); if (!status.isOK()) return status; { // indexes output->append("nIndexes", _indexCatalog.numIndexesReady(txn)); int idxn = 0; try { // Only applicable when 'full' validation is requested. std::unique_ptr<BSONObjBuilder> indexDetails(full ? new BSONObjBuilder() : NULL); BSONObjBuilder indexes; // not using subObjStart to be exception safe IndexCatalog::IndexIterator i = _indexCatalog.getIndexIterator(txn, false); while (i.more()) { const IndexDescriptor* descriptor = i.next(); log(LogComponent::kIndex) << "validating index " << descriptor->indexNamespace() << endl; IndexAccessMethod* iam = _indexCatalog.getIndex(descriptor); invariant(iam); std::unique_ptr<BSONObjBuilder> bob( indexDetails.get() ? new BSONObjBuilder(indexDetails->subobjStart( descriptor->indexNamespace())) : NULL); int64_t keys; iam->validate(txn, full, &keys, bob.get()); indexes.appendNumber(descriptor->indexNamespace(), static_cast<long long>(keys)); validateIndexKeyCount( txn, *descriptor, keys, _recordStore->numRecords(txn), results); if (bob) { BSONObj obj = bob->done(); BSONElement valid = obj["valid"]; if (valid.ok() && !valid.trueValue()) { results->valid = false; } } idxn++; } output->append("keysPerIndex", indexes.done()); if (indexDetails.get()) { output->append("indexDetails", indexDetails->done()); } } catch (DBException& exc) { string err = str::stream() << "exception during index validate idxn " << BSONObjBuilder::numStr(idxn) << ": " << exc.toString(); results->errors.push_back(err); results->valid = false; } } return Status::OK(); } Status Collection::touch(OperationContext* txn, bool touchData, bool touchIndexes, BSONObjBuilder* output) const { if (touchData) { BSONObjBuilder b; Status status = _recordStore->touch(txn, &b); if (!status.isOK()) return status; output->append("data", b.obj()); } if (touchIndexes) { Timer t; IndexCatalog::IndexIterator ii = _indexCatalog.getIndexIterator(txn, false); while (ii.more()) { const IndexDescriptor* desc = ii.next(); const IndexAccessMethod* iam = _indexCatalog.getIndex(desc); Status status = iam->touch(txn); if (!status.isOK()) return status; } output->append("indexes", BSON("num" << _indexCatalog.numIndexesTotal(txn) << "millis" << t.millis())); } return Status::OK(); } }
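// Collection::updateDocument above applies document validation asymmetrically: under the
// "strict" level any update producing an invalid document is rejected, while under "moderate"
// it is rejected only when the old document was valid, so documents that were already failing
// validation can still be updated. A minimal sketch of that rule; the bool parameters stand in
// for the real MatchExpression checks.
namespace validation_sketch {
enum class Level { kOff, kModerate, kStrict };

bool updateAllowedByValidation(Level level, bool oldDocValid, bool newDocValid) {
    if (level == Level::kOff || newDocValid)
        return true;  // validation disabled, or the new document passes anyway
    if (level == Level::kStrict)
        return false;  // strict: any invalid result is rejected
    // moderate: allow the update only if the old document already failed validation
    return !oldDocValid;
}
}  // namespace validation_sketch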
namespace mongo { // Enabling the maxTimeAlwaysTimeOut fail point will cause any query or command run with a valid // non-zero max time to fail immediately. Any getmore operation on a cursor already created // with a valid non-zero max time will also fail immediately. // // This fail point cannot be used with the maxTimeNeverTimeOut fail point. MONGO_FP_DECLARE(maxTimeAlwaysTimeOut); // Enabling the maxTimeNeverTimeOut fail point will cause the server to never time out any // query, command, or getmore operation, regardless of whether a max time is set. // // This fail point cannot be used with the maxTimeAlwaysTimeOut fail point. MONGO_FP_DECLARE(maxTimeNeverTimeOut); // todo : move more here CurOp::CurOp( Client * client , CurOp * wrapped ) : _client(client), _wrapped(wrapped) { if ( _wrapped ) _client->_curOp = this; _start = 0; _active = false; _reset(); _op = 0; _opNum = _nextOpNum.fetchAndAdd(1); _command = NULL; } void CurOp::_reset() { _suppressFromCurop = false; _isCommand = false; _dbprofile = 0; _end = 0; _maxTimeMicros = 0; _maxTimeTracker.reset(); _message = ""; _progressMeter.finished(); _killPending.store(0); _numYields = 0; _expectedLatencyMs = 0; _lockStat.reset(); } void CurOp::reset() { _reset(); _start = 0; _opNum = _nextOpNum.fetchAndAdd(1); _debug.reset(); _query.reset(); _active = true; // this should be last for ui clarity } void CurOp::reset( const HostAndPort& remote, int op ) { reset(); if( _remote != remote ) { // todo : _remote is not thread safe yet is used as such! _remote = remote; } _op = op; } ProgressMeter& CurOp::setMessage(const char * msg, std::string name, unsigned long long progressMeterTotal, int secondsBetween) { if ( progressMeterTotal ) { if ( _progressMeter.isActive() ) { cout << "about to assert, old _message: " << _message << " new message:" << msg << endl; verify( ! _progressMeter.isActive() ); } _progressMeter.reset( progressMeterTotal , secondsBetween ); _progressMeter.setName(name); } else { _progressMeter.finished(); } _message = msg; return _progressMeter; } CurOp::~CurOp() { if ( _wrapped ) { scoped_lock bl(Client::clientsMutex); _client->_curOp = _wrapped; } _client = 0; } void CurOp::setNS( const StringData& ns ) { // _ns copies the data in the null-terminated ptr it's given _ns = ns.toString().c_str(); } void CurOp::ensureStarted() { if ( _start == 0 ) { _start = curTimeMicros64(); // If ensureStarted() is invoked after setMaxTimeMicros(), then time limit tracking will // start here. This is because time limit tracking can only commence after the // operation is assigned a start time. if (_maxTimeMicros > 0) { _maxTimeTracker.setTimeLimit(_start, _maxTimeMicros); } } } void CurOp::enter( Client::Context * context ) { ensureStarted(); _ns = context->ns(); _dbprofile = std::max( context->_db ? context->_db->getProfilingLevel() : 0 , _dbprofile ); } void CurOp::recordGlobalTime(bool isWriteLocked, long long micros) const { string nsStr = _ns.toString(); Top::global.record(nsStr, _op, isWriteLocked ? 
1 : -1, micros, _isCommand); } void CurOp::reportState(BSONObjBuilder* builder) { builder->append("opid", _opNum); bool a = _active && _start; builder->append("active", a); if( a ) { builder->append("secs_running", elapsedSeconds() ); builder->append("microsecs_running", static_cast<long long int>(elapsedMicros()) ); } builder->append( "op" , opToString( _op ) ); builder->append("ns", _ns.toString()); if (_op == dbInsert) { _query.append(*builder, "insert"); } else { _query.append(*builder, "query"); } if ( !debug().planSummary.empty() ) { builder->append( "planSummary" , debug().planSummary.toString() ); } if( !_remote.empty() ) { builder->append("client", _remote.toString()); } if ( ! _message.empty() ) { if ( _progressMeter.isActive() ) { StringBuilder buf; buf << _message.toString() << " " << _progressMeter.toString(); builder->append( "msg" , buf.str() ); BSONObjBuilder sub( builder->subobjStart( "progress" ) ); sub.appendNumber( "done" , (long long)_progressMeter.done() ); sub.appendNumber( "total" , (long long)_progressMeter.total() ); sub.done(); } else { builder->append( "msg" , _message.toString() ); } } if( killPending() ) builder->append("killPending", true); builder->append( "numYields" , _numYields ); builder->append( "lockStats" , _lockStat.report() ); } BSONObj CurOp::description() { BSONObjBuilder bob; bool a = _active && _start; bob.append("active", a); bob.append( "op" , opToString( _op ) ); bob.append("ns", _ns.toString()); if (_op == dbInsert) { _query.append(bob, "insert"); } else { _query.append(bob, "query"); } if( killPending() ) bob.append("killPending", true); return bob.obj(); } void CurOp::kill() { _killPending.store(1); } void CurOp::setMaxTimeMicros(uint64_t maxTimeMicros) { _maxTimeMicros = maxTimeMicros; if (_maxTimeMicros == 0) { // 0 is "allow to run indefinitely". return; } // If the operation has a start time, then enable the tracker. // // If the operation has no start time yet, then ensureStarted() will take responsibility for // enabling the tracker. 
if (isStarted()) { _maxTimeTracker.setTimeLimit(startTime(), _maxTimeMicros); } } bool CurOp::maxTimeHasExpired() { if (MONGO_FAIL_POINT(maxTimeNeverTimeOut)) { return false; } if (_maxTimeMicros > 0 && MONGO_FAIL_POINT(maxTimeAlwaysTimeOut)) { return true; } return _maxTimeTracker.checkTimeLimit(); } uint64_t CurOp::getRemainingMaxTimeMicros() const { return _maxTimeTracker.getRemainingMicros(); } AtomicUInt32 CurOp::_nextOpNum; static Counter64 returnedCounter; static Counter64 insertedCounter; static Counter64 updatedCounter; static Counter64 deletedCounter; static Counter64 scannedCounter; static Counter64 scannedObjectCounter; static ServerStatusMetricField<Counter64> displayReturned( "document.returned", &returnedCounter ); static ServerStatusMetricField<Counter64> displayUpdated( "document.updated", &updatedCounter ); static ServerStatusMetricField<Counter64> displayInserted( "document.inserted", &insertedCounter ); static ServerStatusMetricField<Counter64> displayDeleted( "document.deleted", &deletedCounter ); static ServerStatusMetricField<Counter64> displayScanned( "queryExecutor.scanned", &scannedCounter ); static ServerStatusMetricField<Counter64> displayScannedObjects( "queryExecutor.scannedObjects", &scannedObjectCounter ); static Counter64 idhackCounter; static Counter64 scanAndOrderCounter; static Counter64 fastmodCounter; static ServerStatusMetricField<Counter64> displayIdhack( "operation.idhack", &idhackCounter ); static ServerStatusMetricField<Counter64> displayScanAndOrder( "operation.scanAndOrder", &scanAndOrderCounter ); static ServerStatusMetricField<Counter64> displayFastMod( "operation.fastmod", &fastmodCounter ); void OpDebug::recordStats() { if ( nreturned > 0 ) returnedCounter.increment( nreturned ); if ( ninserted > 0 ) insertedCounter.increment( ninserted ); if ( nMatched > 0 ) updatedCounter.increment( nMatched ); if ( ndeleted > 0 ) deletedCounter.increment( ndeleted ); if ( nscanned > 0 ) scannedCounter.increment( nscanned ); if ( nscannedObjects > 0 ) scannedObjectCounter.increment( nscannedObjects ); if ( idhack ) idhackCounter.increment(); if ( scanAndOrder ) scanAndOrderCounter.increment(); if ( fastmod ) fastmodCounter.increment(); } CurOp::MaxTimeTracker::MaxTimeTracker() { reset(); } void CurOp::MaxTimeTracker::reset() { _enabled = false; _targetEpochMicros = 0; _approxTargetServerMillis = 0; } void CurOp::MaxTimeTracker::setTimeLimit(uint64_t startEpochMicros, uint64_t durationMicros) { dassert(durationMicros != 0); _enabled = true; _targetEpochMicros = startEpochMicros + durationMicros; uint64_t now = curTimeMicros64(); // If our accurate time source thinks time is not up yet, calculate the next target for // our approximate time source. if (_targetEpochMicros > now) { _approxTargetServerMillis = Listener::getElapsedTimeMillis() + static_cast<int64_t>((_targetEpochMicros - now) / 1000); } // Otherwise, set our approximate time source target such that it thinks time is already // up. else { _approxTargetServerMillis = Listener::getElapsedTimeMillis(); } } bool CurOp::MaxTimeTracker::checkTimeLimit() { if (!_enabled) { return false; } // Does our approximate time source think time is not up yet? If so, return early. if (_approxTargetServerMillis > Listener::getElapsedTimeMillis()) { return false; } uint64_t now = curTimeMicros64(); // Does our accurate time source think time is not up yet? If so, readjust the target for // our approximate time source and return early. 
if (_targetEpochMicros > now) { _approxTargetServerMillis = Listener::getElapsedTimeMillis() + static_cast<int64_t>((_targetEpochMicros - now) / 1000); return false; } // Otherwise, time is up. return true; } uint64_t CurOp::MaxTimeTracker::getRemainingMicros() const { if (!_enabled) { // 0 is "allow to run indefinitely". return 0; } // Does our accurate time source think time is up? If so, claim there is 1 microsecond // left for this operation. uint64_t now = curTimeMicros64(); if (_targetEpochMicros <= now) { return 1; } // Otherwise, calculate remaining time. return _targetEpochMicros - now; } }
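// CurOp::MaxTimeTracker above keeps two deadlines: a precise one in microseconds
// (curTimeMicros64) and an approximate one in milliseconds (Listener::getElapsedTimeMillis), so
// the cheap clock is consulted first and the expensive one only when the cheap one says time may
// be up. The class below is a minimal self-contained sketch of that policy with the clock
// readings passed in explicitly; it is illustrative, not the real tracker.
#include <cstdint>

class SimpleMaxTimeTracker {
public:
    void setTimeLimit(uint64_t startEpochMicros,
                      uint64_t durationMicros,
                      uint64_t nowMicros,
                      int64_t nowApproxMillis) {
        _enabled = true;
        _targetEpochMicros = startEpochMicros + durationMicros;
        // Derive the cheap target from the precise one, or mark it as already expired.
        _approxTargetMillis = _targetEpochMicros > nowMicros
            ? nowApproxMillis + static_cast<int64_t>((_targetEpochMicros - nowMicros) / 1000)
            : nowApproxMillis;
    }

    bool checkTimeLimit(uint64_t nowMicros, int64_t nowApproxMillis) {
        if (!_enabled)
            return false;
        if (_approxTargetMillis > nowApproxMillis)
            return false;  // the cheap clock says we still have time
        if (_targetEpochMicros > nowMicros) {
            // The precise clock disagrees; push the cheap target forward and keep running.
            _approxTargetMillis =
                nowApproxMillis + static_cast<int64_t>((_targetEpochMicros - nowMicros) / 1000);
            return false;
        }
        return true;  // both clocks agree: the operation is out of time
    }

private:
    bool _enabled = false;
    uint64_t _targetEpochMicros = 0;
    int64_t _approxTargetMillis = 0;
};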