ClientCursor::ClientCursor(ClientCursorParams params,
                           CursorManager* cursorManager,
                           CursorId cursorId,
                           OperationContext* operationUsingCursor,
                           Date_t now)
    : _cursorid(cursorId),
      _nss(std::move(params.nss)),
      _authenticatedUsers(std::move(params.authenticatedUsers)),
      _lsid(operationUsingCursor->getLogicalSessionId()),
      _txnNumber(operationUsingCursor->getTxnNumber()),
      _readConcernLevel(params.readConcernLevel),
      _cursorManager(cursorManager),
      _originatingCommand(params.originatingCommandObj),
      _queryOptions(params.queryOptions),
      _exec(std::move(params.exec)),
      _operationUsingCursor(operationUsingCursor),
      _lastUseDate(now) {
    invariant(_cursorManager);
    invariant(_exec);
    invariant(_operationUsingCursor);

    cursorStatsOpen.increment();
    if (isNoTimeout()) {
        // cursors normally timeout after an inactivity period to prevent excess memory use
        // setting this prevents timeout of the cursor in question.
        cursorStatsOpenNoTimeout.increment();
    }
}
Status waitForWriteConcern(OperationContext* txn,
                           const OpTime& replOpTime,
                           const WriteConcernOptions& writeConcern,
                           WriteConcernResult* result) {
    // We assume all options have been validated earlier, if not, programming error
    dassert(validateWriteConcern(writeConcern).isOK());

    // Next handle blocking on disk
    Timer syncTimer;

    switch (writeConcern.syncMode) {
        case WriteConcernOptions::NONE:
            break;
        case WriteConcernOptions::FSYNC: {
            StorageEngine* storageEngine = getGlobalServiceContext()->getGlobalStorageEngine();
            if (!storageEngine->isDurable()) {
                result->fsyncFiles = storageEngine->flushAllFiles(true);
            } else {
                // We only need to commit the journal if we're durable
                txn->recoveryUnit()->waitUntilDurable();
            }
            break;
        }
        case WriteConcernOptions::JOURNAL:
            txn->recoveryUnit()->waitUntilDurable();
            break;
    }

    result->syncMillis = syncTimer.millis();

    // Now wait for replication

    if (replOpTime.isNull()) {
        // no write happened for this client yet
        return Status::OK();
    }

    // needed to avoid incrementing gleWtimeStats SERVER-9005
    if (writeConcern.wNumNodes <= 1 && writeConcern.wMode.empty()) {
        // no desired replication check
        return Status::OK();
    }

    // Now we wait for replication
    // Note that replica set stepdowns and gle mode changes are thrown as errors
    repl::ReplicationCoordinator::StatusAndDuration replStatus =
        repl::getGlobalReplicationCoordinator()->awaitReplication(txn, replOpTime, writeConcern);
    if (replStatus.status == ErrorCodes::WriteConcernFailed) {
        gleWtimeouts.increment();
        result->err = "timeout";
        result->wTimedOut = true;
    }

    // Add stats
    result->writtenTo = repl::getGlobalReplicationCoordinator()->getHostsWrittenTo(replOpTime);
    gleWtimeStats.recordMillis(durationCount<Milliseconds>(replStatus.duration));
    result->wTime = durationCount<Milliseconds>(replStatus.duration);

    return replStatus.status;
}
void CursorCache::storeRef(const std::string& server, long long id, const std::string& ns) {
    LOG(_myLogLevel) << "CursorCache::storeRef server: " << server << " id: " << id << endl;
    verify(id);
    stdx::lock_guard<stdx::mutex> lk(_mutex);
    _refs[id] = server;
    _refsNS[id] = ns;
    cursorStatsSingleTarget.increment();
}
Status Collection::recordStoreGoingToMove(OperationContext* txn,
                                          const RecordId& oldLocation,
                                          const char* oldBuffer,
                                          size_t oldSize) {
    moveCounter.increment();
    _cursorManager.invalidateDocument(txn, oldLocation, INVALIDATION_DELETION);
    _indexCatalog.unindexRecord(txn, BSONObj(oldBuffer), oldLocation, true);
    return Status::OK();
}
OplogReader::OplogReader() {
    _tailingQueryOptions = QueryOption_SlaveOk;
    _tailingQueryOptions |= QueryOption_CursorTailable | QueryOption_OplogReplay;

    /* TODO: slaveOk maybe shouldn't use? */
    _tailingQueryOptions |= QueryOption_AwaitData;

    readersCreatedStats.increment();
}
ClientCursorPin::ClientCursorPin(OperationContext* opCtx, ClientCursor* cursor)
    : _opCtx(opCtx), _cursor(cursor) {
    invariant(_cursor);
    invariant(_cursor->_operationUsingCursor);
    invariant(_cursor->_cursorManager);
    invariant(!_cursor->_disposed);

    // We keep track of the number of cursors currently pinned. The cursor can become unpinned
    // either by being released back to the cursor manager or by being deleted. A cursor may be
    // transferred to another pin object via move construction or move assignment, but in this case
    // it is still considered pinned.
    cursorStatsOpenPinned.increment();
}
void OpDebug::recordStats() {
    if (nreturned > 0)
        returnedCounter.increment(nreturned);
    if (ninserted > 0)
        insertedCounter.increment(ninserted);
    if (nMatched > 0)
        updatedCounter.increment(nMatched);
    if (ndeleted > 0)
        deletedCounter.increment(ndeleted);
    if (nscanned > 0)
        scannedCounter.increment(nscanned);
    if (nscannedObjects > 0)
        scannedObjectCounter.increment(nscannedObjects);

    if (idhack)
        idhackCounter.increment();
    if (scanAndOrder)
        scanAndOrderCounter.increment();
    if (fastmod)
        fastmodCounter.increment();
    if (writeConflicts)
        writeConflictsCounter.increment(writeConflicts);
}
void OpDebug::recordStats() {
    if (nreturned > 0)
        returnedCounter.increment(nreturned);
    if (ninserted > 0)
        insertedCounter.increment(ninserted);
    if (nupdated > 0)
        updatedCounter.increment(nupdated);
    if (ndeleted > 0)
        deletedCounter.increment(ndeleted);
    if (nscanned > 0)
        scannedCounter.increment(nscanned);

    if (idhack)
        idhackCounter.increment();
    if (scanAndOrder)
        scanAndOrderCounter.increment();
    if (fastmod)
        fastmodCounter.increment();
}
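// The counters used by recordStats() above (returnedCounter, insertedCounter, and so on) are not
// declared in these snippets. Below is a minimal sketch of how such counters are typically
// declared and exposed under serverStatus metrics, assuming the Counter64 and
// ServerStatusMetricField helpers and header paths from the same codebase; the metric names here
// are illustrative, not taken from the snippets above.
#include "mongo/base/counter.h"
#include "mongo/db/commands/server_status_metric.h"

static Counter64 returnedCounter;
static Counter64 insertedCounter;

// Registering a metric field makes the counter appear in serverStatus output.
static ServerStatusMetricField<Counter64> displayReturned("document.returned", &returnedCounter);
static ServerStatusMetricField<Counter64> displayInserted("document.inserted", &insertedCounter);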
ShardedClientCursor::ShardedClientCursor(QueryMessage& q, ParallelSortClusteredCursor* cursor) {
    verify(cursor);
    _cursor = cursor;

    _skip = q.ntoskip;
    _ntoreturn = q.ntoreturn;

    _totalSent = 0;
    _done = false;

    _id = 0;

    if (q.queryOptions & QueryOption_NoCursorTimeout) {
        _lastAccessMillis = 0;
    } else
        _lastAccessMillis = Listener::getElapsedTimeMillis();

    cursorStatsMultiTarget.increment();
}
virtual void run() {
    Client::initThread(name().c_str());

    while (!inShutdown()) {
        sleepsecs(60);

        LOG(3) << "TTLMonitor thread awake" << endl;

        if (lockedForWriting()) {
            // note: this is not perfect as you can go into fsync+lock between
            // this and actually doing the delete later
            LOG(3) << " locked for writing" << endl;
            continue;
        }

        // if part of replSet but not in a readable state (e.g. during initial sync), skip.
        if (theReplSet && !theReplSet->state().readable())
            continue;

        set<string> dbs;
        {
            Lock::DBRead lk("local");
            dbHolder().getAllShortNames(dbs);
        }

        ttlPasses.increment();

        for (set<string>::const_iterator i = dbs.begin(); i != dbs.end(); ++i) {
            string db = *i;
            try {
                doTTLForDB(db);
            } catch (DBException& e) {
                error() << "error processing ttl for db: " << db << " " << e << endl;
            }
        }
    }
}
/* apply the log op that is in param op
   @return bool success (true) or failure (false)
*/
bool SyncTail::syncApply(OperationContext* txn, const BSONObj& op, bool convertUpdateToUpsert) {
    const char* ns = op.getStringField("ns");
    verify(ns);

    if ((*ns == '\0') || (*ns == '.')) {
        // this is ugly
        // this is often a no-op
        // but can't be 100% sure
        if (*op.getStringField("op") != 'n') {
            error() << "replSet skipping bad op in oplog: " << op.toString() << rsLog;
        }
        return true;
    }

    bool isCommand(op["op"].valuestrsafe()[0] == 'c');

    boost::scoped_ptr<Lock::ScopedLock> lk;

    if (isCommand) {
        // a command may need a global write lock. so we will conservatively go
        // ahead and grab one here. suboptimal. :-(
        lk.reset(new Lock::GlobalWrite(txn->lockState()));
    } else {
        // DB level lock for this operation
        lk.reset(new Lock::DBWrite(txn->lockState(), ns));
    }

    Client::Context ctx(ns);
    ctx.getClient()->curop()->reset();

    // For non-initial-sync, we convert updates to upserts
    // to suppress errors when replaying oplog entries.
    bool ok = !applyOperation_inlock(txn, ctx.db(), op, true, convertUpdateToUpsert);
    opsAppliedStats.increment();
    txn->recoveryUnit()->commitIfNeeded();

    return ok;
}
void doTTLForDB(const string& dbName) {
    // check isMaster before becoming god
    bool isMaster = isMasterNs(dbName.c_str());

    Client::GodScope god;

    vector<BSONObj> indexes;
    {
        auto_ptr<DBClientCursor> cursor =
            db.query(dbName + ".system.indexes",
                     BSON(secondsExpireField << BSON("$exists" << true)),
                     0,                      /* default nToReturn */
                     0,                      /* default nToSkip */
                     0,                      /* default fieldsToReturn */
                     QueryOption_SlaveOk);   /* perform on secondaries too */
        if (cursor.get()) {
            while (cursor->more()) {
                indexes.push_back(cursor->next().getOwned());
            }
        }
    }

    for (unsigned i = 0; i < indexes.size(); i++) {
        BSONObj idx = indexes[i];

        BSONObj key = idx["key"].Obj();
        if (key.nFields() != 1) {
            error() << "key for ttl index can only have 1 field" << endl;
            continue;
        }

        BSONObj query;
        {
            BSONObjBuilder b;
            b.appendDate("$lt",
                         curTimeMillis64() - (1000 * idx[secondsExpireField].numberLong()));
            query = BSON(key.firstElement().fieldName() << b.obj());
        }

        LOG(1) << "TTL: " << key << " \t " << query << endl;

        long long n = 0;
        {
            string ns = idx["ns"].String();
            Client::WriteContext ctx(ns);
            NamespaceDetails* nsd = nsdetails(ns);
            if (!nsd) {
                // collection was dropped
                continue;
            }
            if (nsd->setUserFlag(NamespaceDetails::Flag_UsePowerOf2Sizes)) {
                nsd->syncUserFlags(ns);
            }
            // only do deletes if on master
            if (!isMaster) {
                continue;
            }

            n = deleteObjects(ns.c_str(), query, false, true);
            ttlDeletedDocuments.increment(n);
        }

        LOG(1) << "\tTTL deleted: " << n << endl;
    }
}
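// The TTL pass above builds its delete query as { <indexedField>: { $lt: now - expireAfterSeconds } },
// with the cutoff computed in milliseconds. A standalone sketch of just that cutoff arithmetic,
// assuming nothing beyond the standard library; the names are illustrative.
#include <cstdint>

std::uint64_t ttlCutoffMillis(std::uint64_t nowMillis, std::int64_t expireAfterSeconds) {
    // Documents whose indexed date is older than (now - expireAfterSeconds) are eligible
    // for deletion, matching the $lt comparison built in doTTLForDB().
    return nowMillis - static_cast<std::uint64_t>(expireAfterSeconds) * 1000;
}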
Status waitForWriteConcern(OperationContext* txn,
                           const OpTime& replOpTime,
                           const WriteConcernOptions& writeConcern,
                           WriteConcernResult* result) {
    LOG(2) << "Waiting for write concern. OpTime: " << replOpTime
           << ", write concern: " << writeConcern.toBSON();
    auto replCoord = repl::ReplicationCoordinator::get(txn);

    MONGO_FAIL_POINT_PAUSE_WHILE_SET(hangBeforeWaitingForWriteConcern);

    // Next handle blocking on disk
    Timer syncTimer;
    WriteConcernOptions writeConcernWithPopulatedSyncMode =
        replCoord->populateUnsetWriteConcernOptionsSyncMode(writeConcern);

    switch (writeConcernWithPopulatedSyncMode.syncMode) {
        case WriteConcernOptions::SyncMode::UNSET:
            severe() << "Attempting to wait on a WriteConcern with an unset sync option";
            fassertFailed(34410);
        case WriteConcernOptions::SyncMode::NONE:
            break;
        case WriteConcernOptions::SyncMode::FSYNC: {
            StorageEngine* storageEngine = getGlobalServiceContext()->getGlobalStorageEngine();
            if (!storageEngine->isDurable()) {
                result->fsyncFiles = storageEngine->flushAllFiles(true);
            } else {
                // We only need to commit the journal if we're durable
                txn->recoveryUnit()->waitUntilDurable();
            }
            break;
        }
        case WriteConcernOptions::SyncMode::JOURNAL:
            if (replCoord->getReplicationMode() != repl::ReplicationCoordinator::Mode::modeNone) {
                // Wait for ops to become durable then update replication system's
                // knowledge of this.
                OpTime appliedOpTime = replCoord->getMyLastAppliedOpTime();
                txn->recoveryUnit()->waitUntilDurable();
                replCoord->setMyLastDurableOpTimeForward(appliedOpTime);
            } else {
                txn->recoveryUnit()->waitUntilDurable();
            }
            break;
    }

    result->syncMillis = syncTimer.millis();

    // Now wait for replication

    if (replOpTime.isNull()) {
        // no write happened for this client yet
        return Status::OK();
    }

    // needed to avoid incrementing gleWtimeStats SERVER-9005
    if (writeConcernWithPopulatedSyncMode.wNumNodes <= 1 &&
        writeConcernWithPopulatedSyncMode.wMode.empty()) {
        // no desired replication check
        return Status::OK();
    }

    // Replica set stepdowns and gle mode changes are thrown as errors
    repl::ReplicationCoordinator::StatusAndDuration replStatus =
        replCoord->awaitReplication(txn, replOpTime, writeConcernWithPopulatedSyncMode);
    if (replStatus.status == ErrorCodes::WriteConcernFailed) {
        gleWtimeouts.increment();
        result->err = "timeout";
        result->wTimedOut = true;
    }

    // Add stats
    result->writtenTo = repl::getGlobalReplicationCoordinator()->getHostsWrittenTo(
        replOpTime,
        writeConcernWithPopulatedSyncMode.syncMode == WriteConcernOptions::SyncMode::JOURNAL);
    gleWtimeStats.recordMillis(durationCount<Milliseconds>(replStatus.duration));
    result->wTime = durationCount<Milliseconds>(replStatus.duration);

    return replStatus.status;
}
// returns number of seconds to sleep, if any
uint32_t BackgroundSync::produce() {
    // normally msgCheckNewState gets called periodically, but in a single node repl set
    // there are no heartbeat threads, so we do it here to be sure. this is relevant if the
    // singleton member has done a stepDown() and needs to come back up.
    if (theReplSet->config().members.size() == 1 && theReplSet->myConfig().potentiallyHot()) {
        Manager* mgr = theReplSet->mgr;
        // When would mgr be null? During replsettest'ing, in which case we should
        // fall through and actually apply ops as if we were a real secondary.
        if (mgr) {
            mgr->send(boost::bind(&Manager::msgCheckNewState, theReplSet->mgr));
            // There should never be ops to sync in a 1-member set, anyway
            return 1;
        }
    }

    OplogReader r(true /* doHandshake */);

    // find a target to sync from the last op time written
    getOplogReader(r);

    // no server found
    GTID lastGTIDFetched = theReplSet->gtidManager->getLiveState();
    {
        boost::unique_lock<boost::mutex> lock(_mutex);

        if (_currentSyncTarget == NULL) {
            // if there is no one to sync from
            return 1;  // sleep one second
        }
    }
    r.tailingQueryGTE(rsoplog, lastGTIDFetched);

    // if target cut connections between connecting and querying (for
    // example, because it stepped down) we might not have a cursor
    if (!r.haveCursor()) {
        return 0;
    }

    try {
        // this method may actually run rollback, yes, the name is bad
        if (isRollbackRequired(r)) {
            // sleep 2 seconds and try again. (The 2 is arbitrary).
            // If we are not fatal, then we will keep trying to sync
            // from another machine
            return 2;
        }
    } catch (RollbackOplogException& re) {
        // we attempted a rollback and failed, we must go fatal.
        log() << "Caught a RollbackOplogException during rollback, going fatal" << rsLog;
        theReplSet->fatal();
        return 2;  // 2 is arbitrary, if we are going fatal, we are done
    }

    while (!_opSyncShouldExit) {
        while (!_opSyncShouldExit) {
            {
                // check if we should bail out
                boost::unique_lock<boost::mutex> lck(_mutex);
                if (!_opSyncShouldRun) {
                    return 0;
                }
            }
            if (!r.moreInCurrentBatch()) {
                // check to see if we have a request to sync
                // from a specific target. If so, get out so that
                // we can restart the act of syncing and
                // do so from the correct target
                if (theReplSet->gotForceSync()) {
                    return 0;
                }

                verify(!theReplSet->isPrimary());

                if (shouldChangeSyncTarget()) {
                    return 0;
                }
                // record time for each getmore
                {
                    TimerHolder batchTimer(&getmoreReplStats);
                    r.more();
                }
                // increment
                networkByteStats.increment(r.currentBatchMessageSize());
            }

            if (!r.more()) {
                break;
            }

            // This is the operation we have received from the target
            // that we must put in our oplog with an applied field of false
            BSONObj o = r.nextSafe().getOwned();
            opsReadStats.increment();
            LOG(3) << "replicating " << o.toString(false, true) << " from "
                   << _currentSyncTarget->fullName() << endl;
            uint64_t ts = o["ts"]._numberLong();

            // now that we have the element in o, let's check
            // if a delay is required (via slaveDelay) before
            // writing it to the oplog
            if (theReplSet->myConfig().slaveDelay > 0) {
                handleSlaveDelay(ts);
                {
                    boost::unique_lock<boost::mutex> lck(_mutex);
                    if (!_opSyncShouldRun) {
                        break;
                    }
                }
            }

            {
                Timer timer;
                bool bigTxn = false;
                {
                    Client::Transaction transaction(DB_SERIALIZABLE);
                    replicateFullTransactionToOplog(o, r, &bigTxn);
                    // we are operating as a secondary. We don't have to fsync
                    transaction.commit(DB_TXN_NOSYNC);
                }
                {
                    GTID currEntry = getGTIDFromOplogEntry(o);
                    uint64_t lastHash = o["h"].numberLong();
                    boost::unique_lock<boost::mutex> lock(_mutex);
                    // update counters
                    theReplSet->gtidManager->noteGTIDAdded(currEntry, ts, lastHash);
                    // notify applier thread that data exists
                    if (_deque.size() == 0) {
                        _queueCond.notify_all();
                    }
                    _deque.push_back(o);
                    bufferCountGauge.increment();
                    bufferSizeGauge.increment(o.objsize());
                    // this is a flow control mechanism, with bad numbers
                    // hard coded for now just to get something going.
                    // If the opSync thread notices that we have over 20000
                    // transactions in the queue, it waits until we get below
                    // 10000. This is where we wait if we get too high
                    // Once we have spilling of transactions working, this
                    // logic will need to be redone
                    if (_deque.size() > 20000) {
                        _queueCond.wait(lock);
                    }
                    if (bigTxn) {
                        // if we have a large transaction, we don't want
                        // to let it pile up. We want to process it immediately
                        // before processing anything else.
                        while (_deque.size() > 0) {
                            _queueDone.wait(lock);
                        }
                    }
                }
            }
        }  // end while

        if (shouldChangeSyncTarget()) {
            return 0;
        }

        r.tailCheck();
        if (!r.haveCursor()) {
            LOG(1) << "replSet end opSync pass" << rsLog;
            return 0;
        }

        // looping back is ok because this is a tailable cursor
    }
    return 0;
}
void BackgroundSync::applyOpsFromOplog() {
    GTID lastLiveGTID;
    GTID lastUnappliedGTID;
    while (1) {
        try {
            BSONObj curr;
            {
                boost::unique_lock<boost::mutex> lck(_mutex);
                // wait until we know an item has been produced
                while (_deque.size() == 0 && !_applierShouldExit) {
                    _queueDone.notify_all();
                    _queueCond.wait(lck);
                }
                if (_deque.size() == 0 && _applierShouldExit) {
                    return;
                }
                curr = _deque.front();
            }
            GTID currEntry = getGTIDFromOplogEntry(curr);
            theReplSet->gtidManager->noteApplyingGTID(currEntry);
            // we must do applyTransactionFromOplog in a loop
            // because once we have called noteApplyingGTID, we must
            // continue until we are successful in applying the transaction.
            for (uint32_t numTries = 0; numTries <= 100; numTries++) {
                try {
                    numTries++;
                    TimerHolder timer(&applyBatchStats);
                    applyTransactionFromOplog(curr);
                    opsAppliedStats.increment();
                    break;
                } catch (std::exception& e) {
                    log() << "exception during applying transaction from oplog: " << e.what()
                          << endl;
                    log() << "oplog entry: " << curr.str() << endl;
                    if (numTries == 100) {
                        // something is really wrong if we fail 100 times, let's abort
                        ::abort();
                    }
                    sleepsecs(1);
                }
            }
            LOG(3) << "applied " << curr.toString(false, true) << endl;
            theReplSet->gtidManager->noteGTIDApplied(currEntry);

            {
                boost::unique_lock<boost::mutex> lck(_mutex);
                dassert(_deque.size() > 0);
                _deque.pop_front();
                bufferCountGauge.increment(-1);
                bufferSizeGauge.increment(-curr.objsize());
                // this is a flow control mechanism, with bad numbers
                // hard coded for now just to get something going.
                // If the opSync thread notices that we have over 20000
                // transactions in the queue, it waits until we get below
                // 10000. This is where we signal that we have gotten there
                // Once we have spilling of transactions working, this
                // logic will need to be redone
                if (_deque.size() == 10000) {
                    _queueCond.notify_all();
                }
            }
        } catch (DBException& e) {
            sethbmsg(str::stream() << "db exception in producer on applier thread: "
                                   << e.toString());
            sleepsecs(2);
        } catch (std::exception& e2) {
            sethbmsg(str::stream() << "exception in producer on applier thread: " << e2.what());
            sleepsecs(2);
        }
    }
}
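// The producer/consumer pair above (produce() and applyOpsFromOplog()) throttles the opSync
// thread with two hard-coded watermarks: the producer blocks once the deque holds more than
// 20000 entries and is woken again when the applier drains it back down to 10000. Below is a
// self-contained sketch of that high/low watermark idea using only the standard library; the
// class name, element type, and limits are illustrative, and the exact-threshold signaling
// mirrors the original code rather than being a general-purpose design.
#include <condition_variable>
#include <deque>
#include <mutex>

class BoundedOplogQueue {
public:
    void push(int op) {
        std::unique_lock<std::mutex> lk(_mutex);
        _deque.push_back(op);
        _cond.notify_all();                       // wake a waiting consumer
        while (_deque.size() > kHighWatermark) {  // flow control: stall the producer
            _cond.wait(lk);
        }
    }

    int pop() {
        std::unique_lock<std::mutex> lk(_mutex);
        while (_deque.empty()) {
            _cond.wait(lk);
        }
        int op = _deque.front();
        _deque.pop_front();
        if (_deque.size() == kLowWatermark) {  // let a stalled producer resume
            _cond.notify_all();
        }
        return op;
    }

private:
    static const std::size_t kHighWatermark = 20000;
    static const std::size_t kLowWatermark = 10000;

    std::mutex _mutex;
    std::condition_variable _cond;
    std::deque<int> _deque;
};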
StatusWith<DiskLoc> Collection::updateDocument(const DiskLoc& oldLocation,
                                               const BSONObj& objNew,
                                               bool enforceQuota,
                                               OpDebug* debug) {
    Record* oldRecord = getExtentManager()->recordFor(oldLocation);
    BSONObj objOld = BSONObj::make(oldRecord);

    if (objOld.hasElement("_id")) {
        BSONElement oldId = objOld["_id"];
        BSONElement newId = objNew["_id"];
        if (oldId != newId)
            return StatusWith<DiskLoc>(
                ErrorCodes::InternalError, "in Collection::updateDocument _id mismatch", 13596);
    }

    if (ns().coll() == "system.users") {
        // XXX - andy and spencer think this should go away now
        V2UserDocumentParser parser;
        Status s = parser.checkValidUserDocument(objNew);
        if (!s.isOK())
            return StatusWith<DiskLoc>(s);
    }

    /* duplicate key check. we descend the btree twice - once for this check, and once for the
       actual inserts, further below. that is suboptimal, but it's pretty complicated to do it the
       other way without rollbacks...
    */
    OwnedPointerVector<UpdateTicket> updateTickets;
    updateTickets.mutableVector().resize(_indexCatalog.numIndexesTotal());
    for (int i = 0; i < _indexCatalog.numIndexesTotal(); ++i) {
        IndexDescriptor* descriptor = _indexCatalog.getDescriptor(i);
        IndexAccessMethod* iam = _indexCatalog.getIndex(descriptor);

        InsertDeleteOptions options;
        options.logIfError = false;
        options.dupsAllowed =
            !(KeyPattern::isIdKeyPattern(descriptor->keyPattern()) || descriptor->unique()) ||
            ignoreUniqueIndex(descriptor);

        updateTickets.mutableVector()[i] = new UpdateTicket();
        Status ret = iam->validateUpdate(
            objOld, objNew, oldLocation, options, updateTickets.mutableVector()[i]);
        if (!ret.isOK()) {
            return StatusWith<DiskLoc>(ret);
        }
    }

    if (oldRecord->netLength() < objNew.objsize()) {
        // doesn't fit, have to move to new location

        if (_details->isCapped())
            return StatusWith<DiskLoc>(ErrorCodes::InternalError,
                                       "failing update: objects in a capped ns cannot grow",
                                       10003);

        moveCounter.increment();
        _details->paddingTooSmall();

        // unindex old record, don't delete
        // this way, if inserting new doc fails, we can re-index this one
        ClientCursor::aboutToDelete(_ns.ns(), _details, oldLocation);
        _indexCatalog.unindexRecord(objOld, oldLocation, true);

        if (debug) {
            if (debug->nmoved == -1)  // default of -1 rather than 0
                debug->nmoved = 1;
            else
                debug->nmoved += 1;
        }

        StatusWith<DiskLoc> loc = insertDocument(objNew, enforceQuota);

        if (loc.isOK()) {
            // insert successful, now lets deallocate the old location
            // remember its already unindexed
            _recordStore.deallocRecord(oldLocation, oldRecord);
        } else {
            // new doc insert failed, so lets re-index the old document and location
            _indexCatalog.indexRecord(objOld, oldLocation);
        }

        return loc;
    }

    _infoCache.notifyOfWriteOp();
    _details->paddingFits();

    if (debug)
        debug->keyUpdates = 0;

    for (int i = 0; i < _indexCatalog.numIndexesTotal(); ++i) {
        IndexDescriptor* descriptor = _indexCatalog.getDescriptor(i);
        IndexAccessMethod* iam = _indexCatalog.getIndex(descriptor);

        int64_t updatedKeys;
        Status ret = iam->update(*updateTickets.vector()[i], &updatedKeys);
        if (!ret.isOK())
            return StatusWith<DiskLoc>(ret);

        if (debug)
            debug->keyUpdates += updatedKeys;
    }

    // update in place
    int sz = objNew.objsize();
    memcpy(getDur().writingPtr(oldRecord->data(), sz), objNew.objdata(), sz);
    return StatusWith<DiskLoc>(oldLocation);
}
Status waitForWriteConcern(OperationContext* txn,
                           const WriteConcernOptions& writeConcern,
                           const OpTime& replOpTime,
                           WriteConcernResult* result) {
    // We assume all options have been validated earlier, if not, programming error
    dassert(validateWriteConcern(writeConcern).isOK());

    // Next handle blocking on disk
    Timer syncTimer;

    switch (writeConcern.syncMode) {
        case WriteConcernOptions::NONE:
            break;
        case WriteConcernOptions::FSYNC:
            if (!getDur().isDurable()) {
                result->fsyncFiles = MemoryMappedFile::flushAll(true);
            } else {
                // We only need to commit the journal if we're durable
                txn->recoveryUnit()->awaitCommit();
            }
            break;
        case WriteConcernOptions::JOURNAL:
            txn->recoveryUnit()->awaitCommit();
            break;
    }

    result->syncMillis = syncTimer.millis();

    // Now wait for replication

    if (replOpTime.isNull()) {
        // no write happened for this client yet
        return Status::OK();
    }

    if (writeConcern.wNumNodes <= 1 && writeConcern.wMode.empty()) {
        // no desired replication check
        return Status::OK();
    }

    if (!replset::anyReplEnabled() || serverGlobalParams.configsvr) {
        // no replication check needed (validated above)
        return Status::OK();
    }

    const bool isMasterSlaveNode = replset::anyReplEnabled() && !replset::theReplSet;
    if (writeConcern.wMode == "majority" && isMasterSlaveNode) {
        // with master/slave, majority is equivalent to w=1
        return Status::OK();
    }

    // We're sure that replication is enabled and that we have more than one node or a wMode
    TimerHolder gleTimerHolder(&gleWtimeStats);

    // Now we wait for replication
    // Note that replica set stepdowns and gle mode changes are thrown as errors
    // TODO: Make this cleaner
    Status replStatus = Status::OK();
    try {
        while (1) {
            if (writeConcern.wNumNodes > 0) {
                if (replset::opReplicatedEnough(replOpTime, writeConcern.wNumNodes)) {
                    break;
                }
            } else if (replset::opReplicatedEnough(replOpTime, writeConcern.wMode)) {
                break;
            }

            if (writeConcern.wTimeout > 0 && gleTimerHolder.millis() >= writeConcern.wTimeout) {
                gleWtimeouts.increment();
                result->err = "timeout";
                result->wTimedOut = true;
                replStatus =
                    Status(ErrorCodes::WriteConcernFailed, "waiting for replication timed out");
                break;
            }

            sleepmillis(1);
            txn->checkForInterrupt();
        }
    } catch (const AssertionException& ex) {
        // Our replication state changed while enforcing write concern
        replStatus = ex.toStatus();
    }

    // Add stats
    result->writtenTo = replset::getHostsWrittenTo(replOpTime);
    result->wTime = gleTimerHolder.recordMillis();

    return replStatus;
}
DiskLoc SimpleRecordStoreV1::_allocFromExistingExtents(OperationContext* txn, int lenToAlloc) {
    // align size up to a multiple of 4
    lenToAlloc = (lenToAlloc + (4 - 1)) & ~(4 - 1);

    freelistAllocs.increment();
    DiskLoc loc;
    {
        DiskLoc* prev = 0;
        DiskLoc* bestprev = 0;
        DiskLoc bestmatch;
        int bestmatchlen = INT_MAX;  // sentinel meaning we haven't found a record big enough
        int b = bucket(lenToAlloc);
        DiskLoc cur = _details->deletedListEntry(b);

        int extra = 5;  // look for a better fit, a little.
        int chain = 0;
        while (1) {
            {  // defensive check
                int fileNumber = cur.a();
                int fileOffset = cur.getOfs();
                if (fileNumber < -1 || fileNumber >= 100000 || fileOffset < 0) {
                    StringBuilder sb;
                    sb << "Deleted record list corrupted in collection " << _ns << ", bucket " << b
                       << ", link number " << chain << ", invalid link is " << cur.toString()
                       << ", throwing Fatal Assertion";
                    log() << sb.str() << endl;
                    fassertFailed(16469);
                }
            }
            if (cur.isNull()) {
                // move to next bucket. if we were doing "extra", just break
                if (bestmatchlen < INT_MAX)
                    break;

                if (chain > 0) {
                    // if we looked at things in the right bucket, but they were not suitable
                    freelistBucketExhausted.increment();
                }

                b++;
                if (b > MaxBucket) {
                    // out of space. alloc a new extent.
                    freelistIterations.increment(1 + chain);
                    return DiskLoc();
                }
                cur = _details->deletedListEntry(b);
                prev = 0;
                continue;
            }
            DeletedRecord* r = drec(cur);
            if (r->lengthWithHeaders() >= lenToAlloc && r->lengthWithHeaders() < bestmatchlen) {
                bestmatchlen = r->lengthWithHeaders();
                bestmatch = cur;
                bestprev = prev;
                if (r->lengthWithHeaders() == lenToAlloc)
                    // exact match, stop searching
                    break;
            }
            if (bestmatchlen < INT_MAX && --extra <= 0)
                break;
            if (++chain > 30 && b <= MaxBucket) {
                // too slow, force move to next bucket to grab a big chunk
                // b++;
                freelistIterations.increment(chain);
                chain = 0;
                cur.Null();
            } else {
                cur = r->nextDeleted();
                prev = &r->nextDeleted();
            }
        }

        // unlink ourself from the deleted list
        DeletedRecord* bmr = drec(bestmatch);
        if (bestprev) {
            *txn->recoveryUnit()->writing(bestprev) = bmr->nextDeleted();
        } else {
            // should be the front of a free-list
            int myBucket = bucket(bmr->lengthWithHeaders());
            invariant(_details->deletedListEntry(myBucket) == bestmatch);
            _details->setDeletedListEntry(txn, myBucket, bmr->nextDeleted());
        }
        *txn->recoveryUnit()->writing(&bmr->nextDeleted()) = DiskLoc().setInvalid();  // defensive.
        invariant(bmr->extentOfs() < bestmatch.getOfs());

        freelistIterations.increment(1 + chain);
        loc = bestmatch;
    }

    if (loc.isNull())
        return loc;

    // determine if we should chop up

    DeletedRecord* r = drec(loc);

    /* note we want to grab from the front so our next pointers on disk tend
       to go in a forward direction which is important for performance. */
    int regionlen = r->lengthWithHeaders();
    invariant(r->extentOfs() < loc.getOfs());

    int left = regionlen - lenToAlloc;
    if (left < 24 || left < (lenToAlloc / 8)) {
        // you get the whole thing.
        return loc;
    }

    // don't quantize:
    //   - $ collections (indexes) as we already have those aligned the way we want SERVER-8425
    if (_normalCollection) {
        // we quantize here so that it only impacts newly sized records
        // this prevents oddities with older records and space re-use SERVER-8435
        lenToAlloc = std::min(r->lengthWithHeaders(), quantizeAllocationSpace(lenToAlloc));
        left = regionlen - lenToAlloc;

        if (left < 24) {
            // you get the whole thing.
            return loc;
        }
    }

    /* split off some for further use. */
    txn->recoveryUnit()->writingInt(r->lengthWithHeaders()) = lenToAlloc;
    DiskLoc newDelLoc = loc;
    newDelLoc.inc(lenToAlloc);
    DeletedRecord* newDel = drec(newDelLoc);
    DeletedRecord* newDelW = txn->recoveryUnit()->writing(newDel);
    newDelW->extentOfs() = r->extentOfs();
    newDelW->lengthWithHeaders() = left;
    newDelW->nextDeleted().Null();

    addDeletedRec(txn, newDelLoc);

    return loc;
}
/** Note: if the object shrinks a lot, we don't free up space, we leave extra at end of the
    record.
 */
const DiskLoc DataFileMgr::updateRecord(const char* ns,
                                        Collection* collection,
                                        Record* toupdate,
                                        const DiskLoc& dl,
                                        const char* _buf,
                                        int _len,
                                        OpDebug& debug,
                                        bool god) {
    dassert(toupdate == dl.rec());

    BSONObj objOld = BSONObj::make(toupdate);
    BSONObj objNew(_buf);
    DEV verify(objNew.objsize() == _len);
    DEV verify(objNew.objdata() == _buf);

    if (!objNew.hasElement("_id") && objOld.hasElement("_id")) {
        /* add back the old _id value if the update removes it.  Note this implementation is slow
           (copies entire object multiple times), but this shouldn't happen often, so going for
           simple code, not speed.
        */
        BSONObjBuilder b;
        BSONElement e;
        verify(objOld.getObjectID(e));
        b.append(e);  // put _id first, for best performance
        b.appendElements(objNew);
        objNew = b.obj();
    }

    NamespaceString nsstring(ns);
    if (nsstring.coll() == "system.users") {
        V2UserDocumentParser parser;
        uassertStatusOK(parser.checkValidUserDocument(objNew));
    }

    uassert(13596,
            str::stream() << "cannot change _id of a document old:" << objOld << " new:" << objNew,
            objNew["_id"] == objOld["_id"]);

    /* duplicate key check. we descend the btree twice - once for this check, and once for the
       actual inserts, further below.  that is suboptimal, but it's pretty complicated to do it
       the other way without rollbacks...
    */
    OwnedPointerVector<UpdateTicket> updateTickets;
    updateTickets.mutableVector().resize(collection->details()->getTotalIndexCount());
    for (int i = 0; i < collection->details()->getTotalIndexCount(); ++i) {
        auto_ptr<IndexDescriptor> descriptor(CatalogHack::getDescriptor(collection->details(), i));
        auto_ptr<IndexAccessMethod> iam(CatalogHack::getIndex(descriptor.get()));
        InsertDeleteOptions options;
        options.logIfError = false;
        options.dupsAllowed =
            !(KeyPattern::isIdKeyPattern(descriptor->keyPattern()) || descriptor->unique()) ||
            ignoreUniqueIndex(descriptor->getOnDisk());
        updateTickets.mutableVector()[i] = new UpdateTicket();
        Status ret =
            iam->validateUpdate(objOld, objNew, dl, options, updateTickets.mutableVector()[i]);

        if (Status::OK() != ret) {
            uasserted(ASSERT_ID_DUPKEY, "Update validation failed: " + ret.toString());
        }
    }

    if (toupdate->netLength() < objNew.objsize()) {
        // doesn't fit.  reallocate -----------------------------------------------------
        moveCounter.increment();
        uassert(10003,
                "failing update: objects in a capped ns cannot grow",
                !(collection && collection->details()->isCapped()));
        collection->details()->paddingTooSmall();
        deleteRecord(ns, toupdate, dl);
        DiskLoc res = insert(ns, objNew.objdata(), objNew.objsize(), false, god);

        if (debug.nmoved == -1)  // default of -1 rather than 0
            debug.nmoved = 1;
        else
            debug.nmoved += 1;

        return res;
    }

    collection->infoCache()->notifyOfWriteOp();
    collection->details()->paddingFits();

    debug.keyUpdates = 0;

    for (int i = 0; i < collection->details()->getTotalIndexCount(); ++i) {
        auto_ptr<IndexDescriptor> descriptor(CatalogHack::getDescriptor(collection->details(), i));
        auto_ptr<IndexAccessMethod> iam(CatalogHack::getIndex(descriptor.get()));
        int64_t updatedKeys;
        Status ret = iam->update(*updateTickets.vector()[i], &updatedKeys);
        if (Status::OK() != ret) {
            // This shouldn't happen unless something disastrous occurred.
            massert(16799, "update failed: " + ret.toString(), false);
        }
        debug.keyUpdates += updatedKeys;
    }

    // update in place
    int sz = objNew.objsize();
    memcpy(getDur().writingPtr(toupdate->data(), sz), objNew.objdata(), sz);
    return dl;
}
DiskLoc SimpleRecordStoreV1::_allocFromExistingExtents(OperationContext* txn, int lenToAllocRaw) {
    // Slowly drain the deletedListLegacyGrabBag by popping one record off and putting it in the
    // correct deleted list each time we try to allocate a new record. This ensures we won't
    // orphan any data when upgrading from old versions, without needing a long upgrade phase.
    // This is done before we try to allocate the new record so we can take advantage of the new
    // space immediately.
    {
        const DiskLoc head = _details->deletedListLegacyGrabBag();
        if (!head.isNull()) {
            _details->setDeletedListLegacyGrabBag(txn, drec(head)->nextDeleted());
            addDeletedRec(txn, head);
        }
    }

    // align size up to a multiple of 4
    const int lenToAlloc = (lenToAllocRaw + (4 - 1)) & ~(4 - 1);

    freelistAllocs.increment();

    DiskLoc loc;
    DeletedRecord* dr = NULL;
    {
        int myBucket;
        for (myBucket = bucket(lenToAlloc); myBucket < Buckets; myBucket++) {
            // Only look at the first entry in each bucket. This works because we are either
            // quantizing or allocating fixed-size blocks.
            const DiskLoc head = _details->deletedListEntry(myBucket);
            if (head.isNull())
                continue;
            DeletedRecord* const candidate = drec(head);
            if (candidate->lengthWithHeaders() >= lenToAlloc) {
                loc = head;
                dr = candidate;
                break;
            }
        }

        if (!dr)
            return DiskLoc();  // no space

        // Unlink ourself from the deleted list
        _details->setDeletedListEntry(txn, myBucket, dr->nextDeleted());
        *txn->recoveryUnit()->writing(&dr->nextDeleted()) = DiskLoc().setInvalid();  // defensive
    }

    invariant(dr->extentOfs() < loc.getOfs());

    // Split the deleted record if it has at least as much left over space as our smallest
    // allocation size. Otherwise, just take the whole DeletedRecord.
    const int remainingLength = dr->lengthWithHeaders() - lenToAlloc;
    if (remainingLength >= bucketSizes[0]) {
        txn->recoveryUnit()->writingInt(dr->lengthWithHeaders()) = lenToAlloc;
        const DiskLoc newDelLoc = DiskLoc(loc.a(), loc.getOfs() + lenToAlloc);
        DeletedRecord* newDel = txn->recoveryUnit()->writing(drec(newDelLoc));
        newDel->extentOfs() = dr->extentOfs();
        newDel->lengthWithHeaders() = remainingLength;
        newDel->nextDeleted().Null();

        addDeletedRec(txn, newDelLoc);
    }

    return loc;
}
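// Both free-list allocators above round the requested length up to a multiple of 4 bytes and then
// search size-bucketed deleted-record lists for an entry that is large enough. A standalone sketch
// of just the size-rounding and bucket-selection step, using only the standard headers; the bucket
// table below is illustrative, not the record store's real bucketSizes array.
#include <cstddef>

namespace {
const int kBucketSizes[] = {32, 64, 128, 256, 512, 1024, 2048, 4096};
const int kNumBuckets = sizeof(kBucketSizes) / sizeof(kBucketSizes[0]);
}  // namespace

// Round a requested length up to the next multiple of 4, as in (len + (4 - 1)) & ~(4 - 1).
int alignToFour(int lenToAlloc) {
    return (lenToAlloc + 3) & ~3;
}

// Return the index of the smallest bucket whose size can hold the request, or -1 if none fits
// (the real allocators fall back to allocating a new extent in that case).
int bucketFor(int alignedLen) {
    for (int b = 0; b < kNumBuckets; b++) {
        if (kBucketSizes[b] >= alignedLen)
            return b;
    }
    return -1;
}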
Status waitForWriteConcern(OperationContext* txn,
                           const OpTime& replOpTime,
                           const WriteConcernOptions& writeConcern,
                           WriteConcernResult* result) {
    // We assume all options have been validated earlier, if not, programming error.
    // Passing the localDB name is a hack to avoid the more rigorous check performed for
    // non-local DBs.
    dassert(validateWriteConcern(txn, writeConcern, kLocalDB).isOK());

    // We should never be waiting for write concern while holding any sort of lock, because this
    // may lead to situations where the replication heartbeats are stalled.
    //
    // This check does not hold for writes done through dbeval because it runs with a global X
    // lock.
    dassert(!txn->lockState()->isLocked() || txn->getClient()->isInDirectClient());

    // Next handle blocking on disk
    Timer syncTimer;
    auto replCoord = repl::getGlobalReplicationCoordinator();
    WriteConcernOptions writeConcernWithPopulatedSyncMode =
        replCoord->populateUnsetWriteConcernOptionsSyncMode(writeConcern);

    switch (writeConcernWithPopulatedSyncMode.syncMode) {
        case WriteConcernOptions::SyncMode::UNSET:
            severe() << "Attempting to wait on a WriteConcern with an unset sync option";
            fassertFailed(34410);
        case WriteConcernOptions::SyncMode::NONE:
            break;
        case WriteConcernOptions::SyncMode::FSYNC: {
            StorageEngine* storageEngine = getGlobalServiceContext()->getGlobalStorageEngine();
            if (!storageEngine->isDurable()) {
                result->fsyncFiles = storageEngine->flushAllFiles(true);
            } else {
                // We only need to commit the journal if we're durable
                txn->recoveryUnit()->waitUntilDurable();
            }
            break;
        }
        case WriteConcernOptions::SyncMode::JOURNAL:
            if (replCoord->getReplicationMode() != repl::ReplicationCoordinator::Mode::modeNone) {
                // Wait for ops to become durable then update replication system's
                // knowledge of this.
                OpTime appliedOpTime = replCoord->getMyLastAppliedOpTime();
                txn->recoveryUnit()->waitUntilDurable();
                replCoord->setMyLastDurableOpTimeForward(appliedOpTime);
            } else {
                txn->recoveryUnit()->waitUntilDurable();
            }
            break;
    }

    result->syncMillis = syncTimer.millis();

    // Now wait for replication

    if (replOpTime.isNull()) {
        // no write happened for this client yet
        return Status::OK();
    }

    // needed to avoid incrementing gleWtimeStats SERVER-9005
    if (writeConcernWithPopulatedSyncMode.wNumNodes <= 1 &&
        writeConcernWithPopulatedSyncMode.wMode.empty()) {
        // no desired replication check
        return Status::OK();
    }

    // Now we wait for replication
    // Note that replica set stepdowns and gle mode changes are thrown as errors
    repl::ReplicationCoordinator::StatusAndDuration replStatus =
        repl::getGlobalReplicationCoordinator()->awaitReplication(
            txn, replOpTime, writeConcernWithPopulatedSyncMode);
    if (replStatus.status == ErrorCodes::WriteConcernFailed) {
        gleWtimeouts.increment();
        result->err = "timeout";
        result->wTimedOut = true;
    }

    // Add stats
    result->writtenTo = repl::getGlobalReplicationCoordinator()->getHostsWrittenTo(
        replOpTime,
        writeConcernWithPopulatedSyncMode.syncMode == WriteConcernOptions::SyncMode::JOURNAL);
    gleWtimeStats.recordMillis(durationCount<Milliseconds>(replStatus.duration));
    result->wTime = durationCount<Milliseconds>(replStatus.duration);

    return replStatus.status;
}