void ReplicationRecoveryImpl::_applyToEndOfOplog(OperationContext* opCtx,
                                                 const Timestamp& oplogApplicationStartPoint,
                                                 const Timestamp& topOfOplog) {
    invariant(!oplogApplicationStartPoint.isNull());
    invariant(!topOfOplog.isNull());

    // Check if we have any unapplied ops in our oplog. It is important that this is done after
    // deleting the ragged end of the oplog.
    if (oplogApplicationStartPoint == topOfOplog) {
        log() << "No oplog entries to apply for recovery. Start point is at the top of the oplog.";
        return;  // We've applied all the valid oplog we have.
    } else if (oplogApplicationStartPoint > topOfOplog) {
        severe() << "Applied op " << oplogApplicationStartPoint.toBSON()
                 << " not found. Top of oplog is " << topOfOplog.toBSON() << '.';
        fassertFailedNoTrace(40313);
    }

    log() << "Replaying stored operations from " << oplogApplicationStartPoint.toBSON()
          << " (exclusive) to " << topOfOplog.toBSON() << " (inclusive).";

    OplogBufferLocalOplog oplogBuffer(oplogApplicationStartPoint);
    oplogBuffer.startup(opCtx);

    RecoveryOplogApplierStats stats;

    auto writerPool = OplogApplier::makeWriterPool();
    OplogApplier::Options options;
    options.allowNamespaceNotFoundErrorsOnCrudOps = true;
    options.skipWritesToOplog = true;
    // During replication recovery, the stableTimestampForRecovery refers to the stable timestamp
    // from which we replay the oplog.
    // For startup recovery, this will be the recovery timestamp, which is the stable timestamp
    // that the storage engine recovered to on startup. For rollback recovery, this will be the
    // last stable timestamp, returned when we call recoverToStableTimestamp.
    // We keep track of this for prepared transactions so that when we apply a commitTransaction
    // oplog entry, we can check if it occurs before or after the stable timestamp and decide
    // whether the operations would have already been reflected in the data.
    options.stableTimestampForRecovery = oplogApplicationStartPoint;
    OplogApplierImpl oplogApplier(nullptr,
                                  &oplogBuffer,
                                  &stats,
                                  nullptr,
                                  _consistencyMarkers,
                                  _storageInterface,
                                  options,
                                  writerPool.get());

    OplogApplier::BatchLimits batchLimits;
    batchLimits.bytes = OplogApplier::calculateBatchLimitBytes(opCtx, _storageInterface);
    batchLimits.ops = OplogApplier::getBatchLimitOperations();

    OpTime applyThroughOpTime;
    OplogApplier::Operations batch;
    while (
        !(batch = fassert(50763, oplogApplier.getNextApplierBatch(opCtx, batchLimits))).empty()) {
        applyThroughOpTime = uassertStatusOK(oplogApplier.multiApply(opCtx, std::move(batch)));
    }
    stats.complete(applyThroughOpTime);
    invariant(oplogBuffer.isEmpty(),
              str::stream() << "Oplog buffer not empty after applying operations. Last operation "
                               "applied with optime: "
                            << applyThroughOpTime.toBSON());
    invariant(applyThroughOpTime.getTimestamp() == topOfOplog,
              str::stream() << "Did not apply to top of oplog. Applied through: "
                            << applyThroughOpTime.toString()
                            << ". Top of oplog: " << topOfOplog.toString());
    oplogBuffer.shutdown(opCtx);

    // We may crash before setting appliedThrough. If we have a stable checkpoint, we will recover
    // to that checkpoint at a replication consistent point, and applying the oplog is safe.
    // If we don't have a stable checkpoint, then we must be in startup recovery, and not rollback
    // recovery, because we only roll back to a stable timestamp when we have a stable checkpoint.
    // Startup recovery from an unstable checkpoint only ever applies a single batch and it is
    // safe to replay the batch from any point.
    _consistencyMarkers->setAppliedThrough(opCtx, applyThroughOpTime);
}
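// ---------------------------------------------------------------------------
// Illustrative sketch only -- not part of the original source. It restates the
// batch-drain pattern used by _applyToEndOfOplog above with stand-in types
// (FakeBuffer, Batch, applyBatch are hypothetical, not MongoDB APIs), so the
// control flow can be read in isolation: keep pulling size-limited batches and
// applying them until the buffer is empty, remembering the position of the
// last batch applied.
// ---------------------------------------------------------------------------
#include <cstddef>
#include <deque>
#include <iostream>
#include <vector>

namespace recovery_sketch {

using Timestamp = unsigned long long;
using Batch = std::vector<Timestamp>;

struct FakeBuffer {
    std::deque<Timestamp> ops;  // pretend these are oplog entries, oldest first

    // Pull at most 'maxOps' entries, mirroring OplogApplier::BatchLimits::ops.
    Batch nextBatch(std::size_t maxOps) {
        Batch batch;
        while (!ops.empty() && batch.size() < maxOps) {
            batch.push_back(ops.front());
            ops.pop_front();
        }
        return batch;
    }
    bool isEmpty() const { return ops.empty(); }
};

// Stand-in for multiApply(): returns the position of the last op in the batch.
Timestamp applyBatch(const Batch& batch) {
    return batch.back();
}

}  // namespace recovery_sketch

int main() {
    using namespace recovery_sketch;
    FakeBuffer buffer{{1, 2, 3, 4, 5, 6, 7}};
    Timestamp appliedThrough = 0;

    // Same shape as the loop in _applyToEndOfOplog: drain the buffer in batches.
    Batch batch;
    while (!(batch = buffer.nextBatch(/*maxOps=*/3)).empty()) {
        appliedThrough = applyBatch(batch);
    }

    // After the loop the buffer must be empty and appliedThrough must equal the
    // top of the "oplog" (7), matching the invariants checked above.
    std::cout << "appliedThrough=" << appliedThrough
              << " bufferEmpty=" << buffer.isEmpty() << std::endl;
    return 0;
}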
/* slave: pull some data from the master's oplog
   note: not yet in db mutex at this point.
   @return -1 error
            0 ok, don't sleep
            1 ok, sleep
*/
int ReplSource::sync_pullOpLog(int& nApplied) {
    int okResultCode = 1;
    string ns = string("local.oplog.$") + sourceName();
    LOG(2) << "repl: sync_pullOpLog " << ns << " syncedTo:" << syncedTo.toStringLong() << '\n';

    bool tailing = true;
    oplogReader.tailCheck();

    bool initial = syncedTo.isNull();

    if ( !oplogReader.haveCursor() || initial ) {
        if ( initial ) {
            // Important to grab last oplog timestamp before listing databases.
            syncToTailOfRemoteLog();
            BSONObj info;
            bool ok = oplogReader.conn()->runCommand( "admin", BSON( "listDatabases" << 1 ), info );
            massert( 10389, "Unable to get database list", ok );
            BSONObjIterator i( info.getField( "databases" ).embeddedObject() );
            while( i.moreWithEOO() ) {
                BSONElement e = i.next();
                if ( e.eoo() )
                    break;
                string name = e.embeddedObject().getField( "name" ).valuestr();
                if ( !e.embeddedObject().getBoolField( "empty" ) ) {
                    if ( name != "local" ) {
                        if ( only.empty() || only == name ) {
                            LOG( 2 ) << "adding to 'addDbNextPass': " << name << endl;
                            addDbNextPass.insert( name );
                        }
                    }
                }
            }
            Lock::GlobalWrite lk;
            save();
        }

        BSONObjBuilder gte;
        gte.appendTimestamp("$gte", syncedTo.asDate());
        BSONObjBuilder query;
        query.append("ts", gte.done());
        if ( !only.empty() ) {
            // note we may here skip a LOT of data table scanning, a lot of work for the master.
            // maybe append "\\." here?
            query.appendRegex("ns", string("^") + pcrecpp::RE::QuoteMeta( only ));
        }
        BSONObj queryObj = query.done();
        // e.g. queryObj = { ts: { $gte: syncedTo } }
        oplogReader.tailingQuery(ns.c_str(), queryObj);
        tailing = false;
    }
    else {
        LOG(2) << "repl: tailing=true\n";
    }

    if( !oplogReader.haveCursor() ) {
        problem() << "repl: dbclient::query returns null (conn closed?)" << endl;
        oplogReader.resetConnection();
        return -1;
    }

    // show any deferred database creates from a previous pass
    {
        set<string>::iterator i = addDbNextPass.begin();
        if ( i != addDbNextPass.end() ) {
            BSONObjBuilder b;
            b.append("ns", *i + '.');
            b.append("op", "db");
            BSONObj op = b.done();
            sync_pullOpLog_applyOperation(op, false);
        }
    }

    if ( !oplogReader.more() ) {
        if ( tailing ) {
            LOG(2) << "repl: tailing & no new activity\n";
            if( oplogReader.awaitCapable() )
                okResultCode = 0; // don't sleep
        }
        else {
            log() << "repl: " << ns << " oplog is empty" << endl;
        }
        {
            Lock::GlobalWrite lk;
            save();
        }
        return okResultCode;
    }

    OpTime nextOpTime;
    {
        BSONObj op = oplogReader.next();
        BSONElement ts = op.getField("ts");
        if ( ts.type() != Date && ts.type() != Timestamp ) {
            string err = op.getStringField("$err");
            if ( !err.empty() ) {
                // 13051 is "tailable cursor requested on non capped collection"
                if (op.getIntField("code") == 13051) {
                    problem() << "trying to slave off of a non-master" << '\n';
                    massert( 13344, "trying to slave off of a non-master", false );
                }
                else {
                    problem() << "repl: $err reading remote oplog: " + err << '\n';
                    massert( 10390, "got $err reading remote oplog", false );
                }
            }
            else {
                problem() << "repl: bad object read from remote oplog: " << op.toString() << '\n';
                massert( 10391, "repl: bad object read from remote oplog", false );
            }
        }

        nextOpTime = OpTime( ts.date() );
        LOG(2) << "repl: first op time received: " << nextOpTime.toString() << '\n';
        if ( initial ) {
            LOG(1) << "repl: initial run\n";
        }
        if( tailing ) {
            if( !( syncedTo < nextOpTime ) ) {
                log() << "repl ASSERTION failed : syncedTo < nextOpTime" << endl;
                log() << "repl syncTo: " << syncedTo.toStringLong() << endl;
                log() << "repl nextOpTime: " << nextOpTime.toStringLong() << endl;
                verify(false);
            }
            oplogReader.putBack( op ); // op will be processed in the loop below
            nextOpTime =
                OpTime(); // will reread the op below
        }
        else if ( nextOpTime != syncedTo ) { // didn't get what we queried for - error
            log() << "repl: nextOpTime " << nextOpTime.toStringLong() << ' '
                  << ((nextOpTime < syncedTo) ? "<??" : ">")
                  << " syncedTo " << syncedTo.toStringLong() << '\n'
                  << "repl: time diff: " << (nextOpTime.getSecs() - syncedTo.getSecs()) << "sec\n"
                  << "repl: tailing: " << tailing << '\n'
                  << "repl: data too stale, halting replication" << endl;
            replInfo = replAllDead = "data too stale halted replication";
            verify( syncedTo < nextOpTime );
            throw SyncException();
        }
        else {
            /* t == syncedTo, so the first op was applied previously or it is the first op of
               initial query and need not be applied. */
        }
    }

    // apply operations
    {
        int n = 0;
        time_t saveLast = time(0);
        while ( 1 ) {
            // we need "&& n" to assure we actually process at least one op to get a sync point
            // recorded in the first place.
            bool moreInitialSyncsPending = !addDbNextPass.empty() && n;

            if ( moreInitialSyncsPending || !oplogReader.more() ) {
                Lock::GlobalWrite lk;

                // NOTE aaron 2011-03-29 This block may be unnecessary, but I'm leaving it in
                // place to avoid changing timing behavior.
                {
                    dbtemprelease t;
                    if ( !moreInitialSyncsPending && oplogReader.more() ) {
                        continue;
                    }
                    // otherwise, break out of loop so we can set to completed or clone more dbs
                }

                if( oplogReader.awaitCapable() && tailing )
                    okResultCode = 0; // don't sleep

                syncedTo = nextOpTime;
                save(); // note how far we are synced up to now
                log() << "repl: applied " << n << " operations" << endl;
                nApplied = n;
                log() << "repl: end sync_pullOpLog syncedTo: " << syncedTo.toStringLong() << endl;
                break;
            }

            OCCASIONALLY if( n > 0 && ( n > 100000 || time(0) - saveLast > 60 ) ) {
                // periodically note our progress, in case we are doing a lot of work and crash
                Lock::GlobalWrite lk;
                syncedTo = nextOpTime;
                // can't update local log ts since there are pending operations from our peer
                save();
                log() << "repl: checkpoint applied " << n << " operations" << endl;
                log() << "repl: syncedTo: " << syncedTo.toStringLong() << endl;
                saveLast = time(0);
                n = 0;
            }

            BSONObj op = oplogReader.next();

            int b = replApplyBatchSize.get();
            bool justOne = b == 1;
            scoped_ptr<Lock::GlobalWrite> lk( justOne ?
                0 : new Lock::GlobalWrite() );

            while( 1 ) {
                BSONElement ts = op.getField("ts");
                if( !( ts.type() == Date || ts.type() == Timestamp ) ) {
                    log() << "sync error: problem querying remote oplog record" << endl;
                    log() << "op: " << op.toString() << endl;
                    log() << "halting replication" << endl;
                    replInfo = replAllDead = "sync error: no ts found querying remote oplog record";
                    throw SyncException();
                }

                OpTime last = nextOpTime;
                nextOpTime = OpTime( ts.date() );
                if ( !( last < nextOpTime ) ) {
                    log() << "sync error: last applied optime at slave >= nextOpTime from master" << endl;
                    log() << " last: " << last.toStringLong() << endl;
                    log() << " nextOpTime: " << nextOpTime.toStringLong() << endl;
                    log() << " halting replication" << endl;
                    replInfo = replAllDead = "sync error last >= nextOpTime";
                    uassert( 10123, "replication error last applied optime at slave >= nextOpTime from master", false );
                }

                if ( replSettings.slavedelay &&
                     ( unsigned( time( 0 ) ) < nextOpTime.getSecs() + replSettings.slavedelay ) ) {
                    verify( justOne );
                    oplogReader.putBack( op );
                    _sleepAdviceTime = nextOpTime.getSecs() + replSettings.slavedelay + 1;
                    Lock::GlobalWrite lk;
                    if ( n > 0 ) {
                        syncedTo = last;
                        save();
                    }
                    log() << "repl: applied " << n << " operations" << endl;
                    log() << "repl: syncedTo: " << syncedTo.toStringLong() << endl;
                    log() << "waiting until: " << _sleepAdviceTime << " to continue" << endl;
                    return okResultCode;
                }

                sync_pullOpLog_applyOperation(op, !justOne);
                n++;

                if( --b == 0 )
                    break;

                // if we get here, we are doing multiple applications in a single write lock acquisition
                if( !oplogReader.moreInCurrentBatch() ) {
                    // break if no more in batch so we release lock while reading from the master
                    break;
                }
                op = oplogReader.next();

                getDur().commitIfNeeded();
            }
        }
    }

    return okResultCode;
}
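// ---------------------------------------------------------------------------
// Illustrative sketch only -- not part of the original source. It shows how a
// caller might interpret sync_pullOpLog()'s documented return codes (-1 error,
// 0 ok/don't sleep, 1 ok/sleep). 'pullOnce', 'driverLoopOnce', and the sleep
// interval are hypothetical stand-ins for the real ReplSource driver loop.
// ---------------------------------------------------------------------------
#include <chrono>
#include <iostream>
#include <thread>

namespace repl_sketch {

// Hypothetical stand-in for ReplSource::sync_pullOpLog().
int pullOnce(int& nApplied) {
    nApplied = 0;
    return 1;  // pretend: ok, nothing new, caller should sleep
}

void driverLoopOnce() {
    int nApplied = 0;
    int rc = pullOnce(nApplied);
    if (rc < 0) {
        std::cerr << "pull failed; reconnect/backoff before retrying" << std::endl;
    }
    else if (rc == 0) {
        // more work may be immediately available (await-capable tailing cursor),
        // so loop again without sleeping
    }
    else {
        std::this_thread::sleep_for(std::chrono::seconds(1));
    }
    std::cout << "applied " << nApplied << " operations this pass" << std::endl;
}

}  // namespace repl_sketch

int main() {
    repl_sketch::driverLoopOnce();
    return 0;
}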
void GhostSync::percolate(const BSONObj& id, const OpTime& last) {
    const OID rid = id["_id"].OID();

    GhostSlave* slave;
    {
        rwlock lk( _lock, false );

        MAP::iterator i = _ghostCache.find( rid );
        if ( i == _ghostCache.end() ) {
            OCCASIONALLY log() << "couldn't percolate slave " << rid << " no entry" << rsLog;
            return;
        }

        slave = i->second.get();
        if (!slave->init) {
            OCCASIONALLY log() << "couldn't percolate slave " << rid << " not init" << rsLog;
            return;
        }
    }
    verify(slave->slave);

    const Member *target = replset::BackgroundSync::get()->getSyncTarget();
    if (!target || rs->box.getState().primary()
        // we are currently syncing from someone who's syncing from us
        // the target might end up with a new Member, but s.slave never
        // changes so we'll compare the names
        || target == slave->slave || target->fullName() == slave->slave->fullName()) {
        LOG(1) << "replica set ghost target no good" << endl;
        return;
    }

    try {
        // haveCursor() does not necessarily tell us if we have a non-dead cursor, so we check
        // tailCheck() as well; see SERVER-8420
        slave->reader.tailCheck();
        if (!slave->reader.haveCursor()) {
            if (!slave->reader.connect(id, slave->slave->id(), target->fullName())) {
                // error message logged in OplogReader::connect
                return;
            }
            slave->reader.ghostQueryGTE(rsoplog, last);
            // if we lose the connection between connecting and querying, the cursor may not
            // exist so we have to check again before using it.
            if (!slave->reader.haveCursor()) {
                return;
            }
        }

        LOG(1) << "replSet last: " << slave->last.toString() << " to " << last.toString() << rsLog;
        if (slave->last > last) {
            return;
        }

        while (slave->last <= last) {
            if (!slave->reader.more()) {
                // we'll be back
                return;
            }

            BSONObj o = slave->reader.nextSafe();
            slave->last = o["ts"]._opTime();
        }

        LOG(2) << "now last is " << slave->last.toString() << rsLog;
    }
    catch (DBException& e) {
        // we'll be back
        LOG(2) << "replSet ghost sync error: " << e.what() << " for "
               << slave->slave->fullName() << rsLog;
        slave->reader.resetConnection();
    }
}
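// ---------------------------------------------------------------------------
// Illustrative sketch only -- not part of the original source. It isolates the
// core of GhostSync::percolate(): advance a cached position ('lastSeen') along
// a stream of timestamps until it has moved past the requested point (note the
// '<=' comparison, matching the loop above), stopping early if the stream runs
// dry ("we'll be back"). Stream and percolateTo are hypothetical stand-ins for
// the OplogReader tailing cursor and the percolation step.
// ---------------------------------------------------------------------------
#include <deque>
#include <iostream>

namespace ghost_sketch {

using Timestamp = unsigned long long;

struct Stream {
    std::deque<Timestamp> pending;  // timestamps the "oplog reader" would return
    bool more() const { return !pending.empty(); }
    Timestamp next() {
        Timestamp t = pending.front();
        pending.pop_front();
        return t;
    }
};

// Returns the new position; may stop short of 'target' if the stream is empty.
Timestamp percolateTo(Stream& s, Timestamp lastSeen, Timestamp target) {
    if (lastSeen > target) {
        return lastSeen;  // already ahead of the requested point
    }
    while (lastSeen <= target) {
        if (!s.more()) {
            break;  // we'll be back on the next call
        }
        lastSeen = s.next();
    }
    return lastSeen;
}

}  // namespace ghost_sketch

int main() {
    ghost_sketch::Stream s{{3, 5, 8}};
    // Advances past the target, ending one entry beyond it: prints 8.
    std::cout << ghost_sketch::percolateTo(s, /*lastSeen=*/2, /*target=*/5) << std::endl;
    return 0;
}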