int run() { Client::initThread( "oplogreplay" ); toolInfoLog() << "going to connect" << std::endl; OplogReader r; r.setTailingQueryOptions( QueryOption_SlaveOk | QueryOption_AwaitData ); r.connect(mongoOplogGlobalParams.from); toolInfoLog() << "connected" << std::endl; OpTime start(time(0) - mongoOplogGlobalParams.seconds, 0); toolInfoLog() << "starting from " << start.toStringPretty() << std::endl; r.tailingQueryGTE(mongoOplogGlobalParams.ns.c_str(), start); int num = 0; while ( r.more() ) { BSONObj o = r.next(); if (logger::globalLogDomain()->shouldLog(logger::LogSeverity::Debug(2))) { toolInfoLog() << o << std::endl; } if ( o["$err"].type() ) { toolError() << "error getting oplog" << std::endl; toolError() << o << std::endl; return -1; } bool print = ++num % 100000 == 0; if (print) { toolInfoLog() << num << "\t" << o << std::endl; } if ( o["op"].String() == "n" ) continue; BSONObjBuilder b( o.objsize() + 32 ); BSONArrayBuilder updates( b.subarrayStart( "applyOps" ) ); updates.append( o ); updates.done(); BSONObj c = b.obj(); BSONObj res; bool ok = conn().runCommand( "admin" , c , res ); if (!ok) { toolError() << res << std::endl; } else if (print) { toolInfoLog() << res << std::endl; } } return 0; }
void BackgroundSync::produce() {
    // Producer half of background sync: tails the current sync target's oplog
    // and pushes each fetched entry onto the blocking buffer (_buffer) for the
    // applier, updating the shared last-fetched bookkeeping under _mutex.
    // Returns (to be re-invoked by the caller) whenever the target is lost,
    // we become/assume primary, a better target exists, or shutdown begins.
    //
    // this oplog reader does not do a handshake because we don't want the server it's syncing
    // from to track how far it has synced
    OplogReader r;
    OpTime lastOpTimeFetched;
    // find a target to sync from the last op time written
    getOplogReader(r);

    // no server found
    {
        boost::unique_lock<boost::mutex> lock(_mutex);

        if (_currentSyncTarget == NULL) {
            // drop the lock before sleeping so other threads can proceed
            lock.unlock();
            sleepsecs(1);
            // if there is no one to sync from
            return;
        }
        // snapshot under the lock; used (unlocked) as the query lower bound below
        lastOpTimeFetched = _lastOpTimeFetched;
    }

    r.tailingQueryGTE(rsoplog, lastOpTimeFetched);

    // if target cut connections between connecting and querying (for
    // example, because it stepped down) we might not have a cursor
    if (!r.haveCursor()) {
        return;
    }

    uassert(1000, "replSet source for syncing doesn't seem to be await capable -- is it an older version of mongodb?", r.awaitCapable() );

    if (isRollbackRequired(r)) {
        stop();
        return;
    }

    while (!inShutdown()) {
        if (!r.moreInCurrentBatch()) {
            // Check some things periodically
            // (whenever we run out of items in the
            // current cursor batch)

            int bs = r.currentBatchMessageSize();
            if( bs > 0 && bs < BatchIsSmallish ) {
                // on a very low latency network, if we don't wait a little, we'll be
                // getting ops to write almost one at a time.  this will both be expensive
                // for the upstream server as well as potentially defeating our parallel
                // application of batches on the secondary.
                //
                // the inference here is basically if the batch is really small, we are
                // "caught up".
                //
                dassert( !Lock::isLocked() );
                sleepmillis(SleepToAllowBatchingMillis);
            }

            if (theReplSet->gotForceSync()) {
                return;
            }
            // If we are transitioning to primary state, we need to leave
            // this loop in order to go into bgsync-pause mode.
            if (isAssumingPrimary() || theReplSet->isPrimary()) {
                return;
            }
            // re-evaluate quality of sync target
            if (shouldChangeSyncTarget()) {
                return;
            }

            {
                //record time for each getmore
                TimerHolder batchTimer(&getmoreReplStats);
                // This calls receiveMore() on the oplogreader cursor.
                // It can wait up to five seconds for more data.
                r.more();
            }
            networkByteStats.increment(r.currentBatchMessageSize());

            if (!r.moreInCurrentBatch()) {
                // If there is still no data from upstream, check a few more things
                // and then loop back for another pass at getting more data
                {
                    boost::unique_lock<boost::mutex> lock(_mutex);
                    // stop producing if paused, target lost, or target no longer readable
                    if (_pause || !_currentSyncTarget || !_currentSyncTarget->hbinfo().hbstate.readable()) {
                        return;
                    }
                }

                r.tailCheck();
                if( !r.haveCursor() ) {
                    LOG(1) << "replSet end syncTail pass" << rsLog;
                    return;
                }

                continue;
            }
        }

        // At this point, we are guaranteed to have at least one thing to read out
        // of the oplogreader cursor.
        BSONObj o = r.nextSafe().getOwned();
        opsReadStats.increment();

        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            // flag that the buffer now holds work the applier hasn't finished
            _appliedBuffer = false;
        }

        OCCASIONALLY {
            LOG(2) << "bgsync buffer has " << _buffer.size() << " bytes" << rsLog;
        }
        // the blocking queue will wait (forever) until there's room for us to push
        _buffer.push(o);
        bufferCountGauge.increment();
        bufferSizeGauge.increment(getSize(o));

        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            // record hash and optime of the last op we fetched — presumably used
            // elsewhere to resume/validate the sync point; confirm with consumers
            _lastH = o["h"].numberLong();
            _lastOpTimeFetched = o["ts"]._opTime();
            LOG(3) << "replSet lastOpTimeFetched: " << _lastOpTimeFetched.toStringPretty() << rsLog;
        }
    }
}
bool ReplSetImpl::_initialSyncOplogApplication(OplogReader& r, const Member *source, const OpTime& applyGTE, const OpTime& minValid) {
    // Applies oplog entries from `source` during initial sync, starting at
    // applyGTE and breaking once an entry past minValid has been applied (or
    // the remote cursor is exhausted).  Returns false on unrecoverable errors;
    // true otherwise.  When an update fails because the target document was
    // never cloned, the document is fetched directly from the source and
    // inserted before the update is retried.
    const string hn = source->fullName();
    OplogReader missingObjReader;
    try {
        r.tailingQueryGTE( rsoplog, applyGTE );
        if ( !r.haveCursor() ) {
            log() << "replSet initial sync oplog query error" << rsLog;
            return false;
        }

        {
            // Sanity-check the first remote entry before applying anything.
            if( !r.more() ) {
                sethbmsg("replSet initial sync error reading remote oplog");
                log() << "replSet initial sync error remote oplog (" << rsoplog << ") on host " << hn << " is empty?" << rsLog;
                return false;
            }
            bo op = r.next();
            OpTime t = op["ts"]._opTime();
            r.putBack(op);   // only peeking — the apply loop below re-reads this op

            if( op.firstElementFieldName() == string("$err") ) {
                log() << "replSet initial sync error querying " << rsoplog << " on " << hn << " : " << op.toString() << rsLog;
                return false;
            }

            uassert( 13508 , str::stream() << "no 'ts' in first op in oplog: " << op , !t.isNull() );
            if( t > applyGTE ) {
                // first available remote entry is newer than our start point:
                // the source's oplog rolled over and we can't sync from it
                sethbmsg(str::stream() << "error " << hn << " oplog wrapped during initial sync");
                log() << "replSet initial sync expected first optime of " << applyGTE << rsLog;
                log() << "replSet initial sync but received a first optime of " << t << " from " << hn << rsLog;
                return false;
            }

            sethbmsg(str::stream() << "initial oplog application from " << hn << " starting at " << t.toStringPretty() << " to " << minValid.toStringPretty());
        }
    }
    catch(DBException& e) {
        log() << "replSet initial sync failing: " << e.toString() << rsLog;
        return false;
    }

    /* we lock outside the loop to avoid the overhead of locking on every operation. */
    writelock lk("");

    // todo : use exhaust
    OpTime ts;
    time_t start = time(0);
    unsigned long long n = 0;
    while( 1 ) {
        try {
            if( !r.more() )
                break;
            BSONObj o = r.nextSafe(); /* note we might get "not master" at some point */

            ts = o["ts"]._opTime();
            {
                // Abort if the source left PRIMARY/SECONDARY state, or the test
                // hook replSetForceInitialSyncFailure asked for a forced failure.
                if( (source->state() != MemberState::RS_PRIMARY && source->state() != MemberState::RS_SECONDARY) || replSetForceInitialSyncFailure ) {
                    int f = replSetForceInitialSyncFailure;
                    if( f > 0 ) {
                        replSetForceInitialSyncFailure = f-1;
                        log() << "replSet test code invoked, replSetForceInitialSyncFailure" << rsLog;
                        throw DBException("forced error",0);
                    }
                    log() << "replSet we are now primary" << rsLog;
                    throw DBException("primary changed",0);
                }

                if( ts >= applyGTE ) {
                    // optimes before we started copying need not be applied.
                    bool failedUpdate = syncApply(o);
                    if( failedUpdate ) {
                        // we don't have the object yet, which is possible on initial sync.  get it.
                        log() << "replSet info adding missing object" << endl; // rare enough we can log
                        if( !missingObjReader.connect(hn) ) { // ok to call more than once
                            log() << "replSet initial sync fails, couldn't connect to " << hn << endl;
                            return false;
                        }
                        const char *ns = o.getStringField("ns");
                        BSONObj query = BSONObjBuilder().append(o.getObjectField("o2")["_id"]).obj(); // might be more than just _id in the update criteria
                        BSONObj missingObj;
                        try {
                            missingObj = missingObjReader.findOne( ns, query );
                        }
                        catch(...) {
                            log() << "replSet assertion fetching missing object" << endl;
                            throw;
                        }
                        if( missingObj.isEmpty() ) {
                            log() << "replSet missing object not found on source. presumably deleted later in oplog" << endl;
                            log() << "replSet o2: " << o.getObjectField("o2").toString() << endl;
                            log() << "replSet o firstfield: " << o.getObjectField("o").firstElementFieldName() << endl;
                        }
                        else {
                            Client::Context ctx(ns);
                            try {
                                // insert the fetched document directly, then retry the op
                                DiskLoc d = theDataFileMgr.insert(ns, (void*) missingObj.objdata(), missingObj.objsize());
                                assert( !d.isNull() );
                            }
                            catch(...) {
                                log() << "replSet assertion during insert of missing object" << endl;
                                throw;
                            }
                            // now reapply the update from above
                            bool failed = syncApply(o);
                            if( failed ) {
                                log() << "replSet update still fails after adding missing object " << ns << endl;
                                assert(false);
                            }
                        }
                    }
                }
                _logOpObjRS(o); /* with repl sets we write the ops to our oplog too */
            }
            if ( ++n % 1000 == 0 ) {
                time_t now = time(0);
                if (now - start > 10) {
                    // simple progress metering
                    log() << "replSet initialSyncOplogApplication applied " << n << " operations, synced to " << ts.toStringPretty() << rsLog;
                    start = now;
                }
            }

            if ( ts > minValid ) {
                // reached the consistency point — done
                break;
            }

            getDur().commitIfNeeded();
        }
        catch (DBException& e) {
            // skip duplicate key exceptions
            if( e.getCode() == 11000 || e.getCode() == 11001 ) {
                continue;
            }

            // handle cursor not found (just requery)
            if( e.getCode() == 13127 ) {
                log() << "replSet requerying oplog after cursor not found condition, ts: " << ts.toStringPretty() << endl;
                r.resetCursor();
                r.tailingQueryGTE(rsoplog, ts);
                if( r.haveCursor() ) {
                    continue;
                }
            }

            // TODO: handle server restart

            if( ts <= minValid ) {
                // didn't make it far enough
                log() << "replSet initial sync failing, error applying oplog : " << e.toString() << rsLog;
                return false;
            }

            // otherwise, whatever
            break;
        }
    }
    return true;
}
bool replset::InitialSync::oplogApplication(OplogReader& r, const Member* source, const OpTime& applyGTE, const OpTime& minValid) {
    // Applies oplog entries from `source` during initial sync until the last
    // applied optime (`ts`) reaches minValid.  Unlike the steady-state tail,
    // this loop keeps requerying on an exhausted or lost cursor (giving up
    // after 30 consecutive failed requeries) because initial sync must reach
    // minValid to end up with a consistent data set.  Returns false on
    // unrecoverable errors, true once ts >= minValid.
    const string hn = source->fullName();
    try {
        r.tailingQueryGTE( rsoplog, applyGTE );
        if ( !r.haveCursor() ) {
            log() << "replSet initial sync oplog query error" << rsLog;
            return false;
        }

        {
            // Sanity-check the first remote entry before applying anything.
            if( !r.more() ) {
                sethbmsg("replSet initial sync error reading remote oplog");
                log() << "replSet initial sync error remote oplog (" << rsoplog << ") on host " << hn << " is empty?" << rsLog;
                return false;
            }
            bo op = r.next();
            OpTime t = op["ts"]._opTime();
            r.putBack(op);   // only peeking — the apply loop below re-reads this op

            if( op.firstElementFieldName() == string("$err") ) {
                log() << "replSet initial sync error querying " << rsoplog << " on " << hn << " : " << op.toString() << rsLog;
                return false;
            }

            uassert( 13508 , str::stream() << "no 'ts' in first op in oplog: " << op , !t.isNull() );
            if( t > applyGTE ) {
                // first available remote entry is newer than our start point:
                // the source's oplog rolled over and we can't sync from it
                sethbmsg(str::stream() << "error " << hn << " oplog wrapped during initial sync");
                log() << "replSet initial sync expected first optime of " << applyGTE << rsLog;
                log() << "replSet initial sync but received a first optime of " << t << " from " << hn << rsLog;
                return false;
            }

            sethbmsg(str::stream() << "initial oplog application from " << hn << " starting at " << t.toStringPretty() << " to " << minValid.toStringPretty());
        }
    }
    catch(DBException& e) {
        log() << "replSet initial sync failing: " << e.toString() << rsLog;
        return false;
    }

    /* we lock outside the loop to avoid the overhead of locking on every operation. */
    writelock lk("");

    // todo : use exhaust
    OpTime ts;
    time_t start = time(0);
    unsigned long long n = 0;
    int fails = 0;
    while( ts < minValid ) {
        try {
            // There are some special cases with initial sync (see the catch block), so we
            // don't want to break out of this while until we've reached minvalid. Thus, we'll
            // keep trying to requery.
            if( !r.more() ) {
                OCCASIONALLY log() << "replSet initial sync oplog: no more records" << endl;
                sleepsecs(1);

                r.resetCursor();
                // requery from our own last written optime rather than applyGTE
                r.tailingQueryGTE(rsoplog, theReplSet->lastOpTimeWritten);
                if ( !r.haveCursor() ) {
                    if (fails++ > 30) {
                        log() << "replSet initial sync tried to query oplog 30 times, giving up" << endl;
                        return false;
                    }
                }

                continue;
            }

            BSONObj o = r.nextSafe(); /* note we might get "not master" at some point */
            ts = o["ts"]._opTime();
            {
                // Abort if the source left PRIMARY/SECONDARY state, or the test
                // hook replSetForceInitialSyncFailure asked for a forced failure.
                if( (source->state() != MemberState::RS_PRIMARY && source->state() != MemberState::RS_SECONDARY) || replSetForceInitialSyncFailure ) {
                    int f = replSetForceInitialSyncFailure;
                    if( f > 0 ) {
                        replSetForceInitialSyncFailure = f-1;
                        log() << "replSet test code invoked, replSetForceInitialSyncFailure" << rsLog;
                        throw DBException("forced error",0);
                    }
                    log() << "replSet we are now primary" << rsLog;
                    throw DBException("primary changed",0);
                }

                applyOp(o, applyGTE);
            }

            if ( ++n % 1000 == 0 ) {
                time_t now = time(0);
                if (now - start > 10) {
                    // simple progress metering
                    log() << "replSet initialSyncOplogApplication applied " << n << " operations, synced to " << ts.toStringPretty() << rsLog;
                    start = now;
                }
            }

            getDur().commitIfNeeded();
        }
        catch (DBException& e) {
            // Skip duplicate key exceptions.
            // These are relatively common on initial sync: if a document is inserted
            // early in the clone step, the insert will be replayed but the document
            // will probably already have been cloned over.
            if( e.getCode() == 11000 || e.getCode() == 11001 || e.getCode() == 12582) {
                continue;
            }

            // handle cursor not found (just requery)
            if( e.getCode() == 13127 ) {
                log() << "replSet requerying oplog after cursor not found condition, ts: " << ts.toStringPretty() << endl;
                r.resetCursor();
                r.tailingQueryGTE(rsoplog, ts);
                if( r.haveCursor() ) {
                    continue;
                }
            }

            // TODO: handle server restart

            if( ts <= minValid ) {
                // didn't make it far enough
                log() << "replSet initial sync failing, error applying oplog : " << e.toString() << rsLog;
                return false;
            }

            // otherwise, whatever, we'll break out of the loop and catch
            // anything that's really wrong in syncTail
        }
    }
    return true;
}