/* grab initial copy of a database from the master */ void ReplSource::resync(const std::string& dbName) { const std::string db(dbName); // need local copy of the name, we're dropping the original string dummyNs = resyncDrop( db.c_str(), "internal" ); Client::Context ctx( dummyNs ); { log() << "resync: cloning database " << db << " to get an initial copy" << endl; ReplInfo r("resync: cloning a database"); string errmsg; int errCode = 0; bool ok = Cloner::cloneFrom(hostName.c_str(), errmsg, cc().database()->name(), false, /*slaveOk*/ true, /*replauth*/ true, /*snapshot*/false, /*mayYield*/true, /*mayBeInterrupted*/false, &errCode); if ( !ok ) { if ( errCode == DatabaseDifferCaseCode ) { resyncDrop( db.c_str(), "internal" ); log() << "resync: database " << db << " not valid on the master due to a name conflict, dropping." << endl; return; } else { problem() << "resync of " << db << " from " << hostName << " failed " << errmsg << endl; throw SyncException(); } } } log() << "resync: done with initial clone for db: " << db << endl; return; }
FixedBuffer::FixedBuffer(size_t capacity, size_t osize) : Buffer(capacity) { objsize = osize; buf = new char[capacity * objsize]; #ifdef CCXX_EXCEPTIONS if(!buf && Thread::getException() == Thread::throwObject) throw(this); #ifdef COMMON_STD_EXCEPTION else if(!buf && Thread::getException() == Thread::throwException) throw(SyncException("fixed buffer failure")); #endif #endif head = tail = buf; }
/* grab initial copy of a database from the master */ bool ReplSource::resync(string db) { string dummyNs = resyncDrop( db.c_str(), "internal" ); Client::Context ctx( dummyNs ); { log() << "resync: cloning database " << db << " to get an initial copy" << endl; ReplInfo r("resync: cloning a database"); string errmsg; bool ok = cloneFrom(hostName.c_str(), errmsg, cc().database()->name, false, /*slaveok*/ true, /*replauth*/ true, /*snapshot*/false, /*mayYield*/true, /*mayBeInterrupted*/false); if ( !ok ) { problem() << "resync of " << db << " from " << hostName << " failed " << errmsg << endl; throw SyncException(); } } log() << "resync: done with initial clone for db: " << db << endl; return true; }
/* slave: pull some data from the master's oplog note: not yet in db mutex at this point. @return -1 error 0 ok, don't sleep 1 ok, sleep */ int ReplSource::sync_pullOpLog(int& nApplied) { int okResultCode = 1; string ns = string("local.oplog.$") + sourceName(); log(2) << "repl: sync_pullOpLog " << ns << " syncedTo:" << syncedTo.toStringLong() << '\n'; bool tailing = true; oplogReader.tailCheck(); bool initial = syncedTo.isNull(); if ( !oplogReader.haveCursor() || initial ) { if ( initial ) { // Important to grab last oplog timestamp before listing databases. syncToTailOfRemoteLog(); BSONObj info; bool ok = oplogReader.conn()->runCommand( "admin", BSON( "listDatabases" << 1 ), info ); massert( 10389 , "Unable to get database list", ok ); BSONObjIterator i( info.getField( "databases" ).embeddedObject() ); while( i.moreWithEOO() ) { BSONElement e = i.next(); if ( e.eoo() ) break; string name = e.embeddedObject().getField( "name" ).valuestr(); if ( !e.embeddedObject().getBoolField( "empty" ) ) { if ( name != "local" ) { if ( only.empty() || only == name ) { log( 2 ) << "adding to 'addDbNextPass': "******"$gte", syncedTo.asDate()); BSONObjBuilder query; query.append("ts", q.done()); if ( !only.empty() ) { // note we may here skip a LOT of data table scanning, a lot of work for the master. query.appendRegex("ns", string("^") + only); // maybe append "\\." here? } BSONObj queryObj = query.done(); // e.g. queryObj = { ts: { $gte: syncedTo } } oplogReader.tailingQuery(ns.c_str(), queryObj); tailing = false; } else { log(2) << "repl: tailing=true\n"; } if( !oplogReader.haveCursor() ) { problem() << "repl: dbclient::query returns null (conn closed?)" << endl; oplogReader.resetConnection(); return -1; } // show any deferred database creates from a previous pass { set<string>::iterator i = addDbNextPass.begin(); if ( i != addDbNextPass.end() ) { BSONObjBuilder b; b.append("ns", *i + '.'); b.append("op", "db"); BSONObj op = b.done(); sync_pullOpLog_applyOperation(op, false); } } if ( !oplogReader.more() ) { if ( tailing ) { log(2) << "repl: tailing & no new activity\n"; if( oplogReader.awaitCapable() ) okResultCode = 0; // don't sleep } else { log() << "repl: " << ns << " oplog is empty\n"; } { dblock lk; save(); } return okResultCode; } OpTime nextOpTime; { BSONObj op = oplogReader.next(); BSONElement ts = op.getField("ts"); if ( ts.type() != Date && ts.type() != Timestamp ) { string err = op.getStringField("$err"); if ( !err.empty() ) { // 13051 is "tailable cursor requested on non capped collection" if (op.getIntField("code") == 13051) { problem() << "trying to slave off of a non-master" << '\n'; massert( 13344 , "trying to slave off of a non-master", false ); } else { problem() << "repl: $err reading remote oplog: " + err << '\n'; massert( 10390 , "got $err reading remote oplog", false ); } } else { problem() << "repl: bad object read from remote oplog: " << op.toString() << '\n'; massert( 10391 , "repl: bad object read from remote oplog", false); } } nextOpTime = OpTime( ts.date() ); log(2) << "repl: first op time received: " << nextOpTime.toString() << '\n'; if ( initial ) { log(1) << "repl: initial run\n"; } if( tailing ) { if( !( syncedTo < nextOpTime ) ) { log() << "repl ASSERTION failed : syncedTo < nextOpTime" << endl; log() << "repl syncTo: " << syncedTo.toStringLong() << endl; log() << "repl nextOpTime: " << nextOpTime.toStringLong() << endl; assert(false); } oplogReader.putBack( op ); // op will be processed in the loop below nextOpTime = OpTime(); // will reread the op below } else if ( nextOpTime != syncedTo ) { // didn't get what we queried for - error Nullstream& l = log(); l << "repl: nextOpTime " << nextOpTime.toStringLong() << ' '; if ( nextOpTime < syncedTo ) l << "<??"; else l << ">"; l << " syncedTo " << syncedTo.toStringLong() << '\n'; log() << "repl: time diff: " << (nextOpTime.getSecs() - syncedTo.getSecs()) << "sec\n"; log() << "repl: tailing: " << tailing << '\n'; log() << "repl: data too stale, halting replication" << endl; replInfo = replAllDead = "data too stale halted replication"; assert( syncedTo < nextOpTime ); throw SyncException(); } else { /* t == syncedTo, so the first op was applied previously or it is the first op of initial query and need not be applied. */ } } // apply operations { int n = 0; time_t saveLast = time(0); while ( 1 ) { bool moreInitialSyncsPending = !addDbNextPass.empty() && n; // we need "&& n" to assure we actually process at least one op to get a sync point recorded in the first place. if ( moreInitialSyncsPending || !oplogReader.more() ) { dblock lk; // NOTE aaron 2011-03-29 This block may be unnecessary, but I'm leaving it in place to avoid changing timing behavior. { dbtemprelease t; if ( !moreInitialSyncsPending && oplogReader.more() ) { continue; } // otherwise, break out of loop so we can set to completed or clone more dbs } if( oplogReader.awaitCapable() && tailing ) okResultCode = 0; // don't sleep syncedTo = nextOpTime; save(); // note how far we are synced up to now log() << "repl: applied " << n << " operations" << endl; nApplied = n; log() << "repl: end sync_pullOpLog syncedTo: " << syncedTo.toStringLong() << endl; break; } else { } OCCASIONALLY if( n > 0 && ( n > 100000 || time(0) - saveLast > 60 ) ) { // periodically note our progress, in case we are doing a lot of work and crash dblock lk; syncedTo = nextOpTime; // can't update local log ts since there are pending operations from our peer save(); log() << "repl: checkpoint applied " << n << " operations" << endl; log() << "repl: syncedTo: " << syncedTo.toStringLong() << endl; saveLast = time(0); n = 0; } BSONObj op = oplogReader.next(); unsigned b = replApplyBatchSize; bool justOne = b == 1; scoped_ptr<writelock> lk( justOne ? 0 : new writelock() ); while( 1 ) { BSONElement ts = op.getField("ts"); if( !( ts.type() == Date || ts.type() == Timestamp ) ) { log() << "sync error: problem querying remote oplog record" << endl; log() << "op: " << op.toString() << endl; log() << "halting replication" << endl; replInfo = replAllDead = "sync error: no ts found querying remote oplog record"; throw SyncException(); } OpTime last = nextOpTime; nextOpTime = OpTime( ts.date() ); if ( !( last < nextOpTime ) ) { log() << "sync error: last applied optime at slave >= nextOpTime from master" << endl; log() << " last: " << last.toStringLong() << endl; log() << " nextOpTime: " << nextOpTime.toStringLong() << endl; log() << " halting replication" << endl; replInfo = replAllDead = "sync error last >= nextOpTime"; uassert( 10123 , "replication error last applied optime at slave >= nextOpTime from master", false); } if ( replSettings.slavedelay && ( unsigned( time( 0 ) ) < nextOpTime.getSecs() + replSettings.slavedelay ) ) { assert( justOne ); oplogReader.putBack( op ); _sleepAdviceTime = nextOpTime.getSecs() + replSettings.slavedelay + 1; dblock lk; if ( n > 0 ) { syncedTo = last; save(); } log() << "repl: applied " << n << " operations" << endl; log() << "repl: syncedTo: " << syncedTo.toStringLong() << endl; log() << "waiting until: " << _sleepAdviceTime << " to continue" << endl; return okResultCode; } sync_pullOpLog_applyOperation(op, !justOne); n++; if( --b == 0 ) break; // if to here, we are doing mulpile applications in a singel write lock acquisition if( !oplogReader.moreInCurrentBatch() ) { // break if no more in batch so we release lock while reading from the master break; } op = oplogReader.next(); getDur().commitIfNeeded(); } } } return okResultCode; }
/* local.$oplog.main is of the form: { ts: ..., op: <optype>, ns: ..., o: <obj> , o2: <extraobj>, b: <boolflag> } ... see logOp() comments. @param alreadyLocked caller already put us in write lock if true */ void ReplSource::sync_pullOpLog_applyOperation(BSONObj& op, bool alreadyLocked) { if( logLevel >= 6 ) // op.tostring is expensive so doing this check explicitly log(6) << "processing op: " << op << endl; if( op.getStringField("op")[0] == 'n' ) return; char clientName[MaxDatabaseNameLen]; const char *ns = op.getStringField("ns"); nsToDatabase(ns, clientName); if ( *ns == '.' ) { problem() << "skipping bad op in oplog: " << op.toString() << endl; return; } else if ( *ns == 0 ) { /*if( op.getStringField("op")[0] != 'n' )*/ { problem() << "halting replication, bad op in oplog:\n " << op.toString() << endl; replAllDead = "bad object in oplog"; throw SyncException(); } //ns = "local.system.x"; //nsToDatabase(ns, clientName); } if ( !only.empty() && only != clientName ) return; if( cmdLine.pretouch && !alreadyLocked/*doesn't make sense if in write lock already*/ ) { if( cmdLine.pretouch > 1 ) { /* note: this is bad - should be put in ReplSource. but this is first test... */ static int countdown; assert( countdown >= 0 ); if( countdown > 0 ) { countdown--; // was pretouched on a prev pass } else { const int m = 4; if( tp.get() == 0 ) { int nthr = min(8, cmdLine.pretouch); nthr = max(nthr, 1); tp.reset( new ThreadPool(nthr) ); } vector<BSONObj> v; oplogReader.peek(v, cmdLine.pretouch); unsigned a = 0; while( 1 ) { if( a >= v.size() ) break; unsigned b = a + m - 1; // v[a..b] if( b >= v.size() ) b = v.size() - 1; tp->schedule(pretouchN, v, a, b); DEV cout << "pretouch task: " << a << ".." << b << endl; a += m; } // we do one too... pretouchOperation(op); tp->join(); countdown = v.size(); } } else { pretouchOperation(op); } } scoped_ptr<writelock> lk( alreadyLocked ? 0 : new writelock() ); if ( replAllDead ) { // hmmm why is this check here and not at top of this function? does it get set between top and here? log() << "replAllDead, throwing SyncException: " << replAllDead << endl; throw SyncException(); } Client::Context ctx( ns ); ctx.getClient()->curop()->reset(); bool empty = ctx.db()->isEmpty(); bool incompleteClone = incompleteCloneDbs.count( clientName ) != 0; if( logLevel >= 6 ) log(6) << "ns: " << ns << ", justCreated: " << ctx.justCreated() << ", empty: " << empty << ", incompleteClone: " << incompleteClone << endl; // always apply admin command command // this is a bit hacky -- the semantics of replication/commands aren't well specified if ( strcmp( clientName, "admin" ) == 0 && *op.getStringField( "op" ) == 'c' ) { applyOperation( op ); return; } if ( ctx.justCreated() || empty || incompleteClone ) { // we must add to incomplete list now that setClient has been called incompleteCloneDbs.insert( clientName ); if ( nClonedThisPass ) { /* we only clone one database per pass, even if a lot need done. This helps us avoid overflowing the master's transaction log by doing too much work before going back to read more transactions. (Imagine a scenario of slave startup where we try to clone 100 databases in one pass.) */ addDbNextPass.insert( clientName ); } else { if ( incompleteClone ) { log() << "An earlier initial clone of '" << clientName << "' did not complete, now resyncing." << endl; } save(); Client::Context ctx(ns); nClonedThisPass++; resync(ctx.db()->name); addDbNextPass.erase(clientName); incompleteCloneDbs.erase( clientName ); } save(); } else { applyOperation( op ); addDbNextPass.erase( clientName ); } }