void BackgroundSync::produce() {
    // this oplog reader does not do a handshake because we don't want the server it's syncing
    // from to track how far it has synced
    OplogReader r(false /* doHandshake */);

    // find a target to sync from the last op time written
    getOplogReader(r);

    // no server found
    {
        boost::unique_lock<boost::mutex> lock(_mutex);

        if (_currentSyncTarget == NULL) {
            lock.unlock();
            sleepsecs(1);
            // if there is no one to sync from
            return;
        }

        r.tailingQueryGTE(rsoplog, _lastOpTimeFetched);
    }

    // if target cut connections between connecting and querying (for
    // example, because it stepped down) we might not have a cursor
    if (!r.haveCursor()) {
        return;
    }

    while (MONGO_FAIL_POINT(rsBgSyncProduce)) {
        sleepmillis(0);
    }

    uassert(1000, "replSet source for syncing doesn't seem to be await capable -- is it an older version of mongodb?",
            r.awaitCapable() );

    if (isRollbackRequired(r)) {
        stop();
        return;
    }

    while (!inShutdown()) {
        while (!inShutdown()) {
            if (!r.moreInCurrentBatch()) {
                int bs = r.currentBatchMessageSize();
                if( bs > 0 && bs < BatchIsSmallish ) {
                    // on a very low latency network, if we don't wait a little, we'll be
                    // getting ops to write almost one at a time.  this will both be expensive
                    // for the upstream server as well as potentially defeating our parallel
                    // application of batches on the secondary.
                    //
                    // the inference here is basically if the batch is really small, we are
                    // "caught up".
                    //
                    dassert( !Lock::isLocked() );
                    sleepmillis(SleepToAllowBatchingMillis);
                }

                if (theReplSet->gotForceSync()) {
                    return;
                }

                if (isAssumingPrimary() || theReplSet->isPrimary()) {
                    return;
                }

                // re-evaluate quality of sync target
                if (shouldChangeSyncTarget()) {
                    return;
                }

                //record time for each getmore
                {
                    TimerHolder batchTimer(&getmoreReplStats);
                    r.more();
                }
                //increment
                networkByteStats.increment(r.currentBatchMessageSize());
            }

            if (!r.more())
                break;

            BSONObj o = r.nextSafe().getOwned();
            opsReadStats.increment();

            {
                boost::unique_lock<boost::mutex> lock(_mutex);
                _appliedBuffer = false;
            }

            OCCASIONALLY {
                LOG(2) << "bgsync buffer has " << _buffer.size() << " bytes" << rsLog;
            }

            // the blocking queue will wait (forever) until there's room for us to push
            _buffer.push(o);
            bufferCountGauge.increment();
            bufferSizeGauge.increment(getSize(o));

            {
                boost::unique_lock<boost::mutex> lock(_mutex);
                _lastH = o["h"].numberLong();
                _lastOpTimeFetched = o["ts"]._opTime();
            }
        } // end while

        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            if (_pause || !_currentSyncTarget || !_currentSyncTarget->hbinfo().hbstate.readable()) {
                return;
            }
        }

        r.tailCheck();
        if( !r.haveCursor() ) {
            LOG(1) << "replSet end syncTail pass" << rsLog;
            return;
        }
        // looping back is ok because this is a tailable cursor
    }
}
void BackgroundSync::produce(OperationContext* txn) {
    // this oplog reader does not do a handshake because we don't want the server it's syncing
    // from to track how far it has synced
    {
        boost::unique_lock<boost::mutex> lock(_mutex);
        if (_lastOpTimeFetched.isNull()) {
            // then we're initial syncing and we're still waiting for this to be set
            lock.unlock();
            sleepsecs(1);
            // if there is no one to sync from
            return;
        }

        // Wait until we've applied the ops we have before we choose a sync target
        while (!_appliedBuffer) {
            _condvar.wait(lock);
        }
    }

    while (MONGO_FAIL_POINT(rsBgSyncProduce)) {
        sleepmillis(0);
    }

    // find a target to sync from the last optime fetched
    OpTime lastOpTimeFetched;
    {
        boost::unique_lock<boost::mutex> lock(_mutex);
        lastOpTimeFetched = _lastOpTimeFetched;
        _syncSourceHost = HostAndPort();
    }
    _syncSourceReader.resetConnection();
    _syncSourceReader.connectToSyncSource(txn, lastOpTimeFetched, _replCoord);

    {
        boost::unique_lock<boost::mutex> lock(_mutex);
        // no server found
        if (_syncSourceReader.getHost().empty()) {
            lock.unlock();
            sleepsecs(1);
            // if there is no one to sync from
            return;
        }
        lastOpTimeFetched = _lastOpTimeFetched;
        _syncSourceHost = _syncSourceReader.getHost();
    }

    _syncSourceReader.tailingQueryGTE(rsoplog, lastOpTimeFetched);

    // if target cut connections between connecting and querying (for
    // example, because it stepped down) we might not have a cursor
    if (!_syncSourceReader.haveCursor()) {
        return;
    }

    if (_rollbackIfNeeded(txn, _syncSourceReader)) {
        stop();
        return;
    }

    while (!inShutdown()) {
        if (!_syncSourceReader.moreInCurrentBatch()) {
            // Check some things periodically
            // (whenever we run out of items in the
            // current cursor batch)

            int bs = _syncSourceReader.currentBatchMessageSize();
            if( bs > 0 && bs < BatchIsSmallish ) {
                // on a very low latency network, if we don't wait a little, we'll be
                // getting ops to write almost one at a time.  this will both be expensive
                // for the upstream server as well as potentially defeating our parallel
                // application of batches on the secondary.
                //
                // the inference here is basically if the batch is really small, we are
                // "caught up".
                //
                sleepmillis(SleepToAllowBatchingMillis);
            }

            // If we are transitioning to primary state, we need to leave
            // this loop in order to go into bgsync-pause mode.
            if (_replCoord->isWaitingForApplierToDrain() ||
                _replCoord->getCurrentMemberState().primary()) {
                return;
            }

            // re-evaluate quality of sync target
            if (shouldChangeSyncSource()) {
                return;
            }

            {
                //record time for each getmore
                TimerHolder batchTimer(&getmoreReplStats);

                // This calls receiveMore() on the oplogreader cursor.
                // It can wait up to five seconds for more data.
                _syncSourceReader.more();
            }
            networkByteStats.increment(_syncSourceReader.currentBatchMessageSize());

            if (!_syncSourceReader.moreInCurrentBatch()) {
                // If there is still no data from upstream, check a few more things
                // and then loop back for another pass at getting more data
                {
                    boost::unique_lock<boost::mutex> lock(_mutex);
                    if (_pause) {
                        return;
                    }
                }

                _syncSourceReader.tailCheck();
                if( !_syncSourceReader.haveCursor() ) {
                    LOG(1) << "replSet end syncTail pass" << rsLog;
                    return;
                }

                continue;
            }
        }

        // At this point, we are guaranteed to have at least one thing to read out
        // of the oplogreader cursor.
        BSONObj o = _syncSourceReader.nextSafe().getOwned();
        opsReadStats.increment();

        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            _appliedBuffer = false;
        }

        OCCASIONALLY {
            LOG(2) << "bgsync buffer has " << _buffer.size() << " bytes" << rsLog;
        }

        // the blocking queue will wait (forever) until there's room for us to push
        _buffer.push(o);
        bufferCountGauge.increment();
        bufferSizeGauge.increment(getSize(o));

        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            _lastFetchedHash = o["h"].numberLong();
            _lastOpTimeFetched = o["ts"]._opTime();
            LOG(3) << "replSet lastOpTimeFetched: " << _lastOpTimeFetched.toStringPretty() << rsLog;
        }
    }
}
void BackgroundSync::produce() {
    // this oplog reader does not do a handshake because we don't want the server it's syncing
    // from to track how far it has synced
    OplogReader r;
    OpTime lastOpTimeFetched;

    // find a target to sync from the last op time written
    getOplogReader(r);

    // no server found
    {
        boost::unique_lock<boost::mutex> lock(_mutex);

        if (_currentSyncTarget == NULL) {
            lock.unlock();
            sleepsecs(1);
            // if there is no one to sync from
            return;
        }

        lastOpTimeFetched = _lastOpTimeFetched;
    }

    r.tailingQueryGTE(rsoplog, lastOpTimeFetched);

    // if target cut connections between connecting and querying (for
    // example, because it stepped down) we might not have a cursor
    if (!r.haveCursor()) {
        return;
    }

    uassert(1000, "replSet source for syncing doesn't seem to be await capable -- is it an older version of mongodb?",
            r.awaitCapable() );

    if (isRollbackRequired(r)) {
        stop();
        return;
    }

    while (!inShutdown()) {
        if (!r.moreInCurrentBatch()) {
            // Check some things periodically
            // (whenever we run out of items in the
            // current cursor batch)

            int bs = r.currentBatchMessageSize();
            if( bs > 0 && bs < BatchIsSmallish ) {
                // on a very low latency network, if we don't wait a little, we'll be
                // getting ops to write almost one at a time.  this will both be expensive
                // for the upstream server as well as potentially defeating our parallel
                // application of batches on the secondary.
                //
                // the inference here is basically if the batch is really small, we are
                // "caught up".
                //
                dassert( !Lock::isLocked() );
                sleepmillis(SleepToAllowBatchingMillis);
            }

            if (theReplSet->gotForceSync()) {
                return;
            }

            // If we are transitioning to primary state, we need to leave
            // this loop in order to go into bgsync-pause mode.
            if (isAssumingPrimary() || theReplSet->isPrimary()) {
                return;
            }

            // re-evaluate quality of sync target
            if (shouldChangeSyncTarget()) {
                return;
            }

            {
                //record time for each getmore
                TimerHolder batchTimer(&getmoreReplStats);

                // This calls receiveMore() on the oplogreader cursor.
                // It can wait up to five seconds for more data.
                r.more();
            }
            networkByteStats.increment(r.currentBatchMessageSize());

            if (!r.moreInCurrentBatch()) {
                // If there is still no data from upstream, check a few more things
                // and then loop back for another pass at getting more data
                {
                    boost::unique_lock<boost::mutex> lock(_mutex);
                    if (_pause || !_currentSyncTarget || !_currentSyncTarget->hbinfo().hbstate.readable()) {
                        return;
                    }
                }

                r.tailCheck();
                if( !r.haveCursor() ) {
                    LOG(1) << "replSet end syncTail pass" << rsLog;
                    return;
                }

                continue;
            }
        }

        // At this point, we are guaranteed to have at least one thing to read out
        // of the oplogreader cursor.
        BSONObj o = r.nextSafe().getOwned();
        opsReadStats.increment();

        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            _appliedBuffer = false;
        }

        OCCASIONALLY {
            LOG(2) << "bgsync buffer has " << _buffer.size() << " bytes" << rsLog;
        }

        // the blocking queue will wait (forever) until there's room for us to push
        _buffer.push(o);
        bufferCountGauge.increment();
        bufferSizeGauge.increment(getSize(o));

        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            _lastH = o["h"].numberLong();
            _lastOpTimeFetched = o["ts"]._opTime();
            LOG(3) << "replSet lastOpTimeFetched: " << _lastOpTimeFetched.toStringPretty() << rsLog;
        }
    }
}
// returns number of seconds to sleep, if any
uint32_t BackgroundSync::produce() {
    // normally msgCheckNewState gets called periodically, but in a single node repl set
    // there are no heartbeat threads, so we do it here to be sure.  this is relevant if the
    // singleton member has done a stepDown() and needs to come back up.
    if (theReplSet->config().members.size() == 1 &&
        theReplSet->myConfig().potentiallyHot()) {
        Manager* mgr = theReplSet->mgr;
        // When would mgr be null?  During replsettest'ing, in which case we should
        // fall through and actually apply ops as if we were a real secondary.
        if (mgr) {
            mgr->send(boost::bind(&Manager::msgCheckNewState, theReplSet->mgr));
            // There should never be ops to sync in a 1-member set, anyway
            return 1;
        }
    }

    OplogReader r(true /* doHandshake */);

    // find a target to sync from the last op time written
    getOplogReader(r);

    // no server found
    GTID lastGTIDFetched = theReplSet->gtidManager->getLiveState();
    {
        boost::unique_lock<boost::mutex> lock(_mutex);

        if (_currentSyncTarget == NULL) {
            // if there is no one to sync from
            return 1; // sleep one second
        }
    }
    r.tailingQueryGTE(rsoplog, lastGTIDFetched);

    // if target cut connections between connecting and querying (for
    // example, because it stepped down) we might not have a cursor
    if (!r.haveCursor()) {
        return 0;
    }

    try {
        // this method may actually run rollback, yes, the name is bad
        if (isRollbackRequired(r)) {
            // sleep 2 seconds and try again. (The 2 is arbitrary).
            // If we are not fatal, then we will keep trying to sync
            // from another machine
            return 2;
        }
    }
    catch (RollbackOplogException& re) {
        // we attempted a rollback and failed, we must go fatal.
        log() << "Caught a RollbackOplogException during rollback, going fatal" << rsLog;
        theReplSet->fatal();
        return 2; // 2 is arbitrary, if we are going fatal, we are done
    }

    while (!_opSyncShouldExit) {
        while (!_opSyncShouldExit) {
            {
                // check if we should bail out
                boost::unique_lock<boost::mutex> lck(_mutex);
                if (!_opSyncShouldRun) {
                    return 0;
                }
            }

            if (!r.moreInCurrentBatch()) {
                // check to see if we have a request to sync
                // from a specific target. If so, get out so that
                // we can restart the act of syncing and
                // do so from the correct target
                if (theReplSet->gotForceSync()) {
                    return 0;
                }

                verify(!theReplSet->isPrimary());

                if (shouldChangeSyncTarget()) {
                    return 0;
                }

                //record time for each getmore
                {
                    TimerHolder batchTimer(&getmoreReplStats);
                    r.more();
                }
                //increment
                networkByteStats.increment(r.currentBatchMessageSize());
            }

            if (!r.more()) {
                break;
            }

            // This is the operation we have received from the target
            // that we must put in our oplog with an applied field of false
            BSONObj o = r.nextSafe().getOwned();
            opsReadStats.increment();
            LOG(3) << "replicating " << o.toString(false, true) << " from " << _currentSyncTarget->fullName() << endl;
            uint64_t ts = o["ts"]._numberLong();

            // now that we have the element in o, let's check
            // if a delay is required (via slaveDelay) before
            // writing it to the oplog
            if (theReplSet->myConfig().slaveDelay > 0) {
                handleSlaveDelay(ts);
                {
                    boost::unique_lock<boost::mutex> lck(_mutex);
                    if (!_opSyncShouldRun) {
                        break;
                    }
                }
            }

            {
                Timer timer;
                bool bigTxn = false;
                {
                    Client::Transaction transaction(DB_SERIALIZABLE);
                    replicateFullTransactionToOplog(o, r, &bigTxn);
                    // we are operating as a secondary. We don't have to fsync
                    transaction.commit(DB_TXN_NOSYNC);
                }
                {
                    GTID currEntry = getGTIDFromOplogEntry(o);
                    uint64_t lastHash = o["h"].numberLong();
                    boost::unique_lock<boost::mutex> lock(_mutex);
                    // update counters
                    theReplSet->gtidManager->noteGTIDAdded(currEntry, ts, lastHash);
                    // notify applier thread that data exists
                    if (_deque.size() == 0) {
                        _queueCond.notify_all();
                    }
                    _deque.push_back(o);
                    bufferCountGauge.increment();
                    bufferSizeGauge.increment(o.objsize());

                    // this is a flow control mechanism, with bad numbers
                    // hard coded for now just to get something going.
                    // If the opSync thread notices that we have over 20000
                    // transactions in the queue, it waits until we get below
                    // 10000. This is where we wait if we get too high
                    // Once we have spilling of transactions working, this
                    // logic will need to be redone
                    if (_deque.size() > 20000) {
                        _queueCond.wait(lock);
                    }
                    if (bigTxn) {
                        // if we have a large transaction, we don't want
                        // to let it pile up. We want to process it immediately
                        // before processing anything else.
                        while (_deque.size() > 0) {
                            _queueDone.wait(lock);
                        }
                    }
                }
            }
        } // end while

        if (shouldChangeSyncTarget()) {
            return 0;
        }

        r.tailCheck();
        if( !r.haveCursor() ) {
            LOG(1) << "replSet end opSync pass" << rsLog;
            return 0;
        }
        // looping back is ok because this is a tailable cursor
    }
    return 0;
}
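// The producer loops above all hand fetched ops to an applier thread through a bounded
// queue and throttle themselves when the queue gets too deep ("the blocking queue will
// wait (forever) until there's room for us to push", and the hard-coded 20000-entry cap
// in the TokuMX variant). The following is a minimal, standalone sketch of that
// producer/applier flow-control pattern, not code from any of the versions shown; the
// names (OpQueue, kHighWatermark) are hypothetical, and the 20000/10000 hysteresis is
// simplified to a single watermark.

#include <condition_variable>
#include <cstddef>
#include <deque>
#include <mutex>
#include <string>

class OpQueue {
public:
    // Producer side: block while the queue is at the watermark, then enqueue one op
    // and wake the applier if it is waiting for work.
    void push(const std::string& op) {
        std::unique_lock<std::mutex> lock(_mutex);
        _spaceAvailable.wait(lock, [this] { return _ops.size() < kHighWatermark; });
        _ops.push_back(op);
        _dataAvailable.notify_one();
    }

    // Applier side: block until there is work, dequeue one op, and let the producer
    // resume now that the queue has drained by one entry.
    std::string pop() {
        std::unique_lock<std::mutex> lock(_mutex);
        _dataAvailable.wait(lock, [this] { return !_ops.empty(); });
        std::string op = _ops.front();
        _ops.pop_front();
        _spaceAvailable.notify_one();
        return op;
    }

private:
    // Mirrors the hard-coded queue-depth limit described in the comments above.
    static constexpr std::size_t kHighWatermark = 20000;

    std::mutex _mutex;
    std::condition_variable _dataAvailable;
    std::condition_variable _spaceAvailable;
    std::deque<std::string> _ops;
};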