Example #1
    void ChunkManager::loadExistingRanges(const ChunkManager* oldManager) {
        int tries = 3;

        while (tries--) {
            ChunkMap chunkMap;
            set<Shard> shards;
            ShardVersionMap shardVersions;

            Timer t;

            bool success = _load(chunkMap, shards, &shardVersions, oldManager);
            if (success) {
                log() << "ChunkManager: time to load chunks for " << _ns << ": "
                      << t.millis() << "ms"
                      << " sequenceNumber: " << _sequenceNumber
                      << " version: " << _version.toString()
                      << " based on: "
                            << (oldManager ? oldManager->getVersion().toString() : "(empty)");

                // TODO: Merge into diff code above, so we validate in one place
                if (isChunkMapValid(chunkMap)) {
                    _chunkMap.swap(chunkMap);
                    _shards.swap(shards);
                    _shardVersions.swap(shardVersions);
                    _chunkRanges.reloadAll(_chunkMap);

                    return;
                }
            }

            if (_chunkMap.size() < 10) {
                _printChunks();
            }

            warning() << "ChunkManager loaded an invalid config for " << _ns << ", trying again";

            sleepmillis(10 * (3 - tries));
        }

        // This will abort construction so we should never have a reference to an invalid config
        msgasserted(13282, str::stream() << "Couldn't load a valid config for " << _ns
                                         << " after 3 attempts. Please try again.");
    }
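The shape above — a fixed number of attempts, a linearly growing sleep between attempts, and a hard assert once every attempt is spent — recurs throughout these examples. A minimal, self-contained sketch of that pattern in portable C++ (loadOnce, the attempt count, and the backoff step are illustrative placeholders, not part of the original code):

    #include <chrono>
    #include <functional>
    #include <stdexcept>
    #include <thread>

    // Run loadOnce up to maxTries times, sleeping a little longer after each
    // failure, and give up loudly once all attempts are exhausted.
    void loadWithRetries(const std::function<bool()>& loadOnce, int maxTries = 3) {
        for (int attempt = 1; attempt <= maxTries; ++attempt) {
            if (loadOnce())
                return;
            // linear backoff: 10ms after the first failure, 20ms after the second, ...
            std::this_thread::sleep_for(std::chrono::milliseconds(10 * attempt));
        }
        throw std::runtime_error("couldn't load a valid config after all attempts");
    }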
Example #2
 bool waitForSyncToFinish(OperationContext* txn, string &errmsg) const {
     // Wait for slave thread to finish syncing, so sources will be
     // reloaded with new saved state on next pass.
     Timer t;
     while ( 1 ) {
         if ( syncing == 0 || t.millis() > 30000 )
             break;
         {
             Lock::TempRelease t(txn->lockState());
             relinquishSyncingSome = 1;
             sleepmillis(1);
         }
     }
     if ( syncing ) {
         errmsg = "timeout waiting for sync() to finish";
         return false;
     }
     return true;
 }
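Stripped of the MongoDB-specific lock juggling, the loop above is a poll-until-deadline: check a flag, sleep a millisecond, give up after 30 seconds. A rough standard-library equivalent, with a std::atomic standing in for the original syncing counter:

    #include <atomic>
    #include <chrono>
    #include <thread>

    std::atomic<int> syncing{0};  // stand-in for the original global counter

    // Returns true once syncing drops to zero, false if the deadline passes first.
    bool waitForSyncToFinish() {
        const auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(30);
        while (syncing.load() != 0 && std::chrono::steady_clock::now() < deadline) {
            std::this_thread::sleep_for(std::chrono::milliseconds(1));
        }
        return syncing.load() == 0;
    }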
Example #3
 void MiniWebServer::run() {
     SockAddr from;
     while ( 1 ) {
         int s = accept(sock, from.getSockAddr(), &from.addressSize);
         if ( s < 0 ) {
             if ( errno == ECONNABORTED ) {
                 log() << "Listener on port " << port << " aborted." << endl;
                 return;
             }
             log() << "MiniWebServer: accept() returns " << s << " errno:" << errno << endl;
             sleepmillis(200);
             continue;
         }
         disableNagle(s);
         RARELY log() << "MiniWebServer: connection accepted from " << from.toString() << endl;
         accepted( s, from );
         closesocket(s);
     }
 }
Example #4
void WiredTigerSessionCache::shuttingDown() {
    uint32_t actual = _shuttingDown.load();
    uint32_t expected;

    // Try to atomically set _shuttingDown flag, but just return if another thread was first.
    do {
        expected = actual;
        actual = _shuttingDown.compareAndSwap(expected, expected | kShuttingDownMask);
        if (actual & kShuttingDownMask)
            return;
    } while (actual != expected);

    // Spin as long as there are threads in releaseSession
    while (_shuttingDown.load() != kShuttingDownMask) {
        sleepmillis(1);
    }

    closeAll();
}
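The function above combines two patterns: a compare-and-swap loop so that exactly one thread gets to set the shutdown bit, followed by a spin-wait until the remaining threads drain out of releaseSession. A rough equivalent using std::atomic (the mask value and the counter layout are assumptions for illustration):

    #include <atomic>
    #include <chrono>
    #include <cstdint>
    #include <thread>

    const uint32_t kShuttingDownMask = 1u << 31;
    std::atomic<uint32_t> shuttingDown{0};  // low bits count threads in releaseSession

    void shutDownOnce() {
        uint32_t expected = shuttingDown.load();
        do {
            if (expected & kShuttingDownMask)
                return;  // another thread won the race; it will do the shutdown
        } while (!shuttingDown.compare_exchange_weak(expected, expected | kShuttingDownMask));

        // Spin until only the flag bit remains, i.e. every thread that was
        // inside releaseSession when we flipped the bit has left.
        while (shuttingDown.load() != kShuttingDownMask) {
            std::this_thread::sleep_for(std::chrono::milliseconds(1));
        }
        // closeAll() would run here
    }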
Example #5
    /* static */
    BSONObj WriteBackListener::waitFor( const ConnectionIdent& ident, const OID& oid ) {

        Timer t;
        Timer lastMessageTimer;

        while ( t.minutes() < 60 ) {
            {
                scoped_lock lk( _seenWritebacksLock );
                WBStatus s = _seenWritebacks[ident];

                if ( oid < s.id ) {
                    // this means we're waiting for a GLE that already passed.
                    // it should be impossible because once we call GLE, no other
                    // writebacks should happen with that connection id

                    msgasserted( 14041 , str::stream() << "got writeback waitfor for older id " <<
                                 " oid: " << oid << " s.id: " << s.id << " ident: " << ident.toString() );
                }
                else if ( oid == s.id ) {
                    return s.gle;
                }

                // Stay in lock so we can use the status
                if( lastMessageTimer.seconds() > 10 ){

                    warning() << "waiting for writeback " << oid
                              << " from connection " << ident.toString()
                              << " for " << t.seconds() << " secs"
                              << ", currently at id " << s.id << endl;

                    lastMessageTimer.reset();
                }
            }

            sleepmillis( 10 );
        }

        uasserted( 13403 , str::stream() << "didn't get writeback for: " << oid
                                         << " after: " << t.millis() << " ms"
                                         << " from connection " << ident.toString() );

        throw 1; // never gets here
    }
Example #6
void BackgroundSync::_run() {
    Client::initThread("rsBackgroundSync");
    AuthorizationSession::get(cc())->grantInternalAuthorization();

    while (!inShutdown()) {
        try {
            _runProducer();
        } catch (const DBException& e) {
            std::string msg(str::stream() << "sync producer problem: " << redact(e));
            error() << msg;
            _replCoord->setMyHeartbeatMessage(msg);
            sleepmillis(100);  // sleep a bit to keep from hammering this thread with temp. errors.
        } catch (const std::exception& e2) {
            // redact(std::exception&) doesn't work
            severe() << "sync producer exception: " << redact(e2.what());
            fassertFailed(28546);
        }
    }
    stop();
}
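This outer loop is a classic supervisor: transient errors are logged and retried after a short pause so the thread doesn't spin at full speed on a persistent failure, while unexpected exception types are treated as fatal. A generic sketch of that split (the error types and the 100ms pause mirror the code above but are otherwise placeholders):

    #include <atomic>
    #include <chrono>
    #include <cstdio>
    #include <cstdlib>
    #include <stdexcept>
    #include <thread>

    std::atomic<bool> shutdownFlag{false};

    void supervisorLoop(void (*runProducer)()) {
        while (!shutdownFlag.load()) {
            try {
                runProducer();
            } catch (const std::runtime_error& e) {
                // transient: log, back off briefly, retry
                std::fprintf(stderr, "sync producer problem: %s\n", e.what());
                std::this_thread::sleep_for(std::chrono::milliseconds(100));
            } catch (const std::exception& e) {
                // anything unexpected is fatal, as with fassertFailed above
                std::fprintf(stderr, "sync producer exception: %s\n", e.what());
                std::abort();
            }
        }
    }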
Example #7
 void t() {
     for( int i = 0; i < 20; i++ ) {
         sleepmillis(21);
         string fn = "/tmp/t1";
         MongoMMF f;
         unsigned long long len = 1 * 1024 * 1024;
         assert( f.create(fn, len, /*sequential*/rand()%2==0) );
         {
             char *p = (char *) f.getView();
             assert(p);
             // write something to the private view as a test
             strcpy(p, "hello");
         }
         if( cmdLine.dur ) {
             char *w = (char *) f.view_write();
             strcpy(w + 6, "world");
         }
         MongoFileFinder ff;
         ASSERT( ff.findByPath(fn) );
     }
 }
Example #8
 static void durThread() { 
     Client::initThread("dur");
     const int HowOftenToGroupCommitMs = 100;
     while( 1 ) { 
         try {
             int millis = HowOftenToGroupCommitMs;
             {
                 Timer t;
                 journalRotate(); // note we do this part outside of mongomutex
                 millis -= t.millis();
                 if( millis < 5 || millis > HowOftenToGroupCommitMs )
                     millis = 5;
             }
             sleepmillis(millis);
             go();
         }
         catch(std::exception& e) { 
             log() << "exception in durThread " << e.what() << endl;
         }
     }
 }
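durThread targets a roughly fixed group-commit cadence: measure how long the preparatory work took, subtract that from the 100ms budget, and clamp the remainder so the thread always sleeps at least a little. The budget-minus-elapsed computation in isolation (function name and clamp bounds mirror the code above):

    #include <chrono>
    #include <thread>

    // Run one unit of work, then sleep for whatever remains of a fixed period,
    // clamped to a small minimum so we always yield briefly.
    void runAtFixedPeriod(void (*work)(), long long periodMs = 100, long long minSleepMs = 5) {
        const auto start = std::chrono::steady_clock::now();
        work();
        const auto elapsedMs = std::chrono::duration_cast<std::chrono::milliseconds>(
                                   std::chrono::steady_clock::now() - start).count();
        long long remaining = periodMs - elapsedMs;
        if (remaining < minSleepMs || remaining > periodMs)
            remaining = minSleepMs;  // same clamping rule as above
        std::this_thread::sleep_for(std::chrono::milliseconds(remaining));
    }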
Example #9
        static void runThread() {
            while (keepGoing) {
                try {
                    if (current->lock_try( "test" )) {
                        int before = count.addAndFetch(1);
                        sleepmillis(3);
                        int after = count.loadRelaxed();

                        if (after != before) {
                            error() << " before: " << before << " after: " << after
                                    << endl;
                        }

                        current->unlock();
                    }
                }
                catch ( const DBException& ex ) {
                    log() << "*** !Could not try distributed lock." << causedBy( ex ) << endl;
                }
            }
        }
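The test thread above spins on a try-lock: when it wins, it holds the lock across a short sleep and checks that no one else bumped the counter in the meantime. The same try-then-retry shape with a process-local mutex (the distributed-lock semantics are of course not reproduced):

    #include <atomic>
    #include <chrono>
    #include <mutex>
    #include <thread>

    std::mutex testLock;
    std::atomic<bool> keepGoing{true};

    void tryLockLoop() {
        while (keepGoing.load()) {
            if (testLock.try_lock()) {
                // hold the lock briefly, as the distributed-lock test does
                std::this_thread::sleep_for(std::chrono::milliseconds(3));
                testLock.unlock();
            }
        }
    }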
Example #10
    void DataFileSync::run() {
        Client::initThread( name().c_str() );

        if (mmapv1GlobalOptions.syncdelay == 0) {
            log() << "warning: --syncdelay 0 is not recommended and can have strange performance" << endl;
        }
        else if (mmapv1GlobalOptions.syncdelay == 1) {
            log() << "--syncdelay 1" << endl;
        }
        else if (mmapv1GlobalOptions.syncdelay != 60) {
            LOG(1) << "--syncdelay " << mmapv1GlobalOptions.syncdelay << endl;
        }
        int time_flushing = 0;
        while ( ! inShutdown() ) {
            _diaglog.flush();
            if (mmapv1GlobalOptions.syncdelay == 0) {
                // in case at some point we add an option to change at runtime
                sleepsecs(5);
                continue;
            }

            sleepmillis((long long) std::max(0.0, (mmapv1GlobalOptions.syncdelay * 1000) - time_flushing));

            if ( inShutdown() ) {
                // occasional issue trying to flush during shutdown when sleep interrupted
                break;
            }

            Date_t start = jsTime();
            StorageEngine* storageEngine = getGlobalEnvironment()->getGlobalStorageEngine();
            int numFiles = storageEngine->flushAllFiles( true );
            time_flushing = (int) (jsTime() - start);

            _flushed(time_flushing);

            if( logger::globalLogDomain()->shouldLog(logger::LogSeverity::Debug(1)) || time_flushing >= 10000 ) {
                log() << "flushing mmaps took " << time_flushing << "ms " << " for " << numFiles << " files" << endl;
            }
        }
    }
Example #11
 void BackgroundSync::handleSlaveDelay(uint64_t opTimestamp) {
     dassert(_opSyncRunning);
     uint64_t slaveDelayMillis = theReplSet->myConfig().slaveDelay * 1000;
     uint64_t currTime = curTimeMillis64();
     uint64_t timeOpShouldBeApplied = opTimestamp + slaveDelayMillis;
     while (currTime < timeOpShouldBeApplied) {
         uint64_t sleepTime = (timeOpShouldBeApplied - currTime);
         // let's sleep for at most one second
         sleepmillis((sleepTime < 1000) ? sleepTime : 1000);        
         // check whether we should bail out: we don't want to sleep
         // through the whole (possibly long) delay if we're being stopped
         {
             boost::unique_lock<boost::mutex> lck(_mutex);
             if (!_opSyncShouldRun) {
                 break;
             }
         }
         // reset currTime
         currTime = curTimeMillis64();
     }
 }
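Sleeping in one-second slices lets the loop notice a stop request without waiting out the whole (possibly long) slave delay. The standard-library idiom for "sleep until T, but wake early on a signal" is a condition variable with a deadline; a rough sketch, with stopRequested standing in for _opSyncShouldRun:

    #include <chrono>
    #include <condition_variable>
    #include <mutex>

    std::mutex m;
    std::condition_variable cv;
    bool stopRequested = false;  // set under m, then cv.notify_all()

    // Returns true if the full delay elapsed, false if we were stopped early.
    bool sleepUntilUnlessStopped(std::chrono::steady_clock::time_point deadline) {
        std::unique_lock<std::mutex> lk(m);
        return !cv.wait_until(lk, deadline, [] { return stopRequested; });
    }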
Example #12
        bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {

            cc().curop()->suppressFromCurop();
            cc().curop()->setExpectedLatencyMs( 30000 );

            BSONElement e = cmdObj.firstElement();
            if ( e.type() != jstOID ) {
                errmsg = "need oid as first value";
                return 0;
            }

            // get the command issuer's (a mongos) serverID
            const OID id = e.__oid();

            // the command issuer is blocked awaiting a response;
            // we want to return at least every 5 minutes so sockets don't time out
            BSONObj z;
            if ( writeBackManager.getWritebackQueue(id.str())->queue.blockingPop( z, 5 * 60 /* 5 minutes */ ) ) {
                LOG(1) << "WriteBackCommand got : " << z << endl;
                result.append( "data" , z );
            }
            else {
                result.appendBool( "noop" , true );
            }

#ifdef _DEBUG
            PseudoRandom r(static_cast<int64_t>(time(0)));
            // Sleep a short amount of time usually
            int sleepFor = r.nextInt32( 10 );
            sleepmillis( sleepFor );

            // Sleep a longer amount of time every once and awhile
            int sleepLong = r.nextInt32( 50 );
            if( sleepLong == 0 ) sleepsecs( 2 );
#endif

            return true;
        }
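The _DEBUG block at the end injects timing jitter into the writeback path: usually a sub-10ms sleep, occasionally (one call in fifty) a full two seconds, which is a cheap way to shake out races under test. The same jitter with <random> (bounds copied from the code above):

    #include <chrono>
    #include <random>
    #include <thread>

    // Debug-only jitter: usually sleep 0-9ms, occasionally (1 in 50) sleep 2s.
    void debugJitter() {
        static thread_local std::mt19937 gen(std::random_device{}());
        std::this_thread::sleep_for(
            std::chrono::milliseconds(std::uniform_int_distribution<>(0, 9)(gen)));
        if (std::uniform_int_distribution<>(0, 49)(gen) == 0)
            std::this_thread::sleep_for(std::chrono::seconds(2));
    }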
Example #13
void BackgroundSync::_rollback(OperationContext* txn,
                               const HostAndPort& source,
                               stdx::function<DBClientBase*()> getConnection) {
    // Abort only when syncRollback detects we are in an unrecoverable state.
    // In other cases, we log the message contained in the error status and retry later.
    auto status = syncRollback(txn,
                               OplogInterfaceLocal(txn, rsOplogName),
                               RollbackSourceImpl(getConnection, source, rsOplogName),
                               _replCoord);
    if (status.isOK()) {
        // Tell the applier (syncTail) thread that there is no new data
        // by pushing a sentinel onto the buffer.
        _signalNoNewDataForApplier();
        // Wait until the buffer is empty.
        // This is an indication that syncTail has removed the sentinel marker from the buffer
        // and reset its local lastAppliedOpTime via the replCoord.
        while (!_buffer.empty()) {
            sleepmillis(10);
            if (inShutdown()) {
                return;
            }
        }

        // It is now safe to clear the ROLLBACK state, which may result in the applier thread
        // transitioning to SECONDARY.  This is safe because the applier thread has now reloaded
        // the new rollback minValid from the database.
        if (!_replCoord->setFollowerMode(MemberState::RS_RECOVERING)) {
            warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING)
                      << "; expected to be in state " << MemberState(MemberState::RS_ROLLBACK)
                      << " but found self in " << _replCoord->getMemberState();
        }
        return;
    }
    if (ErrorCodes::UnrecoverableRollbackError == status.code()) {
        fassertNoTrace(28723, status);
    }
    warning() << "rollback cannot proceed at this time (retrying later): " << status;
}
Example #14
    void BackgroundSync::markOplog() {
        LOG(3) << "replset markOplog: " << _consumedOpTime << " "
               << theReplSet->lastOpTimeWritten << rsLog;

        if (theReplSet->syncSourceFeedback.supportsUpdater()) {
            _consumedOpTime = theReplSet->lastOpTimeWritten;
            theReplSet->syncSourceFeedback.updateSelfInMap(theReplSet->lastOpTimeWritten);
        }
        else {
            boost::unique_lock<boost::mutex> oplogLockSSF(theReplSet->syncSourceFeedback.oplock);
            if (!hasCursor()) {
                oplogLockSSF.unlock();
                sleepmillis(500);
                return;
            }

            if (!theReplSet->syncSourceFeedback.moreInCurrentBatch()) {
                theReplSet->syncSourceFeedback.more();
            }

            if (!theReplSet->syncSourceFeedback.more()) {
                theReplSet->syncSourceFeedback.tailCheck();
                return;
            }

            // if this member has written the op at optime T
            // we want to nextSafe up to and including T
            while (_consumedOpTime < theReplSet->lastOpTimeWritten
                   && theReplSet->syncSourceFeedback.more()) {
                BSONObj temp = theReplSet->syncSourceFeedback.nextSafe();
                _consumedOpTime = temp["ts"]._opTime();
            }

            // call more() to signal the sync target that we've synced T
            theReplSet->syncSourceFeedback.more();
        }
    }
Example #15
    /* static */
    BSONObj WriteBackListener::waitFor( const ConnectionIdent& ident, const OID& oid ) {
        Timer t;
        for ( int i=0; i<5000; i++ ) {
            {
                scoped_lock lk( _seenWritebacksLock );
                WBStatus s = _seenWritebacks[ident];
                if ( oid < s.id ) {
                    // this means we're waiting for a GLE that already passed.
                    // it should be impossible because once we call GLE, no other
                    // writebacks should happen with that connection id

                    msgasserted( 14041 , str::stream() << "got writeback waitfor for older id " <<
                                 " oid: " << oid << " s.id: " << s.id << " ident: " << ident.toString() );
                }
                else if ( oid == s.id ) {
                    return s.gle;
                }
                
            }
            sleepmillis( 10 );
        }
        uasserted( 13403 , str::stream() << "didn't get writeback for: " << oid << " after: " << t.millis() << " ms" );
        throw 1; // never gets here
    }
Example #16
 void run() {
     Client::initThread("fsyncjob");
     Client& c = cc();
     {
         scoped_lock lk(lockedForWritingMutex);
         lockedForWriting++;
     }
     readlock lk("");
     MemoryMappedFile::flushAll(true);
     log() << "db is now locked for snapshotting, no writes allowed. use db.$cmd.sys.unlock.findOne() to unlock" << endl;
     _ready = true;
     while( 1 ) {
         if( unlockRequested ) {
             unlockRequested = false;
             break;
         }
         sleepmillis(20);
     }
     {
         scoped_lock lk(lockedForWritingMutex);
         lockedForWriting--;
     }
     c.shutdown();
 }
Example #17
    void BackgroundSync::produce() {
        // this oplog reader does not do a handshake because we don't want the server it's syncing
        // from to track how far it has synced
        OplogReader r(false /* doHandshake */);

        // find a target to sync from the last op time written
        getOplogReader(r);

        // no server found
        {
            boost::unique_lock<boost::mutex> lock(_mutex);

            if (_currentSyncTarget == NULL) {
                lock.unlock();
                sleepsecs(1);
                // if there is no one to sync from
                return;
            }

            r.tailingQueryGTE(rsoplog, _lastOpTimeFetched);
        }

        // if target cut connections between connecting and querying (for
        // example, because it stepped down) we might not have a cursor
        if (!r.haveCursor()) {
            return;
        }

        while (MONGO_FAIL_POINT(rsBgSyncProduce)) {
            sleepmillis(0);
        }

        uassert(1000, "replSet source for syncing doesn't seem to be await capable -- is it an older version of mongodb?", r.awaitCapable() );

        if (isRollbackRequired(r)) {
            stop();
            return;
        }

        while (!inShutdown()) {
            while (!inShutdown()) {

                if (!r.moreInCurrentBatch()) {
                    int bs = r.currentBatchMessageSize();
                    if( bs > 0 && bs < BatchIsSmallish ) {
                        // on a very low latency network, if we don't wait a little, we'll be
                        // getting ops to write almost one at a time.  this will both be expensive
                        // for the upstream server as well as potentially defeating our parallel
                        // application of batches on the secondary.
                        //
                        // the inference here is basically if the batch is really small, we are 
                        // "caught up".
                        //
                        dassert( !Lock::isLocked() );
                        sleepmillis(SleepToAllowBatchingMillis);
                    }
  
                    if (theReplSet->gotForceSync()) {
                        return;
                    }

                    if (isAssumingPrimary() || theReplSet->isPrimary()) {
                        return;
                    }

                    // re-evaluate quality of sync target
                    if (shouldChangeSyncTarget()) {
                        return;
                    }
                    //record time for each getmore
                    {
                        TimerHolder batchTimer(&getmoreReplStats);
                        r.more();
                    }
                // record the network bytes received for this batch
                    networkByteStats.increment(r.currentBatchMessageSize());

                }

                if (!r.more())
                    break;

                BSONObj o = r.nextSafe().getOwned();
                opsReadStats.increment();

                {
                    boost::unique_lock<boost::mutex> lock(_mutex);
                    _appliedBuffer = false;
                }

                OCCASIONALLY {
                    LOG(2) << "bgsync buffer has " << _buffer.size() << " bytes" << rsLog;
                }
                // the blocking queue will wait (forever) until there's room for us to push
                _buffer.push(o);
                bufferCountGauge.increment();
                bufferSizeGauge.increment(getSize(o));

                {
                    boost::unique_lock<boost::mutex> lock(_mutex);
                    _lastH = o["h"].numberLong();
                    _lastOpTimeFetched = o["ts"]._opTime();
                }
            } // end while

            {
                boost::unique_lock<boost::mutex> lock(_mutex);
                if (_pause || !_currentSyncTarget || !_currentSyncTarget->hbinfo().hbstate.readable()) {
                    return;
                }
            }


            r.tailCheck();
            if( !r.haveCursor() ) {
                LOG(1) << "replSet end syncTail pass" << rsLog;
                return;
            }

            // looping back is ok because this is a tailable cursor
        }
    }
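The small-batch heuristic in the middle of this loop is worth isolating: if the last batch was tiny, the secondary is probably caught up, and sleeping briefly lets the upstream node accumulate a fuller batch for the next getmore instead of shipping ops one at a time. As a standalone helper (the default constants are placeholders for BatchIsSmallish and SleepToAllowBatchingMillis, whose values don't appear in this excerpt):

    #include <chrono>
    #include <thread>

    // If the last batch was tiny we are likely "caught up"; wait a moment so
    // the next fetch returns a fuller batch.
    void maybeSleepToAllowBatching(int batchBytes,
                                   int smallishBytes = 8 * 1024,
                                   int sleepMs = 2) {
        if (batchBytes > 0 && batchBytes < smallishBytes)
            std::this_thread::sleep_for(std::chrono::milliseconds(sleepMs));
    }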
Example #18
        bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
            log(1) << " authenticate: " << cmdObj << endl;

            string user = cmdObj.getStringField("user");
            string key = cmdObj.getStringField("key");
            string received_nonce = cmdObj.getStringField("nonce");
            
            if( user.empty() || key.empty() || received_nonce.empty() ) { 
                log() << "field missing/wrong type in received authenticate command " 
                    << cc().database()->name
                    << endl;               
                errmsg = "auth fails";
                sleepmillis(10);
                return false;
            }
            
            stringstream digestBuilder;

            {
                bool reject = false;
                nonce *ln = lastNonce.release();
                if ( ln == 0 ) {
                    reject = true;
                } else {
                    digestBuilder << hex << *ln;
                    reject = digestBuilder.str() != received_nonce;
                }
                    
                if ( reject ) {
                    log() << "auth: bad nonce received or getnonce not called. could be a driver bug or a security attack. db:" << cc().database()->name << endl;
                    errmsg = "auth fails";
                    sleepmillis(30);
                    return false;
                }
            }

            static BSONObj userPattern = fromjson("{\"user\":1}");
            string systemUsers = cc().database()->name + ".system.users";
            OCCASIONALLY Helpers::ensureIndex(systemUsers.c_str(), userPattern, false, "user_1");

            BSONObj userObj;
            {
                BSONObjBuilder b;
                b << "user" << user;
                BSONObj query = b.done();
                if( !Helpers::findOne(systemUsers.c_str(), query, userObj) ) { 
                    log() << "auth: couldn't find user " << user << ", " << systemUsers << endl;
                    errmsg = "auth fails";
                    return false;
                }
            }
            
            md5digest d;
            {
                
                string pwd = userObj.getStringField("pwd");
                digestBuilder << user << pwd;
                string done = digestBuilder.str();
                
                md5_state_t st;
                md5_init(&st);
                md5_append(&st, (const md5_byte_t *) done.c_str(), done.size());
                md5_finish(&st, d);
            }
            
            string computed = digestToString( d );
            
            if ( key != computed ){
                log() << "auth: key mismatch " << user << ", ns:" << ns << endl;
                errmsg = "auth fails";
                return false;
            }

            AuthenticationInfo *ai = cc().getAuthenticationInfo();
            
            if ( userObj[ "readOnly" ].isBoolean() && userObj[ "readOnly" ].boolean() ) {
                if ( readLockSupported() ){
                    ai->authorizeReadOnly( cc().database()->name.c_str() );
                }
                else {
                    log() << "warning: old version of boost, read-only users not supported" << endl;
                    ai->authorize( cc().database()->name.c_str() );
                }
            } else {
                ai->authorize( cc().database()->name.c_str() );
            }
            return true;
        }
Example #19
    virtual void subthread(int tnumber) {
        Client::initThread("mongomutextest");

        const ServiceContext::UniqueOperationContext txnPtr = cc().makeOperationContext();
        OperationContext& txn = *txnPtr;

        sleepmillis(0);
        for (int i = 0; i < N; i++) {
            int x = std::rand();
            bool sometimes = (x % 15 == 0);
            if (i % 7 == 0) {
                Lock::GlobalRead r(txn.lockState());  // nested test
                Lock::GlobalRead r2(txn.lockState());
            } else if (i % 7 == 1) {
                Lock::GlobalRead r(txn.lockState());
                ASSERT(txn.lockState()->isReadLocked());
            } else if (i % 7 == 4 && tnumber == 1 /*only one upgrader legal*/) {
                Lock::GlobalWrite w(txn.lockState());
                ASSERT(txn.lockState()->isW());
                if (i % 7 == 2) {
                    Lock::TempRelease t(txn.lockState());
                }
            } else if (i % 7 == 2) {
                Lock::GlobalWrite w(txn.lockState());
                ASSERT(txn.lockState()->isW());
                if (sometimes) {
                    Lock::TempRelease t(txn.lockState());
                }
            } else if (i % 7 == 3) {
                Lock::GlobalWrite w(txn.lockState());
                { Lock::TempRelease t(txn.lockState()); }
                Lock::GlobalRead r(txn.lockState());
                ASSERT(txn.lockState()->isW());
                if (sometimes) {
                    Lock::TempRelease t(txn.lockState());
                }
            } else if (i % 7 == 5) {
                {
                    ScopedTransaction scopedXact(&txn, MODE_IS);
                    Lock::DBLock r(txn.lockState(), "foo", MODE_S);
                }
                {
                    ScopedTransaction scopedXact(&txn, MODE_IS);
                    Lock::DBLock r(txn.lockState(), "bar", MODE_S);
                }
            } else if (i % 7 == 6) {
                if (i > N / 2) {
                    int q = i % 11;
                    if (q == 0) {
                        ScopedTransaction scopedXact(&txn, MODE_IS);

                        Lock::DBLock r(txn.lockState(), "foo", MODE_S);
                        ASSERT(txn.lockState()->isDbLockedForMode("foo", MODE_S));

                        Lock::DBLock r2(txn.lockState(), "foo", MODE_S);
                        ASSERT(txn.lockState()->isDbLockedForMode("foo", MODE_S));

                        Lock::DBLock r3(txn.lockState(), "local", MODE_S);
                        ASSERT(txn.lockState()->isDbLockedForMode("foo", MODE_S));
                        ASSERT(txn.lockState()->isDbLockedForMode("local", MODE_S));
                    } else if (q == 1) {
                        // test locking local only -- with no preceding lock
                        {
                            ScopedTransaction scopedXact(&txn, MODE_IS);
                            Lock::DBLock x(txn.lockState(), "local", MODE_S);
                        }
                        {
                            ScopedTransaction scopedXact(&txn, MODE_IX);
                            Lock::DBLock x(txn.lockState(), "local", MODE_X);

                            //  No actual writing here, so no WriteUnitOfWork
                            if (sometimes) {
                                Lock::TempRelease t(txn.lockState());
                            }
                        }
                    } else if (q == 2) {
                        {
                            ScopedTransaction scopedXact(&txn, MODE_IS);
                            Lock::DBLock x(txn.lockState(), "admin", MODE_S);
                        }

                        {
                            ScopedTransaction scopedXact(&txn, MODE_IX);
                            Lock::DBLock x(txn.lockState(), "admin", MODE_X);
                        }
                    } else if (q == 3) {
                        ScopedTransaction scopedXact(&txn, MODE_IX);

                        Lock::DBLock x(txn.lockState(), "foo", MODE_X);
                        Lock::DBLock y(txn.lockState(), "admin", MODE_S);
                    } else if (q == 4) {
                        ScopedTransaction scopedXact(&txn, MODE_IS);

                        Lock::DBLock x(txn.lockState(), "foo2", MODE_S);
                        Lock::DBLock y(txn.lockState(), "admin", MODE_S);
                    } else {
                        ScopedTransaction scopedXact(&txn, MODE_IX);

                        Lock::DBLock w(txn.lockState(), "foo", MODE_X);

                        { Lock::TempRelease t(txn.lockState()); }

                        Lock::DBLock r2(txn.lockState(), "foo", MODE_S);
                        Lock::DBLock r3(txn.lockState(), "local", MODE_S);
                    }
                } else {
                    ScopedTransaction scopedXact(&txn, MODE_IS);

                    Lock::DBLock r(txn.lockState(), "foo", MODE_S);
                    Lock::DBLock r2(txn.lockState(), "foo", MODE_S);
                    Lock::DBLock r3(txn.lockState(), "local", MODE_S);
                }
            }
            pm.hit();
        }
    }
Example #20
    void BackgroundSync::getOplogReader(OplogReader& r) {
        const Member *target = NULL, *stale = NULL;
        BSONObj oldest;

        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            if (_lastOpTimeFetched.isNull()) {
                // then we're initial syncing and we're still waiting for this to be set
                _currentSyncTarget = NULL;
                return;
            }

            // Wait until we've applied the ops we have before we choose a sync target
            while (!_appliedBuffer) {
                _condvar.wait(lock);
            }
        }

        while (MONGO_FAIL_POINT(rsBgSyncProduce)) {
            sleepmillis(0);
        }

        verify(r.conn() == NULL);

        while ((target = theReplSet->getMemberToSyncTo()) != NULL) {
            string current = target->fullName();

            if (!r.connect(current)) {
                LOG(2) << "replSet can't connect to " << current << " to read operations" << rsLog;
                r.resetConnection();
                theReplSet->veto(current);
                sleepsecs(1);
                continue;
            }

            if (isStale(r, oldest)) {
                r.resetConnection();
                theReplSet->veto(current, 600);
                stale = target;
                continue;
            }

            // if we made it here, the target is up and not stale
            {
                boost::unique_lock<boost::mutex> lock(_mutex);
                _currentSyncTarget = target;
            }

            boost::unique_lock<boost::mutex> oplogLockSSF(theReplSet->syncSourceFeedback.oplock);
            theReplSet->syncSourceFeedback.connect(target);

            return;
        }

        // the only viable sync target was stale
        if (stale) {
            theReplSet->goStale(stale, oldest);
            sleepsecs(120);
        }

        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            _currentSyncTarget = NULL;
        }
    }
Example #21
    void BackgroundSync::produce() {
        // this oplog reader does not do a handshake because we don't want the server it's syncing
        // from to track how far it has synced
        OplogReader r;
        OpTime lastOpTimeFetched;
        // find a target to sync from the last op time written
        getOplogReader(r);

        // no server found
        {
            boost::unique_lock<boost::mutex> lock(_mutex);

            if (_currentSyncTarget == NULL) {
                lock.unlock();
                sleepsecs(1);
                // if there is no one to sync from
                return;
            }
            lastOpTimeFetched = _lastOpTimeFetched;
        }

        r.tailingQueryGTE(rsoplog, lastOpTimeFetched);

        // if target cut connections between connecting and querying (for
        // example, because it stepped down) we might not have a cursor
        if (!r.haveCursor()) {
            return;
        }

        uassert(1000, "replSet source for syncing doesn't seem to be await capable -- is it an older version of mongodb?", r.awaitCapable() );

        if (isRollbackRequired(r)) {
            stop();
            return;
        }

        while (!inShutdown()) {
            if (!r.moreInCurrentBatch()) {
                // Check some things periodically
                // (whenever we run out of items in the
                // current cursor batch)

                int bs = r.currentBatchMessageSize();
                if( bs > 0 && bs < BatchIsSmallish ) {
                    // on a very low latency network, if we don't wait a little, we'll be 
                    // getting ops to write almost one at a time.  this will both be expensive
                    // for the upstream server as well as potentially defeating our parallel 
                    // application of batches on the secondary.
                    //
                    // the inference here is basically if the batch is really small, we are 
                    // "caught up".
                    //
                    dassert( !Lock::isLocked() );
                    sleepmillis(SleepToAllowBatchingMillis);
                }
  
                if (theReplSet->gotForceSync()) {
                    return;
                }
                // If we are transitioning to primary state, we need to leave
                // this loop in order to go into bgsync-pause mode.
                if (isAssumingPrimary() || theReplSet->isPrimary()) {
                    return;
                }

                // re-evaluate quality of sync target
                if (shouldChangeSyncTarget()) {
                    return;
                }


                {
                    //record time for each getmore
                    TimerHolder batchTimer(&getmoreReplStats);
                    
                    // This calls receiveMore() on the oplogreader cursor.
                    // It can wait up to five seconds for more data.
                    r.more();
                }
                networkByteStats.increment(r.currentBatchMessageSize());

                if (!r.moreInCurrentBatch()) {
                    // If there is still no data from upstream, check a few more things
                    // and then loop back for another pass at getting more data
                    {
                        boost::unique_lock<boost::mutex> lock(_mutex);
                        if (_pause || 
                            !_currentSyncTarget || 
                            !_currentSyncTarget->hbinfo().hbstate.readable()) {
                            return;
                        }
                    }

                    r.tailCheck();
                    if( !r.haveCursor() ) {
                        LOG(1) << "replSet end syncTail pass" << rsLog;
                        return;
                    }

                    continue;
                }
            }

            // At this point, we are guaranteed to have at least one thing to read out
            // of the oplogreader cursor.
            BSONObj o = r.nextSafe().getOwned();
            opsReadStats.increment();

            {
                boost::unique_lock<boost::mutex> lock(_mutex);
                _appliedBuffer = false;
            }

            OCCASIONALLY {
                LOG(2) << "bgsync buffer has " << _buffer.size() << " bytes" << rsLog;
            }
            // the blocking queue will wait (forever) until there's room for us to push
            _buffer.push(o);
            bufferCountGauge.increment();
            bufferSizeGauge.increment(getSize(o));

            {
                boost::unique_lock<boost::mutex> lock(_mutex);
                _lastH = o["h"].numberLong();
                _lastOpTimeFetched = o["ts"]._opTime();
                LOG(3) << "replSet lastOpTimeFetched: "
                       << _lastOpTimeFetched.toStringPretty() << rsLog;
            }
        }
    }
Example #22
void BackgroundSync::_produce(OperationContext* txn, executor::TaskExecutor* taskExecutor) {
    // this oplog reader does not do a handshake because we don't want the server it's syncing
    // from to track how far it has synced
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        if (_lastOpTimeFetched.isNull()) {
            // then we're initial syncing and we're still waiting for this to be set
            lock.unlock();
            sleepsecs(1);
            // if there is no one to sync from
            return;
        }

        if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary() ||
            inShutdownStrict()) {
            return;
        }
    }

    while (MONGO_FAIL_POINT(rsBgSyncProduce)) {
        sleepmillis(0);
    }


    // find a target to sync from the last optime fetched
    OpTime lastOpTimeFetched;
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        lastOpTimeFetched = _lastOpTimeFetched;
        _syncSourceHost = HostAndPort();
    }
    OplogReader syncSourceReader;
    syncSourceReader.connectToSyncSource(txn, lastOpTimeFetched, _replCoord);

    // no server found
    if (syncSourceReader.getHost().empty()) {
        sleepsecs(1);
        // if there is no one to sync from
        return;
    }

    long long lastHashFetched;
    {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        if (_pause) {
            return;
        }
        lastOpTimeFetched = _lastOpTimeFetched;
        lastHashFetched = _lastFetchedHash;
        _syncSourceHost = syncSourceReader.getHost();
        _replCoord->signalUpstreamUpdater();
    }

    const Milliseconds oplogSocketTimeout(OplogReader::kSocketTimeout);

    // Prefer host in oplog reader to _syncSourceHost because _syncSourceHost may be cleared
    // if sync source feedback fails.
    const HostAndPort source = syncSourceReader.getHost();
    syncSourceReader.resetConnection();
    // no more references to oplog reader from here on.

    // If this status is not OK after the fetcher returns from wait(),
    // proceed to execute rollback
    Status remoteOplogStartStatus = Status::OK();

    auto fetcherCallback = stdx::bind(&BackgroundSync::_fetcherCallback,
                                      this,
                                      stdx::placeholders::_1,
                                      stdx::placeholders::_3,
                                      stdx::cref(source),
                                      lastOpTimeFetched,
                                      lastHashFetched,
                                      &remoteOplogStartStatus);

    auto cmdObj = BSON("find" << nsToCollectionSubstring(rsOplogName) << "filter"
                              << BSON("ts" << BSON("$gte" << lastOpTimeFetched.getTimestamp()))
                              << "tailable" << true << "oplogReplay" << true << "awaitData" << true
                              << "maxTimeMS" << int(fetcherMaxTimeMS.count()));
    Fetcher fetcher(taskExecutor,
                    source,
                    nsToDatabase(rsOplogName),
                    cmdObj,
                    fetcherCallback,
                    rpc::makeEmptyMetadata());
    auto scheduleStatus = fetcher.schedule();
    if (!scheduleStatus.isOK()) {
        warning() << "unable to schedule fetcher to read remote oplog on " << source << ": "
                  << scheduleStatus;
        return;
    }
    fetcher.wait();

    // If the background sync is paused after the fetcher is started, we need to
    // re-evaluate our sync source and oplog common point.
    if (isPaused()) {
        return;
    }

    // Execute rollback if necessary.
    // Rollback is a synchronous operation that uses the task executor and may not be
    // executed inside the fetcher callback.
    if (!remoteOplogStartStatus.isOK()) {
        const int messagingPortTags = 0;
        ConnectionPool connectionPool(messagingPortTags);
        std::unique_ptr<ConnectionPool::ConnectionPtr> connection;
        auto getConnection =
            [&connection, &connectionPool, oplogSocketTimeout, source]() -> DBClientBase* {
                if (!connection.get()) {
                    connection.reset(new ConnectionPool::ConnectionPtr(
                        &connectionPool, source, Date_t::now(), oplogSocketTimeout));
                }
                return connection->get();
            };

        log() << "starting rollback: " << remoteOplogStartStatus;
        _rollback(txn, source, getConnection);
        stop();
    }
}
Example #23
Status MigrationChunkClonerSourceLegacy::awaitUntilCriticalSectionIsAppropriate(
    OperationContext* txn, Milliseconds maxTimeToWait) {
    invariant(!txn->lockState()->isLocked());
    auto scopedGuard = MakeGuard([&] { cancelClone(txn); });

    const auto startTime = Date_t::now();

    int iteration = 0;
    while ((Date_t::now() - startTime) < maxTimeToWait) {
        // Exponential sleep backoff, up to 1024ms. Don't sleep much on the first few iterations,
        // since we want empty chunk migrations to be fast.
        sleepmillis(1 << std::min(iteration, 10));
        iteration++;

        auto responseStatus = _callRecipient(BSON(kRecvChunkStatus << _args.getNss().ns()));
        if (!responseStatus.isOK()) {
            return {responseStatus.getStatus().code(),
                    str::stream()
                        << "Failed to contact recipient shard to monitor data transfer due to "
                        << responseStatus.getStatus().toString()};
        }

        BSONObj res = std::move(responseStatus.getValue());

        log() << "moveChunk data transfer progress: " << res << " my mem used: " << _memoryUsed;

        if (res["state"].String() == "steady") {
            // Ensure all cloned docs have actually been transferred
            const std::size_t locsRemaining = _cloneLocs.size();
            if (locsRemaining != 0) {
                return {
                    ErrorCodes::OperationIncomplete,
                    str::stream()
                        << "cannot enter critical section before all data is cloned, "
                        << locsRemaining
                        << " locs were not transferred but to-shard thinks they are all cloned"};
            }

            scopedGuard.Dismiss();
            return Status::OK();
        }

        if (res["state"].String() == "fail") {
            return {ErrorCodes::OperationFailed, "Data transfer error"};
        }

        if (res["ns"].str() != _args.getNss().ns() || res["from"].str() != _donorCS.toString() ||
            !res["min"].isABSONObj() || res["min"].Obj().woCompare(_args.getMinKey()) != 0 ||
            !res["max"].isABSONObj() || res["max"].Obj().woCompare(_args.getMaxKey()) != 0) {
            // This can happen when the destination aborted the migration and received another
            // recvChunk before this thread sees the transition to the abort state. This is
            // currently possible only if multiple migrations are happening at once. This is an
            // unfortunate consequence of the shards not being able to keep track of multiple
            // incoming and outgoing migrations.
            return {ErrorCodes::OperationIncomplete,
                    "Destination shard aborted migration because a new one is running"};
        }

        if (_memoryUsed > 500 * 1024 * 1024) {
            // This is too much memory for us to use so we're going to abort the migration
            return {ErrorCodes::ExceededMemoryLimit,
                    "Aborting migration because of high memory usage"};
        }

        Status interruptStatus = txn->checkForInterruptNoAssert();
        if (!interruptStatus.isOK()) {
            return interruptStatus;
        }
    }

    scopedGuard.Dismiss();
    return {ErrorCodes::ExceededTimeLimit, "Timed out waiting for the cloner to catch up"};
}
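The sleepmillis(1 << std::min(iteration, 10)) line above implements capped exponential backoff: 1ms, 2ms, 4ms, ... doubling per iteration up to 1024ms, so empty-chunk migrations poll quickly while long transfers settle into a roughly one-second cadence. The helper in isolation:

    #include <algorithm>
    #include <chrono>
    #include <thread>

    // Sleep 2^iteration milliseconds, capped at 2^10 = 1024ms.
    void backoffSleep(int iteration) {
        std::this_thread::sleep_for(
            std::chrono::milliseconds(1LL << std::min(iteration, 10)));
    }

    // Typical use: for (int i = 0; !done(); ++i) backoffSleep(i);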
Example #24
    /**
     * @return true if had to do something
     */
    bool checkShardVersion( DBClientBase * conn_in , const string& ns , ChunkManagerPtr refManager, bool authoritative , int tryNumber ) {
        // TODO: cache, optimize, etc...

        WriteBackListener::init( *conn_in );

        DBConfigPtr conf = grid.getDBConfig( ns );
        if ( ! conf )
            return false;

        DBClientBase* conn = getVersionable( conn_in );
        verify(conn); // errors thrown above

        unsigned long long officialSequenceNumber = 0;

        ChunkManagerPtr manager;
        const bool isSharded = conf->isSharded( ns );
        if ( isSharded ) {
            manager = conf->getChunkManagerIfExists( ns , authoritative );
            // It's possible the chunk manager was reset since we checked whether sharded was true,
            // so must check this here.
            if( manager ) officialSequenceNumber = manager->getSequenceNumber();
        }

        // Check this manager against the reference manager
        if( isSharded && manager ){

            Shard shard = Shard::make( conn->getServerAddress() );
            if( refManager && ! refManager->compatibleWith( manager, shard ) ){
                throw SendStaleConfigException( ns, str::stream() << "manager (" << manager->getVersion( shard ).toString()  << " : " << manager->getSequenceNumber() << ") "
                                                                      << "not compatible with reference manager (" << refManager->getVersion( shard ).toString()  << " : " << refManager->getSequenceNumber() << ") "
                                                                      << "on shard " << shard.getName() << " (" << shard.getAddress().toString() << ")",
                                                refManager->getVersion( shard ), manager->getVersion( shard ) );
            }
        }
        else if( refManager ){
            Shard shard = Shard::make( conn->getServerAddress() );
            string msg( str::stream() << "not sharded ("
                        << ( (manager.get() == 0) ? string( "<none>" ) :
                                str::stream() << manager->getSequenceNumber() )
                        << ") but has reference manager ("
                        << refManager->getSequenceNumber() << ") "
                        << "on conn " << conn->getServerAddress() << " ("
                        << conn_in->getServerAddress() << ")" );

            throw SendStaleConfigException( ns, msg,
                    refManager->getVersion( shard ), ShardChunkVersion( 0, OID() ));
        }

        // has the ChunkManager been reloaded since the last time we updated the connection-level version?
        // (ie., last time we issued the setShardVersions below)
        unsigned long long sequenceNumber = connectionShardStatus.getSequence(conn,ns);
        if ( sequenceNumber == officialSequenceNumber ) {
            return false;
        }


        ShardChunkVersion version = ShardChunkVersion( 0, OID() );
        if ( isSharded && manager ) {
            version = manager->getVersion( Shard::make( conn->getServerAddress() ) );
        }

        if( ! version.isSet() ){
            LOG(0) << "resetting shard version of " << ns << " on " << conn->getServerAddress() << ", " <<
                      ( ! isSharded ? "no longer sharded" :
                      ( ! manager ? "no chunk manager found" :
                                    "version is zero" ) ) << endl;
        }

        LOG(2) << " have to set shard version for conn: " << conn->getServerAddress() << " ns:" << ns
               << " my last seq: " << sequenceNumber << "  current: " << officialSequenceNumber
               << " version: " << version << " manager: " << manager.get()
               << endl;

        const string versionableServerAddress(conn->getServerAddress());

        BSONObj result;
        if ( setShardVersion( *conn , ns , version , authoritative , result ) ) {
            // success!
            LOG(1) << "      setShardVersion success: " << result << endl;
            connectionShardStatus.setSequence( conn , ns , officialSequenceNumber );
            return true;
        }

        LOG(1) << "       setShardVersion failed!\n" << result << endl;

        if ( result["need_authoritative"].trueValue() )
            massert( 10428 ,  "need_authoritative set but in authoritative mode already" , ! authoritative );

        if ( ! authoritative ) {
            // use the original connection and get a fresh versionable connection
            // since conn can be invalidated (or worse, freed) after the failure
            checkShardVersion(conn_in, ns, refManager, true, tryNumber + 1);
            return true;
        }
        
        if ( result["reloadConfig"].trueValue() ) {
            if( result["version"].timestampTime() == 0 ){

                warning() << "reloading full configuration for " << conf->getName()
                          << ", connection state indicates significant version changes" << endl;

                // reload db
                conf->reload();
            }
            else {
                // reload config
                conf->getChunkManager( ns , true );
            }
        }

        const int maxNumTries = 7;
        if ( tryNumber < maxNumTries ) {
            LOG( tryNumber < ( maxNumTries / 2 ) ? 1 : 0 ) 
                << "going to retry checkShardVersion host: " << versionableServerAddress << " " << result << endl;
            sleepmillis( 10 * tryNumber );
            // use the original connection and get a fresh versionable connection
            // since conn can be invalidated (or worse, freed) after the failure
            checkShardVersion(conn_in, ns, refManager, true, tryNumber + 1);
            return true;
        }
        
        string errmsg = str::stream() << "setShardVersion failed host: " << versionableServerAddress << " " << result;
        log() << "     " << errmsg << endl;
        massert( 10429 , errmsg , 0 );
        return true;
    }
Example #25
    bool RangeDeleter::deleteNow(const std::string& ns,
                                 const BSONObj& min,
                                 const BSONObj& max,
                                 const BSONObj& shardKeyPattern,
                                 bool secondaryThrottle,
                                 string* errMsg) {
        if (stopRequested()) {
            *errMsg = "deleter is already stopped.";
            return false;
        }

        string dummy;
        if (errMsg == NULL) errMsg = &dummy;

        NSMinMax deleteRange(ns, min, max);
        {
            scoped_lock sl(_queueMutex);
            if (!canEnqueue_inlock(ns, min, max, errMsg)) {
                return false;
            }

            _deleteSet.insert(&deleteRange);
            _stats->incTotalDeletes_inlock();

            // Note: count for pending deletes is an integral part of the shutdown story.
            // Therefore, to simplify things, there is no "pending" state for deletes in
            // deleteNow, the state transition is simply inProgress -> done.
            _stats->incInProgressDeletes_inlock();
        }

        set<CursorId> cursorsToWait;
        _env->getCursorIds(ns, &cursorsToWait);

        long long checkIntervalMillis = 5;

        if (!cursorsToWait.empty()) {
            log() << "rangeDeleter waiting for " << cursorsToWait.size()
                  << " cursors in " << ns << " to finish" << endl;
        }

        while (!cursorsToWait.empty()) {
            set<CursorId> cursorsNow;
            _env->getCursorIds(ns, &cursorsNow);

            set<CursorId> cursorsLeft;
            std::set_intersection(cursorsToWait.begin(),
                                  cursorsToWait.end(),
                                  cursorsNow.begin(),
                                  cursorsNow.end(),
                                  std::inserter(cursorsLeft, cursorsLeft.end()));

            cursorsToWait.swap(cursorsLeft);

            if (stopRequested()) {
                *errMsg = "deleter was stopped.";

                scoped_lock sl(_queueMutex);
                _deleteSet.erase(&deleteRange);

                _stats->decInProgressDeletes_inlock();
                _stats->decTotalDeletes_inlock();

                if (!_stats->hasInProgress_inlock()) {
                    _nothingInProgressCV.notify_one();
                }

                return false;
            }

            if (checkIntervalMillis < MaxCurorCheckIntervalMillis) {
                checkIntervalMillis *= 2;
            }

            sleepmillis(checkIntervalMillis);
        }

        bool result = _env->deleteRange(ns, min, max, shardKeyPattern,
                                        secondaryThrottle, errMsg);

        {
            scoped_lock sl(_queueMutex);
            _deleteSet.erase(&deleteRange);

            _stats->decInProgressDeletes_inlock();
            _stats->decTotalDeletes_inlock();

            if (!_stats->hasInProgress_inlock()) {
                _nothingInProgressCV.notify_one();
            }
        }

        return result;
    }
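deleteNow polls for the open cursors with an interval that starts at 5ms and doubles up to a ceiling, keeping latency low when cursors close quickly and load low when they linger. A generic sketch of that doubling poll (the predicate and the bounds are placeholders; the excerpt doesn't show the value of its ceiling constant):

    #include <chrono>
    #include <functional>
    #include <thread>

    // Poll done() with a doubling interval, starting at startMs and never
    // exceeding maxMs between checks.
    void pollWithDoublingInterval(const std::function<bool()>& done,
                                  long long startMs = 5, long long maxMs = 500) {
        long long intervalMs = startMs;
        while (!done()) {
            std::this_thread::sleep_for(std::chrono::milliseconds(intervalMs));
            if (intervalMs < maxMs)
                intervalMs *= 2;
        }
    }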
Example #26
    /**
     * @return true if had to do something
     */
    bool checkShardVersion( DBClientBase& conn_in , const string& ns , bool authoritative , int tryNumber ) {
        // TODO: cache, optimize, etc...

        WriteBackListener::init( conn_in );

        DBConfigPtr conf = grid.getDBConfig( ns );
        if ( ! conf )
            return false;

        DBClientBase* conn = 0;

        switch ( conn_in.type() ) {
        case ConnectionString::INVALID:
            assert(0);
            break;
        case ConnectionString::MASTER:
            // great
            conn = &conn_in;
            break;
        case ConnectionString::PAIR:
            assert( ! "pair not support for sharding" );
            break;
        case ConnectionString::SYNC:
            // TODO: we should check later that we aren't actually sharded on this
            conn = &conn_in;
            break;
        case ConnectionString::SET: {
            DBClientReplicaSet* set = (DBClientReplicaSet*)&conn_in;
            conn = &(set->masterConn());
            break;
        }
        }

        assert(conn);

        unsigned long long officialSequenceNumber = 0;

        ChunkManagerPtr manager;
        const bool isSharded = conf->isSharded( ns );
        if ( isSharded ) {
            manager = conf->getChunkManagerIfExists( ns , authoritative );
            // It's possible the chunk manager was reset since we checked whether sharded was true,
            // so must check this here.
            if( manager ) officialSequenceNumber = manager->getSequenceNumber();
        }

        // has the ChunkManager been reloaded since the last time we updated the connection-level version?
        // (ie., last time we issued the setShardVersions below)
        unsigned long long sequenceNumber = connectionShardStatus.getSequence(conn,ns);
        if ( sequenceNumber == officialSequenceNumber ) {
            return false;
        }


        ShardChunkVersion version = 0;
        if ( isSharded && manager ) {
            version = manager->getVersion( Shard::make( conn->getServerAddress() ) );
        }

        LOG(2) << " have to set shard version for conn: " << conn << " ns:" << ns
               << " my last seq: " << sequenceNumber << "  current: " << officialSequenceNumber
               << " version: " << version << " manager: " << manager.get()
               << endl;

        BSONObj result;
        if ( setShardVersion( *conn , ns , version , authoritative , result ) ) {
            // success!
            LOG(1) << "      setShardVersion success: " << result << endl;
            connectionShardStatus.setSequence( conn , ns , officialSequenceNumber );
            return true;
        }

        LOG(1) << "       setShardVersion failed!\n" << result << endl;

        if ( result["need_authoritative"].trueValue() )
            massert( 10428 ,  "need_authoritative set but in authoritative mode already" , ! authoritative );

        if ( ! authoritative ) {
            checkShardVersion( *conn , ns , true , tryNumber + 1 );
            return true;
        }
        
        if ( result["reloadConfig"].trueValue() ) {
            if( result["version"].timestampTime() == 0 ){
                // reload db
                conf->reload();
            }
            else {
                // reload config
                conf->getChunkManager( ns , true );
            }
        }

        const int maxNumTries = 7;
        if ( tryNumber < maxNumTries ) {
            LOG( tryNumber < ( maxNumTries / 2 ) ? 1 : 0 ) 
                << "going to retry checkShardVersion host: " << conn->getServerAddress() << " " << result << endl;
            sleepmillis( 10 * tryNumber );
            checkShardVersion( *conn , ns , true , tryNumber + 1 );
            return true;
        }
        
        string errmsg = str::stream() << "setShardVersion failed host: " << conn->getServerAddress() << " " << result;
        log() << "     " << errmsg << endl;
        massert( 10429 , errmsg , false );
        return true;
    }
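
checkShardVersion retries with a linearly growing sleep (10ms times the try number) and turns retry exhaustion into a hard assertion. A minimal sketch of that retry shape, standard library only (the function name and exception type are illustrative):

#include <chrono>
#include <functional>
#include <stdexcept>
#include <string>
#include <thread>

// Retry `attempt` up to `maxTries` times, sleeping 10ms * tryNumber between
// attempts; exhausting the retries is a hard error, as with the massert above.
void retryWithLinearBackoff(const std::function<bool()>& attempt,
                            int maxTries, const std::string& what) {
    for (int tryNumber = 1; tryNumber <= maxTries; ++tryNumber) {
        if (attempt())
            return;
        std::this_thread::sleep_for(std::chrono::milliseconds(10 * tryNumber));
    }
    throw std::runtime_error(what + " failed after " +
                             std::to_string(maxTries) + " attempts");
}

int main() {
    int tries = 0;
    retryWithLinearBackoff([&] { return ++tries == 3; }, 7, "setShardVersion");
}
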
Beispiel #27
0
        void runThread(ConnectionString& hostConn, unsigned threadId, unsigned seed,
                       BSONObj& cmdObj, BSONObjBuilder& result) {

            stringstream ss;
            ss << "thread-" << threadId;
            setThreadName(ss.str().c_str());

            // Lock name
            string lockName = string_field(cmdObj, "lockName", this->name + "_lock");

            // Range of clock skew across the different threads
            int skewRange = (int) number_field(cmdObj, "skewRange", 1);

            // How long to wait with the lock
            int threadWait = (int) number_field(cmdObj, "threadWait", 30);
            if(threadWait <= 0) threadWait = 1;

            // Max amount of time (ms) a thread waits before checking the lock again
            int threadSleep = (int) number_field(cmdObj, "threadSleep", 30);
            if(threadSleep <= 0) threadSleep = 1;

            // How long (in ms) until the lock is forced; only compared locally
            unsigned long long takeoverMS = (unsigned long long) number_field(cmdObj, "takeoverMS", 0);

            // Whether or not we should hang some threads
            int hangThreads = (int) number_field(cmdObj, "hangThreads", 0);


            boost::mt19937 gen((boost::mt19937::result_type) seed);

            boost::variate_generator<boost::mt19937&, boost::uniform_int<> > randomSkew(gen, boost::uniform_int<>(0, skewRange));
            boost::variate_generator<boost::mt19937&, boost::uniform_int<> > randomWait(gen, boost::uniform_int<>(1, threadWait));
            boost::variate_generator<boost::mt19937&, boost::uniform_int<> > randomSleep(gen, boost::uniform_int<>(1, threadSleep));
            boost::variate_generator<boost::mt19937&, boost::uniform_int<> > randomNewLock(gen, boost::uniform_int<>(0, 3));


            int skew = 0;
            if (!lock.get()) {

                // Pick a skew, but the first two threads skew the whole range
                if(threadId == 0)
                    skew = -skewRange / 2;
                else if(threadId == 1)
                    skew = skewRange / 2;
                else skew = randomSkew() - (skewRange / 2);

                // Skew this thread
                jsTimeVirtualThreadSkew( skew );

                log() << "Initializing lock with skew of " << skew << " for thread " << threadId << endl;

                lock.reset(new DistributedLock(hostConn, lockName, takeoverMS, true ));

                log() << "Skewed time " << jsTime() << "  for thread " << threadId << endl
                      << "  max wait (with lock: " << threadWait << ", after lock: " << threadSleep << ")" << endl
                      << "  takeover in " << takeoverMS << " (ms remote)" << endl;

            }

            DistributedLock* myLock = lock.get();

            bool errors = false;
            BSONObj lockObj;
            while (keepGoing) {
                try {

                    if (myLock->lock_try("Testing distributed lock with skew.", false, &lockObj )) {

                        log() << "**** Locked for thread " << threadId << " with ts " << lockObj["ts"] << endl;

                        if( count.loadRelaxed() % 2 == 1 && ! myLock->lock_try( "Testing lock re-entry.", true ) ) {
                            errors = true;
                            log() << "**** !Could not re-enter already-held lock" << endl;
                            break;
                        }

                        if( count.loadRelaxed() % 3 == 1 && myLock->lock_try( "Testing lock non-re-entry.", false ) ) {
                            errors = true;
                            log() << "**** !Invalid lock re-entry" << endl;
                            break;
                        }

                        int before = count.addAndFetch(1);
                        int sleep = randomWait();
                        sleepmillis(sleep);
                        int after = count.loadRelaxed();

                        if(after != before) {
                            errors = true;
                            log() << "**** !Bad increment while sleeping with lock for: " << sleep << "ms" << endl;
                            break;
                        }

                        // Unlock unless this is a designated hang thread (which simulates a crash)...
                        if(hangThreads == 0 || threadId % hangThreads != 0) {
                            log() << "**** Unlocking for thread " << threadId << " with ts " << lockObj["ts"] << endl;
                            myLock->unlock( &lockObj );
                        }
                        else {
                            log() << "**** Not unlocking for thread " << threadId << endl;
                            verify( DistributedLock::killPinger( *myLock ) );
                            // We're simulating a crashed process...
                            break;
                        }
                    }

                }
                catch( const DBException& ex ) {
                    log() << "*** !Could not try distributed lock." << causedBy( ex ) << endl;
                    break;
                }

                // Create a new lock object half the time (randomNewLock() is uniform on 0..3)
                if( randomNewLock() > 1 ){
                    lock.reset(new DistributedLock( hostConn, lockName, takeoverMS, true ));
                    myLock = lock.get();
                }

                sleepmillis(randomSleep());
            }

            result << "errors" << errors
                   << "skew" << skew
                   << "takeover" << (long long) takeoverMS
                   << "localTimeout" << (takeoverMS > 0);

        }
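
The boost::variate_generator plumbing above maps directly onto <random> in modern C++. A small sketch of the same seeded, reproducible jitter (the seed and bounds are illustrative):

#include <chrono>
#include <iostream>
#include <random>
#include <thread>

int main() {
    // A fixed seed keeps a stress test reproducible while still spreading
    // the threads' sleep times across the configured range.
    std::mt19937 gen(42);
    std::uniform_int_distribution<int> randomSleep(1, 30);  // ms, like threadSleep

    for (int i = 0; i < 3; ++i) {
        int ms = randomSleep(gen);
        std::cout << "sleeping " << ms << "ms\n";
        std::this_thread::sleep_for(std::chrono::milliseconds(ms));
    }
}
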
Beispiel #28
0
void BackgroundSync::_fetcherCallback(const StatusWith<Fetcher::QueryResponse>& result,
                                      BSONObjBuilder* bob,
                                      const HostAndPort& source,
                                      OpTime lastOpTimeFetched,
                                      long long lastFetchedHash,
                                      Status* remoteOplogStartStatus) {
    // If the target cut the connection between connecting and querying (for
    // example, because it stepped down), we might not have a cursor.
    if (!result.isOK()) {
        return;
    }

    if (inShutdown()) {
        return;
    }

    // Check if we have been paused.
    if (isPaused()) {
        return;
    }

    const auto& queryResponse = result.getValue();
    const auto& documents = queryResponse.documents;
    auto documentBegin = documents.cbegin();
    auto documentEnd = documents.cend();

    // Check start of remote oplog and, if necessary, stop fetcher to execute rollback.
    if (queryResponse.first) {
        auto getNextOperation = [&documentBegin, documentEnd]() -> StatusWith<BSONObj> {
            if (documentBegin == documentEnd) {
                return Status(ErrorCodes::OplogStartMissing, "remote oplog start missing");
            }
            return *(documentBegin++);
        };

        *remoteOplogStartStatus =
            checkRemoteOplogStart(getNextOperation, lastOpTimeFetched, lastFetchedHash);
        if (!remoteOplogStartStatus->isOK()) {
            // Stop fetcher and execute rollback.
            return;
        }

        // If this is the first batch and no rollback is needed, we should have advanced
        // the document iterator.
        invariant(documentBegin != documents.cbegin());
    }

    // process documents
    int currentBatchMessageSize = 0;
    for (auto documentIter = documentBegin; documentIter != documentEnd; ++documentIter) {
        if (inShutdown()) {
            return;
        }

        // If we are transitioning to primary state, we need to leave
        // this loop in order to go into bgsync-pause mode.
        if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary()) {
            LOG(1) << "waiting for draining or we are primary, not adding more ops to buffer";
            return;
        }

        // At this point, we are guaranteed to have at least one thing to read out
        // of the fetcher.
        const BSONObj& o = *documentIter;
        currentBatchMessageSize += o.objsize();
        opsReadStats.increment();

        if (MONGO_FAIL_POINT(stepDownWhileDrainingFailPoint)) {
            sleepsecs(20);
        }

        {
            stdx::unique_lock<stdx::mutex> lock(_mutex);
            _appliedBuffer = false;
        }

        OCCASIONALLY {
            LOG(2) << "bgsync buffer has " << _buffer.size() << " bytes";
        }

        bufferCountGauge.increment();
        bufferSizeGauge.increment(getSize(o));
        _buffer.push(o);

        {
            stdx::unique_lock<stdx::mutex> lock(_mutex);
            _lastFetchedHash = o["h"].numberLong();
            _lastOpTimeFetched = extractOpTime(o);
            LOG(3) << "lastOpTimeFetched: " << _lastOpTimeFetched;
        }
    }

    // record time for each batch
    getmoreReplStats.recordMillis(queryResponse.elapsedMillis.count());

    networkByteStats.increment(currentBatchMessageSize);

    // Check some things periodically (whenever we run out of items in the
    // current cursor batch).
    if (currentBatchMessageSize > 0 && currentBatchMessageSize < BatchIsSmallish) {
        // On a very low-latency network, if we don't wait a little, we'll be
        // getting ops to write almost one at a time. That is both expensive
        // for the upstream server and potentially defeats our parallel
        // application of batches on the secondary.
        //
        // The inference here is that if the batch is really small, we are
        // "caught up".
        //
        sleepmillis(SleepToAllowBatchingMillis);
    }

    // If we are transitioning to primary state, we need to stop fetching
    // in order to go into bgsync-pause mode.
    if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary()) {
        return;
    }

    // re-evaluate quality of sync target
    if (_shouldChangeSyncSource(source)) {
        return;
    }

    // Check if we have been paused.
    if (isPaused()) {
        return;
    }

    // We fill in 'bob' to signal the fetcher to proceed with another getMore.
    invariant(bob);
    bob->append("getMore", queryResponse.cursorId);
    bob->append("collection", queryResponse.nss.coll());
    bob->append("maxTimeMS", int(fetcherMaxTimeMS.count()));
}
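
The small-batch sleep at the end of the fetcher callback is a throughput/latency trade-off: a tiny batch suggests the secondary is caught up, so pausing briefly lets the upstream node accumulate a fuller batch. A sketch of the heuristic in isolation (the constants are illustrative, not MongoDB's tuned values):

#include <chrono>
#include <thread>

// If a non-empty batch came back smaller than the threshold, we are likely
// caught up; pause briefly instead of issuing near-empty getMores in a tight
// loop, which is expensive upstream and defeats batched application downstream.
void maybeSleepAfterBatch(int batchBytes) {
    constexpr int kBatchIsSmallishBytes = 40000;                   // illustrative
    constexpr std::chrono::milliseconds kSleepToAllowBatching{5};  // illustrative

    if (batchBytes > 0 && batchBytes < kBatchIsSmallishBytes)
        std::this_thread::sleep_for(kSleepToAllowBatching);
}
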
Beispiel #29
0
    Status CmdAuthenticate::_authenticateCR(const UserName& user, const BSONObj& cmdObj) {

        if (user == internalSecurity.user->getName() &&
            serverGlobalParams.clusterAuthMode == "x509") {
            return Status(ErrorCodes::AuthenticationFailed,
                          "Mechanism x509 is required for internal cluster authentication");
        }

        if (!_areNonceAuthenticateCommandsEnabled) {
            // SERVER-8461: MONGODB-CR must remain enabled for authenticating the internal user,
            // so that cluster members can communicate with each other.
            if (user != internalSecurity.user->getName()) {
                return Status(ErrorCodes::BadValue, _nonceAuthenticateCommandsDisabledMessage);
            }
        }

        string key = cmdObj.getStringField("key");
        string received_nonce = cmdObj.getStringField("nonce");

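        // Sleep briefly on malformed or failed handshakes so credential
        // probing stays cheap for the server but slow for an attacker.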
        if( user.getUser().empty() || key.empty() || received_nonce.empty() ) {
            sleepmillis(10);
            return Status(ErrorCodes::ProtocolError,
                          "field missing/wrong type in received authenticate command");
        }

        stringstream digestBuilder;

        {
            ClientBasic *client = ClientBasic::getCurrent();
            boost::scoped_ptr<AuthenticationSession> session;
            client->swapAuthenticationSession(session);
            if (!session || session->getType() != AuthenticationSession::SESSION_TYPE_MONGO) {
                sleepmillis(30);
                return Status(ErrorCodes::ProtocolError, "No pending nonce");
            }
            else {
                nonce64 nonce = static_cast<MongoAuthenticationSession*>(session.get())->getNonce();
                digestBuilder << hex << nonce;
                if (digestBuilder.str() != received_nonce) {
                    sleepmillis(30);
                    return Status(ErrorCodes::AuthenticationFailed, "Received wrong nonce.");
                }
            }
        }

        User* userObj;
        Status status = getGlobalAuthorizationManager()->acquireUser(user, &userObj);
        if (!status.isOK()) {
            // Failure to find the privilege document indicates no-such-user, a fact that we do not
            // wish to reveal to the client.  So, we return AuthenticationFailed rather than passing
            // through the returned status.
            return Status(ErrorCodes::AuthenticationFailed, status.toString());
        }
        string pwd = userObj->getCredentials().password;
        getGlobalAuthorizationManager()->releaseUser(userObj);

        md5digest d;
        {
            digestBuilder << user.getUser() << pwd;
            string done = digestBuilder.str();

            md5_state_t st;
            md5_init(&st);
            md5_append(&st, (const md5_byte_t *) done.c_str(), done.size());
            md5_finish(&st, d);
        }

        string computed = digestToString( d );

        if ( key != computed ) {
            return Status(ErrorCodes::AuthenticationFailed, "key mismatch");
        }

        AuthorizationSession* authorizationSession =
            ClientBasic::getCurrent()->getAuthorizationSession();
        status = authorizationSession->addAndAuthorizeUser(user);
        if (!status.isOK()) {
            return status;
        }

        return Status::OK();
    }
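
The nonce handshake above is a one-shot challenge/response: the server issues a nonce, the client must echo it back, and mismatches are throttled with a short sleep. A compact sketch of just the nonce check, standard library only (C++17 std::optional; the types and names are illustrative stand-ins, not MongoDB's):

#include <chrono>
#include <optional>
#include <sstream>
#include <string>
#include <thread>

struct PendingAuth {
    unsigned long long nonce;  // issued by a prior getnonce-style step
};

bool checkNonce(std::optional<PendingAuth>& session, const std::string& received) {
    if (!session) {
        std::this_thread::sleep_for(std::chrono::milliseconds(30));
        return false;  // no pending nonce for this client
    }
    std::ostringstream expected;
    expected << std::hex << session->nonce;
    session.reset();  // the nonce is consumed whether or not the check passes
    if (expected.str() != received) {
        std::this_thread::sleep_for(std::chrono::milliseconds(30));
        return false;  // wrong nonce
    }
    return true;
}
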
Beispiel #30
0
        bool run(const string& , BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) {

            lastError.disableForCommand();
            ShardedConnectionInfo* info = ShardedConnectionInfo::get( true );

            bool authoritative = cmdObj.getBoolField( "authoritative" );

            string configdb = cmdObj["configdb"].valuestrsafe();
            {
                // configdb checking
                if ( configdb.size() == 0 ) {
                    errmsg = "no configdb";
                    return false;
                }

                if ( shardingState.enabled() ) {
                    if ( configdb != shardingState.getConfigServer() ) {
                        errmsg = "specified a different configdb!";
                        return false;
                    }
                }
                else {
                    if ( ! authoritative ) {
                        result.appendBool( "need_authoritative" , true );
                        errmsg = "first setShardVersion";
                        return false;
                    }
                    shardingState.enable( configdb );
                    configServer.init( configdb );
                }
            }

            if ( cmdObj["shard"].type() == String ) {
                shardingState.gotShardName( cmdObj["shard"].String() );
                shardingState.gotShardHost( cmdObj["shardHost"].String() );
            }

            {
                // setting up ids
                if ( cmdObj["serverID"].type() != jstOID ) {
                    // TODO: fix this
                    //errmsg = "need serverID to be an OID";
                    //return 0;
                }
                else {
                    OID clientId = cmdObj["serverID"].__oid();
                    if ( ! info->hasID() ) {
                        info->setID( clientId );
                    }
                    else if ( clientId != info->getID() ) {
                        errmsg = "server id has changed!";
                        return false;
                    }
                }
            }

            unsigned long long version = extractVersion( cmdObj["version"] , errmsg );

            if ( errmsg.size() ) {
                return false;
            }

            string ns = cmdObj["setShardVersion"].valuestrsafe();
            if ( ns.size() == 0 ) {
                errmsg = "need to specify full namespace";
                return false;
            }

            const ConfigVersion oldVersion = info->getVersion(ns);
            const ConfigVersion globalVersion = shardingState.getVersion(ns);

            if ( oldVersion > 0 && globalVersion == 0 ) {
                // this had been reset
                info->setVersion( ns , 0 );
            }

            if ( version == 0 && globalVersion == 0 ) {
                // this connection is cleaning itself
                info->setVersion( ns , 0 );
                return true;
            }

            if ( version == 0 && globalVersion > 0 ) {
                if ( ! authoritative ) {
                    result.appendBool( "need_authoritative" , true );
                    result.append( "ns" , ns );
                    result.appendTimestamp( "globalVersion" , globalVersion );
                    result.appendTimestamp( "oldVersion" , oldVersion );
                    errmsg = "dropping needs to be authoritative";
                    return false;
                }
                log() << "wiping data for: " << ns << endl;
                result.appendTimestamp( "beforeDrop" , globalVersion );
                // only resetting the global version on purpose:
                // clients need to re-find the meta-data
                shardingState.resetVersion( ns );
                info->setVersion( ns , 0 );
                return true;
            }

            if ( version < oldVersion ) {
                errmsg = "you already have a newer version of collection '" + ns + "'";
                result.append( "ns" , ns );
                result.appendTimestamp( "oldVersion" , oldVersion );
                result.appendTimestamp( "newVersion" , version );
                result.appendTimestamp( "globalVersion" , globalVersion );
                return false;
            }

            if ( version < globalVersion ) {
                while ( shardingState.inCriticalMigrateSection() ) {
                    dbtemprelease r;
                    sleepmillis(2);
                    OCCASIONALLY log() << "waiting till out of critical section" << endl;
                }
                errmsg = "requested version is older than the global version for collection '" + ns + "'";
                result.append( "ns" , ns );
                result.appendTimestamp( "version" , version );
                result.appendTimestamp( "globalVersion" , globalVersion );
                return false;
            }

            if ( globalVersion == 0 && ! authoritative ) {
                // need authoritative for first look
                result.append( "ns" , ns );
                result.appendBool( "need_authoritative" , true );
                errmsg = "first time for collection '" + ns + "'";
                return false;
            }

            {
                dbtemprelease unlock;

                ShardChunkVersion currVersion = version;
                if ( ! shardingState.trySetVersion( ns , currVersion ) ) {
                    errmsg = str::stream() << "client version differs from config's for collection '" << ns << "'";
                    result.append( "ns" , ns );
                    result.appendTimestamp( "version" , version );
                    result.appendTimestamp( "globalVersion" , currVersion );
                    return false;
                }
            }

            info->setVersion( ns , version );
            result.appendTimestamp( "oldVersion" , oldVersion );
            result.append( "ok" , 1 );

            return true;
        }
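
The critical-section wait in the version checks above is another tiny poll loop: release the database lock, sleep a couple of milliseconds, and re-check until the migration commit finishes. A sketch with an atomic flag standing in for shardingState.inCriticalMigrateSection():

#include <atomic>
#include <chrono>
#include <thread>

std::atomic<bool> inCriticalMigrateSection{false};  // illustrative stand-in

// Spin politely until the migration commit completes; each iteration yields
// the CPU for 2ms, matching the sleepmillis(2) in the command above.
void waitUntilOutOfCriticalSection() {
    while (inCriticalMigrateSection.load()) {
        std::this_thread::sleep_for(std::chrono::milliseconds(2));
    }
}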