void DocumentSourceOutReplaceColl::initializeWriteNs() {
    DBClientBase* conn = pExpCtx->mongoProcessInterface->directClient();

    const auto& outputNs = getOutputNs();

    // Save the original collection options and index specs so we can check they didn't change
    // during computation.
    _originalOutOptions = pExpCtx->mongoProcessInterface->getCollectionOptions(outputNs);
    _originalIndexes = conn->getIndexSpecs(outputNs.ns());

    // Check if it's capped to make sure we have a chance of succeeding before we do all the work.
    // If the collection becomes capped during processing, the collection options will have changed,
    // and the $out will fail.
    uassert(17152,
            str::stream() << "namespace '" << outputNs.ns()
                          << "' is capped so it can't be used for $out",
            _originalOutOptions["capped"].eoo());

    // We will write all results into a temporary collection, then rename the temporary
    // collection to be the target collection once we are done.
    _tempNs = NamespaceString(str::stream() << outputNs.db() << ".tmp.agg_out."
                                            << aggOutCounter.addAndFetch(1));

    // Create temp collection, copying options from the existing output collection if any.
    {
        BSONObjBuilder cmd;
        cmd << "create" << _tempNs.coll();
        cmd << "temp" << true;
        cmd.appendElementsUnique(_originalOutOptions);

        BSONObj info;
        uassert(16994,
                str::stream() << "failed to create temporary $out collection '" << _tempNs.ns()
                              << "': "
                              << info.toString(),
                conn->runCommand(outputNs.db().toString(), cmd.done(), info));
    }

    if (_originalIndexes.empty()) {
        return;
    }

    // Copy the indexes of the output collection to the temp collection.
    std::vector<BSONObj> tempNsIndexes;
    for (const auto& indexSpec : _originalIndexes) {
        // Replace the spec's 'ns' field value, which is the original collection, with the temp
        // collection.
        tempNsIndexes.push_back(indexSpec.addField(BSON("ns" << _tempNs.ns()).firstElement()));
    }
    try {
        conn->createIndexes(_tempNs.ns(), tempNsIndexes);
    } catch (DBException& ex) {
        ex.addContext("Copying indexes for $out failed");
        throw;
    }
}
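A standalone sketch of the naming scheme above, for reference: a process-wide atomic counter makes each $out invocation's temp namespace unique within the process. Here std::atomic stands in for MongoDB's AtomicUInt32, and makeTempNs is a hypothetical helper, not MongoDB API.

#include <atomic>
#include <iostream>
#include <sstream>
#include <string>

static std::atomic<unsigned> aggOutCounter{0};

std::string makeTempNs(const std::string& db) {
    std::ostringstream ss;
    // ++counter is the std::atomic equivalent of AtomicUInt32::addAndFetch(1)
    ss << db << ".tmp.agg_out." << ++aggOutCounter;
    return ss.str();
}

int main() {
    std::cout << makeTempNs("test") << '\n';  // test.tmp.agg_out.1
    std::cout << makeTempNs("test") << '\n';  // test.tmp.agg_out.2
}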
Example #2
void DocumentSourceOut::prepTempCollection() {
    verify(_conn);
    verify(_tempNs.size() == 0);

    _tempNs = StringData(str::stream() << _outputNs.db()
                         << ".tmp.agg_out."
                         << aggOutCounter.addAndFetch(1)
                        );

    {
        BSONObj info;
        bool ok = _conn->runCommand(_outputNs.db().toString(),
                                    BSON("create" << _tempNs.coll() << "temp" << true),
                                    info);
        uassert(16994, str::stream() << "failed to create temporary $out collection '"
                << _tempNs.ns() << "': " << info.toString(),
                ok);
    }

    // copy indexes on _outputNs to _tempNs
    scoped_ptr<DBClientCursor> indexes(_conn->getIndexes(_outputNs));
    while (indexes->more()) {
        MutableDocument index(Document(indexes->nextSafe()));
        index.remove("_id"); // indexes shouldn't have _ids but some existing ones do
        index["ns"] = Value(_tempNs.ns());

        BSONObj indexBson = index.freeze().toBson();
        _conn->insert(_tempNs.getSystemIndexesCollection(), indexBson);
        BSONObj err = _conn->getLastErrorDetailed();
        uassert(16995, str::stream() << "copying index for $out failed."
                << " index: " << indexBson
                << " error: " <<  err,
                DBClientWithCommands::getLastErrorString(err).empty());
    }
}
Example #3
File: dur.cpp  Project: Aaron20141021/mongo
        void DurableImpl::commitAndStopDurThread(OperationContext* txn) {
            // This is only called by clean shutdown and the global lock must be held to ensure
            // there will not be any more writes.
            invariant(txn->lockState()->isW());

            commitNow(txn);
            shutdownRequested.store(1);
        }
Example #4
 static void initializeWriterThread() {
     // Only do this once per thread
     if (!ClientBasic::getCurrent()) {
         string threadName = str::stream() << "repl writer worker "
                                           << replWriterWorkerId.addAndFetch(1);
         Client::initThread( threadName.c_str() );
         replLocalAuth();
     }
 }
Example #5
void workerThread() {
    bool r = options["r"].trueValue();
    bool w = options["w"].trueValue();
    cout << "read:" << r << " write:" << w << endl;
    long long su = options["sleepMicros"].numberLong();
    Aligned a;
    while( 1 ) { 
        unsigned long long rofs = (rrand() * PG) % len;
        unsigned long long wofs = (rrand() * PG) % len;
        const unsigned P = PG/1024;
        if( mmf ) { 
            if( r ) {
                for( unsigned p = P; p <= recSizeKB; p += P ) {
                    if( rofs < len ) 
                        dummy += mmf[rofs];
                    rofs += PG;
                }
                iops.fetchAndAdd(1);
            }
            if( w ) {
                for( unsigned p = P; p <= recSizeKB; p += P ) {
                    if( wofs < len )
                        mmf[wofs] = 3;
                    wofs += PG;
                }
                iops.fetchAndAdd(1);
            }
        }
        else {
            if( r ) {
                lf->readAt(rofs, a.addr(), recSizeKB * 1024);
                iops.fetchAndAdd(1);
            }
            if( w ) {
                lf->writeAt(wofs, a.addr(), recSizeKB * 1024);
                iops.fetchAndAdd(1);
            }
        }
        long long micros = su / nThreadsRunning;
        if( micros ) {
            sleepmicros(micros);
        }
    }
}
Example #6
 void initializeWriterThread() {
     // Only do this once per thread
     if (!ClientBasic::getCurrent()) {
         string threadName = str::stream() << "repl writer worker " << replWriterWorkerId.addAndFetch(1);
         Client::initThread( threadName.c_str() );
         // allow us to get through the magic barrier
         Lock::ParallelBatchWriterMode::iAmABatchParticipant();
         replLocalAuth();
     }
 }
Example #7
    void run() {
        OldThreadPool tp(nThreads);

        for (unsigned i = 0; i < iterations; i++) {
            tp.schedule(&ThreadPoolTest::increment, this, 2);
        }

        tp.join();

        ASSERT_EQUALS(counter.load(), iterations * 2);
    }
Example #8
File: oid.cpp  Project: Benguang/mongo
    void OID::init() {
        static AtomicUInt32 inc(
            static_cast<unsigned>(
                scoped_ptr<SecureRandom>(SecureRandom::create())->nextInt64()));

        {
            unsigned t = (unsigned) time(0);
            unsigned char *T = (unsigned char *) &t;
            _time[0] = T[3]; // big endian order because we use memcmp() to compare OID's
            _time[1] = T[2];
            _time[2] = T[1];
            _time[3] = T[0];
        }

        _machineAndPid = ourMachineAndPid;

        {
            int new_inc = inc.fetchAndAdd(1);
            unsigned char *T = (unsigned char *) &new_inc;
            _inc[0] = T[2];
            _inc[1] = T[1];
            _inc[2] = T[0];
        }
    }
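The comment in OID::init above is the key detail: both the timestamp and the counter are stored most-significant-byte first so that byte-wise memcmp() ordering matches numeric ordering. A minimal standalone sketch of that trick for the 3-byte counter, with std::atomic standing in for AtomicUInt32:

#include <atomic>
#include <cassert>
#include <cstring>

static std::atomic<unsigned> inc{0};

struct Inc3 { unsigned char b[3]; };

Inc3 nextInc() {
    unsigned v = inc.fetch_add(1);   // fetchAndAdd(1) equivalent
    Inc3 out;
    out.b[0] = (v >> 16) & 0xff;     // most significant byte first ("big endian order")
    out.b[1] = (v >> 8) & 0xff;
    out.b[2] = v & 0xff;
    return out;
}

int main() {
    Inc3 a = nextInc(), b = nextInc();
    assert(std::memcmp(a.b, b.b, sizeof(a.b)) < 0);  // earlier value compares smaller
}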
Example #9
 void increment(unsigned n) {
     for (unsigned i = 0; i < n; i++) {
         counter.fetchAndAdd(1);
     }
 }
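Examples #7 and #9 are two halves of one thread pool test: schedule increment jobs, join, and assert the exact total. A self-contained equivalent using std::thread in place of OldThreadPool; the final count is exact because fetch_add (like fetchAndAdd above) is atomic:

#include <atomic>
#include <cassert>
#include <thread>
#include <vector>

int main() {
    std::atomic<unsigned> counter{0};
    const unsigned iterations = 1000;

    std::vector<std::thread> pool;
    for (unsigned t = 0; t < 4; t++)
        pool.emplace_back([&] {
            for (unsigned i = 0; i < iterations; i++)
                counter.fetch_add(2);  // same total as increment(2) per iteration
        });
    for (auto& th : pool) th.join();

    assert(counter.load() == 4 * iterations * 2);
}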
Example #10
File: dur.cpp  Project: Aaron20141021/mongo
    namespace dur {

        void PREPLOGBUFFER(JSectHeader& outParm, AlignedBuilder&);
        void WRITETOJOURNAL(JSectHeader h, AlignedBuilder& uncompressed);
        void WRITETODATAFILES(const JSectHeader& h, AlignedBuilder& uncompressed);

        /** declared later in this file
            only used in this file -- use DurableInterface::commitNow() outside
        */
        static void groupCommit();

        // Used to activate the flush thread
        static boost::mutex flushMutex;
        static boost::condition_variable flushRequested;

        // When set, the flush thread will exit
        static AtomicUInt32 shutdownRequested(0);


        CommitJob& commitJob = *(new CommitJob()); // don't destroy

        Stats stats;

        void Stats::S::reset() {
            memset(this, 0, sizeof(*this));
        }

        Stats::Stats() {
            _a.reset();
            _b.reset();
            curr = &_a;
            _intervalMicros = 3000000;
        }

        Stats::S * Stats::other() {
            return curr == &_a ? &_b : &_a;
        }
        string _CSVHeader();

        string Stats::S::_CSVHeader() { 
            return "cmts  jrnMB\twrDFMB\tcIWLk\tearly\tprpLgB  wrToJ\twrToDF\trmpPrVw";
        }

        string Stats::S::_asCSV() { 
            stringstream ss;
            ss << 
                setprecision(2) << 
                _commits << '\t' << fixed << 
                _journaledBytes / 1000000.0 << '\t' << 
                _writeToDataFilesBytes / 1000000.0 << '\t' << 
                _commitsInWriteLock << '\t' << 
                _earlyCommits <<  '\t' << 
                (unsigned) (_prepLogBufferMicros/1000) << '\t' << 
                (unsigned) (_writeToJournalMicros/1000) << '\t' << 
                (unsigned) (_writeToDataFilesMicros/1000) << '\t' << 
                (unsigned) (_remapPrivateViewMicros/1000);
            return ss.str();
        }

        BSONObj Stats::S::_asObj() {
            BSONObjBuilder b;
            b << 
                       "commits" << _commits <<
                       "journaledMB" << _journaledBytes / 1000000.0 <<
                       "writeToDataFilesMB" << _writeToDataFilesBytes / 1000000.0 <<
                       "compression" << _journaledBytes / (_uncompressedBytes+1.0) <<
                       "commitsInWriteLock" << _commitsInWriteLock <<
                       "earlyCommits" << _earlyCommits << 
                       "timeMs" <<
                       BSON( "dt" << _dtMillis <<
                             "prepLogBuffer" << (unsigned) (_prepLogBufferMicros/1000) <<
                             "writeToJournal" << (unsigned) (_writeToJournalMicros/1000) <<
                             "writeToDataFiles" << (unsigned) (_writeToDataFilesMicros/1000) <<
                             "remapPrivateView" << (unsigned) (_remapPrivateViewMicros/1000)
                           );
            if (storageGlobalParams.journalCommitInterval != 0)
                b << "journalCommitIntervalMs" << storageGlobalParams.journalCommitInterval;
            return b.obj();
        }

        BSONObj Stats::asObj() {
            return other()->_asObj();
        }

        void Stats::rotate() {
            unsigned long long now = curTimeMicros64();
            unsigned long long dt = now - _lastRotate;
            if( dt >= _intervalMicros && _intervalMicros ) {
                // rotate
                curr->_dtMillis = (unsigned) (dt/1000);
                _lastRotate = now;
                curr = other();
                curr->reset();
            }
        }

        void* NonDurableImpl::writingPtr(void *x, unsigned len) {
            dassert(shutdownRequested.load() == 0);
            return x; 
        }

        void NonDurableImpl::declareWriteIntent(void *, unsigned) { 
        }

        bool NonDurableImpl::commitNow(OperationContext* txn) {
            cc().checkpointHappened();   // XXX: remove when all dur goes through DurRecoveryUnit
            return false;
        }

        bool NonDurableImpl::commitIfNeeded(OperationContext* txn) {
            cc().checkpointHappened();   // XXX: remove when all dur goes through DurRecoveryUnit
            return false;
        }



        static DurableImpl* durableImpl = new DurableImpl();
        static NonDurableImpl* nonDurableImpl = new NonDurableImpl();
        DurableInterface* DurableInterface::_impl = nonDurableImpl;

        void DurableInterface::enableDurability() {
            verify(_impl == nonDurableImpl);
            _impl = durableImpl;
        }


        bool DurableImpl::commitNow(OperationContext* txn) {
            stats.curr->_earlyCommits++;

            NotifyAll::When when = commitJob._notify.now();

            AutoYieldFlushLockForMMAPV1Commit flushLockYield(txn->lockState());

            // There is always just one waiting anyways
            flushRequested.notify_one();
            commitJob._notify.waitFor(when);

            cc().checkpointHappened();
            return true;
        }

        bool DurableImpl::awaitCommit() {
            commitJob._notify.awaitBeyondNow();
            return true;
        }

        /** Declare that a file has been created
            Normally writes are applied only after journaling, for safety.  But here the file
            is created first, and the journal will just replay the creation if the create didn't
            happen because of crashing.
        */
        void DurableImpl::createdFile(const std::string& filename, unsigned long long len) {
            shared_ptr<DurOp> op( new FileCreatedOp(filename, len) );
            commitJob.noteOp(op);
        }

        void* DurableImpl::writingPtr(void *x, unsigned len) {
            dassert(shutdownRequested.load() == 0);
            void *p = x;
            declareWriteIntent(p, len);
            return p;
        }

        /** declare intent to write
            @param ofs offset within buf at which we will write
            @param len the length at ofs we will write
            @return new buffer pointer.
        */
        void* DurableImpl::writingAtOffset(void *buf, unsigned ofs, unsigned len) {
            char *p = (char *) buf;
            declareWriteIntent(p+ofs, len);
            return p;
        }

        void* DurableImpl::writingRangesAtOffsets(void *buf, const vector< pair< long long, unsigned > > &ranges ) {
            char *p = (char *) buf;
            for( vector< pair< long long, unsigned > >::const_iterator i = ranges.begin();
                    i != ranges.end(); ++i ) {
                declareWriteIntent( p + i->first, i->second );
            }
            return p;
        }

        bool DurableImpl::commitIfNeeded(OperationContext* txn) {
            // this is safe since conceptually if you call commitIfNeeded, we're at a valid
            // spot in an operation to be terminated.
            cc().checkpointHappened();

            if (MONGO_likely(commitJob.bytes() < UncommittedBytesLimit)) {
                return false;
            }

            // Just wake up the flush thread
            flushRequested.notify_one();
            return true;
        }

        void DurableImpl::commitAndStopDurThread(OperationContext* txn) {
            // This is only called by clean shutdown and the global lock must be held to ensure
            // there will not be any more writes.
            invariant(txn->lockState()->isW());

            commitNow(txn);
            shutdownRequested.store(1);
        }


        // Functor to be called over all MongoFiles

        class validateSingleMapMatches {
        public:
            validateSingleMapMatches(unsigned long long& bytes) :_bytes(bytes)  {}
            void operator () (MongoFile *mf) {
                if( mf->isDurableMappedFile() ) {
                    DurableMappedFile *mmf = (DurableMappedFile*) mf;
                    const unsigned char *p = (const unsigned char *) mmf->getView();
                    const unsigned char *w = (const unsigned char *) mmf->view_write();

                    if (!p || !w) return; // File not fully opened yet

                    _bytes += mmf->length();

                    verify( mmf->length() == (unsigned) mmf->length() );

                    if (memcmp(p, w, (unsigned) mmf->length()) == 0)
                        return; // next file

                    unsigned low = 0xffffffff;
                    unsigned high = 0;
                    log() << "DurParanoid mismatch in " << mmf->filename() << endl;
                    int logged = 0;
                    unsigned lastMismatch = 0xffffffff;
                    for( unsigned i = 0; i < mmf->length(); i++ ) {
                        if( p[i] != w[i] ) {
                            if( lastMismatch != 0xffffffff && lastMismatch+1 != i )
                                log() << endl; // separate blocks of mismatches
                            lastMismatch= i;
                            if( ++logged < 60 ) {
                                if( logged == 1 )
                                    log() << "ofs % 628 = 0x" << hex << (i%628) << endl; // for .ns files to find offset in record
                                stringstream ss;
                                ss << "mismatch ofs:" << hex << i <<  "\tfilemap:" << setw(2) << (unsigned) w[i] << "\tprivmap:" << setw(2) << (unsigned) p[i];
                                if( p[i] > 32 && p[i] <= 126 )
                                    ss << '\t' << p[i];
                                log() << ss.str() << endl;
                            }
                            if( logged == 60 )
                                log() << "..." << endl;
                            if( i < low ) low = i;
                            if( i > high ) high = i;
                        }
                    }
                    if( low != 0xffffffff ) {
                        std::stringstream ss;
                        ss << "journal error warning views mismatch " << mmf->filename() << ' ' << (hex) << low << ".." << high << " len:" << high-low+1;
                        log() << ss.str() << endl;
                        log() << "priv loc: " << (void*)(p+low) << ' ' << endl;
                        //vector<WriteIntent>& _intents = commitJob.wi()._intents;
                        //(void) _intents; // mark as unused. Useful for inspection in debugger

                        // should we abort() here so this isn't unnoticed in some circumstances?
                        massert(13599, "Written data does not match in-memory view. Missing WriteIntent?", false);
                    }
                }
            }
        private:
            unsigned long long& _bytes;
        };

        /** (SLOW) diagnostic to check that the private view and the non-private view are in sync.
        */
        void debugValidateAllMapsMatch() {
            if (!(storageGlobalParams.durOptions & StorageGlobalParams::DurParanoid))
                return;

            unsigned long long bytes = 0;
            Timer t;
            MongoFile::forEach(validateSingleMapMatches(bytes));
            OCCASIONALLY log() << "DurParanoid map check " << t.millis() << "ms for " <<  (bytes / (1024*1024)) << "MB" << endl;
        }

        extern size_t privateMapBytes;

        static void _REMAPPRIVATEVIEW() {
            // todo: Consider using ProcessInfo herein and watching for getResidentSize to drop.  that could be a way 
            //       to assure very good behavior here.

            static unsigned startAt;
            static unsigned long long lastRemap;

            LOG(4) << "journal REMAPPRIVATEVIEW" << endl;

            invariant(!commitJob.hasWritten());

            // we want to remap all private views about every 2 seconds.  there could be ~1000 views so
            // we do a little each pass; beyond the remap time, more significantly, there will be copy on write
            // faults after remapping, so doing a little bit at a time will avoid big load spikes on
            // remapping.
            unsigned long long now = curTimeMicros64();
            double fraction = (now-lastRemap)/2000000.0;
            if (storageGlobalParams.durOptions & StorageGlobalParams::DurAlwaysRemap)
                fraction = 1;
            lastRemap = now;

#if defined(_WIN32) || defined(__sunos__)
            // Note that this negatively affects performance.
            // We must grab the exclusive lock here because remapPrivateView() on Windows and
            // Solaris need to grab it as well, due to the lack of an atomic way to remap a
            // memory mapped file.
            // See SERVER-5723 for performance improvement.
            // See SERVER-5680 to see why this code is necessary on Windows.
            // See SERVER-8795 to see why this code is necessary on Solaris.
            LockMongoFilesExclusive lk;
#else
            LockMongoFilesShared lk;
#endif
            set<MongoFile*>& files = MongoFile::getAllFiles();
            unsigned sz = files.size();
            if( sz == 0 )
                return;

            {
                // be careful not to use too much memory if the write rate is 
                // extremely high
                double f = privateMapBytes / ((double)UncommittedBytesLimit);
                if( f > fraction ) { 
                    fraction = f;
                }
                privateMapBytes = 0;
            }

            unsigned ntodo = (unsigned) (sz * fraction);
            if( ntodo < 1 ) ntodo = 1;
            if( ntodo > sz ) ntodo = sz;

            const set<MongoFile*>::iterator b = files.begin();
            const set<MongoFile*>::iterator e = files.end();
            set<MongoFile*>::iterator i = b;
            // skip to our starting position
            for( unsigned x = 0; x < startAt; x++ ) {
                i++;
                if( i == e ) i = b;
            }
            unsigned startedAt = startAt;
            startAt = (startAt + ntodo) % sz; // mark where to start next time

            Timer t;
            for( unsigned x = 0; x < ntodo; x++ ) {
                dassert( i != e );
                if( (*i)->isDurableMappedFile() ) {
                    DurableMappedFile *mmf = (DurableMappedFile*) *i;
                    verify(mmf);
                    if( mmf->willNeedRemap() ) {
                        mmf->remapThePrivateView();
                    }
                    i++;
                    if( i == e ) i = b;
                }
            }
            LOG(2) << "journal REMAPPRIVATEVIEW done startedAt: " << startedAt << " n:" << ntodo << ' ' << t.millis() << "ms" << endl;
        }

        /** We need to remap the private views periodically. otherwise they would become very large.
            Call within write lock.  See top of file for more commentary.
        */
        static void REMAPPRIVATEVIEW() {
            Timer t;
            _REMAPPRIVATEVIEW();
            stats.curr->_remapPrivateViewMicros += t.micros();
        }

        // this is conceptually a local variable of the groupcommit functions
        // below.  however we keep it static so that we don't have to
        // reallocate, and more importantly regrow, it on every single commit.
        static AlignedBuilder __theBuilder(4 * 1024 * 1024);


        static void _groupCommit() {
            LOG(4) << "_groupCommit " << endl;

            {
                AlignedBuilder &ab = __theBuilder;

                // we need to make sure two group commits aren't running at the same time
                // (and we are only read locked in the dbMutex, so it could happen -- while 
                // there is only one dur thread, "early commits" can be done by other threads)
                SimpleMutex::scoped_lock lk(commitJob.groupCommitMutex);

                commitJob.commitingBegin();

                if( !commitJob.hasWritten() ) {
                    // getlasterror request could have come after the data was already committed
                    commitJob.committingNotifyCommitted();
                }
                else {
                    JSectHeader h;
                    PREPLOGBUFFER(h,ab);

                    // todo : write to the journal outside locks, as this write can be slow.
                    //        however, be careful then about remapprivateview as that cannot be done 
                    //        if new writes are then pending in the private maps.
                    WRITETOJOURNAL(h, ab);

                    // data is now in the journal, which is sufficient for acknowledging getLastError.
                    // (ok to crash after that)
                    commitJob.committingNotifyCommitted();

                    WRITETODATAFILES(h, ab);
                    debugValidateAllMapsMatch();

                    commitJob.committingReset();
                    ab.reset();
                }
            }
        }

        /** locking: in at least 'R' when called
                     or, for early commits (commitIfNeeded), in W or X
            @param lwg set by the durcommitthread *only* -- then we will upgrade the lock
                   to W so we can remapprivateview. only the durcommitthread calls with
                   lwg != 0, as more than one thread upgrading would deadlock
            @see DurableMappedFile::close()
        */
        static void groupCommit() {
            try {
                _groupCommit();
            }
            catch(DBException& e ) { 
                log() << "dbexception in groupCommit causing immediate shutdown: " << e.toString() << endl;
                mongoAbort("gc1");
            }
            catch(std::ios_base::failure& e) { 
                log() << "ios_base exception in groupCommit causing immediate shutdown: " << e.what() << endl;
                mongoAbort("gc2");
            }
            catch(std::bad_alloc& e) { 
                log() << "bad_alloc exception in groupCommit causing immediate shutdown: " << e.what() << endl;
                mongoAbort("gc3");
            }
            catch(std::exception& e) {
                log() << "exception in groupCommit causing immediate shutdown: " << e.what() << endl;
                mongoAbort("gc4");
            }
            LOG(4) << "groupCommit end" << endl;
        }

        static void remapPrivateView() {
            try {
                // REMAPPRIVATEVIEW
                //
                // remapping private views must occur after WRITETODATAFILES otherwise
                // we wouldn't see newly written data on reads.
                //
                invariant(!commitJob.hasWritten());

                stats.curr->_commitsInWriteLock++;

                REMAPPRIVATEVIEW();
            }
            catch (DBException& e) {
                log() << "dbexception in remapPrivateView causing immediate shutdown: "
                      << e.toString() 
                      << endl;
                mongoAbort("gc1");
            }
            catch (std::ios_base::failure& e) {
                log() << "ios_base exception in remapPrivateView causing immediate shutdown: "
                      << e.what()
                      << endl;
                mongoAbort("gc2");
            }
            catch (std::bad_alloc& e) {
                log() << "bad_alloc exception in remapPrivateView causing immediate shutdown: "
                      << e.what()
                      << endl;
                mongoAbort("gc3");
            }
            catch (std::exception& e) {
                log() << "exception in remapPrivateView causing immediate shutdown: "
                      << e.what()
                      << endl;
                mongoAbort("gc4");
            }

            LOG(4) << "remapPrivateView end" << endl;
        }


        /** called when a DurableMappedFile is closing -- we need to go ahead and group commit in that case before its
            views disappear
        */
        void closingFileNotification() {
            if (!storageGlobalParams.dur)
                return;

            if (commitJob.hasWritten()) {
                if (inShutdown()) {
                    log() << "journal warning files are closing outside locks with writes pending"
                          << endl;
                }
                else {
                    fassert(18507, !"File is closing while there are unwritten changes.");
                }
            }
        }

        extern int groupCommitIntervalMs;
        boost::filesystem::path getJournalDir();

        static void durThread() {
            Client::initThread("journal");

            bool samePartition = true;
            try {
                const std::string dbpathDir =
                    boost::filesystem::path(storageGlobalParams.dbpath).string();
                samePartition = onSamePartition(getJournalDir().string(), dbpathDir);
            }
            catch(...) {

            }

            while (shutdownRequested.loadRelaxed() == 0) {
                unsigned ms = storageGlobalParams.journalCommitInterval;
                if( ms == 0 ) { 
                    ms = samePartition ? 100 : 30;
                }

                unsigned oneThird = (ms / 3) + 1; // +1 so never zero

                try {
                    stats.rotate();

                    boost::mutex::scoped_lock lock(flushMutex);

                    // commit sooner if one or more getLastError j:true is pending
                    for (unsigned i = 0; i <= 2; i++) {
                        if (flushRequested.timed_wait(lock,
                                                      Milliseconds(oneThird))) {
                            // Someone forced a flush
                            break;
                        }

                        if (commitJob._notify.nWaiting())
                            break;
                        if (commitJob.bytes() > UncommittedBytesLimit / 2)
                            break;
                    }

                    OperationContextImpl txn;

                    // Waits for all active operations to drain and won't let new ones start. This
                    // should be optimized to allow readers in (see SERVER-15262).
                    AutoAcquireFlushLockForMMAPV1Commit flushLock(txn.lockState());

                    groupCommit();
                    remapPrivateView();
                }
                catch(std::exception& e) {
                    log() << "exception in durThread causing immediate shutdown: " << e.what() << endl;
                    mongoAbort("exception in durThread");
                }
                catch (...) {
                    log() << "unhandled exception in durThread causing immediate shutdown" << endl;
                    mongoAbort("unhandled exception in durThread");
                }
            }

            cc().shutdown();
        }

        void preallocateFiles();

        /** at startup, recover, and then start the journal threads */
        void startup() {
            if (!storageGlobalParams.dur)
                return;

            journalMakeDir();

            try {
                replayJournalFilesAtStartup();
            }
            catch(DBException& e) {
                log() << "dbexception during recovery: " << e.toString() << endl;
                throw;
            }
            catch(std::exception& e) {
                log() << "std::exception during recovery: " << e.what() << endl;
                throw;
            }
            catch(...) {
                log() << "exception during recovery" << endl;
                throw;
            }

            preallocateFiles();

            DurableInterface::enableDurability();
            boost::thread t(durThread);
        }

        DurableInterface::~DurableInterface() {
            log() << "ERROR warning ~DurableInterface not intended to be called" << std::endl;
        }

        void DurableImpl::syncDataAndTruncateJournal(OperationContext* txn) {
            invariant(txn->lockState()->isW());

            // a commit from the commit thread won't begin while we are in the write lock,
            // but it may already be in progress and the end of that work is done outside 
            // (dbMutex) locks. This line waits for that to complete if already underway.
            {
                SimpleMutex::scoped_lock lk(commitJob.groupCommitMutex);
            }

            commitNow(txn);
            MongoFile::flushAll(true);
            journalCleanup();

            invariant(!haveJournalFiles()); // Double check post-conditions
        }
        
        class DurSSS : public ServerStatusSection {
        public:
            DurSSS() : ServerStatusSection( "dur" ){}
            virtual bool includeByDefault() const { return true; }
            
            BSONObj generateSection(const BSONElement& configElement) const {
                if (!storageGlobalParams.dur)
                    return BSONObj();
                return dur::stats.asObj();
            }
                
        } durSSS;


    } // namespace dur
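durThread above is the pattern worth studying in this file: a bounded wait that sleeps at most a third of the commit interval at a time, wakes early when commitNow/commitIfNeeded signal flushRequested or uncommitted bytes pile up, and exits once commitAndStopDurThread stores into shutdownRequested. Below is a minimal standalone sketch of that skeleton, with std::condition_variable in place of boost and the commit work elided:

#include <atomic>
#include <chrono>
#include <condition_variable>
#include <mutex>
#include <thread>

static std::mutex flushMutex;
static std::condition_variable flushRequested;
static std::atomic<int> shutdownRequested{0};

void requestFlush() { flushRequested.notify_one(); }    // cf. commitNow / commitIfNeeded
void requestShutdown() { shutdownRequested.store(1); }  // cf. commitAndStopDurThread

void durThreadSketch(unsigned commitIntervalMs) {
    while (shutdownRequested.load(std::memory_order_relaxed) == 0) {
        unsigned oneThird = (commitIntervalMs / 3) + 1;  // +1 so never zero
        {
            std::unique_lock<std::mutex> lock(flushMutex);
            for (unsigned i = 0; i <= 2; i++) {
                // no_timeout means someone forced a flush; stop waiting early
                if (flushRequested.wait_for(lock, std::chrono::milliseconds(oneThird)) ==
                    std::cv_status::no_timeout)
                    break;
                // ... the real loop also breaks early when waiting getLastError
                //     callers or uncommitted bytes pass a threshold ...
            }
        }
        // ... groupCommit(); remapPrivateView(); ...
    }
}

int main() {
    std::thread t(durThreadSketch, 100);
    requestFlush();     // wake the loop early
    requestShutdown();  // let it exit on its next flag check
    t.join();
}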
Example #11
namespace repl {

    static Counter64 opsAppliedStats;

    // The oplog entries applied
    static ServerStatusMetricField<Counter64> displayOpsApplied( "repl.apply.ops",
                                                                &opsAppliedStats );

    MONGO_FP_DECLARE(rsSyncApplyStop);

    // Number and time of each ApplyOps worker pool round
    static TimerStats applyBatchStats;
    static ServerStatusMetricField<TimerStats> displayOpBatchesApplied(
                                                    "repl.apply.batches",
                                                    &applyBatchStats );
    void initializePrefetchThread() {
        if (!ClientBasic::getCurrent()) {
            Client::initThread("repl prefetch worker");
            replLocalAuth();
        }
    }

    SyncTail::SyncTail(BackgroundSyncInterface *q) :
        Sync(""), oplogVersion(0), _networkQueue(q)
    {}

    SyncTail::~SyncTail() {}

    bool SyncTail::peek(BSONObj* op) {
        return _networkQueue->peek(op);
    }
    /* apply the log op that is in param op
       @return bool success (true) or failure (false)
    */
    bool SyncTail::syncApply(
                        OperationContext* txn, const BSONObj &op, bool convertUpdateToUpsert) {
        const char *ns = op.getStringField("ns");
        verify(ns);

        if ( (*ns == '\0') || (*ns == '.') ) {
            // this is ugly
            // this is often a no-op
            // but can't be 100% sure
            if( *op.getStringField("op") != 'n' ) {
                error() << "replSet skipping bad op in oplog: " << op.toString() << rsLog;
            }
            return true;
        }

        bool isCommand(op["op"].valuestrsafe()[0] == 'c');

        boost::scoped_ptr<Lock::ScopedLock> lk;

        if(isCommand) {
            // a command may need a global write lock. so we will conservatively go 
            // ahead and grab one here. suboptimal. :-(
            lk.reset(new Lock::GlobalWrite(txn->lockState()));
        } else {
            // DB level lock for this operation
            lk.reset(new Lock::DBWrite(txn->lockState(), ns)); 
        }

        Client::Context ctx(ns);
        ctx.getClient()->curop()->reset();
        // For non-initial-sync, we convert updates to upserts
        // to suppress errors when replaying oplog entries.
        bool ok = !applyOperation_inlock(txn, ctx.db(), op, true, convertUpdateToUpsert);
        opsAppliedStats.increment();
        txn->recoveryUnit()->commitIfNeeded();

        return ok;
    }

    // The pool threads call this to prefetch each op
    void SyncTail::prefetchOp(const BSONObj& op) {
        initializePrefetchThread();

        const char *ns = op.getStringField("ns");
        if (ns && (ns[0] != '\0')) {
            try {
                // one possible tweak here would be to stay in the read lock for this database 
                // for multiple prefetches if they are for the same database.
                OperationContextImpl txn;
                Client::ReadContext ctx(&txn, ns);
                prefetchPagesForReplicatedOp(&txn, ctx.ctx().db(), op);
            }
            catch (const DBException& e) {
                LOG(2) << "ignoring exception in prefetchOp(): " << e.what() << endl;
            }
            catch (const std::exception& e) {
                log() << "Unhandled std::exception in prefetchOp(): " << e.what() << endl;
                fassertFailed(16397);
            }
        }
    }

    // Doles out all the work to the reader pool threads and waits for them to complete
    void SyncTail::prefetchOps(const std::deque<BSONObj>& ops) {
        threadpool::ThreadPool& prefetcherPool = theReplSet->getPrefetchPool();
        for (std::deque<BSONObj>::const_iterator it = ops.begin();
             it != ops.end();
             ++it) {
            prefetcherPool.schedule(&prefetchOp, *it);
        }
        prefetcherPool.join();
    }
    
    // Doles out all the work to the writer pool threads and waits for them to complete
    void SyncTail::applyOps(const std::vector< std::vector<BSONObj> >& writerVectors, 
                                     MultiSyncApplyFunc applyFunc) {
        ThreadPool& writerPool = theReplSet->getWriterPool();
        TimerHolder timer(&applyBatchStats);
        for (std::vector< std::vector<BSONObj> >::const_iterator it = writerVectors.begin();
             it != writerVectors.end();
             ++it) {
            if (!it->empty()) {
                writerPool.schedule(applyFunc, boost::cref(*it), this);
            }
        }
        writerPool.join();
    }

    // Doles out all the work to the writer pool threads and waits for them to complete
    void SyncTail::multiApply( std::deque<BSONObj>& ops, MultiSyncApplyFunc applyFunc ) {

        // Use a ThreadPool to prefetch all the operations in a batch.
        prefetchOps(ops);
        
        std::vector< std::vector<BSONObj> > writerVectors(theReplSet->replWriterThreadCount);
        fillWriterVectors(ops, &writerVectors);
        LOG(2) << "replication batch size is " << ops.size() << endl;
        // We must grab this because we're going to grab write locks later.
        // We hold this mutex the entire time we're writing; it doesn't matter
        // because all readers are blocked anyway.
        SimpleMutex::scoped_lock fsynclk(filesLockedFsync);

        // stop all readers until we're done
        Lock::ParallelBatchWriterMode pbwm;

        applyOps(writerVectors, applyFunc);
    }


    void SyncTail::fillWriterVectors(const std::deque<BSONObj>& ops, 
                                              std::vector< std::vector<BSONObj> >* writerVectors) {
        for (std::deque<BSONObj>::const_iterator it = ops.begin();
             it != ops.end();
             ++it) {
            const BSONElement e = it->getField("ns");
            verify(e.type() == String);
            const char* ns = e.valuestr();
            int len = e.valuestrsize();
            uint32_t hash = 0;
            MurmurHash3_x86_32( ns, len, 0, &hash);

            (*writerVectors)[hash % writerVectors->size()].push_back(*it);
        }
    }


    BSONObj SyncTail::oplogApplySegment(const BSONObj& applyGTEObj, const BSONObj& minValidObj,
                                     MultiSyncApplyFunc func) {
        OpTime applyGTE = applyGTEObj["ts"]._opTime();
        OpTime minValid = minValidObj["ts"]._opTime();

        // We have to keep track of the last op applied to the data, because there's no other easy
        // way of getting this data synchronously.  Batches may go past minValidObj, so we need to
        // know to bump minValid past minValidObj.
        BSONObj lastOp = applyGTEObj;
        OpTime ts = applyGTE;

        time_t start = time(0);
        time_t now = start;

        unsigned long long n = 0, lastN = 0;

        while( ts < minValid ) {
            OpQueue ops;

            while (ops.getSize() < replBatchLimitBytes) {
                if (tryPopAndWaitForMore(&ops)) {
                    break;
                }

                // apply replication batch limits
                now = time(0);
                if (!ops.empty()) {
                    if (now > replBatchLimitSeconds)
                        break;
                    if (ops.getDeque().size() > replBatchLimitOperations)
                        break;
                }
            }
            setOplogVersion(ops.getDeque().front());

            multiApply(ops.getDeque(), func);

            n += ops.getDeque().size();

            if ( n > lastN + 1000 ) {
                if (now - start > 10) {
                    // simple progress metering
                    log() << "replSet initialSyncOplogApplication applied "
                          << n << " operations, synced to "
                          << ts.toStringPretty() << rsLog;
                    start = now;
                    lastN = n;
                }
            }

            // we want to keep a record of the last op applied, to compare with minvalid
            lastOp = ops.getDeque().back();
            OpTime tempTs = lastOp["ts"]._opTime();
            applyOpsToOplog(&ops.getDeque());

            ts = tempTs;
        }

        return lastOp;
    }

    BSONObj SyncTail::oplogApplication(const BSONObj& applyGTEObj, const BSONObj& minValidObj) {
        return oplogApplySegment(applyGTEObj, minValidObj, multiSyncApply);
    }

    void SyncTail::setOplogVersion(const BSONObj& op) {
        BSONElement version = op["v"];
        // old primaries do not get the unique index ignoring feature
        // because some of their ops are not idempotent, see
        // SERVER-7186
        if (version.eoo()) {
            theReplSet->oplogVersion = 1;
            RARELY log() << "warning replset primary is an older version than we are;"
                         << " upgrade recommended";
        } else {
            theReplSet->oplogVersion = version.Int();
        }
    }

    /* tail an oplog.  ok to return, will be re-called. */
    void SyncTail::oplogApplication() {
        while( 1 ) {
            OpQueue ops;

            verify( !Lock::isLocked() );

            Timer batchTimer;
            int lastTimeChecked = 0;

            do {
                if (theReplSet->isPrimary()) {
                    massert(16620, "there are ops to sync, but I'm primary", ops.empty());
                    return;
                }

                int now = batchTimer.seconds();

                // apply replication batch limits
                if (!ops.empty()) {
                    if (now > replBatchLimitSeconds)
                        break;
                    if (ops.getDeque().size() > replBatchLimitOperations)
                        break;
                }
                // occasionally check some things
                // (always checked in the first iteration of this do-while loop, because
                // ops is empty)
                if (ops.empty() || now > lastTimeChecked) {
                    {
                        boost::unique_lock<boost::mutex> lock(theReplSet->initialSyncMutex);
                        if (theReplSet->initialSyncRequested) {
                            // got a resync command
                            return;
                        }
                    }
                    lastTimeChecked = now;
                    // can we become secondary?
                    // we have to check this before calling mgr, as we must be a secondary to
                    // become primary
                    if (!theReplSet->isSecondary()) {
                        OpTime minvalid;

                        OperationContextImpl txn;
                        theReplSet->tryToGoLiveAsASecondary(&txn, minvalid);
                    }

                    // normally msgCheckNewState gets called periodically, but in a single node
                    // replset there are no heartbeat threads, so we do it here to be sure.  this is
                    // relevant if the singleton member has done a stepDown() and needs to come back
                    // up.
                    if (theReplSet->config().members.size() == 1 &&
                        theReplSet->myConfig().potentiallyHot()) {
                        Manager* mgr = theReplSet->mgr;
                        // When would mgr be null?  During replsettest'ing, in which case we should
                        // fall through and actually apply ops as if we were a real secondary.
                        if (mgr) { 
                            mgr->send(stdx::bind(&Manager::msgCheckNewState, theReplSet->mgr));
                            sleepsecs(1);
                            // There should never be ops to sync in a 1-member set, anyway
                            return;
                        }
                    }
                }

                const int slaveDelaySecs = theReplSet->myConfig().slaveDelay;
                if (!ops.empty() && slaveDelaySecs > 0) {
                    const BSONObj& lastOp = ops.getDeque().back();
                    const unsigned int opTimestampSecs = lastOp["ts"]._opTime().getSecs();

                    // Stop the batch as the lastOp is too new to be applied. If we continue
                    // on, we can get ops that are way ahead of the delay and this will
                    // make this thread sleep longer when handleSlaveDelay is called
                    // and apply ops much sooner than we like.
                    if (opTimestampSecs > static_cast<unsigned int>(time(0) - slaveDelaySecs)) {
                        break;
                    }
                }
                // keep fetching more ops as long as we haven't filled up a full batch yet
            } while (!tryPopAndWaitForMore(&ops) && // tryPopAndWaitForMore returns true 
                                                    // when we need to end a batch early
                   (ops.getSize() < replBatchLimitBytes));

            // For pausing replication in tests
            while (MONGO_FAIL_POINT(rsSyncApplyStop)) {
                sleepmillis(0);
            }

            const BSONObj& lastOp = ops.getDeque().back();
            setOplogVersion(lastOp);
            handleSlaveDelay(lastOp);

            // Set minValid to the last op to be applied in this next batch.
            // This will cause this node to go into RECOVERING state
            // if we should crash and restart before updating the oplog
            theReplSet->setMinValid(lastOp);

            if (BackgroundSync::get()->isAssumingPrimary()) {
                LOG(1) << "about to apply batch up to optime: "
                       << ops.getDeque().back()["ts"]._opTime().toStringPretty();
            }
            
            multiApply(ops.getDeque(), multiSyncApply);

            if (BackgroundSync::get()->isAssumingPrimary()) {
                LOG(1) << "about to update oplog to optime: "
                       << ops.getDeque().back()["ts"]._opTime().toStringPretty();
            }
            
            applyOpsToOplog(&ops.getDeque());

            // If we're just testing (no manager), don't keep looping if we exhausted the bgqueue
            if (!theReplSet->mgr) {
                BSONObj op;
                if (!peek(&op)) {
                    return;
                }
            }
        }
    }

    // Copies ops out of the bgsync queue into the deque passed in as a parameter.
    // Returns true if the batch should be ended early.
    // Batch should end early if we encounter a command, or if
    // there are no further ops in the bgsync queue to read.
    // This function also blocks 1 second waiting for new ops to appear in the bgsync
    // queue.  We can't block forever because there are maintenance things we need
    // to periodically check in the loop.
    bool SyncTail::tryPopAndWaitForMore(SyncTail::OpQueue* ops) {
        BSONObj op;
        // Check to see if there are ops waiting in the bgsync queue
        bool peek_success = peek(&op);

        if (!peek_success) {
            // if we don't have anything in the queue, wait a bit for something to appear
            if (ops->empty()) {
                // block up to 1 second
                _networkQueue->waitForMore();
                return false;
            }

            // otherwise, apply what we have
            return true;
        }

        const char* ns = op["ns"].valuestrsafe();

        // check for commands
        if ((op["op"].valuestrsafe()[0] == 'c') ||
            // Index builds are achieved through the use of an insert op, not a command op.
            // The following line is the same as what the insert code uses to detect an index build.
            ( *ns != '\0' && nsToCollectionSubstring(ns) == "system.indexes" )) {

            if (ops->empty()) {
                // apply commands one-at-a-time
                ops->push_back(op);
                _networkQueue->consume();
            }

            // otherwise, apply what we have so far and come back for the command
            return true;
        }

        // check for oplog version change
        BSONElement elemVersion = op["v"];
        int curVersion = 0;
        if (elemVersion.eoo())
            // missing version means version 1
            curVersion = 1;
        else
            curVersion = elemVersion.Int();

        if (curVersion != oplogVersion) {
            // Version changes cause us to end a batch.
            // If we are starting a new batch, reset version number
            // and continue.
            if (ops->empty()) {
                oplogVersion = curVersion;
            } 
            else {
                // End batch early
                return true;
            }
        }
    
        // Copy the op to the deque and remove it from the bgsync queue.
        ops->push_back(op);
        _networkQueue->consume();

        // Go back for more ops
        return false;
    }

    void SyncTail::applyOpsToOplog(std::deque<BSONObj>* ops) {
        {
            OperationContextImpl txn; // XXX?
            Lock::DBWrite lk(txn.lockState(), "local");

            while (!ops->empty()) {
                const BSONObj& op = ops->front();
                // this updates theReplSet->lastOpTimeWritten
                _logOpObjRS(op);
                ops->pop_front();
             }
        }

        if (BackgroundSync::get()->isAssumingPrimary()) {
            LOG(1) << "notifying BackgroundSync";
        }
            
        // Update write concern on primary
        BackgroundSync::notify();
    }

    void SyncTail::handleSlaveDelay(const BSONObj& lastOp) {
        int sd = theReplSet->myConfig().slaveDelay;

        // ignore slaveDelay if the box is still initializing. once
        // it becomes secondary we can worry about it.
        if( sd && theReplSet->isSecondary() ) {
            const OpTime ts = lastOp["ts"]._opTime();
            long long a = ts.getSecs();
            long long b = time(0);
            long long lag = b - a;
            long long sleeptime = sd - lag;
            if( sleeptime > 0 ) {
                uassert(12000, "rs slaveDelay differential too big check clocks and systems",
                        sleeptime < 0x40000000);
                if( sleeptime < 60 ) {
                    sleepsecs((int) sleeptime);
                }
                else {
                    log() << "replSet slavedelay sleep long time: " << sleeptime << rsLog;
                    // sleep(hours) would prevent reconfigs from taking effect & such!
                    long long waitUntil = b + sleeptime;
                    while( 1 ) {
                        sleepsecs(6);
                        if( time(0) >= waitUntil )
                            break;

                        if( theReplSet->myConfig().slaveDelay != sd ) // reconf
                            break;
                    }
                }
            }
        } // endif slaveDelay
    }

    static AtomicUInt32 replWriterWorkerId;

    void initializeWriterThread() {
        // Only do this once per thread
        if (!ClientBasic::getCurrent()) {
            string threadName = str::stream() << "repl writer worker "
                                              << replWriterWorkerId.addAndFetch(1);
            Client::initThread( threadName.c_str() );
            // allow us to get through the magic barrier
            Lock::ParallelBatchWriterMode::iAmABatchParticipant();
            replLocalAuth();
        }
    }

    // This free function is used by the writer threads to apply each op
    void multiSyncApply(const std::vector<BSONObj>& ops, SyncTail* st) {
        initializeWriterThread();

        // convert update operations only for 2.2.1 or greater, because we need guaranteed
        // idempotent operations for this to work.  See SERVER-6825
        bool convertUpdatesToUpserts = theReplSet->oplogVersion > 1 ? true : false;

        for (std::vector<BSONObj>::const_iterator it = ops.begin();
             it != ops.end();
             ++it) {
            try {
                OperationContextImpl txn;
                if (!st->syncApply(&txn, *it, convertUpdatesToUpserts)) {
                    fassertFailedNoTrace(16359);
                }
            } catch (const DBException& e) {
                error() << "writer worker caught exception: " << causedBy(e)
                        << " on: " << it->toString() << endl;
                fassertFailedNoTrace(16360);
            }
        }
    }

    // This free function is used by the initial sync writer threads to apply each op
    void multiInitialSyncApply(const std::vector<BSONObj>& ops, SyncTail* st) {
        initializeWriterThread();
        for (std::vector<BSONObj>::const_iterator it = ops.begin();
             it != ops.end();
             ++it) {
            try {
                OperationContextImpl txn;

                if (!st->syncApply(&txn, *it)) {
                    bool status;
                    {
                        Lock::GlobalWrite lk(txn.lockState());
                        status = st->shouldRetry(&txn, *it);
                    }

                    if (status) {
                        // retry
                        if (!st->syncApply(&txn, *it)) {
                            fassertFailedNoTrace(15915);
                        }
                    }
                    // If shouldRetry() returns false, fall through.
                    // This can happen if the document that was moved and missed by Cloner
                    // subsequently got deleted and no longer exists on the Sync Target at all
                }
            }
            catch (const DBException& e) {
                error() << "exception: " << causedBy(e) << " on: " << it->toString() << endl;
                fassertFailedNoTrace(16361);
            }
        }
    }

} // namespace repl
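One detail worth highlighting from the listing above: fillWriterVectors buckets ops by a hash of their namespace, so every op for a given collection lands on the same writer thread and per-collection apply order is preserved across the pool. A small standalone sketch of the same partitioning, with std::hash as a stand-in for MurmurHash3_x86_32:

#include <cstdint>
#include <functional>
#include <string>
#include <vector>

void fillBuckets(const std::vector<std::string>& nsOfEachOp,
                 std::vector<std::vector<std::string>>* buckets) {
    for (const std::string& ns : nsOfEachOp) {
        std::uint32_t hash = static_cast<std::uint32_t>(std::hash<std::string>{}(ns));
        (*buckets)[hash % buckets->size()].push_back(ns);
    }
}

int main() {
    std::vector<std::vector<std::string>> buckets(4);
    fillBuckets({"test.a", "test.b", "test.a"}, &buckets);
    // Both "test.a" ops are now in the same bucket, in their original order.
}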
Example #12
void DocumentSourceOut::initialize() {
    DBClientBase* conn = pExpCtx->mongoProcessInterface->directClient();

    // Save the original collection options and index specs so we can check they didn't change
    // during computation.
    _originalOutOptions = pExpCtx->mongoProcessInterface->getCollectionOptions(_outputNs);
    _originalIndexes = conn->getIndexSpecs(_outputNs.ns());

    // Check if it's sharded or capped to make sure we have a chance of succeeding before we do all
    // the work. If the collection becomes capped during processing, the collection options will
    // have changed, and the $out will fail. If it becomes sharded during processing, the final
    // rename will fail.
    uassert(17017,
            str::stream() << "namespace '" << _outputNs.ns()
                          << "' is sharded so it can't be used for $out'",
            !pExpCtx->mongoProcessInterface->isSharded(pExpCtx->opCtx, _outputNs));
    uassert(17152,
            str::stream() << "namespace '" << _outputNs.ns()
                          << "' is capped so it can't be used for $out",
            _originalOutOptions["capped"].eoo());

    // We will write all results into a temporary collection, then rename the temporary collection
    // to be the target collection once we are done.
    _tempNs = NamespaceString(str::stream() << _outputNs.db() << ".tmp.agg_out."
                                            << aggOutCounter.addAndFetch(1));

    // Create output collection, copying options from existing collection if any.
    {
        BSONObjBuilder cmd;
        cmd << "create" << _tempNs.coll();
        cmd << "temp" << true;
        cmd.appendElementsUnique(_originalOutOptions);

        BSONObj info;
        bool ok = conn->runCommand(_outputNs.db().toString(), cmd.done(), info);
        uassert(16994,
                str::stream() << "failed to create temporary $out collection '" << _tempNs.ns()
                              << "': "
                              << info.toString(),
                ok);
    }

    // Copy the indexes of the output collection to the temp collection.
    for (std::list<BSONObj>::const_iterator it = _originalIndexes.begin();
         it != _originalIndexes.end();
         ++it) {
        MutableDocument index((Document(*it)));
        index.remove("_id");  // indexes shouldn't have _ids but some existing ones do
        index["ns"] = Value(_tempNs.ns());

        BSONObj indexBson = index.freeze().toBson();
        conn->insert(_tempNs.getSystemIndexesCollection(), indexBson);
        BSONObj err = conn->getLastErrorDetailed();
        uassert(16995,
                str::stream() << "copying index for $out failed."
                              << " index: "
                              << indexBson
                              << " error: "
                              << err,
                DBClientBase::getLastErrorString(err).empty());
    }
    _initialized = true;
}
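
What typically follows initialize(): once the pipeline has written every
result into the temp collection, $out swaps it into place atomically. A
minimal sketch of that final step, assuming the same connection and
namespace values as above. The "renameCollection" admin command and its
"to"/"dropTarget" fields are standard, but the helper function and the
error code below are hypothetical:

void renameTempToOutput(DBClientBase* conn,
                        const NamespaceString& tempNs,
                        const NamespaceString& outputNs) {
    // dropTarget:true replaces any existing output collection in one step.
    BSONObj cmd = BSON("renameCollection" << tempNs.ns()
                                          << "to" << outputNs.ns()
                                          << "dropTarget" << true);
    BSONObj info;
    bool ok = conn->runCommand("admin", cmd, info);
    uassert(17001,  // placeholder error code for this sketch
            str::stream() << "$out rename failed: " << info.toString(),
            ok);
}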
Example #13
File: dur.cpp  Project: Aaron20141021/mongo
        static void durThread() {
            Client::initThread("journal");

            bool samePartition = true;
            try {
                const std::string dbpathDir =
                    boost::filesystem::path(storageGlobalParams.dbpath).string();
                samePartition = onSamePartition(getJournalDir().string(), dbpathDir);
            }
            catch(...) {
                // Couldn't determine the filesystem layout; keep the default
                // assumption that the journal and data files share a partition.
            }

            while (shutdownRequested.loadRelaxed() == 0) {
                unsigned ms = storageGlobalParams.journalCommitInterval;
                if( ms == 0 ) { 
                    ms = samePartition ? 100 : 30;
                }

                unsigned oneThird = (ms / 3) + 1; // +1 so never zero

                try {
                    stats.rotate();

                    boost::mutex::scoped_lock lock(flushMutex);

                    // commit sooner if one or more getLastError j:true is pending
                    for (unsigned i = 0; i <= 2; i++) {
                        if (flushRequested.timed_wait(lock,
                                                      Milliseconds(oneThird))) {
                            // Someone forced a flush
                            break;
                        }

                        if (commitJob._notify.nWaiting())
                            break;
                        if (commitJob.bytes() > UncommittedBytesLimit / 2)
                            break;
                    }

                    OperationContextImpl txn;

                    // Waits for all active operations to drain and won't let new ones start. This
                    // should be optimized to allow readers in (see SERVER-15262).
                    AutoAcquireFlushLockForMMAPV1Commit flushLock(txn.lockState());

                    groupCommit();
                    remapPrivateView();
                }
                catch(std::exception& e) {
                    log() << "exception in durThread causing immediate shutdown: " << e.what() << endl;
                    mongoAbort("exception in durThread");
                }
                catch (...) {
                    log() << "unhandled exception in durThread causing immediate shutdown" << endl;
                    mongoAbort("unhandled exception in durThread");
                }
            }

            cc().shutdown();
        }
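
The interval policy at the top of the loop is easy to misread, so here it
is restated as a standalone sketch (hypothetical helper, not part of
dur.cpp): a configured journalCommitInterval of 0 means "pick a default",
and the default is longer when the journal shares a partition with the
data files, because every commit then competes with data-file I/O:

unsigned effectiveCommitIntervalMs(unsigned configuredMs, bool samePartition) {
    if (configuredMs == 0)
        return samePartition ? 100 : 30;
    return configuredMs;
}

The wait is then sliced into thirds ((ms / 3) + 1, so never zero) so that a
pending getLastError j:true request triggers a group commit after at most
roughly a third of the interval.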
Example #14
namespace repl {
#ifdef MONGO_PLATFORM_64
    const int replWriterThreadCount = 16;
    const int replPrefetcherThreadCount = 16;
#else
    const int replWriterThreadCount = 2;
    const int replPrefetcherThreadCount = 2;
#endif

    static Counter64 opsAppliedStats;

    // The number of oplog entries applied
    static ServerStatusMetricField<Counter64> displayOpsApplied( "repl.apply.ops",
                                                                &opsAppliedStats );

    MONGO_FP_DECLARE(rsSyncApplyStop);

    // Number and time of each ApplyOps worker pool round
    static TimerStats applyBatchStats;
    static ServerStatusMetricField<TimerStats> displayOpBatchesApplied(
                                                    "repl.apply.batches",
                                                    &applyBatchStats );
    void initializePrefetchThread() {
        if (!ClientBasic::getCurrent()) {
            Client::initThread("repl prefetch worker");
            replLocalAuth();
        }
    }

    SyncTail::SyncTail(BackgroundSyncInterface *q, MultiSyncApplyFunc func) :
        Sync(""), 
        _networkQueue(q), 
        _applyFunc(func),
        _writerPool(replWriterThreadCount),
        _prefetcherPool(replPrefetcherThreadCount)
    {}

    SyncTail::~SyncTail() {}

    bool SyncTail::peek(BSONObj* op) {
        return _networkQueue->peek(op);
    }
    /* Apply the oplog entry in 'op'.
       @return bool success (true) or failure (false)
    */
    bool SyncTail::syncApply(
                        OperationContext* txn, const BSONObj &op, bool convertUpdateToUpsert) {
        const char *ns = op.getStringField("ns");
        verify(ns);

        if ( (*ns == '\0') || (*ns == '.') ) {
            // An empty or malformed ns is usually a harmless no-op entry,
            // but we can't be 100% sure, so complain about anything that
            // isn't an explicit no-op ('n').
            if( *op.getStringField("op") != 'n' ) {
                error() << "replSet skipping bad op in oplog: " << op.toString() << rsLog;
            }
            return true;
        }

        const bool isCommand = (op["op"].valuestrsafe()[0] == 'c');

        boost::scoped_ptr<Lock::ScopedLock> lk;

        if(isCommand) {
            // a command may need a global write lock. so we will conservatively go 
            // ahead and grab one here. suboptimal. :-(
            lk.reset(new Lock::GlobalWrite(txn->lockState()));
        } else {
            // DB level lock for this operation
            lk.reset(new Lock::DBLock(txn->lockState(), nsToDatabaseSubstring(ns), MODE_X));
        }

        Client::Context ctx(txn, ns);
        ctx.getClient()->curop()->reset();
        // For non-initial-sync, we convert updates to upserts
        // to suppress errors when replaying oplog entries.
        bool ok = !applyOperation_inlock(txn, ctx.db(), op, true, convertUpdateToUpsert);
        opsAppliedStats.increment();

        return ok;
    }
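
    // A concrete illustration of why convertUpdateToUpsert exists; the
    // oplog entry below is hypothetical but uses the standard field layout
    // ('op' type, 'ns' namespace, 'o2' query, 'o' update spec). During
    // steady-state replication an update can arrive for a document that no
    // longer exists locally; replayed as a plain update it would fail, but
    // replayed as an upsert it is harmless, keeping application idempotent.
    static const BSONObj kExampleUpdateOp =
        BSON("op" << "u"                   // an update
             << "ns" << "test.coll"        // hypothetical namespace
             << "o2" << BSON("_id" << 1)   // identifies the target document
             << "o" << BSON("$set" << BSON("a" << 2)));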

    // The pool threads call this to prefetch each op
    void SyncTail::prefetchOp(const BSONObj& op) {
        initializePrefetchThread();

        const char *ns = op.getStringField("ns");
        if (ns && (ns[0] != '\0')) {
            try {
                // one possible tweak here would be to stay in the read lock for this database 
                // for multiple prefetches if they are for the same database.
                OperationContextImpl txn;
                AutoGetCollectionForRead ctx(&txn, ns);
                prefetchPagesForReplicatedOp(&txn, ctx.getDb(), op);
            }
            catch (const DBException& e) {
                LOG(2) << "ignoring exception in prefetchOp(): " << e.what() << endl;
            }
            catch (const std::exception& e) {
                log() << "Unhandled std::exception in prefetchOp(): " << e.what() << endl;
                fassertFailed(16397);
            }
        }
    }

    // Doles out all the work to the reader pool threads and waits for them to complete
    void SyncTail::prefetchOps(const std::deque<BSONObj>& ops) {
        for (std::deque<BSONObj>::const_iterator it = ops.begin();
             it != ops.end();
             ++it) {
            _prefetcherPool.schedule(&prefetchOp, *it);
        }
        _prefetcherPool.join();
    }
    
    // Doles out all the work to the writer pool threads and waits for them to complete
    void SyncTail::applyOps(const std::vector< std::vector<BSONObj> >& writerVectors) {
        TimerHolder timer(&applyBatchStats);
        for (std::vector< std::vector<BSONObj> >::const_iterator it = writerVectors.begin();
             it != writerVectors.end();
             ++it) {
            if (!it->empty()) {
                _writerPool.schedule(_applyFunc, boost::cref(*it), this);
            }
        }
        _writerPool.join();
    }

    // Prefetches the batch, partitions it among the writer threads, applies it, and waits for completion
    void SyncTail::multiApply( std::deque<BSONObj>& ops) {

        // Use a ThreadPool to prefetch all the operations in a batch.
        prefetchOps(ops);
        
        std::vector< std::vector<BSONObj> > writerVectors(replWriterThreadCount);
        fillWriterVectors(ops, &writerVectors);
        LOG(2) << "replication batch size is " << ops.size() << endl;
        // We must grab this because we're going to grab write locks later.
        // We hold this mutex the entire time we're writing; it doesn't matter
        // because all readers are blocked anyway.
        SimpleMutex::scoped_lock fsynclk(filesLockedFsync);

        // stop all readers until we're done
        Lock::ParallelBatchWriterMode pbwm;

        applyOps(writerVectors);
    }


    void SyncTail::fillWriterVectors(const std::deque<BSONObj>& ops, 
                                              std::vector< std::vector<BSONObj> >* writerVectors) {
        for (std::deque<BSONObj>::const_iterator it = ops.begin();
             it != ops.end();
             ++it) {
            const BSONElement e = it->getField("ns");
            verify(e.type() == String);
            const char* ns = e.valuestr();
            int len = e.valuestrsize();
            uint32_t hash = 0;
            MurmurHash3_x86_32( ns, len, 0, &hash);

            (*writerVectors)[hash % writerVectors->size()].push_back(*it);
        }
    }
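
    // The invariant fillWriterVectors relies on, restated as a standalone
    // sketch (hypothetical helper): ops on the same namespace always hash
    // to the same writer vector, so their relative order is preserved,
    // while ops on different namespaces may be applied in parallel.
    static size_t writerFor(const BSONElement& nsElem, size_t numWriters) {
        uint32_t hash = 0;
        // valuestrsize() includes the terminating NUL, matching the loop above.
        MurmurHash3_x86_32(nsElem.valuestr(), nsElem.valuestrsize(), 0, &hash);
        return hash % numWriters;
    }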
    void SyncTail::oplogApplication(OperationContext* txn, const OpTime& endOpTime) {
        _applyOplogUntil(txn, endOpTime);
    }

    /* applies oplog from "now" until endOpTime using the applier threads for initial sync*/
    void SyncTail::_applyOplogUntil(OperationContext* txn, const OpTime& endOpTime) {
        unsigned long long bytesApplied = 0;
        unsigned long long entriesApplied = 0;
        while (true) {
            OpQueue ops;
            OperationContextImpl ctx;

            while (!tryPopAndWaitForMore(&ops, getGlobalReplicationCoordinator())) {
                // nothing came back last time, so go again
                if (ops.empty()) continue;

                // Check if we reached the end
                const BSONObj currentOp = ops.back();
                const OpTime currentOpTime = currentOp["ts"]._opTime();

                // When we reach the end return this batch
                if (currentOpTime == endOpTime) {
                    break;
                }
                else if (currentOpTime > endOpTime) {
                    severe() << "Applied past expected end " << endOpTime << " to " << currentOpTime
                            << " without seeing it. Rollback?" << rsLog;
                    fassertFailedNoTrace(18693);
                }

                // apply replication batch limits
                if (ops.getSize() > replBatchLimitBytes)
                    break;
                if (ops.getDeque().size() > replBatchLimitOperations)
                    break;
            }

            if (ops.empty()) {
                severe() << "got no ops for batch...";
                fassertFailedNoTrace(18692);
            }

            const BSONObj lastOp = ops.back().getOwned();

            // Tally operation information
            bytesApplied += ops.getSize();
            entriesApplied += ops.getDeque().size();

            multiApply(ops.getDeque());
            OpTime lastOpTime = applyOpsToOplog(&ops.getDeque());

            // if the last op applied was our end, return
            if (lastOpTime == endOpTime) {
                LOG(1) << "SyncTail applied " << entriesApplied
                       << " entries (" << bytesApplied << " bytes)"
                       << " and finished at opTime " << endOpTime.toStringPretty();
                return;
            }
        } // end of while (true)
    }

namespace {
    void tryToGoLiveAsASecondary(OperationContext* txn, ReplicationCoordinator* replCoord) {
        Lock::GlobalRead readLock(txn->lockState());

        if (replCoord->getMaintenanceMode()) {
            // we're not actually going live
            return;
        }

        // Only state RECOVERING can transition to SECONDARY.
        MemberState state(replCoord->getCurrentMemberState());
        if (!state.recovering()) {
            return;
        }

        OpTime minvalid = getMinValid(txn);
        if (minvalid > replCoord->getMyLastOptime()) {
            return;
        }

        replCoord->setFollowerMode(MemberState::RS_SECONDARY);
    }
}
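
    // The minValid comparison above is the crash-safety half of the
    // batching protocol: oplogApplication() below calls setMinValid() with
    // the last optime of a batch before applying it, so a node that
    // crashes mid-batch restarts in RECOVERING and stays there until it
    // has applied through minValid. The gate, as a hypothetical standalone
    // helper (OpTime's comparison operators are assumed):
    static bool safeToBecomeSecondary(const OpTime& minValid, const OpTime& lastApplied) {
        return minValid <= lastApplied;
    }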

    /* tail an oplog.  ok to return, will be re-called. */
    void SyncTail::oplogApplication() {
        ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();

        while(!inShutdown()) {
            OpQueue ops;
            OperationContextImpl txn;

            Timer batchTimer;
            int lastTimeChecked = 0;

            do {
                int now = batchTimer.seconds();

                // apply replication batch limits
                if (!ops.empty()) {
                    if (now > replBatchLimitSeconds)
                        break;
                    if (ops.getDeque().size() > replBatchLimitOperations)
                        break;
                }
                // occasionally check some things
                // (always checked in the first iteration of this do-while loop, because
                // ops is empty)
                if (ops.empty() || now > lastTimeChecked) {
                    BackgroundSync* bgsync = BackgroundSync::get();
                    if (bgsync->getInitialSyncRequestedFlag()) {
                        // got a resync command
                        Lock::DBLock lk(txn.lockState(), "local", MODE_X);
                        WriteUnitOfWork wunit(&txn);
                        Client::Context ctx(&txn, "local");

                        ctx.db()->dropCollection(&txn, "local.oplog.rs");

                        // Note: the following order is important.
                        // The bgsync thread uses an empty optime as a sentinel to know to wait
                        // for initial sync (done in this thread after we return); thus, we must
                        // ensure the lastAppliedOptime is empty before pausing the bgsync thread
                        // via stop().
                        // We must clear the sync source blacklist after calling stop()
                        // because the bgsync thread, while running, may update the blacklist.
                        replCoord->setMyLastOptime(&txn, OpTime());
                        bgsync->stop();
                        replCoord->clearSyncSourceBlacklist();

                        wunit.commit();

                        return;
                    }
                    lastTimeChecked = now;
                    // can we become secondary?
                    // we have to check this before calling mgr, as we must be a secondary to
                    // become primary
                    tryToGoLiveAsASecondary(&txn, replCoord);

                    // TODO(emilkie): This can be removed once we switch over from legacy;
                    // this code is what moves 1-node sets to PRIMARY state.
                    // normally msgCheckNewState gets called periodically, but in a single node
                    // replset there are no heartbeat threads, so we do it here to be sure.  this is
                    // relevant if the singleton member has done a stepDown() and needs to come back
                    // up.
                    if (theReplSet &&
                            theReplSet->config().members.size() == 1 &&
                            theReplSet->myConfig().potentiallyHot()) {
                        Manager* mgr = theReplSet->mgr;
                        // When would mgr be null?  During replsettest'ing, in which case we should
                        // fall through and actually apply ops as if we were a real secondary.
                        if (mgr) { 
                            mgr->send(stdx::bind(&Manager::msgCheckNewState, theReplSet->mgr));
                            sleepsecs(1);
                            // There should never be ops to sync in a 1-member set, anyway
                            return;
                        }
                    }
                }

                const int slaveDelaySecs = replCoord->getSlaveDelaySecs().total_seconds();
                if (!ops.empty() && slaveDelaySecs > 0) {
                    const BSONObj& lastOp = ops.getDeque().back();
                    const unsigned int opTimestampSecs = lastOp["ts"]._opTime().getSecs();

                    // Stop the batch as the lastOp is too new to be applied. If we continue
                    // on, we can get ops that are way ahead of the delay and this will
                    // make this thread sleep longer when handleSlaveDelay is called
                    // and apply ops much sooner than we like.
                    if (opTimestampSecs > static_cast<unsigned int>(time(0) - slaveDelaySecs)) {
                        break;
                    }
                }
                // keep fetching more ops as long as we haven't filled up a full batch yet
            } while (!tryPopAndWaitForMore(&ops, replCoord) && // tryPopAndWaitForMore returns true 
                                                               // when we need to end a batch early
                   (ops.getSize() < replBatchLimitBytes) &&
                   !inShutdown());

            // For pausing replication in tests
            while (MONGO_FAIL_POINT(rsSyncApplyStop)) {
                sleepmillis(0);
            }

            if (ops.empty()) {
                continue;
            }

            const BSONObj& lastOp = ops.getDeque().back();
            handleSlaveDelay(lastOp);

            if (replCoord->getCurrentMemberState().primary() && 
                !replCoord->isWaitingForApplierToDrain()) {
                severe() << "attempting to replicate ops while primary";
                fassertFailed(28527);
            }

            // Set minValid to the last op to be applied in this next batch.
            // This will cause this node to go into RECOVERING state
            // if we should crash and restart before updating the oplog
            OpTime minValid = lastOp["ts"]._opTime();
            setMinValid(&txn, minValid);

            multiApply(ops.getDeque());

            applyOpsToOplog(&ops.getDeque());

            // If we're just testing (no manager), don't keep looping if we exhausted the bgqueue
            // TODO(spencer): Remove repltest.cpp dbtest or make this work with the new replication
            // coordinator
            if (theReplSet && !theReplSet->mgr) {
                BSONObj op;
                if (!peek(&op)) {
                    return;
                }
            }
        }
    }

    // Copies ops out of the bgsync queue into the deque passed in as a parameter.
    // Returns true if the batch should be ended early.
    // Batch should end early if we encounter a command, or if
    // there are no further ops in the bgsync queue to read.
    // This function also blocks 1 second waiting for new ops to appear in the bgsync
    // queue.  We can't block forever because there are maintenance things we need
    // to periodically check in the loop.
    bool SyncTail::tryPopAndWaitForMore(SyncTail::OpQueue* ops, ReplicationCoordinator* replCoord) {
        BSONObj op;
        // Check to see if there are ops waiting in the bgsync queue
        bool peek_success = peek(&op);

        if (!peek_success) {
            // if we don't have anything in the queue, wait a bit for something to appear
            if (ops->empty()) {
                replCoord->signalDrainComplete();
                // block up to 1 second
                _networkQueue->waitForMore();
                return false;
            }

            // otherwise, apply what we have
            return true;
        }

        const char* ns = op["ns"].valuestrsafe();

        // check for commands
        if ((op["op"].valuestrsafe()[0] == 'c') ||
            // Index builds are achieved through the use of an insert op, not a command op.
            // The following line is the same as what the insert code uses to detect an index build.
            ( *ns != '\0' && nsToCollectionSubstring(ns) == "system.indexes" )) {

            if (ops->empty()) {
                // apply commands one-at-a-time
                ops->push_back(op);
                _networkQueue->consume();
            }

            // otherwise, apply what we have so far and come back for the command
            return true;
        }

        // check for oplog version change
        BSONElement elemVersion = op["v"];
        int curVersion = 0;
        if (elemVersion.eoo())
            // missing version means version 1
            curVersion = 1;
        else
            curVersion = elemVersion.Int();
        
        if (curVersion != OPLOG_VERSION) {
            severe() << "expected oplog version " << OPLOG_VERSION << " but found version " 
                     << curVersion << " in oplog entry: " << op;
            fassertFailedNoTrace(18820);
        }
    
        // Copy the op to the deque and remove it from the bgsync queue.
        ops->push_back(op);
        _networkQueue->consume();

        // Go back for more ops
        return false;
    }
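
    // Hypothetical entries illustrating the batching rules above: a plain
    // insert keeps the batch open, while a command ends it, and so does an
    // index build, which arrives as an insert into system.indexes rather
    // than as a command. Namespaces and documents here are made up.
    static const BSONObj kBatchableInsert =
        BSON("op" << "i" << "ns" << "test.coll" << "o" << BSON("_id" << 1));
    static const BSONObj kBatchEndingCommand =
        BSON("op" << "c" << "ns" << "test.$cmd" << "o" << BSON("drop" << "coll"));
    static const BSONObj kBatchEndingIndexBuild =
        BSON("op" << "i" << "ns" << "test.system.indexes"
                  << "o" << BSON("ns" << "test.coll"
                                 << "key" << BSON("a" << 1)
                                 << "name" << "a_1"));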

    OpTime SyncTail::applyOpsToOplog(std::deque<BSONObj>* ops) {
        OpTime lastOpTime;
        {
            OperationContextImpl txn; // XXX?
            Lock::DBLock lk(txn.lockState(), "local", MODE_X);
            WriteUnitOfWork wunit(&txn);

            while (!ops->empty()) {
                const BSONObj& op = ops->front();
                // this updates lastOpTimeApplied
                lastOpTime = _logOpObjRS(&txn, op);
                ops->pop_front();
             }
            wunit.commit();
        }

        // Update write concern on primary
        BackgroundSync::get()->notify();
        return lastOpTime;
    }

    void SyncTail::handleSlaveDelay(const BSONObj& lastOp) {
        ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();
        int slaveDelaySecs = replCoord->getSlaveDelaySecs().total_seconds();

        // ignore slaveDelay if the box is still initializing. once
        // it becomes secondary we can worry about it.
        if( slaveDelaySecs > 0 && replCoord->getCurrentMemberState().secondary() ) {
            const OpTime ts = lastOp["ts"]._opTime();
            long long a = ts.getSecs();
            long long b = time(0);
            long long lag = b - a;
            long long sleeptime = slaveDelaySecs - lag;
            if( sleeptime > 0 ) {
                uassert(12000, "rs slaveDelay differential too big check clocks and systems",
                        sleeptime < 0x40000000);
                if( sleeptime < 60 ) {
                    sleepsecs((int) sleeptime);
                }
                else {
                    warning() << "replSet slavedelay causing a long sleep of " << sleeptime
                              << " seconds" << rsLog;
                    // sleep(hours) would prevent reconfigs from taking effect & such!
                    long long waitUntil = b + sleeptime;
                    while(time(0) < waitUntil) {
                        sleepsecs(6);

                        // Handle reconfigs that changed the slave delay
                        if (replCoord->getSlaveDelaySecs().total_seconds() != slaveDelaySecs)
                            break;
                    }
                }
            }
        } // endif slaveDelay
    }
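
    // A worked example of the arithmetic above, with hypothetical numbers:
    // slaveDelaySecs = 3600 and an op whose ts is 600 seconds old give
    // lag = 600 and sleeptime = 3000. Since 3000 >= 60, the thread sleeps
    // in 6-second slices for roughly 50 minutes, rechecking the configured
    // delay on each slice so a reconfig can end the wait early instead of
    // blindly sleeping the whole interval.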

    static AtomicUInt32 replWriterWorkerId;

    static void initializeWriterThread() {
        // Only do this once per thread
        if (!ClientBasic::getCurrent()) {
            string threadName = str::stream() << "repl writer worker "
                                              << replWriterWorkerId.addAndFetch(1);
            Client::initThread( threadName.c_str() );
            replLocalAuth();
        }
    }

    // This free function is used by the writer threads to apply each op
    void multiSyncApply(const std::vector<BSONObj>& ops, SyncTail* st) {
        initializeWriterThread();

        OperationContextImpl txn;

        // allow us to get through the magic barrier
        Lock::ParallelBatchWriterMode::iAmABatchParticipant(txn.lockState());

        bool convertUpdatesToUpserts = true;

        for (std::vector<BSONObj>::const_iterator it = ops.begin();
             it != ops.end();
             ++it) {
            try {
                if (!st->syncApply(&txn, *it, convertUpdatesToUpserts)) {
                    fassertFailedNoTrace(16359);
                }
            } catch (const DBException& e) {
                error() << "writer worker caught exception: " << causedBy(e)
                        << " on: " << it->toString() << endl;
                fassertFailedNoTrace(16360);
            }
        }
    }

    // This free function is used by the initial sync writer threads to apply each op
    void multiInitialSyncApply(const std::vector<BSONObj>& ops, SyncTail* st) {
        initializeWriterThread();

        OperationContextImpl txn;

        // allow us to get through the magic barrier
        Lock::ParallelBatchWriterMode::iAmABatchParticipant(txn.lockState());

        for (std::vector<BSONObj>::const_iterator it = ops.begin();
             it != ops.end();
             ++it) {
            try {
                if (!st->syncApply(&txn, *it)) {
                    bool status;
                    {
                        Lock::GlobalWrite lk(txn.lockState());
                        status = st->shouldRetry(&txn, *it);
                    }

                    if (status) {
                        // retry
                        if (!st->syncApply(&txn, *it)) {
                            fassertFailedNoTrace(15915);
                        }
                    }
                    // If shouldRetry() returns false, fall through.
                    // This can happen if the document that was moved and missed by Cloner
                    // subsequently got deleted and no longer exists on the Sync Target at all
                }
            }
            catch (const DBException& e) {
                error() << "exception: " << causedBy(e) << " on: " << it->toString() << endl;
                fassertFailedNoTrace(16361);
            }
        }
    }

} // namespace repl
Example #15
File: dur.cpp  Project: Aaron20141021/mongo
 void* DurableImpl::writingPtr(void *x, unsigned len) {
     dassert(shutdownRequested.load() == 0);
     void *p = x;
     declareWriteIntent(p, len);
     return p;
 }
Example #16
File: dur.cpp  Project: Aaron20141021/mongo
 void* NonDurableImpl::writingPtr(void *x, unsigned len) {
     dassert(shutdownRequested.load() == 0);
     return x; 
 }
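
Both implementations share one contract: callers route every data-file
write through writingPtr() and then store through the returned pointer, so
DurableImpl can journal the range before it changes while NonDurableImpl
simply hands the pointer back. A minimal usage sketch, assuming the
getDur() accessor from this codebase; the helper itself is hypothetical:

void bumpCounter(int* mapped) {  // 'mapped' points into a mapped data file
    int* w = static_cast<int*>(getDur().writingPtr(mapped, sizeof(int)));
    *w += 1;  // under DurableImpl this store is covered by a write intent
}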
Example #17
void go() {
    verify( options["r"].trueValue() || options["w"].trueValue() );

    recSizeKB = options["recSizeKB"].numberInt();
    if( recSizeKB == 0 )
        recSizeKB = 4;
    verify( recSizeKB <= 64000 && recSizeKB > 0 );

    MemoryMappedFile f;
    cout << "creating test file size:";
    len = options["fileSizeMB"].numberLong();
    if( len == 0 ) len = 1;
    cout << len << "MB ..." << endl;

    if( 0 && len > 2000 && !options["mmf"].trueValue() ) { 
        // todo make tests use 64 bit offsets in their i/o -- i.e. adjust LogFile::writeAt and such
        cout << "\nsizes > 2GB not yet supported with mmf:false" << endl; 
        return;
    }
    len *= 1024 * 1024;
    const char *fname = "./mongoperf__testfile__tmp";
    try {
        boost::filesystem::remove(fname);
    }
    catch(...) { 
        cout << "error deleting file " << fname << endl;
        return;
    }
    lf = new LogFile(fname,true);
    const unsigned sz = 1024 * 1024 * 32; // needs to be big as we are using synchronousAppend.  if we used a regular MongoFile it wouldn't have to be
    char *buf = (char*) mongoMalloc(sz+4096);
    const char *p = round(buf);
    for( unsigned long long i = 0; i < len; i += sz ) { 
        lf->synchronousAppend(p, sz);
        if( i % (1024ULL*1024*1024) == 0 && i ) {
            cout << i / (1024ULL*1024*1024) << "GB..." << endl;
        }
    }
    BSONObj& o = options;

    if( o["mmf"].trueValue() ) { 
        delete lf;
        lf = 0;
        mmfFile = new MemoryMappedFile();
        mmf = (char *) mmfFile->map(fname);
        verify( mmf );

        syncDelaySecs = options["syncDelay"].numberInt();
        if( syncDelaySecs ) {
            boost::thread t(syncThread);
        }
    }

    cout << "testing..."<< endl;

    cout << "options:" << o.toString() << endl;
    unsigned wthr = 1;
    if( !o["nThreads"].eoo() ) {
        wthr = (unsigned) o["nThreads"].Int();
    }
    cout << "wthr " << wthr << endl;

    if( wthr < 1 ) { 
        cout << "bad threads field value" << endl;
        return;
    }
    unsigned i = 0;
    unsigned d = 1;
    unsigned &nthr = nThreadsRunning;
    while( 1 ) {
        if( i++ % 8 == 0 ) {
            if( nthr < wthr ) {
                while( nthr < wthr && nthr < d ) {
                    nthr++;
                    boost::thread w(workerThread);
                }
                cout << "new thread, total running : " << nthr << endl;
                d *= 2;
            }
        }
        sleepsecs(1);
        unsigned long long w = iops.loadRelaxed();
        iops.store(0);
        // w accumulated over exactly 1 second, so it is already ops/sec
        cout << w << " ops/sec ";
        if( mmf == 0 ) {
            // in mmf mode each op writes only 4 bytes, so MB/sec would mislead
            cout << (w * PG / 1024 / 1024) << " MB/sec";
        }
        cout << endl;
    }
}
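
The worker ramp in the main loop above is geometric: every eighth tick the
allowance d doubles, and threads are started up to min(wthr, d), so the
running count grows 1, 2, 4, 8, ... until it reaches wthr. A standalone,
runnable sketch of just the ramp (thread starts reduced to a counter):

#include <iostream>

int main() {
    unsigned wthr = 8, nthr = 0, d = 1;
    for (unsigned i = 0; i < 33; i++) {
        if (i % 8 == 0 && nthr < wthr) {
            while (nthr < wthr && nthr < d)
                nthr++;          // a real worker thread would start here
            d *= 2;
        }
    }
    std::cout << "threads: " << nthr << std::endl;  // prints "threads: 8"
    return 0;
}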