void ReplicationCoordinatorExternalStateImpl::dropAllTempCollections(OperationContext* txn) {
        std::vector<std::string> dbNames;
        StorageEngine* storageEngine = getGlobalEnvironment()->getGlobalStorageEngine();
        storageEngine->listDatabases(&dbNames);

        for (std::vector<std::string>::iterator it = dbNames.begin(); it != dbNames.end(); ++it) {
            // The local db is special because it isn't replicated. It is cleared at startup even on
            // replica set members.
            if (*it == "local")
                continue;
            LOG(2) << "Removing temporary collections from " << *it;
            Database* db = dbHolder().get(txn, *it);
            // Since we must be holding the global lock during this function, if listDatabases
            // returned this dbname, we should be able to get a reference to it - it can't have
            // been dropped.
            invariant(db);
            db->clearTmpCollections(txn);
        }
    }
示例#2
0
    void dropAllDatabasesExceptLocal(OperationContext* txn) {
        Lock::GlobalWrite lk(txn->lockState());

        vector<string> n;
        StorageEngine* storageEngine = getGlobalEnvironment()->getGlobalStorageEngine();
        storageEngine->listDatabases(&n);

        if( n.size() == 0 ) return;
        log() << "dropAllDatabasesExceptLocal " << n.size() << endl;

        for (vector<string>::iterator i = n.begin(); i != n.end(); i++) {
            if (*i != "local") {
                Database* db = dbHolder().get(txn, *i);
                invariant(db);

                dropDatabase(txn, db);
            }
        }
    }
示例#3
0
    void DataFileSync::run() {
        Client::initThread( name().c_str() );

        if (mmapv1GlobalOptions.syncdelay == 0) {
            log() << "warning: --syncdelay 0 is not recommended and can have strange performance" << endl;
        }
        else if (mmapv1GlobalOptions.syncdelay == 1) {
            log() << "--syncdelay 1" << endl;
        }
        else if (mmapv1GlobalOptions.syncdelay != 60) {
            LOG(1) << "--syncdelay " << mmapv1GlobalOptions.syncdelay << endl;
        }
        int time_flushing = 0;
        while ( ! inShutdown() ) {
            _diaglog.flush();
            if (mmapv1GlobalOptions.syncdelay == 0) {
                // in case at some point we add an option to change at runtime
                sleepsecs(5);
                continue;
            }

            sleepmillis((long long) std::max(0.0, (mmapv1GlobalOptions.syncdelay * 1000) - time_flushing));

            if ( inShutdown() ) {
                // occasional issue trying to flush during shutdown when sleep interrupted
                break;
            }

            Date_t start = jsTime();
            StorageEngine* storageEngine = getGlobalEnvironment()->getGlobalStorageEngine();
            int numFiles = storageEngine->flushAllFiles( true );
            time_flushing = (int) (jsTime() - start);

            _flushed(time_flushing);

            if( logger::globalLogDomain()->shouldLog(logger::LogSeverity::Debug(1)) || time_flushing >= 10000 ) {
                log() << "flushing mmaps took " << time_flushing << "ms " << " for " << numFiles << " files" << endl;
            }
        }
    }
示例#4
0
    std::string newRunQuery(OperationContext* txn,
                            Message& m,
                            QueryMessage& q,
                            CurOp& curop,
                            Message &result,
                            bool fromDBDirectClient) {
        // Validate the namespace.
        const char *ns = q.ns;
        uassert(16332, "can't have an empty ns", ns[0]);

        const NamespaceString nsString(ns);
        uassert(16256, str::stream() << "Invalid ns [" << ns << "]", nsString.isValid());

        // Set curop information.
        curop.debug().ns = ns;
        curop.debug().ntoreturn = q.ntoreturn;
        curop.debug().query = q.query;
        curop.setQuery(q.query);

        // If the query is really a command, run it.
        if (nsString.isCommand()) {
            int nToReturn = q.ntoreturn;
            uassert(16979, str::stream() << "bad numberToReturn (" << nToReturn
                                         << ") for $cmd type ns - can only be 1 or -1",
                    nToReturn == 1 || nToReturn == -1);

            curop.markCommand();

            BufBuilder bb;
            bb.skip(sizeof(QueryResult::Value));

            BSONObjBuilder cmdResBuf;
            if (!runCommands(txn, ns, q.query, curop, bb, cmdResBuf, false, q.queryOptions)) {
                uasserted(13530, "bad or malformed command request?");
            }

            curop.debug().iscommand = true;
            // TODO: Does this get overwritten/do we really need to set this twice?
            curop.debug().query = q.query;

            QueryResult::View qr = bb.buf();
            bb.decouple();
            qr.setResultFlagsToOk();
            qr.msgdata().setLen(bb.len());
            curop.debug().responseLength = bb.len();
            qr.msgdata().setOperation(opReply);
            qr.setCursorId(0);
            qr.setStartingFrom(0);
            qr.setNReturned(1);
            result.setData(qr.view2ptr(), true);
            return "";
        }

        const NamespaceString nss(q.ns);

        // Parse the qm into a CanonicalQuery.
        CanonicalQuery* cq;
        Status canonStatus = CanonicalQuery::canonicalize(
                                    q, &cq, WhereCallbackReal(txn, StringData(nss.db())));
        if (!canonStatus.isOK()) {
            uasserted(17287, str::stream() << "Can't canonicalize query: " << canonStatus.toString());
        }

        QLOG() << "Running query:\n" << cq->toString();
        LOG(2) << "Running query: " << cq->toStringShort();

        // Parse, canonicalize, plan, transcribe, and get a plan executor.
        PlanExecutor* rawExec = NULL;

        // We use this a lot below.
        const LiteParsedQuery& pq = cq->getParsed();

        AutoGetCollectionForRead ctx(txn, nss);

        const int dbProfilingLevel = (ctx.getDb() != NULL) ? ctx.getDb()->getProfilingLevel() :
                                                             serverGlobalParams.defaultProfile;

        Collection* collection = ctx.getCollection();

        // We'll now try to get the query executor that will execute this query for us. There
        // are a few cases in which we know upfront which executor we should get and, therefore,
        // we shortcut the selection process here.
        //
        // (a) If the query is over a collection that doesn't exist, we use an EOFStage.
        //
        // (b) if the query is a replication's initial sync one, we use a specifically designed
        // stage that skips extents faster (see details in exec/oplogstart.h).
        //
        // Otherwise we go through the selection of which executor is most suited to the
        // query + run-time context at hand.
        Status status = Status::OK();
        if (NULL != collection && pq.getOptions().oplogReplay) {
            // Takes ownership of 'cq'.
            status = getOplogStartHack(txn, collection, cq, &rawExec);
        }
        else {
            size_t options = QueryPlannerParams::DEFAULT;
            if (shardingState.needCollectionMetadata(pq.ns())) {
                options |= QueryPlannerParams::INCLUDE_SHARD_FILTER;
            }
            // Takes ownership of 'cq'.
            status = getExecutor(txn, collection, cq, PlanExecutor::YIELD_AUTO, &rawExec, options);
        }

        if (!status.isOK()) {
            // NOTE: Do not access cq as getExecutor has deleted it.
            uasserted(17007, "Unable to execute query: " + status.reason());
        }

        verify(NULL != rawExec);
        auto_ptr<PlanExecutor> exec(rawExec);

        // If it's actually an explain, do the explain and return rather than falling through
        // to the normal query execution loop.
        if (pq.isExplain()) {
            BufBuilder bb;
            bb.skip(sizeof(QueryResult::Value));

            BSONObjBuilder explainBob;
            Explain::explainStages(exec.get(), ExplainCommon::EXEC_ALL_PLANS, &explainBob);

            // Add the resulting object to the return buffer.
            BSONObj explainObj = explainBob.obj();
            bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize());

            curop.debug().iscommand = true;
            // TODO: Does this get overwritten/do we really need to set this twice?
            curop.debug().query = q.query;

            // Set query result fields.
            QueryResult::View qr = bb.buf();
            bb.decouple();
            qr.setResultFlagsToOk();
            qr.msgdata().setLen(bb.len());
            curop.debug().responseLength = bb.len();
            qr.msgdata().setOperation(opReply);
            qr.setCursorId(0);
            qr.setStartingFrom(0);
            qr.setNReturned(1);
            result.setData(qr.view2ptr(), true);
            return "";
        }

        // We freak out later if this changes before we're done with the query.
        const ChunkVersion shardingVersionAtStart = shardingState.getVersion(cq->ns());

        // Handle query option $maxTimeMS (not used with commands).
        curop.setMaxTimeMicros(static_cast<unsigned long long>(pq.getMaxTimeMS()) * 1000);
        txn->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

        // uassert if we are not on a primary, and not a secondary with SlaveOk query parameter set.
        bool slaveOK = pq.getOptions().slaveOk || pq.hasReadPref();
        status = repl::getGlobalReplicationCoordinator()->checkCanServeReadsFor(
                txn,
                NamespaceString(cq->ns()),
                slaveOK);
        uassertStatusOK(status);

        // If this exists, the collection is sharded.
        // If it doesn't exist, we can assume we're not sharded.
        // If we're sharded, we might encounter data that is not consistent with our sharding state.
        // We must ignore this data.
        CollectionMetadataPtr collMetadata;
        if (!shardingState.needCollectionMetadata(pq.ns())) {
            collMetadata = CollectionMetadataPtr();
        }
        else {
            collMetadata = shardingState.getCollectionMetadata(pq.ns());
        }

        // Run the query.
        // bb is used to hold query results
        // this buffer should contain either requested documents per query or
        // explain information, but not both
        BufBuilder bb(32768);
        bb.skip(sizeof(QueryResult::Value));

        // How many results have we obtained from the executor?
        int numResults = 0;

        // If we're replaying the oplog, we save the last time that we read.
        OpTime slaveReadTill;

        // Do we save the PlanExecutor in a ClientCursor for getMore calls later?
        bool saveClientCursor = false;

        BSONObj obj;
        PlanExecutor::ExecState state;
        // uint64_t numMisplacedDocs = 0;

        // Get summary info about which plan the executor is using.
        curop.debug().planSummary = Explain::getPlanSummary(exec.get());

        while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) {
            // Add result to output buffer.
            bb.appendBuf((void*)obj.objdata(), obj.objsize());

            // Count the result.
            ++numResults;

            // Possibly note slave's position in the oplog.
            if (pq.getOptions().oplogReplay) {
                BSONElement e = obj["ts"];
                if (Date == e.type() || Timestamp == e.type()) {
                    slaveReadTill = e._opTime();
                }
            }

            // TODO: only one type of 2d search doesn't support this.  We need a way to pull it out
            // of CanonicalQuery. :(
            const bool supportsGetMore = true;
            if (!supportsGetMore && (enough(pq, numResults)
                                     || bb.len() >= MaxBytesToReturnToClientAtOnce)) {
                break;
            }
            else if (enoughForFirstBatch(pq, numResults, bb.len())) {
                QLOG() << "Enough for first batch, wantMore=" << pq.wantMore()
                       << " numToReturn=" << pq.getNumToReturn()
                       << " numResults=" << numResults
                       << endl;
                // If only one result requested assume it's a findOne() and don't save the cursor.
                if (pq.wantMore() && 1 != pq.getNumToReturn()) {
                    QLOG() << " executor EOF=" << exec->isEOF() << endl;
                    saveClientCursor = !exec->isEOF();
                }
                break;
            }
        }

        // If we cache the executor later, we want to deregister it as it receives notifications
        // anyway by virtue of being cached.
        //
        // If we don't cache the executor later, we are deleting it, so it must be deregistered.
        //
        // So, no matter what, deregister the executor.
        exec->deregisterExec();

        // Caller expects exceptions thrown in certain cases.
        if (PlanExecutor::EXEC_ERROR == state) {
            scoped_ptr<PlanStageStats> stats(exec->getStats());
            error() << "Plan executor error, stats: "
                    << Explain::statsToBSON(*stats);
            uasserted(17144, "Executor error: " + WorkingSetCommon::toStatusString(obj));
        }

        // Why save a dead executor?
        if (PlanExecutor::DEAD == state) {
            saveClientCursor = false;
        }
        else if (pq.getOptions().tailable) {
            // If we're tailing a capped collection, we don't bother saving the cursor if the
            // collection is empty. Otherwise, the semantics of the tailable cursor is that the
            // client will keep trying to read from it. So we'll keep it around.
            if (collection && collection->numRecords(txn) != 0 && pq.getNumToReturn() != 1) {
                saveClientCursor = true;
            }
        }

        // TODO(greg): This will go away soon.
        if (!shardingState.getVersion(pq.ns()).isWriteCompatibleWith(shardingVersionAtStart)) {
            // if the version changed during the query we might be missing some data and its safe to
            // send this as mongos can resend at this point
            throw SendStaleConfigException(pq.ns(), "version changed during initial query",
                                           shardingVersionAtStart,
                                           shardingState.getVersion(pq.ns()));
        }

        const logger::LogComponent queryLogComponent = logger::LogComponent::kQuery;
        const logger::LogSeverity logLevelOne = logger::LogSeverity::Debug(1);

        PlanSummaryStats summaryStats;
        Explain::getSummaryStats(exec.get(), &summaryStats);

        curop.debug().ntoskip = pq.getSkip();
        curop.debug().nreturned = numResults;
        curop.debug().scanAndOrder = summaryStats.hasSortStage;
        curop.debug().nscanned = summaryStats.totalKeysExamined;
        curop.debug().nscannedObjects = summaryStats.totalDocsExamined;
        curop.debug().idhack = summaryStats.isIdhack;

        // Set debug information for consumption by the profiler.
        if (dbProfilingLevel > 0 ||
            curop.elapsedMillis() > serverGlobalParams.slowMS ||
            logger::globalLogDomain()->shouldLog(queryLogComponent, logLevelOne)) {
            // Get BSON stats.
            scoped_ptr<PlanStageStats> execStats(exec->getStats());
            BSONObjBuilder statsBob;
            Explain::statsToBSON(*execStats, &statsBob);
            curop.debug().execStats.set(statsBob.obj());

            // Replace exec stats with plan summary if stats cannot fit into CachedBSONObj.
            if (curop.debug().execStats.tooBig() && !curop.debug().planSummary.empty()) {
                BSONObjBuilder bob;
                bob.append("summary", curop.debug().planSummary.toString());
                curop.debug().execStats.set(bob.done());
            }
        }

        long long ccId = 0;
        if (saveClientCursor) {
            // We won't use the executor until it's getMore'd.
            exec->saveState();

            // Allocate a new ClientCursor.  We don't have to worry about leaking it as it's
            // inserted into a global map by its ctor.
            ClientCursor* cc = new ClientCursor(collection, exec.get(),
                                                cq->getParsed().getOptions().toInt(),
                                                cq->getParsed().getFilter());
            ccId = cc->cursorid();

            if (fromDBDirectClient) {
                cc->setUnownedRecoveryUnit(txn->recoveryUnit());
            }
            else if (state == PlanExecutor::IS_EOF && pq.getOptions().tailable) {
                // Don't stash the RU for tailable cursors at EOF, let them get a new RU on their
                // next getMore.
            }
            else {
                // We stash away the RecoveryUnit in the ClientCursor.  It's used for subsequent
                // getMore requests.  The calling OpCtx gets a fresh RecoveryUnit.
                cc->setOwnedRecoveryUnit(txn->releaseRecoveryUnit());
                StorageEngine* storageEngine = getGlobalEnvironment()->getGlobalStorageEngine();
                txn->setRecoveryUnit(storageEngine->newRecoveryUnit(txn));
            }

            QLOG() << "caching executor with cursorid " << ccId
                   << " after returning " << numResults << " results" << endl;

            // ClientCursor takes ownership of executor.  Release to make sure it's not deleted.
            exec.release();

            // TODO document
            if (pq.getOptions().oplogReplay && !slaveReadTill.isNull()) {
                cc->slaveReadTill(slaveReadTill);
            }

            // TODO document
            if (pq.getOptions().exhaust) {
                curop.debug().exhaust = true;
            }

            // Set attributes for getMore.
            cc->setCollMetadata(collMetadata);
            cc->setPos(numResults);

            // If the query had a time limit, remaining time is "rolled over" to the cursor (for
            // use by future getmore ops).
            cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros());
        }
        else {
            QLOG() << "Not caching executor but returning " << numResults << " results.\n";
        }

        // Add the results from the query into the output buffer.
        result.appendData(bb.buf(), bb.len());
        bb.decouple();

        // Fill out the output buffer's header.
        QueryResult::View qr = result.header().view2ptr();
        qr.setCursorId(ccId);
        curop.debug().cursorid = (0 == ccId ? -1 : ccId);
        qr.setResultFlagsToOk();
        qr.msgdata().setOperation(opReply);
        qr.setStartingFrom(0);
        qr.setNReturned(numResults);

        // curop.debug().exhaust is set above.
        return curop.debug().exhaust ? pq.ns() : "";
    }
示例#5
0
    /**
     * Called by db/instance.cpp.  This is the getMore entry point.
     *
     * pass - when QueryOption_AwaitData is in use, the caller will make repeated calls 
     *        when this method returns an empty result, incrementing pass on each call.  
     *        Thus, pass == 0 indicates this is the first "attempt" before any 'awaiting'.
     */
    QueryResult::View newGetMore(OperationContext* txn,
                            const char* ns,
                            int ntoreturn,
                            long long cursorid,
                            CurOp& curop,
                            int pass,
                            bool& exhaust,
                            bool* isCursorAuthorized,
                            bool fromDBDirectClient) {

        // For testing, we may want to fail if we receive a getmore.
        if (MONGO_FAIL_POINT(failReceivedGetmore)) {
            invariant(0);
        }

        exhaust = false;

        // This is a read lock.
        const NamespaceString nss(ns);
        scoped_ptr<AutoGetCollectionForRead> ctx(new AutoGetCollectionForRead(txn, nss));
        Collection* collection = ctx->getCollection();
        uassert( 17356, "collection dropped between getMore calls", collection );

        QLOG() << "Running getMore, cursorid: " << cursorid << endl;

        // This checks to make sure the operation is allowed on a replicated node.  Since we are not
        // passing in a query object (necessary to check SlaveOK query option), the only state where
        // reads are allowed is PRIMARY (or master in master/slave).  This function uasserts if
        // reads are not okay.
        Status status = repl::getGlobalReplicationCoordinator()->checkCanServeReadsFor(
                txn,
                nss,
                true);
        uassertStatusOK(status);

        // A pin performs a CC lookup and if there is a CC, increments the CC's pin value so it
        // doesn't time out.  Also informs ClientCursor that there is somebody actively holding the
        // CC, so don't delete it.
        ClientCursorPin ccPin(collection, cursorid);
        ClientCursor* cc = ccPin.c();

        // If we're not being called from DBDirectClient we want to associate the RecoveryUnit
        // used to create the execution machinery inside the cursor with our OperationContext.
        // If we throw or otherwise exit this method in a disorderly fashion, we must ensure
        // that further calls to getMore won't fail, and that the provided OperationContext
        // has a valid RecoveryUnit.  As such, we use RAII to accomplish this.
        //
        // This must be destroyed before the ClientCursor is destroyed.
        std::auto_ptr<ScopedRecoveryUnitSwapper> ruSwapper;

        // These are set in the QueryResult msg we return.
        int resultFlags = ResultFlag_AwaitCapable;

        int numResults = 0;
        int startingResult = 0;

        const int InitialBufSize =
            512 + sizeof(QueryResult::Value) + MaxBytesToReturnToClientAtOnce;

        BufBuilder bb(InitialBufSize);
        bb.skip(sizeof(QueryResult::Value));

        if (NULL == cc) {
            cursorid = 0;
            resultFlags = ResultFlag_CursorNotFound;
        }
        else {
            // Quote: check for spoofing of the ns such that it does not match the one originally
            // there for the cursor
            uassert(17011, "auth error", str::equals(ns, cc->ns().c_str()));
            *isCursorAuthorized = true;

            // Restore the RecoveryUnit if we need to.
            if (fromDBDirectClient) {
                if (cc->hasRecoveryUnit())
                    invariant(txn->recoveryUnit() == cc->getUnownedRecoveryUnit());
            }
            else {
                if (!cc->hasRecoveryUnit()) {
                    // Start using a new RecoveryUnit
                    cc->setOwnedRecoveryUnit(
                        getGlobalEnvironment()->getGlobalStorageEngine()->newRecoveryUnit(txn));

                }
                // Swap RecoveryUnit(s) between the ClientCursor and OperationContext.
                ruSwapper.reset(new ScopedRecoveryUnitSwapper(cc, txn));
            }

            // Reset timeout timer on the cursor since the cursor is still in use.
            cc->setIdleTime(0);

            // TODO: fail point?

            // If the operation that spawned this cursor had a time limit set, apply leftover
            // time to this getmore.
            curop.setMaxTimeMicros(cc->getLeftoverMaxTimeMicros());
            txn->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

            if (0 == pass) { 
                cc->updateSlaveLocation(txn, curop); 
            }

            if (cc->isAggCursor) {
                // Agg cursors handle their own locking internally.
                ctx.reset(); // unlocks
            }

            CollectionMetadataPtr collMetadata = cc->getCollMetadata();

            // If we're replaying the oplog, we save the last time that we read.
            OpTime slaveReadTill;

            // What number result are we starting at?  Used to fill out the reply.
            startingResult = cc->pos();

            // What gives us results.
            PlanExecutor* exec = cc->getExecutor();
            const int queryOptions = cc->queryOptions();

            // Get results out of the executor.
            exec->restoreState(txn);

            BSONObj obj;
            PlanExecutor::ExecState state;
            while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) {
                // Add result to output buffer.
                bb.appendBuf((void*)obj.objdata(), obj.objsize());

                // Count the result.
                ++numResults;

                // Possibly note slave's position in the oplog.
                if (queryOptions & QueryOption_OplogReplay) {
                    BSONElement e = obj["ts"];
                    if (Date == e.type() || Timestamp == e.type()) {
                        slaveReadTill = e._opTime();
                    }
                }

                if ((ntoreturn && numResults >= ntoreturn)
                    || bb.len() > MaxBytesToReturnToClientAtOnce) {
                    break;
                }
            }

            // We save the client cursor when there might be more results, and hence we may receive
            // another getmore. If we receive a EOF or an error, or 'exec' is dead, then we know
            // that we will not be producing more results. We indicate that the cursor is closed by
            // sending a cursorId of 0 back to the client.
            //
            // On the other hand, if we retrieve all results necessary for this batch, then
            // 'saveClientCursor' is true and we send a valid cursorId back to the client. In
            // this case, there may or may not actually be more results (for example, the next call
            // to getNext(...) might just return EOF).
            bool saveClientCursor = false;

            if (PlanExecutor::DEAD == state || PlanExecutor::EXEC_ERROR == state) {
                // Propagate this error to caller.
                if (PlanExecutor::EXEC_ERROR == state) {
                    scoped_ptr<PlanStageStats> stats(exec->getStats());
                    error() << "Plan executor error, stats: "
                            << Explain::statsToBSON(*stats);
                    uasserted(17406, "getMore executor error: " +
                              WorkingSetCommon::toStatusString(obj));
                }

                // If we're dead there's no way to get more results.
                saveClientCursor = false;

                // In the old system tailable capped cursors would be killed off at the
                // cursorid level.  If a tailable capped cursor is nuked the cursorid
                // would vanish.
                //
                // In the new system they die and are cleaned up later (or time out).
                // So this is where we get to remove the cursorid.
                if (0 == numResults) {
                    resultFlags = ResultFlag_CursorNotFound;
                }
            }
            else if (PlanExecutor::IS_EOF == state) {
                // EOF is also end of the line unless it's tailable.
                saveClientCursor = queryOptions & QueryOption_CursorTailable;
            }
            else {
                verify(PlanExecutor::ADVANCED == state);
                saveClientCursor = true;
            }

            if (!saveClientCursor) {
                ruSwapper.reset();
                ccPin.deleteUnderlying();
                // cc is now invalid, as is the executor
                cursorid = 0;
                cc = NULL;
                QLOG() << "getMore NOT saving client cursor, ended with state "
                       << PlanExecutor::statestr(state)
                       << endl;
            }
            else {
                // Continue caching the ClientCursor.
                cc->incPos(numResults);
                exec->saveState();
                QLOG() << "getMore saving client cursor ended with state "
                       << PlanExecutor::statestr(state)
                       << endl;

                if (PlanExecutor::IS_EOF == state && (queryOptions & QueryOption_CursorTailable)) {
                    if (!fromDBDirectClient) {
                        // Don't stash the RU. Get a new one on the next getMore.
                        ruSwapper.reset();
                        delete cc->releaseOwnedRecoveryUnit();
                    }

                    if ((queryOptions & QueryOption_AwaitData)
                            && (numResults == 0)
                            && (pass < 1000)) {
                        // Bubble up to the AwaitData handling code in receivedGetMore which will
                        // try again.
                        return NULL;
                    }
                }

                // Possibly note slave's position in the oplog.
                if ((queryOptions & QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
                    cc->slaveReadTill(slaveReadTill);
                }

                exhaust = (queryOptions & QueryOption_Exhaust);

                // If the getmore had a time limit, remaining time is "rolled over" back to the
                // cursor (for use by future getmore ops).
                cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() );
            }
        }

        QueryResult::View qr = bb.buf();
        qr.msgdata().setLen(bb.len());
        qr.msgdata().setOperation(opReply);
        qr.setResultFlags(resultFlags);
        qr.setCursorId(cursorid);
        qr.setStartingFrom(startingResult);
        qr.setNReturned(numResults);
        bb.decouple();
        QLOG() << "getMore returned " << numResults << " results\n";
        return qr;
    }
示例#6
0
        virtual bool run(OperationContext* txn,  const string& dbname, BSONObj& cmdObj, int options,
                          string& errmsg, BSONObjBuilder& result,
                          bool fromRepl = false ) {

            // ---  parse

            NamespaceString ns( dbname, cmdObj[name].String() );
            Status status = userAllowedWriteNS( ns );
            if ( !status.isOK() )
                return appendCommandStatus( result, status );

            if ( cmdObj["indexes"].type() != Array ) {
                errmsg = "indexes has to be an array";
                result.append( "cmdObj", cmdObj );
                return false;
            }

            std::vector<BSONObj> specs;
            {
                BSONObjIterator i( cmdObj["indexes"].Obj() );
                while ( i.more() ) {
                    BSONElement e = i.next();
                    if ( e.type() != Object ) {
                        errmsg = "everything in indexes has to be an Object";
                        result.append( "cmdObj", cmdObj );
                        return false;
                    }
                    specs.push_back( e.Obj() );
                }
            }

            if ( specs.size() == 0 ) {
                errmsg = "no indexes to add";
                return false;
            }

            // check specs
            for ( size_t i = 0; i < specs.size(); i++ ) {
                BSONObj spec = specs[i];
                if ( spec["ns"].eoo() ) {
                    spec = _addNsToSpec( ns, spec );
                    specs[i] = spec;
                }

                if ( spec["ns"].type() != String ) {
                    errmsg = "spec has no ns";
                    result.append( "spec", spec );
                    return false;
                }
                if ( ns != spec["ns"].String() ) {
                    errmsg = "namespace mismatch";
                    result.append( "spec", spec );
                    return false;
                }
            }

            // now we know we have to create index(es)
            // Note: createIndexes command does not currently respect shard versioning.
            ScopedTransaction transaction(txn, MODE_IX);
            Lock::DBLock dbLock(txn->lockState(), ns.db(), MODE_X);
            if (!fromRepl &&
                !repl::getGlobalReplicationCoordinator()->canAcceptWritesForDatabase(dbname)) {
                return appendCommandStatus(result, Status(ErrorCodes::NotMaster, str::stream()
                    << "Not primary while creating indexes in " << ns.ns()));
            }

            Database* db = dbHolder().get(txn, ns.db());
            if (!db) {
                db = dbHolder().openDb(txn, ns.db());
            }

            Collection* collection = db->getCollection( ns.ns() );
            result.appendBool( "createdCollectionAutomatically", collection == NULL );
            if ( !collection ) {
                MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
                    WriteUnitOfWork wunit(txn);
                    collection = db->createCollection( txn, ns.ns() );
                    invariant( collection );
                    if (!fromRepl) {
                        getGlobalEnvironment()->getOpObserver()->onCreateCollection(
                                txn,
                                ns,
                                CollectionOptions());
                    }
                    wunit.commit();
                } MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "createIndexes", ns.ns());
            }
示例#7
0
文件: oplog.cpp 项目: chmanie/mongo
    void createOplog(OperationContext* txn) {
        Lock::GlobalWrite lk(txn->lockState());

        const char * ns = "local.oplog.$main";

        const ReplSettings& replSettings = getGlobalReplicationCoordinator()->getSettings();
        bool rs = !replSettings.replSet.empty();
        if( rs )
            ns = rsoplog;

        Client::Context ctx(txn, ns);
        Collection* collection = ctx.db()->getCollection(txn, ns );

        if ( collection ) {

            if (replSettings.oplogSize != 0) {
                int o = (int)(collection->getRecordStore()->storageSize(txn) / ( 1024 * 1024 ) );
                int n = (int)(replSettings.oplogSize / (1024 * 1024));
                if ( n != o ) {
                    stringstream ss;
                    ss << "cmdline oplogsize (" << n << ") different than existing (" << o << ") see: http://dochub.mongodb.org/core/increase-oplog";
                    log() << ss.str() << endl;
                    throw UserException( 13257 , ss.str() );
                }
            }

            if( rs ) return;

            initOpTimeFromOplog(txn, ns);
            return;
        }

        /* create an oplog collection, if it doesn't yet exist. */
        long long sz = 0;
        if ( replSettings.oplogSize != 0 ) {
            sz = replSettings.oplogSize;
        }
        else {
            /* not specified. pick a default size */
            sz = 50LL * 1024LL * 1024LL;
            if ( sizeof(int *) >= 8 ) {
#if defined(__APPLE__)
                // typically these are desktops (dev machines), so keep it smallish
                sz = (256-64) * 1024 * 1024;
#else
                sz = 990LL * 1024 * 1024;
                double free =
                    File::freeSpace(storageGlobalParams.dbpath); //-1 if call not supported.
                long long fivePct = static_cast<long long>( free * 0.05 );
                if ( fivePct > sz )
                    sz = fivePct;
                // we use 5% of free space up to 50GB (1TB free)
                static long long upperBound = 50LL * 1024 * 1024 * 1024;
                if (fivePct > upperBound)
                    sz = upperBound;
#endif
            }
        }

        log() << "******" << endl;
        log() << "creating replication oplog of size: " << (int)( sz / ( 1024 * 1024 ) ) << "MB..." << endl;

        CollectionOptions options;
        options.capped = true;
        options.cappedSize = sz;
        options.autoIndexId = CollectionOptions::NO;

        WriteUnitOfWork wunit(txn);
        invariant(ctx.db()->createCollection(txn, ns, options));
        if( !rs )
            logOp(txn, "n", "", BSONObj() );
        wunit.commit();

        /* sync here so we don't get any surprising lag later when we try to sync */
        StorageEngine* storageEngine = getGlobalEnvironment()->getGlobalStorageEngine();
        storageEngine->flushAllFiles(true);
        log() << "******" << endl;
    }
示例#8
0
    static void handleCursorCommand(OperationContext* txn,
                                    const string& ns,
                                    ClientCursorPin* pin,
                                    PlanExecutor* exec,
                                    const BSONObj& cmdObj,
                                    BSONObjBuilder& result) {

        ClientCursor* cursor = pin ? pin->c() : NULL;
        if (pin) {
            invariant(cursor);
            invariant(cursor->getExecutor() == exec);
            invariant(cursor->isAggCursor());
        }

        BSONElement batchSizeElem = cmdObj.getFieldDotted("cursor.batchSize");
        const long long batchSize = batchSizeElem.isNumber()
                                    ? batchSizeElem.numberLong()
                                    : 101; // same as query

        // can't use result BSONObjBuilder directly since it won't handle exceptions correctly.
        BSONArrayBuilder resultsArray;
        const int byteLimit = MaxBytesToReturnToClientAtOnce;
        BSONObj next;
        for (int objCount = 0; objCount < batchSize; objCount++) {
            // The initial getNext() on a PipelineProxyStage may be very expensive so we don't
            // do it when batchSize is 0 since that indicates a desire for a fast return.
            if (exec->getNext(&next, NULL) != PlanExecutor::ADVANCED) {
                if (pin) pin->deleteUnderlying();
                // make it an obvious error to use cursor or executor after this point
                cursor = NULL;
                exec = NULL;
                break;
            }

            if (resultsArray.len() + next.objsize() > byteLimit) {
                // Get the pipeline proxy stage wrapped by this PlanExecutor.
                PipelineProxyStage* proxy = static_cast<PipelineProxyStage*>(exec->getRootStage());
                // too big. next will be the first doc in the second batch
                proxy->pushBack(next);
                break;
            }

            resultsArray.append(next);
        }

        // NOTE: exec->isEOF() can have side effects such as writing by $out. However, it should
        // be relatively quick since if there was no pin then the input is empty. Also, this
        // violates the contract for batchSize==0. Sharding requires a cursor to be returned in that
        // case. This is ok for now however, since you can't have a sharded collection that doesn't
        // exist.
        const bool canReturnMoreBatches = pin;
        if (!canReturnMoreBatches && exec && !exec->isEOF()) {
            // msgasserting since this shouldn't be possible to trigger from today's aggregation
            // language. The wording assumes that the only reason pin would be null is if the
            // collection doesn't exist.
            msgasserted(17391, str::stream()
                << "Aggregation has more results than fit in initial batch, but can't "
                << "create cursor since collection " << ns << " doesn't exist");
        }

        if (cursor) {
            // If a time limit was set on the pipeline, remaining time is "rolled over" to the
            // cursor (for use by future getmore ops).
            cursor->setLeftoverMaxTimeMicros( txn->getCurOp()->getRemainingMaxTimeMicros() );

            // We stash away the RecoveryUnit in the ClientCursor.  It's used for subsequent
            // getMore requests.  The calling OpCtx gets a fresh RecoveryUnit.
            cursor->setOwnedRecoveryUnit(txn->releaseRecoveryUnit());
            StorageEngine* storageEngine = getGlobalEnvironment()->getGlobalStorageEngine();
            txn->setRecoveryUnit(storageEngine->newRecoveryUnit());

            // Cursor needs to be in a saved state while we yield locks for getmore. State
            // will be restored in getMore().
            exec->saveState();
        }

        BSONObjBuilder cursorObj(result.subobjStart("cursor"));
        cursorObj.append("id", cursor ? cursor->cursorid() : 0LL);
        cursorObj.append("ns", ns);
        cursorObj.append("firstBatch", resultsArray.arr());
        cursorObj.done();
    }
 Status AuthzManagerExternalStateMongod::getAllDatabaseNames(
             OperationContext* txn, std::vector<std::string>* dbnames) {
     StorageEngine* storageEngine = getGlobalEnvironment()->getGlobalStorageEngine();
     storageEngine->listDatabases(dbnames);
     return Status::OK();
 }
 void ReplicationCoordinatorExternalStateImpl::killAllUserOperations(OperationContext* txn) {
     GlobalEnvironmentExperiment* environment = getGlobalEnvironment();
     environment->killAllUserOperations(txn);
 }
示例#11
0
void RangeDeleter::doWork() {
    _env->initThread();

    while (!inShutdown() && !stopRequested()) {
        string errMsg;

        RangeDeleteEntry* nextTask = NULL;

        {
            scoped_lock sl(_queueMutex);
            while (_taskQueue.empty()) {
                _taskQueueNotEmptyCV.timed_wait(
                    sl.boost(), duration::milliseconds(kNotEmptyTimeoutMillis));

                if (stopRequested()) {
                    log() << "stopping range deleter worker" << endl;
                    return;
                }

                if (_taskQueue.empty()) {
                    // Try to check if some deletes are ready and move them to the
                    // ready queue.

                    TaskList::iterator iter = _notReadyQueue.begin();
                    while (iter != _notReadyQueue.end()) {
                        RangeDeleteEntry* entry = *iter;

                        set<CursorId> cursorsNow;
                        {
                            boost::scoped_ptr<OperationContext> txn(
                                getGlobalEnvironment()->newOpCtx());
                            if (entry->options.waitForOpenCursors) {
                                _env->getCursorIds(txn.get(),
                                                   entry->options.range.ns,
                                                   &cursorsNow);
                            }
                        }

                        set<CursorId> cursorsLeft;
                        std::set_intersection(entry->cursorsToWait.begin(),
                                              entry->cursorsToWait.end(),
                                              cursorsNow.begin(),
                                              cursorsNow.end(),
                                              std::inserter(cursorsLeft,
                                                            cursorsLeft.end()));

                        entry->cursorsToWait.swap(cursorsLeft);

                        if (entry->cursorsToWait.empty()) {
                            (*iter)->stats.queueEndTS = jsTime();
                            _taskQueue.push_back(*iter);
                            _taskQueueNotEmptyCV.notify_one();
                            iter = _notReadyQueue.erase(iter);
                        }
                        else {
                            logCursorsWaiting(entry);
                            ++iter;
                        }
                    }
                }
            }

            if (stopRequested()) {
                log() << "stopping range deleter worker" << endl;
                return;
            }

            nextTask = _taskQueue.front();
            _taskQueue.pop_front();

            _deletesInProgress++;
        }

        {
            boost::scoped_ptr<OperationContext> txn(getGlobalEnvironment()->newOpCtx());

            nextTask->stats.deleteStartTS = jsTime();
            bool delResult = _env->deleteRange(txn.get(),
                                               *nextTask,
                                               &nextTask->stats.deletedDocCount,
                                               &errMsg);
            nextTask->stats.deleteEndTS = jsTime();

            if (delResult) {
                nextTask->stats.waitForReplStartTS = jsTime();

                if (!_waitForMajority(txn.get(), &errMsg)) {
                    warning() << "Error encountered while waiting for replication: " << errMsg;
                }

                nextTask->stats.waitForReplEndTS = jsTime();
            }
            else {
                warning() << "Error encountered while trying to delete range: "
                          << errMsg << endl;
            }
        }

        {
            scoped_lock sl(_queueMutex);

            NSMinMax setEntry(nextTask->options.range.ns,
                              nextTask->options.range.minKey,
                              nextTask->options.range.maxKey);
            deletePtrElement(&_deleteSet, &setEntry);
            _deletesInProgress--;

            if (nextTask->notifyDone) {
                nextTask->notifyDone->notifyOne();
            }
        }

        recordDelStats(new DeleteJobStats(nextTask->stats));
        delete nextTask;
        nextTask = NULL;
    }
}
 ~IndexBuildBase() {
     _client.dropCollection( _ns );
     _ctx.commit(); // just for testing purposes
     getGlobalEnvironment()->unsetKillAllOperations();
 }
示例#13
0
    int Tool::main( int argc , char ** argv, char ** envp ) {
        setupSignalHandlers(true);

        static StaticObserver staticObserver;

        mongo::runGlobalInitializersOrDie(argc, argv, envp);

        // hide password from ps output
        for (int i=0; i < (argc-1); ++i) {
            if (!strcmp(argv[i], "-p") || !strcmp(argv[i], "--password")) {
                char* arg = argv[i+1];
                while (*arg) {
                    *arg++ = 'x';
                }
            }
        }

        if (!toolGlobalParams.useDirectClient) {
            if (toolGlobalParams.noconnection) {
                // do nothing
            }
            else {
                string errmsg;

                ConnectionString cs = ConnectionString::parse(toolGlobalParams.connectionString,
                                                              errmsg);
                if ( ! cs.isValid() ) {
                    toolError() << "invalid hostname [" << toolGlobalParams.connectionString << "] "
                              << errmsg << std::endl;
                    ::_exit(-1);
                }

                _conn = cs.connect( errmsg );
                if ( ! _conn ) {
                    toolError() << "couldn't connect to [" << toolGlobalParams.connectionString
                              << "] " << errmsg << std::endl;
                    ::_exit(-1);
                }

                toolInfoOutput() << "connected to: " << toolGlobalParams.connectionString
                                 << std::endl;
            }

        }
        else {
            verify( lastError.get( true ) );

            Client::initThread("tools");
            storageGlobalParams.dbpath = toolGlobalParams.dbpath;
            try {
                getGlobalEnvironment()->setGlobalStorageEngine(storageGlobalParams.engine);
            }
            catch (const DBException& ex) {
                if (ex.getCode() == ErrorCodes::DBPathInUse) {
                    toolError() << std::endl << "If you are running a mongod on the same "
                                 "path you should connect to that instead of direct data "
                                  "file access" << std::endl << std::endl;
                }
                else {
                    toolError() << "Failed to initialize storage engine: " << ex.toString();
                }
                dbexit( EXIT_FS );
                ::_exit(EXIT_FAILURE);
            }

            _txn.reset(new OperationContextImpl());
            _conn = new DBDirectClient(_txn.get());
        }

        int ret = -1;
        try {
            if (!toolGlobalParams.useDirectClient && !toolGlobalParams.noconnection)
                auth();
            ret = run();
        }
        catch ( DBException& e ) {
            toolError() << "assertion: " << e.toString() << std::endl;
            ret = -1;
        }
        catch(const boost::filesystem::filesystem_error &fse) {
            /*
              https://jira.mongodb.org/browse/SERVER-2904

              Simple tools that don't access the database, such as
              bsondump, aren't throwing DBExceptions, but are throwing
              boost exceptions.

              The currently available set of error codes don't seem to match
              boost documentation.  boost::filesystem::not_found_error
              (from http://www.boost.org/doc/libs/1_31_0/libs/filesystem/doc/exception.htm)
              doesn't seem to exist in our headers.  Also, fse.code() isn't
              boost::system::errc::no_such_file_or_directory when this
              happens, as you would expect.  And, determined from
              experimentation that the command-line argument gets turned into
              "\\?" instead of "/?" !!!
             */
#if defined(_WIN32)
            if (/*(fse.code() == boost::system::errc::no_such_file_or_directory) &&*/
                (fse.path1() == "\\?"))
                printHelp(cerr);
            else
#endif // _WIN32
                toolError() << "error: " << fse.what() << std::endl;

            ret = -1;
        }

        if ( currentClient.get() )
            currentClient.get()->shutdown();

        if (toolGlobalParams.useDirectClient) {
            exitCleanly(EXIT_CLEAN);
        }

        fflush(stdout);
        fflush(stderr);
        ::_exit(ret);
    }
示例#14
0
    Database* DatabaseHolder::getOrCreate(OperationContext* txn,
                                          const StringData& ns,
                                          bool& justCreated) {

        const StringData dbname = _todb( ns );
        invariant(txn->lockState()->isAtLeastReadLocked(dbname));

        if (txn->lockState()->isWriteLocked() && FileAllocator::get()->hasFailed()) {
            uassert(17507, "Can't take a write lock while out of disk space", false);
        }

        {
            SimpleMutex::scoped_lock lk(_m);
            {
                DBs::const_iterator i = _dbs.find(dbname);
                if( i != _dbs.end() ) {
                    justCreated = false;
                    return i->second;
                }
            }

            // todo: protect against getting sprayed with requests for different db names that DNE -
            //       that would make the DBs map very large.  not clear what to do to handle though,
            //       perhaps just log it, which is what we do here with the "> 40" :
            bool cant = !txn->lockState()->isWriteLocked(ns);
            if( logger::globalLogDomain()->shouldLog(logger::LogSeverity::Debug(1)) ||
                _dbs.size() > 40 || cant || DEBUG_BUILD ) {
                log() << "opening db: " << dbname;
            }
            massert(15927, "can't open database in a read lock. if db was just closed, consider retrying the query. might otherwise indicate an internal error", !cant);
        }

        // we know we have a db exclusive lock here
        { // check casing
            string duplicate = Database::duplicateUncasedName(dbname.toString());
            if ( !duplicate.empty() ) {
                stringstream ss;
                ss << "db already exists with different case already have: [" << duplicate
                   << "] trying to create [" << dbname.toString() << "]";
                uasserted( DatabaseDifferCaseCode , ss.str() );
            }
        }

        // we mark our thread as having done writes now as we do not want any exceptions
        // once we start creating a new database
        cc().writeHappened();

        // this locks _m for defensive checks, so we don't want to be locked right here :
        StorageEngine* storageEngine = getGlobalEnvironment()->getGlobalStorageEngine();
        invariant(storageEngine);
        Database *db;
        {
            WriteUnitOfWork wunit(txn);
            DatabaseCatalogEntry* entry = storageEngine->getDatabaseCatalogEntry(txn, dbname);
            invariant(entry);
            justCreated = !entry->exists();
            db = new Database(txn, dbname, entry);
            wunit.commit();
        }

        {
            SimpleMutex::scoped_lock lk(_m);
            _dbs[dbname] = db;
        }

        return db;
    }
示例#15
0
        virtual bool run(OperationContext* txn, const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {

            if ( cmdObj.firstElement().type() != Array ) {
                errmsg = "ops has to be an array";
                return false;
            }

            BSONObj ops = cmdObj.firstElement().Obj();

            {
                // check input
                BSONObjIterator i( ops );
                while ( i.more() ) {
                    BSONElement e = i.next();
                    if (!_checkOperation(e, errmsg)) {
                        return false;
                    }
                }
            }

            // SERVER-4328 todo : is global ok or does this take a long time? i believe multiple 
            // ns used so locking individually requires more analysis
            ScopedTransaction scopedXact(txn, MODE_X);
            Lock::GlobalWrite globalWriteLock(txn->lockState());

            if (!fromRepl &&
                !repl::getGlobalReplicationCoordinator()->canAcceptWritesForDatabase(dbname)) {
                return appendCommandStatus(result, Status(ErrorCodes::NotMaster, str::stream()
                    << "Not primary while applying ops to database " << dbname));
            }

            // Preconditions check reads the database state, so needs to be done locked
            if ( cmdObj["preCondition"].type() == Array ) {
                BSONObjIterator i( cmdObj["preCondition"].Obj() );
                while ( i.more() ) {
                    BSONObj f = i.next().Obj();

                    DBDirectClient db( txn );
                    BSONObj realres = db.findOne( f["ns"].String() , f["q"].Obj() );

                    // Apply-ops would never have a $where matcher, so use the default callback,
                    // which will throw an error if $where is found.
                    Matcher m(f["res"].Obj());
                    if ( ! m.matches( realres ) ) {
                        result.append( "got" , realres );
                        result.append( "whatFailed" , f );
                        errmsg = "pre-condition failed";
                        return false;
                    }
                }
            }

            // apply
            int num = 0;
            int errors = 0;
            
            BSONObjIterator i( ops );
            BSONArrayBuilder ab;
            const bool alwaysUpsert = cmdObj.hasField("alwaysUpsert") ?
                    cmdObj["alwaysUpsert"].trueValue() : true;
            
            while ( i.more() ) {
                BSONElement e = i.next();
                const BSONObj& temp = e.Obj();

                // Ignore 'n' operations.
                const char *opType = temp["op"].valuestrsafe();
                if (*opType == 'n') continue;

                const string ns = temp["ns"].String();

                // Run operations under a nested lock as a hack to prevent yielding.
                //
                // The list of operations is supposed to be applied atomically; yielding
                // would break atomicity by allowing an interruption or a shutdown to occur
                // after only some operations are applied.  We are already locked globally
                // at this point, so taking a DBLock on the namespace creates a nested lock,
                // and yields are disallowed for operations that hold a nested lock.
                //
                // We do not have a wrapping WriteUnitOfWork so it is possible for a journal
                // commit to happen with a subset of ops applied.
                // TODO figure out what to do about this.
                Lock::GlobalWrite globalWriteLockDisallowTempRelease(txn->lockState());

                // Ensures that yielding will not happen (see the comment above).
                DEV {
                    Locker::LockSnapshot lockSnapshot;
                    invariant(!txn->lockState()->saveLockStateAndUnlock(&lockSnapshot));
                };

                OldClientContext ctx(txn, ns);

                Status status(ErrorCodes::InternalError, "");
                while (true) {
                    try {
                        // We assume that in the WriteConflict retry case, either the op rolls back
                        // any changes it makes or is otherwise safe to rerun.
                        status =
                            repl::applyOperation_inlock(txn, ctx.db(), temp, false, alwaysUpsert);
                        break;
                    }
                    catch (const WriteConflictException& wce) {
                        LOG(2) << "WriteConflictException in applyOps command, retrying.";
                        txn->recoveryUnit()->commitAndRestart();
                        continue;
                    }
                }

                ab.append(status.isOK());
                if (!status.isOK()) {
                    errors++;
                }

                num++;

                WriteUnitOfWork wuow(txn);
                logOpForDbHash(txn, ns.c_str());
                wuow.commit();
            }

            result.append( "applied" , num );
            result.append( "results" , ab.arr() );

            if ( ! fromRepl ) {
                // We want this applied atomically on slaves
                // so we re-wrap without the pre-condition for speed

                string tempNS = str::stream() << dbname << ".$cmd";

                // TODO: possibly use mutable BSON to remove preCondition field
                // once it is available
                BSONObjIterator iter(cmdObj);
                BSONObjBuilder cmdBuilder;

                while (iter.more()) {
                    BSONElement elem(iter.next());
                    if (strcmp(elem.fieldName(), "preCondition") != 0) {
                        cmdBuilder.append(elem);
                    }
                }

                const BSONObj cmdRewritten = cmdBuilder.done();

                // We currently always logOp the command regardless of whether the individial ops
                // succeeded and rely on any failures to also happen on secondaries. This isn't
                // perfect, but it's what the command has always done and is part of its "correct"
                // behavior.
                while (true) {
                    try {
                        WriteUnitOfWork wunit(txn);
                        getGlobalEnvironment()->getOpObserver()->onApplyOps(txn,
                                                                            tempNS,
                                                                            cmdRewritten);
                        wunit.commit();
                        break;
                    }
                    catch (const WriteConflictException& wce) {
                        LOG(2) <<
                            "WriteConflictException while logging applyOps command, retrying.";
                        txn->recoveryUnit()->commitAndRestart();
                        continue;
                    }
                }
            }

            if (errors != 0) {
                return false;
            }

            return true;
        }