intrusive_ptr<DocumentSource> DocumentSourceGroup::createFromBson(
            BSONElement elem,
            const intrusive_ptr<ExpressionContext> &pExpCtx) {
        uassert(15947, "a group's fields must be specified in an object",
                elem.type() == Object);

        intrusive_ptr<DocumentSourceGroup> pGroup(
            DocumentSourceGroup::create(pExpCtx));

        BSONObj groupObj(elem.Obj());
        BSONObjIterator groupIterator(groupObj);
        VariablesIdGenerator idGenerator;
        VariablesParseState vps(&idGenerator);
        while(groupIterator.more()) {
            BSONElement groupField(groupIterator.next());
            const char *pFieldName = groupField.fieldName();

            if (str::equals(pFieldName, "_id")) {
                uassert(15948, "a group's _id may only be specified once",
                        pGroup->_idExpressions.empty());
                pGroup->parseIdExpression(groupField, vps);
                invariant(!pGroup->_idExpressions.empty());
            }
            else if (str::equals(pFieldName, "$doingMerge")) {
                massert(17030, "$doingMerge should be true if present",
                        groupField.Bool());

                pGroup->setDoingMerge(true);
            }
            else {
                /*
                  Treat as a projection field with the additional ability to
                  add aggregation operators.
                */
                uassert(16414, str::stream() <<
                        "the group aggregate field name '" << pFieldName <<
                        "' cannot be used because $group's field names cannot contain '.'",
                        !str::contains(pFieldName, '.') );

                uassert(15950, str::stream() <<
                        "the group aggregate field name '" <<
                        pFieldName << "' cannot be an operator name",
                        pFieldName[0] != '$');

                uassert(15951, str::stream() <<
                        "the group aggregate field '" << pFieldName <<
                        "' must be defined as an expression inside an object",
                        groupField.type() == Object);

                BSONObj subField(groupField.Obj());
                BSONObjIterator subIterator(subField);
                size_t subCount = 0;
                for(; subIterator.more(); ++subCount) {
                    BSONElement subElement(subIterator.next());

                    /* look for the specified operator */
                    GroupOpDesc key;
                    key.name = subElement.fieldName();
                    const GroupOpDesc *pOp =
                        (const GroupOpDesc *)bsearch(
                              &key, GroupOpTable, NGroupOp, sizeof(GroupOpDesc),
                                      GroupOpDescCmp);

                    uassert(15952, str::stream() << "unknown group operator '" << key.name << "'",
                            pOp);

                    intrusive_ptr<Expression> pGroupExpr;

                    BSONType elementType = subElement.type();
                    if (elementType == Object) {
                        Expression::ObjectCtx oCtx(Expression::ObjectCtx::DOCUMENT_OK);
                        pGroupExpr = Expression::parseObject(subElement.Obj(), &oCtx, vps);
                    }
                    else if (elementType == Array) {
                        uasserted(15953, str::stream()
                                << "aggregating group operators are unary (" << key.name << ")");
                    }
                    else { /* assume its an atomic single operand */
                        pGroupExpr = Expression::parseOperand(subElement, vps);
                    }

                    pGroup->addAccumulator(pFieldName, pOp->factory, pGroupExpr);
                }

                uassert(15954, str::stream() <<
                        "the computed aggregate '" <<
                        pFieldName << "' must specify exactly one operator",
                        subCount == 1);
            }
        }

        uassert(15955, "a group specification must include an _id",
                !pGroup->_idExpressions.empty());

        pGroup->_variables.reset(new Variables(idGenerator.getIdCount()));

        return pGroup;
    }
Esempio n. 2
0
        bool group( OperationContext* txn,
                    Database* db,
                    const std::string& ns,
                    const BSONObj& query,
                    BSONObj keyPattern,
                    const std::string& keyFunctionCode,
                    const std::string& reduceCode,
                    const char * reduceScope,
                    BSONObj initial,
                    const std::string& finalize,
                    string& errmsg,
                    BSONObjBuilder& result ) {

            const string userToken = ClientBasic::getCurrent()->getAuthorizationSession()
                                                              ->getAuthenticatedUserNamesToken();
            auto_ptr<Scope> s = globalScriptEngine->getPooledScope(db->name(), "group" + userToken);

            if ( reduceScope )
                s->init( reduceScope );

            s->setObject( "$initial" , initial , true );

            s->exec( "$reduce = " + reduceCode , "$group reduce setup" , false , true , true , 100 );
            s->exec( "$arr = [];" , "$group reduce setup 2" , false , true , true , 100 );
            ScriptingFunction f = s->createFunction(
                                      "function(){ "
                                      "  if ( $arr[n] == null ){ "
                                      "    next = {}; "
                                      "    Object.extend( next , $key ); "
                                      "    Object.extend( next , $initial , true ); "
                                      "    $arr[n] = next; "
                                      "    next = null; "
                                      "  } "
                                      "  $reduce( obj , $arr[n] ); "
                                      "}" );

            ScriptingFunction keyFunction = 0;
            if ( keyFunctionCode.size() ) {
                keyFunction = s->createFunction( keyFunctionCode.c_str() );
            }


            double keysize = keyPattern.objsize() * 3;
            double keynum = 1;

            Collection* collection = db->getCollection( txn, ns );

            const WhereCallbackReal whereCallback(txn, StringData(db->name()));

            map<BSONObj,int,BSONObjCmp> map;
            list<BSONObj> blah;

            if (collection) {
                CanonicalQuery* cq;
                if (!CanonicalQuery::canonicalize(ns, query, &cq, whereCallback).isOK()) {
                    uasserted(17212, "Can't canonicalize query " + query.toString());
                    return 0;
                }

                Runner* rawRunner;
                if (!getRunner(txn,collection, cq, &rawRunner).isOK()) {
                    uasserted(17213, "Can't get runner for query " + query.toString());
                    return 0;
                }

                auto_ptr<Runner> runner(rawRunner);
                const ScopedRunnerRegistration safety(runner.get());

                BSONObj obj;
                Runner::RunnerState state;
                while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&obj, NULL))) {
                    BSONObj key = getKey(obj , keyPattern , keyFunction , keysize / keynum,
                                         s.get() );
                    keysize += key.objsize();
                    keynum++;

                    int& n = map[key];
                    if ( n == 0 ) {
                        n = map.size();
                        s->setObject( "$key" , key , true );
                        uassert(17203, "group() can't handle more than 20000 unique keys",
                                n <= 20000 );
                    }

                    s->setObject( "obj" , obj , true );
                    s->setNumber( "n" , n - 1 );
                    if ( s->invoke( f , 0, 0 , 0 , true ) ) {
                        throw UserException(17214,
                                            (string)"reduce invoke failed: " + s->getError());
                    }
                }
            }

            if (!finalize.empty()) {
                s->exec( "$finalize = " + finalize , "$group finalize define" ,
                         false , true , true , 100 );
                ScriptingFunction g = s->createFunction(
                                          "function(){ "
                                          "  for(var i=0; i < $arr.length; i++){ "
                                          "  var ret = $finalize($arr[i]); "
                                          "  if (ret !== undefined) "
                                          "    $arr[i] = ret; "
                                          "  } "
                                          "}" );
                s->invoke( g , 0, 0 , 0 , true );
            }

            result.appendArray( "retval" , s->getObject( "$arr" ) );
            result.append( "count" , keynum - 1 );
            result.append( "keys" , (int)(map.size()) );
            s->exec( "$arr = [];" , "$group reduce setup 2" , false , true , true , 100 );
            s->gc();

            return true;
        }
DocumentSource::GetNextResult Exchange::getNext(OperationContext* opCtx, size_t consumerId) {
    // Grab a lock.
    stdx::unique_lock<stdx::mutex> lk(_mutex);

    for (;;) {
        // Execute only in case we have not encountered an error.
        if (!_errorInLoadNextBatch.isOK()) {
            uasserted(ErrorCodes::ExchangePassthrough,
                      "Exchange failed due to an error on different thread.");
        }

        // Check if we have a document.
        if (!_consumers[consumerId]->isEmpty()) {
            auto doc = _consumers[consumerId]->getNext();
            unblockLoading(consumerId);

            return doc;
        }

        // There is not any document so try to load more from the source.
        if (_loadingThreadId == kInvalidThreadId) {
            LOG(3) << "A consumer " << consumerId << " begins loading";

            try {
                // This consumer won the race and will fill the buffers.
                _loadingThreadId = consumerId;

                _pipeline->reattachToOperationContext(opCtx);

                // This will return when some exchange buffer is full and we cannot make any forward
                // progress anymore.
                // The return value is an index of a full consumer buffer.
                size_t fullConsumerId = loadNextBatch();

                if (MONGO_FAIL_POINT(exchangeFailLoadNextBatch)) {
                    log() << "exchangeFailLoadNextBatch fail point enabled.";
                    uasserted(ErrorCodes::FailPointEnabled,
                              "Asserting on loading the next batch due to failpoint.");
                }

                _pipeline->detachFromOperationContext();

                // The loading cannot continue until the consumer with the full buffer consumes some
                // documents.
                _loadingThreadId = fullConsumerId;

                // Wake up everybody and try to make some progress.
                _haveBufferSpace.notify_all();
            } catch (const DBException& ex) {
                _errorInLoadNextBatch = ex.toStatus();

                // We have to wake up all other blocked threads so they can detect the error and
                // fail too. They can be woken up only after _errorInLoadNextBatch has been set.
                _haveBufferSpace.notify_all();

                throw;
            }
        } else {
            // Some other consumer is already loading the buffers. There is nothing else we can do
            // but wait.
            _haveBufferSpace.wait(lk);
        }
    }
}
Esempio n. 4
0
 void uasserted(int msgid , const string &msg) {
     uasserted(msgid, msg.c_str());
 }
Esempio n. 5
0
void WiredTigerRecoveryUnit::_txnOpen() {
    invariant(!_isActive(), toString(_state));
    invariant(!_isCommittingOrAborting(),
              str::stream() << "commit or rollback handler reopened transaction: "
                            << toString(_state));
    _ensureSession();

    // Only start a timer for transaction's lifetime if we're going to log it.
    if (shouldLog(kSlowTransactionSeverity)) {
        _timer.reset(new Timer());
    }
    WT_SESSION* session = _session->getSession();

    switch (_timestampReadSource) {
        case ReadSource::kUnset:
        case ReadSource::kNoTimestamp: {
            WiredTigerBeginTxnBlock txnOpen(session, _ignorePrepared);

            if (_isOplogReader) {
                auto status =
                    txnOpen.setTimestamp(Timestamp(_oplogManager->getOplogReadTimestamp()),
                                         WiredTigerBeginTxnBlock::RoundToOldest::kRound);
                fassert(50771, status);
            }
            txnOpen.done();
            break;
        }
        case ReadSource::kMajorityCommitted: {
            // We reset _majorityCommittedSnapshot to the actual read timestamp used when the
            // transaction was started.
            _majorityCommittedSnapshot =
                _sessionCache->snapshotManager().beginTransactionOnCommittedSnapshot(
                    session, _ignorePrepared);
            break;
        }
        case ReadSource::kLastApplied: {
            if (_sessionCache->snapshotManager().getLocalSnapshot()) {
                _readAtTimestamp = _sessionCache->snapshotManager().beginTransactionOnLocalSnapshot(
                    session, _ignorePrepared);
            } else {
                WiredTigerBeginTxnBlock(session, _ignorePrepared).done();
            }
            break;
        }
        case ReadSource::kAllCommittedSnapshot: {
            if (_readAtTimestamp.isNull()) {
                _readAtTimestamp = _beginTransactionAtAllCommittedTimestamp(session);
                break;
            }
            // Intentionally continue to the next case to read at the _readAtTimestamp.
        }
        case ReadSource::kLastAppliedSnapshot: {
            // Only ever read the last applied timestamp once, and continue reusing it for
            // subsequent transactions.
            if (_readAtTimestamp.isNull()) {
                _readAtTimestamp = _sessionCache->snapshotManager().beginTransactionOnLocalSnapshot(
                    session, _ignorePrepared);
                break;
            }
            // Intentionally continue to the next case to read at the _readAtTimestamp.
        }
        case ReadSource::kProvided: {
            WiredTigerBeginTxnBlock txnOpen(session, _ignorePrepared);
            auto status = txnOpen.setTimestamp(_readAtTimestamp);

            if (!status.isOK() && status.code() == ErrorCodes::BadValue) {
                uasserted(ErrorCodes::SnapshotTooOld,
                          str::stream() << "Read timestamp " << _readAtTimestamp.toString()
                                        << " is older than the oldest available timestamp.");
            }
            uassertStatusOK(status);
            txnOpen.done();
            break;
        }
    }

    LOG(3) << "WT begin_transaction for snapshot id " << _mySnapshotId;
}
Esempio n. 6
0
        Config::Config( const string& _dbname , const BSONObj& cmdObj ) {

            dbname = _dbname;
            ns = dbname + "." + cmdObj.firstElement().valuestr();

            verbose = cmdObj["verbose"].trueValue();

            uassert( 13602 , "outType is no longer a valid option" , cmdObj["outType"].eoo() );

            if ( cmdObj["out"].type() == String ) {
                finalShort = cmdObj["out"].String();
                outType = REPLACE;
            }
            else if ( cmdObj["out"].type() == Object ) {
                BSONObj o = cmdObj["out"].embeddedObject();

                BSONElement e = o.firstElement();
                string t = e.fieldName();

                if ( t == "normal" || t == "replace" ) {
                    outType = REPLACE;
                    finalShort = e.String();
                }
                else if ( t == "merge" ) {
                    outType = MERGE;
                    finalShort = e.String();
                }
                else if ( t == "reduce" ) {
                    outType = REDUCE;
                    finalShort = e.String();
                }
                else if ( t == "inline" ) {
                    outType = INMEMORY;
                }
                else {
                    uasserted( 13522 , str::stream() << "unknown out specifier [" << t << "]" );
                }

                if (o.hasElement("db")) {
                    outDB = o["db"].String();
                }
            }
            else {
                uasserted( 13606 , "'out' has to be a string or an object" );
            }

            if ( outType != INMEMORY ) { // setup names
                tempLong = str::stream() << (outDB.empty() ? dbname : outDB) << ".tmp.mr." << cmdObj.firstElement().String() << "_" << finalShort << "_" << JOB_NUMBER++;

                incLong = tempLong + "_inc";

                finalLong = str::stream() << (outDB.empty() ? dbname : outDB) << "." << finalShort;
            }

            {
                // scope and code

                if ( cmdObj["scope"].type() == Object )
                    scopeSetup = cmdObj["scope"].embeddedObjectUserCheck();

                mapper.reset( new JSMapper( cmdObj["map"] ) );
                reducer.reset( new JSReducer( cmdObj["reduce"] ) );
                if ( cmdObj["finalize"].type() && cmdObj["finalize"].trueValue() )
                    finalizer.reset( new JSFinalizer( cmdObj["finalize"] ) );

                if ( cmdObj["mapparams"].type() == Array ) {
                    mapParams = cmdObj["mapparams"].embeddedObjectUserCheck();
                }

            }

            {
                // query options
                BSONElement q = cmdObj["query"];
                if ( q.type() == Object )
                    filter = q.embeddedObjectUserCheck();
                else
                    uassert( 13608 , "query has to be blank or an Object" , ! q.trueValue() );


                BSONElement s = cmdObj["sort"];
                if ( s.type() == Object )
                    sort = s.embeddedObjectUserCheck();
                else
                    uassert( 13609 , "sort has to be blank or an Object" , ! s.trueValue() );

                if ( cmdObj["limit"].isNumber() )
                    limit = cmdObj["limit"].numberLong();
                else
                    limit = 0;
            }
        }
Esempio n. 7
0
    unsigned long long addExistingToIndex( OperationContext* txn,
                                           Collection* collection,
                                           const IndexDescriptor* descriptor,
                                           IndexAccessMethod* accessMethod,
                                           bool canBeKilled ) {

        string ns = collection->ns().ns(); // our copy for sanity

        bool dupsAllowed = !descriptor->unique();
        bool dropDups = descriptor->dropDups();

        string curopMessage;
        {
            stringstream ss;
            ss << "Index Build";
            if ( canBeKilled )
                ss << "(background)";
            curopMessage = ss.str();
        }

        ProgressMeter* progress = txn->setMessage(curopMessage.c_str(),
                                                  curopMessage,
                                                  collection->numRecords());

        unsigned long long n = 0;
        unsigned long long numDropped = 0;

        auto_ptr<Runner> runner(InternalPlanner::collectionScan(ns,collection));

        std::string idxName = descriptor->indexName();

        // After this yields in the loop, idx may point at a different index (if indexes get
        // flipped, see insert_makeIndex) or even an empty IndexDetails, so nothing below should
        // depend on idx. idxNo should be recalculated after each yield.

        BSONObj js;
        DiskLoc loc;
        while (Runner::RUNNER_ADVANCED == runner->getNext(&js, &loc)) {
            try {
                if ( !dupsAllowed && dropDups ) {
                    LastError::Disabled led( lastError.get() );
                    addKeysToIndex(txn, collection, descriptor, accessMethod, js, loc);
                }
                else {
                    addKeysToIndex(txn, collection, descriptor, accessMethod, js, loc);
                }
            }
            catch( AssertionException& e ) {
                if (ErrorCodes::isInterruption(DBException::convertExceptionCode(e.getCode()))) {
                    txn->checkForInterrupt();
                }

                // TODO: Does exception really imply dropDups exception?
                if (dropDups) {
                    bool runnerEOF = runner->isEOF();
                    runner->saveState();
                    BSONObj toDelete;
                    collection->deleteDocument( txn, loc, false, true, &toDelete );
                    repl::logOp(txn, "d", ns.c_str(), toDelete);

                    if (!runner->restoreState(txn)) {
                        // Runner got killed somehow.  This probably shouldn't happen.
                        if (runnerEOF) {
                            // Quote: "We were already at the end.  Normal.
                            // TODO: Why is this normal?
                        }
                        else {
                            uasserted(ErrorCodes::CursorNotFound, 
                                      "cursor gone during bg index; dropDups");
                        }
                        break;
                    }
                    // We deleted a record, but we didn't actually yield the dblock.
                    // TODO: Why did the old code assume we yielded the lock?
                    numDropped++;
                }
                else {
                    log() << "background addExistingToIndex exception " << e.what() << endl;
                    throw;
                }
            }

            n++;
            progress->hit();

            txn->recoveryUnit()->commitIfNeeded();

            if (canBeKilled) {
                // Checking for interrupt here is necessary because the bg index 
                // interruptors can only interrupt this index build while they hold 
                // a write lock, and yieldAndCheckIfOK only checks for
                // interrupt prior to yielding our write lock. We need to check the kill flag
                // here before another iteration of the loop.
                txn->checkForInterrupt();
            }

            progress->setTotalWhileRunning( collection->numRecords() );
        }

        progress->finished();
        if ( dropDups && numDropped )
            log() << "\t index build dropped: " << numDropped << " dups";
        return n;
    }
UpdateNode::ApplyResult RenameNode::apply(ApplyParams applyParams) const {
    // It would make sense to store fromFieldRef and toFieldRef as members during
    // RenameNode::init(), but FieldRef is not copyable.
    auto fromFieldRef = std::make_shared<FieldRef>(_val.fieldName());
    FieldRef toFieldRef(_val.valueStringData());

    mutablebson::Document& document = applyParams.element.getDocument();

    size_t fromIdxFound;
    mutablebson::Element fromElement(document.end());
    auto status =
        pathsupport::findLongestPrefix(*fromFieldRef, document.root(), &fromIdxFound, &fromElement);

    if (!status.isOK() || !fromElement.ok() || fromIdxFound != (fromFieldRef->numParts() - 1)) {
        // We could safely remove this restriction (thereby treating a rename with a non-viable
        // source path as a no-op), but most updates fail on an attempt to update a non-viable path,
        // so we throw an error for consistency.
        if (status == ErrorCodes::PathNotViable) {
            uassertStatusOK(status);
            MONGO_UNREACHABLE;  // The previous uassertStatusOK should always throw.
        }

        // The element we want to rename does not exist. When that happens, we treat the operation
        // as a no-op. The attempted from/to paths are still considered modified.
        if (applyParams.modifiedPaths) {
            applyParams.modifiedPaths->keepShortest(*fromFieldRef);
            applyParams.modifiedPaths->keepShortest(toFieldRef);
        }
        return ApplyResult::noopResult();
    }

    // Renaming through an array is prohibited. Check that our source path does not contain an
    // array. (The element being renamed may be an array, however.)
    for (auto currentElement = fromElement.parent(); currentElement != document.root();
         currentElement = currentElement.parent()) {
        invariant(currentElement.ok());
        if (BSONType::Array == currentElement.getType()) {
            auto idElem = mutablebson::findFirstChildNamed(document.root(), "_id");
            uasserted(ErrorCodes::BadValue,
                      str::stream() << "The source field cannot be an array element, '"
                                    << fromFieldRef->dottedField()
                                    << "' in doc with "
                                    << (idElem.ok() ? idElem.toString() : "no id")
                                    << " has an array field called '"
                                    << currentElement.getFieldName()
                                    << "'");
        }
    }

    // Check that our destination path does not contain an array. (If the rename will overwrite an
    // existing element, that element may be an array. Iff pathToCreate is empty, "element"
    // represents an element that we are going to overwrite.)
    for (auto currentElement = applyParams.pathToCreate->empty() ? applyParams.element.parent()
                                                                 : applyParams.element;
         currentElement != document.root();
         currentElement = currentElement.parent()) {
        invariant(currentElement.ok());
        if (BSONType::Array == currentElement.getType()) {
            auto idElem = mutablebson::findFirstChildNamed(document.root(), "_id");
            uasserted(ErrorCodes::BadValue,
                      str::stream() << "The destination field cannot be an array element, '"
                                    << toFieldRef.dottedField()
                                    << "' in doc with "
                                    << (idElem.ok() ? idElem.toString() : "no id")
                                    << " has an array field called '"
                                    << currentElement.getFieldName()
                                    << "'");
        }
    }

    // Once we've determined that the rename is valid and found the source element, the actual work
    // gets broken out into a $set operation and an $unset operation. Note that, generally, we
    // should call the init() method of a ModifierNode before calling its apply() method, but the
    // init() methods of SetElementNode and UnsetNode don't do anything, so we can skip them.
    SetElementNode setElement(fromElement);
    auto setElementApplyResult = setElement.apply(applyParams);

    ApplyParams unsetParams(applyParams);
    unsetParams.element = fromElement;
    unsetParams.pathToCreate = std::make_shared<FieldRef>();
    unsetParams.pathTaken = fromFieldRef;

    UnsetNode unsetElement;
    auto unsetElementApplyResult = unsetElement.apply(unsetParams);

    // Determine the final result based on the results of the $set and $unset.
    ApplyResult applyResult;
    applyResult.indexesAffected =
        setElementApplyResult.indexesAffected || unsetElementApplyResult.indexesAffected;

    // The $unset would only be a no-op if the source element did not exist, in which case we would
    // have exited early with a no-op result.
    invariant(!unsetElementApplyResult.noop);

    return applyResult;
}
Esempio n. 9
0
OpMsg OpMsg::parse(const Message& message) try {
    // It is the caller's responsibility to call the correct parser for a given message type.
    invariant(!message.empty());
    invariant(message.operation() == dbMsg);

    const uint32_t flags = OpMsg::flags(message);
    uassert(ErrorCodes::IllegalOpMsgFlag,
            str::stream() << "Message contains illegal flags value: Ob"
                          << std::bitset<32>(flags).to_string(),
            !containsUnknownRequiredFlags(flags));

    constexpr int kCrc32Size = 4;
    const bool haveChecksum = flags & kChecksumPresent;
    const int checksumSize = haveChecksum ? kCrc32Size : 0;

    // The sections begin after the flags and before the checksum (if present).
    BufReader sectionsBuf(message.singleData().data() + sizeof(flags),
                          message.dataSize() - sizeof(flags) - checksumSize);

    // TODO some validation may make more sense in the IDL parser. I've tagged them with comments.
    bool haveBody = false;
    OpMsg msg;
    while (!sectionsBuf.atEof()) {
        const auto sectionKind = sectionsBuf.read<Section>();
        switch (sectionKind) {
            case Section::kBody: {
                uassert(40430, "Multiple body sections in message", !haveBody);
                haveBody = true;
                msg.body = sectionsBuf.read<Validated<BSONObj>>();
                break;
            }

            case Section::kDocSequence: {
                // We use an O(N^2) algorithm here and an O(N*M) algorithm below. These are fastest
                // for the current small values of N, but would be problematic if it is large.
                // If we need more document sequences, raise the limit and use a better algorithm.
                uassert(ErrorCodes::TooManyDocumentSequences,
                        "Too many document sequences in OP_MSG",
                        msg.sequences.size() < 2);  // Limit is <=2 since we are about to add one.

                // The first 4 bytes are the total size, including themselves.
                const auto remainingSize =
                    sectionsBuf.read<LittleEndian<int32_t>>() - sizeof(int32_t);
                BufReader seqBuf(sectionsBuf.skip(remainingSize), remainingSize);
                const auto name = seqBuf.readCStr();
                uassert(40431,
                        str::stream() << "Duplicate document sequence: " << name,
                        !msg.getSequence(name));  // TODO IDL

                msg.sequences.push_back({name.toString()});
                while (!seqBuf.atEof()) {
                    msg.sequences.back().objs.push_back(seqBuf.read<Validated<BSONObj>>());
                }
                break;
            }

            default:
                // Using uint32_t so we append as a decimal number rather than as a char.
                uasserted(40432, str::stream() << "Unknown section kind " << uint32_t(sectionKind));
        }
    }

    uassert(40587, "OP_MSG messages must have a body", haveBody);

    // Detect duplicates between doc sequences and body. TODO IDL
    // Technically this is O(N*M) but N is at most 2.
    for (const auto& docSeq : msg.sequences) {
        const char* name = docSeq.name.c_str();  // Pointer is redirected by next call.
        auto inBody =
            !dotted_path_support::extractElementAtPathOrArrayAlongPath(msg.body, name).eoo();
        uassert(40433,
                str::stream() << "Duplicate field between body and document sequence "
                              << docSeq.name,
                !inBody);
    }

    return msg;
} catch (const DBException& ex) {
    LOG(1) << "invalid message: " << ex.code() << " " << redact(ex) << " -- "
           << redact(hexdump(message.singleData().view2ptr(), message.size()));
    throw;
}
Esempio n. 10
0
    UpdateResult update(const UpdateRequest& request, OpDebug* opDebug, UpdateDriver* driver) {

        LOG(3) << "processing update : " << request;
        const NamespaceString& nsString = request.getNamespaceString();

        validateUpdate( nsString.ns().c_str(), request.getUpdates(), request.getQuery() );

        NamespaceDetails* nsDetails = nsdetails( nsString.ns() );
        NamespaceDetailsTransient* nsDetailsTransient =
            &NamespaceDetailsTransient::get( nsString.ns().c_str() );

        // TODO: This seems a bit circuitious.
        opDebug->updateobj = request.getUpdates();

        driver->refreshIndexKeys( nsDetailsTransient->indexKeys() );

        shared_ptr<Cursor> cursor = getOptimizedCursor(
            nsString.ns(), request.getQuery(), BSONObj(), request.getQueryPlanSelectionPolicy() );

        // If the update was marked with '$isolated' (a.k.a '$atomic'), we are not allowed to
        // yield while evaluating the update loop below.
        //
        // TODO: Old code checks this repeatedly within the update loop. Is that necessary? It seems
        // that once atomic should be always atomic.
        const bool isolated =
            cursor->ok() &&
            cursor->matcher() &&
            cursor->matcher()->docMatcher().atomic();

        // The 'cursor' the optimizer gave us may contain query plans that generate duplicate
        // diskloc's. We set up here the mechanims that will prevent us from processing those
        // twice if we see them. We also set up a 'ClientCursor' so that we can support
        // yielding.
        //
        // TODO: Is it valid to call this on a non-ok cursor?
        const bool dedupHere = cursor->autoDedup();

        //
        // We'll start assuming we have one or more documents for this update. (Othwerwise,
        // we'll fallback to upserting.)
        //

        // We record that this will not be an upsert, in case a mod doesn't want to be applied
        // when in strict update mode.
        driver->setContext( ModifierInterface::ExecInfo::UPDATE_CONTEXT );

        // Let's fetch each of them and pipe them through the update expression, making sure to
        // keep track of the necessary stats. Recall that we'll be pulling documents out of
        // cursors and some of them do not deduplicate the entries they generate. We have
        // deduping logic in here, too -- for now.
        unordered_set<DiskLoc, DiskLoc::Hasher> seenLocs;
        int numMatched = 0;

        // Reset these counters on each call. We might re-enter this function to retry this
        // update if we throw a page fault exception below, and we rely on these counters
        // reflecting only the actions taken locally. In particlar, we must have the no-op
        // counter reset so that we can meaningfully comapre it with numMatched above.
        opDebug->nscanned = 0;
        opDebug->nupdateNoops = 0;

        Client& client = cc();

        mutablebson::Document doc;
        mutablebson::DamageVector damages;

        // If we are going to be yielding, we will need a ClientCursor scoped to this loop. We
        // only loop as long as the underlying cursor is OK.
        for ( auto_ptr<ClientCursor> clientCursor; cursor->ok(); ) {

            // If we haven't constructed a ClientCursor, and if the client allows us to throw
            // page faults, and if we are referring to a location that is likely not in
            // physical memory, then throw a PageFaultException. The entire operation will be
            // restarted.
            if ( clientCursor.get() == NULL &&
                 client.allowedToThrowPageFaultException() &&
                 !cursor->currLoc().isNull() &&
                 !cursor->currLoc().rec()->likelyInPhysicalMemory() ) {

                // We should never throw a PFE if we have already updated items. The numMatched
                // variable includes no-ops, which do not prevent us from raising a PFE, so if
                // numMatched is non-zero, we are still OK to throw as long all matched items
                // resulted in a no-op.
                dassert((numMatched == 0) || (numMatched == opDebug->nupdateNoops));

                throw PageFaultException( cursor->currLoc().rec() );
            }

            if ( !isolated && opDebug->nscanned != 0 ) {

                // We are permitted to yield. To do so we need a ClientCursor, so create one
                // now if we have not yet done so.
                if ( !clientCursor.get() )
                    clientCursor.reset(
                        new ClientCursor( QueryOption_NoCursorTimeout, cursor, nsString.ns() ) );

                // Ask the client cursor to yield. We get two bits of state back: whether or not
                // we yielded, and whether or not we correctly recovered from yielding.
                bool yielded = false;
                const bool recovered = clientCursor->yieldSometimes(
                    ClientCursor::WillNeed, &yielded );

                if ( !recovered ) {
                    // If we failed to recover from the yield, then the ClientCursor is already
                    // gone. Release it so we don't destroy it a second time.
                    clientCursor.release();
                    break;
                }

                if ( !cursor->ok() ) {
                    // If the cursor died while we were yielded, just get out of the update loop.
                    break;
                }

                if ( yielded ) {
                    // We yielded and recovered OK, and our cursor is still good. Details about
                    // our namespace may have changed while we were yielded, so we re-acquire
                    // them here. If we can't do so, escape the update loop. Otherwise, refresh
                    // the driver so that it knows about what is currently indexed.
                    nsDetails = nsdetails( nsString.ns() );
                    if ( !nsDetails )
                        break;
                    nsDetailsTransient = &NamespaceDetailsTransient::get( nsString.ns().c_str() );

                    // TODO: This copies the index keys, but it may not need to do so.
                    driver->refreshIndexKeys( nsDetailsTransient->indexKeys() );
                }

            }

            // Let's fetch the next candidate object for this update.
            Record* record = cursor->_current();
            DiskLoc loc = cursor->currLoc();
            const BSONObj oldObj = loc.obj();

            // We count how many documents we scanned even though we may skip those that are
            // deemed duplicated. The final 'numUpdated' and 'nscanned' numbers may differ for
            // that reason.
            opDebug->nscanned++;

            // Skips this document if it:
            // a) doesn't match the query portion of the update
            // b) was deemed duplicate by the underlying cursor machinery
            //
            // Now, if we are going to update the document,
            // c) we don't want to do so while the cursor is at it, as that may invalidate
            // the cursor. So, we advance to next document, before issuing the update.
            MatchDetails matchDetails;
            matchDetails.requestElemMatchKey();
            if ( !cursor->currentMatches( &matchDetails ) ) {
                // a)
                cursor->advance();
                continue;
            }
            else if ( cursor->getsetdup( loc ) && dedupHere ) {
                // b)
                cursor->advance();
                continue;
            }
            else if (!driver->isDocReplacement() && request.isMulti()) {
                // c)
                cursor->advance();
                if ( dedupHere ) {
                    if ( seenLocs.count( loc ) ) {
                        continue;
                    }
                }

                // There are certain kind of cursors that hold multiple pointers to data
                // underneath. $or cursors is one example. In a $or cursor, it may be the case
                // that when we did the last advance(), we finished consuming documents from
                // one of $or child and started consuming the next one. In that case, it is
                // possible that the last document of the previous child is the same as the
                // first document of the next (see SERVER-5198 and jstests/orp.js).
                //
                // So we advance the cursor here until we see a new diskloc.
                //
                // Note that we won't be yielding, and we may not do so for a while if we find
                // a particularly duplicated sequence of loc's. That is highly unlikely,
                // though.  (See SERVER-5725, if curious, but "stage" based $or will make that
                // ticket moot).
                while( cursor->ok() && loc == cursor->currLoc() ) {
                    cursor->advance();
                }
            }

            // For some (unfortunate) historical reasons, not all cursors would be valid after
            // a write simply because we advanced them to a document not affected by the write.
            // To protect in those cases, not only we engaged in the advance() logic above, but
            // we also tell the cursor we're about to write a document that we've just seen.
            // prepareToTouchEarlierIterate() requires calling later
            // recoverFromTouchingEarlierIterate(), so we make a note here to do so.
            bool touchPreviousDoc = request.isMulti() && cursor->ok();
            if ( touchPreviousDoc ) {
                if ( clientCursor.get() )
                    clientCursor->setDoingDeletes( true );
                cursor->prepareToTouchEarlierIterate();
            }

            // Found a matching document
            numMatched++;

            // Ask the driver to apply the mods. It may be that the driver can apply those "in
            // place", that is, some values of the old document just get adjusted without any
            // change to the binary layout on the bson layer. It may be that a whole new
            // document is needed to accomodate the new bson layout of the resulting document.
            doc.reset( oldObj, mutablebson::Document::kInPlaceEnabled );
            BSONObj logObj;

            // If there was a matched field, obtain it.
            string matchedField;
            if (matchDetails.hasElemMatchKey())
                matchedField = matchDetails.elemMatchKey();

            Status status = driver->update( matchedField, &doc, &logObj );
            if ( !status.isOK() ) {
                uasserted( 16837, status.reason() );
            }

            // If the driver applied the mods in place, we can ask the mutable for what
            // changed. We call those changes "damages". :) We use the damages to inform the
            // journal what was changed, and then apply them to the original document
            // ourselves. If, however, the driver applied the mods out of place, we ask it to
            // generate a new, modified document for us. In that case, the file manager will
            // take care of the journaling details for us.
            //
            // This code flow is admittedly odd. But, right now, journaling is baked in the file
            // manager. And if we aren't using the file manager, we have to do jounaling
            // ourselves.
            bool objectWasChanged = false;
            BSONObj newObj;
            const char* source = NULL;
            bool inPlace = doc.getInPlaceUpdates(&damages, &source);
            if ( inPlace && !driver->modsAffectIndices() ) {
                // If a set of modifiers were all no-ops, we are still 'in place', but there is
                // no work to do, in which case we want to consider the object unchanged.
                if (!damages.empty() ) {
                    nsDetails->paddingFits();

                    // All updates were in place. Apply them via durability and writing pointer.
                    mutablebson::DamageVector::const_iterator where = damages.begin();
                    const mutablebson::DamageVector::const_iterator end = damages.end();
                    for( ; where != end; ++where ) {
                        const char* sourcePtr = source + where->sourceOffset;
                        void* targetPtr = getDur().writingPtr(
                            const_cast<char*>(oldObj.objdata()) + where->targetOffset,
                            where->size);
                        std::memcpy(targetPtr, sourcePtr, where->size);
                    }
                    objectWasChanged = true;
                    opDebug->fastmod = true;
                }
                newObj = oldObj;
            }
            else {

                // The updates were not in place. Apply them through the file manager.
                newObj = doc.getObject();
                DiskLoc newLoc = theDataFileMgr.updateRecord(nsString.ns().c_str(),
                                                             nsDetails,
                                                             nsDetailsTransient,
                                                             record,
                                                             loc,
                                                             newObj.objdata(),
                                                             newObj.objsize(),
                                                             *opDebug);

                // If we've moved this object to a new location, make sure we don't apply
                // that update again if our traversal picks the objecta again.
                //
                // We also take note that the diskloc if the updates are affecting indices.
                // Chances are that we're traversing one of them and they may be multi key and
                // therefore duplicate disklocs.
                if ( newLoc != loc || driver->modsAffectIndices()  ) {
                    seenLocs.insert( newLoc );
                }

                objectWasChanged = true;
            }

            // Log Obj
            if ( request.shouldUpdateOpLog() ) {
                if ( driver->isDocReplacement() || !logObj.isEmpty() ) {
                    BSONObj idQuery = driver->makeOplogEntryQuery(newObj, request.isMulti());
                    logOp("u", nsString.ns().c_str(), logObj , &idQuery,
                          NULL, request.isFromMigration(), &newObj);
                }
            }

            // If it was noop since the document didn't change, record that.
            if (!objectWasChanged)
                opDebug->nupdateNoops++;

            if (!request.isMulti()) {
                break;
            }

            // If we used the cursor mechanism that prepares an earlier seen document for a
            // write we need to tell such mechanisms that the write is over.
            if ( touchPreviousDoc ) {
                cursor->recoverFromTouchingEarlierIterate();
            }

            getDur().commitIfNeeded();

        }

        // TODO: Can this be simplified?
        if ((numMatched > 0) || (numMatched == 0 && !request.isUpsert()) ) {
            opDebug->nupdated = numMatched;
            return UpdateResult( numMatched > 0 /* updated existing object(s) */,
                                 !driver->isDocReplacement() /* $mod or obj replacement */,
                                 numMatched /* # of docments update, even no-ops */,
                                 BSONObj() );
        }

        //
        // We haven't found any existing document so an insert is done
        // (upsert is true).
        //
        opDebug->upsert = true;

        // Since this is an insert (no docs found and upsert:true), we will be logging it
        // as an insert in the oplog. We don't need the driver's help to build the
        // oplog record, then. We also set the context of the update driver to the INSERT_CONTEXT.
        // Some mods may only work in that context (e.g. $setOnInsert).
        driver->setLogOp( false );
        driver->setContext( ModifierInterface::ExecInfo::INSERT_CONTEXT );

        BSONObj baseObj;

        // Reset the document we will be writing to
        doc.reset( baseObj, mutablebson::Document::kInPlaceDisabled );
        if ( request.getQuery().hasElement("_id") ) {
            uassertStatusOK(doc.root().appendElement(request.getQuery().getField("_id")));
        }


        // If this is a $mod base update, we need to generate a document by examining the
        // query and the mods. Otherwise, we can use the object replacement sent by the user
        // update command that was parsed by the driver before.
        // In the following block we handle the query part, and then do the regular mods after.
        if ( *request.getUpdates().firstElementFieldName() == '$' ) {
            uassertStatusOK(UpdateDriver::createFromQuery(request.getQuery(), doc));
            opDebug->fastmodinsert = true;
        }

        // Apply the update modifications and then log the update as an insert manually.
        Status status = driver->update( StringData(), &doc, NULL /* no oplog record */);
        if ( !status.isOK() ) {
            uasserted( 16836, status.reason() );
        }

        BSONObj newObj = doc.getObject();
        theDataFileMgr.insertWithObjMod( nsString.ns().c_str(), newObj, false, request.isGod() );
        if ( request.shouldUpdateOpLog() ) {
            logOp( "i", nsString.ns().c_str(), newObj,
                   NULL, NULL, request.isFromMigration(), &newObj );
        }

        opDebug->nupdated = 1;
        return UpdateResult( false /* updated a non existing document */,
                             !driver->isDocReplacement() /* $mod or obj replacement? */,
                             1 /* count of updated documents */,
                             newObj /* object that was upserted */ );
    }
Esempio n. 11
0
        void _insert( Request& r , DbMessage& d, ChunkManagerPtr manager ) {
            const int flags = d.reservedField() | InsertOption_ContinueOnError; // ContinueOnError is always on when using sharding.
            map<ChunkPtr, vector<BSONObj> > insertsForChunk; // Group bulk insert for appropriate shards
            try {
                while ( d.moreJSObjs() ) {
                    BSONObj o = d.nextJsObj();
                    if ( ! manager->hasShardKey( o ) ) {

                        bool bad = true;

                        if ( manager->getShardKey().partOfShardKey( "_id" ) ) {
                            BSONObjBuilder b;
                            b.appendOID( "_id" , 0 , true );
                            b.appendElements( o );
                            o = b.obj();
                            bad = ! manager->hasShardKey( o );
                        }

                        if ( bad ) {
                            log() << "tried to insert object with no valid shard key: " << r.getns() << "  " << o << endl;
                            uasserted( 8011 , "tried to insert object with no valid shard key" );
                        }

                    }

                    // Many operations benefit from having the shard key early in the object
                    o = manager->getShardKey().moveToFront(o);
                    insertsForChunk[manager->findChunk(o)].push_back(o);
                }
                for (map<ChunkPtr, vector<BSONObj> >::iterator it = insertsForChunk.begin(); it != insertsForChunk.end(); ++it) {
                    ChunkPtr c = it->first;
                    vector<BSONObj> objs = it->second;
                    const int maxTries = 30;

                    bool gotThrough = false;
                    for ( int i=0; i<maxTries; i++ ) {
                        try {
                            LOG(4) << "  server:" << c->getShard().toString() << " bulk insert " << objs.size() << " documents" << endl;
                            insert( c->getShard() , r.getns() , objs , flags);

                            int bytesWritten = 0;
                            for (vector<BSONObj>::iterator vecIt = objs.begin(); vecIt != objs.end(); ++vecIt) {
                                r.gotInsert(); // Record the correct number of individual inserts
                                bytesWritten += (*vecIt).objsize();
                            }
                            if ( r.getClientInfo()->autoSplitOk() )
                                c->splitIfShould( bytesWritten );
                            gotThrough = true;
                            break;
                        }
                        catch ( StaleConfigException& e ) {
                            int logLevel = i < ( maxTries / 2 );
                            LOG( logLevel ) << "retrying bulk insert of " << objs.size() << " documents because of StaleConfigException: " << e << endl;
                            r.reset();

                            manager = r.getChunkManager();
                            if( ! manager ) {
                                uasserted(14804, "collection no longer sharded");
                            }

                            unsigned long long old = manager->getSequenceNumber();
                            
                            LOG( logLevel ) << "  sequence number - old: " << old << " new: " << manager->getSequenceNumber() << endl;
                        }
                        sleepmillis( i * 20 );
                    }

                    assert( inShutdown() || gotThrough ); // not caught below
                }
            } catch (const UserException&){
                if (!d.moreJSObjs()){
                    throw;
                }
                // Ignore and keep going. ContinueOnError is implied with sharding.
            }
        }
MigrationSourceManager::MigrationSourceManager(OperationContext* txn, MoveChunkRequest request)
    : _args(std::move(request)), _startTime() {
    invariant(!txn->lockState()->isLocked());

    const auto& oss = OperationShardingState::get(txn);
    if (!oss.hasShardVersion()) {
        uasserted(ErrorCodes::InvalidOptions, "collection version is missing");
    }

    // Even though the moveChunk command transmits a value in the operation's shardVersion field,
    // this value does not actually contain the shard version, but the global collection version.
    const ChunkVersion expectedCollectionVersion = oss.getShardVersion(_args.getNss());

    log() << "Starting chunk migration for "
          << ChunkRange(_args.getMinKey(), _args.getMaxKey()).toString()
          << " with expected collection version " << expectedCollectionVersion;

    // Now that the collection is locked, snapshot the metadata and fetch the latest versions
    ShardingState* const shardingState = ShardingState::get(txn);

    ChunkVersion shardVersion;

    Status refreshStatus =
        shardingState->refreshMetadataNow(txn, _args.getNss().ns(), &shardVersion);
    if (!refreshStatus.isOK()) {
        uasserted(refreshStatus.code(),
                  str::stream() << "cannot start migrate of chunk "
                                << ChunkRange(_args.getMinKey(), _args.getMaxKey()).toString()
                                << " due to "
                                << refreshStatus.toString());
    }

    if (shardVersion.majorVersion() == 0) {
        // If the major version is zero, this means we do not have any chunks locally to migrate in
        // the first place
        uasserted(ErrorCodes::IncompatibleShardingMetadata,
                  str::stream() << "cannot start migrate of chunk "
                                << ChunkRange(_args.getMinKey(), _args.getMaxKey()).toString()
                                << " with zero shard version");
    }

    // Snapshot the committed metadata from the time the migration starts
    {
        ScopedTransaction scopedXact(txn, MODE_IS);
        AutoGetCollection autoColl(txn, _args.getNss(), MODE_IS);

        auto css = CollectionShardingState::get(txn, _args.getNss());
        _committedMetadata = css->getMetadata();
    }

    const ChunkVersion collectionVersion = _committedMetadata->getCollVersion();

    if (expectedCollectionVersion.epoch() != collectionVersion.epoch()) {
        throw SendStaleConfigException(
            _args.getNss().ns(),
            str::stream() << "cannot move chunk "
                          << ChunkRange(_args.getMinKey(), _args.getMaxKey()).toString()
                          << " because collection may have been dropped. "
                          << "current epoch: "
                          << collectionVersion.epoch()
                          << ", cmd epoch: "
                          << expectedCollectionVersion.epoch(),
            expectedCollectionVersion,
            collectionVersion);
    }

    // With nonzero shard version, we must have a coll version >= our shard version
    invariant(collectionVersion >= shardVersion);

    // With nonzero shard version, we must have a shard key
    invariant(!_committedMetadata->getKeyPattern().isEmpty());

    ChunkType origChunk;
    if (!_committedMetadata->getNextChunk(_args.getMinKey(), &origChunk)) {
        // If this assertion is hit, it means that whoever called the shard moveChunk command
        // (mongos or the CSRS balancer) did not check whether the chunk actually belongs to this
        // shard. It is a benign error and does not indicate data corruption.
        uasserted(40145,
                  str::stream() << "Chunk with bounds "
                                << ChunkRange(_args.getMinKey(), _args.getMaxKey()).toString()
                                << " is not owned by this shard.");
    }

    uassert(40146,
            str::stream() << "Unable to find chunk with the exact bounds "
                          << ChunkRange(_args.getMinKey(), _args.getMaxKey()).toString()
                          << " at collection version "
                          << collectionVersion.toString()
                          << ". This indicates corrupted metadata.",
            origChunk.getMin().woCompare(_args.getMinKey()) == 0 &&
                origChunk.getMax().woCompare(_args.getMaxKey()) == 0);
}
Esempio n. 13
0
UpdateNode::ApplyResult UpdateArrayNode::apply(ApplyParams applyParams) const {
    if (!applyParams.pathToCreate->empty()) {
        for (size_t i = 0; i < applyParams.pathToCreate->numParts(); ++i) {
            applyParams.pathTaken->appendPart(applyParams.pathToCreate->getPart(i));
        }
        uasserted(ErrorCodes::BadValue,
                  str::stream() << "The path '" << applyParams.pathTaken->dottedField()
                                << "' must exist in the document in order to apply array updates.");
    }

    uassert(ErrorCodes::BadValue,
            str::stream() << "Cannot apply array updates to non-array element "
                          << applyParams.element.toString(),
            applyParams.element.getType() == BSONType::Array);

    // Construct a map from the array index to the set of updates that should be applied to the
    // array element at that index. We do not apply the updates yet because we need to know how many
    // array elements will be updated in order to know whether to pass 'logBuilder' on to the
    // UpdateNode children.
    std::map<size_t, std::vector<UpdateNode*>> matchingElements;
    size_t i = 0;
    for (auto childElement = applyParams.element.leftChild(); childElement.ok();
         childElement = childElement.rightSibling()) {

        // 'childElement' will always be serialized because no updates have been performed on the
        // array yet, and when we populate an upserted document with equality fields from the query,
        // arrays can only be added in entirety.
        invariant(childElement.hasValue());
        auto arrayElement = childElement.getValue();

        for (const auto& update : _children) {
            if (update.first.empty()) {

                // If the identifier is the empty string (e.g. came from 'a.$[].b'), the update
                // should be applied to all array elements.
                matchingElements[i].push_back(update.second.get());
            } else {
                auto filter = _arrayFilters.find(update.first);
                invariant(filter != _arrayFilters.end());
                if (filter->second->matchesBSONElement(arrayElement)) {
                    matchingElements[i].push_back(update.second.get());
                }
            }
        }

        ++i;
    }

    // If at most one array element will be updated, pass 'logBuilder' to the UpdateNode child when
    // applying it to that element.
    const bool childrenShouldLogThemselves = matchingElements.size() <= 1;

    // Keep track of which array elements were actually modified (non-noop updates) for logging
    // purposes. We only need to keep track of one element, since if more than one element is
    // modified, we log the whole array.
    boost::optional<mutablebson::Element> modifiedElement;
    size_t nModified = 0;

    // Update array elements.
    auto applyResult = ApplyResult::noopResult();
    i = 0;
    for (auto childElement = applyParams.element.leftChild(); childElement.ok();
         childElement = childElement.rightSibling()) {
        auto updates = matchingElements.find(i);
        if (updates != matchingElements.end()) {

            // Merge all of the updates for this array element.
            invariant(updates->second.size() > 0);
            auto mergedChild = updates->second[0];
            FieldRefTempAppend tempAppend(*(applyParams.pathTaken), childElement.getFieldName());
            for (size_t j = 1; j < updates->second.size(); ++j) {

                // Use the cached merge result, if it is available.
                const auto& cachedResult = _mergedChildrenCache[mergedChild][updates->second[j]];
                if (cachedResult.get()) {
                    mergedChild = cachedResult.get();
                    continue;
                }

                // The cached merge result is not available, so perform the merge and cache the
                // result.
                _mergedChildrenCache[mergedChild][updates->second[j]] =
                    UpdateNode::createUpdateNodeByMerging(
                        *mergedChild, *updates->second[j], applyParams.pathTaken.get());
                mergedChild = _mergedChildrenCache[mergedChild][updates->second[j]].get();
            }

            auto childApplyParams = applyParams;
            childApplyParams.element = childElement;
            if (!childrenShouldLogThemselves) {
                childApplyParams.logBuilder = nullptr;
            }

            auto childApplyResult = mergedChild->apply(childApplyParams);

            applyResult.indexesAffected =
                applyResult.indexesAffected || childApplyResult.indexesAffected;
            applyResult.noop = applyResult.noop && childApplyResult.noop;
            if (!childApplyResult.noop) {
                modifiedElement = childElement;
                ++nModified;
            }
        }

        ++i;
    }

    // If no elements match the array filter, report the path to the array itself as modified.
    if (applyParams.modifiedPaths && matchingElements.size() == 0) {
        applyParams.modifiedPaths->keepShortest(*applyParams.pathTaken);
    }

    // If the child updates have not been logged, log the updated array elements.
    if (!childrenShouldLogThemselves && applyParams.logBuilder) {
        if (nModified > 1) {

            // Log the entire array.
            auto logElement = applyParams.logBuilder->getDocument().makeElementWithNewFieldName(
                applyParams.pathTaken->dottedField(), applyParams.element);
            invariant(logElement.ok());
            uassertStatusOK(applyParams.logBuilder->addToSets(logElement));
        } else if (nModified == 1) {

            // Log the modified array element.
            invariant(modifiedElement);
            FieldRefTempAppend tempAppend(*(applyParams.pathTaken),
                                          modifiedElement->getFieldName());
            auto logElement = applyParams.logBuilder->getDocument().makeElementWithNewFieldName(
                applyParams.pathTaken->dottedField(), *modifiedElement);
            invariant(logElement.ok());
            uassertStatusOK(applyParams.logBuilder->addToSets(logElement));
        }
    }

    return applyResult;
}
Esempio n. 14
0
static void assertParallelArrays(const char* first, const char* second) {
    std::stringstream ss;
    ss << "cannot index parallel arrays [" << first << "] [" << second << "]";
    uasserted(ErrorCodes::CannotIndexParallelArrays, ss.str());
}
Esempio n. 15
0
    std::string runQuery(OperationContext* txn,
                         QueryMessage& q,
                         const NamespaceString& nss,
                         CurOp& curop,
                         Message &result) {
        // Validate the namespace.
        uassert(16256, str::stream() << "Invalid ns [" << nss.ns() << "]", nss.isValid());
        invariant(!nss.isCommand());

        // Set curop information.
        beginQueryOp(nss, q.query, q.ntoreturn, q.ntoskip, &curop);

        // Parse the qm into a CanonicalQuery.
        std::auto_ptr<CanonicalQuery> cq;
        {
            CanonicalQuery* cqRaw;
            Status canonStatus = CanonicalQuery::canonicalize(q,
                                                              &cqRaw,
                                                              WhereCallbackReal(txn, nss.db()));
            if (!canonStatus.isOK()) {
                uasserted(17287, str::stream() << "Can't canonicalize query: "
                                               << canonStatus.toString());
            }
            cq.reset(cqRaw);
        }
        invariant(cq.get());

        LOG(5) << "Running query:\n" << cq->toString();
        LOG(2) << "Running query: " << cq->toStringShort();

        // Parse, canonicalize, plan, transcribe, and get a plan executor.
        AutoGetCollectionForRead ctx(txn, nss);
        Collection* collection = ctx.getCollection();

        const int dbProfilingLevel = ctx.getDb() ? ctx.getDb()->getProfilingLevel() :
                                                   serverGlobalParams.defaultProfile;

        // We have a parsed query. Time to get the execution plan for it.
        std::unique_ptr<PlanExecutor> exec;
        {
            PlanExecutor* rawExec;
            Status execStatus = getExecutorFind(txn,
                                                collection,
                                                nss,
                                                cq.release(),
                                                PlanExecutor::YIELD_AUTO,
                                                &rawExec);
            uassertStatusOK(execStatus);
            exec.reset(rawExec);
        }
        const LiteParsedQuery& pq = exec->getCanonicalQuery()->getParsed();

        // If it's actually an explain, do the explain and return rather than falling through
        // to the normal query execution loop.
        if (pq.isExplain()) {
            BufBuilder bb;
            bb.skip(sizeof(QueryResult::Value));

            BSONObjBuilder explainBob;
            Explain::explainStages(exec.get(), ExplainCommon::EXEC_ALL_PLANS, &explainBob);

            // Add the resulting object to the return buffer.
            BSONObj explainObj = explainBob.obj();
            bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize());

            // TODO: Does this get overwritten/do we really need to set this twice?
            curop.debug().query = q.query;

            // Set query result fields.
            QueryResult::View qr = bb.buf();
            bb.decouple();
            qr.setResultFlagsToOk();
            qr.msgdata().setLen(bb.len());
            curop.debug().responseLength = bb.len();
            qr.msgdata().setOperation(opReply);
            qr.setCursorId(0);
            qr.setStartingFrom(0);
            qr.setNReturned(1);
            result.setData(qr.view2ptr(), true);
            return "";
        }

        // We freak out later if this changes before we're done with the query.
        const ChunkVersion shardingVersionAtStart = shardingState.getVersion(nss.ns());

        // Handle query option $maxTimeMS (not used with commands).
        curop.setMaxTimeMicros(static_cast<unsigned long long>(pq.getMaxTimeMS()) * 1000);
        txn->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

        // uassert if we are not on a primary, and not a secondary with SlaveOk query parameter set.
        bool slaveOK = pq.isSlaveOk() || pq.hasReadPref();
        Status serveReadsStatus = repl::getGlobalReplicationCoordinator()->checkCanServeReadsFor(
                txn,
                nss,
                slaveOK);
        uassertStatusOK(serveReadsStatus);

        // Run the query.
        // bb is used to hold query results
        // this buffer should contain either requested documents per query or
        // explain information, but not both
        BufBuilder bb(32768);
        bb.skip(sizeof(QueryResult::Value));

        // How many results have we obtained from the executor?
        int numResults = 0;

        // If we're replaying the oplog, we save the last time that we read.
        Timestamp slaveReadTill;

        BSONObj obj;
        PlanExecutor::ExecState state;
        // uint64_t numMisplacedDocs = 0;

        // Get summary info about which plan the executor is using.
        curop.debug().planSummary = Explain::getPlanSummary(exec.get());

        while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) {
            // Add result to output buffer.
            bb.appendBuf((void*)obj.objdata(), obj.objsize());

            // Count the result.
            ++numResults;

            // Possibly note slave's position in the oplog.
            if (pq.isOplogReplay()) {
                BSONElement e = obj["ts"];
                if (Date == e.type() || bsonTimestamp == e.type()) {
                    slaveReadTill = e.timestamp();
                }
            }

            if (enoughForFirstBatch(pq, numResults, bb.len())) {
                LOG(5) << "Enough for first batch, wantMore=" << pq.wantMore()
                       << " numToReturn=" << pq.getNumToReturn()
                       << " numResults=" << numResults
                       << endl;
                break;
            }
        }

        // If we cache the executor later, we want to deregister it as it receives notifications
        // anyway by virtue of being cached.
        //
        // If we don't cache the executor later, we are deleting it, so it must be deregistered.
        //
        // So, no matter what, deregister the executor.
        exec->deregisterExec();

        // Caller expects exceptions thrown in certain cases.
        if (PlanExecutor::FAILURE == state) {
            scoped_ptr<PlanStageStats> stats(exec->getStats());
            error() << "Plan executor error, stats: "
                    << Explain::statsToBSON(*stats);
            uasserted(17144, "Executor error: " + WorkingSetCommon::toStatusString(obj));
        }

        // TODO: Currently, chunk ranges are kept around until all ClientCursors created while the
        // chunk belonged on this node are gone. Separating chunk lifetime management from
        // ClientCursor should allow this check to go away.
        if (!shardingState.getVersion(nss.ns()).isWriteCompatibleWith(shardingVersionAtStart)) {
            // if the version changed during the query we might be missing some data and its safe to
            // send this as mongos can resend at this point
            throw SendStaleConfigException(nss.ns(), "version changed during initial query",
                                           shardingVersionAtStart,
                                           shardingState.getVersion(nss.ns()));
        }

        // Fill out curop based on query results. If we have a cursorid, we will fill out curop with
        // this cursorid later.
        long long ccId = 0;

        if (shouldSaveCursor(txn, collection, state, exec.get())) {
            // We won't use the executor until it's getMore'd.
            exec->saveState();

            // Allocate a new ClientCursor.  We don't have to worry about leaking it as it's
            // inserted into a global map by its ctor.
            ClientCursor* cc = new ClientCursor(collection->getCursorManager(),
                                                exec.release(),
                                                nss.ns(),
                                                pq.getOptions(),
                                                pq.getFilter());
            ccId = cc->cursorid();

            if (txn->getClient()->isInDirectClient()) {
                cc->setUnownedRecoveryUnit(txn->recoveryUnit());
            }
            else if (state == PlanExecutor::IS_EOF && pq.isTailable()) {
                // Don't stash the RU for tailable cursors at EOF, let them get a new RU on their
                // next getMore.
            }
            else {
                // We stash away the RecoveryUnit in the ClientCursor.  It's used for subsequent
                // getMore requests.  The calling OpCtx gets a fresh RecoveryUnit.
                txn->recoveryUnit()->abandonSnapshot();
                cc->setOwnedRecoveryUnit(txn->releaseRecoveryUnit());
                StorageEngine* storageEngine = getGlobalServiceContext()->getGlobalStorageEngine();
                invariant(txn->setRecoveryUnit(storageEngine->newRecoveryUnit(),
                                               OperationContext::kNotInUnitOfWork)
                          == OperationContext::kNotInUnitOfWork);
            }

            LOG(5) << "caching executor with cursorid " << ccId
                   << " after returning " << numResults << " results" << endl;

            // TODO document
            if (pq.isOplogReplay() && !slaveReadTill.isNull()) {
                cc->slaveReadTill(slaveReadTill);
            }

            // TODO document
            if (pq.isExhaust()) {
                curop.debug().exhaust = true;
            }

            cc->setPos(numResults);

            // If the query had a time limit, remaining time is "rolled over" to the cursor (for
            // use by future getmore ops).
            cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros());

            endQueryOp(cc->getExecutor(), dbProfilingLevel, numResults, ccId, &curop);
        }
        else {
            LOG(5) << "Not caching executor but returning " << numResults << " results.\n";
            endQueryOp(exec.get(), dbProfilingLevel, numResults, ccId, &curop);
        }

        // Add the results from the query into the output buffer.
        result.appendData(bb.buf(), bb.len());
        bb.decouple();

        // Fill out the output buffer's header.
        QueryResult::View qr = result.header().view2ptr();
        qr.setCursorId(ccId);
        qr.setResultFlagsToOk();
        qr.msgdata().setOperation(opReply);
        qr.setStartingFrom(0);
        qr.setNReturned(numResults);

        // curop.debug().exhaust is set above.
        return curop.debug().exhaust ? nss.ns() : "";
    }
Esempio n. 16
0
    long long DeleteExecutor::execute(OperationContext* txn, Database* db) {
        uassertStatusOK(prepare());
        uassert(17417,
                mongoutils::str::stream() <<
                "DeleteExecutor::prepare() failed to parse query " << _request->getQuery(),
                _isQueryParsed);
        const bool logop = _request->shouldCallLogOp();
        const NamespaceString& ns(_request->getNamespaceString());
        if (!_request->isGod()) {
            if (ns.isSystem()) {
                uassert(12050,
                        "cannot delete from system namespace",
                        legalClientSystemNS(ns.ns(), true));
            }
            if (ns.ns().find('$') != string::npos) {
                log() << "cannot delete from collection with reserved $ in name: " << ns << endl;
                uasserted( 10100, "cannot delete from collection with reserved $ in name" );
            }
        }

        Collection* collection = db->getCollection(txn, ns.ns());
        if (NULL == collection) {
            return 0;
        }

        uassert(10101,
                str::stream() << "cannot remove from a capped collection: " << ns.ns(),
                !collection->isCapped());

        uassert(ErrorCodes::NotMaster,
                str::stream() << "Not primary while removing from " << ns.ns(),
                !logop || repl::isMasterNs(ns.ns().c_str()));

        long long nDeleted = 0;

        Runner* rawRunner;
        if (_canonicalQuery.get()) {
            uassertStatusOK(getRunner(collection, _canonicalQuery.release(), &rawRunner));
        }
        else {
            CanonicalQuery* ignored;
            uassertStatusOK(getRunner(collection,
                                      ns.ns(),
                                      _request->getQuery(),
                                      &rawRunner,
                                      &ignored));
        }

        auto_ptr<Runner> runner(rawRunner);
        ScopedRunnerRegistration safety(runner.get());

        DiskLoc rloc;
        Runner::RunnerState state;
        CurOp* curOp = txn->getCurOp();
        int oldYieldCount = curOp->numYields();
        while (Runner::RUNNER_ADVANCED == (state = runner->getNext(NULL, &rloc))) {
            if (oldYieldCount != curOp->numYields()) {
                uassert(ErrorCodes::NotMaster,
                        str::stream() << "No longer primary while removing from " << ns.ns(),
                        !logop || repl::isMasterNs(ns.ns().c_str()));
                oldYieldCount = curOp->numYields();
            }
            BSONObj toDelete;

            // TODO: do we want to buffer docs and delete them in a group rather than
            // saving/restoring state repeatedly?
            runner->saveState();
            collection->deleteDocument(txn, rloc, false, false, logop ? &toDelete : NULL );
            runner->restoreState(txn);

            nDeleted++;

            if (logop) {
                if ( toDelete.isEmpty() ) {
                    log() << "Deleted object without id in collection " << collection->ns()
                          << ", not logging.";
                }
                else {
                    bool replJustOne = true;
                    repl::logOp(txn, "d", ns.ns().c_str(), toDelete, 0, &replJustOne);
                }
            }

            if (!_request->isMulti()) {
                break;
            }

            if (!_request->isGod()) {
                txn->recoveryUnit()->commitIfNeeded();
            }

            if (debug && _request->isGod() && nDeleted == 100) {
                log() << "warning high number of deletes with god=true "
                      << " which could use significant memory b/c we don't commit journal";
            }
        }

        return nDeleted;
    }
Esempio n. 17
0
        /**
         * actually applies a reduce, to a list of tuples (key, value).
         * After the call, tuples will hold a single tuple {"0": key, "1": value}
         */
        void JSReducer::_reduce( const BSONList& tuples , BSONObj& key , int& endSizeEstimate ) {
            uassert( 10074 ,  "need values" , tuples.size() );

            int sizeEstimate = ( tuples.size() * tuples.begin()->getField( "value" ).size() ) + 128;

            // need to build the reduce args: ( key, [values] )
            BSONObjBuilder reduceArgs( sizeEstimate );
            boost::scoped_ptr<BSONArrayBuilder>  valueBuilder;
            int sizeSoFar = 0;
            unsigned n = 0;
            for ( ; n<tuples.size(); n++ ) {
                BSONObjIterator j(tuples[n]);
                BSONElement keyE = j.next();
                if ( n == 0 ) {
                    reduceArgs.append( keyE );
                    key = keyE.wrap();
                    sizeSoFar = 5 + keyE.size();
                    valueBuilder.reset(new BSONArrayBuilder( reduceArgs.subarrayStart( "tuples" ) ));
                }

                BSONElement ee = j.next();

                uassert( 13070 , "value too large to reduce" , ee.size() < ( BSONObjMaxUserSize / 2 ) );

                if ( sizeSoFar + ee.size() > BSONObjMaxUserSize ) {
                    assert( n > 1 ); // if not, inf. loop
                    break;
                }

                valueBuilder->append( ee );
                sizeSoFar += ee.size();
            }
            assert(valueBuilder);
            valueBuilder->done();
            BSONObj args = reduceArgs.obj();

            Scope * s = _func.scope();

            s->invokeSafe( _func.func() , args );

            if ( s->type( "return" ) == Array ) {
                uasserted( 10075 , "reduce -> multiple not supported yet");
                return;
            }

            endSizeEstimate = key.objsize() + ( args.objsize() / tuples.size() );

            if ( n == tuples.size() )
                return;

            // the input list was too large, add the rest of elmts to new tuples and reduce again
            // note: would be better to use loop instead of recursion to avoid stack overflow
            BSONList x;
            for ( ; n < tuples.size(); n++ ) {
                x.push_back( tuples[n] );
            }
            BSONObjBuilder temp( endSizeEstimate );
            temp.append( key.firstElement() );
            s->append( temp , "1" , "return" );
            x.push_back( temp.obj() );
            _reduce( x , key , endSizeEstimate );
        }
Esempio n. 18
0
    // ran at startup.
    static void repairDatabasesAndCheckVersion(bool shouldClearNonLocalTmpCollections) {
        //        LastError * le = lastError.get( true );
        LOG(1) << "enter repairDatabases (to check pdfile version #)" << endl;

        Lock::GlobalWrite lk;
        vector< string > dbNames;
        getDatabaseNames( dbNames );
        for ( vector< string >::iterator i = dbNames.begin(); i != dbNames.end(); ++i ) {
            string dbName = *i;
            LOG(1) << "\t" << dbName << endl;

            Client::Context ctx( dbName );
            DataFile *p = ctx.db()->getExtentManager().getFile( 0 );
            DataFileHeader *h = p->getHeader();

            if ( replSettings.usingReplSets() ) {
                // we only care about the _id index if we are in a replset
                checkForIdIndexes(ctx.db());
            }

            if (shouldClearNonLocalTmpCollections || dbName == "local")
                ctx.db()->clearTmpCollections();

            if (!h->isCurrentVersion() || mongodGlobalParams.repair) {

                if( h->version <= 0 ) {
                    uasserted(14026,
                      str::stream() << "db " << dbName << " appears corrupt pdfile version: " << h->version
                                    << " info: " << h->versionMinor << ' ' << h->fileLength);
                }

                if ( !h->isCurrentVersion() ) {
                    log() << "****" << endl;
                    log() << "****" << endl;
                    log() << "need to upgrade database " << dbName << " "
                          << "with pdfile version " << h->version << "." << h->versionMinor << ", "
                          << "new version: "
                          << PDFILE_VERSION << "." << PDFILE_VERSION_MINOR_22_AND_OLDER
                          << endl;
                }

                if (mongodGlobalParams.upgrade) {
                    // QUESTION: Repair even if file format is higher version than code?
                    doDBUpgrade( dbName, h );
                }
                else {
                    log() << "\t Not upgrading, exiting" << endl;
                    log() << "\t run --upgrade to upgrade dbs, then start again" << endl;
                    log() << "****" << endl;
                    dbexit( EXIT_NEED_UPGRADE );
                    mongodGlobalParams.upgrade = 1;
                    return;
                }
            }
            else {
                const string systemIndexes = cc().database()->name() + ".system.indexes";
                auto_ptr<Runner> runner(InternalPlanner::collectionScan(systemIndexes));
                BSONObj index;
                Runner::RunnerState state;
                while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&index, NULL))) {
                    const BSONObj key = index.getObjectField("key");
                    const string plugin = IndexNames::findPluginName(key);

                    if (h->versionMinor == PDFILE_VERSION_MINOR_22_AND_OLDER) {
                        if (IndexNames::existedBefore24(plugin))
                            continue;

                        log() << "Index " << index << " claims to be of type '" << plugin << "', "
                              << "which is either invalid or did not exist before v2.4. "
                              << "See the upgrade section: "
                              << "http://dochub.mongodb.org/core/upgrade-2.4"
                              << startupWarningsLog;
                    }

                    const Status keyStatus = validateKeyPattern(key);
                    if (!keyStatus.isOK()) {
                        log() << "Problem with index " << index << ": " << keyStatus.reason()
                              << " This index can still be used however it cannot be rebuilt."
                              << " For more info see"
                              << " http://dochub.mongodb.org/core/index-validation"
                              << startupWarningsLog;
                    }
                }

                if (Runner::RUNNER_EOF != state) {
                    warning() << "Internal error while reading collection " << systemIndexes;
                }

                Database::closeDatabase(dbName.c_str(), storageGlobalParams.dbpath);
            }
        }

        LOG(1) << "done repairDatabases" << endl;

        if (mongodGlobalParams.upgrade) {
            log() << "finished checking dbs" << endl;
            cc().shutdown();
            dbexit( EXIT_CLEAN );
        }
    }
Esempio n. 19
0
    /** Note: if the object shrinks a lot, we don't free up space, we leave extra at end of the record.
     */
    const DiskLoc DataFileMgr::updateRecord(
        const char *ns,
        Collection* collection,
        Record *toupdate, const DiskLoc& dl,
        const char *_buf, int _len, OpDebug& debug,  bool god) {

        dassert( toupdate == dl.rec() );

        BSONObj objOld = BSONObj::make(toupdate);
        BSONObj objNew(_buf);
        DEV verify( objNew.objsize() == _len );
        DEV verify( objNew.objdata() == _buf );

        if( !objNew.hasElement("_id") && objOld.hasElement("_id") ) {
            /* add back the old _id value if the update removes it.  Note this implementation is slow
               (copies entire object multiple times), but this shouldn't happen often, so going for simple
               code, not speed.
            */
            BSONObjBuilder b;
            BSONElement e;
            verify( objOld.getObjectID(e) );
            b.append(e); // put _id first, for best performance
            b.appendElements(objNew);
            objNew = b.obj();
        }

        NamespaceString nsstring(ns);
        if (nsstring.coll() == "system.users") {
            V2UserDocumentParser parser;
            uassertStatusOK(parser.checkValidUserDocument(objNew));
        }

        uassert( 13596 , str::stream() << "cannot change _id of a document old:" << objOld << " new:" << objNew,
                objNew["_id"] == objOld["_id"]);

        /* duplicate key check. we descend the btree twice - once for this check, and once for the actual inserts, further
           below.  that is suboptimal, but it's pretty complicated to do it the other way without rollbacks...
        */
        OwnedPointerVector<UpdateTicket> updateTickets;
        updateTickets.mutableVector().resize(collection->details()->getTotalIndexCount());
        for (int i = 0; i < collection->details()->getTotalIndexCount(); ++i) {
            auto_ptr<IndexDescriptor> descriptor(CatalogHack::getDescriptor(collection->details(), i));
            auto_ptr<IndexAccessMethod> iam(CatalogHack::getIndex(descriptor.get()));
            InsertDeleteOptions options;
            options.logIfError = false;
            options.dupsAllowed = !(KeyPattern::isIdKeyPattern(descriptor->keyPattern())
                                    || descriptor->unique())
                                  || ignoreUniqueIndex(descriptor->getOnDisk());
            updateTickets.mutableVector()[i] = new UpdateTicket();
            Status ret = iam->validateUpdate(objOld, objNew, dl, options,
                                             updateTickets.mutableVector()[i]);

            if (Status::OK() != ret) {
                uasserted(ASSERT_ID_DUPKEY, "Update validation failed: " + ret.toString());
            }
        }

        if ( toupdate->netLength() < objNew.objsize() ) {
            // doesn't fit.  reallocate -----------------------------------------------------
            moveCounter.increment();
            uassert( 10003,
                     "failing update: objects in a capped ns cannot grow",
                     !(collection && collection->details()->isCapped()));
            collection->details()->paddingTooSmall();
            deleteRecord(ns, toupdate, dl);
            DiskLoc res = insert(ns, objNew.objdata(), objNew.objsize(), false, god);

            if (debug.nmoved == -1) // default of -1 rather than 0
                debug.nmoved = 1;
            else
                debug.nmoved += 1;

            return res;
        }

        collection->infoCache()->notifyOfWriteOp();
        collection->details()->paddingFits();

        debug.keyUpdates = 0;

        for (int i = 0; i < collection->details()->getTotalIndexCount(); ++i) {
            auto_ptr<IndexDescriptor> descriptor(CatalogHack::getDescriptor(collection->details(), i));
            auto_ptr<IndexAccessMethod> iam(CatalogHack::getIndex(descriptor.get()));
            int64_t updatedKeys;
            Status ret = iam->update(*updateTickets.vector()[i], &updatedKeys);
            if (Status::OK() != ret) {
                // This shouldn't happen unless something disastrous occurred.
                massert(16799, "update failed: " + ret.toString(), false);
            }
            debug.keyUpdates += updatedKeys;
        }

        //  update in place
        int sz = objNew.objsize();
        memcpy(getDur().writingPtr(toupdate->data(), sz), objNew.objdata(), sz);
        return dl;
    }
Esempio n. 20
0
void Strategy::queryOp(OperationContext* txn, Request& request) {
    verify(!NamespaceString(request.getns()).isCommand());

    Timer queryTimer;

    globalOpCounters.gotQuery();

    QueryMessage q(request.d());

    NamespaceString ns(q.ns);
    ClientBasic* client = txn->getClient();
    AuthorizationSession* authSession = AuthorizationSession::get(client);
    Status status = authSession->checkAuthForFind(ns, false);
    audit::logQueryAuthzCheck(client, ns, q.query, status.code());
    uassertStatusOK(status);

    LOG(3) << "query: " << q.ns << " " << q.query << " ntoreturn: " << q.ntoreturn
           << " options: " << q.queryOptions;

    if (q.ntoreturn == 1 && strstr(q.ns, ".$cmd"))
        throw UserException(8010, "something is wrong, shouldn't see a command here");

    if (q.queryOptions & QueryOption_Exhaust) {
        uasserted(18526,
                  string("the 'exhaust' query option is invalid for mongos queries: ") + q.ns +
                      " " + q.query.toString());
    }

    // Spigot which controls whether OP_QUERY style find on mongos uses the new ClusterClientCursor
    // code path.
    // TODO: Delete the spigot and always use the new code.
    if (useClusterClientCursor) {
        // Determine the default read preference mode based on the value of the slaveOk flag.
        ReadPreference readPreferenceOption = (q.queryOptions & QueryOption_SlaveOk)
            ? ReadPreference::SecondaryPreferred
            : ReadPreference::PrimaryOnly;
        ReadPreferenceSetting readPreference(readPreferenceOption, TagSet());

        BSONElement rpElem;
        auto readPrefExtractStatus = bsonExtractTypedField(
            q.query, LiteParsedQuery::kWrappedReadPrefField, mongo::Object, &rpElem);

        if (readPrefExtractStatus.isOK()) {
            auto parsedRps = ReadPreferenceSetting::fromBSON(rpElem.Obj());
            uassertStatusOK(parsedRps.getStatus());
            readPreference = parsedRps.getValue();
        } else if (readPrefExtractStatus != ErrorCodes::NoSuchKey) {
            uassertStatusOK(readPrefExtractStatus);
        }

        auto canonicalQuery = CanonicalQuery::canonicalize(q, WhereCallbackNoop());
        uassertStatusOK(canonicalQuery.getStatus());

        // If the $explain flag was set, we must run the operation on the shards as an explain
        // command rather than a find command.
        if (canonicalQuery.getValue()->getParsed().isExplain()) {
            const LiteParsedQuery& lpq = canonicalQuery.getValue()->getParsed();
            BSONObj findCommand = lpq.asFindCommand();

            // We default to allPlansExecution verbosity.
            auto verbosity = ExplainCommon::EXEC_ALL_PLANS;

            const bool secondaryOk = (readPreference.pref != ReadPreference::PrimaryOnly);
            rpc::ServerSelectionMetadata metadata(secondaryOk, readPreference);

            BSONObjBuilder explainBuilder;
            uassertStatusOK(ClusterFind::runExplain(
                txn, findCommand, lpq, verbosity, metadata, &explainBuilder));

            BSONObj explainObj = explainBuilder.done();
            replyToQuery(0,  // query result flags
                         request.p(),
                         request.m(),
                         static_cast<const void*>(explainObj.objdata()),
                         explainObj.objsize(),
                         1,  // numResults
                         0,  // startingFrom
                         CursorId(0));
            return;
        }

        // Do the work to generate the first batch of results. This blocks waiting to get responses
        // from the shard(s).
        std::vector<BSONObj> batch;

        // 0 means the cursor is exhausted and
        // otherwise we assume that a cursor with the returned id can be retrieved via the
        // ClusterCursorManager
        auto cursorId =
            ClusterFind::runQuery(txn, *canonicalQuery.getValue(), readPreference, &batch);
        uassertStatusOK(cursorId.getStatus());

        // TODO: this constant should be shared between mongos and mongod, and should
        // not be inside ShardedClientCursor.
        BufBuilder buffer(ShardedClientCursor::INIT_REPLY_BUFFER_SIZE);

        // Fill out the response buffer.
        int numResults = 0;
        for (const auto& obj : batch) {
            buffer.appendBuf((void*)obj.objdata(), obj.objsize());
            numResults++;
        }

        replyToQuery(0,  // query result flags
                     request.p(),
                     request.m(),
                     buffer.buf(),
                     buffer.len(),
                     numResults,
                     0,  // startingFrom
                     cursorId.getValue());
        return;
    }

    QuerySpec qSpec((string)q.ns, q.query, q.fields, q.ntoskip, q.ntoreturn, q.queryOptions);

    // Parse "$maxTimeMS".
    StatusWith<int> maxTimeMS = LiteParsedQuery::parseMaxTimeMSQuery(q.query);
    uassert(17233, maxTimeMS.getStatus().reason(), maxTimeMS.isOK());

    if (_isSystemIndexes(q.ns) && doShardedIndexQuery(txn, request, qSpec)) {
        return;
    }

    ParallelSortClusteredCursor* cursor = new ParallelSortClusteredCursor(qSpec, CommandInfo());
    verify(cursor);

    // TODO:  Move out to Request itself, not strategy based
    try {
        cursor->init(txn);

        if (qSpec.isExplain()) {
            BSONObjBuilder explain_builder;
            cursor->explain(explain_builder);
            explain_builder.appendNumber("executionTimeMillis",
                                         static_cast<long long>(queryTimer.millis()));
            BSONObj b = explain_builder.obj();

            replyToQuery(0, request.p(), request.m(), b);
            delete (cursor);
            return;
        }
    } catch (...) {
        delete cursor;
        throw;
    }

    // TODO: Revisit all of this when we revisit the sharded cursor cache

    if (cursor->getNumQueryShards() != 1) {
        // More than one shard (or zero), manage with a ShardedClientCursor
        // NOTE: We may also have *zero* shards here when the returnPartial flag is set.
        // Currently the code in ShardedClientCursor handles this.

        ShardedClientCursorPtr cc(new ShardedClientCursor(q, cursor));

        BufBuilder buffer(ShardedClientCursor::INIT_REPLY_BUFFER_SIZE);
        int docCount = 0;
        const int startFrom = cc->getTotalSent();
        bool hasMore = cc->sendNextBatch(q.ntoreturn, buffer, docCount);

        if (hasMore) {
            LOG(5) << "storing cursor : " << cc->getId();

            int cursorLeftoverMillis = maxTimeMS.getValue() - queryTimer.millis();
            if (maxTimeMS.getValue() == 0) {  // 0 represents "no limit".
                cursorLeftoverMillis = kMaxTimeCursorNoTimeLimit;
            } else if (cursorLeftoverMillis <= 0) {
                cursorLeftoverMillis = kMaxTimeCursorTimeLimitExpired;
            }

            cursorCache.store(cc, cursorLeftoverMillis);
        }

        replyToQuery(0,
                     request.p(),
                     request.m(),
                     buffer.buf(),
                     buffer.len(),
                     docCount,
                     startFrom,
                     hasMore ? cc->getId() : 0);
    } else {
        // Only one shard is used

        // Remote cursors are stored remotely, we shouldn't need this around.
        unique_ptr<ParallelSortClusteredCursor> cursorDeleter(cursor);

        ShardPtr shard = grid.shardRegistry()->getShard(txn, cursor->getQueryShardId());
        verify(shard.get());
        DBClientCursorPtr shardCursor = cursor->getShardCursor(shard->getId());

        // Implicitly stores the cursor in the cache
        request.reply(*(shardCursor->getMessage()), shardCursor->originalHost());

        // We don't want to kill the cursor remotely if there's still data left
        shardCursor->decouple();
    }
}
Esempio n. 21
0
 void _quotaExceeded() {
     uasserted(12501, "quota exceeded");
 }
Esempio n. 22
0
void Strategy::clientCommandOp(OperationContext* txn, Request& request) {
    QueryMessage q(request.d());

    LOG(3) << "command: " << q.ns << " " << q.query << " ntoreturn: " << q.ntoreturn
           << " options: " << q.queryOptions;

    if (q.queryOptions & QueryOption_Exhaust) {
        uasserted(18527,
                  string("the 'exhaust' query option is invalid for mongos commands: ") + q.ns +
                      " " + q.query.toString());
    }

    NamespaceString nss(request.getns());
    // Regular queries are handled in strategy_shard.cpp
    verify(nss.isCommand() || nss.isSpecialCommand());

    if (handleSpecialNamespaces(txn, request, q))
        return;

    int loops = 5;
    bool cmChangeAttempted = false;

    while (true) {
        BSONObjBuilder builder;
        try {
            BSONObj cmdObj = q.query;
            {
                BSONElement e = cmdObj.firstElement();
                if (e.type() == Object &&
                    (e.fieldName()[0] == '$' ? str::equals("query", e.fieldName() + 1)
                                             : str::equals("query", e.fieldName()))) {
                    // Extract the embedded query object.

                    if (cmdObj.hasField(Query::ReadPrefField.name())) {
                        // The command has a read preference setting. We don't want
                        // to lose this information so we copy this to a new field
                        // called $queryOptions.$readPreference
                        BSONObjBuilder finalCmdObjBuilder;
                        finalCmdObjBuilder.appendElements(e.embeddedObject());

                        BSONObjBuilder queryOptionsBuilder(
                            finalCmdObjBuilder.subobjStart("$queryOptions"));
                        queryOptionsBuilder.append(cmdObj[Query::ReadPrefField.name()]);
                        queryOptionsBuilder.done();

                        cmdObj = finalCmdObjBuilder.obj();
                    } else {
                        cmdObj = e.embeddedObject();
                    }
                }
            }

            Command::runAgainstRegistered(txn, q.ns, cmdObj, builder, q.queryOptions);
            BSONObj x = builder.done();
            replyToQuery(0, request.p(), request.m(), x);
            return;
        } catch (const StaleConfigException& e) {
            if (loops <= 0)
                throw e;

            loops--;
            log() << "retrying command: " << q.query;

            // For legacy reasons, ns may not actually be set in the exception :-(
            string staleNS = e.getns();
            if (staleNS.size() == 0)
                staleNS = q.ns;

            ShardConnection::checkMyConnectionVersions(txn, staleNS);
            if (loops < 4)
                versionManager.forceRemoteCheckShardVersionCB(txn, staleNS);
        } catch (const DBException& e) {
            if (e.getCode() == ErrorCodes::IncompatibleCatalogManager) {
                fassert(28791, !cmChangeAttempted);
                cmChangeAttempted = true;

                grid.forwardingCatalogManager()->waitForCatalogManagerChange(txn);
            } else {
                Command::appendCommandStatus(builder, e.toStatus());
                BSONObj x = builder.done();
                replyToQuery(0, request.p(), request.m(), x);
                return;
            }
        }
    }
}
Esempio n. 23
0
bool MessagingPort::recv(Message& m) {
    try {
#ifdef MONGO_CONFIG_SSL
    again:
#endif
        // mmm( log() << "*  recv() sock:" << this->sock << endl; )
        MSGHEADER::Value header;
        int headerLen = sizeof(MSGHEADER::Value);
        psock->recv((char*)&header, headerLen);
        int len = header.constView().getMessageLength();

        if (len == 542393671) {
            // an http GET
            string msg =
                "It looks like you are trying to access MongoDB over HTTP on the native driver "
                "port.\n";
            LOG(psock->getLogLevel()) << msg;
            std::stringstream ss;
            ss << "HTTP/1.0 200 OK\r\nConnection: close\r\nContent-Type: "
                  "text/plain\r\nContent-Length: " << msg.size() << "\r\n\r\n" << msg;
            string s = ss.str();
            send(s.c_str(), s.size(), "http");
            return false;
        }
        // If responseTo is not 0 or -1 for first packet assume SSL
        else if (psock->isAwaitingHandshake()) {
#ifndef MONGO_CONFIG_SSL
            if (header.constView().getResponseTo() != 0 &&
                header.constView().getResponseTo() != -1) {
                uasserted(17133,
                          "SSL handshake requested, SSL feature not available in this build");
            }
#else
            if (header.constView().getResponseTo() != 0 &&
                header.constView().getResponseTo() != -1) {
                uassert(17132,
                        "SSL handshake received but server is started without SSL support",
                        sslGlobalParams.sslMode.load() != SSLParams::SSLMode_disabled);
                setX509SubjectName(
                    psock->doSSLHandshake(reinterpret_cast<const char*>(&header), sizeof(header)));
                psock->setHandshakeReceived();
                goto again;
            }
            uassert(17189,
                    "The server is configured to only allow SSL connections",
                    sslGlobalParams.sslMode.load() != SSLParams::SSLMode_requireSSL);
#endif  // MONGO_CONFIG_SSL
        }
        if (static_cast<size_t>(len) < sizeof(MSGHEADER::Value) ||
            static_cast<size_t>(len) > MaxMessageSizeBytes) {
            LOG(0) << "recv(): message len " << len << " is invalid. "
                   << "Min " << sizeof(MSGHEADER::Value) << " Max: " << MaxMessageSizeBytes;
            return false;
        }

        psock->setHandshakeReceived();
        int z = (len + 1023) & 0xfffffc00;
        verify(z >= len);
        MsgData::View md = reinterpret_cast<char*>(mongolMalloc(z));
        ScopeGuard guard = MakeGuard(free, md.view2ptr());
        verify(md.view2ptr());

        memcpy(md.view2ptr(), &header, headerLen);
        int left = len - headerLen;

        psock->recv(md.data(), left);

        guard.Dismiss();
        m.setData(md.view2ptr(), true);
        return true;

    } catch (const SocketException& e) {
        logger::LogSeverity severity = psock->getLogLevel();
        if (!e.shouldPrint())
            severity = severity.lessSevere();
        LOG(severity) << "SocketException: remote: " << remote() << " error: " << e;
        m.reset();
        return false;
    }
}
Esempio n. 24
0
void Strategy::getMore(OperationContext* txn, Request& request) {
    Timer getMoreTimer;

    const char* ns = request.getns();
    const int ntoreturn = request.d().pullInt();
    const long long id = request.d().pullInt64();

    // TODO:  Handle stale config exceptions here from coll being dropped or sharded during op
    // for now has same semantics as legacy request
    const NamespaceString nss(ns);
    auto statusGetDb = grid.catalogCache()->getDatabase(txn, nss.db().toString());
    if (statusGetDb == ErrorCodes::DatabaseNotFound) {
        cursorCache.remove(id);
        replyToQuery(ResultFlag_CursorNotFound, request.p(), request.m(), 0, 0, 0);
        return;
    }

    uassertStatusOK(statusGetDb);

    // Spigot which controls whether OP_QUERY style find on mongos uses the new ClusterClientCursor
    // code path.
    //
    // TODO: Delete the spigot and always use the new code.
    if (useClusterClientCursor) {
        boost::optional<long long> batchSize;
        if (ntoreturn) {
            batchSize = ntoreturn;
        }
        GetMoreRequest getMoreRequest(NamespaceString(ns), id, batchSize, boost::none);

        auto cursorResponse = ClusterFind::runGetMore(txn, getMoreRequest);
        if (cursorResponse == ErrorCodes::CursorNotFound) {
            replyToQuery(ResultFlag_CursorNotFound, request.p(), request.m(), 0, 0, 0);
            return;
        }
        uassertStatusOK(cursorResponse.getStatus());

        // Build the response document.
        //
        // TODO: this constant should be shared between mongos and mongod, and should not be inside
        // ShardedClientCursor.
        BufBuilder buffer(ShardedClientCursor::INIT_REPLY_BUFFER_SIZE);

        int numResults = 0;
        for (const auto& obj : cursorResponse.getValue().batch) {
            buffer.appendBuf((void*)obj.objdata(), obj.objsize());
            ++numResults;
        }

        replyToQuery(0,
                     request.p(),
                     request.m(),
                     buffer.buf(),
                     buffer.len(),
                     numResults,
                     cursorResponse.getValue().numReturnedSoFar.value_or(0),
                     cursorResponse.getValue().cursorId);
        return;
    }

    shared_ptr<DBConfig> config = statusGetDb.getValue();

    ShardPtr primary;
    ChunkManagerPtr info;
    config->getChunkManagerOrPrimary(txn, ns, info, primary);

    //
    // TODO: Cleanup cursor cache, consolidate into single codepath
    //
    const string host = cursorCache.getRef(id);
    ShardedClientCursorPtr cursor = cursorCache.get(id);
    int cursorMaxTimeMS = cursorCache.getMaxTimeMS(id);

    // Cursor ids should not overlap between sharded and unsharded cursors
    massert(17012,
            str::stream() << "duplicate sharded and unsharded cursor id " << id << " detected for "
                          << ns << ", duplicated on host " << host,
            NULL == cursorCache.get(id).get() || host.empty());

    ClientBasic* client = ClientBasic::getCurrent();
    NamespaceString nsString(ns);
    AuthorizationSession* authSession = AuthorizationSession::get(client);
    Status status = authSession->checkAuthForGetMore(nsString, id, false);
    audit::logGetMoreAuthzCheck(client, nsString, id, status.code());
    uassertStatusOK(status);

    if (!host.empty()) {
        LOG(3) << "single getmore: " << ns;

        // we used ScopedDbConnection because we don't get about config versions
        // not deleting data is handled elsewhere
        // and we don't want to call setShardVersion
        ScopedDbConnection conn(host);

        Message response;
        bool ok = conn->callRead(request.m(), response);
        uassert(10204, "dbgrid: getmore: error calling db", ok);

        bool hasMore = (response.singleData().getCursor() != 0);

        if (!hasMore) {
            cursorCache.removeRef(id);
        }

        request.reply(response, "" /*conn->getServerAddress() */);
        conn.done();
        return;
    } else if (cursor) {
        if (cursorMaxTimeMS == kMaxTimeCursorTimeLimitExpired) {
            cursorCache.remove(id);
            uasserted(ErrorCodes::ExceededTimeLimit, "operation exceeded time limit");
        }

        // TODO: Try to match logic of mongod, where on subsequent getMore() we pull lots more data?
        BufBuilder buffer(ShardedClientCursor::INIT_REPLY_BUFFER_SIZE);
        int docCount = 0;
        const int startFrom = cursor->getTotalSent();
        bool hasMore = cursor->sendNextBatch(ntoreturn, buffer, docCount);

        if (hasMore) {
            // still more data
            cursor->accessed();

            if (cursorMaxTimeMS != kMaxTimeCursorNoTimeLimit) {
                // Update remaining amount of time in cursor cache.
                int cursorLeftoverMillis = cursorMaxTimeMS - getMoreTimer.millis();
                if (cursorLeftoverMillis <= 0) {
                    cursorLeftoverMillis = kMaxTimeCursorTimeLimitExpired;
                }
                cursorCache.updateMaxTimeMS(id, cursorLeftoverMillis);
            }
        } else {
            // we've exhausted the cursor
            cursorCache.remove(id);
        }

        replyToQuery(0,
                     request.p(),
                     request.m(),
                     buffer.buf(),
                     buffer.len(),
                     docCount,
                     startFrom,
                     hasMore ? cursor->getId() : 0);
        return;
    } else {
        LOG(3) << "could not find cursor " << id << " in cache for " << ns;

        replyToQuery(ResultFlag_CursorNotFound, request.p(), request.m(), 0, 0, 0);
        return;
    }
}
Esempio n. 25
0
        void _update( Request& r , DbMessage& d, ChunkManagerPtr manager ){
            int flags = d.pullInt();
            
            BSONObj query = d.nextJsObj();
            uassert( 10201 ,  "invalid update" , d.moreJSObjs() );
            BSONObj toupdate = d.nextJsObj();

            BSONObj chunkFinder = query;
            
            bool upsert = flags & UpdateOption_Upsert;
            bool multi = flags & UpdateOption_Multi;

            uassert( 10202 ,  "can't mix multi and upsert and sharding" , ! ( upsert && multi ) );

            if ( upsert && !(manager->hasShardKey(toupdate) ||
                             (toupdate.firstElement().fieldName()[0] == '$' && manager->hasShardKey(query))))
            {
                throw UserException( 8012 , "can't upsert something without shard key" );
            }

            bool save = false;
            if ( ! manager->hasShardKey( query ) ){
                if ( multi ){
                }
                else if ( strcmp( query.firstElement().fieldName() , "_id" ) || query.nFields() != 1 ){
                    throw UserException( 8013 , "can't do non-multi update with query that doesn't have the shard key" );
                }
                else {
                    save = true;
                    chunkFinder = toupdate;
                }
            }

            
            if ( ! save ){
                if ( toupdate.firstElement().fieldName()[0] == '$' ){
                    BSONObjIterator ops(toupdate);
                    while(ops.more()){
                        BSONElement op(ops.next());
                        if (op.type() != Object)
                            continue;
                        BSONObjIterator fields(op.embeddedObject());
                        while(fields.more()){
                            const string field = fields.next().fieldName();
                            uassert(13123, "Can't modify shard key's value", ! manager->getShardKey().partOfShardKey(field));
                        }
                    }
                } else if ( manager->hasShardKey( toupdate ) ){
                    uassert( 8014, "change would move shards!", manager->getShardKey().compare( query , toupdate ) == 0 );
                } else {
                    uasserted(12376, "shard key must be in update object");
                }
            }
            
            if ( multi ){
                set<Shard> shards;
                manager->getShardsForQuery( shards , chunkFinder );
                int * x = (int*)(r.d().afterNS());
                x[0] |= UpdateOption_Broadcast;
                for ( set<Shard>::iterator i=shards.begin(); i!=shards.end(); i++){
                    doWrite( dbUpdate , r , *i , false );
                }
            }
            else {
                int left = 5;
                while ( true ){
                    try {
                        ChunkPtr c = manager->findChunk( chunkFinder );
                        doWrite( dbUpdate , r , c->getShard() );
                        c->splitIfShould( d.msg().header()->dataLen() );
                        break;
                    }
                    catch ( StaleConfigException& e ){
                        if ( left <= 0 )
                            throw e;
                        left--;
                        log() << "update failed b/c of StaleConfigException, retrying " 
                              << " left:" << left << " ns: " << r.getns() << " query: " << query << endl;
                        r.reset( false );
                        manager = r.getChunkManager();
                    }
                }
            }

        }
Esempio n. 26
0
    void acquirePathLock(bool doingRepair) {
        string name = ( boost::filesystem::path( dbpath ) / "mongod.lock" ).string();

        bool oldFile = false;

        if ( boost::filesystem::exists( name ) && boost::filesystem::file_size( name ) > 0 ) {
            oldFile = true;
        }

#ifdef _WIN32
        lockFileHandle = CreateFileA( name.c_str(), GENERIC_READ | GENERIC_WRITE,
            0 /* do not allow anyone else access */, NULL, 
            OPEN_ALWAYS /* success if fh can open */, 0, NULL );

        if (lockFileHandle == INVALID_HANDLE_VALUE) {
            DWORD code = GetLastError();
            char *msg;
            FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM,
                NULL, code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
                (LPSTR)&msg, 0, NULL);
            string m = msg;
            str::stripTrailing(m, "\r\n");
            uasserted( 13627 , str::stream() << "Unable to create/open lock file: " << name << ' ' << m << ". Is a mongod instance already running?" );
        }
        lockFile = _open_osfhandle((intptr_t)lockFileHandle, 0);
#else
        lockFile = open( name.c_str(), O_RDWR | O_CREAT , S_IRWXU | S_IRWXG | S_IRWXO );
        if( lockFile <= 0 ) {
            uasserted( 10309 , str::stream() << "Unable to create/open lock file: " << name << ' ' << errnoWithDescription() << " Is a mongod instance already running?" );
        }
        if (flock( lockFile, LOCK_EX | LOCK_NB ) != 0) {
            close ( lockFile );
            lockFile = 0;
            uassert( 10310 ,  "Unable to lock file: " + name + ". Is a mongod instance already running?",  0 );
        }
#endif

        if ( oldFile ) {
            // we check this here because we want to see if we can get the lock
            // if we can't, then its probably just another mongod running
            
            string errmsg;
            if (doingRepair && dur::haveJournalFiles()) {
                errmsg = "************** \n"
                         "You specified --repair but there are dirty journal files. Please\n"
                         "restart without --repair to allow the journal files to be replayed.\n"
                         "If you wish to repair all databases, please shutdown cleanly and\n"
                         "run with --repair again.\n"
                         "**************";
            }
            else if (cmdLine.dur) {
                if (!dur::haveJournalFiles(/*anyFiles=*/true)) {
                    // Passing anyFiles=true as we are trying to protect against starting in an
                    // unclean state with the journal directory unmounted. If there are any files,
                    // even prealloc files, then it means that it is mounted so we can continue.
                    // Previously there was an issue (SERVER-5056) where we would fail to start up
                    // if killed during prealloc.
                    
                    vector<string> dbnames;
                    getDatabaseNames( dbnames );
                    
                    if ( dbnames.size() == 0 ) {
                        // this means that mongod crashed
                        // between initial startup and when journaling was initialized
                        // it is safe to continue
                    }
                    else {
                        errmsg = str::stream()
                            << "************** \n"
                            << "old lock file: " << name << ".  probably means unclean shutdown,\n"
                            << "but there are no journal files to recover.\n"
                            << "this is likely human error or filesystem corruption.\n"
                            << "please make sure that your journal directory is mounted.\n"
                            << "found " << dbnames.size() << " dbs.\n"
                            << "see: http://dochub.mongodb.org/core/repair for more information\n"
                            << "*************";
                    }


                }
            }
            else {
                if (!dur::haveJournalFiles() && !doingRepair) {
                    errmsg = str::stream()
                             << "************** \n"
                             << "Unclean shutdown detected.\n"
                             << "Please visit http://dochub.mongodb.org/core/repair for recovery instructions.\n"
                             << "*************";
                }
            }

            if (!errmsg.empty()) {
                cout << errmsg << endl;
#ifdef _WIN32
                CloseHandle( lockFileHandle );
#else
                close ( lockFile );
#endif
                lockFile = 0;
                uassert( 12596 , "old lock file" , 0 );
            }
        }

        // Not related to lock file, but this is where we handle unclean shutdown
        if( !cmdLine.dur && dur::haveJournalFiles() ) {
            cout << "**************" << endl;
            cout << "Error: journal files are present in journal directory, yet starting without journaling enabled." << endl;
            cout << "It is recommended that you start with journaling enabled so that recovery may occur." << endl;
            cout << "**************" << endl;
            uasserted(13597, "can't start without --journal enabled when journal/ files are present");
        }

#ifdef _WIN32
        uassert( 13625, "Unable to truncate lock file", _chsize(lockFile, 0) == 0);
        writePid( lockFile );
        _commit( lockFile );
#else
        uassert( 13342, "Unable to truncate lock file", ftruncate(lockFile, 0) == 0);
        writePid( lockFile );
        fsync( lockFile );
        flushMyDirectory(name);
#endif
    }
Esempio n. 27
0
    /**
     * This is called by db/ops/query.cpp.  This is the entry point for answering a query.
     */
    std::string newRunQuery(CanonicalQuery* cq, CurOp& curop, Message &result) {
        QLOG() << "Running query on new system: " << cq->toString();

        // This is a read lock.
        Client::ReadContext ctx(cq->ns(), storageGlobalParams.dbpath);

        // Parse, canonicalize, plan, transcribe, and get a runner.
        Runner* rawRunner = NULL;

        // We use this a lot below.
        const LiteParsedQuery& pq = cq->getParsed();

        // Need to call cq->toString() now, since upon error getRunner doesn't guarantee
        // cq is in a consistent state.
        string cqStr = cq->toString();

        // We'll now try to get the query runner that will execute this query for us. There
        // are a few cases in which we know upfront which runner we should get and, therefore,
        // we shortcut the selection process here.
        //
        // (a) If the query is over a collection that doesn't exist, we get a special runner
        // that's is so (a runner) which doesn't return results, the EOFRunner.
        //
        // (b) if the query is a replication's initial sync one, we get a SingleSolutinRunner
        // that uses a specifically designed stage that skips extents faster (see details in
        // exec/oplogstart.h)
        //
        // Otherwise we go through the selection of which runner is most suited to the
        // query + run-time context at hand.
        Status status = Status::OK();
        if (ctx.ctx().db()->getCollection(cq->ns()) == NULL) {
            rawRunner = new EOFRunner(cq, cq->ns());
        }
        else if (pq.hasOption(QueryOption_OplogReplay)) {
            status = getOplogStartHack(cq, &rawRunner);
        }
        else {
            // Takes ownership of cq.
            size_t options = QueryPlannerParams::DEFAULT;
            if (shardingState.needCollectionMetadata(pq.ns())) {
                options |= QueryPlannerParams::INCLUDE_SHARD_FILTER;
            }
            status = getRunner(cq, &rawRunner, options);
        }

        if (!status.isOK()) {
            uasserted(17007, "Couldn't get runner for query because: " + status.reason() + " query is " + cqStr);
        }

        verify(NULL != rawRunner);
        auto_ptr<Runner> runner(rawRunner);

        // We freak out later if this changes before we're done with the query.
        const ChunkVersion shardingVersionAtStart = shardingState.getVersion(cq->ns());

        // Handle query option $maxTimeMS (not used with commands).
        curop.setMaxTimeMicros(static_cast<unsigned long long>(pq.getMaxTimeMS()) * 1000);
        killCurrentOp.checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

        // uassert if we are not on a primary, and not a secondary with SlaveOk query parameter set.
        replVerifyReadsOk(&pq);

        // If this exists, the collection is sharded.
        // If it doesn't exist, we can assume we're not sharded.
        // If we're sharded, we might encounter data that is not consistent with our sharding state.
        // We must ignore this data.
        CollectionMetadataPtr collMetadata;
        if (!shardingState.needCollectionMetadata(pq.ns())) {
            collMetadata = CollectionMetadataPtr();
        }
        else {
            collMetadata = shardingState.getCollectionMetadata(pq.ns());
        }

        // Run the query.
        // bb is used to hold query results
        // this buffer should contain either requested documents per query or
        // explain information, but not both
        BufBuilder bb(32768);
        bb.skip(sizeof(QueryResult));

        // How many results have we obtained from the runner?
        int numResults = 0;

        // If we're replaying the oplog, we save the last time that we read.
        OpTime slaveReadTill;

        // Do we save the Runner in a ClientCursor for getMore calls later?
        bool saveClientCursor = false;

        // We turn on auto-yielding for the runner here.  The runner registers itself with the
        // active runners list in ClientCursor.
        ClientCursor::registerRunner(runner.get());
        runner->setYieldPolicy(Runner::YIELD_AUTO);
        auto_ptr<DeregisterEvenIfUnderlyingCodeThrows> safety(
            new DeregisterEvenIfUnderlyingCodeThrows(runner.get()));

        BSONObj obj;
        Runner::RunnerState state;
        // uint64_t numMisplacedDocs = 0;

        // set this outside loop. we will need to use this both within loop and when deciding
        // to fill in explain information
        const bool isExplain = pq.isExplain();

        while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&obj, NULL))) {
            // Add result to output buffer. This is unnecessary if explain info is requested
            if (!isExplain) {
                bb.appendBuf((void*)obj.objdata(), obj.objsize());
            }

            // Count the result.
            ++numResults;

            // Possibly note slave's position in the oplog.
            if (pq.hasOption(QueryOption_OplogReplay)) {
                BSONElement e = obj["ts"];
                if (Date == e.type() || Timestamp == e.type()) {
                    slaveReadTill = e._opTime();
                }
            }

            // TODO: only one type of 2d search doesn't support this.  We need a way to pull it out
            // of CanonicalQuery. :(
            const bool supportsGetMore = true;
            if (isExplain) {
                if (enoughForExplain(pq, numResults)) {
                    break;
                }
            }
            else if (!supportsGetMore && (enough(pq, numResults)
                                          || bb.len() >= MaxBytesToReturnToClientAtOnce)) {
                break;
            }
            else if (enoughForFirstBatch(pq, numResults, bb.len())) {
                QLOG() << "Enough for first batch, wantMore=" << pq.wantMore()
                       << " numToReturn=" << pq.getNumToReturn()
                       << " numResults=" << numResults
                       << endl;
                // If only one result requested assume it's a findOne() and don't save the cursor.
                if (pq.wantMore() && 1 != pq.getNumToReturn()) {
                    QLOG() << " runner EOF=" << runner->isEOF() << endl;
                    saveClientCursor = !runner->isEOF();
                }
                break;
            }
        }

        // If we cache the runner later, we want to deregister it as it receives notifications
        // anyway by virtue of being cached.
        //
        // If we don't cache the runner later, we are deleting it, so it must be deregistered.
        //
        // So, no matter what, deregister the runner.
        safety.reset();

        // Caller expects exceptions thrown in certain cases:
        // * in-memory sort using too much RAM.
        if (Runner::RUNNER_ERROR == state) {
            uasserted(17144, "Runner error, memory limit for sort probably exceeded");
        }

        // Why save a dead runner?
        if (Runner::RUNNER_DEAD == state) {
            saveClientCursor = false;
        }
        else if (pq.hasOption(QueryOption_CursorTailable)) {
            // If we're tailing a capped collection, we don't bother saving the cursor if the
            // collection is empty. Otherwise, the semantics of the tailable cursor is that the
            // client will keep trying to read from it. So we'll keep it around.
            Collection* collection = ctx.ctx().db()->getCollection(cq->ns());
            if (collection && collection->numRecords() != 0 && pq.getNumToReturn() != 1) {
                saveClientCursor = true;
            }
        }

        // TODO(greg): This will go away soon.
        if (!shardingState.getVersion(pq.ns()).isWriteCompatibleWith(shardingVersionAtStart)) {
            // if the version changed during the query we might be missing some data and its safe to
            // send this as mongos can resend at this point
            throw SendStaleConfigException(pq.ns(), "version changed during initial query",
                                           shardingVersionAtStart,
                                           shardingState.getVersion(pq.ns()));
        }

        // Append explain information to query results by asking the runner to produce them.
        if (isExplain) {
            TypeExplain* bareExplain;
            Status res = runner->getExplainPlan(&bareExplain);

            if (!res.isOK()) {
                error() << "could not produce explain of query '" << pq.getFilter()
                        << "', error: " << res.reason();
                // If numResults and the data in bb don't correspond, we'll crash later when rooting
                // through the reply msg.
                BSONObj emptyObj;
                bb.appendBuf((void*)emptyObj.objdata(), emptyObj.objsize());
                // The explain output is actually a result.
                numResults = 1;
                // TODO: we can fill out millis etc. here just fine even if the plan screwed up.
            }
            else {
                boost::scoped_ptr<TypeExplain> explain(bareExplain);

                // Fill in the missing run-time fields in explain, starting with propeties of
                // the process running the query.
                std::string server = mongoutils::str::stream()
                    << getHostNameCached() << ":" << serverGlobalParams.port;
                explain->setServer(server);

                // We might have skipped some results due to chunk migration etc. so our count is
                // correct.
                explain->setN(numResults);

                // Clock the whole operation.
                explain->setMillis(curop.elapsedMillis());

                BSONObj explainObj = explain->toBSON();
                bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize());

                // The explain output is actually a result.
                numResults = 1;
            }
        }

        long long ccId = 0;
        if (saveClientCursor) {
            // We won't use the runner until it's getMore'd.
            runner->saveState();

            // Allocate a new ClientCursor.  We don't have to worry about leaking it as it's
            // inserted into a global map by its ctor.
            ClientCursor* cc = new ClientCursor(runner.get(), cq->getParsed().getOptions(),
                                                cq->getParsed().getFilter());
            ccId = cc->cursorid();

            QLOG() << "caching runner with cursorid " << ccId
                   << " after returning " << numResults << " results" << endl;

            // ClientCursor takes ownership of runner.  Release to make sure it's not deleted.
            runner.release();

            // TODO document
            if (pq.hasOption(QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
                cc->slaveReadTill(slaveReadTill);
            }

            // TODO document
            if (pq.hasOption(QueryOption_Exhaust)) {
                curop.debug().exhaust = true;
            }

            // Set attributes for getMore.
            cc->setCollMetadata(collMetadata);
            cc->setPos(numResults);

            // If the query had a time limit, remaining time is "rolled over" to the cursor (for
            // use by future getmore ops).
            cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros());
        }
        else {
            QLOG() << "not caching runner but returning " << numResults << " results\n";
        }

        // Add the results from the query into the output buffer.
        result.appendData(bb.buf(), bb.len());
        bb.decouple();

        // Fill out the output buffer's header.
        QueryResult* qr = static_cast<QueryResult*>(result.header());
        qr->cursorId = ccId;
        curop.debug().cursorid = (0 == ccId ? -1 : ccId);
        qr->setResultFlagsToOk();
        qr->setOperation(opReply);
        qr->startingFrom = 0;
        qr->nReturned = numResults;

        curop.debug().ntoskip = pq.getSkip();
        curop.debug().nreturned = numResults;

        // curop.debug().exhaust is set above.
        return curop.debug().exhaust ? pq.ns() : "";
    }
Esempio n. 28
0
    /**
     * Called by db/instance.cpp.  This is the getMore entry point.
     *
     * pass - when QueryOption_AwaitData is in use, the caller will make repeated calls 
     *        when this method returns an empty result, incrementing pass on each call.  
     *        Thus, pass == 0 indicates this is the first "attempt" before any 'awaiting'.
     */
    QueryResult::View getMore(OperationContext* txn,
                              const char* ns,
                              int ntoreturn,
                              long long cursorid,
                              CurOp& curop,
                              int pass,
                              bool& exhaust,
                              bool* isCursorAuthorized) {

        // For testing, we may want to fail if we receive a getmore.
        if (MONGO_FAIL_POINT(failReceivedGetmore)) {
            invariant(0);
        }

        exhaust = false;

        const NamespaceString nss(ns);

        // Depending on the type of cursor being operated on, we hold locks for the whole getMore,
        // or none of the getMore, or part of the getMore.  The three cases in detail:
        //
        // 1) Normal cursor: we lock with "ctx" and hold it for the whole getMore.
        // 2) Cursor owned by global cursor manager: we don't lock anything.  These cursors don't
        //    own any collection state.
        // 3) Agg cursor: we lock with "ctx", then release, then relock with "unpinDBLock" and
        //    "unpinCollLock".  This is because agg cursors handle locking internally (hence the
        //    release), but the pin and unpin of the cursor must occur under the collection lock.
        //    We don't use our AutoGetCollectionForRead "ctx" to relock, because
        //    AutoGetCollectionForRead checks the sharding version (and we want the relock for the
        //    unpin to succeed even if the sharding version has changed).
        //
        // Note that we declare our locks before our ClientCursorPin, in order to ensure that the
        // pin's destructor is called before the lock destructors (so that the unpin occurs under
        // the lock).
        boost::scoped_ptr<AutoGetCollectionForRead> ctx;
        boost::scoped_ptr<Lock::DBLock> unpinDBLock;
        boost::scoped_ptr<Lock::CollectionLock> unpinCollLock;

        CursorManager* cursorManager;
        CursorManager* globalCursorManager = CursorManager::getGlobalCursorManager();
        if (globalCursorManager->ownsCursorId(cursorid)) {
            cursorManager = globalCursorManager;
        }
        else {
            ctx.reset(new AutoGetCollectionForRead(txn, nss));
            Collection* collection = ctx->getCollection();
            uassert( 17356, "collection dropped between getMore calls", collection );
            cursorManager = collection->getCursorManager();
        }

        LOG(5) << "Running getMore, cursorid: " << cursorid << endl;

        // This checks to make sure the operation is allowed on a replicated node.  Since we are not
        // passing in a query object (necessary to check SlaveOK query option), the only state where
        // reads are allowed is PRIMARY (or master in master/slave).  This function uasserts if
        // reads are not okay.
        Status status = repl::getGlobalReplicationCoordinator()->checkCanServeReadsFor(
                txn,
                nss,
                true);
        uassertStatusOK(status);

        // A pin performs a CC lookup and if there is a CC, increments the CC's pin value so it
        // doesn't time out.  Also informs ClientCursor that there is somebody actively holding the
        // CC, so don't delete it.
        ClientCursorPin ccPin(cursorManager, cursorid);
        ClientCursor* cc = ccPin.c();

        // If we're not being called from DBDirectClient we want to associate the RecoveryUnit
        // used to create the execution machinery inside the cursor with our OperationContext.
        // If we throw or otherwise exit this method in a disorderly fashion, we must ensure
        // that further calls to getMore won't fail, and that the provided OperationContext
        // has a valid RecoveryUnit.  As such, we use RAII to accomplish this.
        //
        // This must be destroyed before the ClientCursor is destroyed.
        std::auto_ptr<ScopedRecoveryUnitSwapper> ruSwapper;

        // These are set in the QueryResult msg we return.
        int resultFlags = ResultFlag_AwaitCapable;

        int numResults = 0;
        int startingResult = 0;

        const int InitialBufSize =
            512 + sizeof(QueryResult::Value) + MaxBytesToReturnToClientAtOnce;

        BufBuilder bb(InitialBufSize);
        bb.skip(sizeof(QueryResult::Value));

        if (NULL == cc) {
            cursorid = 0;
            resultFlags = ResultFlag_CursorNotFound;
        }
        else {
            // Check for spoofing of the ns such that it does not match the one originally
            // there for the cursor.
            uassert(ErrorCodes::Unauthorized,
                    str::stream() << "Requested getMore on namespace " << ns << ", but cursor "
                                  << cursorid << " belongs to namespace " << cc->ns(),
                    ns == cc->ns());
            *isCursorAuthorized = true;

            // Restore the RecoveryUnit if we need to.
            if (txn->getClient()->isInDirectClient()) {
                if (cc->hasRecoveryUnit())
                    invariant(txn->recoveryUnit() == cc->getUnownedRecoveryUnit());
            }
            else {
                if (!cc->hasRecoveryUnit()) {
                    // Start using a new RecoveryUnit
                    cc->setOwnedRecoveryUnit(
                        getGlobalServiceContext()->getGlobalStorageEngine()->newRecoveryUnit());

                }
                // Swap RecoveryUnit(s) between the ClientCursor and OperationContext.
                ruSwapper.reset(new ScopedRecoveryUnitSwapper(cc, txn));
            }

            // Reset timeout timer on the cursor since the cursor is still in use.
            cc->setIdleTime(0);

            // If the operation that spawned this cursor had a time limit set, apply leftover
            // time to this getmore.
            curop.setMaxTimeMicros(cc->getLeftoverMaxTimeMicros());
            txn->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

            if (0 == pass) { 
                cc->updateSlaveLocation(txn); 
            }

            if (cc->isAggCursor()) {
                // Agg cursors handle their own locking internally.
                ctx.reset(); // unlocks
            }

            // If we're replaying the oplog, we save the last time that we read.
            Timestamp slaveReadTill;

            // What number result are we starting at?  Used to fill out the reply.
            startingResult = cc->pos();

            // What gives us results.
            PlanExecutor* exec = cc->getExecutor();
            const int queryOptions = cc->queryOptions();

            // Get results out of the executor.
            exec->restoreState(txn);

            BSONObj obj;
            PlanExecutor::ExecState state;
            while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) {
                // Add result to output buffer.
                bb.appendBuf((void*)obj.objdata(), obj.objsize());

                // Count the result.
                ++numResults;

                // Possibly note slave's position in the oplog.
                if (queryOptions & QueryOption_OplogReplay) {
                    BSONElement e = obj["ts"];
                    if (Date == e.type() || bsonTimestamp == e.type()) {
                        slaveReadTill = e.timestamp();
                    }
                }

                if (enoughForGetMore(ntoreturn, numResults, bb.len())) {
                    break;
                }
            }

            if (PlanExecutor::DEAD == state || PlanExecutor::FAILURE == state) {
                // Propagate this error to caller.
                if (PlanExecutor::FAILURE == state) {
                    scoped_ptr<PlanStageStats> stats(exec->getStats());
                    error() << "Plan executor error, stats: "
                            << Explain::statsToBSON(*stats);
                    uasserted(17406, "getMore executor error: " +
                              WorkingSetCommon::toStatusString(obj));
                }

                // In the old system tailable capped cursors would be killed off at the
                // cursorid level.  If a tailable capped cursor is nuked the cursorid
                // would vanish.
                //
                // In the new system they die and are cleaned up later (or time out).
                // So this is where we get to remove the cursorid.
                if (0 == numResults) {
                    resultFlags = ResultFlag_CursorNotFound;
                }
            }

            const bool shouldSaveCursor =
                    shouldSaveCursorGetMore(state, exec, isCursorTailable(cc));

            // In order to deregister a cursor, we need to be holding the DB + collection lock and
            // if the cursor is aggregation, we release these locks.
            if (cc->isAggCursor()) {
                invariant(NULL == ctx.get());
                unpinDBLock.reset(new Lock::DBLock(txn->lockState(), nss.db(), MODE_IS));
                unpinCollLock.reset(new Lock::CollectionLock(txn->lockState(), nss.ns(), MODE_IS));
            }

            // Our two possible ClientCursorPin cleanup paths are:
            // 1) If the cursor is not going to be saved, we call deleteUnderlying() on the pin.
            // 2) If the cursor is going to be saved, we simply let the pin go out of scope.  In
            //    this case, the pin's destructor will be invoked, which will call release() on the
            //    pin.  Because our ClientCursorPin is declared after our lock is declared, this
            //    will happen under the lock.
            if (!shouldSaveCursor) {
                ruSwapper.reset();
                ccPin.deleteUnderlying();

                // cc is now invalid, as is the executor
                cursorid = 0;
                cc = NULL;
                curop.debug().cursorExhausted = true;

                LOG(5) << "getMore NOT saving client cursor, ended with state "
                       << PlanExecutor::statestr(state)
                       << endl;
            }
            else {
                // Continue caching the ClientCursor.
                cc->incPos(numResults);
                exec->saveState();
                LOG(5) << "getMore saving client cursor ended with state "
                       << PlanExecutor::statestr(state)
                       << endl;

                if (PlanExecutor::IS_EOF == state && (queryOptions & QueryOption_CursorTailable)) {
                    if (!txn->getClient()->isInDirectClient()) {
                        // Don't stash the RU. Get a new one on the next getMore.
                        ruSwapper->dismiss();
                    }

                    if ((queryOptions & QueryOption_AwaitData)
                            && (numResults == 0)
                            && (pass < 1000)) {
                        // Bubble up to the AwaitData handling code in receivedGetMore which will
                        // try again.
                        return NULL;
                    }
                }

                // Possibly note slave's position in the oplog.
                if ((queryOptions & QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
                    cc->slaveReadTill(slaveReadTill);
                }

                exhaust = (queryOptions & QueryOption_Exhaust);

                // If the getmore had a time limit, remaining time is "rolled over" back to the
                // cursor (for use by future getmore ops).
                cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() );
            }
        }

        QueryResult::View qr = bb.buf();
        qr.msgdata().setLen(bb.len());
        qr.msgdata().setOperation(opReply);
        qr.setResultFlags(resultFlags);
        qr.setCursorId(cursorid);
        qr.setStartingFrom(startingResult);
        qr.setNReturned(numResults);
        bb.decouple();
        LOG(5) << "getMore returned " << numResults << " results\n";
        return qr;
    }
Esempio n. 29
0
    /* called on a reconfig AND on initiate
       throws
       @param initial true when initiating
    */
    void checkMembersUpForConfigChange(const ReplSetConfig& cfg, bool initial) {
        int failures = 0, majority = 0;
        int me = 0;
        stringstream selfs;
        for( vector<ReplSetConfig::MemberCfg>::const_iterator i = cfg.members.begin(); i != cfg.members.end(); i++ ) {
            if( i->h.isSelf() ) {
                me++;
                if( me > 1 )
                    selfs << ',';
                selfs << i->h.toString();
                if( !i->potentiallyHot() ) {
                    uasserted(13420, "initiation and reconfiguration of a replica set must be sent to a node that can become primary");
                }
                majority += i->votes;
            }
        }
        majority = (majority / 2) + 1;
        
        uassert(13278, "bad config: isSelf is true for multiple hosts: " + selfs.str(), me <= 1); // dups?
        if( me != 1 ) {
            stringstream ss;
            ss << "can't find self in the replset config";
            if( !cmdLine.isDefaultPort() ) ss << " my port: " << cmdLine.port;
            if( me != 0 ) ss << " found: " << me;
            uasserted(13279, ss.str());
        }

        for( vector<ReplSetConfig::MemberCfg>::const_iterator i = cfg.members.begin(); i != cfg.members.end(); i++ ) {
            // we know we're up
            if (i->h.isSelf()) {
                continue;
            }

            BSONObj res;
            {
                bool ok = false;
                try {
                    int theirVersion = -1000;
                    ok = requestHeartbeat(cfg._id, "", i->h.toString(), res, -1, theirVersion, initial/*check if empty*/);
                    if( theirVersion >= cfg.version ) {
                        stringstream ss;
                        ss << "replSet member " << i->h.toString() << " has too new a config version (" << theirVersion << ") to reconfigure";
                        uasserted(13259, ss.str());
                    }
                }
                catch(DBException& e) {
                    log() << "replSet cmufcc requestHeartbeat " << i->h.toString() << " : " << e.toString() << rsLog;
                }
                catch(...) {
                    log() << "replSet cmufcc error exception in requestHeartbeat?" << rsLog;
                }
                if( res.getBoolField("mismatch") )
                    uasserted(13145, "set name does not match the set name host " + i->h.toString() + " expects");
                if( *res.getStringField("set") ) {
                    if( cfg.version <= 1 ) {
                        // this was to be initiation, no one shoudl be initiated already.
                        uasserted(13256, "member " + i->h.toString() + " is already initiated");
                    }
                    else {
                        // Assure no one has a newer config.
                        if( res["v"].Int() >= cfg.version ) {
                            uasserted(13341, "member " + i->h.toString() + " has a config version >= to the new cfg version; cannot change config");
                        }
                    }
                }
                if( !ok && !res["rs"].trueValue() ) {
                    if( !res.isEmpty() ) {
                        /* strange.  got a response, but not "ok". log it. */
                        log() << "replSet warning " << i->h.toString() << " replied: " << res.toString() << rsLog;
                    }

                    bool allowFailure = false;
                    failures += i->votes;
                    if( res.isEmpty() && !initial && failures >= majority ) {
                        const Member* m = theReplSet->findById( i->_id );
                        if( m ) {
                            assert( m->h().toString() == i->h.toString() );
                        }
                        // it's okay if the down member isn't part of the config,
                        // we might be adding a new member that isn't up yet
                        allowFailure = true;
                    }

                    if( !allowFailure ) {
                        string msg = string("need all members up to initiate, not ok : ") + i->h.toString();
                        if( !initial )
                            msg = string("need most members up to reconfigure, not ok : ") + i->h.toString();
                        uasserted(13144, msg);
                    }
                }
            }
            if( initial ) {
                bool hasData = res["hasData"].Bool();
                uassert(13311, "member " + i->h.toString() + " has data already, cannot initiate set.  All members except initiator must be empty.",
                        !hasData || i->h.isSelf());
            }
        }
    }
Esempio n. 30
0
 NOINLINE_DECL void DataFile::badOfs(int ofs) const {
     stringstream ss;
     ss << "bad offset:" << ofs << " accessing file: " << mmf.filename() << " - consider repairing database";
     uasserted(13440, ss.str());
 }