Status CmdAuthenticate::_authenticateCR(OperationContext* opCtx,
                                        const UserName& user,
                                        const BSONObj& cmdObj) {
    if (user == internalSecurity.user->getName() &&
        serverGlobalParams.clusterAuthMode.load() == ServerGlobalParams::ClusterAuthMode_x509) {
        return Status(ErrorCodes::AuthenticationFailed,
                      "Mechanism x509 is required for internal cluster authentication");
    }

    if (_isCRAuthDisabled) {
        // SERVER-8461, MONGODB-CR must be enabled for authenticating the internal user, so that
        // cluster members may communicate with each other.
        if (user != internalSecurity.user->getName()) {
            return Status(ErrorCodes::BadValue, _nonceAuthenticationDisabledMessage);
        }
    }

    string key = cmdObj.getStringField("key");
    string received_nonce = cmdObj.getStringField("nonce");

    if (user.getUser().empty() || key.empty() || received_nonce.empty()) {
        sleepmillis(10);
        return Status(ErrorCodes::ProtocolError,
                      "field missing/wrong type in received authenticate command");
    }

    stringstream digestBuilder;

    {
        Client* client = Client::getCurrent();
        std::unique_ptr<AuthenticationSession> session;
        AuthenticationSession::swap(client, session);
        if (!session || session->getType() != AuthenticationSession::SESSION_TYPE_MONGO) {
            sleepmillis(30);
            return Status(ErrorCodes::ProtocolError, "No pending nonce");
        } else {
            nonce64 nonce = static_cast<MongoAuthenticationSession*>(session.get())->getNonce();
            digestBuilder << hex << nonce;
            if (digestBuilder.str() != received_nonce) {
                sleepmillis(30);
                return Status(ErrorCodes::AuthenticationFailed, "Received wrong nonce.");
            }
        }
    }

    User* userObj;
    Status status =
        getGlobalAuthorizationManager()->acquireUserForInitialAuth(opCtx, user, &userObj);
    if (!status.isOK()) {
        // Failure to find the privilege document indicates no-such-user, a fact that we do not
        // wish to reveal to the client.  So, we return AuthenticationFailed rather than passing
        // through the returned status.
        return Status(ErrorCodes::AuthenticationFailed, status.toString());
    }
    string pwd = userObj->getCredentials().password;
    getGlobalAuthorizationManager()->releaseUser(userObj);

    if (pwd.empty()) {
        return Status(ErrorCodes::AuthenticationFailed,
                      "MONGODB-CR credentials missing in the user document");
    }

    md5digest d;
    {
        digestBuilder << user.getUser() << pwd;
        string done = digestBuilder.str();

        md5_state_t st;
        md5_init(&st);
        md5_append(&st, (const md5_byte_t*)done.c_str(), done.size());
        md5_finish(&st, d);
    }

    string computed = digestToString(d);

    if (key != computed) {
        return Status(ErrorCodes::AuthenticationFailed, "key mismatch");
    }

    AuthorizationSession* authorizationSession = AuthorizationSession::get(Client::getCurrent());
    status = authorizationSession->addAndAuthorizeUser(opCtx, user);
    if (!status.isOK()) {
        return status;
    }

    return Status::OK();
}
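For reference, a minimal sketch of the client-side key derivation that the check above verifies, assuming the standard MONGODB-CR scheme: the stored credential pwd is the hex md5 digest of "user:mongo:password", and the key is the hex md5 of nonce + user + pwd (digestToString() is the hex encoder already used by the server code above).

// Hedged sketch (not the server's code): how a MONGODB-CR client would derive
// the "key" field that _authenticateCR() recomputes and compares.
std::string computeMongoCRKey(const std::string& nonceHex,
                              const std::string& user,
                              const std::string& pwdDigest) {
    const std::string payload = nonceHex + user + pwdDigest;
    md5digest d;
    md5_state_t st;
    md5_init(&st);
    md5_append(&st, reinterpret_cast<const md5_byte_t*>(payload.c_str()), payload.size());
    md5_finish(&st, d);
    return digestToString(d);  // hex-encode the 16-byte digest
}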
Example #2
Status ShardingCatalogClientImpl::applyChunkOpsDeprecated(OperationContext* opCtx,

                                                          const BSONArray& updateOps,
                                                          const BSONArray& preCondition,
                                                          const NamespaceString& nss,
                                                          const ChunkVersion& lastChunkVersion,
                                                          const WriteConcernOptions& writeConcern,
                                                          repl::ReadConcernLevel readConcern) {
    invariant(serverGlobalParams.clusterRole == ClusterRole::ConfigServer ||
              (readConcern == repl::ReadConcernLevel::kMajorityReadConcern &&
               writeConcern.wMode == WriteConcernOptions::kMajority));
    BSONObj cmd = BSON("applyOps" << updateOps << "preCondition" << preCondition
                                  << WriteConcernOptions::kWriteConcernField
                                  << writeConcern.toBSON());

    auto response =
        Grid::get(opCtx)->shardRegistry()->getConfigShard()->runCommandWithFixedRetryAttempts(
            opCtx,
            ReadPreferenceSetting{ReadPreference::PrimaryOnly},
            "config",
            cmd,
            Shard::RetryPolicy::kIdempotent);

    if (!response.isOK()) {
        return response.getStatus();
    }

    Status status = response.getValue().commandStatus.isOK()
        ? std::move(response.getValue().writeConcernStatus)
        : std::move(response.getValue().commandStatus);

    // TODO (Dianna) This fail point needs to be reexamined when CommitChunkMigration is in:
    // migrations will no longer be able to exercise it, so split or merge will need to do so.
    // SERVER-22659.
    if (MONGO_FAIL_POINT(failApplyChunkOps)) {
        status = Status(ErrorCodes::InternalError, "Failpoint 'failApplyChunkOps' generated error");
    }

    if (!status.isOK()) {
        string errMsg;

        // This could be a blip in the network connectivity. Check if the commit request made it.
        //
        // If all the updates were successfully written to the chunks collection, the last
        // document in the list of updates should be returned from a query to the chunks
        // collection. The last chunk can be identified by namespace and version number.

        warning() << "chunk operation commit failed and metadata will be revalidated"
                  << causedBy(redact(status));

        // Look for the chunk in this shard whose version got bumped. We assume that if that
        // mod made it to the config server, then the transaction was successful.
        BSONObjBuilder query;
        lastChunkVersion.appendLegacyWithField(&query, ChunkType::lastmod());
        query.append(ChunkType::ns(), nss.ns());
        auto chunkWithStatus = getChunks(opCtx, query.obj(), BSONObj(), 1, nullptr, readConcern);

        if (!chunkWithStatus.isOK()) {
            errMsg = str::stream()
                << "getChunks function failed, unable to validate chunk "
                << "operation metadata: " << chunkWithStatus.getStatus().toString()
                << ". applyChunkOpsDeprecated failed to get confirmation "
                << "of commit. Unable to save chunk ops. Command: " << cmd
                << ". Result: " << response.getValue().response;
            return status.withContext(errMsg);
        }

        const auto& newestChunk = chunkWithStatus.getValue();

        if (newestChunk.empty()) {
            errMsg = str::stream() << "chunk operation commit failed: version "
                                   << lastChunkVersion.toString()
                                   << " doesn't exist in namespace: " << nss.ns()
                                   << ". Unable to save chunk ops. Command: " << cmd
                                   << ". Result: " << response.getValue().response;
            return status.withContext(errMsg);
        }

        invariant(newestChunk.size() == 1);
        return Status::OK();
    }

    return Status::OK();
}
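For illustration, a hedged sketch of how a caller might assemble the two arrays this function forwards to applyOps; the {ns, q, res} shape of each precondition follows applyOps' convention, and every document-valued variable below is a hypothetical placeholder.

// Hedged sketch of inputs for applyChunkOpsDeprecated(); "config.chunks" and
// the placeholder documents are illustrative, not taken from this function.
BSONArrayBuilder updateOps;
updateOps.append(BSON("op" << "u" << "b" << false << "ns" << "config.chunks"
                           << "o2" << chunkIdQuery              // hypothetical _id query
                           << "o" << newChunkDoc));             // hypothetical updated chunk doc
BSONArrayBuilder preCondition;
preCondition.append(BSON("ns" << "config.chunks"
                              << "q" << currentChunkQuery       // hypothetical
                              << "res" << expectedChunkState)); // hypothetical

Before applying the ops, the config server runs each precondition's q against ns and requires the result to match res; if any precondition fails, the whole batch is rejected.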
Example #3
    // ran at startup.
    static void repairDatabasesAndCheckVersion(bool shouldClearNonLocalTmpCollections) {
        LOG(1) << "enter repairDatabases (to check pdfile version #)" << endl;

        OperationContextImpl txn;
        Lock::GlobalWrite lk(txn.lockState());
        WriteUnitOfWork wunit(txn.recoveryUnit());

        vector< string > dbNames;
        globalStorageEngine->listDatabases( &dbNames );

        for ( vector< string >::iterator i = dbNames.begin(); i != dbNames.end(); ++i ) {
            string dbName = *i;
            LOG(1) << "\t" << dbName << endl;

            Client::Context ctx(&txn,  dbName );

            if (repl::replSettings.usingReplSets()) {
                // we only care about the _id index if we are in a replset
                checkForIdIndexes(&txn, ctx.db());
            }

            if (shouldClearNonLocalTmpCollections || dbName == "local")
                ctx.db()->clearTmpCollections(&txn);

            if ( mongodGlobalParams.repair ) {
                fassert(18506, globalStorageEngine->repairDatabase(&txn, dbName));
            }
            else if (!ctx.db()->getDatabaseCatalogEntry()->currentFilesCompatible(&txn)) {
                log() << "****";
                log() << "cannot do this upgrade without an upgrade in the middle";
                log() << "please do a --repair with 2.6 and then start this version";
                dbexit( EXIT_NEED_UPGRADE );
                invariant( false );
                return;
            }
            else {
                // major versions match, check indexes

                const string systemIndexes = ctx.db()->name() + ".system.indexes";
                Collection* coll = ctx.db()->getCollection( &txn, systemIndexes );
                auto_ptr<Runner> runner(InternalPlanner::collectionScan(&txn, systemIndexes,coll));
                BSONObj index;
                Runner::RunnerState state;
                while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&index, NULL))) {
                    const BSONObj key = index.getObjectField("key");
                    const string plugin = IndexNames::findPluginName(key);

                    if (ctx.db()->getDatabaseCatalogEntry()->isOlderThan24(&txn)) {
                        if (IndexNames::existedBefore24(plugin))
                            continue;

                        log() << "Index " << index << " claims to be of type '" << plugin << "', "
                              << "which is either invalid or did not exist before v2.4. "
                              << "See the upgrade section: "
                              << "http://dochub.mongodb.org/core/upgrade-2.4"
                              << startupWarningsLog;
                    }

                    const Status keyStatus = validateKeyPattern(key);
                    if (!keyStatus.isOK()) {
                        log() << "Problem with index " << index << ": " << keyStatus.reason()
                              << " This index can still be used however it cannot be rebuilt."
                              << " For more info see"
                              << " http://dochub.mongodb.org/core/index-validation"
                              << startupWarningsLog;
                    }
                }

                if (Runner::RUNNER_EOF != state) {
                    warning() << "Internal error while reading collection " << systemIndexes;
                }

                Database::closeDatabase(&txn, dbName.c_str());
            }
        }
        wunit.commit();

        LOG(1) << "done repairDatabases" << endl;
    }
Example #4
StatusWith<ForwardingCatalogManager::ScopedDistLock*>
ChunkMoveOperationState::acquireMoveMetadata() {
    // Get the distributed lock
    const string whyMessage(stream() << "migrating chunk [" << _minKey << ", " << _maxKey << ") in "
                                     << _nss.ns());
    _distLockStatus = grid.forwardingCatalogManager()->distLock(_txn, _nss.ns(), whyMessage);

    if (!_distLockStatus->isOK()) {
        const string msg = stream() << "could not acquire collection lock for " << _nss.ns()
                                    << " to migrate chunk [" << _minKey << "," << _maxKey << ")"
                                    << causedBy(_distLockStatus->getStatus());
        warning() << msg;
        return Status(_distLockStatus->getStatus().code(), msg);
    }

    ShardingState* const shardingState = ShardingState::get(_txn);

    // Snapshot the metadata
    Status refreshStatus = shardingState->refreshMetadataNow(_txn, _nss.ns(), &_shardVersion);
    if (!refreshStatus.isOK()) {
        const string msg = stream() << "moveChunk cannot start migrate of chunk "
                                    << "[" << _minKey << "," << _maxKey << ")"
                                    << causedBy(refreshStatus.reason());
        warning() << msg;
        return Status(refreshStatus.code(), msg);
    }

    if (_shardVersion.majorVersion() == 0) {
        // It makes no sense to migrate if our version is zero and we have no chunks
        const string msg = stream() << "moveChunk cannot start migrate of chunk "
                                    << "[" << _minKey << "," << _maxKey << ")"
                                    << " with zero shard version";
        warning() << msg;
        return Status(ErrorCodes::IncompatibleShardingMetadata, msg);
    }

    {
        // Mongos >= v3.2 sends the full version, v3.0 only sends the epoch.
        // TODO(SERVER-20742): Stop parsing epoch separately after 3.2.
        auto& operationVersion = OperationShardVersion::get(_txn);
        if (operationVersion.hasShardVersion()) {
            _collectionVersion = operationVersion.getShardVersion(_nss);
            _collectionEpoch = _collectionVersion.epoch();
        }  // else the epoch will already be set from the parsing of the ChunkMoveOperationState

        if (_collectionEpoch != _shardVersion.epoch()) {
            const string msg = stream() << "moveChunk cannot move chunk "
                                        << "[" << _minKey << "," << _maxKey << "), "
                                        << "collection may have been dropped. "
                                        << "current epoch: " << _shardVersion.epoch()
                                        << ", cmd epoch: " << _collectionEpoch;
            warning() << msg;
            throw SendStaleConfigException(_nss.toString(), msg, _collectionVersion, _shardVersion);
        }
    }

    _collMetadata = shardingState->getCollectionMetadata(_nss.ns());

    // With nonzero shard version, we must have a coll version >= our shard version
    invariant(_collMetadata->getCollVersion() >= _shardVersion);

    // With nonzero shard version, we must have a shard key
    invariant(!_collMetadata->getKeyPattern().isEmpty());

    ChunkType origChunk;
    if (!_collMetadata->getNextChunk(_minKey, &origChunk) ||
        origChunk.getMin().woCompare(_minKey) || origChunk.getMax().woCompare(_maxKey)) {
        // Our boundaries are different from those passed in
        const string msg = stream() << "moveChunk cannot find chunk "
                                    << "[" << _minKey << "," << _maxKey << ")"
                                    << " to migrate, the chunk boundaries may be stale";
        warning() << msg;
        throw SendStaleConfigException(_nss.toString(), msg, _collectionVersion, _shardVersion);
    }

    return &_distLockStatus->getValue();
}
Example #5
StatusWith<bool> SaslPLAINServerConversation::step(const StringData& inputData,
                                                   std::string* outputData) {
    // Expecting user input on the form: user\0user\0pwd
    std::string input = inputData.toString();
    std::string pwd = "";

    try {
        _user = input.substr(0, inputData.find('\0'));
        pwd = input.substr(inputData.find('\0', _user.size()+1)+1);
    }
    catch (const std::out_of_range&) {
        return StatusWith<bool>(ErrorCodes::AuthenticationFailed,
                                mongoutils::str::stream() << "Incorrectly formatted PLAIN client message");
    }

    User* userObj;
    // The authentication database is also the source database for the user.
    Status status = _saslAuthSession->getAuthorizationSession()->getAuthorizationManager().
                    acquireUser(_saslAuthSession->getOpCtxt(),
                                UserName(_user, _saslAuthSession->getAuthenticationDatabase()),
                                &userObj);

    if (!status.isOK()) {
        return StatusWith<bool>(status);
    }

    const User::CredentialData creds = userObj->getCredentials();
    _saslAuthSession->getAuthorizationSession()->getAuthorizationManager().releaseUser(userObj);

    std::string authDigest = createPasswordDigest(_user, pwd);

    if (!creds.password.empty()) {
        // Handle schemaVersion26Final (MONGODB-CR/SCRAM mixed mode)
        if (authDigest != creds.password) {
            return StatusWith<bool>(ErrorCodes::AuthenticationFailed,
                                    mongoutils::str::stream() << "Incorrect user name or password");
        }
    }
    else {
        // Handle schemaVersion28SCRAM (SCRAM only mode)
        unsigned char storedKey[scram::hashSize];
        unsigned char serverKey[scram::hashSize];

        scram::generateSecrets(authDigest,
                               reinterpret_cast<const unsigned char*>(base64::decode(creds.scram.salt).c_str()),
                               16,
                               creds.scram.iterationCount,
                               storedKey,
                               serverKey);
        if (creds.scram.storedKey != base64::encode(reinterpret_cast<const char*>(storedKey),
                scram::hashSize)) {
            return StatusWith<bool>(ErrorCodes::AuthenticationFailed,
                                    mongoutils::str::stream() << "Incorrect user name or password");
        }
    }

    *outputData = "";

    return StatusWith<bool>(true);
}
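The payload parsed above is a standard SASL PLAIN message; a minimal client-side sketch follows (this implementation expects the authorization identity and the authentication identity to be the same user).

// Hedged sketch: building the PLAIN message that step() parses as user\0user\0pwd.
std::string makePlainMessage(const std::string& user, const std::string& pwd) {
    std::string msg;
    msg.append(user);    // authorization identity (authzid)
    msg.push_back('\0');
    msg.append(user);    // authentication identity (authcid)
    msg.push_back('\0');
    msg.append(pwd);     // password, sent in the clear -- rely on TLS
    return msg;
}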
Example #6
StatusWith<LocksType> LocksType::fromBSON(const BSONObj& source) {
    LocksType lock;

    {
        std::string lockName;
        Status status = bsonExtractStringField(source, name.name(), &lockName);
        if (!status.isOK())
            return status;
        lock._name = lockName;
    }

    {
        long long lockStateInt;
        Status status = bsonExtractIntegerField(source, state.name(), &lockStateInt);
        if (!status.isOK())
            return status;
        lock._state = static_cast<State>(lockStateInt);
    }

    if (source.hasField(process.name())) {
        std::string lockProcess;
        Status status = bsonExtractStringField(source, process.name(), &lockProcess);
        if (!status.isOK())
            return status;
        lock._process = lockProcess;
    }

    if (source.hasField(lockID.name())) {
        BSONElement lockIDElem;
        Status status = bsonExtractTypedField(source, lockID.name(), BSONType::jstOID, &lockIDElem);
        if (!status.isOK())
            return status;
        lock._lockID = lockIDElem.OID();
    }

    if (source.hasField(who.name())) {
        std::string lockWho;
        Status status = bsonExtractStringField(source, who.name(), &lockWho);
        if (!status.isOK())
            return status;
        lock._who = lockWho;
    }

    if (source.hasField(why.name())) {
        std::string lockWhy;
        Status status = bsonExtractStringField(source, why.name(), &lockWhy);
        if (!status.isOK())
            return status;
        lock._why = lockWhy;
    }

    return lock;
}
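A hedged usage sketch; the literal field names below are illustrative, since the authoritative names come from the LocksType field definitions (name.name(), state.name(), and so on) consulted above.

// Hedged usage sketch for LocksType::fromBSON(); field names and the state
// value are illustrative placeholders.
BSONObj doc = BSON("_id" << "balancer"
                         << "state" << 2
                         << "process" << "ConfigServer"
                         << "who" << "ConfigServer:Balancer"
                         << "why" << "doing balance round");
StatusWith<LocksType> parsed = LocksType::fromBSON(doc);
if (parsed.isOK()) {
    const LocksType& lock = parsed.getValue();
    // lock now carries the mandatory name/state plus whichever optional
    // fields were present in the document.
}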
Example #7
    Status parseCreateOrUpdateUserCommands(const BSONObj& cmdObj,
                                           const StringData& cmdName,
                                           const std::string& dbname,
                                           CreateOrUpdateUserArgs* parsedArgs) {
        unordered_set<std::string> validFieldNames;
        validFieldNames.insert(cmdName.toString());
        validFieldNames.insert("customData");
        validFieldNames.insert("pwd");
        validFieldNames.insert("roles");
        validFieldNames.insert("writeConcern");

        Status status = _checkNoExtraFields(cmdObj, cmdName, validFieldNames);
        if (!status.isOK()) {
            return status;
        }

        status = _extractWriteConcern(cmdObj, &parsedArgs->writeConcern);
        if (!status.isOK()) {
            return status;
        }

        BSONObjBuilder userObjBuilder;

        // Parse user name
        std::string userName;
        status = bsonExtractStringField(cmdObj, cmdName, &userName);
        if (!status.isOK()) {
            return status;
        }

        parsedArgs->userName = UserName(userName, dbname);

        // Parse password
        if (cmdObj.hasField("pwd")) {
            std::string clearTextPassword;
            status = bsonExtractStringField(cmdObj, "pwd", &clearTextPassword);
            if (!status.isOK()) {
                return status;
            }

            parsedArgs->hashedPassword = auth::createPasswordDigest(userName, clearTextPassword);
            parsedArgs->hasHashedPassword = true;
        }

        // Parse custom data
        if (cmdObj.hasField("customData")) {
            BSONElement element;
            status = bsonExtractTypedField(cmdObj, "customData", Object, &element);
            if (!status.isOK()) {
                return status;
            }
            parsedArgs->customData = element.Obj();
            parsedArgs->hasCustomData = true;
        }

        // Parse roles
        if (cmdObj.hasField("roles")) {
            BSONElement rolesElement;
            status = bsonExtractTypedField(cmdObj, "roles", Array, &rolesElement);
            if (!status.isOK()) {
                return status;
            }
            status = _extractRoleDataFromBSONArray(rolesElement, dbname, &parsedArgs->roles);
            if (!status.isOK()) {
                return status;
            }
            parsedArgs->hasRoles = true;
        }

        return Status::OK();
    }
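For illustration, a command document this parser accepts when cmdName is "createUser" (a hedged sketch; role entries may also be written in other forms that _extractRoleDataFromBSONArray is assumed to resolve against dbname).

// Hedged sketch of input/output for parseCreateOrUpdateUserCommands().
BSONObj cmdObj = BSON("createUser" << "appUser"
                      << "pwd" << "secret"
                      << "customData" << BSON("team" << "backend")
                      << "roles" << BSON_ARRAY(BSON("role" << "readWrite"
                                                    << "db" << "app")));
CreateOrUpdateUserArgs parsedArgs;
Status status = parseCreateOrUpdateUserCommands(cmdObj, "createUser", "app", &parsedArgs);
// On success, parsedArgs.hasHashedPassword/hasCustomData/hasRoles are all true,
// and hashedPassword holds the password digest -- never the clear-text "pwd".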
Example #8
    bool wrappedRun(OperationContext* txn,
                    const string& dbname,
                    BSONObj& jsobj,
                    string& errmsg,
                    BSONObjBuilder& anObjBuilder) {
        BSONElement e = jsobj.firstElement();
        const string toDeleteNs = dbname + '.' + e.valuestr();
        if (!serverGlobalParams.quiet) {
            LOG(0) << "CMD: dropIndexes " << toDeleteNs << endl;
        }

        Client::Context ctx(toDeleteNs);
        Database* db = ctx.db();

        Collection* collection = db->getCollection( txn, toDeleteNs );
        if ( ! collection ) {
            errmsg = "ns not found";
            return false;
        }

        stopIndexBuilds(txn, db, jsobj);

        IndexCatalog* indexCatalog = collection->getIndexCatalog();
        anObjBuilder.appendNumber("nIndexesWas", indexCatalog->numIndexesTotal() );


        BSONElement f = jsobj.getField("index");
        if ( f.type() == String ) {

            string indexToDelete = f.valuestr();

            if ( indexToDelete == "*" ) {
                Status s = indexCatalog->dropAllIndexes(txn, false);
                if ( !s.isOK() ) {
                    appendCommandStatus( anObjBuilder, s );
                    return false;
                }
                anObjBuilder.append("msg", "non-_id indexes dropped for collection");
                return true;
            }

            IndexDescriptor* desc = indexCatalog->findIndexByName( indexToDelete );
            if ( desc == NULL ) {
                errmsg = str::stream() << "index not found with name [" << indexToDelete << "]";
                return false;
            }

            if ( desc->isIdIndex() ) {
                errmsg = "cannot drop _id index";
                return false;
            }

            Status s = indexCatalog->dropIndex(txn, desc);
            if ( !s.isOK() ) {
                appendCommandStatus( anObjBuilder, s );
                return false;
            }

            return true;
        }

        if ( f.type() == Object ) {
            IndexDescriptor* desc = indexCatalog->findIndexByKeyPattern( f.embeddedObject() );
            if ( desc == NULL ) {
                errmsg = "can't find index with key:";
                errmsg += f.embeddedObject().toString();
                return false;
            }

            if ( desc->isIdIndex() ) {
                errmsg = "cannot drop _id index";
                return false;
            }

            Status s = indexCatalog->dropIndex(txn, desc);
            if ( !s.isOK() ) {
                appendCommandStatus( anObjBuilder, s );
                return false;
            }

            return true;
        }

        errmsg = "invalid index name spec";
        return false;
    }
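The three request shapes this handler accepts, as a hedged sketch (collection and index names are placeholders):

// Hedged sketch: command objects routed into wrappedRun() above.
BSONObj byName    = BSON("dropIndexes" << "coll" << "index" << "age_1");           // one index by name
BSONObj dropAll   = BSON("dropIndexes" << "coll" << "index" << "*");               // all non-_id indexes
BSONObj byPattern = BSON("dropIndexes" << "coll" << "index" << BSON("age" << 1));  // by key pattern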
Example #9
    bool run(OperationContext* txn, const string& dbname , BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) {
        DBDirectClient db;

        BSONElement e = jsobj.firstElement();
        string toDeleteNs = dbname + '.' + e.valuestr();

        LOG(0) << "CMD: reIndex " << toDeleteNs << endl;

        Lock::DBWrite dbXLock(txn->lockState(), dbname);
        Client::Context ctx(toDeleteNs);

        Collection* collection = ctx.db()->getCollection( txn, toDeleteNs );

        if ( !collection ) {
            errmsg = "ns not found";
            return false;
        }

        BackgroundOperation::assertNoBgOpInProgForNs( toDeleteNs );

        std::vector<BSONObj> indexesInProg = stopIndexBuilds(txn, ctx.db(), jsobj);

        list<BSONObj> all;
        auto_ptr<DBClientCursor> i = db.query( dbname + ".system.indexes" , BSON( "ns" << toDeleteNs ) , 0 , 0 , 0 , QueryOption_SlaveOk );
        BSONObjBuilder b;
        while ( i->more() ) {
            const BSONObj spec = i->next().removeField("v").getOwned();
            const BSONObj key = spec.getObjectField("key");
            const Status keyStatus = validateKeyPattern(key);
            if (!keyStatus.isOK()) {
                errmsg = str::stream()
                         << "Cannot rebuild index " << spec << ": " << keyStatus.reason()
                         << " For more info see http://dochub.mongodb.org/core/index-validation";
                return false;
            }

            b.append( BSONObjBuilder::numStr( all.size() ) , spec );
            all.push_back( spec );
        }
        result.appendNumber( "nIndexesWas", collection->getIndexCatalog()->numIndexesTotal() );

        Status s = collection->getIndexCatalog()->dropAllIndexes(txn, true);
        if ( !s.isOK() ) {
            errmsg = "dropIndexes failed";
            return appendCommandStatus( result, s );
        }

        for ( list<BSONObj>::iterator i=all.begin(); i!=all.end(); i++ ) {
            BSONObj o = *i;
            LOG(1) << "reIndex ns: " << toDeleteNs << " index: " << o << endl;
            Status s = collection->getIndexCatalog()->createIndex(txn, o, false);
            if ( !s.isOK() )
                return appendCommandStatus( result, s );
        }

        result.append( "nIndexes" , (int)all.size() );
        result.appendArray( "indexes" , b.obj() );

        IndexBuilder::restoreIndexes(indexesInProg);
        return true;
    }
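The corresponding request is minimal; a hedged sketch:

// Hedged sketch: the command object routed into run() above.
BSONObj cmd = BSON("reIndex" << "coll");  // rebuilds every index on dbname.coll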
Example #10
Status MetadataLoader::initChunks(CatalogManager* catalogManager,
                                  const string& ns,
                                  const string& shard,
                                  const CollectionMetadata* oldMetadata,
                                  CollectionMetadata* metadata) const
{
    map<string, ChunkVersion> versionMap;

    // Preserve the epoch
    versionMap[shard] = metadata->_shardVersion;
    OID epoch = metadata->getCollVersion().epoch();
    bool fullReload = true;

    // Check to see if we should use the old version or not.
    if ( oldMetadata ) {

        // If our epochs are compatible, it's useful to use the old metadata for diffs
        if ( oldMetadata->getCollVersion().hasEqualEpoch( epoch ) ) {

            fullReload = false;
            invariant( oldMetadata->isValid() );

            versionMap[shard] = oldMetadata->_shardVersion;
            metadata->_collVersion = oldMetadata->_collVersion;

            // TODO: This could be made more efficient if copying not required, but
            // not as frequently reloaded as in mongos.
            metadata->_chunksMap = oldMetadata->_chunksMap;

            LOG( 2 ) << "loading new chunks for collection " << ns
                     << " using old metadata w/ version " << oldMetadata->getShardVersion()
                     << " and " << metadata->_chunksMap.size() << " chunks" << endl;
        }
        else {
            warning() << "reloading collection metadata for " << ns << " with new epoch "
                      << epoch.toString() << ", the current epoch is "
                      << oldMetadata->getCollVersion().epoch().toString() << endl;
        }
    }


    // Exposes the new metadata's range map and version to the "differ," which
    // is ultimately responsible for filling them in.
    SCMConfigDiffTracker differ( shard );
    differ.attach( ns, metadata->_chunksMap, metadata->_collVersion, versionMap );

    try {

        std::vector<ChunkType> chunks;
        Status status = catalogManager->getChunks(differ.configDiffQuery(), &chunks);
        if (!status.isOK()) {
            if (status == ErrorCodes::HostUnreachable) {
                // Make our metadata invalid
                metadata->_collVersion = ChunkVersion( 0, 0, OID() );
                metadata->_chunksMap.clear();
            }
            return status;
        }

        //
        // The diff tracker should always find at least one chunk (the highest chunk we saw
        // last time).  If not, something has changed on the config server (potentially between
        // when we read the collection data and when we read the chunks data).
        //
        int diffsApplied = differ.calculateConfigDiff(chunks);
        if ( diffsApplied > 0 ) {

            // Chunks found, return ok

            LOG(2) << "loaded " << diffsApplied << " chunks into new metadata for " << ns
                   << " with version " << metadata->_collVersion << endl;

            metadata->_shardVersion = versionMap[shard];
            metadata->fillRanges();

            invariant( metadata->isValid() );
            return Status::OK();
        }
        else if ( diffsApplied == 0 ) {

            // No chunks found, the collection is dropping or we're confused
            // If this is a full reload, assume it is a drop for backwards compatibility
            // TODO: drop the config.collections entry *before* the chunks and eliminate this
            // ambiguity

            string errMsg =
                str::stream() << "no chunks found when reloading " << ns
                << ", previous version was "
                << metadata->_collVersion.toString()
                << ( fullReload ? ", this is a drop" : "" );

            warning() << errMsg << endl;

            metadata->_collVersion = ChunkVersion( 0, 0, OID() );
            metadata->_chunksMap.clear();

            return fullReload ? Status( ErrorCodes::NamespaceNotFound, errMsg ) :
                   Status( ErrorCodes::RemoteChangeDetected, errMsg );
        }
        else {

            // Invalid chunks found, our epoch may have changed because we dropped/recreated
            // the collection.

            string errMsg =
                str::stream() << "invalid chunks found when reloading " << ns
                << ", previous version was "
                << metadata->_collVersion.toString()
                << ", this should be rare";

            warning() << errMsg << endl;

            metadata->_collVersion = ChunkVersion( 0, 0, OID() );
            metadata->_chunksMap.clear();

            return Status( ErrorCodes::RemoteChangeDetected, errMsg );
        }
    }
    catch ( const DBException& e ) {
        string errMsg = str::stream() << "problem querying chunks metadata" << causedBy( e );

        // Treat any exception during the chunk query as a connectivity problem,
        // since we cannot tell whether the metadata read was complete.

        return Status( ErrorCodes::HostUnreachable, errMsg );
    }
}
Example #11
    // Construct a subscription if needed
    // =============================================================================================
    Status SubscriptionFactory::acquireSubscription(
            const SubscriptionSettings& subscriptionSettings,
            Subscription*&              subscription)
    {
        logger_->debug("Acquiring subscription with the following settings:");
        logger_->debug(subscriptionSettings.toString());

        Status ret;

        subscription = 0;

        // lock the mutex to make sure the subscriptionMap_ is not being manipulated
        UaMutexLocker locker(&subscriptionMapMutex_);

        // we'll try to find a similar subscription that can be re-used,
        // unless we're creating an "unique" subscription
        // (one that is only created for -and used by- the current request)
        if (subscriptionSettings.unique)
        {
            logger_->debug("The requested subscription must be unique");
        }
        else
        {
            // loop through the subscriptions ...
            for (SubscriptionMap::const_iterator it = subscriptionMap_.begin();
                 it != subscriptionMap_.end();
                 ++it)
            {
                // ... until a suitable one is found
                if (it->second->subscriptionSettings() == subscriptionSettings)
                {
                    subscription = it->second;
                    logger_->debug("A suitable subscription (ClientSubscriptionHandle=%d) already exists",
                                   subscription->clientSubscriptionHandle());

                    // get the ClientSubscriptionHandle of the subscription
                    ClientSubscriptionHandle handle = subscription->clientSubscriptionHandle();

                    // now increment the activity count of the subscription
                    activityMapMutex_.lock();
                    activityMap_[handle] = activityMap_[handle] + 1;
                    activityMapMutex_.unlock();

                    ret.setGood();

                    break;
                }
            }
        }

        // if no subscription exists yet, we create one
        if (subscription == 0)
        {
            ClientSubscriptionHandle clientSubscriptionHandle;
            clientSubscriptionHandle = database_->createUniqueClientSubscriptionHandle();

            logger_->debug("We create a new subscription with clientSubscriptionHandle %d",
                           clientSubscriptionHandle);

            // create a new subscription instance
            subscription = new Subscription(
                    logger_->loggerFactory(),
                    subscriptionSettings,
                    clientSubscriptionHandle,
                    clientConnectionId_,
                    uaSession_,
                    this,
                    clientInterface_,
                    database_);

            // store the new subscription instance in the subscriptionMap
            subscriptionMap_[clientSubscriptionHandle] = subscription;

            logger_->debug("The new subscription has been created");

            // create an activity count for the subscription
            activityMapMutex_.lock();
            activityMap_[clientSubscriptionHandle] = 1;
            activityMapMutex_.unlock();

            // create the subscription on the server
            ret = subscription->createSubscription();
        }

        // 'subscription' now points to an existing Subscription instance
        // (i.e. a valid memory location). Note: do not overwrite 'ret' here,
        // or a failure status from createSubscription() would be masked.

        // add some diagnostics
        if (ret.isGood())
        {
            activityMapMutex_.lock();
            logger_->debug("The requested subscription is acquired (#activities: %d)",
                           activityMap_[subscription->clientSubscriptionHandle()]);
            activityMapMutex_.unlock();
        }
        else
        {
            logger_->error("The requested subscription could not be acquired");
            ret.addDiagnostic("The requested subscription could not be acquired");
        }
        return ret;
    }
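A hedged caller-side sketch; 'factory' is a hypothetical SubscriptionFactory*, and since no SubscriptionSettings fields are shown in this listing, none are set here.

// Hedged usage sketch for SubscriptionFactory::acquireSubscription().
SubscriptionSettings settings;   // defaults; settings.unique == false assumed
Subscription* subscription = 0;  // filled in on success
Status status = factory->acquireSubscription(settings, subscription);
if (status.isGood())
{
    // 'subscription' may be shared with earlier callers that used equal settings.
}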
Example #12
    // ran at startup.
    static void repairDatabasesAndCheckVersion(bool shouldClearNonLocalTmpCollections) {
        //        LastError * le = lastError.get( true );
        LOG(1) << "enter repairDatabases (to check pdfile version #)" << endl;

        Lock::GlobalWrite lk;
        DurTransaction txn;
        vector< string > dbNames;
        getDatabaseNames( dbNames );
        for ( vector< string >::iterator i = dbNames.begin(); i != dbNames.end(); ++i ) {
            string dbName = *i;
            LOG(1) << "\t" << dbName << endl;

            Client::Context ctx( dbName );
            DataFile *p = ctx.db()->getExtentManager().getFile( 0 );
            DataFileHeader *h = p->getHeader();

            if ( replSettings.usingReplSets() ) {
                // we only care about the _id index if we are in a replset
                checkForIdIndexes(ctx.db());
            }

            if (shouldClearNonLocalTmpCollections || dbName == "local")
                ctx.db()->clearTmpCollections(&txn);

            if (!h->isCurrentVersion() || mongodGlobalParams.repair) {

                if( h->version <= 0 ) {
                    uasserted(14026,
                      str::stream() << "db " << dbName << " appears corrupt pdfile version: " << h->version
                                    << " info: " << h->versionMinor << ' ' << h->fileLength);
                }

                if ( !h->isCurrentVersion() ) {
                    log() << "****" << endl;
                    log() << "****" << endl;
                    log() << "need to upgrade database " << dbName << " "
                          << "with pdfile version " << h->version << "." << h->versionMinor << ", "
                          << "new version: "
                          << PDFILE_VERSION << "." << PDFILE_VERSION_MINOR_22_AND_OLDER
                          << endl;
                }

                if (mongodGlobalParams.upgrade) {
                    // QUESTION: Repair even if file format is higher version than code?
                    doDBUpgrade( dbName, h );
                }
                else {
                    log() << "\t Not upgrading, exiting" << endl;
                    log() << "\t run --upgrade to upgrade dbs, then start again" << endl;
                    log() << "****" << endl;
                    dbexit( EXIT_NEED_UPGRADE );
                    mongodGlobalParams.upgrade = 1;
                    return;
                }
            }
            else {
                const string systemIndexes = ctx.db()->name() + ".system.indexes";
                Collection* coll = ctx.db()->getCollection( systemIndexes );
                auto_ptr<Runner> runner(InternalPlanner::collectionScan(systemIndexes,coll));
                BSONObj index;
                Runner::RunnerState state;
                while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&index, NULL))) {
                    const BSONObj key = index.getObjectField("key");
                    const string plugin = IndexNames::findPluginName(key);

                    if (h->versionMinor == PDFILE_VERSION_MINOR_22_AND_OLDER) {
                        if (IndexNames::existedBefore24(plugin))
                            continue;

                        log() << "Index " << index << " claims to be of type '" << plugin << "', "
                              << "which is either invalid or did not exist before v2.4. "
                              << "See the upgrade section: "
                              << "http://dochub.mongodb.org/core/upgrade-2.4"
                              << startupWarningsLog;
                    }

                    const Status keyStatus = validateKeyPattern(key);
                    if (!keyStatus.isOK()) {
                        log() << "Problem with index " << index << ": " << keyStatus.reason()
                              << " This index can still be used however it cannot be rebuilt."
                              << " For more info see"
                              << " http://dochub.mongodb.org/core/index-validation"
                              << startupWarningsLog;
                    }
                }

                if (Runner::RUNNER_EOF != state) {
                    warning() << "Internal error while reading collection " << systemIndexes;
                }

                Database::closeDatabase(dbName.c_str(), storageGlobalParams.dbpath);
            }
        }

        LOG(1) << "done repairDatabases" << endl;

        if (mongodGlobalParams.upgrade) {
            log() << "finished checking dbs" << endl;
            cc().shutdown();
            dbexit( EXIT_CLEAN );
        }
    }
Example #13
    StatusWith<BSONObj> fixDocumentForInsert( const BSONObj& doc ) {
        if ( doc.objsize() > BSONObjMaxUserSize )
            return StatusWith<BSONObj>( ErrorCodes::BadValue,
                                        str::stream()
                                        << "object to insert too large: "
                                        << doc.objsize() );

        bool firstElementIsId = doc.firstElement().fieldNameStringData() == "_id";
        bool hasTimestampToFix = false;
        {
            BSONObjIterator i( doc );
            while ( i.more() ) {
                BSONElement e = i.next();

                if ( e.type() == Timestamp && e.timestampValue() == 0 ) {
                    // we replace Timestamp(0,0) at the top level with a correct value
                    // in the fast pass, we just mark that we want to swap
                    hasTimestampToFix = true;
                    break;
                }

                const char* fieldName = e.fieldName();

                if ( fieldName[0] == '$' ) {
                    return StatusWith<BSONObj>( ErrorCodes::BadValue,
                                                str::stream()
                                                << "Document can't have $ prefixed field names: "
                                                << e.fieldName() );
                }

                // check no regexp for _id (SERVER-9502)
                // also, disallow undefined and arrays
                if ( str::equals( fieldName, "_id") ) {
                    if ( e.type() == RegEx ) {
                        return StatusWith<BSONObj>( ErrorCodes::BadValue,
                                                    "can't use a regex for _id" );
                    }
                    if ( e.type() == Undefined ) {
                        return StatusWith<BSONObj>( ErrorCodes::BadValue,
                                                    "can't use a undefined for _id" );
                    }
                    if ( e.type() == Array ) {
                        return StatusWith<BSONObj>( ErrorCodes::BadValue,
                                                    "can't use an array for _id" );
                    }
                    if ( e.type() == Object ) {
                        BSONObj o = e.Obj();
                        Status s = o.storageValidEmbedded();
                        if ( !s.isOK() )
                            return StatusWith<BSONObj>( s );
                    }
                }

            }
        }

        if ( firstElementIsId && !hasTimestampToFix )
            return StatusWith<BSONObj>( BSONObj() );

        bool hadId = firstElementIsId;

        BSONObjIterator i( doc );

        BSONObjBuilder b( doc.objsize() + 16 );
        if ( firstElementIsId ) {
            b.append( doc.firstElement() );
            i.next();
        }
        else {
            BSONElement e = doc["_id"];
            if ( e.type() ) {
                b.append( e );
                hadId = true;
            }
            else {
                b.appendOID( "_id", NULL, true );
            }
        }

        while ( i.more() ) {
            BSONElement e = i.next();
            if ( hadId && e.fieldNameStringData() == "_id" ) {
                // no-op
            }
            else if ( e.type() == Timestamp && e.timestampValue() == 0 ) {
                mutex::scoped_lock lk(OpTime::m);
                b.append( e.fieldName(), OpTime::now(lk) );
            }
            else {
                b.append( e );
            }
        }
        return StatusWith<BSONObj>( b.obj() );
    }
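What callers observe, as a hedged sketch: an empty returned object means the document can be inserted as-is, while a non-empty one is the rewritten copy.

// Hedged usage sketch for fixDocumentForInsert().
StatusWith<BSONObj> fixed = fixDocumentForInsert( BSON( "x" << 1 ) );
if ( fixed.isOK() ) {
    if ( fixed.getValue().isEmpty() ) {
        // Document needed no rewriting (_id already first, no zero timestamps).
    }
    else {
        // Rewritten copy: a freshly generated ObjectId _id prepended before "x".
    }
}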
Example #14
StatusWith<CompactStats> compactCollection(OperationContext* opCtx,
                                           Collection* collection,
                                           const CompactOptions* compactOptions) {
    dassert(opCtx->lockState()->isCollectionLockedForMode(collection->ns(), MODE_X));

    DisableDocumentValidation validationDisabler(opCtx);

    auto recordStore = collection->getRecordStore();
    auto indexCatalog = collection->getIndexCatalog();

    if (!recordStore->compactSupported())
        return StatusWith<CompactStats>(ErrorCodes::CommandNotSupported,
                                        str::stream()
                                            << "cannot compact collection with record store: "
                                            << recordStore->name());

    if (recordStore->compactsInPlace()) {
        CompactStats stats;
        Status status = recordStore->compact(opCtx);
        if (!status.isOK())
            return StatusWith<CompactStats>(status);

        // Compact all indexes (not including unfinished indexes)
        status = indexCatalog->compactIndexes(opCtx);
        if (!status.isOK())
            return StatusWith<CompactStats>(status);

        return StatusWith<CompactStats>(stats);
    }

    if (indexCatalog->numIndexesInProgress(opCtx))
        return StatusWith<CompactStats>(ErrorCodes::BadValue,
                                        "cannot compact when indexes in progress");

    std::vector<BSONObj> indexSpecs;
    {
        std::unique_ptr<IndexCatalog::IndexIterator> ii(
            indexCatalog->getIndexIterator(opCtx, false));
        while (ii->more()) {
            const IndexDescriptor* descriptor = ii->next()->descriptor();

            // Compact always creates the new index in the foreground.
            const BSONObj spec =
                descriptor->infoObj().removeField(IndexDescriptor::kBackgroundFieldName);
            const BSONObj key = spec.getObjectField("key");
            const Status keyStatus =
                index_key_validate::validateKeyPattern(key, descriptor->version());
            if (!keyStatus.isOK()) {
                return StatusWith<CompactStats>(
                    ErrorCodes::CannotCreateIndex,
                    str::stream() << "Cannot compact collection due to invalid index " << spec
                                  << ": "
                                  << keyStatus.reason()
                                  << " For more info see"
                                  << " http://dochub.mongodb.org/core/index-validation");
            }
            indexSpecs.push_back(spec);
        }
    }

    // Give a chance to be interrupted *before* we drop all indexes.
    opCtx->checkForInterrupt();

    {
        // note that the drop indexes call also invalidates all clientcursors for the namespace,
        // which is important and wanted here
        WriteUnitOfWork wunit(opCtx);
        log() << "compact dropping indexes";
        indexCatalog->dropAllIndexes(opCtx, true);
        wunit.commit();
    }

    CompactStats stats;

    MultiIndexBlock indexer;
    indexer.ignoreUniqueConstraint();  // in compact we should be doing no checking

    // The 'indexer' could throw, so ensure build cleanup occurs.
    ON_BLOCK_EXIT([&] { indexer.cleanUpAfterBuild(opCtx, collection); });

    Status status =
        indexer.init(opCtx, collection, indexSpecs, MultiIndexBlock::kNoopOnInitFn).getStatus();
    if (!status.isOK())
        return StatusWith<CompactStats>(status);

    status = recordStore->compact(opCtx);
    if (!status.isOK())
        return StatusWith<CompactStats>(status);

    log() << "starting index commits";
    status = indexer.dumpInsertsFromBulk(opCtx);
    if (!status.isOK())
        return StatusWith<CompactStats>(status);

    {
        WriteUnitOfWork wunit(opCtx);
        status = indexer.commit(opCtx,
                                collection,
                                MultiIndexBlock::kNoopOnCreateEachFn,
                                MultiIndexBlock::kNoopOnCommitFn);
        if (!status.isOK()) {
            return StatusWith<CompactStats>(status);
        }
        wunit.commit();
    }

    return StatusWith<CompactStats>(stats);
}
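A hedged invocation sketch; per the dassert at the top of the function, the caller must already hold the collection's MODE_X lock (AutoGetCollection and 'nss' are assumptions, not shown in this listing).

// Hedged usage sketch for compactCollection().
AutoGetCollection autoColl(opCtx, nss, MODE_X);  // 'nss' is a hypothetical NamespaceString
if (Collection* collection = autoColl.getCollection()) {
    CompactOptions options;
    StatusWith<CompactStats> result = compactCollection(opCtx, collection, &options);
    if (!result.isOK()) {
        // handle result.getStatus()
    }
}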
Example #15
    void SyncSourceFeedback::run() {
        Client::initThread("SyncSourceFeedbackThread");
        OperationContextImpl txn;

        bool positionChanged = false;
        bool handshakeNeeded = false;
        ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();
        while (!inShutdown()) { // TODO(spencer): Remove once legacy repl coordinator is gone.
            {
                boost::unique_lock<boost::mutex> lock(_mtx);
                while (!_positionChanged && !_handshakeNeeded && !_shutdownSignaled) {
                    _cond.wait(lock);
                }

                if (_shutdownSignaled) {
                    break;
                }

                positionChanged = _positionChanged;
                handshakeNeeded = _handshakeNeeded;
                _positionChanged = false;
                _handshakeNeeded = false;
            }

            MemberState state = replCoord->getCurrentMemberState();
            if (state.primary() || state.startup()) {
                _resetConnection();
                continue;
            }
            const HostAndPort target = BackgroundSync::get()->getSyncTarget();
            if (_syncTarget != target) {
                _resetConnection();
                _syncTarget = target;
            }
            if (!hasConnection()) {
                // fix connection if need be
                if (target.empty()) {
                    sleepmillis(500);
                    continue;
                }
                if (!_connect(&txn, target)) {
                    sleepmillis(500);
                    continue;
                }
                handshakeNeeded = true;
            }
            if (handshakeNeeded) {
                positionChanged = true;
                if (!replHandshake(&txn)) {
                    boost::unique_lock<boost::mutex> lock(_mtx);
                    _handshakeNeeded = true;
                    continue;
                }
            }
            if (positionChanged) {
                Status status = updateUpstream(&txn);
                if (!status.isOK()) {
                    boost::unique_lock<boost::mutex> lock(_mtx);
                    _positionChanged = true;
                    if (status == ErrorCodes::NodeNotFound) {
                        _handshakeNeeded = true;
                    }
                }
            }
        }
        cc().shutdown();
    }
Example #16
    PlanStage* parseQuery(OperationContext* txn,
                          Collection* collection,
                          BSONObj obj,
                          WorkingSet* workingSet,
                          OwnedPointerVector<MatchExpression>* exprs) {
        BSONElement firstElt = obj.firstElement();
        if (!firstElt.isABSONObj()) {
            return NULL;
        }
        BSONObj paramObj = firstElt.Obj();

        MatchExpression* matcher = NULL;
        BSONObj nodeArgs;

        // Every node has these two fields.
        const string filterTag = "filter";
        const string argsTag = "args";

        BSONObjIterator it(paramObj);
        while (it.more()) {
            BSONElement e = it.next();
            if (!e.isABSONObj()) {
                return NULL;
            }
            BSONObj argObj = e.Obj();
            if (filterTag == e.fieldName()) {
                StatusWithMatchExpression statusWithMatcher = MatchExpressionParser::parse(
                    argObj, ExtensionsCallbackReal(txn, &collection->ns()));
                if (!statusWithMatcher.isOK()) {
                    return NULL;
                }
                std::unique_ptr<MatchExpression> me = std::move(statusWithMatcher.getValue());
                // exprs is what will wind up deleting this.
                matcher = me.release();
                verify(NULL != matcher);
                exprs->mutableVector().push_back(matcher);
            } else if (argsTag == e.fieldName()) {
                nodeArgs = argObj;
            } else {
                uasserted(16910,
                          "Unknown fieldname " + string(e.fieldName()) + " in query node " +
                              obj.toString());
                return NULL;
            }
        }

        string nodeName = firstElt.fieldName();

        if ("ixscan" == nodeName) {
            // This'll throw if it's not an obj but that's OK.
            BSONObj keyPatternObj = nodeArgs["keyPattern"].Obj();

            IndexDescriptor* desc =
                collection->getIndexCatalog()->findIndexByKeyPattern(txn, keyPatternObj);
            uassert(16890, "Can't find index: " + keyPatternObj.toString(), desc);

            IndexScanParams params;
            params.descriptor = desc;
            params.bounds.isSimpleRange = true;
            params.bounds.startKey = stripFieldNames(nodeArgs["startKey"].Obj());
            params.bounds.endKey = stripFieldNames(nodeArgs["endKey"].Obj());
            params.bounds.endKeyInclusive = nodeArgs["endKeyInclusive"].Bool();
            params.direction = nodeArgs["direction"].numberInt();

            return new IndexScan(txn, params, workingSet, matcher);
        } else if ("andHash" == nodeName) {
            uassert(
                16921, "Nodes argument must be provided to AND", nodeArgs["nodes"].isABSONObj());

            auto andStage = make_unique<AndHashStage>(txn, workingSet, collection);

            int nodesAdded = 0;
            BSONObjIterator it(nodeArgs["nodes"].Obj());
            while (it.more()) {
                BSONElement e = it.next();
                uassert(16922, "node of AND isn't an obj?: " + e.toString(), e.isABSONObj());

                PlanStage* subNode = parseQuery(txn, collection, e.Obj(), workingSet, exprs);
                uassert(
                    16923, "Can't parse sub-node of AND: " + e.Obj().toString(), NULL != subNode);
                // takes ownership
                andStage->addChild(subNode);
                ++nodesAdded;
            }

            uassert(16927, "AND requires more than one child", nodesAdded >= 2);

            return andStage.release();
        } else if ("andSorted" == nodeName) {
            uassert(
                16924, "Nodes argument must be provided to AND", nodeArgs["nodes"].isABSONObj());

            auto andStage = make_unique<AndSortedStage>(txn, workingSet, collection);

            int nodesAdded = 0;
            BSONObjIterator it(nodeArgs["nodes"].Obj());
            while (it.more()) {
                BSONElement e = it.next();
                uassert(16925, "node of AND isn't an obj?: " + e.toString(), e.isABSONObj());

                PlanStage* subNode = parseQuery(txn, collection, e.Obj(), workingSet, exprs);
                uassert(
                    16926, "Can't parse sub-node of AND: " + e.Obj().toString(), NULL != subNode);
                // takes ownership
                andStage->addChild(subNode);
                ++nodesAdded;
            }

            uassert(16928, "AND requires more than one child", nodesAdded >= 2);

            return andStage.release();
        } else if ("or" == nodeName) {
            uassert(
                16934, "Nodes argument must be provided to OR", nodeArgs["nodes"].isABSONObj());
            uassert(16935, "Dedup argument must be provided to OR", !nodeArgs["dedup"].eoo());
            BSONObjIterator it(nodeArgs["nodes"].Obj());
            auto orStage = make_unique<OrStage>(txn, workingSet, nodeArgs["dedup"].Bool(), matcher);
            while (it.more()) {
                BSONElement e = it.next();
                if (!e.isABSONObj()) {
                    return NULL;
                }
                PlanStage* subNode = parseQuery(txn, collection, e.Obj(), workingSet, exprs);
                uassert(
                    16936, "Can't parse sub-node of OR: " + e.Obj().toString(), NULL != subNode);
                // takes ownership
                orStage->addChild(subNode);
            }

            return orStage.release();
        } else if ("fetch" == nodeName) {
            uassert(
                16929, "Node argument must be provided to fetch", nodeArgs["node"].isABSONObj());
            PlanStage* subNode =
                parseQuery(txn, collection, nodeArgs["node"].Obj(), workingSet, exprs);
            uassert(28731,
                    "Can't parse sub-node of FETCH: " + nodeArgs["node"].Obj().toString(),
                    NULL != subNode);
            return new FetchStage(txn, workingSet, subNode, matcher, collection);
        } else if ("limit" == nodeName) {
            uassert(
                16937, "Limit stage doesn't have a filter (put it on the child)", NULL == matcher);
            uassert(
                16930, "Node argument must be provided to limit", nodeArgs["node"].isABSONObj());
            uassert(16931, "Num argument must be provided to limit", nodeArgs["num"].isNumber());
            PlanStage* subNode =
                parseQuery(txn, collection, nodeArgs["node"].Obj(), workingSet, exprs);
            uassert(28732,
                    "Can't parse sub-node of LIMIT: " + nodeArgs["node"].Obj().toString(),
                    NULL != subNode);
            return new LimitStage(txn, nodeArgs["num"].numberInt(), workingSet, subNode);
        } else if ("skip" == nodeName) {
            uassert(
                16938, "Skip stage doesn't have a filter (put it on the child)", NULL == matcher);
            uassert(16932, "Node argument must be provided to skip", nodeArgs["node"].isABSONObj());
            uassert(16933, "Num argument must be provided to skip", nodeArgs["num"].isNumber());
            PlanStage* subNode =
                parseQuery(txn, collection, nodeArgs["node"].Obj(), workingSet, exprs);
            uassert(28733,
                    "Can't parse sub-node of SKIP: " + nodeArgs["node"].Obj().toString(),
                    NULL != subNode);
            return new SkipStage(txn, nodeArgs["num"].numberInt(), workingSet, subNode);
        } else if ("cscan" == nodeName) {
            CollectionScanParams params;
            params.collection = collection;

            // What direction?
            uassert(16963,
                    "Direction argument must be specified and be a number",
                    nodeArgs["direction"].isNumber());
            if (1 == nodeArgs["direction"].numberInt()) {
                params.direction = CollectionScanParams::FORWARD;
            } else {
                params.direction = CollectionScanParams::BACKWARD;
            }

            return new CollectionScan(txn, params, workingSet, matcher);
        }
// sort is disabled for now.
#if 0
            else if ("sort" == nodeName) {
                uassert(16969, "Node argument must be provided to sort",
                        nodeArgs["node"].isABSONObj());
                uassert(16970, "Pattern argument must be provided to sort",
                        nodeArgs["pattern"].isABSONObj());
                PlanStage* subNode = parseQuery(txn, db, nodeArgs["node"].Obj(), workingSet, exprs);
                SortStageParams params;
                params.pattern = nodeArgs["pattern"].Obj();
                return new SortStage(params, workingSet, subNode);
            }
#endif
        else if ("mergeSort" == nodeName) {
            uassert(16971,
                    "Nodes argument must be provided to mergeSort",
                    nodeArgs["nodes"].isABSONObj());
            uassert(16972,
                    "Pattern argument must be provided to mergeSort",
                    nodeArgs["pattern"].isABSONObj());

            MergeSortStageParams params;
            params.pattern = nodeArgs["pattern"].Obj();
            // Dedup is true by default.

            auto mergeStage = make_unique<MergeSortStage>(txn, params, workingSet, collection);

            BSONObjIterator it(nodeArgs["nodes"].Obj());
            while (it.more()) {
                BSONElement e = it.next();
                uassert(16973, "node of mergeSort isn't an obj?: " + e.toString(), e.isABSONObj());

                PlanStage* subNode = parseQuery(txn, collection, e.Obj(), workingSet, exprs);
                uassert(16974,
                        "Can't parse sub-node of mergeSort: " + e.Obj().toString(),
                        NULL != subNode);
                // takes ownership
                mergeStage->addChild(subNode);
            }
            return mergeStage.release();
        } else if ("text" == nodeName) {
            string search = nodeArgs["search"].String();

            vector<IndexDescriptor*> idxMatches;
            collection->getIndexCatalog()->findIndexByType(txn, "text", idxMatches);
            uassert(17194, "Expected exactly one text index", idxMatches.size() == 1);

            IndexDescriptor* index = idxMatches[0];
            FTSAccessMethod* fam =
                dynamic_cast<FTSAccessMethod*>(collection->getIndexCatalog()->getIndex(index));
            TextStageParams params(fam->getSpec());
            params.index = index;

            // TODO: Deal with non-empty filters.  This is a hack to put in covering information
            // that can only be checked for equality.  We ignore this now.
            Status s = fam->getSpec().getIndexPrefix(BSONObj(), &params.indexPrefix);
            if (!s.isOK()) {
                // errmsg = s.toString();
                return NULL;
            }

            params.spec = fam->getSpec();

            params.query.setQuery(search);
            params.query.setLanguage(fam->getSpec().defaultLanguage().str());
            params.query.setCaseSensitive(TextMatchExpressionBase::kCaseSensitiveDefault);
            params.query.setDiacriticSensitive(TextMatchExpressionBase::kDiacriticSensitiveDefault);
            if (!params.query.parse(fam->getSpec().getTextIndexVersion()).isOK()) {
                return NULL;
            }

            return new TextStage(txn, params, workingSet, matcher);
        } else if ("delete" == nodeName) {
            uassert(
                18636, "Delete stage doesn't have a filter (put it on the child)", NULL == matcher);
            uassert(
                18637, "node argument must be provided to delete", nodeArgs["node"].isABSONObj());
            uassert(18638,
                    "isMulti argument must be provided to delete",
                    nodeArgs["isMulti"].type() == Bool);
            PlanStage* subNode =
                parseQuery(txn, collection, nodeArgs["node"].Obj(), workingSet, exprs);
            uassert(28734,
                    "Can't parse sub-node of DELETE: " + nodeArgs["node"].Obj().toString(),
                    NULL != subNode);
            DeleteStageParams params;
            params.isMulti = nodeArgs["isMulti"].Bool();
            return new DeleteStage(txn, params, workingSet, collection, subNode);
        } else {
            return NULL;
        }
    }
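
The dispatcher above follows one pattern throughout: look up the stage by name, uassert that its arguments are present and well typed, recursively parse any children, and transfer ownership of each child into the parent stage. Below is a minimal self-contained sketch of that pattern; every name in it (Stage, AndStage, LeafStage, NodeSpec, parseNode) is a hypothetical stand-in, not MongoDB code.

#include <cassert>
#include <memory>
#include <string>
#include <vector>

// Hypothetical stage tree mirroring parseQuery's ownership model.
struct Stage {
    virtual ~Stage() = default;
};

struct AndStage : Stage {
    std::vector<std::unique_ptr<Stage>> children;
    void addChild(std::unique_ptr<Stage> child) {  // takes ownership
        children.push_back(std::move(child));
    }
};

struct LeafStage : Stage {
    std::string name;
    explicit LeafStage(std::string n) : name(std::move(n)) {}
};

// A node spec: a name plus zero or more sub-specs, standing in for the
// BSONObj arguments parseQuery receives.
struct NodeSpec {
    std::string name;
    std::vector<NodeSpec> nodes;
};

std::unique_ptr<Stage> parseNode(const NodeSpec& spec) {
    if (spec.name == "and") {
        auto andStage = std::make_unique<AndStage>();
        int nodesAdded = 0;
        for (const NodeSpec& sub : spec.nodes) {
            std::unique_ptr<Stage> child = parseNode(sub);
            assert(child && "can't parse sub-node of AND");
            andStage->addChild(std::move(child));
            ++nodesAdded;
        }
        assert(nodesAdded >= 2 && "AND requires more than one child");
        return andStage;
    }
    return std::make_unique<LeafStage>(spec.name);
}

int main() {
    NodeSpec spec{"and", {{"cscan", {}}, {"ixscan", {}}}};
    std::unique_ptr<Stage> root = parseNode(spec);
    assert(root != nullptr);
    return 0;
}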
Example #17
Status ABISysV_ppc::SetReturnValueObject(lldb::StackFrameSP &frame_sp,
                                         lldb::ValueObjectSP &new_value_sp) {
  Status error;
  if (!new_value_sp) {
    error.SetErrorString("Empty value object for return value.");
    return error;
  }

  CompilerType compiler_type = new_value_sp->GetCompilerType();
  if (!compiler_type) {
    error.SetErrorString("Null clang type for return value.");
    return error;
  }

  Thread *thread = frame_sp->GetThread().get();

  bool is_signed;
  uint32_t count;
  bool is_complex;

  RegisterContext *reg_ctx = thread->GetRegisterContext().get();

  bool set_it_simple = false;
  if (compiler_type.IsIntegerOrEnumerationType(is_signed) ||
      compiler_type.IsPointerType()) {
    const RegisterInfo *reg_info = reg_ctx->GetRegisterInfoByName("r3", 0);

    DataExtractor data;
    Status data_error;
    size_t num_bytes = new_value_sp->GetData(data, data_error);
    if (data_error.Fail()) {
      error.SetErrorStringWithFormat(
          "Couldn't convert return value to raw data: %s",
          data_error.AsCString());
      return error;
    }
    lldb::offset_t offset = 0;
    if (num_bytes <= 8) {
      uint64_t raw_value = data.GetMaxU64(&offset, num_bytes);

      if (reg_ctx->WriteRegisterFromUnsigned(reg_info, raw_value))
        set_it_simple = true;
    } else {
      error.SetErrorString("We don't support returning longer than 64 bit "
                           "integer values at present.");
    }
  } else if (compiler_type.IsFloatingPointType(count, is_complex)) {
    if (is_complex)
      error.SetErrorString(
          "We don't support returning complex values at present");
    else {
      size_t bit_width = compiler_type.GetBitSize(frame_sp.get());
      if (bit_width <= 64) {
        DataExtractor data;
        Status data_error;
        size_t num_bytes = new_value_sp->GetData(data, data_error);
        if (data_error.Fail()) {
          error.SetErrorStringWithFormat(
              "Couldn't convert return value to raw data: %s",
              data_error.AsCString());
          return error;
        }

        unsigned char buffer[16];
        ByteOrder byte_order = data.GetByteOrder();

        data.CopyByteOrderedData(0, num_bytes, buffer, 16, byte_order);
        set_it_simple = true;
      } else {
        // FIXME - don't know how to do 80 bit long doubles yet.
        error.SetErrorString(
            "We don't support returning float values > 64 bits at present");
      }
    }
  }

  if (!set_it_simple) {
    // Okay we've got a structure or something that doesn't fit in a simple
    // register.
    // We should figure out where it really goes, but we don't support this yet.
    error.SetErrorString("We only support setting simple integer and float "
                         "return types at present.");
  }

  return error;
}
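
For the integer path above, the value object's bytes are widened into a uint64_t (via DataExtractor::GetMaxU64) before being written to r3. Here is a rough standalone sketch of that widening read, assuming little-endian input; the real DataExtractor honors either byte order, and getMaxU64LE is an illustrative name.

#include <cassert>
#include <cstddef>
#include <cstdint>

// Hypothetical stand-in for DataExtractor::GetMaxU64: widen up to 8
// little-endian bytes into a uint64_t, as done before the r3 write above.
uint64_t getMaxU64LE(const uint8_t* bytes, size_t num_bytes) {
    assert(num_bytes <= 8);
    uint64_t value = 0;
    for (size_t i = 0; i < num_bytes; ++i)
        value |= static_cast<uint64_t>(bytes[i]) << (8 * i);
    return value;
}

int main() {
    const uint8_t raw[4] = {0x78, 0x56, 0x34, 0x12};  // int32 0x12345678
    assert(getMaxU64LE(raw, 4) == 0x12345678u);
    return 0;
}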
Example #18
    Status MMAPV1Engine::repairDatabase( OperationContext* txn,
                                         const std::string& dbName,
                                         bool preserveClonedFilesOnFailure,
                                         bool backupOriginalFiles ) {
        // We must hold some form of lock here
        invariant(txn->lockState()->threadState());
        invariant( dbName.find( '.' ) == string::npos );

        scoped_ptr<RepairFileDeleter> repairFileDeleter;
        doingRepair dr;

        log() << "repairDatabase " << dbName << endl;

        BackgroundOperation::assertNoBgOpInProgForDb(dbName);

        txn->recoveryUnit()->syncDataAndTruncateJournal(); // Must be done before and after repair

        intmax_t totalSize = dbSize( dbName );
        intmax_t freeSize = File::freeSpace(storageGlobalParams.repairpath);

        if ( freeSize > -1 && freeSize < totalSize ) {
            return Status( ErrorCodes::OutOfDiskSpace,
                           str::stream() << "Cannot repair database " << dbName
                           << " having size: " << totalSize
                           << " (bytes) because free disk space is: " << freeSize << " (bytes)" );
        }

        txn->checkForInterrupt();

        Path reservedPath =
            uniqueReservedPath( ( preserveClonedFilesOnFailure || backupOriginalFiles ) ?
                                "backup" : "_tmp" );
        MONGO_ASSERT_ON_EXCEPTION( boost::filesystem::create_directory( reservedPath ) );
        string reservedPathString = reservedPath.string();

        if ( !preserveClonedFilesOnFailure )
            repairFileDeleter.reset( new RepairFileDeleter( txn,
                                                            dbName,
                                                            reservedPathString,
                                                            reservedPath ) );

        {
            Database* originalDatabase =
                            dbHolder().get(txn, dbName);
            if (originalDatabase == NULL) {
                return Status(ErrorCodes::NamespaceNotFound, "database does not exist to repair");
            }

            scoped_ptr<MMAPV1DatabaseCatalogEntry> dbEntry;
            scoped_ptr<Database> tempDatabase;
            {
                dbEntry.reset( new MMAPV1DatabaseCatalogEntry( txn,
                                                               dbName,
                                                               reservedPathString,
                                                               storageGlobalParams.directoryperdb,
                                                               true ) );
                invariant( !dbEntry->exists() );
                tempDatabase.reset( new Database( txn,
                                                  dbName,
                                                  dbEntry.get() ) );

            }

            map<string,CollectionOptions> namespacesToCopy;
            {
                string ns = dbName + ".system.namespaces";
                Client::Context ctx(txn,  ns );
                Collection* coll = originalDatabase->getCollection( txn, ns );
                if ( coll ) {
                    scoped_ptr<RecordIterator> it( coll->getIterator( txn,
                                                                      DiskLoc(),
                                                                      false,
                                                                      CollectionScanParams::FORWARD ) );
                    while ( !it->isEOF() ) {
                        DiskLoc loc = it->getNext();
                        BSONObj obj = coll->docFor( loc );

                        string ns = obj["name"].String();

                        NamespaceString nss( ns );
                        if ( nss.isSystem() ) {
                            if ( nss.isSystemDotIndexes() )
                                continue;
                            if ( nss.coll() == "system.namespaces" )
                                continue;
                        }

                        if ( !nss.isNormal() )
                            continue;

                        CollectionOptions options;
                        if ( obj["options"].isABSONObj() ) {
                            Status status = options.parse( obj["options"].Obj() );
                            if ( !status.isOK() )
                                return status;
                        }
                        namespacesToCopy[ns] = options;
                    }
                }
            }

            for ( map<string,CollectionOptions>::const_iterator i = namespacesToCopy.begin();
                  i != namespacesToCopy.end();
                  ++i ) {
                string ns = i->first;
                CollectionOptions options = i->second;

                Collection* tempCollection = NULL;
                {
                    Client::Context tempContext(txn, ns, tempDatabase );
                    tempCollection = tempDatabase->createCollection( txn, ns, options, true, false );
                }

                Client::Context readContext(txn, ns, originalDatabase);
                Collection* originalCollection = originalDatabase->getCollection( txn, ns );
                invariant( originalCollection );

                // data

                MultiIndexBlock indexBlock(txn, tempCollection );
                {
                    vector<BSONObj> indexes;
                    IndexCatalog::IndexIterator ii =
                        originalCollection->getIndexCatalog()->getIndexIterator( false );
                    while ( ii.more() ) {
                        IndexDescriptor* desc = ii.next();
                        indexes.push_back( desc->infoObj() );
                    }

                    Client::Context tempContext(txn, ns, tempDatabase);
                    Status status = indexBlock.init( indexes );
                    if ( !status.isOK() )
                        return status;

                }

                scoped_ptr<RecordIterator> iterator(
                    originalCollection->getIterator( txn, DiskLoc(), false,
                                                     CollectionScanParams::FORWARD ));
                while ( !iterator->isEOF() ) {
                    DiskLoc loc = iterator->getNext();
                    invariant( !loc.isNull() );

                    BSONObj doc = originalCollection->docFor( loc );

                    Client::Context tempContext(txn, ns, tempDatabase);
                    StatusWith<DiskLoc> result = tempCollection->insertDocument( txn, doc, indexBlock );
                    if ( !result.isOK() )
                        return result.getStatus();

                    txn->recoveryUnit()->commitIfNeeded();
                    txn->checkForInterrupt(false);
                }

                {
                    Client::Context tempContext(txn, ns, tempDatabase);
                    Status status = indexBlock.commit();
                    if ( !status.isOK() )
                        return status;
                }

            }

            txn->recoveryUnit()->syncDataAndTruncateJournal();
            globalStorageEngine->flushAllFiles(true); // need both in case journaling is disabled

            txn->checkForInterrupt(false);
        }

        // at this point if we abort, we don't want to delete new files
        // as they might be the only copies

        if ( repairFileDeleter.get() )
            repairFileDeleter->success();

        dbHolder().close( txn, dbName );

        if ( backupOriginalFiles ) {
            _renameForBackup( dbName, reservedPath );
        }
        else {
            // first make new directory before deleting data
            Path newDir = Path(storageGlobalParams.dbpath) / dbName;
            MONGO_ASSERT_ON_EXCEPTION(boost::filesystem::create_directory(newDir));

            // this deletes old files
            _deleteDataFiles( dbName );

            if ( !boost::filesystem::exists(newDir) ) {
                // we deleted because of directoryperdb
                // re-create
                MONGO_ASSERT_ON_EXCEPTION(boost::filesystem::create_directory(newDir));
            }
        }

        _replaceWithRecovered( dbName, reservedPathString.c_str() );

        if ( !backupOriginalFiles )
            MONGO_ASSERT_ON_EXCEPTION( boost::filesystem::remove_all( reservedPath ) );

        return Status::OK();
    }
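
repairDatabase refuses to start when the repair path has less free space than the database occupies. A minimal sketch of the same guard using std::filesystem in place of File::freeSpace; dbSizeBytes stands in for the dbSize() call, and canRepair is a hypothetical helper.

#include <cstdint>
#include <filesystem>
#include <iostream>
#include <string>

// Sketch of the repair pre-check: fail fast if the repair path cannot
// hold a full copy of the database. dbSizeBytes is assumed given.
bool canRepair(const std::string& repairPath, std::uintmax_t dbSizeBytes,
               std::string* errmsg) {
    std::error_code ec;
    const std::filesystem::space_info si = std::filesystem::space(repairPath, ec);
    if (ec) {
        *errmsg = "cannot stat repair path: " + ec.message();
        return false;
    }
    if (si.available < dbSizeBytes) {
        *errmsg = "free disk space " + std::to_string(si.available) +
                  " (bytes) is less than database size " +
                  std::to_string(dbSizeBytes) + " (bytes)";
        return false;
    }
    return true;
}

int main() {
    std::string err;
    if (!canRepair(".", 1024, &err))
        std::cerr << err << '\n';
    return 0;
}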
    Status MMAPV1DatabaseCatalogEntry::renameCollection( OperationContext* txn,
                                                        const StringData& fromNS,
                                                        const StringData& toNS,
                                                        bool stayTemp ) {
        Status s = _renameSingleNamespace( txn, fromNS, toNS, stayTemp );
        if ( !s.isOK() )
            return s;

        NamespaceDetails* details = _namespaceIndex.details( toNS );
        invariant( details );

        RecordStoreV1Base* systemIndexRecordStore = _getIndexRecordStore( txn );
        scoped_ptr<RecordIterator> it( systemIndexRecordStore->getIterator() );

        while ( !it->isEOF() ) {
            DiskLoc loc = it->getNext();
            const Record* rec = it->recordFor( loc );
            BSONObj oldIndexSpec( rec->data() );
            if ( fromNS != oldIndexSpec["ns"].valuestrsafe() )
                continue;

            BSONObj newIndexSpec;
            {
                BSONObjBuilder b;
                BSONObjIterator i( oldIndexSpec );
                while( i.more() ) {
                    BSONElement e = i.next();
                    if ( strcmp( e.fieldName(), "ns" ) != 0 )
                        b.append( e );
                    else
                        b << "ns" << toNS;
                }
                newIndexSpec = b.obj();
            }

            StatusWith<DiskLoc> newIndexSpecLoc =
                systemIndexRecordStore->insertRecord( txn,
                                                      newIndexSpec.objdata(),
                                                      newIndexSpec.objsize(),
                                                      -1 );
            if ( !newIndexSpecLoc.isOK() )
                return newIndexSpecLoc.getStatus();

            const string& indexName = oldIndexSpec.getStringField( "name" );

            {
                // fix IndexDetails pointer
                NamespaceDetailsCollectionCatalogEntry ce( toNS, details,
                                                           _getIndexRecordStore( txn ), this );
                int indexI = ce._findIndexNumber( indexName );

                IndexDetails& indexDetails = details->idx(indexI);
                *txn->recoveryUnit()->writing(&indexDetails.info) = newIndexSpecLoc.getValue(); // XXX: dur
            }

            {
                // move underlying namespace
                string oldIndexNs = IndexDescriptor::makeIndexNamespace( fromNS, indexName );
                string newIndexNs = IndexDescriptor::makeIndexNamespace( toNS, indexName );

                Status s = _renameSingleNamespace( txn, oldIndexNs, newIndexNs, false );
                if ( !s.isOK() )
                    return s;
            }

            systemIndexRecordStore->deleteRecord( txn, loc );
        }

        return Status::OK();
    }
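
renameCollection rewrites each matching index spec field by field, copying everything except ns and substituting the target namespace. The same rebuild, sketched over a plain vector of key/value pairs rather than BSON; Spec and rewriteNs are illustrative names.

#include <cassert>
#include <string>
#include <utility>
#include <vector>

using Spec = std::vector<std::pair<std::string, std::string>>;

// Copy every field of the old index spec, replacing only "ns" with the
// target namespace -- the same loop the BSONObjBuilder above performs.
Spec rewriteNs(const Spec& oldSpec, const std::string& toNS) {
    Spec newSpec;
    for (const auto& field : oldSpec) {
        if (field.first != "ns")
            newSpec.push_back(field);
        else
            newSpec.emplace_back("ns", toNS);
    }
    return newSpec;
}

int main() {
    Spec old = {{"name", "a_1"}, {"ns", "db.from"}, {"key", "{a:1}"}};
    Spec renamed = rewriteNs(old, "db.to");
    assert(renamed[1].second == "db.to");
    return 0;
}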
Example #20
    Status WriteCmd::explain(OperationContext* txn,
                             const std::string& dbname,
                             const BSONObj& cmdObj,
                             ExplainCommon::Verbosity verbosity,
                             BSONObjBuilder* out) const {
        // For now we only explain update and delete write commands.
        if ( BatchedCommandRequest::BatchType_Update != _writeType &&
             BatchedCommandRequest::BatchType_Delete != _writeType ) {
            return Status( ErrorCodes::IllegalOperation,
                           "Only update and delete write ops can be explained" );
        }

        // Parse the batch request.
        BatchedCommandRequest request( _writeType );
        std::string errMsg;
        if ( !request.parseBSON( cmdObj, &errMsg ) || !request.isValid( &errMsg ) ) {
            return Status( ErrorCodes::FailedToParse, errMsg );
        }

        // Note that this is a runCommand, and therefore the database and the collection name
        // are in different parts of the grammar for the command. But it's more convenient to
        // work with a NamespaceString. We build it here and replace it in the parsed command.
        // Internally, everything works with the namespace string as opposed to just the
        // collection name.
        NamespaceString nsString(dbname, request.getNS());
        request.setNS(nsString.ns());

        // Do the validation of the batch that is shared with non-explained write batches.
        Status isValid = WriteBatchExecutor::validateBatch( request );
        if (!isValid.isOK()) {
            return isValid;
        }

        // Explain must do one additional piece of validation: For now we only explain
        // singleton batches.
        if ( request.sizeWriteOps() != 1u ) {
            return Status( ErrorCodes::InvalidLength,
                           "explained write batches must be of size 1" );
        }

        // Get a reference to the singleton batch item (it's the 0th item in the batch).
        BatchItemRef batchItem( &request, 0 );

        if ( BatchedCommandRequest::BatchType_Update == _writeType ) {
            // Create the update request.
            UpdateRequest updateRequest( txn, nsString );
            updateRequest.setQuery( batchItem.getUpdate()->getQuery() );
            updateRequest.setUpdates( batchItem.getUpdate()->getUpdateExpr() );
            updateRequest.setMulti( batchItem.getUpdate()->getMulti() );
            updateRequest.setUpsert( batchItem.getUpdate()->getUpsert() );
            updateRequest.setUpdateOpLog( true );
            UpdateLifecycleImpl updateLifecycle( true, updateRequest.getNamespaceString() );
            updateRequest.setLifecycle( &updateLifecycle );
            updateRequest.setExplain();

            // Explained updates can yield.
            updateRequest.setYieldPolicy(PlanExecutor::YIELD_AUTO);

            // Use the request to create an UpdateExecutor, and from it extract the
            // plan tree which will be used to execute this update.
            UpdateExecutor updateExecutor( &updateRequest, &txn->getCurOp()->debug() );
            Status prepStatus = updateExecutor.prepare();
            if ( !prepStatus.isOK() ) {
                return prepStatus;
            }

            // Explains of write commands are read-only, but we take an exclusive lock so
            // that timing info is more accurate.
            Lock::DBLock dlk(txn->lockState(), nsString.db(), MODE_IX);
            Client::Context ctx(txn, nsString);

            Status prepInLockStatus = updateExecutor.prepareInLock(ctx.db());
            if ( !prepInLockStatus.isOK() ) {
                return prepInLockStatus;
            }

            // Executor registration and yield policy is handled internally by the update executor.
            PlanExecutor* exec = updateExecutor.getPlanExecutor();

            // Explain the plan tree.
            return Explain::explainStages( exec, verbosity, out );
        }
        else {
            invariant( BatchedCommandRequest::BatchType_Delete == _writeType );

            // Create the delete request.
            DeleteRequest deleteRequest( txn, nsString );
            deleteRequest.setQuery( batchItem.getDelete()->getQuery() );
            deleteRequest.setMulti( batchItem.getDelete()->getLimit() != 1 );
            deleteRequest.setUpdateOpLog(true);
            deleteRequest.setGod( false );
            deleteRequest.setExplain();

            // Explained deletes can yield.
            deleteRequest.setYieldPolicy(PlanExecutor::YIELD_AUTO);

            // Use the request to create a DeleteExecutor, and from it extract the
            // plan tree which will be used to execute this delete.
            DeleteExecutor deleteExecutor( &deleteRequest );
            Status prepStatus = deleteExecutor.prepare();
            if ( !prepStatus.isOK() ) {
                return prepStatus;
            }

            // Explains of write commands are read-only, but we take a write lock so that timing
            // info is more accurate.
            Lock::DBLock dlk(txn->lockState(), nsString.db(), MODE_IX);
            Client::Context ctx(txn, nsString);

            Status prepInLockStatus = deleteExecutor.prepareInLock(ctx.db());
            if ( !prepInLockStatus.isOK()) {
                return prepInLockStatus;
            }

            // Executor registration and yield policy is handled internally by the delete executor.
            PlanExecutor* exec = deleteExecutor.getPlanExecutor();

            // Explain the plan tree.
            return Explain::explainStages( exec, verbosity, out );
        }
    }
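
The explain path layers two extra checks on top of the shared batch validation: only update and delete batches qualify, and only batches containing exactly one write op. A tiny sketch of that Status-style gatekeeping, using toy BatchType and Status types rather than the real ones.

#include <cstddef>
#include <iostream>
#include <string>

// Hypothetical miniatures of the types used above.
enum class BatchType { Insert, Update, Delete };

struct Status {
    bool ok;
    std::string reason;
    static Status OK() { return {true, ""}; }
};

Status validateExplainableBatch(BatchType type, size_t numWriteOps) {
    if (type != BatchType::Update && type != BatchType::Delete)
        return {false, "Only update and delete write ops can be explained"};
    if (numWriteOps != 1)
        return {false, "explained write batches must be of size 1"};
    return Status::OK();
}

int main() {
    Status s = validateExplainableBatch(BatchType::Insert, 1);
    if (!s.ok)
        std::cout << s.reason << '\n';
    return 0;
}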
Example #21
    DiskLoc DataFileMgr::insert(const char* ns,
                                const void* obuf,
                                int32_t len,
                                bool mayInterrupt,
                                bool god,
                                bool mayAddIndex,
                                bool* addedID) {

        Database* database = cc().database();

        bool wouldAddIndex = false;
        massert( 10093 , "cannot insert into reserved $ collection", god || NamespaceString::normal( ns ) );
        uassert( 10094 , str::stream() << "invalid ns: " << ns , isValidNS( ns ) );
        {
            const char *sys = strstr(ns, "system.");
            if ( sys ) {

                if ( !insert_checkSys(sys, ns, wouldAddIndex, obuf, god) )
                    return DiskLoc();

                if ( mayAddIndex && wouldAddIndex ) {
                    // TODO: this should be handled above this function
                    BSONObj spec( static_cast<const char*>( obuf ) );
                    string collectionToIndex = spec.getStringField( "ns" );
                    uassert(10096, "invalid ns to index", collectionToIndex.find( '.' ) != string::npos);
                    massert(10097,
                            str::stream() << "trying to create index on wrong db "
                            << " db: " << database->name() << " collection: " << collectionToIndex,
                            database->ownsNS( collectionToIndex ) );

                    Collection* collection = database->getCollection( collectionToIndex );
                    if ( !collection ) {
                        collection = database->createCollection( collectionToIndex, false, NULL, true );
                        verify( collection );
                        if ( !god )
                            ensureIdIndexForNewNs( collection );
                    }

                    Status status = collection->getIndexCatalog()->createIndex( spec, mayInterrupt );
                    if ( status.code() == ErrorCodes::IndexAlreadyExists )
                        return DiskLoc();
                    uassertStatusOK( status );
                    return DiskLoc();
                }
            }
        }

        Collection* collection = database->getCollection( ns );
        if ( collection == NULL ) {
            collection = database->createCollection( ns, false, NULL, false );

            int ies = Extent::initialSize(len);
            if( str::contains(ns, '$') &&
                len + Record::HeaderSize >= BtreeData_V1::BucketSize - 256 &&
                len + Record::HeaderSize <= BtreeData_V1::BucketSize + 256 ) {
                // probably an index.  so we pick a value here for the first extent instead of using
                // initialExtentSize() which is more for user collections.
                // TODO: we could look at the # of records in the parent collection to be smarter here.
                ies = (32+4) * 1024;
            }
            collection->increaseStorageSize( ies, false);
            if ( !god )
                ensureIdIndexForNewNs( collection );
        }

        NamespaceDetails* d = collection->details();

        IDToInsert idToInsert; // only initialized if needed

        if( !god ) {
            /* Check if we have an _id field. If we don't, we'll add it.
               Note that btree buckets which we insert aren't BSONObj's, but in that case god==true.
            */
            BSONObj io((const char *) obuf);
            BSONElement idField = io.getField( "_id" );
            uassert( 10099 ,  "_id cannot be an array", idField.type() != Array );
            // we don't add _id for capped collections in local as they don't have an _id index
            if( idField.eoo() &&
                !wouldAddIndex &&
                nsToDatabase( ns ) != "local" &&
                d->haveIdIndex() ) {

                if( addedID )
                    *addedID = true;

                idToInsert.init();
                len += idToInsert.size();
            }

            BSONElementManipulator::lookForTimestamps( io );
        }

        int lenWHdr = d->getRecordAllocationSize( len + Record::HeaderSize );
        fassert( 16440, lenWHdr >= ( len + Record::HeaderSize ) );

        // If the collection is capped, check if the new object will violate a unique index
        // constraint before allocating space.
        if ( d->isCapped() && !god) {
            BSONObj temp = BSONObj( reinterpret_cast<const char *>( obuf ) );
            Status ret = collection->getIndexCatalog()->checkNoIndexConflicts( temp );
            uassert(12582, "duplicate key insert for unique index of capped collection", ret.isOK() );
        }

        DiskLoc loc = allocateSpaceForANewRecord(ns, d, lenWHdr, god);

        if ( loc.isNull() ) {
            string errmsg = str::stream() << "insert: couldn't alloc space for object ns:" << ns
                                          << " capped:" << d->isCapped();
            log() << errmsg;
            uasserted( 17248, errmsg );
        }

        Record *r = loc.rec();
        {
            verify( r->lengthWithHeaders() >= lenWHdr );
            r = (Record*) getDur().writingPtr(r, lenWHdr);
            if( idToInsert.needed() ) {
                /* a little effort was made here to avoid a double copy when we add an ID */
                int originalSize = *((int*) obuf);
                ((int&)*r->data()) = originalSize + idToInsert.size();
                memcpy(r->data()+4, idToInsert.rawdata(), idToInsert.size());
                memcpy(r->data()+4+idToInsert.size(), ((char*)obuf)+4, originalSize-4);
            }
            else {
                if( obuf ) // obuf can be null from internal callers
                    memcpy(r->data(), obuf, len);
            }
        }

        addRecordToRecListInExtent(r, loc);

        d->incrementStats( r->netLength(), 1 );

        // we don't bother resetting query optimizer stats for the god tables - also god is true when adding a btree bucket
        if ( !god )
            collection->infoCache()->notifyOfWriteOp();

        /* add this record to our indexes */
        if ( d->getTotalIndexCount() > 0 ) {
            try {
                BSONObj obj(r->data());
                collection->getIndexCatalog()->indexRecord(obj, loc);
            }
            catch( AssertionException& e ) {
                // should be a dup key error on _id index
                if( d->isCapped() ) {
                    massert( 12583, "unexpected index insertion failure on capped collection", !d->isCapped() );
                    string s = e.toString();
                    s += " : on addIndex/capped - collection and its index will not match";
                    setLastError(0, s.c_str());
                    error() << s << endl;
                }
                else {
                    // normal case -- we can roll back
                    _deleteRecord(d, ns, r, loc);
                    throw;
                }
            }
        }

        d->paddingFits();

        return loc;
    }
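
The idToInsert branch above avoids re-serializing the document: it writes the enlarged length word, copies the injected _id bytes immediately after it, then appends the remainder of the original buffer. A standalone sketch of that splice over a hand-rolled length-prefixed buffer (not real BSON; spliceAfterLength is an illustrative name).

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

// Splice extra bytes into a length-prefixed buffer without re-serializing:
// new length word, then the injected field, then the original payload.
// Mirrors the memcpy dance in DataFileMgr::insert above.
std::vector<char> spliceAfterLength(const char* obuf,
                                    const char* inject, int injectSize) {
    int32_t originalSize;
    std::memcpy(&originalSize, obuf, 4);

    std::vector<char> out(originalSize + injectSize);
    const int32_t newSize = originalSize + injectSize;
    std::memcpy(out.data(), &newSize, 4);                  // new length
    std::memcpy(out.data() + 4, inject, injectSize);       // injected field
    std::memcpy(out.data() + 4 + injectSize, obuf + 4,     // rest of doc
                originalSize - 4);
    return out;
}

int main() {
    // A fake 8-byte "document": a 4-byte length followed by 4 payload bytes.
    char doc[8];
    int32_t len = 8;
    std::memcpy(doc, &len, 4);
    std::memcpy(doc + 4, "abcd", 4);

    std::vector<char> spliced = spliceAfterLength(doc, "_id!", 4);
    assert(spliced.size() == 12);
    assert(std::memcmp(spliced.data() + 4, "_id!", 4) == 0);
    assert(std::memcmp(spliced.data() + 8, "abcd", 4) == 0);
    return 0;
}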
Example #22
void CursorCache::gotKillCursors(Message& m) {
    LastError::get(cc()).disable();
    DbMessage dbmessage(m);
    int n = dbmessage.pullInt();

    if (n > 2000) {
        (n < 30000 ? warning() : error()) << "receivedKillCursors, n=" << n << endl;
    }

    uassert(13286, "sent 0 cursors to kill", n >= 1);
    uassert(13287, "too many cursors to kill", n < 30000);
    massert(18632,
            str::stream() << "bad kill cursors size: " << m.dataSize(),
            m.dataSize() == 8 + (8 * n));


    ConstDataCursor cursors(dbmessage.getArray(n));

    ClientBasic* client = ClientBasic::getCurrent();
    AuthorizationSession* authSession = AuthorizationSession::get(client);
    for (int i = 0; i < n; i++) {
        long long id = cursors.readAndAdvance<LittleEndian<int64_t>>();
        LOG(_myLogLevel) << "CursorCache::gotKillCursors id: " << id << endl;

        if (!id) {
            warning() << " got cursor id of 0 to kill" << endl;
            continue;
        }

        string server;
        {
            stdx::lock_guard<stdx::mutex> lk(_mutex);

            MapSharded::iterator i = _cursors.find(id);
            if (i != _cursors.end()) {
                Status authorizationStatus =
                    authSession->checkAuthForKillCursors(NamespaceString(i->second->getNS()), id);
                audit::logKillCursorsAuthzCheck(
                    client,
                    NamespaceString(i->second->getNS()),
                    id,
                    authorizationStatus.isOK() ? ErrorCodes::OK : ErrorCodes::Unauthorized);
                if (authorizationStatus.isOK()) {
                    _cursorsMaxTimeMS.erase(i->second->getId());
                    _cursors.erase(i);
                }
                continue;
            }

            MapNormal::iterator refsIt = _refs.find(id);
            MapNormal::iterator refsNSIt = _refsNS.find(id);
            if (refsIt == _refs.end()) {
                warning() << "can't find cursor: " << id << endl;
                continue;
            }
            verify(refsNSIt != _refsNS.end());
            Status authorizationStatus =
                authSession->checkAuthForKillCursors(NamespaceString(refsNSIt->second), id);
            audit::logKillCursorsAuthzCheck(client,
                                            NamespaceString(refsNSIt->second),
                                            id,
                                            authorizationStatus.isOK() ? ErrorCodes::OK
                                                                       : ErrorCodes::Unauthorized);
            if (!authorizationStatus.isOK()) {
                continue;
            }
            server = refsIt->second;
            _refs.erase(refsIt);
            _refsNS.erase(refsNSIt);
            cursorStatsSingleTarget.decrement();
        }

        LOG(_myLogLevel) << "CursorCache::found gotKillCursors id: " << id << " server: " << server
                         << endl;

        verify(server.size());
        ScopedDbConnection conn(server);
        conn->killCursor(id);
        conn.done();
    }
}
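
The massert above pins down the legacy OP_KILL_CURSORS body: two int32s (a reserved zero and the cursor count) followed by n int64 cursor ids, hence dataSize == 8 + 8 * n. A small sketch of that size check in isolation; killCursorsSizeOk is a hypothetical helper.

#include <cassert>
#include <cstdint>

// The legacy OP_KILL_CURSORS body is two int32s (reserved zero, cursor
// count) followed by n little-endian int64 cursor ids, hence the
// dataSize == 8 + 8 * n check above.
bool killCursorsSizeOk(int32_t n, int64_t dataSize) {
    if (n < 1 || n >= 30000)
        return false;
    return dataSize == 8 + 8 * static_cast<int64_t>(n);
}

int main() {
    assert(killCursorsSizeOk(3, 8 + 24));
    assert(!killCursorsSizeOk(3, 8 + 23));
    assert(!killCursorsSizeOk(0, 8));
    return 0;
}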
Example #23
Status ChunkMoveOperationState::commitMigration(const MigrationSessionId& sessionId) {
    invariant(_distLockStatus.is_initialized());
    invariant(_distLockStatus->isOK());

    log() << "About to enter migrate critical section";

    // We're under the collection distributed lock here, so no other migrate can change maxVersion
    // or CollectionMetadata state.
    ShardingState* const shardingState = ShardingState::get(_txn);

    Status startStatus = ShardingStateRecovery::startMetadataOp(_txn);
    if (!startStatus.isOK()) {
        warning() << "Failed to write sharding state recovery document" << causedBy(startStatus);
        return startStatus;
    }

    shardingState->migrationSourceManager()->setInCriticalSection(true);

    const ChunkVersion originalCollVersion = getCollMetadata()->getCollVersion();

    ChunkVersion myVersion = originalCollVersion;
    myVersion.incMajor();

    {
        ScopedTransaction transaction(_txn, MODE_IX);
        Lock::DBLock lk(_txn->lockState(), _nss.db(), MODE_IX);
        Lock::CollectionLock collLock(_txn->lockState(), _nss.ns(), MODE_X);

        invariant(myVersion > shardingState->getVersion(_nss.ns()));

        // Bump the metadata's version up and "forget" about the chunk being moved. This is
        // not the commit point, but in practice the state in this shard won't change until
        // the commit is done.
        shardingState->donateChunk(_txn, _nss.ns(), _minKey, _maxKey, myVersion);
    }

    log() << "moveChunk setting version to: " << myVersion << migrateLog;

    // We're under the collection lock here, too, so we can undo the chunk donation because
    // no other state change could be ongoing
    BSONObj res;
    Status recvChunkCommitStatus{ErrorCodes::InternalError, "status not set"};

    try {
        ScopedDbConnection connTo(_toShardCS, 35.0);
        connTo->runCommand("admin", createRecvChunkCommitRequest(sessionId), res);
        connTo.done();
        recvChunkCommitStatus = getStatusFromCommandResult(res);
    } catch (const DBException& e) {
        const string msg = stream() << "moveChunk could not contact shard " << _toShard
                                    << " to commit transfer" << causedBy(e);
        warning() << msg;
        recvChunkCommitStatus = Status(e.toStatus().code(), msg);
    }

    if (MONGO_FAIL_POINT(failMigrationCommit) && recvChunkCommitStatus.isOK()) {
        recvChunkCommitStatus =
            Status(ErrorCodes::InternalError, "Failing _recvChunkCommit due to failpoint.");
    }

    if (!recvChunkCommitStatus.isOK()) {
        log() << "moveChunk migrate commit not accepted by TO-shard: " << res
              << " resetting shard version to: " << getShardVersion() << migrateLog;

        {
            ScopedTransaction transaction(_txn, MODE_IX);
            Lock::DBLock dbLock(_txn->lockState(), _nss.db(), MODE_IX);
            Lock::CollectionLock collLock(_txn->lockState(), _nss.ns(), MODE_X);

            log() << "moveChunk collection lock acquired to reset shard version from "
                     "failed migration";

            // Revert the chunk manager back to the state before "forgetting" about the chunk
            shardingState->undoDonateChunk(_txn, _nss.ns(), getCollMetadata());
        }

        log() << "Shard version successfully reset to clean up failed migration";

        return Status(recvChunkCommitStatus.code(),
                      stream() << "_recvChunkCommit failed: " << causedBy(recvChunkCommitStatus));
    }

    log() << "moveChunk migrate commit accepted by TO-shard: " << res << migrateLog;

    BSONArrayBuilder updates;

    {
        // Update for the chunk being moved
        BSONObjBuilder op;
        op.append("op", "u");
        op.appendBool("b", false);  // No upserting
        op.append("ns", ChunkType::ConfigNS);

        BSONObjBuilder n(op.subobjStart("o"));
        n.append(ChunkType::name(), Chunk::genID(_nss.ns(), _minKey));
        myVersion.addToBSON(n, ChunkType::DEPRECATED_lastmod());
        n.append(ChunkType::ns(), _nss.ns());
        n.append(ChunkType::min(), _minKey);
        n.append(ChunkType::max(), _maxKey);
        n.append(ChunkType::shard(), _toShard);
        n.done();

        BSONObjBuilder q(op.subobjStart("o2"));
        q.append(ChunkType::name(), Chunk::genID(_nss.ns(), _minKey));
        q.done();

        updates.append(op.obj());
    }

    // Version at which the next highest lastmod will be set. If the chunk being moved is the
    // last in the shard, nextVersion is that chunk's lastmod; otherwise the highest version
    // comes from the chunk being bumped on the FROM-shard.
    ChunkVersion nextVersion = myVersion;

    // If we have chunks left on the FROM shard, update the version of one of them as well. We can
    // figure that out by grabbing the metadata as it has been changed.
    const std::shared_ptr<CollectionMetadata> bumpedCollMetadata(
        shardingState->getCollectionMetadata(_nss.ns()));
    if (bumpedCollMetadata->getNumChunks() > 0) {
        // get another chunk on that shard
        ChunkType bumpChunk;
        invariant(bumpedCollMetadata->getNextChunk(bumpedCollMetadata->getMinKey(), &bumpChunk));

        BSONObj bumpMin = bumpChunk.getMin();
        BSONObj bumpMax = bumpChunk.getMax();

        dassert(bumpMin.woCompare(_minKey) != 0);

        BSONObjBuilder op;
        op.append("op", "u");
        op.appendBool("b", false);
        op.append("ns", ChunkType::ConfigNS);

        nextVersion.incMinor();  // same as used on donateChunk

        BSONObjBuilder n(op.subobjStart("o"));
        n.append(ChunkType::name(), Chunk::genID(_nss.ns(), bumpMin));
        nextVersion.addToBSON(n, ChunkType::DEPRECATED_lastmod());
        n.append(ChunkType::ns(), _nss.ns());
        n.append(ChunkType::min(), bumpMin);
        n.append(ChunkType::max(), bumpMax);
        n.append(ChunkType::shard(), _fromShard);
        n.done();

        BSONObjBuilder q(op.subobjStart("o2"));
        q.append(ChunkType::name(), Chunk::genID(_nss.ns(), bumpMin));
        q.done();

        updates.append(op.obj());

        log() << "moveChunk updating self version to: " << nextVersion << " through " << bumpMin
              << " -> " << bumpMax << " for collection '" << _nss.ns() << "'" << migrateLog;
    } else {
        log() << "moveChunk moved last chunk out for collection '" << _nss.ns() << "'"
              << migrateLog;
    }

    BSONArrayBuilder preCond;
    {
        BSONObjBuilder b;
        b.append("ns", ChunkType::ConfigNS);
        b.append("q",
                 BSON("query" << BSON(ChunkType::ns(_nss.ns())) << "orderby"
                              << BSON(ChunkType::DEPRECATED_lastmod() << -1)));
        {
            BSONObjBuilder bb(b.subobjStart("res"));

            // TODO: For backwards compatibility, we can't yet require an epoch here
            bb.appendTimestamp(ChunkType::DEPRECATED_lastmod(), originalCollVersion.toLong());
            bb.done();
        }

        preCond.append(b.obj());
    }

    Status applyOpsStatus{Status::OK()};
    try {
        // For testing migration failures
        if (MONGO_FAIL_POINT(failMigrationConfigWritePrepare)) {
            throw DBException("mock migration failure before config write",
                              ErrorCodes::PrepareConfigsFailed);
        }

        applyOpsStatus =
            grid.catalogManager(_txn)->applyChunkOpsDeprecated(_txn, updates.arr(), preCond.arr());

        if (MONGO_FAIL_POINT(failMigrationApplyOps)) {
            throw SocketException(SocketException::RECV_ERROR,
                                  shardingState->getConfigServer(_txn).toString());
        }
    } catch (const DBException& ex) {
        applyOpsStatus = ex.toStatus();
    }

    if (applyOpsStatus == ErrorCodes::PrepareConfigsFailed) {
        // In the process of issuing the migrate commit, the SyncClusterConnection checks that
        // the config servers are reachable. If they are not, we are sure that the applyOps
        // command was not sent to any of the configs, so we can safely back out of the
        // migration here, by resetting the shard version that we bumped up to in the
        // donateChunk() call above.
        log() << "About to acquire moveChunk coll lock to reset shard version from "
              << "failed migration";

        {
            ScopedTransaction transaction(_txn, MODE_IX);
            Lock::DBLock dbLock(_txn->lockState(), _nss.db(), MODE_IX);
            Lock::CollectionLock collLock(_txn->lockState(), _nss.ns(), MODE_X);

            // Revert the metadata back to the state before "forgetting" about the chunk
            shardingState->undoDonateChunk(_txn, _nss.ns(), getCollMetadata());
        }

        log() << "Shard version successfully reset to clean up failed migration";

        const string msg = stream() << "Failed to send migrate commit to configs "
                                    << causedBy(applyOpsStatus);
        return Status(applyOpsStatus.code(), msg);
    } else if (!applyOpsStatus.isOK()) {
        // This could be a blip in the connectivity. Wait out a few seconds and check if the
        // commit request made it.
        //
        // If the commit made it to the config, we'll see the chunk in the new shard and
        // there's no further action to be done.
        //
        // If the commit did not make it, currently the only way to fix this state is to
        // bounce the mongod so that the old state (before migrating) is brought in.

        warning() << "moveChunk commit failed and metadata will be revalidated"
                  << causedBy(applyOpsStatus) << migrateLog;
        sleepsecs(10);

        // Look for the chunk in this shard whose version got bumped. We assume that if that
        // mod made it to the config server, then applyOps was successful.
        try {
            std::vector<ChunkType> newestChunk;
            Status status =
                grid.catalogManager(_txn)->getChunks(_txn,
                                                     BSON(ChunkType::ns(_nss.ns())),
                                                     BSON(ChunkType::DEPRECATED_lastmod() << -1),
                                                     1,
                                                     &newestChunk,
                                                     nullptr);
            uassertStatusOK(status);

            ChunkVersion checkVersion;
            if (!newestChunk.empty()) {
                invariant(newestChunk.size() == 1);
                checkVersion = newestChunk[0].getVersion();
            }

            if (checkVersion.equals(nextVersion)) {
                log() << "moveChunk commit confirmed" << migrateLog;
            } else {
                error() << "moveChunk commit failed: version is at " << checkVersion
                        << " instead of " << nextVersion << migrateLog;
                error() << "TERMINATING" << migrateLog;

                dbexit(EXIT_SHARDING_ERROR);
            }
        } catch (...) {
            error() << "moveChunk failed to get confirmation of commit" << migrateLog;
            error() << "TERMINATING" << migrateLog;

            dbexit(EXIT_SHARDING_ERROR);
        }
    }

    MONGO_FAIL_POINT_PAUSE_WHILE_SET(hangBeforeLeavingCriticalSection);

    shardingState->migrationSourceManager()->setInCriticalSection(false);
    ShardingStateRecovery::endMetadataOp(_txn);

    // Migration is done, just log some diagnostics information
    BSONObj chunkInfo =
        BSON("min" << _minKey << "max" << _maxKey << "from" << _fromShard << "to" << _toShard);

    BSONObjBuilder commitInfo;
    commitInfo.appendElements(chunkInfo);
    if (res["counts"].type() == Object) {
        commitInfo.appendElements(res["counts"].Obj());
    }

    grid.catalogManager(_txn)->logChange(_txn, "moveChunk.commit", _nss.ns(), commitInfo.obj());

    shardingState->migrationSourceManager()->done(_txn);
    _isRunning = false;

    return Status::OK();
}
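
commitMigration derives the moved chunk's version by a major bump of the collection version (myVersion) and, when chunks remain on the donor, gives one remaining control chunk a further minor bump (nextVersion). A toy version type sketching that ordering, assuming a major bump resets the minor component as ChunkVersion::incMajor() does.

#include <cassert>
#include <cstdint>
#include <tuple>

// Toy version type mirroring the incMajor()/incMinor() calls above:
// a major bump resets minor, and ordering is lexicographic.
struct ToyChunkVersion {
    uint32_t major = 0;
    uint32_t minor = 0;

    void incMajor() { ++major; minor = 0; }
    void incMinor() { ++minor; }

    bool operator>(const ToyChunkVersion& o) const {
        return std::tie(major, minor) > std::tie(o.major, o.minor);
    }
};

int main() {
    ToyChunkVersion collVersion{3, 7};

    ToyChunkVersion myVersion = collVersion;
    myVersion.incMajor();                    // version for the moved chunk
    assert(myVersion > collVersion);

    ToyChunkVersion nextVersion = myVersion;
    nextVersion.incMinor();                  // control chunk on the donor
    assert(nextVersion > myVersion);
    return 0;
}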
Example #24
void BackgroundSync::_produce(
    OperationContext* txn,
    ReplicationCoordinatorExternalState* replicationCoordinatorExternalState) {
    // this oplog reader does not do a handshake because we don't want the server it's syncing
    // from to track how far it has synced
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        if (_lastOpTimeFetched.isNull()) {
            // then we are still in initial sync, waiting for this to be set
            lock.unlock();
            sleepsecs(1);
            // if there is no one to sync from
            return;
        }

        if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary() ||
            inShutdownStrict()) {
            return;
        }
    }

    while (MONGO_FAIL_POINT(rsBgSyncProduce)) {
        sleepmillis(0);
    }


    // find a target to sync from the last optime fetched
    OpTime lastOpTimeFetched;
    HostAndPort source;
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        lastOpTimeFetched = _lastOpTimeFetched;
        _syncSourceHost = HostAndPort();
    }
    SyncSourceResolverResponse syncSourceResp =
        _syncSourceResolver.findSyncSource(txn, lastOpTimeFetched);

    if (syncSourceResp.syncSourceStatus == ErrorCodes::OplogStartMissing) {
        // All (accessible) sync sources were too stale.
        error() << "too stale to catch up -- entering maintenance mode";
        log() << "Our newest OpTime : " << lastOpTimeFetched;
        log() << "Earliest OpTime available is " << syncSourceResp.earliestOpTimeSeen;
        log() << "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember";
        StorageInterface::get(txn)
            ->setMinValid(txn, {lastOpTimeFetched, syncSourceResp.earliestOpTimeSeen});
        auto status = _replCoord->setMaintenanceMode(true);
        if (!status.isOK()) {
            warning() << "Failed to transition into maintenance mode.";
        }
        bool worked = _replCoord->setFollowerMode(MemberState::RS_RECOVERING);
        if (!worked) {
            warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING)
                      << ". Current state: " << _replCoord->getMemberState();
        }
        return;
    } else if (syncSourceResp.isOK() && !syncSourceResp.getSyncSource().empty()) {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        _syncSourceHost = syncSourceResp.getSyncSource();
        source = _syncSourceHost;
    } else {
        if (!syncSourceResp.isOK()) {
            log() << "failed to find sync source, received error "
                  << syncSourceResp.syncSourceStatus.getStatus();
        }
        // No sync source found.
        sleepsecs(1);
        return;
    }

    long long lastHashFetched;
    {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        if (_stopped) {
            return;
        }
        lastOpTimeFetched = _lastOpTimeFetched;
        lastHashFetched = _lastFetchedHash;
        _replCoord->signalUpstreamUpdater();
    }

    // "lastFetched" not used. Already set in _enqueueDocuments.
    Status fetcherReturnStatus = Status::OK();
    DataReplicatorExternalStateBackgroundSync dataReplicatorExternalState(
        _replCoord, replicationCoordinatorExternalState, this);
    OplogFetcher* oplogFetcher;
    try {
        auto config = _replCoord->getConfig();
        auto onOplogFetcherShutdownCallbackFn =
            [&fetcherReturnStatus](const Status& status, const OpTimeWithHash& lastFetched) {
                fetcherReturnStatus = status;
            };

        stdx::lock_guard<stdx::mutex> lock(_mutex);
        _oplogFetcher =
            stdx::make_unique<OplogFetcher>(&_threadPoolTaskExecutor,
                                            OpTimeWithHash(lastHashFetched, lastOpTimeFetched),
                                            source,
                                            NamespaceString(rsOplogName),
                                            config,
                                            &dataReplicatorExternalState,
                                            stdx::bind(&BackgroundSync::_enqueueDocuments,
                                                       this,
                                                       stdx::placeholders::_1,
                                                       stdx::placeholders::_2,
                                                       stdx::placeholders::_3,
                                                       stdx::placeholders::_4),
                                            onOplogFetcherShutdownCallbackFn);
        oplogFetcher = _oplogFetcher.get();
    } catch (const mongo::DBException& ex) {
        fassertFailedWithStatus(34440, exceptionToStatus());
    }

    LOG(1) << "scheduling fetcher to read remote oplog on " << _syncSourceHost << " starting at "
           << oplogFetcher->getCommandObject_forTest()["filter"];
    auto scheduleStatus = oplogFetcher->startup();
    if (!scheduleStatus.isOK()) {
        warning() << "unable to schedule fetcher to read remote oplog on " << source << ": "
                  << scheduleStatus;
        return;
    }

    oplogFetcher->join();
    LOG(1) << "fetcher stopped reading remote oplog on " << source;

    // If the background sync is stopped after the fetcher is started, we need to
    // re-evaluate our sync source and oplog common point.
    if (isStopped()) {
        return;
    }

    if (fetcherReturnStatus.code() == ErrorCodes::OplogOutOfOrder) {
        // This is bad because it means that our source
        // has not returned oplog entries in ascending ts order, and they need to be.

        warning() << fetcherReturnStatus.toString();
        // Do not blacklist the server here, it will be blacklisted when we try to reuse it,
        // if it can't return a matching oplog start from the last fetched oplog ts field.
        return;
    } else if (fetcherReturnStatus.code() == ErrorCodes::OplogStartMissing ||
               fetcherReturnStatus.code() == ErrorCodes::RemoteOplogStale) {
        // Rollback is a synchronous operation that uses the task executor and may not be
        // executed inside the fetcher callback.
        const int messagingPortTags = 0;
        ConnectionPool connectionPool(messagingPortTags);
        std::unique_ptr<ConnectionPool::ConnectionPtr> connection;
        auto getConnection = [&connection, &connectionPool, source]() -> DBClientBase* {
            if (!connection.get()) {
                connection.reset(new ConnectionPool::ConnectionPtr(
                    &connectionPool, source, Date_t::now(), kOplogSocketTimeout));
            }
            return connection->get();
        };

        {
            stdx::lock_guard<stdx::mutex> lock(_mutex);
            lastOpTimeFetched = _lastOpTimeFetched;
        }

        log() << "Starting rollback due to " << fetcherReturnStatus;

        // Wait till all buffered oplog entries have drained and been applied.
        auto lastApplied = _replCoord->getMyLastAppliedOpTime();
        if (lastApplied != lastOpTimeFetched) {
            log() << "Waiting for all operations from " << lastApplied << " until "
                  << lastOpTimeFetched << " to be applied before starting rollback.";
            while (lastOpTimeFetched > (lastApplied = _replCoord->getMyLastAppliedOpTime())) {
                sleepmillis(10);
                if (isStopped() || inShutdown()) {
                    return;
                }
            }
        }
        // check that we are at minvalid, otherwise we cannot roll back as we may be in an
        // inconsistent state
        BatchBoundaries boundaries = StorageInterface::get(txn)->getMinValid(txn);
        if (!boundaries.start.isNull() || boundaries.end > lastApplied) {
            fassertNoTrace(18750,
                           Status(ErrorCodes::UnrecoverableRollbackError,
                                  str::stream()
                                      << "need to rollback, but in inconsistent state. "
                                      << "minvalid: " << boundaries.end.toString()
                                      << " > our last optime: " << lastApplied.toString()));
        }

        _rollback(txn, source, getConnection);
        stop();
    } else if (fetcherReturnStatus == ErrorCodes::InvalidBSON) {
        Seconds blacklistDuration(60);
        warning() << "Fetcher got invalid BSON while querying oplog. Blacklisting sync source "
                  << source << " for " << blacklistDuration << ".";
        _replCoord->blacklistSyncSource(source, Date_t::now() + blacklistDuration);
    } else if (!fetcherReturnStatus.isOK()) {
        warning() << "Fetcher error querying oplog: " << fetcherReturnStatus.toString();
    }
}
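// The getConnection lambda above is a lazy connection factory: no connection is
// opened unless the rollback path actually asks for one, and repeated calls
// reuse the first connection. A minimal, self-contained sketch of the same
// idiom in standard C++ (Resource and getResource are hypothetical stand-ins,
// not MongoDB types):
#include <iostream>
#include <memory>

struct Resource {
    Resource() { std::cout << "resource opened\n"; }
    int use() { return 42; }
};

int main() {
    std::unique_ptr<Resource> cached;
    auto getResource = [&cached]() -> Resource* {
        if (!cached) {
            cached.reset(new Resource());  // opened lazily, on first call only
        }
        return cached.get();
    };

    std::cout << "factory created, nothing opened yet\n";
    std::cout << getResource()->use() << "\n";  // opens the resource here
    std::cout << getResource()->use() << "\n";  // reuses the cached instance
    return 0;
}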
bool ShardingCatalogClientImpl::runUserManagementWriteCommand(OperationContext* opCtx,
                                                              const std::string& commandName,
                                                              const std::string& dbname,
                                                              const BSONObj& cmdObj,
                                                              BSONObjBuilder* result) {
    BSONObj cmdToRun = cmdObj;
    {
        // Make sure that if the command has a write concern, it is w:1 or w:majority, and
        // convert w:1 or a missing write concern to w:majority before sending.
        WriteConcernOptions writeConcern;
        writeConcern.reset();

        BSONElement writeConcernElement = cmdObj[WriteConcernOptions::kWriteConcernField];
        bool initialCmdHadWriteConcern = !writeConcernElement.eoo();
        if (initialCmdHadWriteConcern) {
            Status status = writeConcern.parse(writeConcernElement.Obj());
            if (!status.isOK()) {
                return CommandHelpers::appendCommandStatusNoThrow(*result, status);
            }

            if (!(writeConcern.wNumNodes == 1 ||
                  writeConcern.wMode == WriteConcernOptions::kMajority)) {
                return CommandHelpers::appendCommandStatusNoThrow(
                    *result,
                    {ErrorCodes::InvalidOptions,
                     str::stream() << "Invalid replication write concern. User management write "
                                      "commands may only use w:1 or w:'majority', got: "
                                   << writeConcern.toBSON()});
            }
        }

        writeConcern.wMode = WriteConcernOptions::kMajority;
        writeConcern.wNumNodes = 0;

        BSONObjBuilder modifiedCmd;
        if (!initialCmdHadWriteConcern) {
            modifiedCmd.appendElements(cmdObj);
        } else {
            BSONObjIterator cmdObjIter(cmdObj);
            while (cmdObjIter.more()) {
                BSONElement e = cmdObjIter.next();
                if (WriteConcernOptions::kWriteConcernField == e.fieldName()) {
                    continue;
                }
                modifiedCmd.append(e);
            }
        }
        modifiedCmd.append(WriteConcernOptions::kWriteConcernField, writeConcern.toBSON());
        cmdToRun = modifiedCmd.obj();
    }

    auto response =
        Grid::get(opCtx)->shardRegistry()->getConfigShard()->runCommandWithFixedRetryAttempts(
            opCtx,
            ReadPreferenceSetting{ReadPreference::PrimaryOnly},
            dbname,
            cmdToRun,
            Shard::kDefaultConfigCommandTimeout,
            Shard::RetryPolicy::kNotIdempotent);

    if (!response.isOK()) {
        return CommandHelpers::appendCommandStatusNoThrow(*result, response.getStatus());
    }
    if (!response.getValue().commandStatus.isOK()) {
        return CommandHelpers::appendCommandStatusNoThrow(*result,
                                                          response.getValue().commandStatus);
    }
    if (!response.getValue().writeConcernStatus.isOK()) {
        return CommandHelpers::appendCommandStatusNoThrow(*result,
                                                          response.getValue().writeConcernStatus);
    }

    CommandHelpers::filterCommandReplyForPassthrough(response.getValue().response, result);
    return true;
}
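// The function above rewrites the incoming command so it always reaches the
// config servers with w:'majority': any caller-supplied writeConcern element is
// validated, dropped while copying, and a majority write concern is appended at
// the end. A minimal sketch of that strip-and-reappend rewrite over a flat
// key/value list (plain standard C++, not the MongoDB BSON API;
// upgradeToMajority is a hypothetical name):
#include <iostream>
#include <string>
#include <utility>
#include <vector>

using Doc = std::vector<std::pair<std::string, std::string>>;
static const std::string kWriteConcernField = "writeConcern";

Doc upgradeToMajority(const Doc& cmd) {
    Doc rewritten;
    for (const auto& field : cmd) {
        if (field.first == kWriteConcernField) {
            continue;  // drop the caller's write concern (already validated)
        }
        rewritten.push_back(field);
    }
    // Always send the upgraded, majority write concern.
    rewritten.push_back({kWriteConcernField, "{ w: 'majority' }"});
    return rewritten;
}

int main() {
    Doc cmd = {{"createUser", "alice"}, {kWriteConcernField, "{ w: 1 }"}};
    for (const auto& field : upgradeToMajority(cmd)) {
        std::cout << field.first << " -> " << field.second << "\n";
    }
    return 0;
}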
Status enforceLegacyWriteConcern(MultiCommandDispatch* dispatcher,
                                 StringData dbName,
                                 const BSONObj& options,
                                 const HostOpTimeMap& hostOpTimes,
                                 vector<LegacyWCResponse>* legacyWCResponses) {
    if (hostOpTimes.empty()) {
        return Status::OK();
    }

    for (HostOpTimeMap::const_iterator it = hostOpTimes.begin(); it != hostOpTimes.end(); ++it) {
        const ConnectionString& shardEndpoint = it->first;
        const HostOpTime hot = it->second;
        const Timestamp& opTime = hot.opTime;
        const OID& electionId = hot.electionId;

        LOG(3) << "enforcing write concern " << options << " on " << shardEndpoint.toString()
               << " at opTime " << opTime.toStringPretty() << " with electionID " << electionId;

        BSONObj gleCmd = buildGLECmdWithOpTime(options, opTime, electionId);
        dispatcher->addCommand(shardEndpoint, dbName, gleCmd);
    }

    dispatcher->sendAll();

    vector<Status> failedStatuses;

    while (dispatcher->numPending() > 0) {
        ConnectionString shardEndpoint;
        RawBSONSerializable gleResponseSerial;

        Status dispatchStatus = dispatcher->recvAny(&shardEndpoint, &gleResponseSerial);
        if (!dispatchStatus.isOK()) {
            // We need to get all responses before returning
            failedStatuses.push_back(dispatchStatus);
            continue;
        }

        BSONObj gleResponse = stripNonWCInfo(gleResponseSerial.toBSON());

        // Use the downconversion tools to determine if this GLE response is ok, a
        // write concern error, or an unknown error we should immediately abort for.
        GLEErrors errors;
        Status extractStatus = extractGLEErrors(gleResponse, &errors);
        if (!extractStatus.isOK()) {
            failedStatuses.push_back(extractStatus);
            continue;
        }

        LegacyWCResponse wcResponse;
        wcResponse.shardHost = shardEndpoint.toString();
        wcResponse.gleResponse = gleResponse;
        if (errors.wcError.get()) {
            wcResponse.errToReport = errors.wcError->getErrMessage();
        }

        legacyWCResponses->push_back(wcResponse);
    }

    if (failedStatuses.empty()) {
        return Status::OK();
    }

    StringBuilder builder;
    builder << "could not enforce write concern";

    for (vector<Status>::const_iterator it = failedStatuses.begin(); it != failedStatuses.end();
         ++it) {
        const Status& failedStatus = *it;
        if (it == failedStatuses.begin()) {
            builder << causedBy(failedStatus.toString());
        } else {
            builder << ":: and ::" << failedStatus.toString();
        }
    }

    return Status(failedStatuses.size() == 1u ? failedStatuses.front().code()
                                              : ErrorCodes::MultipleErrorsOccurred,
                  builder.str());
}
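// enforceLegacyWriteConcern drains every pending response before reporting,
// then folds all failures into one status: the original error code if exactly
// one shard failed, MultipleErrorsOccurred otherwise. A standalone sketch of
// that fold (Status here is a hypothetical two-field struct, and
// kMultipleErrorsOccurred a placeholder value, not the MongoDB definitions):
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

struct Status {
    int code;  // 0 means OK
    std::string reason;
};
const int kMultipleErrorsOccurred = 65;

Status foldFailures(const std::vector<Status>& failed) {
    if (failed.empty()) {
        return {0, ""};
    }
    std::ostringstream builder;
    builder << "could not enforce write concern";
    for (size_t i = 0; i < failed.size(); ++i) {
        // The first failure is the primary cause; the rest are chained on.
        builder << (i == 0 ? " caused by " : " :: and :: ") << failed[i].reason;
    }
    int code = (failed.size() == 1u) ? failed.front().code : kMultipleErrorsOccurred;
    return {code, builder.str()};
}

int main() {
    std::vector<Status> failed = {{9001, "shard A timed out"},
                                  {9002, "shard B unreachable"}};
    Status s = foldFailures(failed);
    std::cout << s.code << ": " << s.reason << "\n";
    return 0;
}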
Status ShardingCatalogClientImpl::insertConfigDocument(OperationContext* opCtx,
                                                       const NamespaceString& nss,
                                                       const BSONObj& doc,
                                                       const WriteConcernOptions& writeConcern) {
    invariant(nss.db() == NamespaceString::kAdminDb || nss.db() == NamespaceString::kConfigDb);

    const BSONElement idField = doc.getField("_id");
    invariant(!idField.eoo());

    BatchedCommandRequest request([&] {
        write_ops::Insert insertOp(nss);
        insertOp.setDocuments({doc});
        return insertOp;
    }());
    request.setWriteConcern(writeConcern.toBSON());

    auto configShard = Grid::get(opCtx)->shardRegistry()->getConfigShard();
    for (int retry = 1; retry <= kMaxWriteRetry; retry++) {
        auto response = configShard->runBatchWriteCommand(
            opCtx, Shard::kDefaultConfigCommandTimeout, request, Shard::RetryPolicy::kNoRetry);

        Status status = response.toStatus();

        if (retry < kMaxWriteRetry &&
            configShard->isRetriableError(status.code(), Shard::RetryPolicy::kIdempotent)) {
            // Pretend like the operation is idempotent because we're handling DuplicateKey errors
            // specially
            continue;
        }

        // If we get a DuplicateKey error on the first attempt to insert, this definitively means
        // that we are trying to insert the same entry a second time, so error out. If it happens
        // on a retry attempt though, it is not clear whether we are actually inserting a duplicate
        // key or whether the first attempt succeeded but failed to wait for write concern. In
        // order to differentiate, fetch the entry and check.
        if (retry > 1 && status == ErrorCodes::DuplicateKey) {
            LOG(1) << "Insert retry failed because of duplicate key error, rechecking.";

            auto fetchDuplicate =
                _exhaustiveFindOnConfig(opCtx,
                                        ReadPreferenceSetting{ReadPreference::PrimaryOnly},
                                        repl::ReadConcernLevel::kMajorityReadConcern,
                                        nss,
                                        idField.wrap(),
                                        BSONObj(),
                                        boost::none);
            if (!fetchDuplicate.isOK()) {
                return fetchDuplicate.getStatus();
            }

            auto existingDocs = fetchDuplicate.getValue().value;
            if (existingDocs.empty()) {
                return {status.withContext(
                    stream() << "DuplicateKey error was returned after a retry attempt, but no "
                                "documents were found. This means a concurrent change occurred "
                                "together with the retries.")};
            }

            invariant(existingDocs.size() == 1);

            BSONObj existing = std::move(existingDocs.front());
            if (existing.woCompare(doc) == 0) {
                // Documents match, so treat the operation as success
                return Status::OK();
            }
        }

        return status;
    }

    MONGO_UNREACHABLE;
}
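// insertConfigDocument treats DuplicateKey specially: on the first attempt it
// is a real conflict, but on a retry it may just mean the first attempt already
// committed before the acknowledgement was lost, so the code refetches the
// stored document and compares. A compact sketch of that decision flow
// (insertOnce, insertWithRetry, and the in-memory store are hypothetical
// stand-ins for the config-server calls above):
#include <iostream>
#include <map>
#include <string>

enum class Result { OK, DuplicateKey, TransientError };

std::map<int, std::string> store;  // stands in for the config collection

Result insertOnce(int id, const std::string& doc, bool reportTransient) {
    bool inserted = store.insert({id, doc}).second;
    if (reportTransient) {
        return Result::TransientError;  // the write landed, but the ack was lost
    }
    return inserted ? Result::OK : Result::DuplicateKey;
}

bool insertWithRetry(int id, const std::string& doc, int maxRetry) {
    for (int retry = 1; retry <= maxRetry; ++retry) {
        // Simulate an ack lost on the first attempt only.
        Result r = insertOnce(id, doc, /*reportTransient=*/retry == 1);
        if (r == Result::TransientError && retry < maxRetry) {
            continue;  // retriable error: try again
        }
        if (r == Result::DuplicateKey && retry > 1) {
            // Retry hit a duplicate: refetch and compare, as the code above does.
            auto it = store.find(id);
            return it != store.end() && it->second == doc;  // same doc => success
        }
        return r == Result::OK;
    }
    return false;
}

int main() {
    std::cout << std::boolalpha << insertWithRetry(1, "{x:1}", 3) << "\n";  // true
    return 0;
}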
int main(int argc, char** argv) {
    string impl = "printer";
    const char* script = NULL;
    for (int32_t i = 1; i < argc; i++) {
        if (string(argv[i]) == "-j") {
            impl = "jit";
        } else {
            script = argv[i];
        }
    }
    Translator* translator = Translator::create(impl);

    if (translator == 0) {
        cout << "TODO: Implement translator factory in translator.cpp!!!!" << endl;
        return 1;
    }

    const char* expr =
        "double x; double y;"
        "x += 8.0; y = 2.0;"
        "print('Hello, x=',x,' y=',y,'\n');";
    bool isDefaultExpr = true;

    if (script != NULL) {
        expr = loadFile(script);
        if (expr == 0) {
            printf("Cannot read file: %s\n", script);
            return 1;
        }
        isDefaultExpr = false;
    }

    Code* code = 0;

    Status* translateStatus = translator->translate(expr, &code);
    if (translateStatus->isError()) {
        uint32_t position = translateStatus->getPosition();
        uint32_t line = 0, offset = 0;
        positionToLineOffset(expr, position, line, offset);
        fprintf(stderr, "Cannot translate expression: expression at %d,%d; "
               "error '%s'\n",
               line, offset,
               translateStatus->getError().c_str());
    } else {
        assert(code != 0);
        vector<Var*> vars;

        if (isDefaultExpr) {
            Var* xVar = new Var(VT_DOUBLE, "x");
            Var* yVar = new Var(VT_DOUBLE, "y");
            vars.push_back(xVar);
            vars.push_back(yVar);
            xVar->setDoubleValue(42.0);
        }
        Status* execStatus = code->execute(vars);
        if (execStatus->isError()) {
            fprintf(stderr, "Cannot execute expression: error: %s\n",
                    execStatus->getError().c_str());
        } else if (isDefaultExpr) {
            printf("x evaluated to %f\n", vars[0]->getDoubleValue());
        }
        // Free the demo variables on both the success and error paths.
        for (uint32_t i = 0; i < vars.size(); i++) {
            delete vars[i];
        }
        delete code;
        delete execStatus;
    }
    delete translateStatus;
    delete translator;

    if (!isDefaultExpr) {
        delete[] expr;
    }

    return 0;
}
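// The error path above maps a flat character position in the source text to a
// (line, offset) pair via positionToLineOffset. A plausible standalone
// implementation of that helper (the real MathVM version may differ in details
// such as 0- vs 1-based counting):
#include <cstdint>
#include <cstdio>

void positionToLineOffset(const char* source, uint32_t position,
                          uint32_t& line, uint32_t& offset) {
    line = 0;
    offset = 0;
    for (uint32_t i = 0; i < position && source[i] != '\0'; i++) {
        if (source[i] == '\n') {
            line++;      // newline: advance to the next line...
            offset = 0;  // ...and reset the column counter
        } else {
            offset++;
        }
    }
}

int main() {
    uint32_t line = 0, offset = 0;
    positionToLineOffset("double x;\nx = ;", 12, line, offset);
    printf("error at line %u, offset %u\n", line, offset);  // line 1, offset 2
    return 0;
}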
/// Add or update an entry in the cache.
///
/// @param[in] path          Asset path.
/// @param[in] subDataIndex  Sub-data index associated with the cached data.
/// @param[in] pData         Data to cache.
/// @param[in] timestamp     Timestamp value to associate with the entry in the cache.
/// @param[in] size          Number of bytes to cache.
///
/// @return  True if the cache was updated successfully, false if not.
bool Cache::CacheEntry(
					   AssetPath path,
					   uint32_t subDataIndex,
					   const void* pData,
					   int64_t timestamp,
					   uint32_t size )
{
	HELIUM_ASSERT( pData || size == 0 );

	Status status;
	status.Read( m_cacheFileName.GetData() );
	int64_t cacheFileSize = status.m_Size;
	uint64_t entryOffset = ( cacheFileSize == -1 ? 0 : static_cast< uint64_t >( cacheFileSize ) );

	HELIUM_ASSERT( m_pEntryPool );
	Entry* pEntryUpdate = m_pEntryPool->Allocate();
	HELIUM_ASSERT( pEntryUpdate );
	pEntryUpdate->offset = entryOffset;
	pEntryUpdate->timestamp = timestamp;
	pEntryUpdate->path = path;
	pEntryUpdate->subDataIndex = subDataIndex;
	pEntryUpdate->size = size;

	uint64_t originalOffset = 0;
	int64_t originalTimestamp = 0;
	uint32_t originalSize = 0;

	EntryKey key;
	key.path = path;
	key.subDataIndex = subDataIndex;

	EntryMapType::Accessor entryAccessor;
	bool bNewEntry = m_entryMap.Insert( entryAccessor, KeyValue< EntryKey, Entry* >( key, pEntryUpdate ) );
	if( bNewEntry )
	{
		HELIUM_TRACE( TraceLevels::Info, TXT( "Cache: Adding \"%s\" to cache \"%s\".\n" ), *path.ToString(), *m_cacheFileName );

		m_entries.Push( pEntryUpdate );
	}
	else
	{
		HELIUM_TRACE( TraceLevels::Info, TXT( "Cache: Updating \"%s\" in cache \"%s\".\n" ), *path.ToString(), *m_cacheFileName );

		m_pEntryPool->Release( pEntryUpdate );

		pEntryUpdate = entryAccessor->Second();
		HELIUM_ASSERT( pEntryUpdate );

		originalOffset = pEntryUpdate->offset;
		originalTimestamp = pEntryUpdate->timestamp;
		originalSize = pEntryUpdate->size;

		if( originalSize < size )
		{
			pEntryUpdate->offset = entryOffset;
		}
		else
		{
			entryOffset = originalOffset;
		}

		pEntryUpdate->timestamp = timestamp;
		pEntryUpdate->size = size;
	}

	AsyncLoader& rLoader = AsyncLoader::GetStaticInstance();

	rLoader.Lock();

	bool bCacheSuccess = true;

	FileStream* pCacheStream = FileStream::OpenFileStream( m_cacheFileName, FileStream::MODE_WRITE, false );
	if( !pCacheStream )
	{
		HELIUM_TRACE( TraceLevels::Error, TXT( "Cache: Failed to open cache \"%s\" for writing.\n" ), *m_cacheFileName );

		bCacheSuccess = false;
	}
	else
	{
		HELIUM_TRACE(
			TraceLevels::Info,
			TXT( "Cache: Caching \"%s\" to \"%s\" (%" ) PRIu32 TXT( " bytes @ offset %" ) PRIu64 TXT( ").\n" ),
			*path.ToString(),
			*m_cacheFileName,
			size,
			entryOffset );

		uint64_t seekOffset = static_cast< uint64_t >( pCacheStream->Seek(
			static_cast< int64_t >( entryOffset ),
			SeekOrigins::Begin ) );
		if( seekOffset != entryOffset )
		{
			HELIUM_TRACE( TraceLevels::Error, TXT( "Cache: Cache file offset seek failed.\n" ) );

			if( bNewEntry )
			{
				m_entries.Pop();
				m_entryMap.Remove( entryAccessor );
				m_pEntryPool->Release( pEntryUpdate );
			}
			else
			{
				pEntryUpdate->offset = originalOffset;
				pEntryUpdate->timestamp = originalTimestamp;
				pEntryUpdate->size = originalSize;
			}

			bCacheSuccess = false;
		}
		else
		{
			size_t writeSize = pCacheStream->Write( pData, 1, size );
			if( writeSize != size )
			{
				HELIUM_TRACE(
					TraceLevels::Error,
					( TXT( "Cache: Failed to write %" ) PRIu32 TXT( " bytes to cache \"%s\" (%" ) PRIuSZ
					TXT( " bytes written).\n" ) ),
					size,
					*m_cacheFileName,
					writeSize );

				if( bNewEntry )
				{
					m_entries.Pop();
					m_entryMap.Remove( entryAccessor );
					m_pEntryPool->Release( pEntryUpdate );
				}
				else
				{
					pEntryUpdate->offset = originalOffset;
					pEntryUpdate->timestamp = originalTimestamp;
					pEntryUpdate->size = originalSize;
				}

				bCacheSuccess = false;
			}
			else
			{
				HELIUM_TRACE( TraceLevels::Info, TXT( "Cache: Rewriting TOC file \"%s\".\n" ), *m_tocFileName );

				FileStream* pTocStream = FileStream::OpenFileStream( m_tocFileName, FileStream::MODE_WRITE, true );
				if( !pTocStream )
				{
					HELIUM_TRACE( TraceLevels::Error, TXT( "Cache: Failed to open TOC \"%s\" for writing.\n" ), *m_tocFileName );
				}
				else
				{
					BufferedStream* pBufferedStream = new BufferedStream( pTocStream );
					HELIUM_ASSERT( pBufferedStream );

					pBufferedStream->Write( &TOC_MAGIC, sizeof( TOC_MAGIC ), 1 );
					pBufferedStream->Write( &sm_Version, sizeof( sm_Version ), 1 );

					uint32_t entryCount = static_cast< uint32_t >( m_entries.GetSize() );
					pBufferedStream->Write( &entryCount, sizeof( entryCount ), 1 );

					String entryPath;
					uint_fast32_t entryCountFast = entryCount;
					for( uint_fast32_t entryIndex = 0; entryIndex < entryCountFast; ++entryIndex )
					{
						Entry* pEntry = m_entries[ entryIndex ];
						HELIUM_ASSERT( pEntry );

						pEntry->path.ToString( entryPath );
						HELIUM_ASSERT( entryPath.GetSize() < UINT16_MAX );
						uint16_t pathSize = static_cast< uint16_t >( entryPath.GetSize() );
						pBufferedStream->Write( &pathSize, sizeof( pathSize ), 1 );

						pBufferedStream->Write( *entryPath, sizeof( char ), pathSize );

						pBufferedStream->Write( &pEntry->subDataIndex, sizeof( pEntry->subDataIndex ), 1 );

						pBufferedStream->Write( &pEntry->offset, sizeof( pEntry->offset ), 1 );
						pBufferedStream->Write( &pEntry->timestamp, sizeof( pEntry->timestamp ), 1 );
						pBufferedStream->Write( &pEntry->size, sizeof( pEntry->size ), 1 );
					}

					delete pBufferedStream;
					delete pTocStream;
				}
			}
		}

		delete pCacheStream;
	}

	rLoader.Unlock();

	return bCacheSuccess;
}
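// The TOC rewrite above serializes a fixed header (magic, version, entry count)
// followed by one length-prefixed record per entry. A minimal sketch of that
// on-disk layout using std::ofstream; the per-entry field widths mirror the
// writes above, while the header field types and magic/version values are
// placeholder assumptions, not Helium's TOC_MAGIC or sm_Version:
#include <cstdint>
#include <fstream>
#include <string>
#include <vector>

struct TocEntry {
    std::string path;
    uint32_t subDataIndex;
    uint64_t offset;
    int64_t timestamp;
    uint32_t size;
};

template <typename T>
void writePod(std::ofstream& out, const T& value) {
    out.write(reinterpret_cast<const char*>(&value), sizeof(value));
}

void writeToc(const std::string& fileName, const std::vector<TocEntry>& entries) {
    std::ofstream out(fileName, std::ios::binary | std::ios::trunc);
    const uint32_t kMagic = 0x43544f43;  // placeholder magic value
    const uint32_t kVersion = 1;         // placeholder version
    writePod(out, kMagic);
    writePod(out, kVersion);
    writePod(out, static_cast<uint32_t>(entries.size()));
    for (const TocEntry& e : entries) {
        uint16_t pathSize = static_cast<uint16_t>(e.path.size());
        writePod(out, pathSize);             // length prefix...
        out.write(e.path.data(), pathSize);  // ...then the raw path bytes
        writePod(out, e.subDataIndex);
        writePod(out, e.offset);
        writePod(out, e.timestamp);
        writePod(out, e.size);
    }
}

int main() {
    writeToc("cache.toc", {{"asset/texture0", 0, 0, 1234567890, 4096}});
    return 0;
}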
    /**
     * This is called by db/ops/query.cpp.  This is the entry point for answering a query.
     */
    string newRunQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result) {
        // This is a read lock.
        Client::ReadContext ctx(q.ns, dbpath);

        // Parse, canonicalize, plan, transcribe, and get a runner.
        Runner* rawRunner;
        CanonicalQuery* cq;
        Status status = getRunner(q, &rawRunner, &cq);
        if (!status.isOK()) {
            uasserted(17007, "Couldn't process query " + q.query.toString()
                         + " why: " + status.reason());
        }
        verify(NULL != rawRunner);
        auto_ptr<Runner> runner(rawRunner);

        log() << "Running query on new system: " << cq->toString();

        // We freak out later if this changes before we're done with the query.
        const ChunkVersion shardingVersionAtStart = shardingState.getVersion(q.ns);

        // We use this a lot below.
        const LiteParsedQuery& pq = cq->getParsed();

        // TODO: Remove when impl'd
        if (pq.hasOption(QueryOption_OplogReplay)) {
            warning() << "haven't implemented findingstartcursor yet\n";
        }

        // Handle query option $maxTimeMS (not used with commands).
        curop.setMaxTimeMicros(static_cast<unsigned long long>(pq.getMaxTimeMS()) * 1000);
        killCurrentOp.checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

        // uassert if we are not on a primary, and not a secondary with SlaveOk query parameter set.
        replVerifyReadsOk(&pq);

        // If this exists, the collection is sharded.
        // If it doesn't exist, we can assume we're not sharded.
        // If we're sharded, we might encounter data that is not consistent with our sharding state.
        // We must ignore this data.
        CollectionMetadataPtr collMetadata;
        if (!shardingState.needCollectionMetadata(pq.ns())) {
            collMetadata = CollectionMetadataPtr();
        }
        else {
            collMetadata = shardingState.getCollectionMetadata(pq.ns());
        }

        // Run the query.
        // bb is used to hold query results
        // this buffer should contain either requested documents per query or
        // explain information, but not both
        BufBuilder bb(32768);
        bb.skip(sizeof(QueryResult));

        // How many results have we obtained from the runner?
        int numResults = 0;

        // If we're replaying the oplog, we save the last time that we read.
        OpTime slaveReadTill;

        // Do we save the Runner in a ClientCursor for getMore calls later?
        bool saveClientCursor = false;

        // We turn on auto-yielding for the runner here.  The runner registers itself with the
        // active runners list in ClientCursor.
        ClientCursor::registerRunner(runner.get());
        runner->setYieldPolicy(Runner::YIELD_AUTO);
        auto_ptr<DeregisterEvenIfUnderlyingCodeThrows> safety(
            new DeregisterEvenIfUnderlyingCodeThrows(runner.get()));

        BSONObj obj;
        Runner::RunnerState state;

        // Set this outside the loop; we need it both within the loop and when deciding
        // whether to fill in explain information.
        const bool isExplain = pq.isExplain();

        while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&obj, NULL))) {
            // If we're sharded, make sure that we don't return any data that hasn't been
            // migrated off of our shard yet.
            if (collMetadata) {
                // This information can change if we yield and as such we must make sure to re-fetch
                // it if we yield.
                KeyPattern kp(collMetadata->getKeyPattern());
                // This performs excessive BSONObj creation but that's OK for now.
                if (!collMetadata->keyBelongsToMe(kp.extractSingleKey(obj))) { continue; }
            }

            // Add the result to the output buffer. This is unnecessary if explain info is
            // requested.
            if (!isExplain) {
                bb.appendBuf((void*)obj.objdata(), obj.objsize());
            }

            // Count the result.
            ++numResults;

            // Possibly note slave's position in the oplog.
            if (pq.hasOption(QueryOption_OplogReplay)) {
                BSONElement e = obj["ts"];
                if (Date == e.type() || Timestamp == e.type()) {
                    slaveReadTill = e._opTime();
                }
            }

            // TODO: only one type of 2d search doesn't support this.  We need a way to pull it out
            // of CanonicalQuery. :(
            const bool supportsGetMore = true;
            if (isExplain) {
                if (enoughForExplain(pq, numResults)) {
                    break;
                }
            }
            else if (!supportsGetMore && (enough(pq, numResults)
                                          || bb.len() >= MaxBytesToReturnToClientAtOnce)) {
                break;
            }
            else if (enoughForFirstBatch(pq, numResults, bb.len())) {
                // If only one result requested assume it's a findOne() and don't save the cursor.
                if (pq.wantMore() && 1 != pq.getNumToReturn()) {
                    saveClientCursor = true;
                }
                break;
            }
        }

        // If we cache the runner later, we want to deregister it as it receives notifications
        // anyway by virtue of being cached.
        //
        // If we don't cache the runner later, we are deleting it, so it must be deregistered.
        //
        // So, no matter what, deregister the runner.
        safety.reset();

        // Caller expects exceptions thrown in certain cases:
        // * in-memory sort using too much RAM.
        if (Runner::RUNNER_ERROR == state) {
            uasserted(17144, "Runner error, memory limit for sort probably exceeded");
        }

        // Why save a dead runner?
        if (Runner::RUNNER_DEAD == state) {
            saveClientCursor = false;
        }
        else if (pq.hasOption(QueryOption_CursorTailable) && (1 != pq.getNumToReturn())) {
            // If pq.hasOption(tailable) the only plan the planner will output is a collscan with
            // tailable set.
            saveClientCursor = true;
        }

        // TODO(greg): This will go away soon.
        if (!shardingState.getVersion(pq.ns()).isWriteCompatibleWith(shardingVersionAtStart)) {
            // If the version changed during the query, we might be missing some data, and it's
            // safe to send this because mongos can resend at this point.
            throw SendStaleConfigException(pq.ns(), "version changed during initial query",
                                           shardingVersionAtStart,
                                           shardingState.getVersion(pq.ns()));
        }

        long long ccId = 0;
        if (saveClientCursor) {
            // We won't use the runner until it's getMore'd.
            runner->saveState();

            // Allocate a new ClientCursor.  We don't have to worry about leaking it as it's
            // inserted into a global map by its ctor.
            ClientCursor* cc = new ClientCursor(runner.get(), cq->getParsed().getOptions(),
                                                cq->getParsed().getFilter());
            ccId = cc->cursorid();

            log() << "caching runner with cursorid " << ccId
                  << " after returning " << numResults << " results" << endl;

            // ClientCursor takes ownership of runner.  Release to make sure it's not deleted.
            runner.release();

            // TODO document
            if (pq.hasOption(QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
                cc->slaveReadTill(slaveReadTill);
            }

            // TODO document
            if (pq.hasOption(QueryOption_Exhaust)) {
                curop.debug().exhaust = true;
            }

            // Set attributes for getMore.
            cc->setCollMetadata(collMetadata);
            cc->setPos(numResults);

            // If the query had a time limit, remaining time is "rolled over" to the cursor (for
            // use by future getmore ops).
            cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros());
        }

        // append explain information to query results
        if (isExplain) {
            BSONObjBuilder bob;
            bob.append("n", numResults);
            BSONObj obj = bob.done();
            bb.appendBuf((void*)obj.objdata(), obj.objsize());
            // The explain output is actually a result.
            numResults = 1;
        }

        // Add the results from the query into the output buffer.
        result.appendData(bb.buf(), bb.len());
        bb.decouple();

        // Fill out the output buffer's header.
        QueryResult* qr = static_cast<QueryResult*>(result.header());
        qr->cursorId = ccId;
        curop.debug().cursorid = (0 == ccId ? -1 : ccId);
        qr->setResultFlagsToOk();
        qr->setOperation(opReply);
        qr->startingFrom = 0;
        qr->nReturned = numResults;
        // TODO: nscanned is bogus.
        // curop.debug().nscanned = ( cursor ? cursor->nscanned() : 0LL );
        curop.debug().ntoskip = pq.getSkip();
        curop.debug().nreturned = numResults;

        // curop.debug().exhaust is set above.
        return curop.debug().exhaust ? pq.ns() : "";
    }
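    // The result loop above stops on one of three conditions: enough rows for an
    // explain, enough rows/bytes for a plan that cannot getMore, or a full first
    // batch, in which case the cursor is saved unless the query was a
    // single-result findOne(). A small sketch of that first-batch decision; the
    // signatures and numeric limits are simplified stand-ins, not the server's
    // actual helpers or constants:
    #include <iostream>

    const int kMaxFirstBatchDocs = 101;
    const int kMaxFirstBatchBytes = 1024 * 1024;

    bool enoughForFirstBatchSketch(int ntoreturn, int numDocs, int bytesBuffered) {
        if (ntoreturn == 0) {
            // No explicit limit: stop at the default doc count or the byte cap.
            return numDocs >= kMaxFirstBatchDocs || bytesBuffered >= kMaxFirstBatchBytes;
        }
        return numDocs >= ntoreturn;
    }

    bool shouldSaveCursor(bool wantMore, int ntoreturn) {
        // A single requested result is treated as findOne(): no cursor is kept.
        return wantMore && ntoreturn != 1;
    }

    int main() {
        std::cout << std::boolalpha
                  << enoughForFirstBatchSketch(0, 101, 512) << " "  // true: doc cap hit
                  << shouldSaveCursor(true, 1) << "\n";             // false: findOne()
        return 0;
    }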