Example #1
        bool run(OperationContext* txn,
                 const string& dbname,
                 BSONObj& jsobj,
                 int,
                 string& errmsg,
                 BSONObjBuilder& result,
                 bool /*fromRepl*/) {

            Client::ReadContext ctx( txn, dbname );
            const Database* d = ctx.ctx().db();
            const DatabaseCatalogEntry* dbEntry = d->getDatabaseCatalogEntry();

            list<string> names;
            dbEntry->getCollectionNamespaces( &names );

            BSONArrayBuilder arr;

            for ( list<string>::const_iterator i = names.begin(); i != names.end(); ++i ) {
                string ns = *i;

                BSONObjBuilder b;
                b.append( "name", nsToCollectionSubstring( ns ) );

                CollectionOptions options =
                    dbEntry->getCollectionCatalogEntry( txn, ns )->getCollectionOptions(txn);
                b.append( "options", options.toBSON() );

                arr.append( b.obj() );
            }

            result.append( "collections", arr.arr() );

            return true;
        }
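A note on the helper these examples share: nsToCollectionSubstring() takes a full namespace of the form "<db>.<collection>" and returns the collection part, i.e. everything after the first '.', while nsToDatabaseSubstring()/nsToDatabase() return the part before it. The standalone sketch below mirrors those semantics with std::string_view; it is illustrative only, not MongoDB's implementation.

    // Illustrative stand-ins for the namespace helpers used throughout these examples.
    #include <cassert>
    #include <string_view>

    std::string_view dbSubstring(std::string_view ns) {
        return ns.substr(0, ns.find('.'));               // "test.foo" -> "test"
    }

    std::string_view collectionSubstring(std::string_view ns) {
        const auto dot = ns.find('.');
        return dot == std::string_view::npos ? std::string_view() : ns.substr(dot + 1);
    }

    int main() {
        assert(dbSubstring("test.system.indexes") == "test");
        assert(collectionSubstring("test.system.indexes") == "system.indexes");
        return 0;
    }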
Example #2
    void Database::clearTmpCollections(OperationContext* txn) {

        txn->lockState()->assertWriteLocked( _name );

        list<string> collections;
        _dbEntry->getCollectionNamespaces( &collections );

        for ( list<string>::iterator i = collections.begin(); i != collections.end(); ++i ) {
            string ns = *i;
            invariant( NamespaceString::normal( ns ) );

            CollectionCatalogEntry* coll = _dbEntry->getCollectionCatalogEntry( txn, ns );

            CollectionOptions options = coll->getCollectionOptions( txn );
            if ( !options.temp )
                continue;

            WriteUnitOfWork wunit(txn);
            Status status = dropCollection( txn, ns );
            if ( !status.isOK() ) {
                warning() << "could not drop temp collection '" << ns << "': " << status;
                continue;
            }

            string cmdNs = _name + ".$cmd";
            repl::logOp( txn,
                         "c",
                         cmdNs.c_str(),
                         BSON( "drop" << nsToCollectionSubstring( ns ) ) );
            wunit.commit();
        }
    }
    CollectionOptions MMAPV1DatabaseCatalogEntry::getCollectionOptions( OperationContext* txn,
                                                                        const StringData& ns ) const {
        if ( nsToCollectionSubstring( ns ) == "system.namespaces" ) {
            return CollectionOptions();
        }

        RecordStoreV1Base* rs = _getNamespaceRecordStore();
        invariant( rs );

        scoped_ptr<RecordIterator> it( rs->getIterator(txn) );
        while ( !it->isEOF() ) {
            DiskLoc loc = it->getNext();
            BSONObj entry = it->dataFor( loc ).toBson();
            BSONElement name = entry["name"];
            if ( name.type() == String && name.String() == ns ) {
                CollectionOptions options;
                if ( entry["options"].isABSONObj() ) {
                    Status status = options.parse( entry["options"].Obj() );
                    fassert( 18523, status );
                }
                return options;
            }
        }

        return CollectionOptions();
    }
Example #4
    void insertObjects(const char *ns, const vector<BSONObj> &objs, bool keepGoing, uint64_t flags, bool logop ) {
        StringData _ns(ns);
        if (NamespaceString::isSystem(_ns)) {
            massert(16748, "need transaction to run insertObjects", cc().txnStackSize() > 0);
            uassert(10095, "attempt to insert in reserved database name 'system'", nsToDatabaseSubstring(_ns) != "system");
            massert(16750, "attempted to insert multiple objects into a system namespace at once", objs.size() == 1);

            // Trying to insert into a system collection.  Fancy side-effects go here:
            if (nsToCollectionSubstring(ns) == "system.indexes") {
                BSONObj obj = stripDropDups(objs[0]);
                NamespaceDetails *d = getAndMaybeCreateNS(obj["ns"].Stringdata(), logop);
                bool ok = d->ensureIndex(obj);
                if (!ok) {
                    // Already had that index
                    return;
                }

                // Now we have to actually insert that document into system.indexes, we may have
                // modified it with stripDropDups.
                vector<BSONObj> newObjs;
                newObjs.push_back(obj);
                _insertObjects(ns, newObjs, keepGoing, flags, logop);
                return;
            } else if (!legalClientSystemNS(ns, true)) {
                uasserted(16459, str::stream() << "attempt to insert in system namespace '" << ns << "'");
            }
        }
        _insertObjects(ns, objs, keepGoing, flags, logop);
    }
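For context on the system.indexes branch above: what gets inserted there is an index spec document whose "ns" field names the collection being indexed. The snippet below just prints an example of that shape; the field values are hypothetical.

    // Hypothetical index-spec document of the kind the system.indexes branch handles.
    #include <iostream>

    int main() {
        const char* indexSpec =
            R"({ "ns": "test.foo", "key": { "a": 1 }, "name": "a_1", "background": true })";
        std::cout << indexSpec << std::endl;
        return 0;
    }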
Example #5
        bool run(OperationContext* txn,
                 const string& dbname,
                 BSONObj& jsobj,
                 int,
                 string& errmsg,
                 BSONObjBuilder& result,
                 bool /*fromRepl*/) {

            ScopedTransaction scopedXact(txn, MODE_IS);
            AutoGetDb autoDb(txn, dbname, MODE_S);

            const Database* d = autoDb.getDb();
            const DatabaseCatalogEntry* dbEntry = NULL;

            list<string> names;
            if ( d ) {
                dbEntry = d->getDatabaseCatalogEntry();
                dbEntry->getCollectionNamespaces( &names );
                names.sort();
            }

            scoped_ptr<MatchExpression> matcher;
            if ( jsobj["filter"].isABSONObj() ) {
                StatusWithMatchExpression parsed =
                    MatchExpressionParser::parse( jsobj["filter"].Obj() );
                if ( !parsed.isOK() ) {
                    return appendCommandStatus( result, parsed.getStatus() );
                }
                matcher.reset( parsed.getValue() );
            }

            BSONArrayBuilder arr;

            for ( list<string>::const_iterator i = names.begin(); i != names.end(); ++i ) {
                string ns = *i;

                StringData collection = nsToCollectionSubstring( ns );
                if ( collection == "system.namespaces" ) {
                    continue;
                }

                BSONObjBuilder b;
                b.append( "name", collection );

                CollectionOptions options =
                    dbEntry->getCollectionCatalogEntry( txn, ns )->getCollectionOptions(txn);
                b.append( "options", options.toBSON() );

                BSONObj maybe = b.obj();
                if ( matcher && !matcher->matchesBSON( maybe ) ) {
                    continue;
                }

                arr.append( maybe );
            }

            result.append( "collections", arr.arr() );

            return true;
        }
Example #6
 // returns true if the operation should run in an alternate
 // transaction stack instead of the possible multi statement
 // transaction stack that it is a part of. Several operations/statements,
 // such as authentication, should not run inside a multi-statement transaction.
 static bool opNeedsAltTxn(const StringData &ns) {
     // for now, the only operations that need to run in an
     // alternate transaction stack are authentication related
     // operations. We do not want them to be part of multi statement
     // transactions.
     return nsToCollectionSubstring(ns) == "system.users";
 }
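A self-contained sketch of how a predicate like opNeedsAltTxn() might be consulted by a caller; everything here (the std-based reimplementation, AltTxnScope, the output) is a stand-in for illustration, not the surrounding TokuMX code.

    // Hypothetical routing sketch: auth-related writes go to an alternate txn stack.
    #include <iostream>
    #include <string_view>

    static bool opNeedsAltTxn(std::string_view ns) {
        const auto dot = ns.find('.');
        return dot != std::string_view::npos && ns.substr(dot + 1) == "system.users";
    }

    struct AltTxnScope {  // stand-in for pushing/popping an alternate transaction stack
        AltTxnScope()  { std::cout << "begin alternate txn stack\n"; }
        ~AltTxnScope() { std::cout << "end alternate txn stack\n"; }
    };

    int main() {
        if (opNeedsAltTxn("admin.system.users")) {
            AltTxnScope alt;  // the op runs outside any multi-statement transaction
            std::cout << "apply op\n";
        }
        return 0;
    }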
Example #7
AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* txn, const std::string& ns)
    : _txn(txn),
      _transaction(txn, MODE_IS),
      _db(_txn, nsToDatabaseSubstring(ns), MODE_IS),
      _collLock(_txn->lockState(), ns, MODE_IS),
      _coll(NULL) {
    _init(ns, nsToCollectionSubstring(ns));
}
Example #8
    void CollectionMap::drop() {
        Lock::assertWriteLocked(_database);
        init();
        if (!allocated()) {
            return;
        }

        string errmsg;
        BSONObjBuilder result;

        // This implementation is inefficient and slightly messy, but it was easy.
        // Feel free to improve it as necessary:
        // - The getCursor call will grab a table lock on .system.namespaces.
        // - We'll look at the entire system.namespaces collection just for one database.
        // - Code is duplicated to handle dropping system collections in stages.
        vector<string> sysIndexesEntries;
        const string systemNamespacesNs = getSisterNS(_database, "system.namespaces");
        Collection *sysCl = getCollection(systemNamespacesNs);
        for (shared_ptr<Cursor> c(Cursor::make(sysCl)); c->ok(); c->advance()) {
            const BSONObj nsObj = c->current();
            const StringData ns = nsObj["name"].Stringdata();
            if (nsToDatabaseSubstring(ns) != _database) {
                // Not part of this database, skip.
                continue;
            }
            if (nsToCollectionSubstring(ns) == "system.indexes") {
                // Save .system.indexes collection for last, because drop() deletes from it.
                sysIndexesEntries.push_back(ns.toString());
            } else {
                Collection *cl = getCollection(ns);
                if (cl != NULL) {
                    cl->drop(errmsg, result, true);
                }
            }
        }
        if (sysCl != NULL) {
            // The .system.namespaces collection does not include itself.
            sysCl->drop(errmsg, result, true);
        }
        // Now drop the system.indexes entries.
        for (vector<string>::const_iterator it = sysIndexesEntries.begin(); it != sysIndexesEntries.end(); it++) {
            // Need to close any existing handle before drop.
            Collection *cl = getCollection(*it);
            if (cl != NULL) {
                cl->drop(errmsg, result, true);
            }
        }
        // Everything that was open should have been closed due to drop.
        verify(_collections.empty());

        shared_ptr<storage::Dictionary> metadb = _metadb;
        _metadb.reset();
        const int r = metadb->close();
        if (r != 0) {
            storage::handle_ydb_error(r);
        }
        storage::db_remove(_metadname);
    }
Example #9
    // Copies ops out of the bgsync queue into the deque passed in as a parameter.
    // Returns true if the batch should be ended early.
    // Batch should end early if we encounter a command, or if
    // there are no further ops in the bgsync queue to read.
    // This function also blocks 1 second waiting for new ops to appear in the bgsync
    // queue.  We can't block forever because there are maintenance things we need
    // to periodically check in the loop.
    bool SyncTail::tryPopAndWaitForMore(SyncTail::OpQueue* ops, ReplicationCoordinator* replCoord) {
        BSONObj op;
        // Check to see if there are ops waiting in the bgsync queue
        bool peek_success = peek(&op);

        if (!peek_success) {
            // if we don't have anything in the queue, wait a bit for something to appear
            if (ops->empty()) {
                replCoord->signalDrainComplete();
                // block up to 1 second
                _networkQueue->waitForMore();
                return false;
            }

            // otherwise, apply what we have
            return true;
        }

        const char* ns = op["ns"].valuestrsafe();

        // check for commands
        if ((op["op"].valuestrsafe()[0] == 'c') ||
            // Index builds are achieved through the use of an insert op, not a command op.
            // The following line is the same as what the insert code uses to detect an index build.
            ( *ns != '\0' && nsToCollectionSubstring(ns) == "system.indexes" )) {

            if (ops->empty()) {
                // apply commands one-at-a-time
                ops->push_back(op);
                _networkQueue->consume();
            }

            // otherwise, apply what we have so far and come back for the command
            return true;
        }

        // check for oplog version change
        BSONElement elemVersion = op["v"];
        int curVersion = 0;
        if (elemVersion.eoo())
            // missing version means version 1
            curVersion = 1;
        else
            curVersion = elemVersion.Int();
        
        if (curVersion != OPLOG_VERSION) {
            severe() << "expected oplog version " << OPLOG_VERSION << " but found version " 
                     << curVersion << " in oplog entry: " << op;
            fassertFailedNoTrace(18820);
        }
    
        // Copy the op to the deque and remove it from the bgsync queue.
        ops->push_back(op);
        _networkQueue->consume();

        // Go back for more ops
        return false;
    }
// Copies ops out of the bgsync queue into the deque passed in as a parameter.
// Returns true if the batch should be ended early.
// Batch should end early if we encounter a command, or if
// there are no further ops in the bgsync queue to read.
// This function also blocks 1 second waiting for new ops to appear in the bgsync
// queue.  We can't block forever because there are maintenance things we need
// to periodically check in the loop.
bool SyncTail::tryPopAndWaitForMore(OperationContext* txn, SyncTail::OpQueue* ops) {
    BSONObj op;
    // Check to see if there are ops waiting in the bgsync queue
    bool peek_success = peek(&op);

    if (!peek_success) {
        // if we don't have anything in the queue, wait a bit for something to appear
        if (ops->empty()) {
            // block up to 1 second
            _networkQueue->waitForMore();
            return false;
        }

        // otherwise, apply what we have
        return true;
    }

    auto entry = OplogEntry(op);

    // Check for ops that must be processed one at a time.
    if (entry.raw.isEmpty() ||       // sentinel that network queue is drained.
        (entry.opType[0] == 'c') ||  // commands.
        // Index builds are achieved through the use of an insert op, not a command op.
        // The following line is the same as what the insert code uses to detect an index build.
        (!entry.ns.empty() && nsToCollectionSubstring(entry.ns) == "system.indexes")) {
        if (ops->empty()) {
            // apply commands one-at-a-time
            ops->push_back(std::move(entry));
            _networkQueue->consume();
        }

        // otherwise, apply what we have so far and come back for the command
        return true;
    }

    // check for oplog version change
    int curVersion = 0;
    if (entry.version.eoo())
        // missing version means version 1
        curVersion = 1;
    else
        curVersion = entry.version.Int();

    if (curVersion != OPLOG_VERSION) {
        severe() << "expected oplog version " << OPLOG_VERSION << " but found version "
                 << curVersion << " in oplog entry: " << op;
        fassertFailedNoTrace(18820);
    }

    // Copy the op to the deque and remove it from the bgsync queue.
    ops->push_back(std::move(entry));
    _networkQueue->consume();

    // Go back for more ops
    return false;
}
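The contract described in the comments above (return true to end the batch early, false to come back for more) can be exercised with a small toy model; the types and the driver loop below are stand-ins, not SyncTail's real API, and the real code additionally blocks up to a second when the queue is empty.

    // Toy model of the batching contract: commands and system.indexes inserts end a batch.
    #include <deque>
    #include <iostream>
    #include <string>
    #include <vector>

    struct Op { char opType; std::string ns; };

    bool tryPopToy(std::deque<Op>& queue, std::vector<Op>& batch) {
        if (queue.empty())
            return !batch.empty();            // real code waits up to 1s here before returning false
        const Op& op = queue.front();
        const bool oneAtATime = op.opType == 'c' || op.ns == "test.system.indexes";
        if (oneAtATime) {
            if (batch.empty()) { batch.push_back(op); queue.pop_front(); }
            return true;                      // end the batch at (or just before) the command
        }
        batch.push_back(op);
        queue.pop_front();
        return false;                         // keep accumulating
    }

    int main() {
        std::deque<Op> queue = {{'i', "test.foo"}, {'i', "test.foo"}, {'c', "test.$cmd"}};
        std::vector<Op> batch;
        while (!tryPopToy(queue, batch)) {
        }
        std::cout << "batch size: " << batch.size() << "\n";  // prints 2: ends before the command
        return 0;
    }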
Example #11
    void receivedInsert(Message& m, CurOp& op) {
        DbMessage d(m);
        const char *ns = d.getns();
        op.debug().ns = ns;

        StringData coll = nsToCollectionSubstring(ns);
        // Auth checking for index writes happens later.
        if (coll != "system.indexes") {
            Status status = cc().getAuthorizationManager()->checkAuthForInsert(ns);
            uassert(16544, status.reason(), status.isOK());
        }

        if (!d.moreJSObjs()) {
            // strange.  should we complain?
            return;
        }

        vector<BSONObj> objs;
        while (d.moreJSObjs()) {
            objs.push_back(d.nextJsObj());
        }

        const bool keepGoing = d.reservedField() & InsertOption_ContinueOnError;

        OpSettings settings;
        settings.setQueryCursorMode(WRITE_LOCK_CURSOR);
        cc().setOpSettings(settings);

        if (coll == "system.indexes" &&
                // Can only build non-unique indexes in the background, because the
                // hot indexer does not know how to perform unique checks.
                objs[0]["background"].trueValue() && !objs[0]["unique"].trueValue()) {
            _buildHotIndex(ns, m, objs);
            return;
        }

        scoped_ptr<Client::ShardedOperationScope> scp;
        if (coll != "system.indexes") {
            scp.reset(new Client::ShardedOperationScope);
            if (scp->handlePossibleShardedMessage(m, 0)) {
                return;
            }
        }

        LOCK_REASON(lockReason, "insert");
        try {
            Lock::DBRead lk(ns, lockReason);
            lockedReceivedInsert(ns, m, objs, op, keepGoing);
        }
        catch (RetryWithWriteLock &e) {
            Lock::DBWrite lk(ns, lockReason);
            lockedReceivedInsert(ns, m, objs, op, keepGoing);
        }
    }
Example #12
    void IndexDetails::kill_idx() {
        const string ns = indexNamespace();
        const string parentns = parentNS();

        close();
        storage::db_remove(ns);

        // Removing this index's ns from the system.indexes/namespaces catalog.
        removeNamespaceFromCatalog(ns);
        if (nsToCollectionSubstring(parentns) != "system.indexes") {
            removeFromSysIndexes(parentns, indexName());
        }
    }
Example #13
    /* deletes a record, just the pdfile portion -- no index cleanup, no cursor cleanup, etc.
       caller must check if capped
    */
    void DataFileMgr::_deleteRecord(NamespaceDetails *d, const StringData& ns, Record *todelete, const DiskLoc& dl) {
        /* remove ourself from the record next/prev chain */
        {
            if ( todelete->prevOfs() != DiskLoc::NullOfs )
                getDur().writingInt( todelete->getPrev(dl).rec()->nextOfs() ) = todelete->nextOfs();
            if ( todelete->nextOfs() != DiskLoc::NullOfs )
                getDur().writingInt( todelete->getNext(dl).rec()->prevOfs() ) = todelete->prevOfs();
        }

        /* remove ourself from extent pointers */
        {
            Extent *e = getDur().writing( todelete->myExtent(dl) );
            if ( e->firstRecord == dl ) {
                if ( todelete->nextOfs() == DiskLoc::NullOfs )
                    e->firstRecord.Null();
                else
                    e->firstRecord.set(dl.a(), todelete->nextOfs() );
            }
            if ( e->lastRecord == dl ) {
                if ( todelete->prevOfs() == DiskLoc::NullOfs )
                    e->lastRecord.Null();
                else
                    e->lastRecord.set(dl.a(), todelete->prevOfs() );
            }
        }

        /* add to the free list */
        {
            d->incrementStats( -1 * todelete->netLength(), -1 );

            if ( nsToCollectionSubstring(ns) == "system.indexes") {
                /* temp: if in system.indexes, don't reuse, and zero out: we want to be
                   careful until validated more, as IndexDetails has pointers
                   to this disk location.  so an incorrectly done remove would cause
                   a lot of problems.
                */
                memset(getDur().writingPtr(todelete, todelete->lengthWithHeaders() ), 0, todelete->lengthWithHeaders() );
            }
            else {
                DEV {
                    unsigned long long *p = reinterpret_cast<unsigned long long *>( todelete->data() );
                    *getDur().writing(p) = 0;
                    //DEV memset(todelete->data, 0, todelete->netLength()); // attempt to notice invalid reuse.
                }
                d->addDeletedRec((DeletedRecord*)todelete, dl);
            }
        }
    }
Example #14
    void SyncClusterConnection::insert( const string &ns, BSONObj obj , int flags) {

        uassert(13119,
                str::stream() << "SyncClusterConnection::insert obj has to have an _id: " << obj,
                nsToCollectionSubstring(ns) == "system.indexes" || obj["_id"].type());

        string errmsg;
        if ( ! prepare( errmsg ) )
            throw UserException( 8003 , (string)"SyncClusterConnection::insert prepare failed: " + errmsg );

        for ( size_t i=0; i<_conns.size(); i++ ) {
            _conns[i]->insert( ns , obj , flags);
        }

        _checkLast();
    }
Example #15
    /** { ..., capped: true, size: ..., max: ... }
     * @param createDefaultIndexes - if false, defers id (and other) index creation.
     * @return Status::OK() if successful
    */
    Status userCreateNS( OperationContext* txn,
                         Database* db,
                         StringData ns,
                         BSONObj options,
                         bool logForReplication,
                         bool createDefaultIndexes ) {

        invariant( db );

        LOG(1) << "create collection " << ns << ' ' << options;

        if ( !NamespaceString::validCollectionComponent(ns) )
            return Status( ErrorCodes::InvalidNamespace,
                           str::stream() << "invalid ns: " << ns );

        Collection* collection = db->getCollection( ns );

        if ( collection )
            return Status( ErrorCodes::NamespaceExists,
                           "collection already exists" );

        CollectionOptions collectionOptions;
        Status status = collectionOptions.parse(options);
        if ( !status.isOK() )
            return status;

        status = validateStorageOptions(collectionOptions.storageEngine,
                                        &StorageEngine::Factory::validateCollectionStorageOptions);
        if ( !status.isOK() )
            return status;

        invariant( db->createCollection( txn, ns, collectionOptions, true, createDefaultIndexes ) );

        if ( logForReplication ) {
            if ( options.getField( "create" ).eoo() ) {
                BSONObjBuilder b;
                b << "create" << nsToCollectionSubstring( ns );
                b.appendElements( options );
                options = b.obj();
            }
            string logNs = nsToDatabase(ns) + ".$cmd";
            repl::logOp(txn, "c", logNs.c_str(), options);
        }

        return Status::OK();
    }
Example #16
    bool WriteBatchExecutor::ExecInsertsState::_lockAndCheckImpl(WriteOpResult* result) {
        if (hasLock()) {
            txn->getCurOp()->enter(_context.get());
            return true;
        }

        invariant(!_context.get());
        _writeLock.reset(new Lock::DBLock(txn->lockState(),
                                          nsToDatabase(request->getNS()),
                                          newlm::MODE_X));
        if (!checkIsMasterForDatabase(request->getNS(), result)) {
            return false;
        }
        if (!checkShardVersion(txn, &shardingState, *request, result)) {
            return false;
        }
        if (!checkIndexConstraints(txn, &shardingState, *request, result)) {
            return false;
        }

        _context.reset(new Client::Context(txn, request->getNS(), false));

        Database* database = _context->db();
        dassert(database);
        _collection = database->getCollection(txn, request->getTargetingNS());
        if (!_collection) {
            WriteUnitOfWork wunit (txn);
            // Implicitly create if it doesn't exist
            _collection = database->createCollection(txn, request->getTargetingNS());
            if (!_collection) {
                result->setError(
                        toWriteError(Status(ErrorCodes::InternalError,
                                            "could not create collection " +
                                            request->getTargetingNS())));
                return false;
            }
            repl::logOp(txn,
                        "c",
                        (database->name() + ".$cmd").c_str(),
                        BSON("create" << nsToCollectionSubstring(request->getTargetingNS())));
            wunit.commit();
        }
        return true;
    }
Example #17
        bool run(OperationContext* txn,
                 const string& dbname,
                 BSONObj& jsobj,
                 int,
                 string& errmsg,
                 BSONObjBuilder& result,
                 bool /*fromRepl*/) {

            Lock::DBRead lk( txn->lockState(), dbname );

            const Database* d = dbHolder().get( txn, dbname );
            const DatabaseCatalogEntry* dbEntry = NULL;

            list<string> names;
            if ( d ) {
                dbEntry = d->getDatabaseCatalogEntry();
                dbEntry->getCollectionNamespaces( &names );
                names.sort();
            }

            BSONArrayBuilder arr;

            for ( list<string>::const_iterator i = names.begin(); i != names.end(); ++i ) {
                string ns = *i;

                StringData collection = nsToCollectionSubstring( ns );
                if ( collection == "system.namespaces" ) {
                    continue;
                }

                BSONObjBuilder b;
                b.append( "name", collection );

                CollectionOptions options =
                    dbEntry->getCollectionCatalogEntry( txn, ns )->getCollectionOptions(txn);
                b.append( "options", options.toBSON() );

                arr.append( b.obj() );
            }

            result.append( "collections", arr.arr() );

            return true;
        }
    void MMAPV1DatabaseCatalogEntry::_addNamespaceToNamespaceCollection( OperationContext* txn,
                                                                        const StringData& ns,
                                                                        const BSONObj* options ) {
        if ( nsToCollectionSubstring( ns ) == "system.namespaces" ) {
            // system.namespaces holds all the others, so it is not explicitly listed in the catalog.
            return;
        }

        BSONObjBuilder b;
        b.append("name", ns);
        if ( options && !options->isEmpty() )
            b.append("options", *options);
        BSONObj obj = b.done();

        RecordStoreV1Base* rs = _getNamespaceRecordStore( txn, ns );
        invariant( rs );
        StatusWith<DiskLoc> loc = rs->insertRecord( txn, obj.objdata(), obj.objsize(), -1 );
        massertStatusOK( loc.getStatus() );
    }
    void MMAPV1DatabaseCatalogEntry::_removeNamespaceFromNamespaceCollection( OperationContext* txn,
                                                                             const StringData& ns ) {
        if ( nsToCollectionSubstring( ns ) == "system.namespaces" ) {
            // system.namespaces holds all the others, so it is not explicitly listed in the catalog.
            return;
        }

        RecordStoreV1Base* rs = _getNamespaceRecordStore();
        invariant( rs );

        scoped_ptr<RecordIterator> it( rs->getIterator(txn) );
        while ( !it->isEOF() ) {
            DiskLoc loc = it->getNext();
            BSONObj entry = it->dataFor( loc ).toBson();
            BSONElement name = entry["name"];
            if ( name.type() == String && name.String() == ns ) {
                rs->deleteRecord( txn, loc );
                break;
            }
        }
    }
Example #20
 bool NOINLINE_DECL insert_checkSys(const char *sys, const char *ns, bool& wouldAddIndex, const void *obuf, bool god) {
     uassert( 10095 , "attempt to insert in reserved database name 'system'", sys != ns);
     if ( strstr(ns, ".system.") ) {
         // later:check for dba-type permissions here if have that at some point separate
         if (nsToCollectionSubstring(ns) == "system.indexes")
             wouldAddIndex = true;
         else if ( legalClientSystemNS( ns , true ) ) {
             if ( obuf &&
                     StringData(ns) == StringData(".system.users", StringData::LiteralTag()) ) {
                 BSONObj t( reinterpret_cast<const char *>( obuf ) );
                 V2UserDocumentParser parser;
                 uassertStatusOK(parser.checkValidUserDocument(t));
             }
         }
         else if ( !god ) {
             uasserted(16459, str::stream() << "attempt to insert in system namespace '"
                                            << ns << "'");
         }
     }
     return true;
 }
Example #21
    void Database::clearTmpCollections(OperationContext* txn) {
        invariant(txn->lockState()->isDbLockedForMode(name(), MODE_X));

        list<string> collections;
        _dbEntry->getCollectionNamespaces( &collections );

        for ( list<string>::iterator i = collections.begin(); i != collections.end(); ++i ) {
            string ns = *i;
            invariant( NamespaceString::normal( ns ) );

            CollectionCatalogEntry* coll = _dbEntry->getCollectionCatalogEntry( ns );

            CollectionOptions options = coll->getCollectionOptions( txn );
            if ( !options.temp )
                continue;
            try {
                WriteUnitOfWork wunit(txn);
                Status status = dropCollection( txn, ns );
                if ( !status.isOK() ) {
                    warning() << "could not drop temp collection '" << ns << "': " << status;
                    continue;
                }

                string cmdNs = _name + ".$cmd";
                repl::logOp( txn,
                             "c",
                             cmdNs.c_str(),
                             BSON( "drop" << nsToCollectionSubstring( ns ) ) );
                wunit.commit();
            }
            catch (const WriteConflictException& exp) {
                warning() << "could not drop temp collection '" << ns << "' due to "
                    "WriteConflictException";
                txn->recoveryUnit()->commitAndRestart();
            }
        }
    }
Example #22
DBClientCursor::DBClientCursor(DBClientBase* client,
                               const std::string& ns,
                               const BSONObj& query,
                               long long cursorId,
                               int nToReturn,
                               int nToSkip,
                               const BSONObj* fieldsToReturn,
                               int queryOptions,
                               int batchSize)
    : _client(client),
      _originalHost(_client->getServerAddress()),
      ns(ns),
      _isCommand(nsIsFull(ns) ? nsToCollectionSubstring(ns) == "$cmd" : false),
      query(query),
      nToReturn(nToReturn),
      haveLimit(nToReturn > 0 && !(queryOptions & QueryOption_CursorTailable)),
      nToSkip(nToSkip),
      fieldsToReturn(fieldsToReturn),
      opts(queryOptions),
      batchSize(batchSize == 1 ? 2 : batchSize),
      resultFlags(0),
      cursorId(cursorId),
      _ownCursor(true),
      wasError(false) {}
Example #23
void BackgroundSync::_produce(OperationContext* txn) {
    // this oplog reader does not do a handshake because we don't want the server it's syncing
    // from to track how far it has synced
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        if (_lastOpTimeFetched.isNull()) {
            // then we're initial syncing and we're still waiting for this to be set
            lock.unlock();
            sleepsecs(1);
            // if there is no one to sync from
            return;
        }

        if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary() ||
            inShutdownStrict()) {
            return;
        }
    }

    while (MONGO_FAIL_POINT(rsBgSyncProduce)) {
        sleepmillis(0);
    }


    // find a target to sync from the last optime fetched
    OpTime lastOpTimeFetched;
    HostAndPort source;
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        lastOpTimeFetched = _lastOpTimeFetched;
        _syncSourceHost = HostAndPort();
    }
    SyncSourceResolverResponse syncSourceResp =
        _syncSourceResolver.findSyncSource(txn, lastOpTimeFetched);

    if (syncSourceResp.syncSourceStatus == ErrorCodes::OplogStartMissing) {
        // All (accessible) sync sources were too stale.
        error() << "too stale to catch up -- entering maintenance mode";
        log() << "Our newest OpTime : " << lastOpTimeFetched;
        log() << "Earliest OpTime available is " << syncSourceResp.earliestOpTimeSeen;
        log() << "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember";
        setMinValid(txn, {lastOpTimeFetched, syncSourceResp.earliestOpTimeSeen});
        auto status = _replCoord->setMaintenanceMode(true);
        if (!status.isOK()) {
            warning() << "Failed to transition into maintenance mode.";
        }
        bool worked = _replCoord->setFollowerMode(MemberState::RS_RECOVERING);
        if (!worked) {
            warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING)
                      << ". Current state: " << _replCoord->getMemberState();
        }
        return;
    } else if (syncSourceResp.isOK() && !syncSourceResp.getSyncSource().empty()) {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        _syncSourceHost = syncSourceResp.getSyncSource();
        source = _syncSourceHost;
    } else {
        if (!syncSourceResp.isOK()) {
            log() << "failed to find sync source, received error "
                  << syncSourceResp.syncSourceStatus.getStatus();
        }
        // No sync source found.
        sleepsecs(1);
        return;
    }

    long long lastHashFetched;
    {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        if (_stopped) {
            return;
        }
        lastOpTimeFetched = _lastOpTimeFetched;
        lastHashFetched = _lastFetchedHash;
        _replCoord->signalUpstreamUpdater();
    }

    const auto isV1ElectionProtocol = _replCoord->isV1ElectionProtocol();
    // Under protocol version 1, make the awaitData timeout (maxTimeMS) dependent on the election
    // timeout. This enables the sync source to communicate liveness of the primary to secondaries.
    // Under protocol version 0, use a default timeout of 2 seconds for awaitData.
    const Milliseconds fetcherMaxTimeMS(
        isV1ElectionProtocol ? _replCoord->getConfig().getElectionTimeoutPeriod() / 2 : Seconds(2));

    Status fetcherReturnStatus = Status::OK();
    auto fetcherCallback = stdx::bind(&BackgroundSync::_fetcherCallback,
                                      this,
                                      stdx::placeholders::_1,
                                      stdx::placeholders::_3,
                                      stdx::cref(source),
                                      lastOpTimeFetched,
                                      lastHashFetched,
                                      fetcherMaxTimeMS,
                                      &fetcherReturnStatus);


    BSONObjBuilder cmdBob;
    cmdBob.append("find", nsToCollectionSubstring(rsOplogName));
    cmdBob.append("filter", BSON("ts" << BSON("$gte" << lastOpTimeFetched.getTimestamp())));
    cmdBob.append("tailable", true);
    cmdBob.append("oplogReplay", true);
    cmdBob.append("awaitData", true);
    cmdBob.append("maxTimeMS", durationCount<Milliseconds>(Minutes(1)));  // 1 min initial find.

    BSONObjBuilder metadataBob;
    if (isV1ElectionProtocol) {
        cmdBob.append("term", _replCoord->getTerm());
        metadataBob.append(rpc::kReplSetMetadataFieldName, 1);
    }

    auto dbName = nsToDatabase(rsOplogName);
    auto cmdObj = cmdBob.obj();
    auto metadataObj = metadataBob.obj();
    Fetcher fetcher(&_threadPoolTaskExecutor,
                    source,
                    dbName,
                    cmdObj,
                    fetcherCallback,
                    metadataObj,
                    _replCoord->getConfig().getElectionTimeoutPeriod());

    LOG(1) << "scheduling fetcher to read remote oplog on " << source << " starting at "
           << cmdObj["filter"];
    auto scheduleStatus = fetcher.schedule();
    if (!scheduleStatus.isOK()) {
        warning() << "unable to schedule fetcher to read remote oplog on " << source << ": "
                  << scheduleStatus;
        return;
    }
    fetcher.wait();
    LOG(1) << "fetcher stopped reading remote oplog on " << source;

    // If the background sync is stopped after the fetcher is started, we need to
    // re-evaluate our sync source and oplog common point.
    if (isStopped()) {
        return;
    }

    if (fetcherReturnStatus.code() == ErrorCodes::OplogOutOfOrder) {
        // This is bad because it means that our source
        // has not returned oplog entries in ascending ts order, and they need to be.

        warning() << fetcherReturnStatus.toString();
        // Do not blacklist the server here, it will be blacklisted when we try to reuse it,
        // if it can't return a matching oplog start from the last fetch oplog ts field.
        return;
    } else if (fetcherReturnStatus.code() == ErrorCodes::OplogStartMissing ||
               fetcherReturnStatus.code() == ErrorCodes::RemoteOplogStale) {
        // Rollback is a synchronous operation that uses the task executor and may not be
        // executed inside the fetcher callback.
        const int messagingPortTags = 0;
        ConnectionPool connectionPool(messagingPortTags);
        std::unique_ptr<ConnectionPool::ConnectionPtr> connection;
        auto getConnection = [&connection, &connectionPool, source]() -> DBClientBase* {
            if (!connection.get()) {
                connection.reset(new ConnectionPool::ConnectionPtr(
                    &connectionPool, source, Date_t::now(), oplogSocketTimeout));
            };
            return connection->get();
        };

        {
            stdx::lock_guard<stdx::mutex> lock(_mutex);
            lastOpTimeFetched = _lastOpTimeFetched;
        }

        log() << "Starting rollback due to " << fetcherReturnStatus;

        // Wait till all buffered oplog entries have drained and been applied.
        auto lastApplied = _replCoord->getMyLastAppliedOpTime();
        if (lastApplied != lastOpTimeFetched) {
            log() << "Waiting for all operations from " << lastApplied << " until "
                  << lastOpTimeFetched << " to be applied before starting rollback.";
            while (lastOpTimeFetched > (lastApplied = _replCoord->getMyLastAppliedOpTime())) {
                sleepmillis(10);
                if (isStopped() || inShutdown()) {
                    return;
                }
            }
        }
        // check that we are at minvalid, otherwise we cannot roll back as we may be in an
        // inconsistent state
        BatchBoundaries boundaries = getMinValid(txn);
        if (!boundaries.start.isNull() || boundaries.end > lastApplied) {
            fassertNoTrace(18750,
                           Status(ErrorCodes::UnrecoverableRollbackError,
                                  str::stream()
                                      << "need to rollback, but in inconsistent state. "
                                      << "minvalid: " << boundaries.end.toString()
                                      << " > our last optime: " << lastApplied.toString()));
        }

        _rollback(txn, source, getConnection);
        stop();
    } else if (fetcherReturnStatus == ErrorCodes::InvalidBSON) {
        Seconds blacklistDuration(60);
        warning() << "Fetcher got invalid BSON while querying oplog. Blacklisting sync source "
                  << source << " for " << blacklistDuration << ".";
        _replCoord->blacklistSyncSource(source, Date_t::now() + blacklistDuration);
    } else if (!fetcherReturnStatus.isOK()) {
        warning() << "Fetcher error querying oplog: " << fetcherReturnStatus.toString();
    }
}
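One concrete data point for the "find" command built above: assuming rsOplogName is the usual "local.oplog.rs", the namespace splits at the first dot, so the command targets collection "oplog.rs" in database "local". A quick standalone check:

    // Standalone check of the oplog namespace split (illustrative, std-based).
    #include <cassert>
    #include <string_view>

    int main() {
        std::string_view rsOplogName = "local.oplog.rs";
        const auto dot = rsOplogName.find('.');
        assert(rsOplogName.substr(0, dot) == "local");      // nsToDatabase(rsOplogName)
        assert(rsOplogName.substr(dot + 1) == "oplog.rs");  // nsToCollectionSubstring(rsOplogName)
        return 0;
    }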
Example #24
void BackgroundSync::_produce(OperationContext* txn, executor::TaskExecutor* taskExecutor) {
    // this oplog reader does not do a handshake because we don't want the server it's syncing
    // from to track how far it has synced
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        if (_lastOpTimeFetched.isNull()) {
            // then we're initial syncing and we're still waiting for this to be set
            lock.unlock();
            sleepsecs(1);
            // if there is no one to sync from
            return;
        }

        if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary() ||
            inShutdownStrict()) {
            return;
        }
    }

    while (MONGO_FAIL_POINT(rsBgSyncProduce)) {
        sleepmillis(0);
    }


    // find a target to sync from the last optime fetched
    OpTime lastOpTimeFetched;
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        lastOpTimeFetched = _lastOpTimeFetched;
        _syncSourceHost = HostAndPort();
    }
    OplogReader syncSourceReader;
    syncSourceReader.connectToSyncSource(txn, lastOpTimeFetched, _replCoord);

    // no server found
    if (syncSourceReader.getHost().empty()) {
        sleepsecs(1);
        // if there is no one to sync from
        return;
    }

    long long lastHashFetched;
    {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        if (_pause) {
            return;
        }
        lastOpTimeFetched = _lastOpTimeFetched;
        lastHashFetched = _lastFetchedHash;
        _syncSourceHost = syncSourceReader.getHost();
        _replCoord->signalUpstreamUpdater();
    }

    const Milliseconds oplogSocketTimeout(OplogReader::kSocketTimeout);

    // Prefer host in oplog reader to _syncSourceHost because _syncSourceHost may be cleared
    // if sync source feedback fails.
    const HostAndPort source = syncSourceReader.getHost();
    syncSourceReader.resetConnection();
    // no more references to oplog reader from here on.

    // If this status is not OK after the fetcher returns from wait(),
    // proceed to execute rollback
    Status remoteOplogStartStatus = Status::OK();

    auto fetcherCallback = stdx::bind(&BackgroundSync::_fetcherCallback,
                                      this,
                                      stdx::placeholders::_1,
                                      stdx::placeholders::_3,
                                      stdx::cref(source),
                                      lastOpTimeFetched,
                                      lastHashFetched,
                                      &remoteOplogStartStatus);

    auto cmdObj = BSON("find" << nsToCollectionSubstring(rsOplogName) << "filter"
                              << BSON("ts" << BSON("$gte" << lastOpTimeFetched.getTimestamp()))
                              << "tailable" << true << "oplogReplay" << true << "awaitData" << true
                              << "maxTimeMS" << int(fetcherMaxTimeMS.count()));
    Fetcher fetcher(taskExecutor,
                    source,
                    nsToDatabase(rsOplogName),
                    cmdObj,
                    fetcherCallback,
                    rpc::makeEmptyMetadata());
    auto scheduleStatus = fetcher.schedule();
    if (!scheduleStatus.isOK()) {
        warning() << "unable to schedule fetcher to read remote oplog on " << source << ": "
                  << scheduleStatus;
        return;
    }
    fetcher.wait();

    // If the background sync is paused after the fetcher is started, we need to
    // re-evaluate our sync source and oplog common point.
    if (isPaused()) {
        return;
    }

    // Execute rollback if necessary.
    // Rollback is a synchronous operation that uses the task executor and may not be
    // executed inside the fetcher callback.
    if (!remoteOplogStartStatus.isOK()) {
        const int messagingPortTags = 0;
        ConnectionPool connectionPool(messagingPortTags);
        std::unique_ptr<ConnectionPool::ConnectionPtr> connection;
        auto getConnection =
            [&connection, &connectionPool, oplogSocketTimeout, source]() -> DBClientBase* {
                if (!connection.get()) {
                    connection.reset(new ConnectionPool::ConnectionPtr(
                        &connectionPool, source, Date_t::now(), oplogSocketTimeout));
                };
                return connection->get();
            };

        log() << "starting rollback: " << remoteOplogStartStatus;
        _rollback(txn, source, getConnection);
        stop();
    }
}
Example #25
// static
Status SyncTail::syncApply(OperationContext* txn,
                           const BSONObj& op,
                           bool convertUpdateToUpsert,
                           ApplyOperationInLockFn applyOperationInLock,
                           ApplyCommandInLockFn applyCommandInLock,
                           IncrementOpsAppliedStatsFn incrementOpsAppliedStats) {
    if (inShutdown()) {
        return Status::OK();
    }

    // Count each log op application as a separate operation, for reporting purposes
    CurOp individualOp(txn);

    const char* ns = op.getStringField("ns");
    verify(ns);

    const char* opType = op["op"].valuestrsafe();

    bool isCommand(opType[0] == 'c');
    bool isNoOp(opType[0] == 'n');

    if ((*ns == '\0') || (*ns == '.')) {
        // this is ugly
        // this is often a no-op
        // but can't be 100% sure
        if (!isNoOp) {
            error() << "skipping bad op in oplog: " << op.toString();
        }
        return Status::OK();
    }

    if (isCommand) {
        MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
            // a command may need a global write lock. so we will conservatively go
            // ahead and grab one here. suboptimal. :-(
            Lock::GlobalWrite globalWriteLock(txn->lockState());

            // special case apply for commands to avoid implicit database creation
            Status status = applyCommandInLock(txn, op);
            incrementOpsAppliedStats();
            return status;
        }
        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "syncApply_command", ns);
    }

    auto applyOp = [&](Database* db) {
        // For non-initial-sync, we convert updates to upserts
        // to suppress errors when replaying oplog entries.
        txn->setReplicatedWrites(false);
        DisableDocumentValidation validationDisabler(txn);

        Status status =
            applyOperationInLock(txn, db, op, convertUpdateToUpsert, incrementOpsAppliedStats);
        if (!status.isOK() && status.code() == ErrorCodes::WriteConflict) {
            throw WriteConflictException();
        }
        return status;
    };

    if (isNoOp || (opType[0] == 'i' && nsToCollectionSubstring(ns) == "system.indexes")) {
        auto opStr = isNoOp ? "syncApply_noop" : "syncApply_indexBuild";
        MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
            Lock::DBLock dbLock(txn->lockState(), nsToDatabaseSubstring(ns), MODE_X);
            OldClientContext ctx(txn, ns);
            return applyOp(ctx.db());
        }
        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, opStr, ns);
    }
Example #26
        void operator()( DBClientCursorBatchIterator &i ) {
            Lock::GlobalWrite lk;
            if ( context ) {
                context->relocked();
            }

            while( i.moreInCurrentBatch() ) {
                if ( n % 128 == 127 /*yield some*/ ) {
                    time_t now = time(0);
                    if( now - lastLog >= 60 ) { 
                        // report progress
                        if( lastLog )
                            log() << "clone " << to_collection << ' ' << n << endl;
                        lastLog = now;
                    }
                    mayInterrupt( _mayBeInterrupted );
                    dbtempreleaseif t( _mayYield );
                }

                BSONObj tmp = i.nextSafe();

                /* assure object is valid.  note this will slow us down a little. */
                if ( !tmp.valid() ) {
                    stringstream ss;
                    ss << "Cloner: skipping corrupt object from " << from_collection;
                    BSONElement e = tmp.firstElement();
                    try {
                        e.validate();
                        ss << " firstElement: " << e;
                    }
                    catch( ... ) {
                        ss << " firstElement corrupt";
                    }
                    out() << ss.str() << endl;
                    continue;
                }

                ++n;

                BSONObj js = tmp;
                if ( isindex ) {
                    verify(nsToCollectionSubstring(from_collection) == "system.indexes");
                    js = fixindex(tmp);
                    storedForLater->push_back( js.getOwned() );
                    continue;
                }

                try {
                    DiskLoc loc = theDataFileMgr.insertWithObjMod(to_collection, js);
                    loc.assertOk();
                    if ( logForRepl )
                        logOp("i", to_collection, js);

                    getDur().commitIfNeeded();
                }
                catch( UserException& e ) {
                    error() << "error: exception cloning object in " << from_collection << ' ' << e.what() << " obj:" << js.toString() << '\n';
                    throw;
                }

                RARELY if ( time( 0 ) - saveLast > 60 ) {
                    log() << n << " objects cloned so far from collection " << from_collection << endl;
                    saveLast = time( 0 );
                }
            }
        }
Example #27
        void operator()( DBClientCursorBatchIterator &i ) {
            Lock::GlobalWrite lk;
            context.relocked();

            bool createdCollection = false;
            Collection* collection = NULL;

            while( i.moreInCurrentBatch() ) {
                if ( numSeen % 128 == 127 /*yield some*/ ) {
                    collection = NULL;
                    time_t now = time(0);
                    if( now - lastLog >= 60 ) {
                        // report progress
                        if( lastLog )
                            log() << "clone " << to_collection << ' ' << numSeen << endl;
                        lastLog = now;
                    }
                    mayInterrupt( _mayBeInterrupted );
                    dbtempreleaseif t( _mayYield );
                }

                if ( isindex == false && collection == NULL ) {
                    collection = context.db()->getCollection( to_collection );
                    if ( !collection ) {
                        massert( 17321,
                                 str::stream()
                                 << "collection dropped during clone ["
                                 << to_collection << "]",
                                 !createdCollection );
                        createdCollection = true;
                        collection = context.db()->createCollection( txn, to_collection );
                        verify( collection );
                    }
                }

                BSONObj tmp = i.nextSafe();

                /* assure object is valid.  note this will slow us down a little. */
                const Status status = validateBSON(tmp.objdata(), tmp.objsize());
                if (!status.isOK()) {
                    out() << "Cloner: skipping corrupt object from " << from_collection
                          << ": " << status.reason();
                    continue;
                }

                ++numSeen;

                BSONObj js = tmp;
                if ( isindex ) {
                    verify(nsToCollectionSubstring(from_collection) == "system.indexes");
                    js = fixindex(context.db()->name(), tmp);
                    indexesToBuild->push_back( js.getOwned() );
                    continue;
                }

                verify(nsToCollectionSubstring(from_collection) != "system.indexes");

                StatusWith<DiskLoc> loc = collection->insertDocument( txn, js, true );
                if ( !loc.isOK() ) {
                    error() << "error: exception cloning object in " << from_collection
                            << ' ' << loc.toString() << " obj:" << js;
                }
                uassertStatusOK( loc.getStatus() );
                if ( logForRepl )
                    logOp(txn, "i", to_collection, js);

                getDur().commitIfNeeded();

                RARELY if ( time( 0 ) - saveLast > 60 ) {
                    log() << numSeen << " objects cloned so far from collection " << from_collection;
                    saveLast = time( 0 );
                }
            }
        }
Example #28
    /** @param fromRepl false if from ApplyOpsCmd
        @return true if an update should have happened but the document did not exist; see replset initial sync code.
     */
    bool applyOperation_inlock(OperationContext* txn,
                               Database* db,
                               const BSONObj& op,
                               bool fromRepl,
                               bool convertUpdateToUpsert) {
        LOG(3) << "applying op: " << op << endl;
        bool failedUpdate = false;

        OpCounters * opCounters = fromRepl ? &replOpCounters : &globalOpCounters;

        const char *names[] = { "o", "ns", "op", "b", "o2" };
        BSONElement fields[5];
        op.getFields(5, names, fields);
        BSONElement& fieldO = fields[0];
        BSONElement& fieldNs = fields[1];
        BSONElement& fieldOp = fields[2];
        BSONElement& fieldB = fields[3];
        BSONElement& fieldO2 = fields[4];

        BSONObj o;
        if( fieldO.isABSONObj() )
            o = fieldO.embeddedObject();

        const char *ns = fieldNs.valuestrsafe();

        BSONObj o2;
        if (fieldO2.isABSONObj())
            o2 = fieldO2.Obj();

        bool valueB = fieldB.booleanSafe();

        txn->lockState()->assertWriteLocked(ns);

        Collection* collection = db->getCollection( txn, ns );
        IndexCatalog* indexCatalog = collection == NULL ? NULL : collection->getIndexCatalog();

        // operation type -- see logOp() comments for types
        const char *opType = fieldOp.valuestrsafe();

        if ( *opType == 'i' ) {
            opCounters->gotInsert();

            const char *p = strchr(ns, '.');
            if ( p && nsToCollectionSubstring( p ) == "system.indexes" ) {
                if (o["background"].trueValue()) {
                    IndexBuilder* builder = new IndexBuilder(o);
                    // This spawns a new thread and returns immediately.
                    builder->go();
                }
                else {
                    IndexBuilder builder(o);
                    Status status = builder.buildInForeground(txn, db);
                    if ( status.isOK() ) {
                        // yay
                    }
                    else if ( status.code() == ErrorCodes::IndexOptionsConflict ||
                              status.code() == ErrorCodes::IndexKeySpecsConflict ) {
                        // SERVER-13206, SERVER-13496
                        // 2.4 (and earlier) will add an ensureIndex to an oplog if its ok or not
                        // so in 2.6+ where we do stricter validation, it will fail
                        // but we shouldn't care as the primary is responsible
                        warning() << "index creation attempted on secondary that conflicts, "
                                  << "skipping: " << status;
                    }
                    else {
                        uassertStatusOK( status );
                    }
                }
            }
            else {
                // do upserts for inserts as we might get replayed more than once
                OpDebug debug;
                BSONElement _id;
                if( !o.getObjectID(_id) ) {
                    /* No _id.  This will be very slow. */
                    Timer t;

                    const NamespaceString requestNs(ns);
                    UpdateRequest request(txn, requestNs);

                    request.setQuery(o);
                    request.setUpdates(o);
                    request.setUpsert();
                    request.setFromReplication();
                    UpdateLifecycleImpl updateLifecycle(true, requestNs);
                    request.setLifecycle(&updateLifecycle);

                    update(db, request, &debug);

                    if( t.millis() >= 2 ) {
                        RARELY OCCASIONALLY log() << "warning, repl doing slow updates (no _id field) for " << ns << endl;
                    }
                }
                else {
                    // probably don't need this since all replicated colls have _id indexes now
                    // but keep it just in case
                    RARELY if ( indexCatalog
                                 && !collection->isCapped()
                                 && !indexCatalog->haveIdIndex(txn) ) {
                        try {
                            Helpers::ensureIndex(txn, collection, BSON("_id" << 1), true, "_id_");
                        }
                        catch (const DBException& e) {
                            warning() << "Ignoring error building id index on " << collection->ns()
                                      << ": " << e.toString();
                        }
                    }

                    /* todo: it may be better to do an insert here and, on a duplicate key
                             exception, fall back to an update; very few of these upserts
                             will not be plain inserts. */
                    BSONObjBuilder b;
                    b.append(_id);

                    const NamespaceString requestNs(ns);
                    UpdateRequest request(txn, requestNs);

                    request.setQuery(b.done());
                    request.setUpdates(o);
                    request.setUpsert();
                    request.setFromReplication();
                    UpdateLifecycleImpl updateLifecycle(true, requestNs);
                    request.setLifecycle(&updateLifecycle);

                    update(db, request, &debug);
                }
            }
        }
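For reference, here is a minimal sketch (not part of the example above) of the two oplog entry shapes this 'i' branch consumes, assuming the 2.6-era oplog format and omitting bookkeeping fields such as ts, h, and v:

    // Hypothetical oplog entries, built with mongo's BSON macro for illustration only.
    BSONObj plainInsert = BSON( "op" << "i"
                             << "ns" << "test.coll"
                             << "o"  << BSON( "_id" << OID::gen() << "x" << 1 ) );

    // An index build arrives as an insert into <db>.system.indexes whose payload is
    // the index spec itself; "background" selects the IndexBuilder thread path above.
    BSONObj indexBuild = BSON( "op" << "i"
                            << "ns" << "test.system.indexes"
                            << "o"  << BSON( "key" << BSON( "a" << 1 )
                                          << "name" << "a_1"
                                          << "ns" << "test.coll"
                                          << "background" << true ) );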
Example #29
0
    // Copies ops out of the bgsync queue into the deque passed in as a parameter.
    // Returns true if the batch should be ended early.
    // Batch should end early if we encounter a command, or if
    // there are no further ops in the bgsync queue to read.
    // This function also blocks 1 second waiting for new ops to appear in the bgsync
    // queue.  We can't block forever because there are maintenance things we need
    // to periodically check in the loop.
    bool SyncTail::tryPopAndWaitForMore(SyncTail::OpQueue* ops) {
        BSONObj op;
        // Check to see if there are ops waiting in the bgsync queue
        bool peek_success = peek(&op);

        if (!peek_success) {
            // if we don't have anything in the queue, wait a bit for something to appear
            if (ops->empty()) {
                // block up to 1 second
                _networkQueue->waitForMore();
                return false;
            }

            // otherwise, apply what we have
            return true;
        }

        const char* ns = op["ns"].valuestrsafe();

        // check for commands
        if ((op["op"].valuestrsafe()[0] == 'c') ||
            // Index builds are achieved through the use of an insert op, not a command op.
            // The following line is the same as what the insert code uses to detect an index build.
            ( *ns != '\0' && nsToCollectionSubstring(ns) == "system.indexes" )) {

            if (ops->empty()) {
                // apply commands one-at-a-time
                ops->push_back(op);
                _networkQueue->consume();
            }

            // otherwise, apply what we have so far and come back for the command
            return true;
        }

        // check for oplog version change
        BSONElement elemVersion = op["v"];
        int curVersion = 0;
        if (elemVersion.eoo())
            // missing version means version 1
            curVersion = 1;
        else
            curVersion = elemVersion.Int();

        if (curVersion != oplogVersion) {
            // Version changes cause us to end a batch.
            // If we are starting a new batch, reset version number
            // and continue.
            if (ops->empty()) {
                oplogVersion = curVersion;
            } 
            else {
                // End batch early
                return true;
            }
        }
    
        // Copy the op to the deque and remove it from the bgsync queue.
        ops->push_back(op);
        _networkQueue->consume();

        // Go back for more ops
        return false;
    }
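A condensed, hypothetical caller loop for the batcher above (the SyncTail instance name and the surrounding apply/OpQueue plumbing are assumed, not taken from the source):

    SyncTail::OpQueue ops;
    // Keep pulling until tryPopAndWaitForMore says the batch must end (a command was
    // seen, the oplog version changed, or there is nothing left to read while the
    // batch is non-empty).
    while (!syncTail.tryPopAndWaitForMore(&ops)) {
        // false: either an op was appended to 'ops' or we timed out with an empty
        // batch; in both cases, keep polling.
    }
    // 'ops' now holds a batch that can be applied as one unit.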
Example #30
0
    virtual bool run(OperationContext* txn,
                     const string& db,
                     BSONObj& cmdObj,
                     int options,
                     string& errmsg,
                     BSONObjBuilder& result) {
        const std::string ns = parseNs(db, cmdObj);
        if (nsToCollectionSubstring(ns).empty()) {
            errmsg = "missing collection name";
            return false;
        }
        NamespaceString nss(ns);

        // Parse the options for this request.
        auto request = AggregationRequest::parseFromBSON(nss, cmdObj);
        if (!request.isOK()) {
            return appendCommandStatus(result, request.getStatus());
        }
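        // For illustration (not in the original source): a typical command object here
        // looks roughly like
        //   { aggregate: "coll", pipeline: [ { $match: { x: 1 } } ], cursor: {} }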

        // Set up the ExpressionContext.
        intrusive_ptr<ExpressionContext> expCtx = new ExpressionContext(txn, request.getValue());
        expCtx->tempDir = storageGlobalParams.dbpath + "/_tmp";

        // Parse the pipeline.
        auto statusWithPipeline = Pipeline::parse(request.getValue().getPipeline(), expCtx);
        if (!statusWithPipeline.isOK()) {
            return appendCommandStatus(result, statusWithPipeline.getStatus());
        }
        auto pipeline = std::move(statusWithPipeline.getValue());

        auto resolvedNamespaces = resolveInvolvedNamespaces(txn, pipeline, expCtx);
        if (!resolvedNamespaces.isOK()) {
            return appendCommandStatus(result, resolvedNamespaces.getStatus());
        }
        expCtx->resolvedNamespaces = std::move(resolvedNamespaces.getValue());

        unique_ptr<ClientCursorPin> pin;  // either this OR the exec will be non-null
        unique_ptr<PlanExecutor> exec;
        auto curOp = CurOp::get(txn);
        {
            // This will throw if the sharding version for this connection is out of date. If the
            // namespace is a view, the lock will be released before re-running the aggregation.
            // Otherwise, the lock must be held continuously from now until we have created both
            // the output ClientCursor and the input executor. This ensures that both are using the
            // same sharding version that we synchronize on here. This is also why we always need to
            // create a ClientCursor even when we aren't outputting to a cursor. See the comment on
            // ShardFilterStage for more details.
            AutoGetCollectionOrViewForRead ctx(txn, nss);
            Collection* collection = ctx.getCollection();

            // If running $collStats on a view, we do not resolve the view since we want stats
            // on this view namespace.
            auto startsWithCollStats = [&pipeline]() {
                const Pipeline::SourceContainer& sources = pipeline->getSources();
                return !sources.empty() &&
                    dynamic_cast<DocumentSourceCollStats*>(sources.front().get());
            };

            // If this is a view, resolve it by finding the underlying collection and stitching view
            // pipelines and this request's pipeline together. We then release our locks before
            // recursively calling run, which will re-acquire locks on the underlying collection.
            // (The lock must be released because recursively acquiring locks on the database will
            // prohibit yielding.)
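            // For example (illustration only): a view created over "coll" with pipeline
            // [A] that is queried with pipeline [B] is re-run below as [A, B] against
            // "coll" itself.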
            auto view = ctx.getView();
            if (view && !startsWithCollStats()) {
                auto viewDefinition =
                    ViewShardingCheck::getResolvedViewIfSharded(txn, ctx.getDb(), view);
                if (!viewDefinition.isOK()) {
                    return appendCommandStatus(result, viewDefinition.getStatus());
                }

                if (!viewDefinition.getValue().isEmpty()) {
                    ViewShardingCheck::appendShardedViewStatus(viewDefinition.getValue(), &result);
                    return false;
                }

                auto resolvedView = ctx.getDb()->getViewCatalog()->resolveView(txn, nss);
                if (!resolvedView.isOK()) {
                    return appendCommandStatus(result, resolvedView.getStatus());
                }

                // With the view resolved, we can relinquish locks.
                ctx.releaseLocksForView();

                // Parse the resolved view into a new aggregation request.
                auto viewCmd =
                    resolvedView.getValue().asExpandedViewAggregation(request.getValue());
                if (!viewCmd.isOK()) {
                    return appendCommandStatus(result, viewCmd.getStatus());
                }

                bool status = this->run(txn, db, viewCmd.getValue(), options, errmsg, result);
                {
                    // Set the namespace of the curop back to the view namespace so ctx records
                    // stats on this view namespace on destruction.
                    stdx::lock_guard<Client> lk(*txn->getClient());
                    curOp->setNS_inlock(nss.ns());
                }
                return status;
            }

            // If the pipeline does not have a user-specified collation, set it from the collection
            // default.
            if (request.getValue().getCollation().isEmpty() && collection &&
                collection->getDefaultCollator()) {
                invariant(!expCtx->getCollator());
                expCtx->setCollator(collection->getDefaultCollator()->clone());
            }

            // Propagate the ExpressionContext throughout all of the pipeline's stages and
            // expressions.
            pipeline->injectExpressionContext(expCtx);

            // The pipeline must be optimized after the correct collator has been set on it (by
            // injecting the ExpressionContext containing the collator). This is necessary because
            // optimization may make string comparisons, e.g. optimizing {$eq: [<str1>, <str2>]} to
            // a constant.
            pipeline->optimizePipeline();

            if (kDebugBuild && !expCtx->isExplain && !expCtx->inShard) {
                // Make sure all operations round-trip through Pipeline::serialize() correctly by
                // re-parsing every command in debug builds. This is important because sharded
                // aggregations rely on this ability.  Skipping when inShard because this has
                // already been through the transformation (and this un-sets expCtx->inShard).
                pipeline = reparsePipeline(pipeline, request.getValue(), expCtx);
            }

            // This does mongod-specific stuff like creating the input PlanExecutor and adding
            // it to the front of the pipeline if needed.
            PipelineD::prepareCursorSource(collection, pipeline);

            // Create the PlanExecutor which returns results from the pipeline. The WorkingSet
            // ('ws') and the PipelineProxyStage ('proxy') will be owned by the created
            // PlanExecutor.
            auto ws = make_unique<WorkingSet>();
            auto proxy = make_unique<PipelineProxyStage>(txn, pipeline, ws.get());

            auto statusWithPlanExecutor = (NULL == collection)
                ? PlanExecutor::make(
                      txn, std::move(ws), std::move(proxy), nss.ns(), PlanExecutor::YIELD_MANUAL)
                : PlanExecutor::make(
                      txn, std::move(ws), std::move(proxy), collection, PlanExecutor::YIELD_MANUAL);
            invariant(statusWithPlanExecutor.isOK());
            exec = std::move(statusWithPlanExecutor.getValue());

            {
                auto planSummary = Explain::getPlanSummary(exec.get());
                stdx::lock_guard<Client> lk(*txn->getClient());
                curOp->setPlanSummary_inlock(std::move(planSummary));
            }

            if (collection) {
                PlanSummaryStats stats;
                Explain::getSummaryStats(*exec, &stats);
                collection->infoCache()->notifyOfQuery(txn, stats.indexesUsed);
            }

            if (collection) {
                const bool isAggCursor = true;  // enable special locking behavior
                ClientCursor* cursor =
                    new ClientCursor(collection->getCursorManager(),
                                     exec.release(),
                                     nss.ns(),
                                     txn->recoveryUnit()->isReadingFromMajorityCommittedSnapshot(),
                                     0,
                                     cmdObj.getOwned(),
                                     isAggCursor);
                pin.reset(new ClientCursorPin(collection->getCursorManager(), cursor->cursorid()));
                // Don't add any code between here and the start of the try block.
            }

            // At this point, it is safe to release the collection lock.
            // - In the case where we have a collection: we will need to reacquire the
            //   collection lock later when cleaning up our ClientCursorPin.
            // - In the case where we don't have a collection: our PlanExecutor won't be
            //   registered, so it will be safe to clean it up outside the lock.
            invariant(!exec || !collection);
        }

        try {
            // Unless set to true, the ClientCursor created above will be deleted on block exit.
            bool keepCursor = false;

            // Use of the aggregate command without specifying to use a cursor is deprecated.
            // Applications should migrate to using cursors. Cursors are strictly more useful than
            // outputting the results as a single document, since results that fit inside a single
            // BSONObj will also fit inside a single batch.
            //
            // We occasionally log a deprecation warning.
            if (!request.getValue().isCursorCommand()) {
                RARELY {
                    warning()
                        << "Use of the aggregate command without the 'cursor' "
                           "option is deprecated. See "
                           "http://dochub.mongodb.org/core/aggregate-without-cursor-deprecation.";
                }
            }
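            // (Callers opt in to a cursor by including e.g. { cursor: {} } or
            // { cursor: { batchSize: 100 } } in the command object; illustration only.)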

            // If both explain and cursor are specified, explain wins.
            if (expCtx->isExplain) {
                result << "stages" << Value(pipeline->writeExplainOps());
            } else if (request.getValue().isCursorCommand()) {
                keepCursor = handleCursorCommand(txn,
                                                 nss.ns(),
                                                 pin.get(),
                                                 pin ? pin->c()->getExecutor() : exec.get(),
                                                 request.getValue(),
                                                 result);
            } else {
                pipeline->run(result);
            }

            if (!expCtx->isExplain) {
                PlanSummaryStats stats;
                Explain::getSummaryStats(pin ? *pin->c()->getExecutor() : *exec.get(), &stats);
                curOp->debug().setPlanSummaryMetrics(stats);
                curOp->debug().nreturned = stats.nReturned;
            }

            // Clean up our ClientCursorPin, if needed.  We must reacquire the collection lock
            // in order to do so.
            if (pin) {
                // We acquire locks here with DBLock and CollectionLock instead of using
                // AutoGetCollectionForRead.  AutoGetCollectionForRead will throw if the
                // sharding version is out of date, and we don't care if the sharding version
                // has changed.
                Lock::DBLock dbLock(txn->lockState(), nss.db(), MODE_IS);
                Lock::CollectionLock collLock(txn->lockState(), nss.ns(), MODE_IS);
                if (keepCursor) {
                    pin->release();
                } else {
                    pin->deleteUnderlying();
                }
            }
        } catch (...) {