Example 1
    /* Get a table-scan cursor; the scan may run in either forward or reverse direction.
       order.$natural - if set, > 0 means forward (asc), < 0 means backward (desc).
    */
    shared_ptr<Cursor> findTableScan(const char *ns, const BSONObj& order, const DiskLoc &startLoc) {
        BSONElement el = order.getField("$natural"); // e.g., { $natural : -1 }

        if ( el.number() >= 0 )
            return DataFileMgr::findAll(ns, startLoc);

        // "reverse natural order"
        NamespaceDetails *d = nsdetails(ns);

        if ( !d )
            return shared_ptr<Cursor>(new BasicCursor(DiskLoc()));

        if ( !d->isCapped() ) {
            if ( !startLoc.isNull() )
                return shared_ptr<Cursor>(new ReverseCursor( startLoc ));
            Extent *e = d->lastExtent().ext();
            while ( e->lastRecord.isNull() && !e->xprev.isNull() ) {
                OCCASIONALLY out() << "  findTableScan: extent empty, skipping ahead" << endl;
                e = e->getPrevExtent();
            }
            return shared_ptr<Cursor>(new ReverseCursor( e->lastRecord ));
        }
        else {
            return shared_ptr<Cursor>( new ReverseCappedCursor( d, startLoc ) );
        }
    }
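To make the direction rule concrete: a missing $natural element makes el.number() return 0, which the ">= 0" test treats as a forward scan. A minimal standalone sketch of just that comparison (scanDirection is a hypothetical helper, not part of the source above):

    #include <iostream>

    // Stand-in for the el.number() >= 0 test above: 0 (the value seen when
    // $natural is absent) falls into the forward branch.
    const char* scanDirection(double natural) {
        return natural >= 0 ? "forward" : "reverse";
    }

    int main() {
        std::cout << scanDirection(0) << std::endl;   // no $natural -> "forward"
        std::cout << scanDirection(-1) << std::endl;  // { $natural : -1 } -> "reverse"
    }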
Example 2
    bool compact(const string& ns, string &errmsg, bool validate, BSONObjBuilder& result, double pf, int pb) {
        massert( 14028, "bad ns", NamespaceString::normal(ns.c_str()) );
        massert( 14027, "can't compact a system namespace", !str::contains(ns, ".system.") ); // items in system.indexes cannot be moved; there are pointers to those disklocs in NamespaceDetails

        bool ok;
        {
            Lock::DBWrite lk(ns);
            BackgroundOperation::assertNoBgOpInProgForNs(ns.c_str());
            Client::Context ctx(ns);
            NamespaceDetails *d = nsdetails(ns.c_str());
            massert( 13660, str::stream() << "namespace " << ns << " does not exist", d );
            massert( 13661, "cannot compact capped collection", !d->isCapped() );
            log() << "compact " << ns << " begin" << endl;
            if( pf != 0 || pb != 0 ) { 
                log() << "paddingFactor:" << pf << " paddingBytes:" << pb << endl;
            } 
            try { 
                ok = _compact(ns.c_str(), d, errmsg, validate, result, pf, pb);
            }
            catch(...) { 
                log() << "compact " << ns << " end (with error)" << endl;
                throw;
            }
            log() << "compact " << ns << " end" << endl;
        }
        return ok;
    }
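The function's begin/end logging discipline is worth noting: the "end" marker is emitted on both the success and the error path, and the exception is rethrown so callers still observe the failure. A minimal standalone sketch, with doWork as a hypothetical stand-in for _compact():

    #include <iostream>
    #include <string>

    bool doWork() { return true; }  // hypothetical stand-in for _compact(...)

    bool runWithEndLog(const std::string& ns) {
        std::cout << "compact " << ns << " begin" << std::endl;
        try {
            bool ok = doWork();
            std::cout << "compact " << ns << " end" << std::endl;
            return ok;
        }
        catch (...) {
            // still emit the end marker, then let the exception propagate
            std::cout << "compact " << ns << " end (with error)" << std::endl;
            throw;
        }
    }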
Example 3
    BSONObj Sync::getMissingDoc(const BSONObj& o) {
        OplogReader missingObjReader;
        const char *ns = o.getStringField("ns");

        // capped collections
        NamespaceDetails *nsd = nsdetails(ns);
        if ( nsd && nsd->isCapped() ) {
            log() << "replication missing doc, but this is okay for a capped collection (" << ns << ")" << endl;
            return BSONObj();
        }

        uassert(15916, str::stream() << "Can no longer connect to initial sync source: " << hn, missingObjReader.connect(hn));

        // might be more than just _id in the update criteria
        BSONObj query = BSONObjBuilder().append(o.getObjectField("o2")["_id"]).obj();
        BSONObj missingObj;
        try {
            missingObj = missingObjReader.findOne(ns, query);
        } catch(DBException& e) {
            log() << "replication assertion fetching missing object: " << e.what() << endl;
            throw;
        }

        return missingObj;
    }
Example 4
    /** @param fromRepl false if from ApplyOpsCmd
        @return true if it was an update that should have happened but the document does not exist (DNE); see replset initial sync code.
     */
    bool applyOperation_inlock(const BSONObj& op, bool fromRepl, bool convertUpdateToUpsert) {
        LOG(6) << "applying op: " << op << endl;
        bool failedUpdate = false;

        OpCounters * opCounters = fromRepl ? &replOpCounters : &globalOpCounters;

        const char *names[] = { "o", "ns", "op", "b" };
        BSONElement fields[4];
        op.getFields(4, names, fields);

        BSONObj o;
        if( fields[0].isABSONObj() )
            o = fields[0].embeddedObject();
            
        const char *ns = fields[1].valuestrsafe();

        Lock::assertWriteLocked(ns);

        NamespaceDetails *nsd = nsdetails(ns);

        // operation type -- see logOp() comments for types
        const char *opType = fields[2].valuestrsafe();

        if ( *opType == 'i' ) {
            opCounters->gotInsert();

            const char *p = strchr(ns, '.');
            if ( p && strcmp(p, ".system.indexes") == 0 ) {
                // updates aren't allowed for indexes -- so we will do a regular insert. if index already
                // exists, that is ok.
                theDataFileMgr.insert(ns, (void*) o.objdata(), o.objsize());
            }
            else {
                // do upserts for inserts as we might get replayed more than once
                OpDebug debug;
                BSONElement _id;
                if( !o.getObjectID(_id) ) {
                    /* No _id.  This will be very slow. */
                    Timer t;
                    updateObjectsForReplication(ns, o, o, true, false, false, debug, false,
                                                QueryPlanSelectionPolicy::idElseNatural() );
                    if( t.millis() >= 2 ) {
                        RARELY OCCASIONALLY log() << "warning, repl doing slow updates (no _id field) for " << ns << endl;
                    }
                }
                else {
                    // probably don't need this since all replicated colls have _id indexes now
                    // but keep it just in case
                    RARELY if ( nsd && !nsd->isCapped() ) { ensureHaveIdIndex(ns); }

                    /* todo : it may be better to do an insert here, and then catch the dup key exception and do update
                              then.  very few upserts will not be inserts...
                              */
                    BSONObjBuilder b;
                    b.append(_id);
                    updateObjectsForReplication(ns, o, b.done(), true, false, false , debug, false,
                                                QueryPlanSelectionPolicy::idElseNatural() );
                }
            }
        }
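For reference, a representative oplog entry this insert path would apply; the field names match the names[] array above, while the namespace and document values are purely illustrative:

    { "op" : "i", "ns" : "test.users", "o" : { "_id" : 1, "name" : "alice" } }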
Example 5
        void operator()(DBClientCursorBatchIterator &i) {
            const string to_dbname = nsToDatabase(to_collection);
            while (i.moreInCurrentBatch()) {
                if (n % 128 == 127) {
                    time_t now = time(0);
                    if (now - lastLog >= 60) { 
                        // report progress
                        if (lastLog) {
                            log() << "clone " << to_collection << ' ' << n << endl;
                        }
                        lastLog = now;
                    }
                    mayInterrupt(_mayBeInterrupted);
                }

                BSONObj js = i.nextSafe();
                ++n;

                if (isindex) {
                    verify(strstr(from_collection, "system.indexes"));
                    storedForLater->push_back(fixindex(js, to_dbname).getOwned());
                }
                else {
                    try {
                        Client::ReadContext ctx(to_collection);
                        if (_isCapped) {
                            NamespaceDetails *d = nsdetails(to_collection);
                            verify(d->isCapped());
                            BSONObj pk = js["$_"].Obj();
                            BSONObjBuilder rowBuilder;
                            BSONObjIterator it(js);
                            while (it.moreWithEOO()) {
                                BSONElement e = it.next();
                                if (e.eoo()) {
                                    break;
                                }
                                if (!mongoutils::str::equals(e.fieldName(), "$_")) {
                                    rowBuilder.append(e);
                                }
                            }
                            BSONObj row = rowBuilder.obj();
                            d->insertObjectIntoCappedWithPK(pk, row, NamespaceDetails::NO_LOCKTREE);
                        }
                        else {
                            insertObject(to_collection, js, 0, logForRepl);
                        }
                    }
                    catch (UserException& e) {
                        error() << "error: exception cloning object in " << from_collection << ' ' << e.what() << " obj:" << js.toString() << '\n';
                        throw;
                    }

                    RARELY if ( time( 0 ) - saveLast > 60 ) {
                        log() << n << " objects cloned so far from collection " << from_collection << endl;
                        saveLast = time( 0 );
                    }
                }
            }
        }
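The capped-collection branch copies every field except the reserved "$_" key, which carries the row's primary key rather than row data. A standalone sketch of that copy-all-but-one loop, using a std::map in place of BSON iteration:

    #include <map>
    #include <string>

    std::map<std::string, int> stripReserved(const std::map<std::string, int>& doc) {
        std::map<std::string, int> row;
        for (const auto& kv : doc) {
            if (kv.first != "$_")   // skip the PK meta-field, keep everything else
                row.insert(kv);
        }
        return row;
    }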
Example 6
NamespaceDetails* NamespaceIndex::details(const Namespace& ns) {
    if ( !_ht )
        return 0;
    NamespaceDetails *d = _ht->get(ns);
    if ( d && d->isCapped() )
        d->cappedCheckMigrate();
    return d;
}
Example 7
    BSONObj Sync::getMissingDoc(const BSONObj& o) {
        OplogReader missingObjReader; // why are we using OplogReader to run a non-oplog query?
        const char *ns = o.getStringField("ns");

        // capped collections
        NamespaceDetails *nsd = nsdetails(ns);
        if ( nsd && nsd->isCapped() ) {
            log() << "replication missing doc, but this is okay for a capped collection (" << ns << ")" << endl;
            return BSONObj();
        }

        const int retryMax = 3;
        for (int retryCount = 1; retryCount <= retryMax; ++retryCount) {
            if (retryCount != 1) {
                // if we are retrying, sleep a bit to let the network possibly recover
                sleepsecs(retryCount * retryCount);
            }
            try {
                bool ok = missingObjReader.connect(hn);
                if (!ok) {
                    warning() << "network problem detected while connecting to the "
                              << "sync source, attempt " << retryCount << " of "
                              << retryMax << endl;
                    continue;  // try again
                }
            } 
            catch (const SocketException&) {
                warning() << "network problem detected while connecting to the "
                          << "sync source, attempt " << retryCount << " of "
                          << retryMax << endl;
                continue; // try again
            }

            // might be more than just _id in the update criteria
            BSONObj query = BSONObjBuilder().append(o.getObjectField("o2")["_id"]).obj();
            BSONObj missingObj;
            try {
                missingObj = missingObjReader.findOne(ns, query);
            } 
            catch (const SocketException&) {
                warning() << "network problem detected while fetching a missing document from the "
                          << "sync source, attempt " << retryCount << " of "
                          << retryMax << endl;
                continue; // try again
            } 
            catch (DBException& e) {
                log() << "replication assertion fetching missing object: " << e.what() << endl;
                throw;
            }

            // success!
            return missingObj;
        }
        // retry count exceeded
        msgasserted(15916, 
                    str::stream() << "Can no longer connect to initial sync source: " << hn);
    }
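The retry loop above is a quadratic-backoff pattern: up to retryMax attempts, sleeping retryCount^2 seconds before each retry (4s before the second attempt, 9s before the third). A self-contained sketch, with attemptFetch as a hypothetical stand-in for connect() plus findOne():

    #include <chrono>
    #include <stdexcept>
    #include <thread>

    bool attemptFetch() { return true; }  // stand-in; false models a transient network failure

    void fetchWithRetry() {
        const int retryMax = 3;
        for (int retryCount = 1; retryCount <= retryMax; ++retryCount) {
            if (retryCount != 1) {
                // let the network possibly recover before retrying
                std::this_thread::sleep_for(std::chrono::seconds(retryCount * retryCount));
            }
            if (attemptFetch())
                return;  // success
        }
        throw std::runtime_error("retry count exceeded");  // mirrors msgasserted(15916, ...)
    }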
Example 8
    // prefetch for an oplog operation
    void prefetchPagesForReplicatedOp(const BSONObj& op) {
        const char *opField;
        const char *opType = op.getStringField("op");
        switch (*opType) {
        case 'i': // insert
        case 'd': // delete
            opField = "o";
            break;
        case 'u': // update
            opField = "o2";
            break;
        default:
            // prefetch ignores other ops
            return;
        }
         
        BSONObj obj = op.getObjectField(opField);
        const char *ns = op.getStringField("ns");
        NamespaceDetails *nsd = nsdetails(ns);
        if (!nsd) return; // maybe not opened yet
        
        LOG(4) << "index prefetch for op " << *opType << endl;

        DEV Lock::assertAtLeastReadLocked(ns);

        // should we prefetch index pages on updates? if the update is in-place and doesn't change 
        // indexed values, it is actually slower - a lot slower if there are a dozen indexes or 
        // lots of multikeys.  possible variations (not all mutually exclusive):
        //  1) current behavior: full prefetch
        //  2) don't do it for updates
        //  3) don't do multikey indexes for updates
        //  4) don't prefetchIndexPages on some heuristic; e.g., if it's an $inc.
        //  5) if not prefetching index pages (#2), we should do it if we are upserting and it
        //     will be an insert. to do that we could do the prefetchRecordPage first and if DNE
        //     then we do #1.
        // 
        // note that on deletes 'obj' does not have all the keys we would want to prefetch on.
        // a way to achieve that would be to prefetch the record first, and then afterwards do 
        // this part.
        //
        prefetchIndexPages(nsd, obj);

        // do not prefetch the data for inserts; it doesn't exist yet
        // 
        // we should consider doing the record prefetch for the delete op case as we hit the record
        // when we delete.  note if done we only want to touch the first page.
        // 
        // update: do record prefetch. 
        if ((*opType == 'u') &&
            // do not prefetch the data for capped collections because
            // they typically do not have an _id index for findById() to use.
            !nsd->isCapped()) {
            prefetchRecordPages(ns, obj);
        }
    }
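The dispatch at the top of the function reduces to a small lookup: inserts and deletes carry the relevant document in "o", updates carry their match criteria in "o2", and every other op type is ignored. A standalone restatement:

    // Returns the oplog field holding the document to prefetch on,
    // or nullptr when prefetch ignores the op type.
    const char* fieldForOpType(char opType) {
        switch (opType) {
        case 'i':  // insert
        case 'd':  // delete
            return "o";
        case 'u':  // update
            return "o2";
        default:
            return nullptr;
        }
    }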
Example 9
        virtual bool run(const string& db, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
            string coll = cmdObj.firstElement().valuestr();
            if( coll.empty() || db.empty() ) {
                errmsg = "no collection name specified";
                return false;
            }

            if( isCurrentlyAReplSetPrimary() && !cmdObj["force"].trueValue() ) { 
                errmsg = "will not run compact on an active replica set primary as this is a slow blocking operation. use force:true to force";
                return false;
            }
            
            string ns = db + '.' + coll;
            if ( ! NamespaceString::normal(ns.c_str()) ) {
                errmsg = "bad namespace name";
                return false;
            }
            
            // parameter validation to avoid triggering assertions in compact()
            if ( str::contains(ns, ".system.") ) {
                errmsg = "can't compact a system namespace";
                return false;
            }
            
            {
                Lock::DBWrite lk(ns);
                Client::Context ctx(ns);
                NamespaceDetails *d = nsdetails(ns.c_str());
                if( ! d ) {
                    errmsg = "namespace does not exist";
                    return false;
                }

                if ( d->isCapped() ) {
                    errmsg = "cannot compact a capped collection";
                    return false;
                }
            }

            double pf = 1.0;
            int pb = 0;
            if( cmdObj.hasElement("paddingFactor") ) {
                pf = cmdObj["paddingFactor"].Number();
                verify( pf >= 1.0 && pf <= 4.0 );
            }
            if( cmdObj.hasElement("paddingBytes") ) {
                pb = (int) cmdObj["paddingBytes"].Number();
                verify( pb >= 0 && pb <= 1024 * 1024 );
            }

            bool validate = !cmdObj.hasElement("validate") || cmdObj["validate"].trueValue(); // default is true at the moment
            bool ok = compact(ns, errmsg, validate, result, pf, pb);
            return ok;
        }
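An illustrative command document this handler accepts; the field names come from the cmdObj lookups above, the values are examples only, and the verify() calls bound paddingFactor to [1.0, 4.0] and paddingBytes to [0, 1 MB]:

    { compact: "users", force: false, paddingFactor: 1.5, paddingBytes: 0, validate: true }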
Example 10
    void insertObjects(const char *ns, const vector<BSONObj> &objs, bool keepGoing, uint64_t flags, bool logop ) {
        if (mongoutils::str::contains(ns, "system.")) {
            massert(16748, "need transaction to run insertObjects", cc().txnStackSize() > 0);
            uassert(10095, "attempt to insert in reserved database name 'system'", !mongoutils::str::startsWith(ns, "system."));
            massert(16750, "attempted to insert multiple objects into a system namespace at once", objs.size() == 1);
            if (handle_system_collection_insert(ns, objs[0], logop) != 0) {
                return;
            }
        }

        NamespaceDetails *details = getAndMaybeCreateNS(ns, logop);
        NamespaceDetailsTransient *nsdt = &NamespaceDetailsTransient::get(ns);
        for (size_t i = 0; i < objs.size(); i++) {
            const BSONObj &obj = objs[i];
            try {
                uassert( 10059 , "object to insert too large", obj.objsize() <= BSONObjMaxUserSize);
                BSONObjIterator it( obj );
                while ( it.more() ) {
                    BSONElement e = it.next();
                    uassert( 13511 , "document to insert can't have $ fields" , e.fieldName()[0] != '$' );
                }
                uassert( 16440 ,  "_id cannot be an array", obj["_id"].type() != Array );

                BSONObj objModified = obj;
                BSONElementManipulator::lookForTimestamps(objModified);
                if (details->isCapped() && logop) {
                    // unfortunate hack we need for capped collections
                    // we do this because the logic for generating the pk
                    // and what subsequent rows to delete are buried in the
                    // namespace details object. There is probably a nicer way
                    // to do this, but this works.
                    details->insertObjectIntoCappedAndLogOps(objModified, flags);
                    if (nsdt != NULL) {
                        nsdt->notifyOfWriteOp();
                    }
                }
                else {
                    insertOneObject(details, nsdt, objModified, flags); // may add _id field
                    if (logop) {
                        OpLogHelpers::logInsert(ns, objModified, &cc().txn());
                    }
                }
            } catch (const UserException &) {
                if (!keepGoing || i == objs.size() - 1) {
                    throw;
                }
            }
        }
    }
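The catch block implements the keepGoing contract: per-document failures are swallowed and the loop continues, except for the last document (or when keepGoing is off), where the exception is rethrown so the caller sees it. A self-contained sketch with a hypothetical insertOne:

    #include <cstddef>
    #include <stdexcept>
    #include <vector>

    void insertOne(int /*doc*/) {}  // stand-in; may throw on a bad document

    void insertMany(const std::vector<int>& docs, bool keepGoing) {
        for (std::size_t i = 0; i < docs.size(); i++) {
            try {
                insertOne(docs[i]);
            }
            catch (const std::exception&) {
                if (!keepGoing || i == docs.size() - 1)
                    throw;  // surface the failure; earlier ones were skipped
            }
        }
    }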
Example 11
    shared_ptr<Cursor> DataFileMgr::findAll(const StringData& ns, const DiskLoc &startLoc) {
        NamespaceDetails * d = nsdetails( ns );
        if ( ! d )
            return shared_ptr<Cursor>(new BasicCursor(DiskLoc()));

        DiskLoc loc = d->firstExtent();
        if ( loc.isNull() )
            return shared_ptr<Cursor>(new BasicCursor(DiskLoc()));
        Extent *e = getExtent(loc);

        DEBUGGING {
            out() << "listing extents for " << ns << endl;
            DiskLoc tmp = loc;
            set<DiskLoc> extents;

            while ( 1 ) {
                Extent *f = getExtent(tmp);
                out() << "extent: " << tmp.toString() << endl;
                extents.insert(tmp);
                tmp = f->xnext;
                if ( tmp.isNull() )
                    break;
            }

            out() << endl;
            d->dumpDeleted(&extents);
        }

        if ( d->isCapped() )
            return shared_ptr<Cursor>( ForwardCappedCursor::make( d , startLoc ) );

        if ( !startLoc.isNull() )
            return shared_ptr<Cursor>(new BasicCursor( startLoc ));

        while ( e->firstRecord.isNull() && !e->xnext.isNull() ) {
            /* todo: if extent is empty, free it for reuse elsewhere.
               that is a bit complicated; we have to clean up the freelists.
            */
            RARELY out() << "info DFM::findAll(): extent " << loc.toString() << " was empty, skipping ahead. ns:" << ns << endl;
            // find a nonempty extent
            // it might be nice to free the whole extent here!  but have to clean up free recs then.
            e = e->getNextExtent();
        }
        return shared_ptr<Cursor>(new BasicCursor( e->firstRecord ));
    }
Example 12
    // Does not check magic system collection inserts.
    void _insertObjects(const char *ns, const vector<BSONObj> &objs, bool keepGoing, uint64_t flags, bool logop ) {
        NamespaceDetails *details = getAndMaybeCreateNS(ns, logop);
        for (size_t i = 0; i < objs.size(); i++) {
            const BSONObj &obj = objs[i];
            try {
                uassert( 10059 , "object to insert too large", obj.objsize() <= BSONObjMaxUserSize);
                BSONObjIterator it( obj );
                while ( it.more() ) {
                    BSONElement e = it.next();
                    // check no $ modifiers.  note we only check top level.
                    // (scanning deep would be quite expensive)
                    uassert( 13511 , "document to insert can't have $ fields" , e.fieldName()[0] != '$' );

                    // check no regexp for _id (SERVER-9502)
                    if (str::equals(e.fieldName(), "_id")) {
                        uassert(17033, "can't use a regex for _id", e.type() != RegEx);
                    }
                }
                uassert( 16440 ,  "_id cannot be an array", obj["_id"].type() != Array );

                BSONObj objModified = obj;
                BSONElementManipulator::lookForTimestamps(objModified);
                if (details->isCapped() && logop) {
                    // unfortunate hack we need for capped collections
                    // we do this because the logic for generating the pk
                    // and what subsequent rows to delete are buried in the
                    // namespace details object. There is probably a nicer way
                    // to do this, but this works.
                    details->insertObjectIntoCappedAndLogOps(objModified, flags);
                    details->notifyOfWriteOp();
                }
                else {
                    insertOneObject(details, objModified, flags); // may add _id field
                    if (logop) {
                        OpLogHelpers::logInsert(ns, objModified);
                    }
                }
            } catch (const UserException &) {
                if (!keepGoing || i == objs.size() - 1) {
                    throw;
                }
            }
        }
    }
Example 13
    NamespaceDetails* getOrCreateProfileCollection(Database *db, bool force, string* errmsg ) {
        fassert(16372, db);
        const char* profileName = db->profileName.c_str();
        NamespaceDetails* details = db->namespaceIndex.details(profileName);
        if (!details && (cmdLine.defaultProfile || force)) {
            // system.profile namespace doesn't exist; create it
            log() << "creating profile collection: " << profileName << endl;
            string myerrmsg;
            if (!userCreateNS(db->profileName.c_str(),
                              BSON("capped" << true << "size" << 1024 * 1024), myerrmsg , false)) {
                myerrmsg = str::stream() << "could not create ns " << db->profileName << ": " << myerrmsg;
                log() << myerrmsg << endl;
                if ( errmsg )
                    *errmsg = myerrmsg;
                return NULL;
            }
            details = db->namespaceIndex.details(profileName);
        }
        else if ( details && !details->isCapped() ) {
            string myerrmsg = str::stream() << profileName << " exists but isn't capped";
            log() << myerrmsg << endl;
            if ( errmsg )
                *errmsg = myerrmsg;
            return NULL;
        }

        if (!details) {
            // failed to get or create profile collection
            static time_t last = time(0) - 10;  // warn the first time
            if( time(0) > last+10 ) {
                log() << "profile: warning ns " << profileName << " does not exist" << endl;
                last = time(0);
            }
        }
        return details;
    }
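The trailing warning uses a small rate-limiting idiom: a static timestamp gates the log line to at most once every 10 seconds. A minimal standalone sketch of the same idiom:

    #include <ctime>
    #include <iostream>

    void warnAtMostEvery10s(const char* msg) {
        static std::time_t last = 0;  // seeded so the first call warns
        if (std::time(0) > last + 10) {
            std::cout << msg << std::endl;
            last = std::time(0);
        }
    }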
Example 14
/* ns:      namespace, e.g. <database>.<collection>
   pattern: the "where" clause / criteria
   justOne: stop after 1 match
   god:     allow access to system namespaces, and don't yield
*/
long long deleteObjects(const char *ns, BSONObj pattern, bool justOne, bool logop, bool god, RemoveSaver * rs ) {
    if( !god ) {
        if ( strstr(ns, ".system.") ) {
            /* note a delete from system.indexes would corrupt the db
            if done here, as there are pointers into those objects in
            NamespaceDetails.
            */
            uassert(12050, "cannot delete from system namespace", legalClientSystemNS( ns , true ) );
        }
        if ( strchr( ns , '$' ) ) {
            log() << "cannot delete from collection with reserved $ in name: " << ns << endl;
            uassert( 10100 ,  "cannot delete from collection with reserved $ in name", strchr(ns, '$') == 0 );
        }
    }

    {
        NamespaceDetails *d = nsdetails( ns );
        if ( ! d )
            return 0;
        uassert( 10101 ,  "can't remove from a capped collection" , ! d->isCapped() );
    }

    long long nDeleted = 0;

    shared_ptr< Cursor > creal = NamespaceDetailsTransient::getCursor( ns, pattern );

    if( !creal->ok() )
        return nDeleted;

    shared_ptr< Cursor > cPtr = creal;
    auto_ptr<ClientCursor> cc( new ClientCursor( QueryOption_NoCursorTimeout, cPtr, ns) );
    cc->setDoingDeletes( true );

    CursorId id = cc->cursorid();

    bool canYield = !god && !(creal->matcher() && creal->matcher()->docMatcher().atomic());

    do {
        // TODO: we can generalize this I believe
        //
        bool willNeedRecord = (creal->matcher() && creal->matcher()->needRecord()) || pattern.isEmpty() || isSimpleIdQuery( pattern );
        if ( ! willNeedRecord ) {
            // TODO: this is a total hack right now
            // check if the index fully encompasses the query

            if ( pattern.nFields() == 1 &&
                    str::equals( pattern.firstElement().fieldName() , creal->indexKeyPattern().firstElement().fieldName() ) )
                willNeedRecord = true;
        }

        if ( canYield && ! cc->yieldSometimes( willNeedRecord ? ClientCursor::WillNeed : ClientCursor::MaybeCovered ) ) {
            cc.release(); // has already been deleted elsewhere
            // TODO should we assert or something?
            break;
        }
        if ( !cc->ok() ) {
            break; // if we yielded, could have hit the end
        }

        // this way we avoid calling prepareToYield() every time (which is expensive),
        // and some other nuances are handled as well
        cc->setDoingDeletes( true );

        DiskLoc rloc = cc->currLoc();
        BSONObj key = cc->currKey();

        bool match = creal->currentMatches();

        cc->advance();

        if ( ! match )
            continue;

        // SERVER-5198 Advance past the document to be modified, but see SERVER-5725.
        while( cc->ok() && rloc == cc->currLoc() ) {
            cc->advance();
        }

        bool foundAllResults = ( justOne || !cc->ok() );

        if ( !foundAllResults ) {
            // NOTE: Saving and restoring a btree cursor's position was historically described
            // as slow here.
            cc->c()->prepareToTouchEarlierIterate();
        }

        if ( logop ) {
            BSONElement e;
            if( BSONObj::make( rloc.rec() ).getObjectID( e ) ) {
                BSONObjBuilder b;
                b.append( e );
                bool replJustOne = true;
                logOp( "d", ns, b.done(), 0, &replJustOne );
            }
            else {
                problem() << "deleted object without id, not logging" << endl;
            }
        }

        theDataFileMgr.deleteRecord(ns, rloc.rec(), rloc);
        nDeleted++;
        if ( foundAllResults ) {
            break;
        }
        cc->c()->recoverFromTouchingEarlierIterate();

        if( !god )
            getDur().commitIfNeeded();

        if( debug && god && nDeleted == 100 )
            log() << "warning high number of deletes with god=true which could use significant memory" << endl;
    }
    while ( cc->ok() );

    if ( cc.get() && ClientCursor::find( id , false ) == 0 ) {
        // TODO: remove this and the id declaration above if this doesn't trigger
        //       if it does, then i'm very confused (ERH 06/2011)
        error() << "this should be impossible" << endl;
        printStackTrace();
        cc.release();
    }

    return nDeleted;
}
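Note how the loop advances the cursor past the current document before deleting it (the SERVER-5198 comment), so the deletion cannot invalidate the position the scan resumes from. A standalone sketch of the same discipline on a std::list:

    #include <list>

    void deleteMatches(std::list<int>& docs, int target, bool justOne) {
        for (auto it = docs.begin(); it != docs.end(); ) {
            auto cur = it++;      // advance first, like cc->advance()
            if (*cur != target)
                continue;
            docs.erase(cur);      // safe: 'it' already points past 'cur'
            if (justOne)
                break;
        }
    }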
Example 15
    /** @param fromRepl false if from ApplyOpsCmd
        @return true if it was an update that should have happened but the document does not exist (DNE); see replset initial sync code.
     */
    bool applyOperation_inlock(const BSONObj& op, bool fromRepl, bool convertUpdateToUpsert) {
        LOG(3) << "applying op: " << op << endl;
        bool failedUpdate = false;

        OpCounters * opCounters = fromRepl ? &replOpCounters : &globalOpCounters;

        const char *names[] = { "o", "ns", "op", "b" };
        BSONElement fields[4];
        op.getFields(4, names, fields);

        BSONObj o;
        if( fields[0].isABSONObj() )
            o = fields[0].embeddedObject();
            
        const char *ns = fields[1].valuestrsafe();

        Lock::assertWriteLocked(ns);

        NamespaceDetails *nsd = nsdetails(ns);

        // operation type -- see logOp() comments for types
        const char *opType = fields[2].valuestrsafe();

        if ( *opType == 'i' ) {
            opCounters->gotInsert();

            const char *p = strchr(ns, '.');
            if ( p && strcmp(p, ".system.indexes") == 0 ) {
                if (o["background"].trueValue()) {
                    IndexBuilder* builder = new IndexBuilder(ns, o);
                    // This spawns a new thread and returns immediately.
                    builder->go();
                }
                else {
                    IndexBuilder builder(ns, o);
                    // Finish the foreground build before returning
                    builder.build();
                }
            }
            else {
                // do upserts for inserts as we might get replayed more than once
                OpDebug debug;
                BSONElement _id;
                if( !o.getObjectID(_id) ) {
                    /* No _id.  This will be very slow. */
                    Timer t;

                    const NamespaceString requestNs(ns);
                    UpdateRequest request(
                        requestNs, debug,
                        QueryPlanSelectionPolicy::idElseNatural());

                    request.setQuery(o);
                    request.setUpdates(o);
                    request.setUpsert();
                    request.setFromReplication();

                    update(request);

                    if( t.millis() >= 2 ) {
                        RARELY OCCASIONALLY log() << "warning, repl doing slow updates (no _id field) for " << ns << endl;
                    }
                }
                else {
                    // probably don't need this since all replicated colls have _id indexes now
                    // but keep it just in case
                    RARELY if ( nsd && !nsd->isCapped() ) { ensureHaveIdIndex(ns, false); }

                    /* todo : it may be better to do an insert here, and then catch the dup key exception and do update
                              then.  very few upserts will not be inserts...
                              */
                    BSONObjBuilder b;
                    b.append(_id);

                    const NamespaceString requestNs(ns);
                    UpdateRequest request(
                        requestNs, debug,
                        QueryPlanSelectionPolicy::idElseNatural());

                    request.setQuery(b.done());
                    request.setUpdates(o);
                    request.setUpsert();
                    request.setFromReplication();

                    update(request);
                }
            }
        }
Example 16
    DiskLoc DataFileMgr::insert(const char* ns,
                                const void* obuf,
                                int32_t len,
                                bool mayInterrupt,
                                bool god,
                                bool mayAddIndex,
                                bool* addedID) {

        Database* database = cc().database();

        bool wouldAddIndex = false;
        massert( 10093 , "cannot insert into reserved $ collection", god || NamespaceString::normal( ns ) );
        uassert( 10094 , str::stream() << "invalid ns: " << ns , isValidNS( ns ) );
        {
            const char *sys = strstr(ns, "system.");
            if ( sys && !insert_checkSys(sys, ns, wouldAddIndex, obuf, god) )
                return DiskLoc();
        }
        bool addIndex = wouldAddIndex && mayAddIndex;

        Collection* collection = database->getCollection( ns );
        if ( collection == NULL ) {
            collection = database->createCollection( ns, false, NULL );

            int ies = Extent::initialSize(len);
            if( str::contains(ns, '$') &&
                len + Record::HeaderSize >= BtreeData_V1::BucketSize - 256 &&
                len + Record::HeaderSize <= BtreeData_V1::BucketSize + 256 ) {
                // probably an index.  so we pick a value here for the first extent instead of using
                // initialExtentSize() which is more for user collections.
                // TODO: we could look at the # of records in the parent collection to be smarter here.
                ies = (32+4) * 1024;
            }
            collection->increaseStorageSize( ies, false);
            if ( !god )
                ensureIdIndexForNewNs(ns);
        }

        NamespaceDetails* d = collection->details();

        string tabletoidxns;
        Collection* collectionToIndex = 0;
        NamespaceDetails* tableToIndex = 0;

        BSONObj fixedIndexObject;
        if ( addIndex ) {
            verify( obuf );
            BSONObj io((const char *) obuf);

            tabletoidxns = io.getStringField( "ns" );
            uassert(10096, "invalid ns to index", tabletoidxns.find( '.' ) != string::npos);
            massert(10097,
                    str::stream() << "trying to create index on wrong db "
                    << " db: " << database->name() << " collection: " << tabletoidxns,
                    database->ownsNS( tabletoidxns ) );

            collectionToIndex = database->getCollection( tabletoidxns );
            if ( !collectionToIndex ) {
                collectionToIndex = database->createCollection( tabletoidxns, false, NULL );
                verify( collectionToIndex );
                if ( !god )
                    ensureIdIndexForNewNs( tabletoidxns.c_str() );
            }

            tableToIndex = collectionToIndex->details();

            Status status = collectionToIndex->getIndexCatalog()->okToAddIndex( io );
            if ( status.code() == ErrorCodes::IndexAlreadyExists ) {
                // dup index, we ignore
                return DiskLoc();
            }

            uassert( 17199,
                     str::stream() << "cannot build index on " << tabletoidxns
                     << " because of " << status.toString(),
                     status.isOK() );

            if( !prepareToBuildIndex(io,
                                     mayInterrupt,
                                     god,
                                     tabletoidxns ) ) {
                // prepare creates _id itself, or this indicates to fail the build silently (such 
                // as if index already exists)
                return DiskLoc();
            }

            fixedIndexObject = IndexCatalog::fixIndexSpec( io );

            obuf = fixedIndexObject.objdata();
            len = fixedIndexObject.objsize();
        }

        IDToInsert idToInsert; // only initialized if needed

        if( !god ) {
            /* Check if we have an _id field. If we don't, we'll add it.
               Note that btree buckets which we insert aren't BSONObj's, but in that case god==true.
            */
            BSONObj io((const char *) obuf);
            BSONElement idField = io.getField( "_id" );
            uassert( 10099 ,  "_id cannot be an array", idField.type() != Array );
            // we don't add _id for capped collections in local as they don't have an _id index
            if( idField.eoo() &&
                !wouldAddIndex &&
                nsToDatabase( ns ) != "local" &&
                d->haveIdIndex() ) {

                if( addedID )
                    *addedID = true;

                idToInsert.init();
                len += idToInsert.size();
            }

            BSONElementManipulator::lookForTimestamps( io );
        }

        int lenWHdr = d->getRecordAllocationSize( len + Record::HeaderSize );
        fassert( 16440, lenWHdr >= ( len + Record::HeaderSize ) );

        // If the collection is capped, check if the new object will violate a unique index
        // constraint before allocating space.
        if ( d->isCapped() && !god) {
            BSONObj temp = BSONObj( reinterpret_cast<const char *>( obuf ) );
            Status ret = collection->getIndexCatalog()->checkNoIndexConflicts( temp );
            uassert(12582, "duplicate key insert for unique index of capped collection", ret.isOK() );
        }

        DiskLoc loc = allocateSpaceForANewRecord(ns, d, lenWHdr, god);

        if ( loc.isNull() ) {
            log() << "insert: couldn't alloc space for object ns:" << ns
                  << " capped:" << d->isCapped() << endl;
            verify(d->isCapped());
            return DiskLoc();
        }

        Record *r = loc.rec();
        {
            verify( r->lengthWithHeaders() >= lenWHdr );
            r = (Record*) getDur().writingPtr(r, lenWHdr);
            if( idToInsert.needed() ) {
                /* a little effort was made here to avoid a double copy when we add an ID */
                int originalSize = *((int*) obuf);
                ((int&)*r->data()) = originalSize + idToInsert.size();
                memcpy(r->data()+4, idToInsert.rawdata(), idToInsert.size());
                memcpy(r->data()+4+idToInsert.size(), ((char*)obuf)+4, originalSize-4);
            }
            else {
                if( obuf ) // obuf can be null from internal callers
                    memcpy(r->data(), obuf, len);
            }
        }

        addRecordToRecListInExtent(r, loc);

        d->incrementStats( r->netLength(), 1 );

        // we don't bother resetting query optimizer stats for the god tables - also god is true when adding a btree bucket
        if ( !god )
            collection->infoCache()->notifyOfWriteOp();

        if ( tableToIndex ) {
            insert_makeIndex(collectionToIndex, loc, mayInterrupt);
        }

        /* add this record to our indexes */
        if ( d->getTotalIndexCount() > 0 ) {
            try {
                BSONObj obj(r->data());
                collection->getIndexCatalog()->indexRecord(obj, loc);
            }
            catch( AssertionException& e ) {
                // should be a dup key error on _id index
                if( tableToIndex || d->isCapped() ) {
                    massert( 12583, "unexpected index insertion failure on capped collection", !d->isCapped() );
                    string s = e.toString();
                    s += " : on addIndex/capped - collection and its index will not match";
                    setLastError(0, s.c_str());
                    error() << s << endl;
                }
                else {
                    // normal case -- we can roll back
                    _deleteRecord(d, ns, r, loc);
                    throw;
                }
            }
        }

        d->paddingFits();

        return loc;
    }
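The _id splice near the end relies on the BSON layout: a 4-byte little-endian total length followed by the elements. A hedged standalone sketch of that single-copy insertion of a new first element (it assumes little-endian storage, as the original code does):

    #include <cstring>
    #include <vector>

    // Insert 'elt' (eltLen bytes) as the first element of the BSON object at
    // 'obj': write the patched total length, then the new element, then the
    // original bytes that follow the original 4-byte length prefix.
    std::vector<char> spliceFirstElement(const char* obj, const char* elt, int eltLen) {
        int originalSize;
        std::memcpy(&originalSize, obj, 4);        // original total length
        int newSize = originalSize + eltLen;
        std::vector<char> out(newSize);
        std::memcpy(out.data(), &newSize, 4);      // patched total length
        std::memcpy(out.data() + 4, elt, eltLen);  // the new first element
        std::memcpy(out.data() + 4 + eltLen, obj + 4, originalSize - 4);
        return out;
    }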
Example 17
    /**
     * Run a query -- includes checking for and running a Command.
     * @return pointer to ns if in exhaust mode; 0 = normal mode.
     * @locks the db mutex for reading (and potentially for writing temporarily to create a new db).
     * @yields the db mutex periodically after acquiring it.
     * @asserts on scan and order memory exhaustion and other cases.
     */
    const char *runQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result) {
        shared_ptr<ParsedQuery> pq_shared( new ParsedQuery(q) );
        ParsedQuery& pq( *pq_shared );
        BSONObj jsobj = q.query;
        int queryOptions = q.queryOptions;
        const char *ns = q.ns;

        if( logLevel >= 2 )
            log() << "runQuery called " << ns << " " << jsobj << endl;

        curop.debug().ns = ns;
        curop.debug().ntoreturn = pq.getNumToReturn();
        curop.debug().query = jsobj;
        curop.setQuery(jsobj);

        // Run a command.
        
        if ( pq.couldBeCommand() ) {
            BufBuilder bb;
            bb.skip(sizeof(QueryResult));
            BSONObjBuilder cmdResBuf;
            if ( runCommands(ns, jsobj, curop, bb, cmdResBuf, false, queryOptions) ) {
                curop.debug().iscommand = true;
                curop.debug().query = jsobj;
                curop.markCommand();

                auto_ptr< QueryResult > qr;
                qr.reset( (QueryResult *) bb.buf() );
                bb.decouple();
                qr->setResultFlagsToOk();
                qr->len = bb.len();
                curop.debug().responseLength = bb.len();
                qr->setOperation(opReply);
                qr->cursorId = 0;
                qr->startingFrom = 0;
                qr->nReturned = 1;
                result.setData( qr.release(), true );
            }
            else {
                uasserted(13530, "bad or malformed command request?");
            }
            return 0;
        }

        bool explain = pq.isExplain();
        BSONObj order = pq.getOrder();
        BSONObj query = pq.getFilter();

        /* The ElemIter will not be happy if this isn't really an object, so throw an
           exception here when that is the case.
           (This may indicate bad data from the client.)
        */
        if ( query.objsize() == 0 ) {
            out() << "Bad query object?\n  jsobj:";
            out() << jsobj.toString() << "\n  query:";
            out() << query.toString() << endl;
            uassert( 10110 , "bad query object", false);
        }

        Client::ReadContext ctx( ns , dbpath ); // read locks
        const ConfigVersion shardingVersionAtStart = shardingState.getVersion( ns );

        replVerifyReadsOk(&pq);

        if ( pq.hasOption( QueryOption_CursorTailable ) ) {
            NamespaceDetails *d = nsdetails( ns );
            uassert( 13051, "tailable cursor requested on non capped collection", d && d->isCapped() );
            const BSONObj nat1 = BSON( "$natural" << 1 );
            if ( order.isEmpty() ) {
                order = nat1;
            }
            else {
                uassert( 13052, "only {$natural:1} order allowed for tailable cursor", order == nat1 );
            }
        }

        // Run a simple id query.
        
        if ( ! (explain || pq.showDiskLoc()) && isSimpleIdQuery( query ) && !pq.hasOption( QueryOption_CursorTailable ) ) {

            int n = 0;
            bool nsFound = false;
            bool indexFound = false;

            BSONObj resObject;
            Client& c = cc();
            bool found = Helpers::findById( c, ns , query , resObject , &nsFound , &indexFound );
            if ( nsFound == false || indexFound == true ) {
                
                if ( shardingState.needShardChunkManager( ns ) ) {
                    ShardChunkManagerPtr m = shardingState.getShardChunkManager( ns );
                    if ( m && ! m->belongsToMe( resObject ) ) {
                        // I have something with this _id,
                        // but it doesn't belong to me,
                        // so return nothing
                        resObject = BSONObj();
                        found = false;
                    }
                }

                BufBuilder bb(sizeof(QueryResult)+resObject.objsize()+32);
                bb.skip(sizeof(QueryResult));
                
                curop.debug().idhack = true;
                if ( found ) {
                    n = 1;
                    fillQueryResultFromObj( bb , pq.getFields() , resObject );
                }
                auto_ptr< QueryResult > qr;
                qr.reset( (QueryResult *) bb.buf() );
                bb.decouple();
                qr->setResultFlagsToOk();
                qr->len = bb.len();
                
                curop.debug().responseLength = bb.len();
                qr->setOperation(opReply);
                qr->cursorId = 0;
                qr->startingFrom = 0;
                qr->nReturned = n;
                result.setData( qr.release(), true );
                return NULL;
            }
        }
        
        // Run a regular query.
        
        BSONObj oldPlan;
        if ( explain && ! pq.hasIndexSpecifier() ) {
            MultiPlanScanner mps( ns, query, order );
            if ( mps.usingCachedPlan() ) {
                oldPlan =
                mps.oldExplain().firstElement().embeddedObject()
                .firstElement().embeddedObject().getOwned();
            }
        }

        // In some cases the query may be retried if there is an in memory sort size assertion.
        for( int retry = 0; retry < 2; ++retry ) {
            try {
                return queryWithQueryOptimizer( m, queryOptions, ns, jsobj, curop, query, order,
                                               pq_shared, oldPlan, shardingVersionAtStart, result );
            } catch ( const QueryRetryException & ) {
                verify( retry == 0 );
            }
        }
        verify( false );
        return 0;
    }
Example 18
    /**
     * For a given query, get a runner.  The runner could be a SingleSolutionRunner, a
     * CachedPlanRunner, or a MultiPlanRunner, depending on the cache/query solver/etc.
     */
    Status getRunner(CanonicalQuery* rawCanonicalQuery, Runner** out) {
        verify(rawCanonicalQuery);
        auto_ptr<CanonicalQuery> canonicalQuery(rawCanonicalQuery);

        // Try to look up a cached solution for the query.
        // TODO: Can the cache have negative data about a solution?
        PlanCache* localCache = PlanCache::get(canonicalQuery->ns());
        if (NULL != localCache) {
            CachedSolution* cs = localCache->get(*canonicalQuery);
            if (NULL != cs) {
                // We have a cached solution.  Hand the canonical query and cached solution off to
                // the cached plan runner, which takes ownership of both.
                WorkingSet* ws;
                PlanStage* root;
                verify(StageBuilder::build(*cs->solution, &root, &ws));
                *out = new CachedPlanRunner(canonicalQuery.release(), cs, root, ws);
                return Status::OK();
            }
        }

        // No entry in cache for the query.  We have to solve the query ourself.

        // Get the indices that we could possibly use.
        NamespaceDetails* nsd = nsdetails(canonicalQuery->ns().c_str());

        // If this is NULL, there is no data but the query is valid.  You're allowed to query for
        // data on an empty collection and it's not an error.  There just isn't any data...
        if (NULL == nsd) {
            const std::string& ns = canonicalQuery->ns();
            *out = new EOFRunner(canonicalQuery.release(), ns);
            return Status::OK();
        }

        // Tailable: if the query requests a tailable cursor, the collection must be capped.
        if (canonicalQuery->getParsed().hasOption(QueryOption_CursorTailable)) {
            if (!nsd->isCapped()) {
                return Status(ErrorCodes::BadValue,
                              "tailable cursor requested on non capped collection");
            }

            // If a sort is specified it must be equal to expectedSort.
            const BSONObj expectedSort = BSON("$natural" << 1);
            const BSONObj& actualSort = canonicalQuery->getParsed().getSort();
            if (!actualSort.isEmpty() && !(actualSort == expectedSort)) {
                return Status(ErrorCodes::BadValue,
                              "invalid sort specified for tailable cursor: "
                              + actualSort.toString());
            }
        }

        // If it's not NULL, we may have indices.
        vector<IndexEntry> indices;
        for (int i = 0; i < nsd->getCompletedIndexCount(); ++i) {
            auto_ptr<IndexDescriptor> desc(CatalogHack::getDescriptor(nsd, i));
            indices.push_back(IndexEntry(desc->keyPattern(), desc->isMultikey(), desc->isSparse(), desc->indexName()));
        }

        vector<QuerySolution*> solutions;
        size_t options = QueryPlanner::DEFAULT;
        if (storageGlobalParams.noTableScan) {
            const string& ns = canonicalQuery->ns();
            // There are certain cases where we ignore this restriction:
            bool ignore = canonicalQuery->getQueryObj().isEmpty()
                          || (string::npos != ns.find(".system."))
                          || (0 == ns.find("local."));
            if (!ignore) {
                options |= QueryPlanner::NO_TABLE_SCAN;
            }
        }
        else {
            options |= QueryPlanner::INCLUDE_COLLSCAN;
        }
        QueryPlanner::plan(*canonicalQuery, indices, options, &solutions);

        /*
        for (size_t i = 0; i < solutions.size(); ++i) {
            QLOG() << "solution " << i << " is " << solutions[i]->toString() << endl;
        }
        */

        // We cannot figure out how to answer the query.  Should this ever happen?
        if (0 == solutions.size()) {
            return Status(ErrorCodes::BadValue, "Can't create a plan for the canonical query " +
                                                 canonicalQuery->toString());
        }

        if (1 == solutions.size()) {
            // Only one possible plan.  Run it.  Build the stages from the solution.
            WorkingSet* ws;
            PlanStage* root;
            verify(StageBuilder::build(*solutions[0], &root, &ws));

            // And, run the plan.
            *out = new SingleSolutionRunner(canonicalQuery.release(), solutions[0], root, ws);
            return Status::OK();
        }
        else {
            // Many solutions.  Let the MultiPlanRunner pick the best, update the cache, and so on.
            auto_ptr<MultiPlanRunner> mpr(new MultiPlanRunner(canonicalQuery.release()));
            for (size_t i = 0; i < solutions.size(); ++i) {
                WorkingSet* ws;
                PlanStage* root;
                verify(StageBuilder::build(*solutions[i], &root, &ws));
                // Takes ownership of all arguments.
                mpr->addPlan(solutions[i], root, ws);
            }
            *out = mpr.release();
            return Status::OK();
        }
    }
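The tail of getRunner() reduces to a small selection policy: exactly one solution runs directly, several solutions compete under the multi-plan runner (the empty set was already rejected above). A sketch with hypothetical stand-in types for the Runner hierarchy:

    #include <memory>
    #include <vector>

    struct Plan { };
    struct Runner { virtual ~Runner() { } };
    struct SingleRunner : Runner { explicit SingleRunner(Plan*) { } };
    struct MultiRunner : Runner { void addPlan(Plan*) { } };

    std::unique_ptr<Runner> pickRunner(const std::vector<Plan*>& plans) {
        if (plans.size() == 1)
            return std::unique_ptr<Runner>(new SingleRunner(plans[0]));
        std::unique_ptr<MultiRunner> mpr(new MultiRunner());
        for (Plan* p : plans)
            mpr->addPlan(p);  // the multi-plan runner picks the best at runtime
        return std::unique_ptr<Runner>(std::move(mpr));
    }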
Example 19
        void validateNS(const string& ns,
                        Collection* collection,
                        const BSONObj& cmdObj,
                        BSONObjBuilder& result) {

            const bool full = cmdObj["full"].trueValue();
            const bool scanData = full || cmdObj["scandata"].trueValue();

            NamespaceDetails* nsd = collection->details();

            bool valid = true;
            BSONArrayBuilder errors; // explanation(s) for why valid = false
            if ( collection->isCapped() ){
                result.append("capped", nsd->isCapped());
                result.appendNumber("max", nsd->maxCappedDocs());
            }

            if ( nsd->firstExtent().isNull() )
                result.append( "firstExtent", "null" );
            else
                result.append( "firstExtent", str::stream() << nsd->firstExtent().toString()
                               << " ns:" << nsd->firstExtent().ext()->nsDiagnostic.toString());
            if ( nsd->lastExtent().isNull() )
                result.append( "lastExtent", "null" );
            else
                result.append( "lastExtent", str::stream() <<  nsd->lastExtent().toString()
                               << " ns:" <<  nsd->lastExtent().ext()->nsDiagnostic.toString());

            BSONArrayBuilder extentData;
            int extentCount = 0;
            try {

                if ( !nsd->firstExtent().isNull() ) {
                    nsd->firstExtent().ext()->assertOk();
                    nsd->lastExtent().ext()->assertOk();
                }

                DiskLoc extentDiskLoc = nsd->firstExtent();
                while (!extentDiskLoc.isNull()) {
                    Extent* thisExtent = extentDiskLoc.ext();
                    if (full) {
                        extentData << thisExtent->dump();
                    }
                    if (!thisExtent->validates(extentDiskLoc, &errors)) {
                        valid = false;
                    }
                    DiskLoc nextDiskLoc = thisExtent->xnext;
                    if (extentCount > 0 && !nextDiskLoc.isNull()
                                        &&  nextDiskLoc.ext()->xprev != extentDiskLoc) {
                        StringBuilder sb;
                        sb << "'xprev' pointer " << nextDiskLoc.ext()->xprev.toString()
                           << " in extent " << nextDiskLoc.toString()
                           << " does not point to extent " << extentDiskLoc.toString();
                        errors << sb.str();
                        valid = false;
                    }
                    if (nextDiskLoc.isNull() && extentDiskLoc != nsd->lastExtent()) {
                        StringBuilder sb;
                        sb << "'lastExtent' pointer " << nsd->lastExtent().toString()
                           << " does not point to last extent in list " << extentDiskLoc.toString();
                        errors << sb.str();
                        valid = false;
                    }
                    extentDiskLoc = nextDiskLoc;
                    extentCount++;
                    killCurrentOp.checkForInterrupt();
                }
            }
            catch (const DBException& e) {
                StringBuilder sb;
                sb << "exception validating extent " << extentCount
                   << ": " << e.what();
                errors << sb.str();
                valid = false;
            }
            result.append("extentCount", extentCount);

            if ( full )
                result.appendArray( "extents" , extentData.arr() );

            result.appendNumber("datasize", nsd->dataSize());
            result.appendNumber("nrecords", nsd->numRecords());
            result.appendNumber("lastExtentSize", nsd->lastExtentSize());
            result.appendNumber("padding", nsd->paddingFactor());

            try {

                bool testingLastExtent = false;
                try {
                    if (nsd->firstExtent().isNull()) {
                        // this is ok
                    }
                    else {
                        result.append("firstExtentDetails", nsd->firstExtent().ext()->dump());
                        if (!nsd->firstExtent().ext()->xprev.isNull()) {
                            StringBuilder sb;
                            sb << "'xprev' pointer in 'firstExtent' " << nsd->firstExtent().toString()
                               << " is " << nsd->firstExtent().ext()->xprev.toString()
                               << ", should be null";
                            errors << sb.str();
                            valid=false;
                        }
                    }
                    testingLastExtent = true;
                    if (nsd->lastExtent().isNull()) {
                        // this is ok
                    }
                    else {
                        if (nsd->firstExtent() != nsd->lastExtent()) {
                            result.append("lastExtentDetails", nsd->lastExtent().ext()->dump());
                            if (!nsd->lastExtent().ext()->xnext.isNull()) {
                                StringBuilder sb;
                                sb << "'xnext' pointer in 'lastExtent' " << nsd->lastExtent().toString()
                                   << " is " << nsd->lastExtent().ext()->xnext.toString()
                                   << ", should be null";
                                errors << sb.str();
                                valid = false;
                            }
                        }
                    }
                }
                catch (const DBException& e) {
                    StringBuilder sb;
                    sb << "exception processing '"
                       << (testingLastExtent ? "lastExtent" : "firstExtent")
                       << "': " << e.what();
                    errors << sb.str();
                    valid = false;
                }

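                // every record location seen during the scan; cross-checked against the
                // deleted lists below to catch live records that also appear as deleted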
                set<DiskLoc> recs;
                if( scanData ) {
                    int n = 0;
                    int nInvalid = 0;
                    long long nQuantizedSize = 0;
                    long long nPowerOf2QuantizedSize = 0;
                    long long len = 0;
                    long long nlen = 0;
                    long long bsonLen = 0;
                    int outOfOrder = 0;
                    DiskLoc cl_last;

                    DiskLoc cl;
                    Runner::RunnerState state;
                    auto_ptr<Runner> runner(InternalPlanner::collectionScan(ns));
                    while (Runner::RUNNER_ADVANCED == (state = runner->getNext(NULL, &cl))) {
                        n++;

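                        // remember at most the first million locations (presumably to bound
                        // memory), so the deleted-list cross-check is best-effort beyond that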
                        if ( n < 1000000 )
                            recs.insert(cl);
                        if ( nsd->isCapped() ) {
                            if ( cl < cl_last )
                                outOfOrder++;
                            cl_last = cl;
                        }

                        Record *r = cl.rec();
                        len += r->lengthWithHeaders();
                        nlen += r->netLength();
                        
                        if ( r->lengthWithHeaders() ==
                                NamespaceDetails::quantizeAllocationSpace
                                    ( r->lengthWithHeaders() ) ) {
                            // Count the number of records having a size consistent with
                            // the quantizeAllocationSpace quantization implementation.
                            ++nQuantizedSize;
                        }

                        if ( r->lengthWithHeaders() ==
                                NamespaceDetails::quantizePowerOf2AllocationSpace
                                    ( r->lengthWithHeaders() - 1 ) ) {
                            // Count the number of records having a size consistent with the
                            // quantizePowerOf2AllocationSpace quantization implementation.
                            // Because of SERVER-8311, power of 2 quantization is not idempotent and
                            // r->lengthWithHeaders() - 1 must be checked instead of the record
                            // length itself.
                            ++nPowerOf2QuantizedSize;
                        }
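
                        // Taken together, nQuantizedSize and nPowerOf2QuantizedSize let the
                        // report show how many record allocations are consistent with each of
                        // the two allocation strategies (standard padding vs. power-of-2 sizes).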

                        if (full){
                            BSONObj obj = BSONObj::make(r);
                            if (!obj.isValid() || !obj.valid()){ // both fast and deep checks
                                valid = false;
                                if (nInvalid == 0) // only log the summary error once
                                    errors << "invalid bson object detected (see logs for more info)";

                                nInvalid++;
                                if (strcmp("_id", obj.firstElementFieldName()) == 0){
                                    try {
                                        obj.firstElement().validate(); // throws on error
                                        log() << "Invalid bson detected in " << ns << " with _id: " << obj.firstElement().toString(false) << endl;
                                    }
                                    catch(...){
                                        log() << "Invalid bson detected in " << ns << " with corrupt _id" << endl;
                                    }
                                }
                                else {
                                    log() << "Invalid bson detected in " << ns << " and couldn't find _id" << endl;
                                }
                            }
                            else {
                                bsonLen += obj.objsize();
                            }
                        }
                    }
                    if (Runner::RUNNER_EOF != state) {
                        // TODO: more descriptive logging.
                        warning() << "Internal error while reading collection " << ns << endl;
                    }
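                    // record order is only meaningful for a capped collection that has
                    // not wrapped yet; a single out-of-order jump is tolerated below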
                    if ( nsd->isCapped() && !nsd->capLooped() ) {
                        result.append("cappedOutOfOrder", outOfOrder);
                        if ( outOfOrder > 1 ) {
                            valid = false;
                            errors << "too many out of order records";
                        }
                    }
                    result.append("objectsFound", n);

                    if (full) {
                        result.append("invalidObjects", nInvalid);
                    }

                    result.appendNumber("nQuantizedSize", nQuantizedSize);
                    result.appendNumber("nPowerOf2QuantizedSize", nPowerOf2QuantizedSize);
                    result.appendNumber("bytesWithHeaders", len);
                    result.appendNumber("bytesWithoutHeaders", nlen);

                    if (full) {
                        result.appendNumber("bytesBson", bsonLen);
                    }
                }

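                // note which deleted-list buckets are empty (built here but never
                // appended to the result in this version)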
                BSONArrayBuilder deletedListArray;
                for ( int i = 0; i < Buckets; i++ ) {
                    deletedListArray << nsd->deletedListEntry(i).isNull();
                }

                int ndel = 0;
                long long delSize = 0;
                BSONArrayBuilder delBucketSizes;
                int incorrect = 0;
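                // walk each deleted-record free list, counting entries and flagging any
                // DiskLoc that was also seen as a live record during the scan above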
                for ( int i = 0; i < Buckets; i++ ) {
                    DiskLoc loc = nsd->deletedListEntry(i);
                    try {
                        int k = 0;
                        while ( !loc.isNull() ) {
                            if ( recs.count(loc) )
                                incorrect++;
                            ndel++;

                            if ( loc.questionable() ) {
                                if( nsd->isCapped() && !loc.isValid() && i == 1 ) {
                                    /* the constructor for NamespaceDetails intentionally sets deletedList[1] to invalid
                                       see comments in namespace.h
                                    */
                                    break;
                                }

                                string err( str::stream() << "bad pointer in deleted record list: "
                                                          << loc.toString()
                                                          << " bucket: " << i
                                                          << " k: " << k );
                                errors << err;
                                valid = false;
                                break;
                            }

                            DeletedRecord *d = loc.drec();
                            delSize += d->lengthWithHeaders();
                            loc = d->nextDeleted();
                            k++;
                            killCurrentOp.checkForInterrupt();
                        }
                        delBucketSizes << k;
                    }
                    catch (...) {
                        errors << ("exception in deleted chain for bucket " + BSONObjBuilder::numStr(i));
                        valid = false;
                    }
                }
                result.appendNumber("deletedCount", ndel);
                result.appendNumber("deletedSize", delSize);
                if ( full ) {
                    result << "delBucketSizes" << delBucketSizes.arr();
                }

                if ( incorrect ) {
                    errors << (BSONObjBuilder::numStr(incorrect) + " records from datafile are in deleted list");
                    valid = false;
                }

                int idxn = 0;
                try  {
                    IndexCatalog* indexCatalog = collection->getIndexCatalog();

                    result.append("nIndexes", nsd->getCompletedIndexCount());
                    BSONObjBuilder indexes; // not using subObjStart to be exception safe
                    NamespaceDetails::IndexIterator i = nsd->ii();
                    while( i.more() ) {
                        IndexDetails& id = i.next();
                        log() << "validating index " << idxn << ": " << id.indexNamespace() << endl;

                        IndexDescriptor* descriptor = indexCatalog->getDescriptor( idxn );
                        verify( descriptor );
                        IndexAccessMethod* iam = indexCatalog->getIndex( descriptor );
                        verify( iam );

                        int64_t keys;
                        iam->validate(&keys);
                        indexes.appendNumber(id.indexNamespace(), static_cast<long long>(keys));
                        idxn++;
                    }
                    result.append("keysPerIndex", indexes.done());
                }
                catch (...) {
                    errors << ("exception during index validate idxn " + BSONObjBuilder::numStr(idxn));
                    valid = false;
                }

            }
            catch (const AssertionException&) {
                errors << "exception during validate";
                valid = false;
            }

            result.appendBool("valid", valid);
            result.append("errors", errors.arr());

            if ( !full ){
                result.append("warning", "Some checks omitted for speed. Use the {full:true} option to do a more thorough scan.");
            }
            
            if ( !valid ) {
                result.append("advice", "ns corrupt, requires repair");
            }

        }
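
        // A minimal usage sketch (illustrative, not part of the original source): the
        // validate command that drives this code can be issued in-process through
        // DBDirectClient; "test" and "foo" are placeholder names.
        //
        //     DBDirectClient client;
        //     BSONObj info;
        //     client.runCommand( "test", BSON( "validate" << "foo" << "full" << true ), info );
        //     // info now carries extentCount, nrecords, deletedCount, keysPerIndex,
        //     // valid, errors, etc. as appended above.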
        virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
            string source = cmdObj.getStringField( name.c_str() );
            string target = cmdObj.getStringField( "to" );
            uassert(15967,
                    "invalid collection name: " + target,
                    NamespaceString::validCollectionComponent(target.c_str()));
            if ( source.empty() || target.empty() ) {
                errmsg = "invalid command syntax";
                return false;
            }

            string sourceDB = nsToDatabase(source);
            string targetDB = nsToDatabase(target);
            // namespace containing the source database's index definitions
            string sourceSystemIndexes = sourceDB + ".system.indexes";

            int longestIndexNameLength = 0;
            vector<BSONObj> oldIndSpec = Helpers::findAll(sourceSystemIndexes, BSON("ns" << source));
            for (size_t i = 0; i < oldIndSpec.size(); ++i) {
                int thisLength = oldIndSpec[i].getField("name").valuesize();
                if (thisLength > longestIndexNameLength) {
                     longestIndexNameLength = thisLength;
                }
            }
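            // Worked example (assuming maxNamespaceLen is 128): if the longest existing
            // index name value is 10 bytes, longestAllowed = 128 - 10 - 1 = 117, and a
            // longer target collection name trips assertion 16451 below.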
            unsigned int longestAllowed = maxNamespaceLen - longestIndexNameLength - 1;
            if (target.size() > longestAllowed) {
                StringBuilder sb;
                sb << "collection name length of " << target.size()
                   << " exceeds maximum length of " << longestAllowed
                   << ", allowing for index names";
                uasserted(16451, sb.str());
            }

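            // snapshot what we need from the source while in its context: capped-ness
            // and, for a capped collection, the total extent size so the target can be
            // created with a matching fixed size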
            bool capped = false;
            long long size = 0;
            std::vector<BSONObj> indexesInProg;
            {
                Client::Context ctx( source );
                NamespaceDetails *nsd = nsdetails( source );
                uassert( 10026 ,  "source namespace does not exist", nsd );
                indexesInProg = stopIndexBuilds(dbname, cmdObj);
                capped = nsd->isCapped();
                if ( capped )
                    for( DiskLoc i = nsd->firstExtent(); !i.isNull(); i = i.ext()->xnext )
                        size += i.ext()->length;
            }

            Client::Context ctx( target );

            if ( nsdetails( target ) ) {
                uassert( 10027 ,  "target namespace exists", cmdObj["dropTarget"].trueValue() );

                Status s = cc().database()->dropCollection( target );
                if ( !s.isOK() ) {
                    errmsg = s.toString();
                    return false;
                }

            }


            // if we are renaming in the same database, just
            // rename the namespace and we're done.
            if ( sourceDB == targetDB ) {
                Status s = ctx.db()->renameCollection( source, target, cmdObj["stayTemp"].trueValue() );
                if ( !s.isOK() ) {
                    errmsg = s.toString();
                    return false;
                }
                return true;
            }

            // renaming across databases, so we must copy all
            // the data and then remove the source collection.
            BSONObjBuilder spec;
            if ( capped ) {
                spec.appendBool( "capped", true );
                spec.append( "size", double( size ) );
            }
            if ( !userCreateNS( target.c_str(), spec.done(), errmsg, false ) )
                return false;

            auto_ptr< DBClientCursor > c;
            DBDirectClient bridge;

            c = bridge.query( source, BSONObj(), 0, 0, 0, fromRepl ? QueryOption_SlaveOk : 0 );
            while ( c->more() ) {
                BSONObj o = c->next();
                theDataFileMgr.insertWithObjMod( target.c_str(), o );
            }

            string sourceIndexes = nsToDatabase( source ) + ".system.indexes";
            string targetIndexes = nsToDatabase( target ) + ".system.indexes";
            c = bridge.query( sourceIndexes, QUERY( "ns" << source ), 0, 0, 0, fromRepl ? QueryOption_SlaveOk : 0 );
            while ( c->more() ) {
                BSONObj o = c->next();
                BSONObjBuilder b;
                BSONObjIterator i( o );
                while( i.moreWithEOO() ) {
                    BSONElement e = i.next();
                    if ( e.eoo() )
                        break;
                    if ( strcmp( e.fieldName(), "ns" ) == 0 ) {
                        b.append( "ns", target );
                    }
                    else {
                        b.append( e );
                    }
                }
                BSONObj n = b.done();
                theDataFileMgr.insertWithObjMod( targetIndexes.c_str(), n );
            }
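
            // For example, a source index spec such as
            //     { ns: "db1.src", key: { a: 1 }, name: "a_1" }
            // is rewritten field-by-field above into
            //     { ns: "db2.dst", key: { a: 1 }, name: "a_1" }
            // before being inserted into the target database's system.indexes
            // ("db1.src"/"db2.dst" are placeholder namespaces).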

            {
                Client::Context ctx( source );
                Status s = ctx.db()->dropCollection( source );
                if ( !s.isOK() ) {
                    errmsg = s.toString();
                    return false;
                }
                IndexBuilder::restoreIndexes(targetIndexes, indexesInProg);
            }
            return true;
        }
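
        // A minimal usage sketch (illustrative): renameCollection is issued against the
        // admin database with fully-qualified source and target namespaces;
        // "db1.src"/"db2.dst" are placeholders.
        //
        //     DBDirectClient client;
        //     BSONObj info;
        //     client.runCommand( "admin",
        //                        BSON( "renameCollection" << "db1.src"
        //                              << "to" << "db2.dst"
        //                              << "dropTarget" << true ),
        //                        info );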
Esempio n. 21
0
        virtual bool run(const string& db, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
            string coll = cmdObj.firstElement().valuestr();
            if( coll.empty() || db.empty() ) {
                errmsg = "no collection name specified";
                return false;
            }

            if( isCurrentlyAReplSetPrimary() && !cmdObj["force"].trueValue() ) { 
                errmsg = "will not run compact on an active replica set primary as this is a slow blocking operation. use force:true to force";
                return false;
            }
            
            string ns = db + '.' + coll;
            if ( ! NamespaceString::normal(ns.c_str()) ) {
                errmsg = "bad namespace name";
                return false;
            }
            
            // parameter validation to avoid triggering assertions in compact()
            if ( str::contains(ns, ".system.") ) {
                errmsg = "can't compact a system namespace";
                return false;
            }
            
            {
                Lock::DBWrite lk(ns);
                Client::Context ctx(ns);
                NamespaceDetails *d = nsdetails(ns);
                if( ! d ) {
                    errmsg = "namespace does not exist";
                    return false;
                }

                if ( d->isCapped() ) {
                    errmsg = "cannot compact a capped collection";
                    return false;
                }
            }


            double pf = 1.0;
            int pb = 0;
            // preservePadding trumps all other padding options
            bool preservePadding = false;
            // useDefaultPadding tracks whether a padding requirement was passed in;
            // if it wasn't, then UsePowerOf2Sizes will be maintained when compacting
            bool useDefaultPadding = true;
            if (cmdObj.hasElement("preservePadding")) {
                preservePadding = cmdObj["preservePadding"].trueValue();
                useDefaultPadding = false;
            }

            if( cmdObj.hasElement("paddingFactor") ) {
                if (preservePadding) {
                    errmsg = "preservePadding is incompatible with paddingFactor";
                    return false;
                }
                useDefaultPadding = false;
                pf = cmdObj["paddingFactor"].Number();
                verify( pf >= 1.0 && pf <= 4.0 );
            }
            if( cmdObj.hasElement("paddingBytes") ) {
                if (preservePadding) {
                    errmsg = "preservePadding is incompatible with paddingBytes";
                    return false;
                }
                useDefaultPadding = false;
                pb = (int) cmdObj["paddingBytes"].Number();
                verify( pb >= 0 && pb <= 1024 * 1024 );
            }

            bool validate = !cmdObj.hasElement("validate") || cmdObj["validate"].trueValue(); // default is true at the moment

            massert( 14028, "bad ns", NamespaceString::normal(ns.c_str()) );
            massert( 14027, "can't compact a system namespace", !str::contains(ns, ".system.") ); // items in system.indexes cannot be moved; there are pointers to those disklocs in NamespaceDetails

            bool ok;
            {
                Lock::DBWrite lk(ns);
                BackgroundOperation::assertNoBgOpInProgForNs(ns.c_str());
                Client::Context ctx(ns);
                NamespaceDetails *d = nsdetails(ns);
                massert( 13660, str::stream() << "namespace " << ns << " does not exist", d );
                massert( 13661, "cannot compact capped collection", !d->isCapped() );
                log() << "compact " << ns << " begin" << endl;

                std::vector<BSONObj> indexesInProg = stopIndexBuilds(db, cmdObj);

                // log only when non-default padding was requested (pf defaults to 1.0)
                if( pf != 1.0 || pb != 0 ) {
                    log() << "paddingFactor:" << pf << " paddingBytes:" << pb << endl;
                }
                try { 
                    ok = _compact(ns.c_str(), d, errmsg, validate,
                                  result, pf, pb, useDefaultPadding, preservePadding);
                }
                catch(...) { 
                    log() << "compact " << ns << " end (with error)" << endl;
                    throw;
                }
                log() << "compact " << ns << " end" << endl;

                IndexBuilder::restoreIndexes(indexesInProg);
            }

            return ok;
        }
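
        // A minimal usage sketch (illustrative): compact is issued against the
        // collection's own database; the padding values are arbitrary examples
        // within the ranges verified above.
        //
        //     DBDirectClient client;
        //     BSONObj info;
        //     client.runCommand( "test",
        //                        BSON( "compact" << "foo"
        //                              << "paddingFactor" << 1.1
        //                              << "force" << true ),
        //                        info );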