Example no. 1
    void Balancer::_doBalanceRound( DBClientBase& conn, vector<CandidateChunkPtr>* candidateChunks ) {
        verify( candidateChunks );

        //
        // 1. Check whether there is any sharded collection to be balanced by querying
        // the ShardNS::collection namespace
        //

        auto_ptr<DBClientCursor> cursor = conn.query( ShardNS::collection , BSONObj() );
        vector< string > collections;
        while ( cursor->more() ) {
            BSONObj col = cursor->nextSafe();

            // sharded collections will have a shard "key".
            if ( ! col["key"].eoo() && ! col["noBalance"].trueValue() ){
                collections.push_back( col["_id"].String() );
            }
            else if( col["noBalance"].trueValue() ){
                LOG(1) << "not balancing collection " << col["_id"].String() << ", explicitly disabled" << endl;
            }

        }
        cursor.reset();

        if ( collections.empty() ) {
            LOG(1) << "no collections to balance" << endl;
            return;
        }

        //
        // 2. Get a list of all the shards that are participating in this balance round
        // along with any maximum allowed quotas and current utilization. We get the
        // latter by issuing db.serverStatus() (mem.mapped) to all shards.
        //
        // TODO: skip unresponsive shards and mark information as stale.
        //

        vector<Shard> allShards;
        Shard::getAllShards( allShards );
        if ( allShards.size() < 2) {
            LOG(1) << "can't balance without more active shards" << endl;
            return;
        }
        
        ShardInfoMap shardInfo;
        for ( vector<Shard>::const_iterator it = allShards.begin(); it != allShards.end(); ++it ) {
            const Shard& s = *it;
            ShardStatus status = s.getStatus();
            shardInfo[ s.getName() ] = ShardInfo( s.getMaxSize(),
                                                  status.mapped(),
                                                  s.isDraining(),
                                                  status.hasOpsQueued(),
                                                  s.tags()
                                                  );
        }

        //
        // 3. For each collection, check if the balancing policy recommends moving anything around.
        //

        for (vector<string>::const_iterator it = collections.begin(); it != collections.end(); ++it ) {
            const string& ns = *it;

            map< string,vector<BSONObj> > shardToChunksMap;
            cursor = conn.query( ShardNS::chunk , QUERY( "ns" << ns ).sort( "min" ) );
            while ( cursor->more() ) {
                BSONObj chunk = cursor->nextSafe();
                if ( chunk["jumbo"].trueValue() )
                    continue;
                vector<BSONObj>& chunks = shardToChunksMap[chunk["shard"].String()];
                chunks.push_back( chunk.getOwned() );
            }
            cursor.reset();

            if (shardToChunksMap.empty()) {
                LOG(1) << "skipping empty collection (" << ns << ")";
                continue;
            }
            
            for ( vector<Shard>::iterator i=allShards.begin(); i!=allShards.end(); ++i ) {
                // this just makes sure there is an entry in shardToChunksMap for every shard
                Shard s = *i;
                shardToChunksMap[s.getName()].size();
            }

            DistributionStatus status( shardInfo, shardToChunksMap );
            
            // load tags
            conn.ensureIndex( ShardNS::tags, BSON( "ns" << 1 << "min" << 1 ), true );
            cursor = conn.query( ShardNS::tags , QUERY( "ns" << ns ).sort( "min" ) );
            while ( cursor->more() ) {
                BSONObj tag = cursor->nextSafe();
                uassert( 16356 , str::stream() << "tag ranges not valid for: " << ns ,
                         status.addTagRange( TagRange( tag["min"].Obj().getOwned(), 
                                                       tag["max"].Obj().getOwned(), 
                                                       tag["tag"].String() ) ) );
                    
            }
            cursor.reset();
            
            CandidateChunk* p = _policy->balance( ns, status, _balancedLastTime );
            if ( p ) candidateChunks->push_back( CandidateChunkPtr( p ) );
        }
    }
Example no. 2
    void go( const string db , const boost::filesystem::path outdir ) {
        log() << "DATABASE: " << db << "\t to \t" << outdir.string() << endl;

        boost::filesystem::create_directories( outdir );

        map <string, BSONObj> collectionOptions;
        multimap <string, BSONObj> indexes;
        vector <string> collections;

        // Save indexes for database
        string ins = db + ".system.indexes";
        auto_ptr<DBClientCursor> cursor = conn( true ).query( ins.c_str() , Query() , 0 , 0 , 0 , QueryOption_SlaveOk | QueryOption_NoCursorTimeout );
        while ( cursor->more() ) {
            BSONObj obj = cursor->nextSafe();
            const string name = obj.getField( "ns" ).valuestr();
            indexes.insert( pair<string, BSONObj> (name, obj.getOwned()) );
        }

        string sns = db + ".system.namespaces";
        cursor = conn( true ).query( sns.c_str() , Query() , 0 , 0 , 0 , QueryOption_SlaveOk | QueryOption_NoCursorTimeout );
        while ( cursor->more() ) {
            BSONObj obj = cursor->nextSafe();
            const string name = obj.getField( "name" ).valuestr();
            if (obj.hasField("options")) {
                collectionOptions.insert( pair<string,BSONObj> (name, obj.getField("options").embeddedObject()) );
            }

            // skip namespaces with $ in them only if we don't specify a collection to dump
            if ( _coll == "" && name.find( ".$" ) != string::npos ) {
                log(1) << "\tskipping collection: " << name << endl;
                continue;
            }

            const string filename = name.substr( db.size() + 1 );

            // if a particular collection is specified, and it's not this one, skip it
            if ( _coll != "" && db + "." + _coll != name && _coll != name )
                continue;

            // raise error before writing collection with non-permitted filename chars in the name
            size_t hasBadChars = name.find_first_of("/\0");
            if (hasBadChars != string::npos){
              error() << "Cannot dump "  << name << ". Collection has '/' or null in the collection name." << endl;
              continue;
            }
            
            // Don't dump indexes
            if ( endsWith(name.c_str(), ".system.indexes") ) {
              continue;
            }
            
            if ( _coll != "" && db + "." + _coll != name && _coll != name )
              continue;
            
            collections.push_back(name);
        }
        
        for (vector<string>::iterator it = collections.begin(); it != collections.end(); ++it) {
            string name = *it;
            const string filename = name.substr( db.size() + 1 );
            writeCollectionFile( name , outdir / ( filename + ".bson" ) );
            writeMetadataFile( name, outdir / (filename + ".metadata.json"), collectionOptions, indexes);
        }

    }
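Each collection's dump path is derived by stripping the "<db>." prefix from the namespace and appending ".bson" or ".metadata.json". A minimal standalone sketch of that derivation, using std::filesystem instead of boost::filesystem and hypothetical values purely for illustration:

    // Sketch only: mirrors the filename derivation above with <filesystem>.
    #include <filesystem>
    #include <iostream>
    #include <string>

    int main() {
        const std::string db = "test";
        const std::string name = "test.foo";              // namespace: "<db>.<collection>"
        const std::filesystem::path outdir = "dump/test";

        // Strip the "<db>." prefix to get the bare collection name.
        const std::string filename = name.substr(db.size() + 1);

        std::cout << (outdir / (filename + ".bson")) << "\n";           // "dump/test/foo.bson"
        std::cout << (outdir / (filename + ".metadata.json")) << "\n";  // "dump/test/foo.metadata.json"
    }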
Example no. 3
void WriteErrorDetail::setErrInfo(const BSONObj& errInfo) {
    _errInfo = errInfo.getOwned();
    _isErrInfoSet = true;
}
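setErrInfo() copies the incoming object with getOwned() before storing it: a BSONObj is often just a view into a buffer owned by someone else, so keeping such a view as a member could leave it dangling once the caller's buffer is released. A small sketch of that distinction; this is illustrative, not MongoDB source, and the header path is an assumption:

    #include "mongo/bson/bsonobj.h"  // assumed header path

    // Returns a view into doc's buffer; only safe while doc's buffer stays alive.
    mongo::BSONObj errInfoView(const mongo::BSONObj& doc) {
        return doc["errInfo"].Obj();
    }

    // Returns a copy that owns its own buffer, safe to store as a member.
    mongo::BSONObj errInfoOwned(const mongo::BSONObj& doc) {
        return doc["errInfo"].Obj().getOwned();
    }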
Example no. 4
PlanStage::StageState DeleteStage::work(WorkingSetID* out) {
    ++_commonStats.works;

    // Adds the amount of time taken by work() to executionTimeMillis.
    ScopedTimer timer(&_commonStats.executionTimeMillis);

    if (isEOF()) {
        return PlanStage::IS_EOF;
    }
    invariant(_collection);  // If isEOF() returns false, we must have a collection.

    // It is possible that after a delete was executed, a WriteConflictException occurred
    // and prevented us from returning ADVANCED with the old version of the document.
    if (_idReturning != WorkingSet::INVALID_ID) {
        // We should only get here if we were trying to return something before.
        invariant(_params.returnDeleted);

        WorkingSetMember* member = _ws->get(_idReturning);
        invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

        *out = _idReturning;
        _idReturning = WorkingSet::INVALID_ID;
        ++_commonStats.advanced;
        return PlanStage::ADVANCED;
    }

    // Either retry the last WSM we worked on or get a new one from our child.
    WorkingSetID id;
    if (_idRetrying != WorkingSet::INVALID_ID) {
        id = _idRetrying;
        _idRetrying = WorkingSet::INVALID_ID;
    } else {
        auto status = child()->work(&id);

        switch (status) {
            case PlanStage::ADVANCED:
                break;

            case PlanStage::FAILURE:
            case PlanStage::DEAD:
                *out = id;

                // If a stage fails, it may create a status WSM to indicate why it failed, in which
                // case 'id' is valid.  If ID is invalid, we create our own error message.
                if (WorkingSet::INVALID_ID == id) {
                    const std::string errmsg = "delete stage failed to read in results from child";
                    *out = WorkingSetCommon::allocateStatusMember(
                        _ws, Status(ErrorCodes::InternalError, errmsg));
                }
                return status;

            case PlanStage::NEED_TIME:
                ++_commonStats.needTime;
                return status;

            case PlanStage::NEED_YIELD:
                *out = id;
                ++_commonStats.needYield;
                return status;

            case PlanStage::IS_EOF:
                return status;

            default:
                MONGO_UNREACHABLE;
        }
    }

    // We advanced, or are retrying, and id is set to the WSM to work on.
    WorkingSetMember* member = _ws->get(id);

    // We want to free this member when we return, unless we need to retry it.
    ScopeGuard memberFreer = MakeGuard(&WorkingSet::free, _ws, id);

    if (!member->hasLoc()) {
        // We expect to be here because of an invalidation causing a force-fetch.

        // When we're doing a findAndModify with a sort, the sort will have a limit of 1, so will
        // not produce any more results even if there is another matching document. Throw a WCE here
        // so that these operations get another chance to find a matching document. The
        // findAndModify command should automatically retry if it gets a WCE.
        // TODO: this is not necessary if there was no sort specified.
        if (_params.returnDeleted) {
            throw WriteConflictException();
        }

        ++_specificStats.nInvalidateSkips;
        ++_commonStats.needTime;
        return PlanStage::NEED_TIME;
    }
    RecordId rloc = member->loc;
    // Deletes can't have projections. This means that covering analysis will always add
    // a fetch. We should always get fetched data, and never just key data.
    invariant(member->hasObj());

    try {
        // If the snapshot changed, then we have to make sure we have the latest copy of the
        // doc and that it still matches.
        std::unique_ptr<SeekableRecordCursor> cursor;
        if (getOpCtx()->recoveryUnit()->getSnapshotId() != member->obj.snapshotId()) {
            cursor = _collection->getCursor(getOpCtx());
            if (!WorkingSetCommon::fetch(getOpCtx(), _ws, id, cursor)) {
                // Doc is already deleted. Nothing more to do.
                ++_commonStats.needTime;
                return PlanStage::NEED_TIME;
            }

            // Make sure the re-fetched doc still matches the predicate.
            if (_params.canonicalQuery &&
                !_params.canonicalQuery->root()->matchesBSON(member->obj.value(), NULL)) {
                // Doesn't match.
                ++_commonStats.needTime;
                return PlanStage::NEED_TIME;
            }
        }

        // Ensure that the BSONObj underlying the WorkingSetMember is owned because saveState()
        // is allowed to free the memory.
        if (_params.returnDeleted) {
            // Save a copy of the document that is about to get deleted, but keep it in the
            // LOC_AND_OBJ state in case we need to retry deleting it.
            BSONObj deletedDoc = member->obj.value();
            member->obj.setValue(deletedDoc.getOwned());
        }

        // TODO: Do we want to buffer docs and delete them in a group rather than
        // saving/restoring state repeatedly?

        try {
            if (supportsDocLocking()) {
                // Doc-locking engines require this before saveState() since they don't use
                // invalidations.
                WorkingSetCommon::prepareForSnapshotChange(_ws);
            }
            child()->saveState();
        } catch (const WriteConflictException& wce) {
            std::terminate();
        }

        // Do the write, unless this is an explain.
        if (!_params.isExplain) {
            WriteUnitOfWork wunit(getOpCtx());
            _collection->deleteDocument(getOpCtx(), rloc);
            wunit.commit();
        }

        ++_specificStats.docsDeleted;
    } catch (const WriteConflictException& wce) {
        // When we're doing a findAndModify with a sort, the sort will have a limit of 1, so will
        // not produce any more results even if there is another matching document. Re-throw the WCE
        // here so that these operations get another chance to find a matching document. The
        // findAndModify command should automatically retry if it gets a WCE.
        // TODO: this is not necessary if there was no sort specified.
        if (_params.returnDeleted) {
            throw;
        }
        _idRetrying = id;
        memberFreer.Dismiss();  // Keep this member around so we can retry deleting it.
        *out = WorkingSet::INVALID_ID;
        _commonStats.needYield++;
        return NEED_YIELD;
    }

    if (_params.returnDeleted) {
        // After deleting the document, the RecordId associated with this member is invalid.
        // Remove the 'loc' from the WorkingSetMember before returning it.
        member->loc = RecordId();
        member->transitionToOwnedObj();
    }

    //  Because restoreState() may restore (recreate) cursors, cursors are tied to the
    //  transaction in which they are created, and a WriteUnitOfWork is a transaction,
    //  make sure to restore the state outside of the WriteUnitOfWork.
    try {
        child()->restoreState();
    } catch (const WriteConflictException& wce) {
        // Note we don't need to retry anything in this case since the delete already
        // was committed. However, we still need to return the deleted document
        // (if it was requested).
        if (_params.returnDeleted) {
            // member->obj should refer to the deleted document.
            invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

            _idReturning = id;
            // Keep this member around so that we can return it on the next work() call.
            memberFreer.Dismiss();
        }
        *out = WorkingSet::INVALID_ID;
        _commonStats.needYield++;
        return NEED_YIELD;
    }

    if (_params.returnDeleted) {
        // member->obj should refer to the deleted document.
        invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

        memberFreer.Dismiss();  // Keep this member around so we can return it.
        *out = id;
        ++_commonStats.advanced;
        return PlanStage::ADVANCED;
    }

    ++_commonStats.needTime;
    return PlanStage::NEED_TIME;
}
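The memberFreer guard above (MakeGuard(&WorkingSet::free, _ws, id) followed by Dismiss() on the paths that keep the member) is the classic scope-guard idiom: run a cleanup on every exit path unless it is explicitly dismissed. A self-contained sketch of that idiom in plain C++; this is not MongoDB's ScopeGuard implementation, just the shape of it:

    #include <functional>
    #include <iostream>

    class ScopeGuard {
    public:
        explicit ScopeGuard(std::function<void()> onExit) : _onExit(std::move(onExit)) {}
        ~ScopeGuard() {
            if (_active) _onExit();  // cleanup runs unless dismissed
        }
        void dismiss() { _active = false; }  // caller takes over the resource
    private:
        std::function<void()> _onExit;
        bool _active = true;
    };

    int main() {
        bool returnDeleted = true;  // stand-in for _params.returnDeleted
        ScopeGuard memberFreer([] { std::cout << "freeing working set member\n"; });
        if (returnDeleted) {
            memberFreer.dismiss();  // keep the member so it can be returned to the caller
        }
        // On scope exit the member is freed only if the guard was not dismissed.
    }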
Example no. 5
    CollectionMetadata* CollectionMetadata::cloneSplit( const ChunkType& chunk,
                                                        const vector<BSONObj>& splitKeys,
                                                        const ChunkVersion& newShardVersion,
                                                        string* errMsg ) const {
        // The error message string is optional.
        string dummy;
        if (errMsg == NULL) {
            errMsg = &dummy;
        }

        // The version required in both resulting chunks could be simply an increment in the
        // minor portion of the current version.  However, we are enforcing uniqueness over the
        // attributes <ns, version> of the configdb collection 'chunks'.  So in practice, a
        // migrate somewhere may force this split to pick up a version that has the major
        // portion higher than the one that this shard has been using.
        //
        // TODO drop the uniqueness constraint and tighten the check below so that only the
        // minor portion of version changes
        if (newShardVersion <= _shardVersion) {

            *errMsg = stream() << "cannot split chunk "
                               << rangeToString( chunk.getMin(), chunk.getMax() )
                               << ", new shard version "
                               << newShardVersion.toString()
                               << " is not greater than current version "
                               << _shardVersion.toString();

            warning() << *errMsg << endl;
            return NULL;
        }

        // Check that we have the exact chunk that will be subtracted.
        if ( !rangeMapContains( _chunksMap, chunk.getMin(), chunk.getMax() ) ) {

            *errMsg = stream() << "cannot split chunk "
                               << rangeToString( chunk.getMin(), chunk.getMax() )
                               << ", this shard does not contain the chunk";

            if ( rangeMapOverlaps( _chunksMap, chunk.getMin(), chunk.getMax() ) ) {

                RangeVector overlap;
                getRangeMapOverlap( _chunksMap, chunk.getMin(), chunk.getMax(), &overlap );

                *errMsg += stream() << " and it overlaps " << overlapToString( overlap );
            }

            warning() << *errMsg << endl;
            return NULL;
        }

        // Check that the split key is valid
        for ( vector<BSONObj>::const_iterator it = splitKeys.begin(); it != splitKeys.end(); ++it )
        {
            if (!rangeContains(chunk.getMin(), chunk.getMax(), *it)) {

                *errMsg = stream() << "cannot split chunk "
                                   << rangeToString( chunk.getMin(), chunk.getMax() ) << " at key "
                                   << *it;

                warning() << *errMsg << endl;
                return NULL;
            }
        }

        auto_ptr<CollectionMetadata> metadata(new CollectionMetadata);
        metadata->_keyPattern = this->_keyPattern;
        metadata->_keyPattern.getOwned();
        metadata->fillKeyPatternFields();
        metadata->_pendingMap = this->_pendingMap;
        metadata->_chunksMap = this->_chunksMap;
        metadata->_shardVersion = newShardVersion; // will increment 2nd, 3rd,... chunks below

        BSONObj startKey = chunk.getMin();
        for ( vector<BSONObj>::const_iterator it = splitKeys.begin(); it != splitKeys.end();
                ++it ) {
            BSONObj split = *it;
            metadata->_chunksMap[chunk.getMin()] = split.getOwned();
            metadata->_chunksMap.insert( make_pair( split.getOwned(), chunk.getMax().getOwned() ) );
            metadata->_shardVersion.incMinor();
            startKey = split;
        }

        metadata->_collVersion =
                metadata->_shardVersion > _collVersion ? metadata->_shardVersion : _collVersion;
        metadata->fillRanges();

        dassert(metadata->isValid());
        return metadata.release();
    }
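The chunk-map rewrite in cloneSplit() turns one [min, max) entry into several adjacent sub-ranges, bumping the shard version's minor portion once per split point. A simplified, standalone illustration of that bookkeeping (integer keys stand in for BSONObj bounds, and version handling is reduced to a counter):

    #include <iostream>
    #include <map>
    #include <vector>

    int main() {
        std::map<int, int> chunksMap{{0, 100}};   // one chunk: min -> max, i.e. [0, 100)
        const std::vector<int> splitKeys{25, 50};

        int startKey = 0;       // current sub-range's min
        const int maxKey = 100;
        int minorBumps = 0;

        for (int split : splitKeys) {
            chunksMap[startKey] = split;       // shrink the current range to [startKey, split)
            chunksMap.emplace(split, maxKey);  // add the remainder [split, maxKey)
            ++minorBumps;                      // one minor-version bump per split point
            startKey = split;
        }

        for (const auto& [min, max] : chunksMap)
            std::cout << "[" << min << ", " << max << ")\n";   // [0,25) [25,50) [50,100)
        std::cout << "minor version bumps: " << minorBumps << "\n";
    }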
Example no. 6
    /**
     * Run a query -- includes checking for and running a Command.
     * @return points to ns if exhaust mode; an empty string in normal mode
     * @locks the db mutex for reading (and potentially for writing temporarily to create a new db).
     * @asserts on scan and order memory exhaustion and other cases.
     */
    string runQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result) {
        shared_ptr<ParsedQuery> pq_shared( new ParsedQuery(q) );
        ParsedQuery& pq( *pq_shared );
        BSONObj jsobj = q.query;
        int queryOptions = q.queryOptions;
        const char *ns = q.ns;

        uassert( 16332 , "can't have an empty ns" , ns[0] );

        if( logLevel >= 2 )
            log() << "runQuery called " << ns << " " << jsobj << endl;

        curop.debug().ns = ns;
        curop.debug().ntoreturn = pq.getNumToReturn();
        curop.debug().query = jsobj;
        curop.setQuery(jsobj);

        uassert( 16256, str::stream() << "Invalid ns [" << ns << "]", NamespaceString::isValid(ns) );

        // Run a command.
        
        if ( pq.couldBeCommand() ) {
            curop.markCommand();
            BufBuilder bb;
            bb.skip(sizeof(QueryResult));
            BSONObjBuilder cmdResBuf;
            if ( runCommands(ns, jsobj, curop, bb, cmdResBuf, false, queryOptions) ) {
                curop.debug().iscommand = true;
                curop.debug().query = jsobj;

                auto_ptr< QueryResult > qr;
                qr.reset( (QueryResult *) bb.buf() );
                bb.decouple();
                qr->setResultFlagsToOk();
                qr->len = bb.len();
                curop.debug().responseLength = bb.len();
                qr->setOperation(opReply);
                qr->cursorId = 0;
                qr->startingFrom = 0;
                qr->nReturned = 1;
                result.setData( qr.release(), true );
            }
            else {
                uasserted(13530, "bad or malformed command request?");
            }
            return "";
        }

        const bool explain = pq.isExplain();
        const bool tailable = pq.hasOption(QueryOption_CursorTailable);
        BSONObj order = pq.getOrder();
        BSONObj query = pq.getFilter();

        /* The ElemIter will not be happy if this isn't really an object. So throw exception
           here when that is true.
           (Which may indicate bad data from client.)
        */
        if ( query.objsize() == 0 ) {
            out() << "Bad query object?\n  jsobj:";
            out() << jsobj.toString() << "\n  query:";
            out() << query.toString() << endl;
            uassert( 10110 , "bad query object", false);
        }

        // Tailable cursors need to read newly written entries from the tail
        // of the collection. They manually arbitrate with the collection over
        // what data is readable and when, so we choose read uncommitted isolation.
        OpSettings settings;
        settings.setQueryCursorMode(DEFAULT_LOCK_CURSOR);
        settings.setBulkFetch(true);
        settings.setCappedAppendPK(pq.hasOption(QueryOption_AddHiddenPK));
        cc().setOpSettings(settings);

        // If our caller has a transaction, it's multi-statement.
        const bool inMultiStatementTxn = cc().hasTxn();
        if (tailable) {
            // Because it's easier to disable this. It shouldn't be happening in a normal system.
            uassert(16812, "May not perform a tailable query in a multi-statement transaction.",
                           !inMultiStatementTxn);
        }

        // Begin a read-only, snapshot transaction under normal circumstances.
        // If the cursor is tailable, we need to be able to read uncommitted data.
        const int txnFlags = (tailable ? DB_READ_UNCOMMITTED : DB_TXN_SNAPSHOT) | DB_TXN_READ_ONLY;
        LOCK_REASON(lockReason, "query");
        Client::ReadContext ctx(ns, lockReason);
        scoped_ptr<Client::Transaction> transaction(!inMultiStatementTxn ?
                                                    new Client::Transaction(txnFlags) : NULL);

        bool hasRetried = false;
        while ( 1 ) {
            try {
                replVerifyReadsOk(&pq);

                // Fast-path for primary key queries.
                if (!explain && !tailable) {
                    replVerifyReadsOk(&pq);
                    if (_tryQueryByPKHack(ns, query, pq, curop, result)) {
                        if (transaction) {
                            transaction->commit();
                        }
                        return "";
                    }
                }

                // sanity check the query and projection
                if (pq.getFields() != NULL) {
                    pq.getFields()->validateQuery( query );
                }

                        
                if (tailable) {
                    Collection *cl = getCollection( ns );
                    if (cl != NULL && !(cl->isCapped() || str::equals(ns, rsoplog))) {
                        uasserted( 13051, "tailable cursor requested on non-capped, non-oplog collection" );
                    }
                    const BSONObj nat1 = BSON( "$natural" << 1 );
                    if ( order.isEmpty() ) {
                        order = nat1;
                    } else {
                        uassert( 13052, "only {$natural:1} order allowed for tailable cursor", order == nat1 );
                    }
                }
                    
                // Run a regular query.

                // these may now be stored in a ClientCursor or somewhere else,
                // so make sure we use a real copy
                jsobj = jsobj.getOwned();
                query = query.getOwned();
                order = order.getOwned();
                const ConfigVersion shardingVersionAtStart = shardingState.getVersion( ns );
                const bool getCachedExplainPlan = ! hasRetried && explain && ! pq.hasIndexSpecifier();
                const bool savedCursor = queryWithQueryOptimizer( queryOptions, ns, jsobj, curop, query,
                                                                  order, pq_shared, shardingVersionAtStart,
                                                                  getCachedExplainPlan, inMultiStatementTxn,
                                                                  result );
                // Did not save the cursor, so we can commit the transaction now if it exists.
                if (transaction && !savedCursor) {
                    transaction->commit();
                }
                return curop.debug().exhaust ? ns : "";
            }
            catch ( const QueryRetryException & ) {
                // In some cases the query may be retried if there is an in memory sort size assertion.
                verify( ! hasRetried );
                hasRetried = true;
            }
        }
    }
Example no. 7
    bool runParsed(OperationContext* txn,
                   const NamespaceString& origNss,
                   const AggregationRequest& request,
                   BSONObj& cmdObj,
                   string& errmsg,
                   BSONObjBuilder& result) {
        // For operations on views, this will be the underlying namespace.
        const NamespaceString& nss = request.getNamespaceString();

        // Set up the ExpressionContext.
        intrusive_ptr<ExpressionContext> expCtx = new ExpressionContext(txn, request);
        expCtx->tempDir = storageGlobalParams.dbpath + "/_tmp";

        // Parse the pipeline.
        auto statusWithPipeline = Pipeline::parse(request.getPipeline(), expCtx);
        if (!statusWithPipeline.isOK()) {
            return appendCommandStatus(result, statusWithPipeline.getStatus());
        }
        auto pipeline = std::move(statusWithPipeline.getValue());

        auto resolvedNamespaces = resolveInvolvedNamespaces(txn, pipeline, expCtx);
        if (!resolvedNamespaces.isOK()) {
            return appendCommandStatus(result, resolvedNamespaces.getStatus());
        }
        expCtx->resolvedNamespaces = std::move(resolvedNamespaces.getValue());

        unique_ptr<ClientCursorPin> pin;  // either this OR the exec will be non-null
        unique_ptr<PlanExecutor> exec;
        auto curOp = CurOp::get(txn);
        {
            // This will throw if the sharding version for this connection is out of date. If the
            // namespace is a view, the lock will be released before re-running the aggregation.
            // Otherwise, the lock must be held continuously from now until we have created both
            // the output ClientCursor and the input executor. This ensures that both are using the
            // same sharding version that we synchronize on here. This is also why we always need to
            // create a ClientCursor even when we aren't outputting to a cursor. See the comment on
            // ShardFilterStage for more details.
            AutoGetCollectionOrViewForRead ctx(txn, nss);
            Collection* collection = ctx.getCollection();

            // If running $collStats on a view, we do not resolve the view since we want stats
            // on this view namespace.
            auto startsWithCollStats = [&pipeline]() {
                const Pipeline::SourceContainer& sources = pipeline->getSources();
                return !sources.empty() &&
                    dynamic_cast<DocumentSourceCollStats*>(sources.front().get());
            };

            // If this is a view, resolve it by finding the underlying collection and stitching view
            // pipelines and this request's pipeline together. We then release our locks before
            // recursively calling run, which will re-acquire locks on the underlying collection.
            // (The lock must be released because recursively acquiring locks on the database will
            // prohibit yielding.)
            auto view = ctx.getView();
            if (view && !startsWithCollStats()) {
                auto viewDefinition =
                    ViewShardingCheck::getResolvedViewIfSharded(txn, ctx.getDb(), view);
                if (!viewDefinition.isOK()) {
                    return appendCommandStatus(result, viewDefinition.getStatus());
                }

                if (!viewDefinition.getValue().isEmpty()) {
                    ViewShardingCheck::appendShardedViewStatus(viewDefinition.getValue(), &result);
                    return false;
                }

                auto resolvedView = ctx.getDb()->getViewCatalog()->resolveView(txn, nss);
                if (!resolvedView.isOK()) {
                    return appendCommandStatus(result, resolvedView.getStatus());
                }

                // With the view resolved, we can relinquish locks.
                ctx.releaseLocksForView();

                // Parse the resolved view into a new aggregation request.
                auto newCmd = resolvedView.getValue().asExpandedViewAggregation(request);
                if (!newCmd.isOK()) {
                    return appendCommandStatus(result, newCmd.getStatus());
                }
                auto newNss = resolvedView.getValue().getNamespace();
                auto newRequest = AggregationRequest::parseFromBSON(newNss, newCmd.getValue());
                if (!newRequest.isOK()) {
                    return appendCommandStatus(result, newRequest.getStatus());
                }

                bool status = runParsed(
                    txn, origNss, newRequest.getValue(), newCmd.getValue(), errmsg, result);
                {
                    // Set the namespace of the curop back to the view namespace so ctx records
                    // stats on this view namespace on destruction.
                    stdx::lock_guard<Client>(*txn->getClient());
                    curOp->setNS_inlock(nss.ns());
                }
                return status;
            }

            // If the pipeline does not have a user-specified collation, set it from the collection
            // default.
            if (request.getCollation().isEmpty() && collection &&
                collection->getDefaultCollator()) {
                invariant(!expCtx->getCollator());
                expCtx->setCollator(collection->getDefaultCollator()->clone());
            }

            // Propagate the ExpressionContext throughout all of the pipeline's stages and
            // expressions.
            pipeline->injectExpressionContext(expCtx);

            // The pipeline must be optimized after the correct collator has been set on it (by
            // injecting the ExpressionContext containing the collator). This is necessary because
            // optimization may make string comparisons, e.g. optimizing {$eq: [<str1>, <str2>]} to
            // a constant.
            pipeline->optimizePipeline();

            if (kDebugBuild && !expCtx->isExplain && !expCtx->inShard) {
                // Make sure all operations round-trip through Pipeline::serialize() correctly by
                // re-parsing every command in debug builds. This is important because sharded
                // aggregations rely on this ability.  Skipping when inShard because this has
                // already been through the transformation (and this un-sets expCtx->inShard).
                pipeline = reparsePipeline(pipeline, request, expCtx);
            }

            // This does mongod-specific stuff like creating the input PlanExecutor and adding
            // it to the front of the pipeline if needed.
            PipelineD::prepareCursorSource(collection, pipeline);

            // Create the PlanExecutor which returns results from the pipeline. The WorkingSet
            // ('ws') and the PipelineProxyStage ('proxy') will be owned by the created
            // PlanExecutor.
            auto ws = make_unique<WorkingSet>();
            auto proxy = make_unique<PipelineProxyStage>(txn, pipeline, ws.get());

            auto statusWithPlanExecutor = (NULL == collection)
                ? PlanExecutor::make(
                      txn, std::move(ws), std::move(proxy), nss.ns(), PlanExecutor::YIELD_MANUAL)
                : PlanExecutor::make(
                      txn, std::move(ws), std::move(proxy), collection, PlanExecutor::YIELD_MANUAL);
            invariant(statusWithPlanExecutor.isOK());
            exec = std::move(statusWithPlanExecutor.getValue());

            {
                auto planSummary = Explain::getPlanSummary(exec.get());
                stdx::lock_guard<Client>(*txn->getClient());
                curOp->setPlanSummary_inlock(std::move(planSummary));
            }

            if (collection) {
                PlanSummaryStats stats;
                Explain::getSummaryStats(*exec, &stats);
                collection->infoCache()->notifyOfQuery(txn, stats.indexesUsed);
            }

            if (collection) {
                const bool isAggCursor = true;  // enable special locking behavior
                ClientCursor* cursor =
                    new ClientCursor(collection->getCursorManager(),
                                     exec.release(),
                                     nss.ns(),
                                     txn->recoveryUnit()->isReadingFromMajorityCommittedSnapshot(),
                                     0,
                                     cmdObj.getOwned(),
                                     isAggCursor);
                pin.reset(new ClientCursorPin(collection->getCursorManager(), cursor->cursorid()));
                // Don't add any code between here and the start of the try block.
            }

            // At this point, it is safe to release the collection lock.
            // - In the case where we have a collection: we will need to reacquire the
            //   collection lock later when cleaning up our ClientCursorPin.
            // - In the case where we don't have a collection: our PlanExecutor won't be
            //   registered, so it will be safe to clean it up outside the lock.
            invariant(!exec || !collection);
        }

        try {
            // Unless set to true, the ClientCursor created above will be deleted on block exit.
            bool keepCursor = false;

            // Use of the aggregate command without specifying to use a cursor is deprecated.
            // Applications should migrate to using cursors. Cursors are strictly more useful than
            // outputting the results as a single document, since results that fit inside a single
            // BSONObj will also fit inside a single batch.
            //
            // We occasionally log a deprecation warning.
            if (!request.isCursorCommand()) {
                RARELY {
                    warning()
                        << "Use of the aggregate command without the 'cursor' "
                           "option is deprecated. See "
                           "http://dochub.mongodb.org/core/aggregate-without-cursor-deprecation.";
                }
            }

            // If both explain and cursor are specified, explain wins.
            if (expCtx->isExplain) {
                result << "stages" << Value(pipeline->writeExplainOps());
            } else if (request.isCursorCommand()) {
                keepCursor = handleCursorCommand(txn,
                                                 origNss.ns(),
                                                 pin.get(),
                                                 pin ? pin->c()->getExecutor() : exec.get(),
                                                 request,
                                                 result);
            } else {
                pipeline->run(result);
            }

            if (!expCtx->isExplain) {
                PlanSummaryStats stats;
                Explain::getSummaryStats(pin ? *pin->c()->getExecutor() : *exec.get(), &stats);
                curOp->debug().setPlanSummaryMetrics(stats);
                curOp->debug().nreturned = stats.nReturned;
            }

            // Clean up our ClientCursorPin, if needed.  We must reacquire the collection lock
            // in order to do so.
            if (pin) {
                // We acquire locks here with DBLock and CollectionLock instead of using
                // AutoGetCollectionForRead.  AutoGetCollectionForRead will throw if the
                // sharding version is out of date, and we don't care if the sharding version
                // has changed.
                Lock::DBLock dbLock(txn->lockState(), nss.db(), MODE_IS);
                Lock::CollectionLock collLock(txn->lockState(), nss.ns(), MODE_IS);
                if (keepCursor) {
                    pin->release();
                } else {
                    pin->deleteUnderlying();
                }
            }
        } catch (...) {
Example no. 8
    /**
     * Do the initial sync for this member.
     */
    void ReplSetImpl::_syncDoInitialSync() {
        replset::InitialSync init(replset::BackgroundSync::get());
        sethbmsg("initial sync pending",0);

        // if this is the first node, it may have already become primary
        if ( box.getState().primary() ) {
            sethbmsg("I'm already primary, no need for initial sync",0);
            return;
        }

        const Member *source = getMemberToSyncTo();
        if (!source) {
            sethbmsg("initial sync need a member to be primary or secondary to do our initial sync", 0);
            sleepsecs(15);
            return;
        }

        string sourceHostname = source->h().toString();
        init.setHostname(sourceHostname);
        OplogReader r;
        if( !r.connect(sourceHostname) ) {
            sethbmsg( str::stream() << "initial sync couldn't connect to " << source->h().toString() , 0);
            sleepsecs(15);
            return;
        }

        BSONObj lastOp = r.getLastOp(rsoplog);
        if( lastOp.isEmpty() ) {
            sethbmsg("initial sync couldn't read remote oplog", 0);
            sleepsecs(15);
            return;
        }

        if (replSettings.fastsync) {
            log() << "fastsync: skipping database clone" << rsLog;

            // prime oplog
            init.oplogApplication(lastOp, lastOp);
            return;
        }
        else {
            sethbmsg("initial sync drop all databases", 0);
            dropAllDatabasesExceptLocal();

            sethbmsg("initial sync clone all databases", 0);

            list<string> dbs = r.conn()->getDatabaseNames();

            if ( ! _syncDoInitialSync_clone( sourceHostname.c_str(), dbs, true ) ) {
                veto(source->fullName(), 600);
                sleepsecs(300);
                return;
            }

            sethbmsg("initial sync data copy, starting syncup",0);
            
            BSONObj minValid;
            if ( ! _syncDoInitialSync_applyToHead( init, &r , source , lastOp , minValid ) ) {
                return;
            }

            lastOp = minValid;
            // it's currently important that lastOp is equal to the last op we actually pulled
            // this is because the background thread only pulls each op once now
            // so if it's not, we'll be waiting forever
            {
                // this takes whatever the last op we got is
                // and stores it locally before we wipe it out below
                Lock::DBRead lk(rsoplog);
                Helpers::getLast(rsoplog, lastOp);
                lastOp = lastOp.getOwned();
            }

            // reset state, as that "didn't count"
            emptyOplog(); 
            lastOpTimeWritten = OpTime();
            lastH = 0;

            sethbmsg("initial sync building indexes",0);
            if ( ! _syncDoInitialSync_clone( sourceHostname.c_str(), dbs, false ) ) {
                veto(source->fullName(), 600);
                sleepsecs(300);
                return;
            }
        }

        sethbmsg("initial sync query minValid",0);

        BSONObj minValid;
        if ( ! _syncDoInitialSync_applyToHead( init, &r, source, lastOp, minValid ) ) {
            return;
        }
        
        // ---------


        sethbmsg("initial sync finishing up",0);

        verify( !box.getState().primary() ); // wouldn't make sense if we were.

        {
            Client::WriteContext cx( "local." );
            cx.ctx().db()->flushFiles(true);
            try {
                log() << "replSet set minValid=" << minValid["ts"]._opTime().toString() << rsLog;
            }
            catch(...) { }

            theReplSet->setMinValid(minValid);

            cx.ctx().db()->flushFiles(true);
        }

        changeState(MemberState::RS_RECOVERING);
        sethbmsg("initial sync done",0);
    }
Example no. 9
    int ConfigDiffTracker<ValType,ShardType>::
        calculateConfigDiff( DBClientCursorInterface& diffCursor )
    {
        MONGO_LOG_DEFAULT_COMPONENT_LOCAL(::mongo::logger::LogComponent::kSharding);

        verifyAttached();

        // Apply the chunk changes to the ranges and versions

        //
        // Overall idea here is to work in two steps :
        // 1. For all the new chunks we find, increment the maximum version per-shard and
        //    per-collection, and remove any conflicting chunks from the ranges
        // 2. For all the new chunks we're interested in (all of them for mongos, just chunks on the
        //    shard for mongod) add them to the ranges
        //

        vector<BSONObj> newTracked;
        // Store epoch now so it doesn't change when we change max
        OID currEpoch = _maxVersion->epoch();

        _validDiffs = 0;
        while( diffCursor.more() ){

            BSONObj diffChunkDoc = diffCursor.next();

            ChunkVersion chunkVersion = ChunkVersion::fromBSON(diffChunkDoc, ChunkType::DEPRECATED_lastmod());

            if( diffChunkDoc[ChunkType::min()].type() != Object ||
                diffChunkDoc[ChunkType::max()].type() != Object ||
                diffChunkDoc[ChunkType::shard()].type() != String )
            {
                warning() << "got invalid chunk document " << diffChunkDoc
                          << " when trying to load differing chunks" << endl;
                continue;
            }

            if( ! chunkVersion.isSet() || ! chunkVersion.hasEqualEpoch( currEpoch ) ){

                warning() << "got invalid chunk version " << chunkVersion << " in document " << diffChunkDoc
                          << " when trying to load differing chunks at version "
                          << ChunkVersion( _maxVersion->majorVersion(),
                                           _maxVersion->minorVersion(),
                                           currEpoch ) << endl;

                // Don't keep loading, since we know we'll be broken here
                return -1;
            }

            _validDiffs++;

            // Get max changed version and chunk version
            if( chunkVersion > *_maxVersion ) *_maxVersion = chunkVersion;

            // Chunk version changes
            ShardType shard = shardFor( diffChunkDoc[ChunkType::shard()].String() );
            typename map<ShardType, ChunkVersion>::iterator shardVersionIt = _maxShardVersions->find( shard );
            if( shardVersionIt == _maxShardVersions->end() || shardVersionIt->second < chunkVersion ){
                (*_maxShardVersions)[ shard ] = chunkVersion;
            }

            // See if we need to remove any chunks we are currently tracking b/c of this chunk's changes
            removeOverlapping(diffChunkDoc[ChunkType::min()].Obj(),
                              diffChunkDoc[ChunkType::max()].Obj());

            // Figure out which of the new chunks we need to track
            // Important - we need to actually own this doc, in case the cursor decides to getMore or unbuffer
            if( isTracked( diffChunkDoc ) ) newTracked.push_back( diffChunkDoc.getOwned() );
        }

        LOG(3) << "found " << _validDiffs
               << " new chunks for collection " << _ns
               << " (tracking " << newTracked.size()
               << "), new version is " << *_maxVersion
               << endl;

        for( vector<BSONObj>::iterator it = newTracked.begin(); it != newTracked.end(); it++ ){

            BSONObj chunkDoc = *it;

            // Important - we need to make sure we actually own the min and max here
            BSONObj min = chunkDoc[ChunkType::min()].Obj().getOwned();
            BSONObj max = chunkDoc[ChunkType::max()].Obj().getOwned();

            // Invariant enforced by sharding
            // It's possible to read inconsistent state b/c of getMore() and yielding, so we want
            // to detect as early as possible.
            // TODO: This checks for overlap, we also should check for holes here iff we're tracking
            // all chunks
            if( isOverlapping( min, max ) ) return -1;

            _currMap->insert( rangeFor( chunkDoc, min, max ) );
        }

        return _validDiffs;
    }
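The first pass above is essentially a running maximum: for every valid chunk document, keep the highest version seen for the collection and the highest version seen per shard. A reduced sketch of that accumulation with plain standard types, where a pair<unsigned, unsigned> stands in for ChunkVersion's major/minor portions:

    #include <iostream>
    #include <map>
    #include <string>
    #include <utility>
    #include <vector>

    using ChunkVersion = std::pair<unsigned, unsigned>;  // (major, minor), compared lexicographically

    int main() {
        struct ChunkDoc { std::string shard; ChunkVersion version; };
        const std::vector<ChunkDoc> diff{
            {"shard0", {3, 1}}, {"shard1", {3, 4}}, {"shard0", {4, 0}},
        };

        ChunkVersion maxVersion{0, 0};
        std::map<std::string, ChunkVersion> maxShardVersions;

        for (const ChunkDoc& doc : diff) {
            if (maxVersion < doc.version)
                maxVersion = doc.version;                     // collection-wide maximum
            auto it = maxShardVersions.find(doc.shard);
            if (it == maxShardVersions.end() || it->second < doc.version)
                maxShardVersions[doc.shard] = doc.version;    // per-shard maximum
        }

        std::cout << "max: " << maxVersion.first << "|" << maxVersion.second << "\n";
        for (const auto& [shard, v] : maxShardVersions)
            std::cout << shard << ": " << v.first << "|" << v.second << "\n";
    }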
Example no. 10
    bool ClientInfo::getLastError( const BSONObj& options , BSONObjBuilder& result , bool fromWriteBackListener ) {
        set<string> * shards = getPrev();

        if ( shards->size() == 0 ) {
            result.appendNull( "err" );
            return true;
        }

        vector<WBInfo> writebacks;

        // handle single server
        if ( shards->size() == 1 ) {
            string theShard = *(shards->begin() );



            BSONObj res;
            bool ok = false;
            {
                ShardConnection conn( theShard , "" );
                try {
                    ok = conn->runCommand( "admin" , options , res );
                }
                catch( std::exception &e ) {

                    warning() << "could not get last error from shard " << theShard << causedBy( e ) << endl;

                    // Catch everything that happens here, since we need to ensure we return our connection when we're
                    // finished.
                    conn.done();

                    return false;
                }


                res = res.getOwned();
                conn.done();
            }

            _addWriteBack( writebacks , res );

            // hit other machines just to block
            for ( set<string>::const_iterator i=sinceLastGetError().begin(); i!=sinceLastGetError().end(); ++i ) {
                string temp = *i;
                if ( temp == theShard )
                    continue;

                try {
                    ShardConnection conn( temp , "" );
                    ON_BLOCK_EXIT_OBJ( conn, &ShardConnection::done );
                    _addWriteBack( writebacks , conn->getLastErrorDetailed() );

                }
                catch( std::exception &e ){
                    warning() << "could not clear last error from shard " << temp << causedBy( e ) << endl;
                }

            }
            clearSinceLastGetError();

            if ( writebacks.size() ){
                vector<BSONObj> v = _handleWriteBacks( writebacks , fromWriteBackListener );
                if ( v.size() == 0 && fromWriteBackListener ) {
                    // ok
                }
                else {
                    // this will usually be 1
                    // it can be greater than 1 if a write to a different shard
                    // than the last write op had a writeback
                    // all we're going to report is the first
                    // since that's the current write
                    // but we block for all
                    verify( v.size() >= 1 );
                    result.appendElements( v[0] );
                    result.appendElementsUnique( res );
                    result.append( "writebackGLE" , v[0] );
                    result.append( "initialGLEHost" , theShard );
                }
            }
            else {
                result.append( "singleShard" , theShard );
                result.appendElements( res );
            }

            return ok;
        }

        BSONArrayBuilder bbb( result.subarrayStart( "shards" ) );
        BSONObjBuilder shardRawGLE;

        long long n = 0;

        int updatedExistingStat = 0; // 0 is none, -1 has but false, 1 has true

        // hit each shard
        vector<string> errors;
        vector<BSONObj> errorObjects;
        for ( set<string>::iterator i = shards->begin(); i != shards->end(); i++ ) {
            string theShard = *i;
            bbb.append( theShard );
            boost::scoped_ptr<ShardConnection> conn;
            BSONObj res;
            bool ok = false;
            try {
                conn.reset( new ShardConnection( theShard , "" ) ); // constructor can throw if shard is down
                ok = (*conn)->runCommand( "admin" , options , res );
                shardRawGLE.append( theShard , res );
            }
            catch( std::exception &e ){

                // Safe to return here, since we haven't started any extra processing yet, just collecting
                // responses.

                warning() << "could not get last error from a shard " << theShard << causedBy( e ) << endl;
                conn->done();

                return false;
            }

            _addWriteBack( writebacks, res );

            string temp = DBClientWithCommands::getLastErrorString( res );
            if ( (*conn)->type() != ConnectionString::SYNC && ( ok == false || temp.size() ) ) {
                errors.push_back( temp );
                errorObjects.push_back( res );
            }

            n += res["n"].numberLong();
            if ( res["updatedExisting"].type() ) {
                if ( res["updatedExisting"].trueValue() )
                    updatedExistingStat = 1;
                else if ( updatedExistingStat == 0 )
                    updatedExistingStat = -1;
            }

            conn->done();
        }

        bbb.done();
        result.append( "shardRawGLE" , shardRawGLE.obj() );

        result.appendNumber( "n" , n );
        if ( updatedExistingStat )
            result.appendBool( "updatedExisting" , updatedExistingStat > 0 );

        // hit other machines just to block
        for ( set<string>::const_iterator i=sinceLastGetError().begin(); i!=sinceLastGetError().end(); ++i ) {
            string temp = *i;
            if ( shards->count( temp ) )
                continue;

            ShardConnection conn( temp , "" );
            try {
                _addWriteBack( writebacks, conn->getLastErrorDetailed() );
            }
            catch( std::exception &e ){
                warning() << "could not clear last error from a shard " << temp << causedBy( e ) << endl;
            }
            conn.done();
        }
        clearSinceLastGetError();

        if ( errors.size() == 0 ) {
            result.appendNull( "err" );
            _handleWriteBacks( writebacks , fromWriteBackListener );
            return true;
        }

        result.append( "err" , errors[0].c_str() );

        {
            // errs
            BSONArrayBuilder all( result.subarrayStart( "errs" ) );
            for ( unsigned i=0; i<errors.size(); i++ ) {
                all.append( errors[i].c_str() );
            }
            all.done();
        }

        {
            // errObjects
            BSONArrayBuilder all( result.subarrayStart( "errObjects" ) );
            for ( unsigned i=0; i<errorObjects.size(); i++ ) {
                all.append( errorObjects[i] );
            }
            all.done();
        }
        _handleWriteBacks( writebacks , fromWriteBackListener );
        return true;
    }
Example no. 11
    PlanStage::StageState IndexScan::work(WorkingSetID* out) {
        ++_commonStats.works;

        // Adds the amount of time taken by work() to executionTimeMillis.
        ScopedTimer timer(&_commonStats.executionTimeMillis);

        // If we examined multiple keys in a prior work cycle, make up for it here by returning
        // NEED_TIME. This is done for plan ranking. Refer to the comment for '_checkEndKeys'
        // in the .h for details.
        if (_checkEndKeys > 0) {
            --_checkEndKeys;
            ++_commonStats.needTime;
            return PlanStage::NEED_TIME;
        }

        if (NULL == _indexCursor.get()) {
            // First call to work().  Perform possibly heavy init.
            initIndexScan();
            checkEnd();
            ++_commonStats.needTime;
            return PlanStage::NEED_TIME;
        }
        else if (_yieldMovedCursor) {
            _yieldMovedCursor = false;
            // Note that we're not calling next() here.  We got the next thing when we recovered
            // from yielding.
        }

        if (isEOF()) { return PlanStage::IS_EOF; }

        // Grab the next (key, value) from the index.
        BSONObj keyObj = _indexCursor->getKey();
        DiskLoc loc = _indexCursor->getValue();

        // Move to the next result.
        // The underlying IndexCursor points at the *next* thing we want to return.  We do this so
        // that if we're scanning an index looking for docs to delete we don't continually clobber
        // the thing we're pointing at.
        _indexCursor->next();
        checkEnd();

        if (_shouldDedup) {
            ++_specificStats.dupsTested;
            if (_returned.end() != _returned.find(loc)) {
                ++_specificStats.dupsDropped;
                ++_commonStats.needTime;
                return PlanStage::NEED_TIME;
            }
            else {
                _returned.insert(loc);
            }
        }

        if (Filter::passes(keyObj, _keyPattern, _filter)) {
            if (NULL != _filter) {
                ++_specificStats.matchTested;
            }

            // We must make a copy of the on-disk data since it can mutate during the execution of
            // this query.
            BSONObj ownedKeyObj = keyObj.getOwned();

            // Fill out the WSM.
            WorkingSetID id = _workingSet->allocate();
            WorkingSetMember* member = _workingSet->get(id);
            member->loc = loc;
            member->keyData.push_back(IndexKeyDatum(_keyPattern, ownedKeyObj));
            member->state = WorkingSetMember::LOC_AND_IDX;

            if (_params.addKeyMetadata) {
                BSONObjBuilder bob;
                bob.appendKeys(_keyPattern, ownedKeyObj);
                member->addComputed(new IndexKeyComputedData(bob.obj()));
            }

            *out = id;
            ++_commonStats.advanced;
            return PlanStage::ADVANCED;
        }

        ++_commonStats.needTime;
        return PlanStage::NEED_TIME;
    }
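When the same record can be reached through more than one index key (for example with a multikey index), the stage deduplicates by remembering which record locations it has already returned, as _shouldDedup and _returned do above. A small standalone sketch of that test-and-insert pattern, with long standing in for DiskLoc:

    #include <iostream>
    #include <set>
    #include <vector>

    int main() {
        const std::vector<long> locsFromIndex{10, 42, 10, 7, 42};  // 10 and 42 appear twice
        std::set<long> returned;

        for (long loc : locsFromIndex) {
            // insert() reports whether the location was newly added; if not,
            // we've already returned this record via another key, so skip it.
            if (!returned.insert(loc).second)
                continue;
            std::cout << "returning record at " << loc << "\n";
        }
    }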
Example no. 12
    bool run(OperationContext* txn,
             const string& dbname,
             BSONObj& jsobj,
             int,
             string& errmsg,
             BSONObjBuilder& result) {
        const std::string ns = parseNs(dbname, jsobj);

        md5digest d;
        md5_state_t st;
        md5_init(&st);

        int n = 0;

        bool partialOk = jsobj["partialOk"].trueValue();
        if (partialOk) {
            // WARNING: This code depends on the binary layout of md5_state. It will not be
            // compatible with different md5 libraries or work correctly in an environment with
            // mongods of different endianness. It is ok for mongos to have a different
            // endianness since it just passes the buffer through to another mongod.
            BSONElement stateElem = jsobj["md5state"];
            if (!stateElem.eoo()) {
                int len;
                const char* data = stateElem.binDataClean(len);
                massert(16247, "md5 state not correct size", len == sizeof(st));
                memcpy(&st, data, sizeof(st));
            }
            n = jsobj["startAt"].numberInt();
        }

        BSONObj query = BSON("files_id" << jsobj["filemd5"] << "n" << GTE << n);
        BSONObj sort = BSON("files_id" << 1 << "n" << 1);

        MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
            CanonicalQuery* cq;
            if (!CanonicalQuery::canonicalize(ns, query, sort, BSONObj(), &cq).isOK()) {
                uasserted(17240, "Can't canonicalize query " + query.toString());
                return 0;
            }

            // Check shard version at startup.
            // This will throw before we've done any work if shard version is outdated
            // We drop and re-acquire these locks every document because md5'ing is expensive
            unique_ptr<AutoGetCollectionForRead> ctx(new AutoGetCollectionForRead(txn, ns));
            Collection* coll = ctx->getCollection();

            PlanExecutor* rawExec;
            if (!getExecutor(txn,
                             coll,
                             cq,
                             PlanExecutor::YIELD_MANUAL,
                             &rawExec,
                             QueryPlannerParams::NO_TABLE_SCAN).isOK()) {
                uasserted(17241, "Can't get executor for query " + query.toString());
                return 0;
            }

            unique_ptr<PlanExecutor> exec(rawExec);
            // Process notifications when the lock is released/reacquired in the loop below
            exec->registerExec();

            BSONObj obj;
            PlanExecutor::ExecState state;
            while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) {
                BSONElement ne = obj["n"];
                verify(ne.isNumber());
                int myn = ne.numberInt();
                if (n != myn) {
                    if (partialOk) {
                        break;  // skipped chunk is probably on another shard
                    }
                    log() << "should have chunk: " << n << " have:" << myn << endl;
                    dumpChunks(txn, ns, query, sort);
                    uassert(10040, "chunks out of order", n == myn);
                }

                // make a copy of obj since we access data in it while yielding locks
                BSONObj owned = obj.getOwned();
                exec->saveState();
                // UNLOCKED
                ctx.reset();

                int len;
                const char* data = owned["data"].binDataClean(len);
                // This is potentially an expensive operation, so do it out of the lock
                md5_append(&st, (const md5_byte_t*)(data), len);
                n++;

                try {
                    // RELOCKED
                    ctx.reset(new AutoGetCollectionForRead(txn, ns));
                } catch (const SendStaleConfigException& ex) {
                    LOG(1) << "chunk metadata changed during filemd5, will retarget and continue";
                    break;
                }

                // Have the lock again. See if we were killed.
                if (!exec->restoreState(txn)) {
                    if (!partialOk) {
                        uasserted(13281, "File deleted during filemd5 command");
                    }
                }
            }

            if (partialOk)
                result.appendBinData("md5state", sizeof(st), BinDataGeneral, &st);

            // This must be *after* the capture of md5state since it mutates st
            md5_finish(&st, d);

            result.append("numChunks", n);
            result.append("md5", digestToString(d));
        }
        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "filemd5", dbname);
        return true;
    }
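filemd5 folds each chunk's data into a running md5 state, and with partialOk the caller can export that state and resume hashing in a later call (or against another shard). A hedged sketch of the same resumable-digest pattern, using a simple FNV-1a accumulator instead of md5 so it stays self-contained (all names are made up for illustration):

    #include <cstdint>
    #include <iostream>
    #include <string>
    #include <vector>

    // Running digest state that can be exported and resumed later,
    // analogous to shipping md5state between filemd5 invocations.
    struct DigestState {
        std::uint64_t hash = 1469598103934665603ULL;  // FNV-1a offset basis
        int nextChunk = 0;                            // like the "startAt"/"n" counter
    };

    void appendChunk(DigestState* st, const std::string& data) {
        for (unsigned char c : data) {
            st->hash ^= c;
            st->hash *= 1099511628211ULL;             // FNV-1a prime
        }
        ++st->nextChunk;
    }

    int main() {
        std::vector<std::string> chunks = {"hello ", "resumable ", "digest"};

        // First pass: hash one chunk, then "export" the state.
        DigestState st;
        appendChunk(&st, chunks[0]);
        DigestState saved = st;                       // exported state (e.g. as binData)

        // Later pass: resume from the saved state and finish the remaining chunks.
        DigestState resumed = saved;
        for (std::size_t i = resumed.nextChunk; i < chunks.size(); ++i)
            appendChunk(&resumed, chunks[i]);

        std::cout << "chunks=" << resumed.nextChunk
                  << " digest=" << std::hex << resumed.hash << "\n";
        return 0;
    }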
Example no. 13
0
 void ParsedQuery::initFields( const BSONObj& fields ) {
     if ( fields.isEmpty() )
         return;
     _fields.reset( new Projection() );
     _fields->init( fields.getOwned() );
 }
Example no. 14
0
    Status MMAPV1DatabaseCatalogEntry::_renameSingleNamespace( OperationContext* txn,
                                                              const StringData& fromNS,
                                                              const StringData& toNS,
                                                              bool stayTemp ) {
        // some sanity checking
        NamespaceDetails* fromDetails = _namespaceIndex.details( fromNS );
        if ( !fromDetails )
            return Status( ErrorCodes::BadValue, "from namespace doesn't exist" );

        if ( _namespaceIndex.details( toNS ) )
            return Status( ErrorCodes::BadValue, "to namespace already exists" );

        _removeFromCache( fromNS );

        // at this point, we haven't done anything destructive yet

        // ----
        // actually start moving
        // ----

        // this could throw, but if it does we're ok
        _namespaceIndex.add_ns( txn, toNS, fromDetails );
        NamespaceDetails* toDetails = _namespaceIndex.details( toNS );

        try {
            toDetails->copyingFrom(txn,
                                   toNS,
                                   _namespaceIndex,
                                   fromDetails); // fixes extraOffset
        }
        catch( DBException& ) {
            // could end up here if .ns is full - if so try to clean up / roll back a little
            _namespaceIndex.kill_ns( txn, toNS );
            throw;
        }

        // at this point, the .ns metadata has been moved

        _namespaceIndex.kill_ns( txn, fromNS );
        fromDetails = NULL;

        // fix system.namespaces
        BSONObj newSpec;
        DiskLoc oldSpecLocation;
        {

            BSONObj oldSpec;
            {
                RecordStoreV1Base* rs = _getNamespaceRecordStore();
                scoped_ptr<RecordIterator> it( rs->getIterator(txn) );
                while ( !it->isEOF() ) {
                    DiskLoc loc = it->getNext();
                    BSONObj entry = it->dataFor( loc ).toBson();
                    if ( fromNS == entry["name"].String() ) {
                        oldSpecLocation = loc;
                        oldSpec = entry.getOwned();
                        break;
                    }
                }
            }
            invariant( !oldSpec.isEmpty() );
            invariant( !oldSpecLocation.isNull() );

            BSONObjBuilder b;
            BSONObjIterator i( oldSpec.getObjectField( "options" ) );
            while( i.more() ) {
                BSONElement e = i.next();
                if ( strcmp( e.fieldName(), "create" ) != 0 ) {
                    if (stayTemp || (strcmp(e.fieldName(), "temp") != 0))
                        b.append( e );
                }
                else {
                    b << "create" << toNS;
                }
            }
            newSpec = b.obj();
        }

        _addNamespaceToNamespaceCollection( txn, toNS, newSpec.isEmpty() ? 0 : &newSpec );

        _getNamespaceRecordStore()->deleteRecord( txn, oldSpecLocation );

        boost::mutex::scoped_lock lk( _collectionsLock );
        Entry*& entry = _collections[toNS.toString()];
        invariant( entry == NULL );
        entry = new Entry();
        _fillInEntry_inlock( txn, toNS, entry );

        return Status::OK();
    }
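The rename above is ordered so that nothing destructive happens until the copy has succeeded: the destination entry is created first, the metadata is copied into it inside a try block, and only then is the source removed; if the copy throws, the half-made destination is killed before rethrowing. A minimal sketch of that ordering over a plain std::map, assuming a copy step that may throw (types and names are illustrative):

    #include <iostream>
    #include <map>
    #include <stdexcept>
    #include <string>

    struct Metadata { std::string options; };

    // May throw, e.g. if there is no room for the new entry.
    void copyMetadata(const Metadata& from, Metadata* to) { *to = from; }

    void renameEntry(std::map<std::string, Metadata>& catalog,
                     const std::string& fromNS, const std::string& toNS) {
        auto fromIt = catalog.find(fromNS);
        if (fromIt == catalog.end()) throw std::runtime_error("from namespace doesn't exist");
        if (catalog.count(toNS)) throw std::runtime_error("to namespace already exists");

        // Create the destination first; nothing destructive has happened yet.
        Metadata& dest = catalog[toNS];
        try {
            copyMetadata(fromIt->second, &dest);
        } catch (...) {
            catalog.erase(toNS);   // roll back the half-created entry
            throw;
        }

        // Only after the copy succeeded do we drop the source.
        catalog.erase(fromNS);
    }

    int main() {
        std::map<std::string, Metadata> catalog{{"db.old", {"{}"}}};
        renameEntry(catalog, "db.old", "db.new");
        std::cout << "has db.new: " << catalog.count("db.new")
                  << ", has db.old: " << catalog.count("db.old") << "\n";
        return 0;
    }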
Example no. 15
0
        bool _run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
            if( cmdObj["replSetReconfig"].type() != Object ) {
                errmsg = "no configuration specified";
                return false;
            }

            // We might want to add the protocol version of theReplSet->config() if it exists,
            // instead of just blindly adding our compiled-in CURRENT_PROTOCOL_VERSION.  But for
            // TokuMX 1.0 it doesn't matter.
            BSONObj configObj = ReplSetConfig::addProtocolVersionIfMissing(cmdObj["replSetReconfig"].Obj());

            bool force = cmdObj.hasField("force") && cmdObj["force"].trueValue();
            if( force && !theReplSet ) {
                replSettings.reconfig = configObj.getOwned();
                result.append("msg", "will try this config momentarily, try running rs.conf() again in a few seconds");
                return true;
            }

            if ( !check(errmsg, result) ) {
                return false;
            }

            if( !force && !theReplSet->box.getState().primary() ) {
                errmsg = "replSetReconfig command must be sent to the current replica set primary.";
                return false;
            }

            {
                // just make sure we can get a write lock before doing anything else.  we'll reacquire one
                // later.  of course it could be stuck then, but this check lowers the risk if weird things
                // are up - we probably don't want a change to apply 30 minutes after the initial attempt.
                time_t t = time(0);
                Lock::GlobalWrite lk;
                if( time(0)-t > 20 ) {
                    errmsg = "took a long time to get write lock, so not initiating.  Initiate when server less busy?";
                    return false;
                }
            }

            try {
                scoped_ptr<ReplSetConfig> newConfig
                        (ReplSetConfig::make(configObj, force));

                log() << "replSet replSetReconfig config object parses ok, " <<
                        newConfig->members.size() << " members specified" << rsLog;

                if( !ReplSetConfig::legalChange(theReplSet->getConfig(), *newConfig, errmsg) ) {
                    return false;
                }

                checkMembersUpForConfigChange(*newConfig, result, false);

                log() << "replSet replSetReconfig [2]" << rsLog;

                theReplSet->haveNewConfig(*newConfig, true);
                ReplSet::startupStatusMsg.set("replSetReconfig'd");
            }
            catch( DBException& e ) {
                log() << "replSet replSetReconfig exception: " << e.what() << rsLog;
                throw;
            }
            catch( string& se ) {
                log() << "replSet reconfig exception: " << se << rsLog;
                errmsg = se;
                return false;
            }

            resetSlaveCache();
            return true;
        }
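One guard in the block above is easy to miss: before doing any real work the command acquires and immediately releases the global write lock, and refuses to proceed if acquisition took more than 20 seconds. A standalone sketch of that probe-the-lock-first idea, with std::mutex and std::chrono standing in for Lock::GlobalWrite (illustrative only):

    #include <chrono>
    #include <iostream>
    #include <mutex>

    std::mutex globalWriteLock;  // stand-in for Lock::GlobalWrite

    int main() {
        using clock = std::chrono::steady_clock;
        const auto start = clock::now();
        {
            // Acquire and immediately drop the lock; we only care how long it took.
            std::lock_guard<std::mutex> lk(globalWriteLock);
        }
        const auto waited =
            std::chrono::duration_cast<std::chrono::seconds>(clock::now() - start);

        if (waited.count() > 20) {
            std::cout << "took too long to get the write lock, not reconfiguring\n";
            return 1;
        }
        std::cout << "got the write lock in " << waited.count() << "s, proceeding\n";
        return 0;
    }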
Example no. 16
0
void BatchedUpdateDocument::setQuery(const BSONObj& query) {
    _query = query.getOwned();
    _isQuerySet = true;
}
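This setter, like the similar ones later in the list (setUpdateExpr, setCollation, setWriteConcern), stores an owned copy of the caller's object plus a was-it-set flag, so the batch document never references memory the caller may free. A tiny sketch of that defensive-copy setter pattern with std::string_view standing in for BSONObj (names are illustrative):

    #include <iostream>
    #include <string>
    #include <string_view>

    class BatchedDoc {
    public:
        // Stores an owned copy, mirroring _query = query.getOwned().
        void setQuery(std::string_view query) {
            _query = std::string(query);
            _isQuerySet = true;
        }
        bool isQuerySet() const { return _isQuerySet; }
        const std::string& query() const { return _query; }

    private:
        std::string _query;
        bool _isQuerySet = false;
    };

    int main() {
        std::string transient = "{\"_id\": 1}";
        BatchedDoc doc;
        doc.setQuery(transient);
        transient.clear();   // the copy inside doc is unaffected
        std::cout << doc.isQuerySet() << " " << doc.query() << "\n";
        return 0;
    }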
Example no. 17
0
    /**
     * Run a query with a cursor provided by the query optimizer, or FindingStartCursor.
     * @returns true if client cursor was saved, false if the query has completed.
     */
    bool queryWithQueryOptimizer( int queryOptions, const string& ns,
                                  const BSONObj &jsobj, CurOp& curop,
                                  const BSONObj &query, const BSONObj &order,
                                  const shared_ptr<ParsedQuery> &pq_shared,
                                  const ConfigVersion &shardingVersionAtStart,
                                  const bool getCachedExplainPlan,
                                  const bool inMultiStatementTxn,
                                  Message &result ) {

        const ParsedQuery &pq( *pq_shared );
        shared_ptr<Cursor> cursor;
        QueryPlanSummary queryPlan;

        const bool tailable = pq.hasOption( QueryOption_CursorTailable ) && pq.getNumToReturn() != 1;
        
        LOG(1) << "query beginning read-only transaction. tailable: " << tailable << endl;
        
        BSONObj oldPlan;
        if (getCachedExplainPlan) {
            scoped_ptr<MultiPlanScanner> mps( MultiPlanScanner::make( ns.c_str(), query, order ) );
            oldPlan = mps->cachedPlanExplainSummary();
        }
        
        cursor = getOptimizedCursor( ns.c_str(), query, order, QueryPlanSelectionPolicy::any(),
                                     pq_shared, false, &queryPlan );
        verify( cursor );

        // Tailable cursors must be marked as such before any use. This is so that
        // the implementation knows that uncommitted data cannot be returned.
        if ( tailable ) {
            cursor->setTailable();
        }

        scoped_ptr<QueryResponseBuilder> queryResponseBuilder
                ( QueryResponseBuilder::make( pq, cursor, queryPlan, oldPlan ) );
        bool saveClientCursor = false;
        int options = QueryOption_NoCursorTimeout;
        if (pq.hasOption( QueryOption_OplogReplay )) {
            options |= QueryOption_OplogReplay;
        }
        // create a client cursor that does not create a cursorID.
        // The cursor ID will be created if and only if we save
        // the client cursor further below
        ClientCursor::Holder ccPointer(
            new ClientCursor( options, cursor, ns, BSONObj(), false, false )
            );

        // for oplog cursors, we check if we are reading data that is too old and might
        // be stale.
        bool opChecked = false;
        bool slaveLocationUpdated = false;
        BSONObj last;
        bool lastBSONObjSet = false;
        for ( ; cursor->ok(); cursor->advance() ) {

            if ( pq.getMaxScan() && cursor->nscanned() > pq.getMaxScan() ) {
                break;
            }
            
            if ( !queryResponseBuilder->addMatch() ) {
                continue;
            }
            
            // Note slave's position in the oplog.
            if ( pq.hasOption( QueryOption_OplogReplay ) ) {
                BSONObj current = cursor->current();
                last = current.copy();
                lastBSONObjSet = true;
                
                // the first row returned is equal to the last element that
                // the slave has synced up to, so we might as well update
                // the slave location
                if (!slaveLocationUpdated) {
                    ccPointer->updateSlaveLocation(curop);
                    slaveLocationUpdated = true;
                }
                // check if data we are about to return may be too stale
                if (!opChecked) {
                    ccPointer->storeOpForSlave(current);
                    uassert(16785, "oplog cursor reading data that is too old", !ccPointer->lastOpForSlaveTooOld());
                    opChecked = true;
                }
            }
            
            if ( pq.isExplain() ) {
                if ( queryResponseBuilder->enoughTotalResults() ) {
                    break;
                }
            }
            else if ( queryResponseBuilder->enoughForFirstBatch() ) {
                // if only 1 requested, no cursor saved for efficiency...we assume it is findOne()
                if ( pq.wantMore() && pq.getNumToReturn() != 1 ) {
                    queryResponseBuilder->finishedFirstBatch();
                    if ( cursor->advance() ) {
                        saveClientCursor = true;
                    }
                }
                break;
            }
        }

        // If the tailing request succeeded
        if ( cursor->tailable() ) {
            saveClientCursor = true;
        }
        
        if ( ! shardingState.getVersion( ns ).isWriteCompatibleWith( shardingVersionAtStart ) ) {
            // if the version changed during the query
            // we might be missing some data
            // and it's safe to send this as mongos can resend
            // at this point
            throw SendStaleConfigException( ns , "version changed during initial query", shardingVersionAtStart, shardingState.getVersion( ns ) );
        }
        
        int nReturned = queryResponseBuilder->handoff( result );

        ccPointer.reset();
        long long cursorid = 0;
        if ( saveClientCursor ) {
            // Create a new ClientCursor, with a default timeout.
            ccPointer.reset( new ClientCursor( queryOptions, cursor, ns,
                                               jsobj.getOwned(), inMultiStatementTxn ) );
            cursorid = ccPointer->cursorid();
            DEV tlog(2) << "query has more, cursorid: " << cursorid << endl;
            
            if ( !ccPointer->ok() && ccPointer->c()->tailable() ) {
                DEV tlog() << "query has no more but tailable, cursorid: " << cursorid << endl;
            }
            
            if( queryOptions & QueryOption_Exhaust ) {
                curop.debug().exhaust = true;
            }
            
            // Set attributes for getMore.
            ccPointer->setChunkManager( queryResponseBuilder->chunkManager() );
            ccPointer->setPos( nReturned );
            ccPointer->pq = pq_shared;
            ccPointer->fields = pq.getFieldPtr();
            if (pq.hasOption( QueryOption_OplogReplay ) && lastBSONObjSet) {
                ccPointer->storeOpForSlave(last);
            }
            if (!inMultiStatementTxn) {
                // This cursor is not part of a multi-statement transaction, so
                // we pass off the current client's transaction stack to the
                // cursor so that it may be live as long as the cursor.
                cc().swapTransactionStack(ccPointer->transactions);
                verify(!cc().hasTxn());
            }
            ccPointer.release();
        }

        QueryResult *qr = (QueryResult *) result.header();
        qr->cursorId = cursorid;
        curop.debug().cursorid = ( cursorid == 0 ? -1 : qr->cursorId );
        qr->setResultFlagsToOk();
        // qr->len is updated automatically by appendData()
        curop.debug().responseLength = qr->len;
        qr->setOperation(opReply);
        qr->startingFrom = 0;
        qr->nReturned = nReturned;

        curop.debug().nscanned = ( cursor ? cursor->nscanned() : 0LL );
        curop.debug().ntoskip = pq.getSkip();
        curop.debug().nreturned = nReturned;

        return saveClientCursor;
    }
Example no. 18
0
void BatchedUpdateDocument::setUpdateExpr(const BSONObj& updateExpr) {
    _updateExpr = updateExpr.getOwned();
    _isUpdateExprSet = true;
}
Example no. 19
0
 /**
  * Invariant: idObj should belong to a document that is part of the active chunk being migrated
  */
 LogOpForShardingHandler(MigrationSourceManager* migrateSourceManager,
                         const BSONObj& idObj,
                         const char op)
     : _migrationSourceManager(migrateSourceManager), _idObj(idObj.getOwned()), _op(op) {}
Example no. 20
0
void BatchedUpdateDocument::setCollation(const BSONObj& collation) {
    _collation = collation.getOwned();
    _isCollationSet = true;
}
Example no. 21
0
        void operator()( DBClientCursorBatchIterator &i ) {
            Lock::GlobalWrite lk;
            context.relocked();

            bool createdCollection = false;
            Collection* collection = NULL;

            while( i.moreInCurrentBatch() ) {
                if ( numSeen % 128 == 127 /*yield some*/ ) {
                    collection = NULL;
                    time_t now = time(0);
                    if( now - lastLog >= 60 ) {
                        // report progress
                        if( lastLog )
                            log() << "clone " << to_collection << ' ' << numSeen << endl;
                        lastLog = now;
                    }
                    mayInterrupt( _mayBeInterrupted );
                    dbtempreleaseif t( _mayYield );
                }

                if ( isindex == false && collection == NULL ) {
                    collection = context.db()->getCollection( to_collection );
                    if ( !collection ) {
                        massert( 17321,
                                 str::stream()
                                 << "collection dropped during clone ["
                                 << to_collection << "]",
                                 !createdCollection );
                        createdCollection = true;
                        collection = context.db()->createCollection( to_collection );
                        verify( collection );
                    }
                }

                BSONObj tmp = i.nextSafe();

                /* assure object is valid.  note this will slow us down a little. */
                const Status status = validateBSON(tmp.objdata(), tmp.objsize());
                if (!status.isOK()) {
                    out() << "Cloner: skipping corrupt object from " << from_collection
                          << ": " << status.reason();
                    continue;
                }

                ++numSeen;

                BSONObj js = tmp;
                if ( isindex ) {
                    verify(nsToCollectionSubstring(from_collection) == "system.indexes");
                    js = fixindex(tmp);
                    indexesToBuild->push_back( js.getOwned() );
                    continue;
                }

                verify(nsToCollectionSubstring(from_collection) != "system.indexes");

                StatusWith<DiskLoc> loc = collection->insertDocument( js, true );
                if ( !loc.isOK() ) {
                    error() << "error: exception cloning object in " << from_collection
                            << ' ' << loc.toString() << " obj:" << js;
                }
                uassertStatusOK( loc.getStatus() );
                if ( logForRepl )
                    logOp("i", to_collection, js);

                getDur().commitIfNeeded();

                RARELY if ( time( 0 ) - saveLast > 60 ) {
                    log() << numSeen << " objects cloned so far from collection " << from_collection;
                    saveLast = time( 0 );
                }
            }
        }
Example no. 22
0
void appendReplicationInfo(OperationContext* txn, BSONObjBuilder& result, int level) {
    ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();
    if (replCoord->getSettings().usingReplSets()) {
        IsMasterResponse isMasterResponse;
        replCoord->fillIsMasterForReplSet(&isMasterResponse);
        result.appendElements(isMasterResponse.toBSON());
        if (level) {
            replCoord->appendSlaveInfoData(&result);
        }
        return;
    }

    // TODO(dannenberg) replAllDead is bad and should be removed when master slave is removed
    if (replAllDead) {
        result.append("ismaster", 0);
        string s = string("dead: ") + replAllDead;
        result.append("info", s);
    } else {
        result.appendBool("ismaster",
                          getGlobalReplicationCoordinator()->isMasterForReportingPurposes());
    }

    if (level) {
        BSONObjBuilder sources(result.subarrayStart("sources"));

        int n = 0;
        list<BSONObj> src;
        {
            const char* localSources = "local.sources";
            AutoGetCollectionForRead ctx(txn, localSources);
            unique_ptr<PlanExecutor> exec(InternalPlanner::collectionScan(
                txn, localSources, ctx.getCollection(), PlanExecutor::YIELD_MANUAL));
            BSONObj obj;
            PlanExecutor::ExecState state;
            while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) {
                src.push_back(obj.getOwned());
            }

            // Non-yielding collection scans from InternalPlanner will never error.
            invariant(PlanExecutor::IS_EOF == state);
        }

        for (list<BSONObj>::const_iterator i = src.begin(); i != src.end(); i++) {
            BSONObj s = *i;
            BSONObjBuilder bb;
            bb.append(s["host"]);
            string sourcename = s["source"].valuestr();
            if (sourcename != "main")
                bb.append(s["source"]);
            {
                BSONElement e = s["syncedTo"];
                BSONObjBuilder t(bb.subobjStart("syncedTo"));
                t.appendDate("time", e.timestampTime());
                t.append("inc", e.timestampInc());
                t.done();
            }

            if (level > 1) {
                wassert(!txn->lockState()->isLocked());
                // note: there is no so-style timeout on this connection; perhaps we should have
                // one.
                ScopedDbConnection conn(s["host"].valuestr());

                DBClientConnection* cliConn = dynamic_cast<DBClientConnection*>(&conn.conn());
                if (cliConn && replAuthenticate(cliConn)) {
                    BSONObj first = conn->findOne((string) "local.oplog.$" + sourcename,
                                                  Query().sort(BSON("$natural" << 1)));
                    BSONObj last = conn->findOne((string) "local.oplog.$" + sourcename,
                                                 Query().sort(BSON("$natural" << -1)));
                    bb.appendDate("masterFirst", first["ts"].timestampTime());
                    bb.appendDate("masterLast", last["ts"].timestampTime());
                    const auto lag = (last["ts"].timestampTime() - s["syncedTo"].timestampTime());
                    bb.append("lagSeconds", durationCount<Milliseconds>(lag) / 1000.0);
                }
                conn.done();
            }

            sources.append(BSONObjBuilder::numStr(n++), bb.obj());
        }

        sources.done();

        replCoord->appendSlaveInfoData(&result);
    }
}
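Note how the local.sources scan copies each document with getOwned() into a list while the AutoGetCollectionForRead scope is held, and only afterwards, with the lock released, contacts each source host. A hedged standalone sketch of that gather-under-lock, process-outside-lock shape, with a mutex and strings standing in for the collection lock and BSON documents:

    #include <iostream>
    #include <list>
    #include <mutex>
    #include <string>
    #include <vector>

    std::mutex collectionLock;
    std::vector<std::string> storage = {"host:a", "host:b"};

    std::string slowRemoteCall(const std::string& doc) { return "checked " + doc; }

    int main() {
        std::list<std::string> copies;
        {
            // Gather owned copies while holding the lock...
            std::lock_guard<std::mutex> lk(collectionLock);
            for (const auto& doc : storage)
                copies.push_back(doc);   // analogous to src.push_back(obj.getOwned())
        }
        // ...then do the slow, potentially blocking work without it.
        for (const auto& doc : copies)
            std::cout << slowRemoteCall(doc) << "\n";
        return 0;
    }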
Example no. 23
0
    bool ClientInfo::getLastError( const string& dbName,
                                   const BSONObj& options,
                                   BSONObjBuilder& result,
                                   string& errmsg,
                                   bool fromWriteBackListener)
    {

        scoped_ptr<TimerHolder> gleTimerHolder;
        if ( ! fromWriteBackListener ) {
            bool doTiming = false;
            const BSONElement& e = options["w"];
            if ( e.isNumber() ) {
                doTiming = e.numberInt() > 1;
            }
            else if ( e.type() == String ) {
                doTiming = true;
            }
            if ( doTiming ) {
                gleTimerHolder.reset( new TimerHolder( &gleWtimeStats ) );
            }
        }


        set<string> * shards = getPrev();

        if ( shards->size() == 0 ) {
            result.appendNull( "err" );
            return true;
        }

        vector<WBInfo> writebacks;

        //
        // TODO: These branches should be collapsed into a single codepath
        //

        // handle single server
        if ( shards->size() == 1 ) {
            string theShard = *(shards->begin() );

            BSONObj res;
            bool ok = false;
            {
                LOG(5) << "gathering response for gle from: " << theShard << endl;

                ShardConnection conn( theShard , "" );
                try {
                    ok = conn->runCommand( dbName , options , res );
                }
                catch( std::exception &e ) {

                    string message =
                            str::stream() << "could not get last error from shard " << theShard
                                          << causedBy( e );

                    warning() << message << endl;
                    errmsg = message;

                    // Catch everything that happens here, since we need to ensure we return our connection when we're
                    // finished.
                    conn.done();

                    return false;
                }


                res = res.getOwned();
                conn.done();
            }

            _addWriteBack( writebacks, res, true );

            LOG(4) << "gathering writebacks from " << sinceLastGetError().size() << " hosts for"
                   << " gle (" << theShard << ")" << endl;

            // hit other machines just to block
            for ( set<string>::const_iterator i=sinceLastGetError().begin(); i!=sinceLastGetError().end(); ++i ) {
                string temp = *i;
                if ( temp == theShard )
                    continue;

                LOG(5) << "gathering writebacks for single-shard gle from: " << temp << endl;

                try {
                    ShardConnection conn( temp , "" );
                    ON_BLOCK_EXIT_OBJ( conn, &ShardConnection::done );
                    _addWriteBack( writebacks, conn->getLastErrorDetailed(), false );

                }
                catch( std::exception &e ){
                    warning() << "could not clear last error from shard " << temp << causedBy( e ) << endl;
                }

            }
            clearSinceLastGetError();

            LOG(4) << "checking " << writebacks.size() << " writebacks for"
                   << " gle (" << theShard << ")" << endl;

            if ( writebacks.size() ){
                vector<BSONObj> v = _handleWriteBacks( writebacks , fromWriteBackListener );
                if ( v.size() == 0 && fromWriteBackListener ) {
                    // ok
                }
                else {
                    // this will usually be 1
                    // it can be greater than 1 if a write to a different shard
                    // than the last write op had a writeback
                    // all we're going to report is the first
                    // since that's the current write
                    // but we block for all
                    verify( v.size() >= 1 );

                    if ( res["writebackSince"].numberInt() > 0 ) {
                        // got writeback from older op
                        // ignore the result from it, just needed to wait
                        result.appendElements( res );
                    }
                    else if ( writebacks[0].fromLastOperation ) {
                        result.appendElements( v[0] );
                        result.appendElementsUnique( res );
                        result.append( "writebackGLE" , v[0] );
                        result.append( "initialGLEHost" , theShard );
                        result.append( "initialGLE", res );
                    }
                    else {
                        // there was a writeback
                        // but it's from an old operation
                        // so all that's important is that we block, not that we return stats
                        result.appendElements( res );
                    }
                }
            }
            else {
                result.append( "singleShard" , theShard );
                result.appendElements( res );
            }

            return ok;
        }

        BSONArrayBuilder bbb( result.subarrayStart( "shards" ) );
        BSONObjBuilder shardRawGLE;

        long long n = 0;

        int updatedExistingStat = 0; // 0 is none, -1 has but false, 1 has true

        // hit each shard
        vector<string> errors;
        vector<BSONObj> errorObjects;
        for ( set<string>::iterator i = shards->begin(); i != shards->end(); i++ ) {
            string theShard = *i;
            bbb.append( theShard );

            LOG(5) << "gathering a response for gle from: " << theShard << endl;

            boost::scoped_ptr<ShardConnection> conn;
            BSONObj res;
            bool ok = false;
            try {
                conn.reset( new ShardConnection( theShard , "" ) ); // constructor can throw if shard is down
                ok = (*conn)->runCommand( dbName , options , res );
                shardRawGLE.append( theShard , res );
            }
            catch( std::exception &e ){

                // Safe to return here, since we haven't started any extra processing yet, just collecting
                // responses.

                string message =
                        str::stream() << "could not get last error from a shard " << theShard
                                      << causedBy( e );

                warning() << message << endl;
                errmsg = message;

                if (conn)
                    conn->done();

                return false;
            }

            _addWriteBack( writebacks, res, true );

            string temp = DBClientWithCommands::getLastErrorString( res );
            if ( (*conn)->type() != ConnectionString::SYNC && ( ok == false || temp.size() ) ) {
                errors.push_back( temp );
                errorObjects.push_back( res );
            }

            n += res["n"].numberLong();
            if ( res["updatedExisting"].type() ) {
                if ( res["updatedExisting"].trueValue() )
                    updatedExistingStat = 1;
                else if ( updatedExistingStat == 0 )
                    updatedExistingStat = -1;
            }

            conn->done();
        }

        bbb.done();
        result.append( "shardRawGLE" , shardRawGLE.obj() );

        result.appendNumber( "n" , n );
        if ( updatedExistingStat )
            result.appendBool( "updatedExisting" , updatedExistingStat > 0 );

        LOG(4) << "gathering writebacks from " << sinceLastGetError().size() << " hosts for"
               << " gle (" << shards->size() << " shards)" << endl;

        // hit other machines just to block
        for ( set<string>::const_iterator i=sinceLastGetError().begin(); i!=sinceLastGetError().end(); ++i ) {
            string temp = *i;
            if ( shards->count( temp ) )
                continue;

            LOG(5) << "gathering writebacks for multi-shard gle from: " << temp << endl;

            ShardConnection conn( temp , "" );
            try {
                _addWriteBack( writebacks, conn->getLastErrorDetailed(), false );
            }
            catch( std::exception &e ){
                warning() << "could not clear last error from a shard " << temp << causedBy( e ) << endl;
            }
            conn.done();
        }
        clearSinceLastGetError();

        LOG(4) << "checking " << writebacks.size() << " writebacks for"
                << " gle (" << shards->size() << " shards)" << endl;

        if ( errors.size() == 0 ) {
            result.appendNull( "err" );
            _handleWriteBacks( writebacks , fromWriteBackListener );
            return true;
        }

        result.append( "err" , errors[0].c_str() );

        {
            // errs
            BSONArrayBuilder all( result.subarrayStart( "errs" ) );
            for ( unsigned i=0; i<errors.size(); i++ ) {
                all.append( errors[i].c_str() );
            }
            all.done();
        }

        {
            // errObjects
            BSONArrayBuilder all( result.subarrayStart( "errObjects" ) );
            for ( unsigned i=0; i<errorObjects.size(); i++ ) {
                all.append( errorObjects[i] );
            }
            all.done();
        }

        _handleWriteBacks( writebacks , fromWriteBackListener );
        return true;
    }
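For the multi-shard branch the command fans getLastError out and then folds the per-shard replies into one answer: n is summed, updatedExisting is tracked as a tri-state, and the first error string wins while all errors are kept in a side array. A small sketch of that fold over plain structs, assuming each shard has already returned its fields (types here are illustrative):

    #include <iostream>
    #include <optional>
    #include <string>
    #include <vector>

    struct ShardGLE {
        std::string shard;
        long long n = 0;
        std::optional<bool> updatedExisting;  // absent if the shard didn't report it
        std::string err;                      // empty means no error
    };

    int main() {
        std::vector<ShardGLE> responses = {
            {"shard0000", 2, true, ""},
            {"shard0001", 0, std::nullopt, "duplicate key"},
        };

        long long totalN = 0;
        int updatedExistingStat = 0;          // 0 none, -1 reported false, 1 reported true
        std::vector<std::string> errors;

        for (const auto& r : responses) {
            totalN += r.n;
            if (r.updatedExisting) {
                if (*r.updatedExisting) updatedExistingStat = 1;
                else if (updatedExistingStat == 0) updatedExistingStat = -1;
            }
            if (!r.err.empty()) errors.push_back(r.err);
        }

        std::cout << "n=" << totalN;
        if (updatedExistingStat) std::cout << " updatedExisting=" << (updatedExistingStat > 0);
        std::cout << " err=" << (errors.empty() ? "null" : errors.front()) << "\n";
        return 0;
    }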
Example no. 24
0
    bool run(OperationContext* txn,
             const string& dbname,
             BSONObj& cmdObj,
             int,
             string& errmsg,
             BSONObjBuilder& result) {
        BSONElement first = cmdObj.firstElement();
        uassert(28528,
                str::stream() << "Argument to listIndexes must be of type String, not "
                              << typeName(first.type()),
                first.type() == String);
        StringData collectionName = first.valueStringData();
        uassert(28529,
                str::stream() << "Argument to listIndexes must be a collection name, "
                              << "not the empty string",
                !collectionName.empty());
        const NamespaceString ns(dbname, collectionName);

        const long long defaultBatchSize = std::numeric_limits<long long>::max();
        long long batchSize;
        Status parseCursorStatus = parseCommandCursorOptions(cmdObj, defaultBatchSize, &batchSize);
        if (!parseCursorStatus.isOK()) {
            return appendCommandStatus(result, parseCursorStatus);
        }

        AutoGetCollectionForRead autoColl(txn, ns);
        if (!autoColl.getDb()) {
            return appendCommandStatus(result,
                                       Status(ErrorCodes::NamespaceNotFound, "no database"));
        }

        const Collection* collection = autoColl.getCollection();
        if (!collection) {
            return appendCommandStatus(result,
                                       Status(ErrorCodes::NamespaceNotFound, "no collection"));
        }

        const CollectionCatalogEntry* cce = collection->getCatalogEntry();
        invariant(cce);

        vector<string> indexNames;
        MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
            indexNames.clear();
            cce->getAllIndexes(txn, &indexNames);
        }
        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "listIndexes", ns.ns());

        auto ws = make_unique<WorkingSet>();
        auto root = make_unique<QueuedDataStage>(txn, ws.get());

        for (size_t i = 0; i < indexNames.size(); i++) {
            BSONObj indexSpec;
            MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
                indexSpec = cce->getIndexSpec(txn, indexNames[i]);
            }
            MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "listIndexes", ns.ns());

            WorkingSetID id = ws->allocate();
            WorkingSetMember* member = ws->get(id);
            member->keyData.clear();
            member->loc = RecordId();
            member->obj = Snapshotted<BSONObj>(SnapshotId(), indexSpec.getOwned());
            member->transitionToOwnedObj();
            root->pushBack(id);
        }

        std::string cursorNamespace = str::stream() << dbname << ".$cmd." << name << "."
                                                    << ns.coll();
        dassert(NamespaceString(cursorNamespace).isValid());
        dassert(NamespaceString(cursorNamespace).isListIndexesCursorNS());
        dassert(ns == NamespaceString(cursorNamespace).getTargetNSForListIndexes());

        auto statusWithPlanExecutor = PlanExecutor::make(
            txn, std::move(ws), std::move(root), cursorNamespace, PlanExecutor::YIELD_MANUAL);
        if (!statusWithPlanExecutor.isOK()) {
            return appendCommandStatus(result, statusWithPlanExecutor.getStatus());
        }
        unique_ptr<PlanExecutor> exec = std::move(statusWithPlanExecutor.getValue());

        BSONArrayBuilder firstBatch;

        const int byteLimit = FindCommon::kMaxBytesToReturnToClientAtOnce;
        for (long long objCount = 0; objCount < batchSize && firstBatch.len() < byteLimit;
             objCount++) {
            BSONObj next;
            PlanExecutor::ExecState state = exec->getNext(&next, NULL);
            if (state == PlanExecutor::IS_EOF) {
                break;
            }
            invariant(state == PlanExecutor::ADVANCED);
            firstBatch.append(next);
        }

        CursorId cursorId = 0LL;
        if (!exec->isEOF()) {
            exec->saveState();
            exec->detachFromOperationContext();
            ClientCursor* cursor =
                new ClientCursor(CursorManager::getGlobalCursorManager(),
                                 exec.release(),
                                 cursorNamespace,
                                 txn->recoveryUnit()->isReadingFromMajorityCommittedSnapshot());
            cursorId = cursor->cursorid();
        }

        appendCursorResponseObject(cursorId, cursorNamespace, firstBatch.arr(), &result);

        return true;
    }
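The first batch above is bounded twice, by batchSize documents and by a byte limit on the reply, and a server-side cursor is created only if the executor still has results left. A sketch of that double-bounded batching over an ordinary container, with made-up limits and a placeholder cursor id (nothing here is the MongoDB API):

    #include <iostream>
    #include <string>
    #include <vector>

    int main() {
        std::vector<std::string> results = {"idx_a", "idx_b", "idx_c", "idx_d"};
        const long long batchSize = 3;   // count limit (like the cursor option)
        const std::size_t byteLimit = 16;     // byte limit on the first reply

        std::vector<std::string> firstBatch;
        std::size_t bytes = 0;
        std::size_t pos = 0;
        for (long long objCount = 0;
             objCount < batchSize && bytes < byteLimit && pos < results.size();
             ++objCount, ++pos) {
            bytes += results[pos].size();
            firstBatch.push_back(results[pos]);
        }

        // Only hand back a cursor id if there is anything left to stream.
        const bool needCursor = pos < results.size();
        const long long cursorId = needCursor ? 12345 : 0;   // 12345 is a placeholder id

        std::cout << "firstBatch=" << firstBatch.size()
                  << " cursorId=" << cursorId << "\n";
        return 0;
    }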
Example no. 25
0
    void WriteBackListener::run() {

        int secsToSleep = 0;
        scoped_ptr<ShardChunkVersion> lastNeededVersion;
        int lastNeededCount = 0;

        while ( ! inShutdown() ) {

            if ( ! Shard::isAShardNode( _addr ) ) {
                LOG(1) << _addr << " is not a shard node" << endl;
                sleepsecs( 60 );
                continue;
            }

            try {
                scoped_ptr<ScopedDbConnection> conn(
                        ScopedDbConnection::getInternalScopedDbConnection( _addr ) );

                BSONObj result;

                {
                    BSONObjBuilder cmd;
                    cmd.appendOID( "writebacklisten" , &serverID ); // Command will block for data
                    if ( ! conn->get()->runCommand( "admin" , cmd.obj() , result ) ) {
                        result = result.getOwned();
                        log() <<  "writebacklisten command failed!  "  << result << endl;
                        conn->done();
                        continue;
                    }

                }
                conn->done();

                LOG(1) << "writebacklisten result: " << result << endl;

                BSONObj data = result.getObjectField( "data" );
                if ( data.getBoolField( "writeBack" ) ) {
                    string ns = data["ns"].valuestrsafe();

                    ConnectionIdent cid( "" , 0 );
                    OID wid;
                    if ( data["connectionId"].isNumber() && data["id"].type() == jstOID ) {
                        string s = "";
                        if ( data["instanceIdent"].type() == String )
                            s = data["instanceIdent"].String();
                        cid = ConnectionIdent( s , data["connectionId"].numberLong() );
                        wid = data["id"].OID();
                    }
                    else {
                        warning() << "mongos/mongod version mismatch (1.7.5 is the split)" << endl;
                    }

                    int len; // not used, but needed for next call
                    Message msg( (void*)data["msg"].binData( len ) , false );
                    massert( 10427 ,  "invalid writeback message" , msg.header()->valid() );

                    DBConfigPtr db = grid.getDBConfig( ns );
                    ShardChunkVersion needVersion = ShardChunkVersion::fromBSON( data, "version" );

                    //
                    // TODO: Refactor the sharded strategy to correctly handle all sharding state changes itself,
                    // we can't rely on WBL to do this for us b/c anything could reset our state in-between.
                    // We should always reload here for efficiency when possible, but staleness is also caught in the
                    // loop below.
                    //

                    ChunkManagerPtr manager;
                    ShardPtr primary;
                    db->getChunkManagerOrPrimary( ns, manager, primary );

                    ShardChunkVersion currVersion;
                    if( manager ) currVersion = manager->getVersion();

                    LOG(1) << "connectionId: " << cid << " writebackId: " << wid << " needVersion : " << needVersion.toString()
                           << " mine : " << currVersion.toString() << endl;

                    LOG(1) << msg.toString() << endl;

                    //
                    // We should reload only if we need to update our version to be compatible *and* we
                    // haven't already done so.  This avoids lots of reloading when we remove/add a sharded collection
                    //

                    bool alreadyReloaded = lastNeededVersion &&
                                           lastNeededVersion->isEquivalentTo( needVersion );

                    if( alreadyReloaded ){

                        LOG(1) << "wbl already reloaded config information for version "
                               << needVersion << ", at version " << currVersion << endl;
                    }
                    else if( lastNeededVersion ) {

                        log() << "new version change detected to " << needVersion.toString()
                              << ", " << lastNeededCount << " writebacks processed at "
                              << lastNeededVersion->toString() << endl;

                        lastNeededCount = 0;
                    }

                    //
                    // Set our lastNeededVersion for next time
                    //

                    lastNeededVersion.reset( new ShardChunkVersion( needVersion ) );
                    lastNeededCount++;

                    //
                    // Determine if we should reload, if so, reload
                    //

                    bool shouldReload = ! needVersion.isWriteCompatibleWith( currVersion ) &&
                                        ! alreadyReloaded;

                    if( shouldReload && currVersion.isSet()
                                     && needVersion.isSet()
                                     && currVersion.hasCompatibleEpoch( needVersion ) )
                    {

                        //
                        // If we disagree about versions only, reload the chunk manager
                        //

                        db->getChunkManagerIfExists( ns, true );
                    }
                    else if( shouldReload ){

                        //
                        // If we disagree about anything else, reload the full db
                        //

                        warning() << "reloading config data for " << db->getName() << ", "
                                  << "wanted version " << needVersion.toString()
                                  << " but currently have version " << currVersion.toString() << endl;

                        db->reload();
                    }

                    // do request and then call getLastError
                    // we have to call getLastError so we can return the right fields to the user if they decide to call getLastError

                    BSONObj gle;
                    int attempts = 0;
                    while ( true ) {
                        attempts++;

                        try {

                            Request r( msg , 0 );
                            r.init();

                            r.d().reservedField() |= Reserved_FromWriteback;

                            ClientInfo * ci = r.getClientInfo();
                            if (!noauth) {
                                // TODO: Figure out why this is 'admin' instead of 'local'.
                                ci->getAuthenticationInfo()->authorize("admin", internalSecurity.user);
                            }
                            ci->noAutoSplit();

                            r.process( attempts );

                            ci->newRequest(); // this so we flip prev and cur shards

                            BSONObjBuilder b;
                            string errmsg;
                            if ( ! ci->getLastError( "admin",
                                                     BSON( "getLastError" << 1 ),
                                                     b,
                                                     errmsg,
                                                     true ) )
                            {
                                b.appendBool( "commandFailed" , true );
                                if( ! b.hasField( "errmsg" ) ){

                                    b.append( "errmsg", errmsg );
                                    gle = b.obj();
                                }
                                else if( errmsg.size() > 0 ){

                                    // Rebuild GLE object with errmsg
                                    // TODO: Make this less clumsy by improving GLE interface
                                    gle = b.obj();

                                    if( gle["errmsg"].type() == String ){

                                        BSONObj gleNoErrmsg =
                                                gle.filterFieldsUndotted( BSON( "errmsg" << 1 ),
                                                                          false );
                                        BSONObjBuilder bb;
                                        bb.appendElements( gleNoErrmsg );
                                        bb.append( "errmsg", gle["errmsg"].String() +
                                                             " ::and:: " +
                                                             errmsg );
                                        gle = bb.obj().getOwned();
                                    }
                                }
                            }
                            else{
                                gle = b.obj();
                            }

                            log() << "GLE is " << gle << endl;

                            if ( gle["code"].numberInt() == 9517 ) {

                                log() << "new version change detected, "
                                      << lastNeededCount << " writebacks processed previously" << endl;

                                lastNeededVersion.reset();
                                lastNeededCount = 1;

                                log() << "writeback failed because of stale config, retrying attempts: " << attempts << endl;
                                LOG(1) << "writeback error : " << gle << endl;

                                //
                                // Bringing this in line with the similar retry logic elsewhere
                                //
                                // TODO: Reloading the chunk manager may not help if we dropped a
                                // collection, but we don't actually have that info in the writeback
                                // error
                                //

                                if( attempts <= 2 ){
                                    db->getChunkManagerIfExists( ns, true );
                                }
                                else{
                                    versionManager.forceRemoteCheckShardVersionCB( ns );
                                    sleepsecs( attempts - 1 );
                                }

                                uassert( 15884, str::stream()
                                         << "Could not reload chunk manager after "
                                         << attempts << " attempts.", attempts <= 4 );

                                continue;
                            }

                            ci->clearSinceLastGetError();
                        }
                        catch ( DBException& e ) {
                            error() << "error processing writeback: " << e << endl;
                            BSONObjBuilder b;
                            e.getInfo().append( b, "err", "code" );
                            gle = b.obj();
                        }

                        break;
                    }

                    {
                        scoped_lock lk( _seenWritebacksLock );
                        WBStatus& s = _seenWritebacks[cid];
                        s.id = wid;
                        s.gle = gle;
                    }
                }
                else if ( result["noop"].trueValue() ) {
                    // no-op
                }
                else {
                    log() << "unknown writeBack result: " << result << endl;
                }

                secsToSleep = 0;
                continue;
            }
            catch ( std::exception& e ) {

                if ( inShutdown() ) {
                    // we're shutting down, so just clean up
                    return;
                }

                log() << "WriteBackListener exception : " << e.what() << endl;

                // It's possible this shard was removed
                Shard::reloadShardInfo();
            }
            catch ( ... ) {
                log() << "WriteBackListener uncaught exception!" << endl;
            }
            secsToSleep++;
            sleepsecs(secsToSleep);
            if ( secsToSleep > 10 )
                secsToSleep = 0;
        }

        log() << "WriteBackListener exiting : address no longer in cluster " << _addr;

    }
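The listener's outer loop backs off by sleeping one more second after each consecutive failure, wrapping the delay back to zero once it passes 10 seconds, while any successful iteration resets it immediately. A compact sketch of that capped incremental backoff, with simulated failures and the sleep scaled down to milliseconds so it finishes quickly:

    #include <chrono>
    #include <iostream>
    #include <thread>

    // Pretend work that fails a few times before succeeding.
    bool doWork(int attempt) { return attempt >= 4; }

    int main() {
        int secsToSleep = 0;
        for (int attempt = 1; attempt <= 6; ++attempt) {
            if (doWork(attempt)) {
                std::cout << "attempt " << attempt << ": ok\n";
                secsToSleep = 0;   // success resets the backoff
                continue;
            }
            ++secsToSleep;
            std::cout << "attempt " << attempt << ": failed, backing off "
                      << secsToSleep << "s\n";
            // Scaled down to milliseconds so the sketch finishes instantly.
            std::this_thread::sleep_for(std::chrono::milliseconds(secsToSleep));
            if (secsToSleep > 10)
                secsToSleep = 0;   // wrap instead of growing without bound
        }
        return 0;
    }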
Example no. 26
0
        void operator()( DBClientCursorBatchIterator &i ) {
            Lock::GlobalWrite lk;
            if ( context ) {
                context->relocked();
            }

            while( i.moreInCurrentBatch() ) {
                if ( n % 128 == 127 /*yield some*/ ) {
                    time_t now = time(0);
                    if( now - lastLog >= 60 ) { 
                        // report progress
                        if( lastLog )
                            log() << "clone " << to_collection << ' ' << n << endl;
                        lastLog = now;
                    }
                    mayInterrupt( _mayBeInterrupted );
                    dbtempreleaseif t( _mayYield );
                }

                BSONObj tmp = i.nextSafe();

                /* assure object is valid.  note this will slow us down a little. */
                if ( !tmp.valid() ) {
                    stringstream ss;
                    ss << "Cloner: skipping corrupt object from " << from_collection;
                    BSONElement e = tmp.firstElement();
                    try {
                        e.validate();
                        ss << " firstElement: " << e;
                    }
                    catch( ... ) {
                        ss << " firstElement corrupt";
                    }
                    out() << ss.str() << endl;
                    continue;
                }

                ++n;

                BSONObj js = tmp;
                if ( isindex ) {
                    verify( strstr(from_collection, "system.indexes") );
                    js = fixindex(tmp);
                    storedForLater->push_back( js.getOwned() );
                    continue;
                }

                try {
                    theDataFileMgr.insertWithObjMod(to_collection, js);
                    if ( logForRepl )
                        logOp("i", to_collection, js);

                    getDur().commitIfNeeded();
                }
                catch( UserException& e ) {
                    log() << "warning: exception cloning object in " << from_collection << ' ' << e.what() << " obj:" << js.toString() << '\n';
                }

                RARELY if ( time( 0 ) - saveLast > 60 ) {
                    log() << n << " objects cloned so far from collection " << from_collection << endl;
                    saveLast = time( 0 );
                }
            }
        }
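Both cloner loops throttle their progress output: a line is printed at most once per 60 seconds, keyed off a lastLog timestamp that only advances when the window has elapsed, and the very first tick is skipped. A minimal sketch of that throttled progress logging, shrunk to a one-second window with fake per-item work so it visibly prints when run:

    #include <chrono>
    #include <ctime>
    #include <iostream>
    #include <thread>

    int main() {
        std::time_t lastLog = 0;   // 0 means "nothing logged yet"
        int cloned = 0;

        for (int i = 0; i < 300; ++i) {
            std::this_thread::sleep_for(std::chrono::milliseconds(10));  // fake work
            ++cloned;

            std::time_t now = std::time(nullptr);
            if (now - lastLog >= 1) {   // the real loops use a 60 second window
                if (lastLog)            // skip the very first tick, like the original
                    std::cout << "clone progress: " << cloned << " objects\n";
                lastLog = now;
            }
        }
        std::cout << "done: " << cloned << " objects\n";
        return 0;
    }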
Example no. 27
0
void BatchedInsertRequest::setWriteConcern(const BSONObj& writeConcern) {
    _writeConcern = writeConcern.getOwned();
    _isWriteConcernSet = true;
}
Example no. 28
0
    MigrateInfo* BalancerPolicy::balance( const string& ns,
                                          const DistributionStatus& distribution,
                                          int balancedLastTime ) {


        // 1) check for shards that the policy requires us to move off of:
        //    draining only
        // 2) check tag policy violations
        // 3) then we make sure chunks are balanced for each tag

        // ----

        // 1) check things we have to move
        {
            const set<string>& shards = distribution.shards();
            for ( set<string>::const_iterator z = shards.begin(); z != shards.end(); ++z ) {
                string shard = *z;
                const ShardInfo& info = distribution.shardInfo( shard );

                if ( ! info.isDraining() )
                    continue;

                if ( distribution.numberOfChunksInShard( shard ) == 0 )
                    continue;

                // now we know we need to move chunks off this shard
                // we will if we are allowed

                if ( info.hasOpsQueued() ) {
                    warning() << "want to shed load from " << shard << " but can't because it has ops queued" << endl;
                    continue;
                }

                const vector<BSONObj>& chunks = distribution.getChunks( shard );
                unsigned numJumboChunks = 0;

                // since we have to move all chunks, let's just do it in order
                for ( unsigned i=0; i<chunks.size(); i++ ) {
                    BSONObj chunkToMove = chunks[i];
                    if ( _isJumbo( chunkToMove ) ) {
                        numJumboChunks++;
                        continue;
                    }

                    string tag = distribution.getTagForChunk( chunkToMove );
                    string to = distribution.getBestReceieverShard( tag );

                    if ( to.size() == 0 ) {
                        warning() << "want to move chunk: " << chunkToMove << "(" << tag << ") "
                                  << "from " << shard << " but can't find anywhere to put it" << endl;
                        continue;
                    }

                    log() << "going to move " << chunkToMove << " from " << shard << "(" << tag << ")" << " to " << to << endl;

                    return new MigrateInfo( ns, to, shard, chunkToMove.getOwned() );
                }

                warning() << "can't find any chunk to move from: " << shard
                          << " but we want to. "
                          << " numJumboChunks: " << numJumboChunks
                          << endl;
            }
        }

        // 2) tag violations
        if ( distribution.tags().size() > 0 ) {
            const set<string>& shards = distribution.shards();

            for ( set<string>::const_iterator i = shards.begin(); i != shards.end(); ++i ) {
                string shard = *i;
                const ShardInfo& info = distribution.shardInfo( shard );

                const vector<BSONObj>& chunks = distribution.getChunks( shard );
                for ( unsigned j = 0; j < chunks.size(); j++ ) {
                    string tag = distribution.getTagForChunk( chunks[j] );

                    if ( info.hasTag( tag ) )
                        continue;

                    // uh oh, this chunk is in the wrong place
                    log() << "chunk " << chunks[j]
                          << " is not on a shard with the right tag: "
                          << tag << endl;

                    if ( _isJumbo( chunks[j] ) ) {
                        warning() << "chunk " << chunks[j] << " is jumbo, so cannot be moved" << endl;
                        continue;
                    }

                    string to = distribution.getBestReceieverShard( tag );
                    if ( to.size() == 0 ) {
                        log() << "no where to put it :(" << endl;
                        continue;
                    }
                    verify( to != shard );
                    log() << " going to move to: " << to << endl;
                    return new MigrateInfo( ns, to, shard, chunks[j].getOwned() );
                }
            }
        }

        // 3) for each tag balance

        int threshold = 8;
        if ( balancedLastTime || distribution.totalChunks() < 20 )
            threshold = 2;
        else if ( distribution.totalChunks() < 80 )
            threshold = 4;

        // randomize the order in which we balance the tags
        // this is so that one bad tag doesn't prevent others from getting balanced
        vector<string> tags;
        {
            set<string> t = distribution.tags();
            for ( set<string>::const_iterator i = t.begin(); i != t.end(); ++i )
                tags.push_back( *i );
            tags.push_back( "" );

            std::random_shuffle( tags.begin(), tags.end() );
        }

        for ( unsigned i=0; i<tags.size(); i++ ) {
            string tag = tags[i];

            string from = distribution.getMostOverloadedShard( tag );
            if ( from.size() == 0 )
                continue;

            unsigned max = distribution.numberOfChunksInShardWithTag( from, tag );
            if ( max == 0 )
                continue;

            string to = distribution.getBestReceieverShard( tag );
            if ( to.size() == 0 ) {
                log() << "no available shards to take chunks for tag [" << tag << "]" << endl;
                return NULL;
            }

            unsigned min = distribution.numberOfChunksInShardWithTag( to, tag );

            const int imbalance = max - min;

            LOG(1) << "collection : " << ns << endl;
            LOG(1) << "donor      : " << from << " chunks on " << max << endl;
            LOG(1) << "receiver   : " << to << " chunks on " << min << endl;
            LOG(1) << "threshold  : " << threshold << endl;

            if ( imbalance < threshold )
                continue;

            const vector<BSONObj>& chunks = distribution.getChunks( from );
            unsigned numJumboChunks = 0;
            for ( unsigned j = 0; j < chunks.size(); j++ ) {
                if ( distribution.getTagForChunk( chunks[j] ) != tag )
                    continue;

                if ( _isJumbo( chunks[j] ) ) {
                    numJumboChunks++;
                    continue;
                }

                log() << " ns: " << ns << " going to move " << chunks[j]
                      << " from: " << from << " to: " << to << " tag [" << tag << "]"
                      << endl;
                return new MigrateInfo( ns, to, from, chunks[j] );
            }

            if ( numJumboChunks ) {
                error() << "shard: " << from << "ns: " << ns
                        << "has too many chunks, but they are all jumbo "
                        << " numJumboChunks: " << numJumboChunks
                        << endl;
                continue;
            }

            verify( false ); // should be impossible
        }

        // Everything is balanced here!
        return NULL;
    }
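
The step-3 threshold can be restated on its own: a migration is only proposed when the chunk-count gap between the most overloaded shard and the best receiver reaches a threshold that shrinks for small collections or right after a successful migration round. A minimal restatement using the same constants as the code above:

// Minimum (max - min) chunk imbalance that triggers a migration,
// mirroring the constants in BalancerPolicy::balance above.
static int migrationThreshold(int totalChunks, bool balancedLastTime) {
    if (balancedLastTime || totalChunks < 20)
        return 2;   // small or recently-rebalanced collections: act quickly
    if (totalChunks < 80)
        return 4;
    return 8;       // large collections tolerate more skew before moving chunks
}
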
Example no. 29
 IndexBuilder::IndexBuilder(const BSONObj& index) :
     BackgroundJob(true /* self-delete */), _index(index.getOwned()),
     _name(str::stream() << "repl index builder " << _indexBuildCount.addAndFetch(1)) {
 }
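
Because the constructor passes true for selfDelete to BackgroundJob, a caller would typically heap-allocate the builder and let the job free itself. A hedged usage sketch (the go() entry point and the index spec fields are assumptions about the surrounding API, not repository code):

 // Hypothetical caller, assuming the BackgroundJob::go() entry point.
 BSONObj spec = BSON("ns" << "test.people"
                          << "key" << BSON("name" << 1)
                          << "name" << "name_1");
 IndexBuilder* builder = new IndexBuilder(spec);
 builder->go();   // runs in the background; the job deletes itself on completion
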
Example no. 30
void StorageEngineMetadata::setStorageEngineOptions(const BSONObj& storageEngineOptions) {
    _storageEngineOptions = storageEngineOptions.getOwned();
}