Example No. 1
    const ConfigVersion ShardingState::getVersion( const string& ns ) const {
        scoped_lock lk(_mutex);

        ChunkManagersMap::const_iterator it = _chunks.find( ns );
        if ( it != _chunks.end() ) {
            ShardChunkManagerPtr p = it->second;
            return p->getVersion();
        } else {
            return 0;
        }
    }
Example No. 2
    bool ShardingState::hasVersion( const string& ns , ConfigVersion& version ) {
        scoped_lock lk(_mutex);

        ChunkManagersMap::const_iterator it = _chunks.find(ns);
        if ( it == _chunks.end() )
            return false;

        ShardChunkManagerPtr p = it->second;
        version = p->getVersion();
        return true;
    }
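
Examples 1 and 2 are two spellings of the same lookup: take the state mutex, find the namespace in the chunk-manager map, and fall back to a zero/false default when no entry exists. Below is a minimal, self-contained sketch of that pattern with std::mutex and std::map standing in for the MongoDB-internal types; VersionCache and its members are hypothetical names, not the real API.

    #include <map>
    #include <mutex>
    #include <string>

    typedef unsigned long long ConfigVersion; // stand-in for the real type

    class VersionCache {
    public:
        // Mirrors getVersion(): the stored version, or 0 when the
        // namespace has no entry (the "unsharded" default).
        ConfigVersion getVersion(const std::string& ns) const {
            std::lock_guard<std::mutex> lk(_mutex);
            std::map<std::string, ConfigVersion>::const_iterator it = _versions.find(ns);
            return it != _versions.end() ? it->second : 0;
        }

        // Mirrors hasVersion(): the out-parameter form distinguishes
        // "no entry" from "entry whose version happens to be 0".
        bool hasVersion(const std::string& ns, ConfigVersion& version) const {
            std::lock_guard<std::mutex> lk(_mutex);
            std::map<std::string, ConfigVersion>::const_iterator it = _versions.find(ns);
            if (it == _versions.end())
                return false;
            version = it->second;
            return true;
        }

    private:
        mutable std::mutex _mutex;
        std::map<std::string, ConfigVersion> _versions;
    };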
Example No. 3
    void ShardingState::donateChunk( const string& ns , const BSONObj& min , const BSONObj& max , ShardChunkVersion version ) {
        scoped_lock lk( _mutex );

        ChunkManagersMap::const_iterator it = _chunks.find( ns );
        assert( it != _chunks.end() );
        ShardChunkManagerPtr p = it->second;

        // empty shards should have version 0
        version = ( p->getNumChunks() > 1 ) ? version : ShardChunkVersion( 0 , 0 );

        ShardChunkManagerPtr cloned( p->cloneMinus( min , max , version ) );
        _chunks[ns] = cloned;
    }
Example No. 4
    void ShardingState::donateChunk( const string& ns , const BSONObj& min , const BSONObj& max , ChunkVersion version ) {
        scoped_lock lk( _mutex );

        ChunkManagersMap::const_iterator it = _chunks.find( ns );
        verify( it != _chunks.end() );
        ShardChunkManagerPtr p = it->second;

        // empty shards should have version 0
        version = ( p->getNumChunks() > 1 ) ? version : ChunkVersion( 0 , OID() );

        ShardChunkManagerPtr cloned( p->cloneMinus( min , max , version ) );
        // TODO: a bit dangerous to have two different zero-version states - no-manager and
        // no-version
        _chunks[ns] = cloned;
    }
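
Both donateChunk() variants update the map copy-on-write: the stored manager is never mutated in place; cloneMinus() builds a modified copy and the shared pointer in the map is swapped under the lock, so readers holding the old pointer keep a consistent snapshot. A rough standard-library sketch of that idiom follows; ChunkSet and ChunkRegistry are illustrative stand-ins, not the real types.

    #include <map>
    #include <memory>
    #include <mutex>
    #include <set>
    #include <string>

    struct ChunkSet {
        std::set<std::string> chunks; // stand-in for the real chunk ranges

        // Analogue of cloneMinus(): a new immutable snapshot without one chunk.
        std::shared_ptr<ChunkSet> cloneMinus(const std::string& chunk) const {
            std::shared_ptr<ChunkSet> copy = std::make_shared<ChunkSet>(*this);
            copy->chunks.erase(chunk);
            return copy;
        }
    };

    class ChunkRegistry {
    public:
        void donateChunk(const std::string& ns, const std::string& chunk) {
            std::lock_guard<std::mutex> lk(_mutex);
            std::map<std::string, std::shared_ptr<ChunkSet> >::iterator it = _chunks.find(ns);
            if (it == _chunks.end())
                return; // the real code asserts instead
            // Publish the clone; in-flight readers keep their old snapshot.
            _chunks[ns] = it->second->cloneMinus(chunk);
        }

    private:
        std::mutex _mutex;
        std::map<std::string, std::shared_ptr<ChunkSet> > _chunks;
    };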
Example No. 5
    bool _tryQueryByPKHack(const char *ns, const BSONObj &query,
                           const ParsedQuery &pq, CurOp &curop, Message &result) {
        BSONObj resObject;

        bool found = false;
        Collection *cl = getCollection(ns);
        if (cl == NULL) {
            return false; // ns doesn't exist, fall through to optimizer for legacy reasons
        }
        const BSONObj &pk = cl->getSimplePKFromQuery(query);
        if (pk.isEmpty()) {
            return false; // unable to query by PK - resort to using the optimizer
        }
        found = queryByPKHack(cl, pk, query, resObject);

        if ( shardingState.needShardChunkManager( ns ) ) {
            ShardChunkManagerPtr m = shardingState.getShardChunkManager( ns );
            if ( m && ! m->belongsToMe( resObject ) ) {
                // I have something for this _id
                // but it doesn't belong to me
                // so return nothing
                resObject = BSONObj();
                found = false;
            }
        }

        BufBuilder bb(sizeof(QueryResult)+resObject.objsize()+32);
        bb.skip(sizeof(QueryResult));

        if ( found ) {
            fillQueryResultFromObj( bb , pq.getFields() , resObject );
        }

        auto_ptr< QueryResult > qr( (QueryResult *) bb.buf() );
        bb.decouple();
        qr->setResultFlagsToOk();
        qr->len = bb.len();

        curop.debug().responseLength = bb.len();
        qr->setOperation(opReply);
        qr->cursorId = 0;
        qr->startingFrom = 0;
        qr->nReturned = found ? 1 : 0;

        result.setData( qr.release(), true );
        return true;
    }
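
The reply construction at the end of _tryQueryByPKHack() uses a common buffer idiom: skip space for the fixed-size header, append the payload, then fill the header in once the final length is known. Here is that dance in isolation, self-contained; ReplyHeader is a made-up miniature of QueryResult.

    #include <cstring>
    #include <string>
    #include <vector>

    struct ReplyHeader {
        int len;       // total message length, header included
        int nReturned; // number of documents in the payload
    };

    std::vector<char> buildReply(const std::string& payload) {
        std::vector<char> buf(sizeof(ReplyHeader)); // bb.skip(sizeof(QueryResult))
        buf.insert(buf.end(), payload.begin(), payload.end());

        ReplyHeader h;
        h.len = static_cast<int>(buf.size());
        h.nReturned = payload.empty() ? 0 : 1;
        std::memcpy(&buf[0], &h, sizeof(h)); // write the header last
        return buf;
    }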
Example No. 6
    void ShardingState::appendInfo( BSONObjBuilder& b ) {
        b.appendBool( "enabled" , _enabled );
        if ( ! _enabled )
            return;

        b.append( "configServer" , _configServer );
        b.append( "shardName" , _shardName );
        b.append( "shardHost" , _shardHost );

        {
            BSONObjBuilder bb( b.subobjStart( "versions" ) );

            scoped_lock lk(_mutex);

            for ( ChunkManagersMap::iterator it = _chunks.begin(); it != _chunks.end(); ++it ) {
                ShardChunkManagerPtr p = it->second;
                bb.appendTimestamp( it->first , p->getVersion() );
            }
            bb.done();
        }

    }
Example No. 7
    ShardChunkManagerPtr ShardingState::getShardChunkManager( const string& ns ){
        ConfigVersion version;
        { 
            // check cache
            scoped_lock lk( _mutex );

            NSVersionMap::const_iterator it = _versions.find( ns );
            if ( it == _versions.end() ) {
                return ShardChunkManagerPtr();
            }

            version = it->second;

            // TODO SERVER-1849 pending drop work
            // the manager should use the cached version only if the versions match exactly
            ShardChunkManagerPtr p = _chunks[ns];
            if ( p && p->getVersion() >= version ){
                // our cached version is good, so just return
                return p;                
            }
        }

        // load the chunk information for this shard from the config database
        // a reminder: ShardChunkManager may throw on construction
        const string c = (_configServer == _shardHost) ? "" /* local */ : _configServer;
        ShardChunkManagerPtr p( new ShardChunkManager( c , ns , _shardName ) );

        // TODO SERVER-1849 verify that the manager's version is exactly the one requested
        // If not, do update _chunks, but fail the request.
        { 
            scoped_lock lk( _mutex );
            _chunks[ns] = p;
        }

        return p;
    }
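
getShardChunkManager() is a two-phase cache load: probe the cache under the mutex, drop the mutex for the slow load (the config-server query can block or throw), then re-take it to publish the result. Reduced to standard-library parts it looks like the sketch below; Manager and ManagerCache are hypothetical stand-ins, and the last-write-wins store is a simplification of the version check that Example 8 performs.

    #include <map>
    #include <memory>
    #include <mutex>
    #include <string>

    struct Manager { /* expensive-to-build state */ };

    class ManagerCache {
    public:
        std::shared_ptr<Manager> get(const std::string& ns) {
            {
                std::lock_guard<std::mutex> lk(_mutex);
                std::map<std::string, std::shared_ptr<Manager> >::const_iterator it = _cache.find(ns);
                if (it != _cache.end())
                    return it->second; // cache hit, nothing to load
            }
            // Slow load without the lock held; may take a long time and may throw.
            std::shared_ptr<Manager> p = std::make_shared<Manager>();
            {
                std::lock_guard<std::mutex> lk(_mutex);
                _cache[ns] = p; // another thread may have raced us here
            }
            return p;
        }

    private:
        std::mutex _mutex;
        std::map<std::string, std::shared_ptr<Manager> > _cache;
    };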
Example No. 8
    bool ShardingState::trySetVersion( const string& ns , ConfigVersion& version /* IN-OUT */ ) {

        // Currently this function is called after a getVersion(), which is the first "check", and the assumption here
        // is that we don't do anything nearly as long as a remote query in a thread between then and now.
        // Otherwise it may be worth adding an additional check without the _configServerMutex below, since then it
        // would be likely that the version may have changed in the meantime without waiting for or fetching config results.

        // TODO:  Mutex-per-namespace?
        
        LOG( 2 ) << "trying to set shard version of " << version.toString() << " for '" << ns << "'" << endl;
        
        _configServerTickets.waitForTicket();
        TicketHolderReleaser needTicketFrom( &_configServerTickets );

        // fast path - double-check if requested version is at the same version as this chunk manager before verifying
        // against config server
        //
        // This path will short-circuit the version set if another thread already managed to update the version in the
        // meantime.  First check is from getVersion().
        //
        // cases:
        //   + this shard updated the version for a migrate's commit (FROM side)
        //     a client reloaded chunk state from config and picked the newest version
        //   + two clients reloaded
        //     one triggered the 'slow path' (below)
        //     when the second's request gets here, the version is already current
        ConfigVersion storedVersion;
        ShardChunkManagerPtr currManager;
        {
            scoped_lock lk( _mutex );
            ChunkManagersMap::const_iterator it = _chunks.find( ns );
            if( it == _chunks.end() ){

                // TODO: We need better semantic distinction between *no manager found* and
                // *manager of version zero found*
                log() << "no current chunk manager found for this shard, will initialize" << endl;
            }
            else{
                currManager = it->second;
                if( ( storedVersion = it->second->getVersion() ).isEquivalentTo( version ) )
                    return true;
            }
        }
        
        LOG( 2 ) << "verifying cached version " << storedVersion.toString() << " and new version " << version.toString() << " for '" << ns << "'" << endl;

        // slow path - requested version is different than the current chunk manager's, if one exists, so must check for
        // newest version in the config server
        //
        // cases:
        //   + a chunk moved TO here
        //     (we don't bump up the version on the TO side but the commit to config does use higher version)
        //     a client reloaded from config and issued the request
        //   + there was a takeover from a secondary
        //     the secondary had no state (managers) at all, so every client request will fall here
        //   + a stale client requested a version that's not current anymore

        // Can't lock default mutex while creating ShardChunkManager, b/c may have to create a new connection to myself
        const string c = (_configServer == _shardHost) ? "" /* local */ : _configServer;

        // If our epochs aren't compatible, it's not useful to use the old manager for chunk diffs
        if( currManager && ! currManager->getCollVersion().hasCompatibleEpoch( version ) ){

            warning() << "detected incompatible version epoch in new version " << version
                      << ", old version was " << currManager->getCollVersion() << endl;

            currManager.reset();
        }

        ShardChunkManagerPtr p( ShardChunkManager::make( c , ns , _shardName, currManager ) );

        // Handle the case where the collection isn't sharded more gracefully
        if( p->getKey().isEmpty() ){
            version = ConfigVersion( 0, OID() );
            // There was an error getting any data for this collection, return false
            return false;
        }

        {
            // NOTE: This lock prevents the ns version from changing while a write operation occurs.
            Lock::DBRead readLk(ns);
            
            // This lock prevents simultaneous metadata changes using the same map
            scoped_lock lk( _mutex );

            // since we loaded the chunk manager unlocked, other thread may have done the same
            // make sure we keep the freshest config info only
            ChunkManagersMap::const_iterator it = _chunks.find( ns );
            if ( it == _chunks.end() || p->getVersion() >= it->second->getVersion() ) {
                _chunks[ns] = p;
            }

            ChunkVersion oldVersion = version;
            version = p->getVersion();
            return oldVersion.isEquivalentTo( version );
        }
    }
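
trySetVersion() also throttles its slow path with _configServerTickets, so only a bounded number of threads hit the config server at once, and the ticket is returned via RAII even if the reload throws. A C++20 counting-semaphore analogue of that throttle, purely as a sketch (the limit of 3 is arbitrary):

    #include <semaphore>

    // At most 3 concurrent config reloads, in the spirit of _configServerTickets.
    std::counting_semaphore<3> configTickets(3);

    void reloadFromConfig() {
        configTickets.acquire();          // waitForTicket()
        struct Releaser {                 // TicketHolderReleaser analogue
            ~Releaser() { configTickets.release(); }
        } releaser;

        // ... fetch and install the new chunk manager, as above ...
    }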
Example No. 9
    QueryResult* processGetMore(const char* ns,
                                int ntoreturn,
                                long long cursorid,
                                CurOp& curop,
                                int pass,
                                bool& exhaust,
                                bool* isCursorAuthorized ) {
        exhaust = false;
        ClientCursor::Pin p(cursorid);
        ClientCursor *client_cursor = p.c();

        int bufSize = 512 + sizeof( QueryResult ) + MaxBytesToReturnToClientAtOnce;

        BufBuilder b( bufSize );
        b.skip(sizeof(QueryResult));
        int resultFlags = ResultFlag_AwaitCapable;
        int start = 0;
        int n = 0;

        if ( unlikely(!client_cursor) ) {
            LOGSOME << "getMore: cursorid not found " << ns << " " << cursorid << endl;
            cursorid = 0;
            resultFlags = ResultFlag_CursorNotFound;
        }
        else {
            // check for spoofing of the ns such that it does not match the one originally there for the cursor
            uassert(14833, "auth error", str::equals(ns, client_cursor->ns().c_str()));
            uassert(16784, "oplog cursor reading data that is too old", !client_cursor->lastOpForSlaveTooOld());

            int queryOptions = client_cursor->queryOptions();
            OpSettings settings;
            settings.setBulkFetch(true);
            settings.setQueryCursorMode(DEFAULT_LOCK_CURSOR);
            settings.setCappedAppendPK(queryOptions & QueryOption_AddHiddenPK);
            cc().setOpSettings(settings);

            // Check if the cursor is part of a multi-statement transaction. If it is
            // and this is not the right client (meaning the current transaction stack
            // does not match that in the cursor), it will uassert. If the cursor is
            // not part of a multi-statement transaction, then we need to use the stack
            // in the cursor for this scope.
            const bool cursorPartOfMultiStatementTxn = client_cursor->checkMultiStatementTxn();
            scoped_ptr<Client::WithTxnStack> wts;
            if (!cursorPartOfMultiStatementTxn) {
                // For simplicity, prevent multi-statement transactions from
                // reading cursors it didn't create.
                uassert(16813, "Cannot getMore() on a cursor not created by this multi-statement transaction",
                           !cc().hasTxn());
                wts.reset(new Client::WithTxnStack(client_cursor->transactions)); 
            }

            *isCursorAuthorized = true;

            if (pass == 0) {
                client_cursor->updateSlaveLocation( curop );
            }
            
            curop.debug().query = client_cursor->query();

            start = client_cursor->pos();
            Cursor *c = client_cursor->c();

            // This manager may be stale, but it's the state of chunking when the cursor was created.
            ShardChunkManagerPtr manager = client_cursor->getChunkManager();

            while ( 1 ) {
                if ( !c->ok() ) {
                    if ( c->tailable() ) {
                        /* when a tailable cursor hits "EOF", ok() goes false, and current() is null.  however
                           advance() can still be retried as a reactivation attempt.  when there is new data, it will
                           return true.  that's what we are doing here.
                           */
                        if ( c->advance() )
                            continue;

                        if( n == 0 && (queryOptions & QueryOption_AwaitData) && pass < 1000 ) {
                            return 0;
                        }

                        break;
                    }
                    p.release();

                    // Done with this cursor, steal transaction stack back to commit or abort it here.
                    bool ok = ClientCursor::erase(cursorid);
                    verify(ok);
                    cursorid = 0;
                    client_cursor = 0;
                    break;
                }

                MatchDetails details;
                if ( client_cursor->fields && client_cursor->fields->getArrayOpType() == Projection::ARRAY_OP_POSITIONAL ) {
                    // field projection specified, and contains an array operator
                    details.requestElemMatchKey();
                }

                // in some cases (clone collection) there won't be a matcher
                if ( !c->currentMatches( &details ) ) {
                }
                else if ( manager && ! manager->belongsToMe( client_cursor ) ){
                    LOG(2) << "cursor skipping document in un-owned chunk: " << c->current() << endl;
                }
                else {
                    if( c->getsetdup(c->currPK()) ) {
                        //out() << "  but it's a dup \n";
                    }
                    else {
                        // save this so that at the end of the loop,
                        // we can update the location for write concern
                        // in replication. Note that if this cursor is not
                        // doing replication, this is pointless
                        if ( client_cursor->queryOptions() & QueryOption_OplogReplay ) {
                            client_cursor->storeOpForSlave( c->current() );
                        }
                        n++;

                        client_cursor->fillQueryResultFromObj( b, &details );

                        if ( ( ntoreturn && n >= ntoreturn ) || b.len() > MaxBytesToReturnToClientAtOnce ) {
                            c->advance();
                            client_cursor->incPos( n );
                            break;
                        }
                    }
                }
                c->advance();
            }
            
            if ( client_cursor ) {
                client_cursor->resetIdleAge();
                exhaust = client_cursor->queryOptions() & QueryOption_Exhaust;
            } else if (!cursorPartOfMultiStatementTxn) {
                // This cursor is done and it wasn't part of a multi-statement
                // transaction. We can commit the transaction now.
                cc().commitTopTxn();
                wts->release();
            }
        }

        QueryResult *qr = (QueryResult *) b.buf();
        qr->len = b.len();
        qr->setOperation(opReply);
        qr->_resultFlags() = resultFlags;
        qr->cursorId = cursorid;
        qr->startingFrom = start;
        qr->nReturned = n;
        b.decouple();

        return qr;
    }
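
Both getMore loops call Cursor::getsetdup() to suppress documents already returned by another branch of the plan: it records the current key and reports whether it had been seen before. The essence of that helper as a stand-alone sketch; DupTracker is a hypothetical name, and a string stands in for the real PK/DiskLoc key.

    #include <set>
    #include <string>

    class DupTracker {
    public:
        // Returns true if pk was already seen; records it either way.
        bool getsetdup(const std::string& pk) {
            return !_seen.insert(pk).second;
        }

    private:
        std::set<std::string> _seen;
    };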
Example No. 10
    QueryResult* processGetMore(const char *ns, int ntoreturn, long long cursorid , CurOp& curop, int pass, bool& exhaust ) {
        exhaust = false;
        ClientCursor::Pointer p(cursorid);
        ClientCursor *cc = p.c();

        int bufSize = 512 + sizeof( QueryResult ) + MaxBytesToReturnToClientAtOnce;

        BufBuilder b( bufSize );
        b.skip(sizeof(QueryResult));
        int resultFlags = ResultFlag_AwaitCapable;
        int start = 0;
        int n = 0;

        if ( unlikely(!cc) ) {
            LOGSOME << "getMore: cursorid not found " << ns << " " << cursorid << endl;
            cursorid = 0;
            resultFlags = ResultFlag_CursorNotFound;
        }
        else {
            // check for spoofing of the ns such that it does not match the one originally there for the cursor
            uassert(14833, "auth error", str::equals(ns, cc->ns().c_str()));

            if ( pass == 0 )
                cc->updateSlaveLocation( curop );

            int queryOptions = cc->queryOptions();
            
            curop.debug().query = cc->query();

            start = cc->pos();
            Cursor *c = cc->c();
            c->recoverFromYield();
            DiskLoc last;

            scoped_ptr<Projection::KeyOnly> keyFieldsOnly;
            if ( cc->modifiedKeys() == false && cc->isMultiKey() == false && cc->fields )
                keyFieldsOnly.reset( cc->fields->checkKey( cc->indexKeyPattern() ) );

            // This manager may be stale, but it's the state of chunking when the cursor was created.
            ShardChunkManagerPtr manager = cc->getChunkManager();

            while ( 1 ) {
                if ( !c->ok() ) {
                    if ( c->tailable() ) {
                        /* when a tailable cursor hits "EOF", ok() goes false, and current() is null.  however
                           advance() can still be retried as a reactivation attempt.  when there is new data, it will
                           return true.  that's what we are doing here.
                           */
                        if ( c->advance() )
                            continue;

                        if( n == 0 && (queryOptions & QueryOption_AwaitData) && pass < 1000 ) {
                            return 0;
                        }

                        break;
                    }
                    p.release();
                    bool ok = ClientCursor::erase(cursorid);
                    verify(ok);
                    cursorid = 0;
                    cc = 0;
                    break;
                }

                // in some cases (clone collection) there won't be a matcher
                if ( !c->currentMatches() ) {
                }
                else if ( manager && ! manager->belongsToMe( cc ) ){
                    LOG(2) << "cursor skipping document in un-owned chunk: " << c->current() << endl;
                }
                else {
                    if( c->getsetdup(c->currLoc()) ) {
                        //out() << "  but it's a dup \n";
                    }
                    else {
                        last = c->currLoc();
                        n++;

                        if ( keyFieldsOnly ) {
                            fillQueryResultFromObj(b, 0, keyFieldsOnly->hydrate( c->currKey() ) );
                        }
                        else {
                            BSONObj js = c->current();
                            // show disk loc should be part of the main query, not in an $or clause, so this should be ok
                            fillQueryResultFromObj(b, cc->fields.get(), js, ( cc->pq.get() && cc->pq->showDiskLoc() ? &last : 0));
                        }

                        if ( ( ntoreturn && n >= ntoreturn ) || b.len() > MaxBytesToReturnToClientAtOnce ) {
                            c->advance();
                            cc->incPos( n );
                            break;
                        }
                    }
                }
                c->advance();

                if ( ! cc->yieldSometimes( ClientCursor::MaybeCovered ) ) {
                    ClientCursor::erase(cursorid);
                    cursorid = 0;
                    cc = 0;
                    p.deleted();
                    break;
                }
            }
            
            if ( cc ) {
                if ( c->supportYields() ) {
                    ClientCursor::YieldData data;
                    verify( cc->prepareToYield( data ) );
                }
                else {
                    cc->c()->noteLocation();
                }
                cc->mayUpgradeStorage();
                cc->storeOpForSlave( last );
                exhaust = cc->queryOptions() & QueryOption_Exhaust;
            }
        }

        QueryResult *qr = (QueryResult *) b.buf();
        qr->len = b.len();
        qr->setOperation(opReply);
        qr->_resultFlags() = resultFlags;
        qr->cursorId = cursorid;
        qr->startingFrom = start;
        qr->nReturned = n;
        b.decouple();

        return qr;
    }
Example No. 11
    /**
     * Run a query -- includes checking for and running a Command.
     * @return points to ns if exhaust mode. 0=normal mode
     * @locks the db mutex for reading (and potentially for writing temporarily to create a new db).
     * @yields the db mutex periodically after acquiring it.
     * @asserts on scan and order memory exhaustion and other cases.
     */
    const char *runQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result) {
        shared_ptr<ParsedQuery> pq_shared( new ParsedQuery(q) );
        ParsedQuery& pq( *pq_shared );
        BSONObj jsobj = q.query;
        int queryOptions = q.queryOptions;
        const char *ns = q.ns;

        if( logLevel >= 2 )
            log() << "runQuery called " << ns << " " << jsobj << endl;

        curop.debug().ns = ns;
        curop.debug().ntoreturn = pq.getNumToReturn();
        curop.debug().query = jsobj;
        curop.setQuery(jsobj);

        // Run a command.
        
        if ( pq.couldBeCommand() ) {
            BufBuilder bb;
            bb.skip(sizeof(QueryResult));
            BSONObjBuilder cmdResBuf;
            if ( runCommands(ns, jsobj, curop, bb, cmdResBuf, false, queryOptions) ) {
                curop.debug().iscommand = true;
                curop.debug().query = jsobj;
                curop.markCommand();

                auto_ptr< QueryResult > qr;
                qr.reset( (QueryResult *) bb.buf() );
                bb.decouple();
                qr->setResultFlagsToOk();
                qr->len = bb.len();
                curop.debug().responseLength = bb.len();
                qr->setOperation(opReply);
                qr->cursorId = 0;
                qr->startingFrom = 0;
                qr->nReturned = 1;
                result.setData( qr.release(), true );
            }
            else {
                uasserted(13530, "bad or malformed command request?");
            }
            return 0;
        }

        bool explain = pq.isExplain();
        BSONObj order = pq.getOrder();
        BSONObj query = pq.getFilter();

        /* The ElemIter will not be happy if this isn't really an object. So throw an exception
           here when that is the case.
           (Which may indicate bad data from client.)
        */
        if ( query.objsize() == 0 ) {
            out() << "Bad query object?\n  jsobj:";
            out() << jsobj.toString() << "\n  query:";
            out() << query.toString() << endl;
            uassert( 10110 , "bad query object", false);
        }

        Client::ReadContext ctx( ns , dbpath ); // read locks
        const ConfigVersion shardingVersionAtStart = shardingState.getVersion( ns );

        replVerifyReadsOk(&pq);

        if ( pq.hasOption( QueryOption_CursorTailable ) ) {
            NamespaceDetails *d = nsdetails( ns );
            uassert( 13051, "tailable cursor requested on non capped collection", d && d->isCapped() );
            const BSONObj nat1 = BSON( "$natural" << 1 );
            if ( order.isEmpty() ) {
                order = nat1;
            }
            else {
                uassert( 13052, "only {$natural:1} order allowed for tailable cursor", order == nat1 );
            }
        }

        // Run a simple id query.
        
        if ( ! (explain || pq.showDiskLoc()) && isSimpleIdQuery( query ) && !pq.hasOption( QueryOption_CursorTailable ) ) {

            int n = 0;
            bool nsFound = false;
            bool indexFound = false;

            BSONObj resObject;
            Client& c = cc();
            bool found = Helpers::findById( c, ns , query , resObject , &nsFound , &indexFound );
            if ( nsFound == false || indexFound == true ) {
                
                if ( shardingState.needShardChunkManager( ns ) ) {
                    ShardChunkManagerPtr m = shardingState.getShardChunkManager( ns );
                    if ( m && ! m->belongsToMe( resObject ) ) {
                        // I have something for this _id
                        // but it doesn't belong to me
                        // so return nothing
                        resObject = BSONObj();
                        found = false;
                    }
                }

                BufBuilder bb(sizeof(QueryResult)+resObject.objsize()+32);
                bb.skip(sizeof(QueryResult));
                
                curop.debug().idhack = true;
                if ( found ) {
                    n = 1;
                    fillQueryResultFromObj( bb , pq.getFields() , resObject );
                }
                auto_ptr< QueryResult > qr;
                qr.reset( (QueryResult *) bb.buf() );
                bb.decouple();
                qr->setResultFlagsToOk();
                qr->len = bb.len();
                
                curop.debug().responseLength = bb.len();
                qr->setOperation(opReply);
                qr->cursorId = 0;
                qr->startingFrom = 0;
                qr->nReturned = n;
                result.setData( qr.release(), true );
                return NULL;
            }
        }
        
        // Run a regular query.
        
        BSONObj oldPlan;
        if ( explain && ! pq.hasIndexSpecifier() ) {
            MultiPlanScanner mps( ns, query, order );
            if ( mps.usingCachedPlan() ) {
                oldPlan =
                mps.oldExplain().firstElement().embeddedObject()
                .firstElement().embeddedObject().getOwned();
            }
        }

        // In some cases the query may be retried if there is an in memory sort size assertion.
        for( int retry = 0; retry < 2; ++retry ) {
            try {
                return queryWithQueryOptimizer( m, queryOptions, ns, jsobj, curop, query, order,
                                               pq_shared, oldPlan, shardingVersionAtStart, result );
            } catch ( const QueryRetryException & ) {
                verify( retry == 0 );
            }
        }
        verify( false );
        return 0;
    }