Example #1
int BandwidthController::enableBandwidthControl(void) {
    int res;

    /* Let's pretend we started from scratch ... */
    sharedQuotaIfaces.clear();
    quotaIfaces.clear();
    naughtyAppUids.clear();
    globalAlertBytes = 0;
    globalAlertTetherCount = 0;
    sharedQuotaBytes = sharedAlertBytes = 0;


    /* Some of the initialCommands are allowed to fail */
    runCommands(sizeof(IPT_CLEANUP_COMMANDS) / sizeof(char*),
            IPT_CLEANUP_COMMANDS, RunCmdFailureOk);
    runCommands(sizeof(IPT_SETUP_COMMANDS) / sizeof(char*),
            IPT_SETUP_COMMANDS, RunCmdFailureOk);
    res = runCommands(sizeof(IPT_BASIC_ACCOUNTING_COMMANDS) / sizeof(char*),
            IPT_BASIC_ACCOUNTING_COMMANDS, RunCmdFailureBad);

    setupOemIptablesHook();

    return res;

}
Example #2
int BandwidthController::enableBandwidthControl(bool force) {
    int res;
    char value[PROPERTY_VALUE_MAX];

    if (!force) {
        property_get("persist.bandwidth.enable", value, "1");
        if (!strcmp(value, "0"))
            return 0;
    }

    /* Let's pretend we started from scratch ... */
    sharedQuotaIfaces.clear();
    quotaIfaces.clear();
    naughtyAppUids.clear();
    globalAlertBytes = 0;
    globalAlertTetherCount = 0;
    sharedQuotaBytes = sharedAlertBytes = 0;

    res = runCommands(sizeof(IPT_FLUSH_COMMANDS) / sizeof(char*),
            IPT_FLUSH_COMMANDS, RunCmdFailureOk);

    res |= runCommands(sizeof(IPT_BASIC_ACCOUNTING_COMMANDS) / sizeof(char*),
            IPT_BASIC_ACCOUNTING_COMMANDS, RunCmdFailureBad);

    return res;

}
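Both variants of enableBandwidthControl() size their command arrays with sizeof(cmds) / sizeof(char*) and hand them to a runCommands(numCommands, commands, failurePolicy) helper that is not reproduced on this page. Below is a minimal standalone sketch of that pattern; runCmd(), the enum values, and the placeholder iptables arguments are hypothetical stand-ins rather than the actual Android implementation.

#include <cstdio>

/* Hypothetical failure policies mirroring the RunCmdFailureOk / RunCmdFailureBad
 * arguments used in the examples above. */
enum RunCmdErrHandling { RunCmdFailureOk, RunCmdFailureBad };

/* Stand-in for the real iptables invocation; here it only echoes the rule. */
static int runCmd(const char *cmd) {
    return std::printf("iptables %s\n", cmd) < 0 ? -1 : 0;
}

/* Sketch: run every command, accumulate failures, and only report them when
 * the caller asked for RunCmdFailureBad. */
static int runCommands(int numCommands, const char *commands[],
                       RunCmdErrHandling cmdErrHandling) {
    int res = 0;
    for (int i = 0; i < numCommands; i++)
        res |= runCmd(commands[i]);
    return (cmdErrHandling == RunCmdFailureOk) ? 0 : (res ? -1 : 0);
}

int main() {
    static const char *DEMO_COMMANDS[] = { "-F bw_INPUT", "-F bw_OUTPUT" };
    return runCommands(sizeof(DEMO_COMMANDS) / sizeof(char *),
                       DEMO_COMMANDS, RunCmdFailureBad);
}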
Example #3
File: main.c Project: mfdeakin/heap
int main(int argc, char **argv)
{
	heap *hp = hpCreate((int (*)(const void *, const void *))strcmp);
	if (isatty(fileno(stdin))) {
		return runCommands(hp, true);
	}
	else {
		return runCommands(hp, false);
	}
}
Example #4
static bool migrateDatabase(SQLiteDatabase& sqliteDatabase)
{
    if (!sqliteDatabase.tableExists("MetaData")) {
        if (!createMetaDataTable(sqliteDatabase))
            return false;
    }

    int databaseVersion;
    if (!getDatabaseVersion(sqliteDatabase, &databaseVersion))
        return false;

    if (databaseVersion == 1) {
        static const char* commands[] = {
            "DROP TABLE IF EXISTS ObjectStoreData2",
            "CREATE TABLE ObjectStoreData2 (id INTEGER PRIMARY KEY, objectStoreId INTEGER NOT NULL REFERENCES ObjectStore(id), keyString TEXT, keyDate REAL, keyNumber REAL, value TEXT NOT NULL)",
            "INSERT INTO ObjectStoreData2 SELECT * FROM ObjectStoreData",
            "DROP TABLE ObjectStoreData", // This depends on SQLite not enforcing referential consistency.
            "ALTER TABLE ObjectStoreData2 RENAME TO ObjectStoreData",
            "CREATE UNIQUE INDEX ObjectStoreData_composit ON ObjectStoreData(keyString, keyDate, keyNumber, objectStoreId)",
            "DROP TABLE IF EXISTS IndexData2", // This depends on SQLite not enforcing referential consistency.
            "CREATE TABLE IndexData2 (id INTEGER PRIMARY KEY, indexId INTEGER NOT NULL REFERENCES Indexes(id), keyString TEXT, keyDate REAL, keyNumber REAL, objectStoreDataId INTEGER NOT NULL REFERENCES ObjectStoreData(id))",
            "INSERT INTO IndexData2 SELECT * FROM IndexData",
            "DROP TABLE IndexData",
            "ALTER TABLE IndexData2 RENAME TO IndexData",
            "CREATE INDEX IndexData_composit ON IndexData(keyString, keyDate, keyNumber, indexId)",
            "CREATE INDEX IndexData_objectStoreDataId ON IndexData(objectStoreDataId)",
            "CREATE INDEX IndexData_indexId ON IndexData(indexId)",
            "UPDATE MetaData SET value = 2 WHERE name = 'version'",
        };

        if (!runCommands(sqliteDatabase, commands, sizeof(commands) / sizeof(commands[0])))
            return false;

        databaseVersion = 2;
    }

    if (databaseVersion == 2) {
        // We need to make the ObjectStoreData.value be a BLOB instead of TEXT.
        static const char* commands[] = {
            "DROP TABLE IF EXISTS ObjectStoreData", // This drops associated indices.
            "CREATE TABLE ObjectStoreData (id INTEGER PRIMARY KEY, objectStoreId INTEGER NOT NULL REFERENCES ObjectStore(id), keyString TEXT, keyDate REAL, keyNumber REAL, value BLOB NOT NULL)",
            "CREATE UNIQUE INDEX ObjectStoreData_composit ON ObjectStoreData(keyString, keyDate, keyNumber, objectStoreId)",
            "UPDATE MetaData SET value = 3 WHERE name = 'version'",
        };

        if (!runCommands(sqliteDatabase, commands, sizeof(commands) / sizeof(commands[0])))
            return false;

        databaseVersion = 3;
    }

    return true;
}
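Judging from its call sites, the runCommands(sqliteDatabase, commands, count) helper used here (and in Examples #8 and #10 below) executes each SQL statement in order and reports the first failure; the helper itself is not reproduced on this page. The following standalone sketch shows the same idea against the raw sqlite3 C API rather than WebKit's SQLiteDatabase wrapper, additionally wrapping the statements in a single transaction.

#include <sqlite3.h>
#include <cstddef>

/* Standalone sketch (not the WebKit helper): run each statement in order inside
 * one transaction and roll back on the first failure. */
static bool runCommands(sqlite3* db, const char** commands, size_t numberOfCommands)
{
    if (sqlite3_exec(db, "BEGIN", nullptr, nullptr, nullptr) != SQLITE_OK)
        return false;
    for (size_t i = 0; i < numberOfCommands; ++i) {
        if (sqlite3_exec(db, commands[i], nullptr, nullptr, nullptr) != SQLITE_OK) {
            sqlite3_exec(db, "ROLLBACK", nullptr, nullptr, nullptr);
            return false;
        }
    }
    return sqlite3_exec(db, "COMMIT", nullptr, nullptr, nullptr) == SQLITE_OK;
}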
Example #5
void BandwidthController::flushCleanTables(bool doClean) {
    /* Flush and remove the bw_costly_<iface> tables */
    flushExistingCostlyTables(doClean);

    /* Some of the initialCommands are allowed to fail */
    runCommands(sizeof(IPT_FLUSH_COMMANDS) / sizeof(char*),
            IPT_FLUSH_COMMANDS, RunCmdFailureOk);

    if (doClean) {
        runCommands(sizeof(IPT_CLEANUP_COMMANDS) / sizeof(char*),
                IPT_CLEANUP_COMMANDS, RunCmdFailureOk);
    }
}
Example #6
int BandwidthController::setupIptablesHooks(void) {

    /* Some of the initialCommands are allowed to fail */
    runCommands(sizeof(IPT_FLUSH_COMMANDS) / sizeof(char*),
            IPT_FLUSH_COMMANDS, RunCmdFailureOk);

    runCommands(sizeof(IPT_CLEANUP_COMMANDS) / sizeof(char*),
            IPT_CLEANUP_COMMANDS, RunCmdFailureOk);

    runCommands(sizeof(IPT_SETUP_COMMANDS) / sizeof(char*),
            IPT_SETUP_COMMANDS, RunCmdFailureBad);

    return 0;
}
Example #7
int BandwidthController::disableBandwidthControl(void) {
    /* The IPT_CLEANUP_COMMANDS are allowed to fail. */
    runCommands(sizeof(IPT_CLEANUP_COMMANDS) / sizeof(char*),
            IPT_CLEANUP_COMMANDS, RunCmdFailureOk);
    setupOemIptablesHook();
    return 0;
}
Example #8
static bool createTables(SQLiteDatabase& sqliteDatabase)
{
    if (sqliteDatabase.tableExists("Databases"))
        return true;
    static const char* commands[] = {
        "CREATE TABLE Databases (id INTEGER PRIMARY KEY, name TEXT NOT NULL, description TEXT NOT NULL, version TEXT NOT NULL)",
        "CREATE UNIQUE INDEX Databases_name ON Databases(name)",

        "CREATE TABLE ObjectStores (id INTEGER PRIMARY KEY, name TEXT NOT NULL, keyPath TEXT, doAutoIncrement INTEGER NOT NULL, databaseId INTEGER NOT NULL REFERENCES Databases(id))",
        "CREATE UNIQUE INDEX ObjectStores_composit ON ObjectStores(databaseId, name)",

        "CREATE TABLE Indexes (id INTEGER PRIMARY KEY, objectStoreId INTEGER NOT NULL REFERENCES ObjectStore(id), name TEXT NOT NULL, keyPath TEXT, isUnique INTEGER NOT NULL)",
        "CREATE UNIQUE INDEX Indexes_composit ON Indexes(objectStoreId, name)",

        "CREATE TABLE ObjectStoreData (id INTEGER PRIMARY KEY, objectStoreId INTEGER NOT NULL REFERENCES ObjectStore(id), keyString TEXT, keyDate INTEGER, keyNumber INTEGER, value TEXT NOT NULL)",
        "CREATE UNIQUE INDEX ObjectStoreData_composit ON ObjectStoreData(keyString, keyDate, keyNumber, objectStoreId)",

        "CREATE TABLE IndexData (id INTEGER PRIMARY KEY, indexId INTEGER NOT NULL REFERENCES Indexes(id), keyString TEXT, keyDate INTEGER, keyNumber INTEGER, objectStoreDataId INTEGER NOT NULL REFERENCES ObjectStoreData(id))",
        "CREATE INDEX IndexData_composit ON IndexData(keyString, keyDate, keyNumber, indexId)",
        "CREATE INDEX IndexData_objectStoreDataId ON IndexData(objectStoreDataId)",
        "CREATE INDEX IndexData_indexId ON IndexData(indexId)",
    };

    return runCommands(sqliteDatabase, commands, sizeof(commands) / sizeof(commands[0]));
}
Example #9
int BandwidthController::setupIptablesHooks(void) {

    /* flush+clean is allowed to fail */
    flushCleanTables(true);
    runCommands(sizeof(IPT_SETUP_COMMANDS) / sizeof(char*),
            IPT_SETUP_COMMANDS, RunCmdFailureBad);

    return 0;
}
Example #10
static bool createMetaDataTable(SQLiteDatabase& sqliteDatabase)
{
    static const char* commands[] = {
        "CREATE TABLE MetaData (name TEXT PRIMARY KEY, value NONE)",
        "INSERT INTO MetaData VALUES ('version', 1)",
    };

    return runCommands(sqliteDatabase, commands, sizeof(commands) / sizeof(commands[0]));
}
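Example #4 above also relies on a getDatabaseVersion() helper that is not shown. A hypothetical standalone sketch against the raw sqlite3 C API, reading back the 'version' row that createMetaDataTable() writes, might look like this:

#include <sqlite3.h>

/* Hypothetical sketch: read the 'version' row written by createMetaDataTable().
 * Returns false if the statement cannot be prepared or the row is missing. */
static bool getDatabaseVersion(sqlite3* db, int* version)
{
    sqlite3_stmt* stmt = nullptr;
    if (sqlite3_prepare_v2(db, "SELECT value FROM MetaData WHERE name = 'version'",
                           -1, &stmt, nullptr) != SQLITE_OK)
        return false;
    bool ok = false;
    if (sqlite3_step(stmt) == SQLITE_ROW) {
        *version = sqlite3_column_int(stmt, 0);
        ok = true;
    }
    sqlite3_finalize(stmt);
    return ok;
}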
Example #11
void IMAP::unblockCommands()
{
    if ( d->state != NotAuthenticated )
        while ( d->commands.firstElement() &&
                d->commands.firstElement()->state() == Command::Retired )
            d->commands.shift();
    if ( d->runningCommands )
        d->runCommandsAgain = true;
    else
        runCommands();
}
Example #12
/* This function runs the pipeline of commands. If there are no pipes, the command
 * simply executes. Otherwise, the last command spawns a child process, sets up
 * a pipe with it and waits for it to run (recursively)*/
int runCommands(char** tokens, char** pipes, int numTokens, int numPipes){
	
	/* Simple case*/
	if(numPipes == 0){
		forkProcess(tokens, numTokens);
		exit(0);
	}else{
		
		int i,pid;
		int fd[2];
		
		pipe(fd);

		int commandTokens = 0;
		
		/* Count from the back*/
		for(i = numTokens-1; tokens[i] > pipes[numPipes-1];--i){
			++commandTokens;
		}
		
		pid	= fork();
		
		/* The child is the command before the last one */
		if(pid==0){
			dup2(fd[1],1);
			close(fd[0]);
			/* It sees everything before the last pipe */
			runCommands(tokens, pipes, numTokens-commandTokens, numPipes - 1);
			exit(0);
		}else if(pid == -1){
			perror("fork");
			/* The parent waits on output from the child and then executes*/
		}else{
			dup2(fd[0],0);
			close(fd[1]);
			waitAround(pid);
			/* Execute everything after the last pipe*/
			forkProcess(tokens + (numTokens - commandTokens), commandTokens);
			exit(0);
		}
	}
	/* Only reached if fork() failed */
	return -1;
}
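The helpers forkProcess(), waitAround(), getInput() and parseInput() used by this shell are defined elsewhere in the project and are not shown on this page. The token/pipe bookkeeping works by address comparison: parseInput() records a pointer to the first character of every token and of every '|', so any token whose address is greater than pipes[numPipes-1] belongs to the command after the last pipe. As a small illustration, a minimal waitAround() could look like the assumed sketch below; it is not the project's actual helper.

#include <cstdio>
#include <sys/types.h>
#include <sys/wait.h>

/* Assumed sketch of waitAround(): block until the given child exits and
 * report an abnormal exit status on stderr. */
void waitAround(pid_t pid)
{
    int status;
    if (waitpid(pid, &status, 0) == -1) {
        perror("waitpid");
        return;
    }
    if (WIFEXITED(status) && WEXITSTATUS(status) != 0)
        fprintf(stderr, "child %d exited with status %d\n",
                (int)pid, WEXITSTATUS(status));
}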
Example #13
void IMAP::react( Event e )
{
    d->bytesArrived += readBuffer()->size();
    switch ( e ) {
    case Read:
        parse();
        if ( d->bytesArrived > 32768 && state() == NotAuthenticated ) {
            log( ">32k received before login" );
            enqueue( "* BYE overlong login sequence\r\n" );
            Connection::setState( Closing );
            if ( d->reader ) {
                Scope s( d->reader->log() );
                d->reader->read();
            }
        }
        break;

    case Timeout:
        if ( state() != Logout ) {
            log( "Idle timeout" );
            enqueue( "* BYE Tempus fugit\r\n" );
        }
        Connection::setState( Closing );
        if ( d->reader ) {
            Scope s( d->reader->log() );
            d->reader->read();
        }
        setSession( 0 );
        break;

    case Connect:
        break;

    case Error:
    case Close:
        if ( session() ) {
            log( "Unexpected close by client" );
            setSession( 0 );
        }
        if ( !d->commands.isEmpty() ) {
            List<Command>::Iterator i( d->commands );
            while ( i ) {
                Command * c = i;
                ++i;
                if ( c->state() == Command::Unparsed ||
                     c->state() == Command::Blocked ||
                     c->state() == Command::Executing )
                    c->error( Command::No,
                              "Unexpected close by client" );
            }
        }
        break;

    case Shutdown:
        enqueue( "* BYE server shutdown\r\n" );
        if ( session() && d->commands.isEmpty() )
            setSession( 0 );
        break;
    }

    runCommands();

    d->bytesArrived -= readBuffer()->size();

    if ( timeout() == 0 ||
         ( e == Read && state() != NotAuthenticated ) ) {
        switch ( state() ) {
        case NotAuthenticated:
            setTimeoutAfter( 120 );
            break;
        case Authenticated:
        case Selected:
            if ( idle() )
                setTimeoutAfter( 3600 ); // one hour while IDLE
            else
                setTimeoutAfter( 1860 ); // a half-hour without
            break;
        case Logout:
            break;
        }

    }
}
Example #14
    /**
     * Run a query -- includes checking for and running a Command.
     * @return points to ns if exhaust mode. 0=normal mode
     * @locks the db mutex for reading (and potentially for writing temporarily to create a new db).
     * @yields the db mutex periodically after acquiring it.
     * @asserts on scan and order memory exhaustion and other cases.
     */
    const char *runQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result) {
        shared_ptr<ParsedQuery> pq_shared( new ParsedQuery(q) );
        ParsedQuery& pq( *pq_shared );
        BSONObj jsobj = q.query;
        int queryOptions = q.queryOptions;
        const char *ns = q.ns;

        if( logLevel >= 2 )
            log() << "runQuery called " << ns << " " << jsobj << endl;

        curop.debug().ns = ns;
        curop.debug().ntoreturn = pq.getNumToReturn();
        curop.debug().query = jsobj;
        curop.setQuery(jsobj);

        // Run a command.
        
        if ( pq.couldBeCommand() ) {
            BufBuilder bb;
            bb.skip(sizeof(QueryResult));
            BSONObjBuilder cmdResBuf;
            if ( runCommands(ns, jsobj, curop, bb, cmdResBuf, false, queryOptions) ) {
                curop.debug().iscommand = true;
                curop.debug().query = jsobj;
                curop.markCommand();

                auto_ptr< QueryResult > qr;
                qr.reset( (QueryResult *) bb.buf() );
                bb.decouple();
                qr->setResultFlagsToOk();
                qr->len = bb.len();
                curop.debug().responseLength = bb.len();
                qr->setOperation(opReply);
                qr->cursorId = 0;
                qr->startingFrom = 0;
                qr->nReturned = 1;
                result.setData( qr.release(), true );
            }
            else {
                uasserted(13530, "bad or malformed command request?");
            }
            return 0;
        }

        bool explain = pq.isExplain();
        BSONObj order = pq.getOrder();
        BSONObj query = pq.getFilter();

        /* The ElemIter will not be happy if this isn't really an object. So throw exception
           here when that is true.
           (Which may indicate bad data from client.)
        */
        if ( query.objsize() == 0 ) {
            out() << "Bad query object?\n  jsobj:";
            out() << jsobj.toString() << "\n  query:";
            out() << query.toString() << endl;
            uassert( 10110 , "bad query object", false);
        }

        Client::ReadContext ctx( ns , dbpath ); // read locks
        const ConfigVersion shardingVersionAtStart = shardingState.getVersion( ns );

        replVerifyReadsOk(&pq);

        if ( pq.hasOption( QueryOption_CursorTailable ) ) {
            NamespaceDetails *d = nsdetails( ns );
            uassert( 13051, "tailable cursor requested on non capped collection", d && d->isCapped() );
            const BSONObj nat1 = BSON( "$natural" << 1 );
            if ( order.isEmpty() ) {
                order = nat1;
            }
            else {
                uassert( 13052, "only {$natural:1} order allowed for tailable cursor", order == nat1 );
            }
        }

        // Run a simple id query.
        
        if ( ! (explain || pq.showDiskLoc()) && isSimpleIdQuery( query ) && !pq.hasOption( QueryOption_CursorTailable ) ) {

            int n = 0;
            bool nsFound = false;
            bool indexFound = false;

            BSONObj resObject;
            Client& c = cc();
            bool found = Helpers::findById( c, ns , query , resObject , &nsFound , &indexFound );
            if ( nsFound == false || indexFound == true ) {
                
                if ( shardingState.needShardChunkManager( ns ) ) {
                    ShardChunkManagerPtr m = shardingState.getShardChunkManager( ns );
                    if ( m && ! m->belongsToMe( resObject ) ) {
                        // I have something with this _id
                        // but it doesn't belong to me
                        // so return nothing
                        resObject = BSONObj();
                        found = false;
                    }
                }

                BufBuilder bb(sizeof(QueryResult)+resObject.objsize()+32);
                bb.skip(sizeof(QueryResult));
                
                curop.debug().idhack = true;
                if ( found ) {
                    n = 1;
                    fillQueryResultFromObj( bb , pq.getFields() , resObject );
                }
                auto_ptr< QueryResult > qr;
                qr.reset( (QueryResult *) bb.buf() );
                bb.decouple();
                qr->setResultFlagsToOk();
                qr->len = bb.len();
                
                curop.debug().responseLength = bb.len();
                qr->setOperation(opReply);
                qr->cursorId = 0;
                qr->startingFrom = 0;
                qr->nReturned = n;
                result.setData( qr.release(), true );
                return NULL;
            }
        }
        
        // Run a regular query.
        
        BSONObj oldPlan;
        if ( explain && ! pq.hasIndexSpecifier() ) {
            MultiPlanScanner mps( ns, query, order );
            if ( mps.usingCachedPlan() ) {
                oldPlan =
                mps.oldExplain().firstElement().embeddedObject()
                .firstElement().embeddedObject().getOwned();
            }
        }

        // In some cases the query may be retried if there is an in memory sort size assertion.
        for( int retry = 0; retry < 2; ++retry ) {
            try {
                return queryWithQueryOptimizer( m, queryOptions, ns, jsobj, curop, query, order,
                                               pq_shared, oldPlan, shardingVersionAtStart, result );
            } catch ( const QueryRetryException & ) {
                verify( retry == 0 );
            }
        }
        verify( false );
        return 0;
    }
Example #15
int main(){
	/* The raw input is stored in a buffer. The locations of the first characters
	 * of the tokens and the pipes are recorded as pointers into the buffer */
	char input[INPUT_BUFFER];
	char* tokens[MAX_TOKENS];
	char* pipes[MAX_TOKENS];
	int numTokens;
	int numPipes;
	
	/* Assign names and function pointers to the global array above*/
	buildFunctionTable();

	while(1){

		/* See header for more info. This function prompts the user for input.
		 * a return of -1 means no non-whitespace input*/
		if(getInput(input)==-1){
			continue;
		}
		
		/* Break the input into tokens and pipes. Return an error code for
		 * different types of invalid input (or 0 on good input) */
		int valid = parseInput(input, tokens, pipes, &numTokens, &numPipes);
		if(valid<0){
			switch(valid){
				case LINE_ENDING_INVALID:
					fprintf(stderr, "Error: There was an issue with line endings\n");
					break;
				case MAX_ARGS_ERROR:
					fprintf(stderr, "Error: Too many arguments\n");
					break;
				case QUOTE_ERROR:
					fprintf(stderr, "Error: Unmatched Quotes\n");
					break;
				default:
					fprintf(stderr, "Error: Invalid Input\n");
					break;
			}
			continue;
		}
		
		/* If there is any input, check for a built-in command first; if it is
		 * not one, fork and execute it as an external command (or pipeline) */
		if(numTokens > 0){
			
			if(checkBuiltIn(tokens,numTokens)!=0){
				int pid;
				pid	= fork();
				
				if(pid==0){
					/* The child runs the command */
					runCommands(tokens,pipes,numTokens,numPipes);
					exit(0);
				}else if(pid == -1){
					perror("fork");
				}else{
					/* The parent gives the return message */
					waitAround(pid);
				}
			}
		}
	}
	
	return 0;
}
Example #16
int
main (int argc, char* argv[])
{
    bkupInfo info;
    bkupType type;
    int      len;
    int      rst;               /* return status */


    if (getuid() != ROOT_UID) {
        fprintf(stderr, "must be root\n");
        exit(1);
    }
    if (argc <= 2) {
        usage();
    }
    umask(defUmask);
    memset(&info, 0, sizeof(info));

    info.dest = argv[2];
    info.src  = index(argv[1], ':');
    if (info.src) {
        info.host = index(argv[1], '@');
        if (!info.host || info.host > info.src) {
            /* no user specified, or `@' appeared after `:'
             */
            info.host = argv[1];
            info.user = DefaultUser;
        } else {
            *info.host = '\0';
            ++info.host;
            info.user = argv[1];
        }
        *info.src = '\0';
        ++info.src;
    } else {
        info.src  = argv[1];
    }
    if (*info.src  != '/') goto errorExit;  /* src  must be full path */
    if (*info.dest != '/') goto errorExit;  /* dest must be full path */
    len = strlen(info.src) - 1;
    if (info.src[len] == '/') {  /* strip tail '/' */
        info.src[len] = '\0';
    }
    len = strlen(info.dest) - 1;
    if (info.dest[len] == '/') {
        info.dest[len] = '\0';
    }

    if (info.host) {
        makeSshKey(&info);
    }
    chkDest(&info);
    if (info.host) {
        exit(doRemote(&info));
    }

    type = chkSource(&info);
    openFilesLocal(&info);

    switch (type) {
    case bkupFirstTime:
        info.func = firstTimeBackup;
        break;
    case bkupRecurrent:
        info.func = recurrentBackup;
        break;
    default:
        errExit(("wrong bkup type (%d)", type));
    }
    rst = dirwalk(info.src, &info);
    closeFiles(&info);
    if ((type == bkupRecurrent) && unlink(info.oldJpath)) {
        errSysRet(("unlink(%s)", info.oldJpath));
    }
    rst = runCommands(&info);
    removeFiles(&info);
    if (type == bkupFirstTime && !rst) {
        if (info.jpath && unlink(info.jpath)) {
            errSysRet(("unlink(%s)", info.jpath));
        }
    }
    exit(!rst);


errorExit:
    fprintf(stderr,
            "Source and Destination directories must be full path\n");
    exit(1);
    return 0;                   /* to make gcc happy */
}
Example #17
    /**
     * Run a query -- includes checking for and running a Command.
     * @return points to ns if exhaust mode. 0=normal mode
     * @locks the db mutex for reading (and potentially for writing temporarily to create a new db).
     * @asserts on scan and order memory exhaustion and other cases.
     */
    string runQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result) {
        shared_ptr<ParsedQuery> pq_shared( new ParsedQuery(q) );
        ParsedQuery& pq( *pq_shared );
        BSONObj jsobj = q.query;
        int queryOptions = q.queryOptions;
        const char *ns = q.ns;

        uassert( 16332 , "can't have an empty ns" , ns[0] );

        if( logLevel >= 2 )
            log() << "runQuery called " << ns << " " << jsobj << endl;

        curop.debug().ns = ns;
        curop.debug().ntoreturn = pq.getNumToReturn();
        curop.debug().query = jsobj;
        curop.setQuery(jsobj);

        uassert( 16256, str::stream() << "Invalid ns [" << ns << "]", NamespaceString::isValid(ns) );

        // Run a command.
        
        if ( pq.couldBeCommand() ) {
            curop.markCommand();
            BufBuilder bb;
            bb.skip(sizeof(QueryResult));
            BSONObjBuilder cmdResBuf;
            if ( runCommands(ns, jsobj, curop, bb, cmdResBuf, false, queryOptions) ) {
                curop.debug().iscommand = true;
                curop.debug().query = jsobj;

                auto_ptr< QueryResult > qr;
                qr.reset( (QueryResult *) bb.buf() );
                bb.decouple();
                qr->setResultFlagsToOk();
                qr->len = bb.len();
                curop.debug().responseLength = bb.len();
                qr->setOperation(opReply);
                qr->cursorId = 0;
                qr->startingFrom = 0;
                qr->nReturned = 1;
                result.setData( qr.release(), true );
            }
            else {
                uasserted(13530, "bad or malformed command request?");
            }
            return "";
        }

        const bool explain = pq.isExplain();
        const bool tailable = pq.hasOption(QueryOption_CursorTailable);
        BSONObj order = pq.getOrder();
        BSONObj query = pq.getFilter();

        /* The ElemIter will not be happy if this isn't really an object. So throw exception
           here when that is true.
           (Which may indicate bad data from client.)
        */
        if ( query.objsize() == 0 ) {
            out() << "Bad query object?\n  jsobj:";
            out() << jsobj.toString() << "\n  query:";
            out() << query.toString() << endl;
            uassert( 10110 , "bad query object", false);
        }

        // Tailable cursors need to read newly written entries from the tail
        // of the collection. They manually arbitrate with the collection over
        // what data is readable and when, so we choose read uncommitted isolation.
        OpSettings settings;
        settings.setQueryCursorMode(DEFAULT_LOCK_CURSOR);
        settings.setBulkFetch(true);
        settings.setCappedAppendPK(pq.hasOption(QueryOption_AddHiddenPK));
        cc().setOpSettings(settings);

        // If our caller has a transaction, it's multi-statement.
        const bool inMultiStatementTxn = cc().hasTxn();
        if (tailable) {
            // Because it's easier to disable this. It shouldn't be happening in a normal system.
            uassert(16812, "May not perform a tailable query in a multi-statement transaction.",
                           !inMultiStatementTxn);
        }

        // Begin a read-only, snapshot transaction under normal circumstances.
        // If the cursor is tailable, we need to be able to read uncommitted data.
        const int txnFlags = (tailable ? DB_READ_UNCOMMITTED : DB_TXN_SNAPSHOT) | DB_TXN_READ_ONLY;
        LOCK_REASON(lockReason, "query");
        Client::ReadContext ctx(ns, lockReason);
        scoped_ptr<Client::Transaction> transaction(!inMultiStatementTxn ?
                                                    new Client::Transaction(txnFlags) : NULL);

        bool hasRetried = false;
        while ( 1 ) {
            try {
                replVerifyReadsOk(&pq);

                // Fast-path for primary key queries.
                if (!explain && !tailable) {
                    replVerifyReadsOk(&pq);
                    if (_tryQueryByPKHack(ns, query, pq, curop, result)) {
                        if (transaction) {
                            transaction->commit();
                        }
                        return "";
                    }
                }

                // sanity check the query and projection
                if (pq.getFields() != NULL) {
                    pq.getFields()->validateQuery( query );
                }

                        
                if (tailable) {
                    Collection *cl = getCollection( ns );
                    if (cl != NULL && !(cl->isCapped() || str::equals(ns, rsoplog))) {
                        uasserted( 13051, "tailable cursor requested on non-capped, non-oplog collection" );
                    }
                    const BSONObj nat1 = BSON( "$natural" << 1 );
                    if ( order.isEmpty() ) {
                        order = nat1;
                    } else {
                        uassert( 13052, "only {$natural:1} order allowed for tailable cursor", order == nat1 );
                    }
                }
                    
                // Run a regular query.

                // these may now be stored in a ClientCursor or somewhere else,
                // so make sure we use a real copy
                jsobj = jsobj.getOwned();
                query = query.getOwned();
                order = order.getOwned();
                const ConfigVersion shardingVersionAtStart = shardingState.getVersion( ns );
                const bool getCachedExplainPlan = ! hasRetried && explain && ! pq.hasIndexSpecifier();
                const bool savedCursor = queryWithQueryOptimizer( queryOptions, ns, jsobj, curop, query,
                                                                  order, pq_shared, shardingVersionAtStart,
                                                                  getCachedExplainPlan, inMultiStatementTxn,
                                                                  result );
                // Did not save the cursor, so we can commit the transaction now if it exists.
                if (transaction && !savedCursor) {
                    transaction->commit();
                }
                return curop.debug().exhaust ? ns : "";
            }
            catch ( const QueryRetryException & ) {
                // In some cases the query may be retried if there is an in memory sort size assertion.
                verify( ! hasRetried );
                hasRetried = true;
            }
        }
    }
Example #18
    std::string newRunQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result) {
        // Validate the namespace.
        const char *ns = q.ns;
        uassert(16332, "can't have an empty ns", ns[0]);

        const NamespaceString nsString(ns);
        uassert(16256, str::stream() << "Invalid ns [" << ns << "]", nsString.isValid());

        // Set curop information.
        curop.debug().ns = ns;
        curop.debug().ntoreturn = q.ntoreturn;
        curop.debug().query = q.query;
        curop.setQuery(q.query);

        // If the query is really a command, run it.
        if (nsString.isCommand()) {
            int nToReturn = q.ntoreturn;
            uassert(16979, str::stream() << "bad numberToReturn (" << nToReturn
                                         << ") for $cmd type ns - can only be 1 or -1",
                    nToReturn == 1 || nToReturn == -1);

            curop.markCommand();

            BufBuilder bb;
            bb.skip(sizeof(QueryResult));

            BSONObjBuilder cmdResBuf;
            if (!runCommands(ns, q.query, curop, bb, cmdResBuf, false, q.queryOptions)) {
                uasserted(13530, "bad or malformed command request?");
            }

            curop.debug().iscommand = true;
            // TODO: Does this get overwritten/do we really need to set this twice?
            curop.debug().query = q.query;

            QueryResult* qr = reinterpret_cast<QueryResult*>(bb.buf());
            bb.decouple();
            qr->setResultFlagsToOk();
            qr->len = bb.len();
            curop.debug().responseLength = bb.len();
            qr->setOperation(opReply);
            qr->cursorId = 0;
            qr->startingFrom = 0;
            qr->nReturned = 1;
            result.setData(qr, true);
            return "";
        }

        // This is a read lock.  We require this because if we're parsing a $where, the
        // where-specific parsing code assumes we have a lock and creates execution machinery that
        // requires it.
        Client::ReadContext ctx(q.ns);
        Collection* collection = ctx.ctx().db()->getCollection( ns );

        // Parse the qm into a CanonicalQuery.
        CanonicalQuery* cq;
        Status canonStatus = CanonicalQuery::canonicalize(q, &cq);
        if (!canonStatus.isOK()) {
            uasserted(17287, str::stream() << "Can't canonicalize query: " << canonStatus.toString());
        }
        verify(cq);

        QLOG() << "Running query:\n" << cq->toString();
        LOG(2) << "Running query: " << cq->toStringShort();

        // Parse, canonicalize, plan, transcribe, and get a runner.
        Runner* rawRunner = NULL;

        // We use this a lot below.
        const LiteParsedQuery& pq = cq->getParsed();

        // We'll now try to get the query runner that will execute this query for us. There
        // are a few cases in which we know upfront which runner we should get and, therefore,
        // we shortcut the selection process here.
        //
        // (a) If the query is over a collection that doesn't exist, we get a special runner,
        // the EOFRunner, which doesn't return any results.
        //
        // (b) If the query is a replication initial-sync one, we get a SingleSolutionRunner
        // that uses a specifically designed stage that skips extents faster (see details in
        // exec/oplogstart.h)
        //
        // Otherwise we go through the selection of which runner is most suited to the
        // query + run-time context at hand.
        Status status = Status::OK();
        if (collection == NULL) {
            rawRunner = new EOFRunner(cq, cq->ns());
        }
        else if (pq.hasOption(QueryOption_OplogReplay)) {
            status = getOplogStartHack(collection, cq, &rawRunner);
        }
        else {
            // Takes ownership of cq.
            size_t options = QueryPlannerParams::DEFAULT;
            if (shardingState.needCollectionMetadata(pq.ns())) {
                options |= QueryPlannerParams::INCLUDE_SHARD_FILTER;
            }
            status = getRunner(cq, &rawRunner, options);
        }

        if (!status.isOK()) {
            // NOTE: Do not access cq as getRunner has deleted it.
            uasserted(17007, "Unable to execute query: " + status.reason());
        }

        verify(NULL != rawRunner);
        auto_ptr<Runner> runner(rawRunner);

        // We freak out later if this changes before we're done with the query.
        const ChunkVersion shardingVersionAtStart = shardingState.getVersion(cq->ns());

        // Handle query option $maxTimeMS (not used with commands).
        curop.setMaxTimeMicros(static_cast<unsigned long long>(pq.getMaxTimeMS()) * 1000);
        killCurrentOp.checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

        // uassert if we are not on a primary, and not a secondary with SlaveOk query parameter set.
        replVerifyReadsOk(&pq);

        // If this exists, the collection is sharded.
        // If it doesn't exist, we can assume we're not sharded.
        // If we're sharded, we might encounter data that is not consistent with our sharding state.
        // We must ignore this data.
        CollectionMetadataPtr collMetadata;
        if (!shardingState.needCollectionMetadata(pq.ns())) {
            collMetadata = CollectionMetadataPtr();
        }
        else {
            collMetadata = shardingState.getCollectionMetadata(pq.ns());
        }

        // Run the query.
        // bb is used to hold query results
        // this buffer should contain either requested documents per query or
        // explain information, but not both
        BufBuilder bb(32768);
        bb.skip(sizeof(QueryResult));

        // How many results have we obtained from the runner?
        int numResults = 0;

        // If we're replaying the oplog, we save the last time that we read.
        OpTime slaveReadTill;

        // Do we save the Runner in a ClientCursor for getMore calls later?
        bool saveClientCursor = false;

        // We turn on auto-yielding for the runner here.  The runner registers itself with the
        // active runners list in ClientCursor.
        auto_ptr<ScopedRunnerRegistration> safety(new ScopedRunnerRegistration(runner.get()));
        runner->setYieldPolicy(Runner::YIELD_AUTO);

        BSONObj obj;
        Runner::RunnerState state;
        // uint64_t numMisplacedDocs = 0;

        // set this outside loop. we will need to use this both within loop and when deciding
        // to fill in explain information
        const bool isExplain = pq.isExplain();

        // Have we retrieved info about which plan the runner will
        // use to execute the query yet?
        bool gotPlanInfo = false;
        PlanInfo* rawInfo;
        boost::scoped_ptr<PlanInfo> planInfo;

        while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&obj, NULL))) {
            // Add result to output buffer. This is unnecessary if explain info is requested
            if (!isExplain) {
                bb.appendBuf((void*)obj.objdata(), obj.objsize());
            }

            // Count the result.
            ++numResults;

            // In the case of the multi plan runner, we may not be able to
            // successfully retrieve plan info until after the query starts
            // to run. This is because the multi plan runner doesn't know what
            // plan it will end up using until it runs candidates and selects
            // the best.
            //
            // TODO: Do we ever want to output what the MPR is comparing?
            if (!gotPlanInfo) {
                Status infoStatus = runner->getInfo(NULL, &rawInfo);
                if (infoStatus.isOK()) {
                    gotPlanInfo = true;
                    planInfo.reset(rawInfo);
                    // planSummary is really a ThreadSafeString which copies the data from
                    // the provided pointer.
                    curop.debug().planSummary = planInfo->planSummary.c_str();
                }
            }

            // Possibly note slave's position in the oplog.
            if (pq.hasOption(QueryOption_OplogReplay)) {
                BSONElement e = obj["ts"];
                if (Date == e.type() || Timestamp == e.type()) {
                    slaveReadTill = e._opTime();
                }
            }

            // TODO: only one type of 2d search doesn't support this.  We need a way to pull it out
            // of CanonicalQuery. :(
            const bool supportsGetMore = true;
            if (isExplain) {
                if (enoughForExplain(pq, numResults)) {
                    break;
                }
            }
            else if (!supportsGetMore && (enough(pq, numResults)
                                          || bb.len() >= MaxBytesToReturnToClientAtOnce)) {
                break;
            }
            else if (enoughForFirstBatch(pq, numResults, bb.len())) {
                QLOG() << "Enough for first batch, wantMore=" << pq.wantMore()
                       << " numToReturn=" << pq.getNumToReturn()
                       << " numResults=" << numResults
                       << endl;
                // If only one result requested assume it's a findOne() and don't save the cursor.
                if (pq.wantMore() && 1 != pq.getNumToReturn()) {
                    QLOG() << " runner EOF=" << runner->isEOF() << endl;
                    saveClientCursor = !runner->isEOF();
                }
                break;
            }
        }

        // Try to get information about the plan which the runner
        // will use to execute the query, if we don't have it already.
        if (!gotPlanInfo) {
            Status infoStatus = runner->getInfo(NULL, &rawInfo);
            if (infoStatus.isOK()) {
                gotPlanInfo = true;
                planInfo.reset(rawInfo);
                // planSummary is really a ThreadSafeString which copies the data from
                // the provided pointer.
                curop.debug().planSummary = planInfo->planSummary.c_str();
            }
        }

        // If we cache the runner later, we want to deregister it as it receives notifications
        // anyway by virtue of being cached.
        //
        // If we don't cache the runner later, we are deleting it, so it must be deregistered.
        //
        // So, no matter what, deregister the runner.
        safety.reset();

        // Caller expects exceptions thrown in certain cases.
        if (Runner::RUNNER_ERROR == state) {
            TypeExplain* bareExplain;
            Status res = runner->getInfo(&bareExplain, NULL);
            if (res.isOK()) {
                boost::scoped_ptr<TypeExplain> errorExplain(bareExplain);
                error() << "Runner error, stats:\n"
                        << errorExplain->stats.jsonString(Strict, true);
            }
            uasserted(17144, "Runner error: " + WorkingSetCommon::toStatusString(obj));
        }

        // Why save a dead runner?
        if (Runner::RUNNER_DEAD == state) {
            saveClientCursor = false;
        }
        else if (pq.hasOption(QueryOption_CursorTailable)) {
            // If we're tailing a capped collection, we don't bother saving the cursor if the
            // collection is empty. Otherwise, the semantics of the tailable cursor is that the
            // client will keep trying to read from it. So we'll keep it around.
            Collection* collection = ctx.ctx().db()->getCollection(cq->ns());
            if (collection && collection->numRecords() != 0 && pq.getNumToReturn() != 1) {
                saveClientCursor = true;
            }
        }

        // TODO(greg): This will go away soon.
        if (!shardingState.getVersion(pq.ns()).isWriteCompatibleWith(shardingVersionAtStart)) {
            // if the version changed during the query we might be missing some data, and it's safe to
            // send this as mongos can resend at this point
            throw SendStaleConfigException(pq.ns(), "version changed during initial query",
                                           shardingVersionAtStart,
                                           shardingState.getVersion(pq.ns()));
        }

        // Used to fill in explain and to determine if the query is slow enough to be logged.
        int elapsedMillis = curop.elapsedMillis();

        // Get explain information if:
        // 1) it is needed by an explain query;
        // 2) profiling is enabled; or
        // 3) profiling is disabled but we still need explain details to log a "slow" query.
        // Producing explain information is expensive and should be done only if we are certain
        // the information will be used.
        boost::scoped_ptr<TypeExplain> explain(NULL);
        if (isExplain ||
            ctx.ctx().db()->getProfilingLevel() > 0 ||
            elapsedMillis > serverGlobalParams.slowMS) {
            // Ask the runner to produce explain information.
            TypeExplain* bareExplain;
            Status res = runner->getInfo(&bareExplain, NULL);
            if (res.isOK()) {
                explain.reset(bareExplain);
            }
            else if (isExplain) {
                error() << "could not produce explain of query '" << pq.getFilter()
                        << "', error: " << res.reason();
                // If numResults and the data in bb don't correspond, we'll crash later when rooting
                // through the reply msg.
                BSONObj emptyObj;
                bb.appendBuf((void*)emptyObj.objdata(), emptyObj.objsize());
                // The explain output is actually a result.
                numResults = 1;
                // TODO: we can fill out millis etc. here just fine even if the plan screwed up.
            }
        }

        // Fill in the missing run-time fields in explain, starting with properties of
        // the process running the query.
        if (isExplain && NULL != explain.get()) {
            std::string server = mongoutils::str::stream()
                << getHostNameCached() << ":" << serverGlobalParams.port;
            explain->setServer(server);

            // We might have skipped some results due to chunk migration etc. so our count is
            // correct.
            explain->setN(numResults);

            // Clock the whole operation.
            explain->setMillis(elapsedMillis);

            BSONObj explainObj = explain->toBSON();
            bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize());

            // The explain output is actually a result.
            numResults = 1;
        }

        long long ccId = 0;
        if (saveClientCursor) {
            // We won't use the runner until it's getMore'd.
            runner->saveState();

            // Allocate a new ClientCursor.  We don't have to worry about leaking it as it's
            // inserted into a global map by its ctor.
            ClientCursor* cc = new ClientCursor(collection, runner.get(),
                                                cq->getParsed().getOptions(),
                                                cq->getParsed().getFilter());
            ccId = cc->cursorid();

            QLOG() << "caching runner with cursorid " << ccId
                   << " after returning " << numResults << " results" << endl;

            // ClientCursor takes ownership of runner.  Release to make sure it's not deleted.
            runner.release();

            // TODO document
            if (pq.hasOption(QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
                cc->slaveReadTill(slaveReadTill);
            }

            // TODO document
            if (pq.hasOption(QueryOption_Exhaust)) {
                curop.debug().exhaust = true;
            }

            // Set attributes for getMore.
            cc->setCollMetadata(collMetadata);
            cc->setPos(numResults);

            // If the query had a time limit, remaining time is "rolled over" to the cursor (for
            // use by future getmore ops).
            cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros());
        }
        else {
            QLOG() << "Not caching runner but returning " << numResults << " results.\n";
        }

        // Add the results from the query into the output buffer.
        result.appendData(bb.buf(), bb.len());
        bb.decouple();

        // Fill out the output buffer's header.
        QueryResult* qr = static_cast<QueryResult*>(result.header());
        qr->cursorId = ccId;
        curop.debug().cursorid = (0 == ccId ? -1 : ccId);
        qr->setResultFlagsToOk();
        qr->setOperation(opReply);
        qr->startingFrom = 0;
        qr->nReturned = numResults;

        // Set debug information for consumption by the profiler.
        curop.debug().ntoskip = pq.getSkip();
        curop.debug().nreturned = numResults;
        if (NULL != explain.get()) {
            if (explain->isScanAndOrderSet()) {
                curop.debug().scanAndOrder = explain->getScanAndOrder();
            }
            else {
                curop.debug().scanAndOrder = false;
            }

            if (explain->isNScannedSet()) {
                curop.debug().nscanned = explain->getNScanned();
            }

            if (explain->isNScannedObjectsSet()) {
                curop.debug().nscannedObjects = explain->getNScannedObjects();
            }

            if (explain->isIDHackSet()) {
                curop.debug().idhack = explain->getIDHack();
            }

            if (!explain->stats.isEmpty()) {
                // execStats is a CachedBSONObj because it lives in the race-prone
                // curop.
                curop.debug().execStats.set(explain->stats);

                // Replace exec stats with plan summary if stats cannot fit into CachedBSONObj.
                if (curop.debug().execStats.tooBig() && !curop.debug().planSummary.empty()) {
                    BSONObjBuilder bob;
                    bob.append("summary", curop.debug().planSummary.toString());
                    curop.debug().execStats.set(bob.done());
                }

            }
        }

        // curop.debug().exhaust is set above.
        return curop.debug().exhaust ? pq.ns() : "";
    }
Example #19
void POP::parse()
{
    Buffer *b = readBuffer();

    while ( b->size() > 0 ) {
        if ( !d->reader ) {
            if ( d->reserved )
                break;

            EString * s = b->removeLine( 255 );

            if ( !s && b->size() < 255 )
                return;

            if ( !s ) {
                log( "Connection closed due to overlong line (" +
                     fn( b->size() ) + " bytes)", Log::Error );
                err( "Line too long. Closing connection." );
                Connection::setState( Closing );
                return;
            }

            bool unknown = false;

            EStringList * args = EStringList::split( ' ', *s );
            EString cmd = args->take( args->first() )->lower();

            if ( d->sawUser && !( cmd == "quit" || cmd == "pass" ) ) {
                d->sawUser = false;
                unknown = true;
            }
            else if ( cmd == "quit" && args->isEmpty() ) {
                newCommand( d->commands, this, PopCommand::Quit );
            }
            else if ( cmd == "capa" && args->isEmpty() ) {
                newCommand( d->commands, this, PopCommand::Capa );
            }
            else if ( d->state == Authorization ) {
                if ( cmd == "stls" ) {
                    if ( hasTls() )
                        err( "Nested STLS" );
                    else
                        newCommand( d->commands, this, PopCommand::Stls );
                }
                else if ( cmd == "auth" ) {
                    newCommand( d->commands, this, PopCommand::Auth, args );
                }
                else if ( cmd == "user" && args->count() == 1 ) {
                    d->sawUser = true;
                    newCommand( d->commands, this, PopCommand::User, args );
                }
                else if ( d->sawUser && cmd == "pass" && args->count() >= 1 ) {
                    d->sawUser = false;
                    newCommand( d->commands, this, PopCommand::Pass, args );
                }
                else if ( cmd == "apop" && args->count() == 2 ) {
                    newCommand( d->commands, this, PopCommand::Apop, args );
                }
                else {
                    unknown = true;
                }
            }
            else if ( d->state == Transaction ) {
                if ( cmd == "stat" && args->isEmpty() ) {
                    newCommand( d->commands, this, PopCommand::Stat );
                }
                else if ( cmd == "list" && args->count() < 2 ) {
                    newCommand( d->commands, this, PopCommand::List, args );
                }
                else if ( cmd == "top" && args->count() == 2 ) {
                    newCommand( d->commands, this, PopCommand::Top, args );
                }
                else if ( cmd == "retr" && args->count() == 1 ) {
                    newCommand( d->commands, this, PopCommand::Retr, args );
                }
                else if ( cmd == "dele" && args->count() == 1 ) {
                    newCommand( d->commands, this, PopCommand::Dele, args );
                }
                else if ( cmd == "noop" && args->isEmpty() ) {
                    newCommand( d->commands, this, PopCommand::Noop );
                }
                else if ( cmd == "rset" && args->isEmpty() ) {
                    newCommand( d->commands, this, PopCommand::Rset );
                }
                else if ( cmd == "uidl" && args->count() < 2 ) {
                    newCommand( d->commands, this, PopCommand::Uidl, args );
                }
                else {
                    unknown = true;
                }
            }
            else {
                unknown = true;
            }

            if ( unknown ) {
                err( "Bad command" );
                recordSyntaxError();
            }
        }
        else {
            d->reader->read();
        }

        runCommands();
    }
}
Example #20
    std::string newRunQuery(OperationContext* txn,
                            Message& m,
                            QueryMessage& q,
                            CurOp& curop,
                            Message &result,
                            bool fromDBDirectClient) {
        // Validate the namespace.
        const char *ns = q.ns;
        uassert(16332, "can't have an empty ns", ns[0]);

        const NamespaceString nsString(ns);
        uassert(16256, str::stream() << "Invalid ns [" << ns << "]", nsString.isValid());

        // Set curop information.
        curop.debug().ns = ns;
        curop.debug().ntoreturn = q.ntoreturn;
        curop.debug().query = q.query;
        curop.setQuery(q.query);

        // If the query is really a command, run it.
        if (nsString.isCommand()) {
            int nToReturn = q.ntoreturn;
            uassert(16979, str::stream() << "bad numberToReturn (" << nToReturn
                                         << ") for $cmd type ns - can only be 1 or -1",
                    nToReturn == 1 || nToReturn == -1);

            curop.markCommand();

            BufBuilder bb;
            bb.skip(sizeof(QueryResult::Value));

            BSONObjBuilder cmdResBuf;
            if (!runCommands(txn, ns, q.query, curop, bb, cmdResBuf, false, q.queryOptions)) {
                uasserted(13530, "bad or malformed command request?");
            }

            curop.debug().iscommand = true;
            // TODO: Does this get overwritten/do we really need to set this twice?
            curop.debug().query = q.query;

            QueryResult::View qr = bb.buf();
            bb.decouple();
            qr.setResultFlagsToOk();
            qr.msgdata().setLen(bb.len());
            curop.debug().responseLength = bb.len();
            qr.msgdata().setOperation(opReply);
            qr.setCursorId(0);
            qr.setStartingFrom(0);
            qr.setNReturned(1);
            result.setData(qr.view2ptr(), true);
            return "";
        }

        const NamespaceString nss(q.ns);

        // Parse the qm into a CanonicalQuery.
        CanonicalQuery* cq;
        Status canonStatus = CanonicalQuery::canonicalize(
                                    q, &cq, WhereCallbackReal(txn, StringData(nss.db())));
        if (!canonStatus.isOK()) {
            uasserted(17287, str::stream() << "Can't canonicalize query: " << canonStatus.toString());
        }

        QLOG() << "Running query:\n" << cq->toString();
        LOG(2) << "Running query: " << cq->toStringShort();

        // Parse, canonicalize, plan, transcribe, and get a plan executor.
        PlanExecutor* rawExec = NULL;

        // We use this a lot below.
        const LiteParsedQuery& pq = cq->getParsed();

        AutoGetCollectionForRead ctx(txn, nss);

        const int dbProfilingLevel = (ctx.getDb() != NULL) ? ctx.getDb()->getProfilingLevel() :
                                                             serverGlobalParams.defaultProfile;

        Collection* collection = ctx.getCollection();

        // We'll now try to get the query executor that will execute this query for us. There
        // are a few cases in which we know upfront which executor we should get and, therefore,
        // we shortcut the selection process here.
        //
        // (a) If the query is over a collection that doesn't exist, we use an EOFStage.
        //
        // (b) if the query is a replication's initial sync one, we use a specifically designed
        // stage that skips extents faster (see details in exec/oplogstart.h).
        //
        // Otherwise we go through the selection of which executor is most suited to the
        // query + run-time context at hand.
        Status status = Status::OK();
        if (NULL != collection && pq.getOptions().oplogReplay) {
            // Takes ownership of 'cq'.
            status = getOplogStartHack(txn, collection, cq, &rawExec);
        }
        else {
            size_t options = QueryPlannerParams::DEFAULT;
            if (shardingState.needCollectionMetadata(pq.ns())) {
                options |= QueryPlannerParams::INCLUDE_SHARD_FILTER;
            }
            // Takes ownership of 'cq'.
            status = getExecutor(txn, collection, cq, PlanExecutor::YIELD_AUTO, &rawExec, options);
        }

        if (!status.isOK()) {
            // NOTE: Do not access cq as getExecutor has deleted it.
            uasserted(17007, "Unable to execute query: " + status.reason());
        }

        verify(NULL != rawExec);
        auto_ptr<PlanExecutor> exec(rawExec);

        // If it's actually an explain, do the explain and return rather than falling through
        // to the normal query execution loop.
        if (pq.isExplain()) {
            BufBuilder bb;
            bb.skip(sizeof(QueryResult::Value));

            BSONObjBuilder explainBob;
            Explain::explainStages(exec.get(), ExplainCommon::EXEC_ALL_PLANS, &explainBob);

            // Add the resulting object to the return buffer.
            BSONObj explainObj = explainBob.obj();
            bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize());

            curop.debug().iscommand = true;
            // TODO: Does this get overwritten/do we really need to set this twice?
            curop.debug().query = q.query;

            // Set query result fields.
            QueryResult::View qr = bb.buf();
            bb.decouple();
            qr.setResultFlagsToOk();
            qr.msgdata().setLen(bb.len());
            curop.debug().responseLength = bb.len();
            qr.msgdata().setOperation(opReply);
            qr.setCursorId(0);
            qr.setStartingFrom(0);
            qr.setNReturned(1);
            result.setData(qr.view2ptr(), true);
            return "";
        }

        // We freak out later if this changes before we're done with the query.
        const ChunkVersion shardingVersionAtStart = shardingState.getVersion(cq->ns());

        // Handle query option $maxTimeMS (not used with commands).
        curop.setMaxTimeMicros(static_cast<unsigned long long>(pq.getMaxTimeMS()) * 1000);
        txn->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

        // Fail (uassert) if we are neither a primary nor a secondary with the slaveOk query option set.
        bool slaveOK = pq.getOptions().slaveOk || pq.hasReadPref();
        status = repl::getGlobalReplicationCoordinator()->checkCanServeReadsFor(
                txn,
                NamespaceString(cq->ns()),
                slaveOK);
        uassertStatusOK(status);

        // If this exists, the collection is sharded.
        // If it doesn't exist, we can assume we're not sharded.
        // If we're sharded, we might encounter data that is not consistent with our sharding state.
        // We must ignore this data.
        CollectionMetadataPtr collMetadata;
        if (!shardingState.needCollectionMetadata(pq.ns())) {
            collMetadata = CollectionMetadataPtr();
        }
        else {
            collMetadata = shardingState.getCollectionMetadata(pq.ns());
        }

        // Run the query.
        // bb is used to hold the query results; this buffer should contain either the
        // requested documents or explain information, but not both.
        BufBuilder bb(32768);
        bb.skip(sizeof(QueryResult::Value));
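        // Reserve room at the front of the buffer for the reply header; it is filled in at the end.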

        // How many results have we obtained from the executor?
        int numResults = 0;

        // If we're replaying the oplog, we save the optime of the last entry we read.
        OpTime slaveReadTill;

        // Do we save the PlanExecutor in a ClientCursor for getMore calls later?
        bool saveClientCursor = false;

        BSONObj obj;
        PlanExecutor::ExecState state;
        // uint64_t numMisplacedDocs = 0;

        // Get summary info about which plan the executor is using.
        curop.debug().planSummary = Explain::getPlanSummary(exec.get());

        while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) {
            // Add result to output buffer.
            bb.appendBuf((void*)obj.objdata(), obj.objsize());

            // Count the result.
            ++numResults;

            // Possibly note slave's position in the oplog.
            if (pq.getOptions().oplogReplay) {
                BSONElement e = obj["ts"];
                if (Date == e.type() || Timestamp == e.type()) {
                    slaveReadTill = e._opTime();
                }
            }

            // TODO: only one type of 2d search doesn't support this.  We need a way to pull it out
            // of CanonicalQuery. :(
            const bool supportsGetMore = true;
            if (!supportsGetMore && (enough(pq, numResults)
                                     || bb.len() >= MaxBytesToReturnToClientAtOnce)) {
                break;
            }
            else if (enoughForFirstBatch(pq, numResults, bb.len())) {
                QLOG() << "Enough for first batch, wantMore=" << pq.wantMore()
                       << " numToReturn=" << pq.getNumToReturn()
                       << " numResults=" << numResults
                       << endl;
                // If only one result requested assume it's a findOne() and don't save the cursor.
                if (pq.wantMore() && 1 != pq.getNumToReturn()) {
                    QLOG() << " executor EOF=" << exec->isEOF() << endl;
                    saveClientCursor = !exec->isEOF();
                }
                break;
            }
        }

        // If we cache the executor later, we want to deregister it as it receives notifications
        // anyway by virtue of being cached.
        //
        // If we don't cache the executor later, we are deleting it, so it must be deregistered.
        //
        // So, no matter what, deregister the executor.
        exec->deregisterExec();

        // Caller expects exceptions thrown in certain cases.
        if (PlanExecutor::EXEC_ERROR == state) {
            scoped_ptr<PlanStageStats> stats(exec->getStats());
            error() << "Plan executor error, stats: "
                    << Explain::statsToBSON(*stats);
            uasserted(17144, "Executor error: " + WorkingSetCommon::toStatusString(obj));
        }

        // Why save a dead executor?
        if (PlanExecutor::DEAD == state) {
            saveClientCursor = false;
        }
        else if (pq.getOptions().tailable) {
            // If we're tailing a capped collection, we don't bother saving the cursor if the
            // collection is empty. Otherwise, the semantics of a tailable cursor are that the
            // client will keep trying to read from it. So we'll keep it around.
            if (collection && collection->numRecords(txn) != 0 && pq.getNumToReturn() != 1) {
                saveClientCursor = true;
            }
        }

        // TODO(greg): This will go away soon.
        if (!shardingState.getVersion(pq.ns()).isWriteCompatibleWith(shardingVersionAtStart)) {
            // If the version changed during the query we might be missing some data, and it's
            // safe to send this as mongos can resend at this point.
            throw SendStaleConfigException(pq.ns(), "version changed during initial query",
                                           shardingVersionAtStart,
                                           shardingState.getVersion(pq.ns()));
        }

        const logger::LogComponent queryLogComponent = logger::LogComponent::kQuery;
        const logger::LogSeverity logLevelOne = logger::LogSeverity::Debug(1);

        PlanSummaryStats summaryStats;
        Explain::getSummaryStats(exec.get(), &summaryStats);

        curop.debug().ntoskip = pq.getSkip();
        curop.debug().nreturned = numResults;
        curop.debug().scanAndOrder = summaryStats.hasSortStage;
        curop.debug().nscanned = summaryStats.totalKeysExamined;
        curop.debug().nscannedObjects = summaryStats.totalDocsExamined;
        curop.debug().idhack = summaryStats.isIdhack;

        // Set debug information for consumption by the profiler.
        if (dbProfilingLevel > 0 ||
            curop.elapsedMillis() > serverGlobalParams.slowMS ||
            logger::globalLogDomain()->shouldLog(queryLogComponent, logLevelOne)) {
            // Get BSON stats.
            scoped_ptr<PlanStageStats> execStats(exec->getStats());
            BSONObjBuilder statsBob;
            Explain::statsToBSON(*execStats, &statsBob);
            curop.debug().execStats.set(statsBob.obj());

            // Replace exec stats with plan summary if stats cannot fit into CachedBSONObj.
            if (curop.debug().execStats.tooBig() && !curop.debug().planSummary.empty()) {
                BSONObjBuilder bob;
                bob.append("summary", curop.debug().planSummary.toString());
                curop.debug().execStats.set(bob.done());
            }
        }

        long long ccId = 0;
        if (saveClientCursor) {
            // We won't use the executor until it's getMore'd.
            exec->saveState();

            // Allocate a new ClientCursor.  We don't have to worry about leaking it as it's
            // inserted into a global map by its ctor.
            ClientCursor* cc = new ClientCursor(collection, exec.get(),
                                                cq->getParsed().getOptions().toInt(),
                                                cq->getParsed().getFilter());
            ccId = cc->cursorid();

            if (fromDBDirectClient) {
                cc->setUnownedRecoveryUnit(txn->recoveryUnit());
            }
            else if (state == PlanExecutor::IS_EOF && pq.getOptions().tailable) {
                // Don't stash the RU for tailable cursors at EOF; let them get a new RU on their
                // next getMore.
            }
            else {
                // We stash away the RecoveryUnit in the ClientCursor.  It's used for subsequent
                // getMore requests.  The calling OpCtx gets a fresh RecoveryUnit.
                cc->setOwnedRecoveryUnit(txn->releaseRecoveryUnit());
                StorageEngine* storageEngine = getGlobalEnvironment()->getGlobalStorageEngine();
                txn->setRecoveryUnit(storageEngine->newRecoveryUnit(txn));
            }

            QLOG() << "caching executor with cursorid " << ccId
                   << " after returning " << numResults << " results" << endl;

            // ClientCursor takes ownership of executor.  Release to make sure it's not deleted.
            exec.release();

            // Record the optime of the last oplog entry we returned so that replication can
            // track how far this reader has read.
            if (pq.getOptions().oplogReplay && !slaveReadTill.isNull()) {
                cc->slaveReadTill(slaveReadTill);
            }

            // Exhaust queries keep streaming batches without waiting for getMore; note it in
            // the op debug info (the non-empty namespace returned below tells the caller to
            // continue the exhaust).
            if (pq.getOptions().exhaust) {
                curop.debug().exhaust = true;
            }

            // Set attributes for getMore.
            cc->setCollMetadata(collMetadata);
            cc->setPos(numResults);

            // If the query had a time limit, remaining time is "rolled over" to the cursor (for
            // use by future getmore ops).
            cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros());
        }
        else {
            QLOG() << "Not caching executor but returning " << numResults << " results.\n";
        }

        // Add the results from the query into the output buffer.
        result.appendData(bb.buf(), bb.len());
        bb.decouple();

        // Fill out the output buffer's header.
        QueryResult::View qr = result.header().view2ptr();
        qr.setCursorId(ccId);
        curop.debug().cursorid = (0 == ccId ? -1 : ccId);
        qr.setResultFlagsToOk();
        qr.msgdata().setOperation(opReply);
        qr.setStartingFrom(0);
        qr.setNReturned(numResults);

        // curop.debug().exhaust is set above.
        return curop.debug().exhaust ? pq.ns() : "";
    }
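The batching loop above stops filling the first reply once enoughForFirstBatch(pq, numResults, bb.len()) says so; that helper is not shown in this snippet. The sketch below is a minimal, hypothetical version of such a cutoff check — the function name, the constants kMaxFirstBatchDocs and kMaxFirstBatchBytes, and the exact logic are illustrative assumptions, not MongoDB's actual helper or limits.

// Minimal sketch of a first-batch cutoff (hypothetical; not the real enoughForFirstBatch).
// Stop once an explicit numToReturn is satisfied, or once an unlimited query has buffered
// a default number of documents or enough bytes for the first reply.
static bool enoughForFirstBatchSketch(int numToReturn, int numResults, int bytesBuffered) {
    const int kMaxFirstBatchDocs = 101;           // illustrative default batch size
    const int kMaxFirstBatchBytes = 1024 * 1024;  // illustrative byte budget for one reply

    if (numToReturn == 0) {
        // No explicit limit: cut the first batch at the default doc count or byte budget.
        return numResults >= kMaxFirstBatchDocs || bytesBuffered >= kMaxFirstBatchBytes;
    }
    // Explicit limit: stop when we have that many results, or when the reply is large enough.
    return numResults >= numToReturn || bytesBuffered >= kMaxFirstBatchBytes;
}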
int BandwidthController::disableBandwidthControl(void) {
    runCommands(sizeof(IPT_FLUSH_COMMANDS) / sizeof(char*),
            IPT_FLUSH_COMMANDS, RunCmdFailureOk);
    return 0;
}
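disableBandwidthControl() above drives iptables through a static command table passed to runCommands(count, table, failureMode). The standalone sketch below illustrates that table-driven pattern; the command strings, the system()-based runner, and the name runCommandTable are illustrative assumptions, not the actual IPT_FLUSH_COMMANDS or the BandwidthController implementation.

#include <cstdio>
#include <cstdlib>

// Illustrative command table; the real IPT_FLUSH_COMMANDS differs.
static const char* FLUSH_COMMANDS[] = {
    "iptables -F bw_INPUT",
    "iptables -F bw_OUTPUT",
    "iptables -F bw_FORWARD",
};

enum RunCmdErrHandling { RunCmdFailureOk, RunCmdFailureBad };

// Table-driven runner: execute each command in order; a failure either aborts the run
// (RunCmdFailureBad) or is logged and tolerated (RunCmdFailureOk).
static int runCommandTable(size_t count, const char* const* commands,
                           RunCmdErrHandling onFailure) {
    for (size_t i = 0; i < count; ++i) {
        if (system(commands[i]) != 0) {
            fprintf(stderr, "command failed: %s\n", commands[i]);
            if (onFailure == RunCmdFailureBad)
                return -1;
        }
    }
    return 0;
}

// Usage mirroring disableBandwidthControl(): flush the chains, tolerating failures.
// int res = runCommandTable(sizeof(FLUSH_COMMANDS) / sizeof(char*), FLUSH_COMMANDS,
//                           RunCmdFailureOk);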