Status FTDCFileWriter::writeMetadata(const BSONObj& metadata) { BSONObj wrapped = FTDCBSONUtil::createBSONMetadataDocument(metadata); return writeArchiveFileBuffer({wrapped.objdata(), static_cast<size_t>(wrapped.objsize())}); }
void JSReducer::_reduce( const BSONList& tuples , BSONObj& key , int& endSizeEstimate ) { uassert( 10074 , "need values" , tuples.size() ); int sizeEstimate = ( tuples.size() * tuples.begin()->getField( "value" ).size() ) + 128; BSONObjBuilder reduceArgs( sizeEstimate ); boost::scoped_ptr<BSONArrayBuilder> valueBuilder; int sizeSoFar = 0; unsigned n = 0; for ( ; n<tuples.size(); n++ ) { BSONObjIterator j(tuples[n]); BSONElement keyE = j.next(); if ( n == 0 ) { reduceArgs.append( keyE ); key = keyE.wrap(); sizeSoFar = 5 + keyE.size(); valueBuilder.reset(new BSONArrayBuilder( reduceArgs.subarrayStart( "tuples" ) )); } BSONElement ee = j.next(); uassert( 13070 , "value too large to reduce" , ee.size() < ( BSONObjMaxUserSize / 2 ) ); if ( sizeSoFar + ee.size() > BSONObjMaxUserSize ) { assert( n > 1 ); // if not, inf. loop break; } valueBuilder->append( ee ); sizeSoFar += ee.size(); } assert(valueBuilder); valueBuilder->done(); BSONObj args = reduceArgs.obj(); Scope * s = _func.scope(); s->invokeSafe( _func.func() , args ); if ( s->type( "return" ) == Array ) { uasserted( 10075 , "reduce -> multiple not supported yet"); return; } endSizeEstimate = key.objsize() + ( args.objsize() / tuples.size() ); if ( n == tuples.size() ) return; // the input list was too large BSONList x; for ( ; n < tuples.size(); n++ ) { x.push_back( tuples[n] ); } BSONObjBuilder temp( endSizeEstimate ); temp.append( key.firstElement() ); s->append( temp , "1" , "return" ); x.push_back( temp.obj() ); _reduce( x , key , endSizeEstimate ); }
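A minimal standalone sketch of the chunked re-reduce pattern the tail of _reduce implements: fold as many values as fit under a per-pass budget, then feed the partial result back in with the leftover inputs and recurse. The function name, the integer-sum combine step, and the element-count budget are illustrative assumptions, not MongoDB APIs.

#include <algorithm>
#include <cstddef>
#include <numeric>
#include <vector>

// Reduce 'values' under a per-pass element budget (assumes maxPerPass >= 1).
// When not everything fits, the partial result is appended to the remaining
// inputs and re-reduced, mirroring the "input list was too large" branch above.
long long chunkedReduce(const std::vector<long long>& values, std::size_t maxPerPass) {
    std::size_t n = std::min(values.size(), maxPerPass);
    long long partial = std::accumulate(values.begin(), values.begin() + n, 0LL);
    if (n == values.size())
        return partial;  // everything fit in one pass
    std::vector<long long> rest(values.begin() + n, values.end());
    rest.push_back(partial);  // the partial result is re-reduced with the leftovers
    return chunkedReduce(rest, maxPerPass);
}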
static void insert( const BSONObj &o, bool god = false ) { Lock::DBWrite lk(ns()); Client::Context ctx( ns() ); theDataFileMgr.insert( ns(), o.objdata(), o.objsize(), god ); }
static void _logOpOld(const char *opstr, const char *ns, const char *logNS, const BSONObj& obj, BSONObj *o2, bool *bb, bool fromMigrate ) { Lock::DBWrite lk("local"); static BufBuilder bufbuilder(8*1024); // todo there is likely a mutex on this constructor if ( strncmp(ns, "local.", 6) == 0 ) { if ( strncmp(ns, "local.slaves", 12) == 0 ) { resetSlaveCache(); } return; } mutex::scoped_lock lk2(OpTime::m); const OpTime ts = OpTime::now(lk2); Client::Context context("",0,false); /* we jump through a bunch of hoops here to avoid copying the obj buffer twice -- instead we do a single copy to the destination position in the memory mapped file. */ bufbuilder.reset(); BSONObjBuilder b(bufbuilder); b.appendTimestamp("ts", ts.asDate()); b.append("op", opstr); b.append("ns", ns); if (fromMigrate) b.appendBool("fromMigrate", true); if ( bb ) b.appendBool("b", *bb); if ( o2 ) b.append("o2", *o2); BSONObj partial = b.done(); // partial is everything except the o:... part. int po_sz = partial.objsize(); int len = po_sz + obj.objsize() + 1 + 2 /*o:*/; Record *r; if( logNS == 0 ) { logNS = "local.oplog.$main"; if ( localOplogMainDetails == 0 ) { Client::Context ctx( logNS , dbpath, false); localDB = ctx.db(); verify( localDB ); localOplogMainDetails = nsdetails(logNS); verify( localOplogMainDetails ); } Client::Context ctx( logNS , localDB, false ); r = theDataFileMgr.fast_oplog_insert(localOplogMainDetails, logNS, len); } else { Client::Context ctx( logNS, dbpath, false ); verify( nsdetails( logNS ) ); // first we allocate the space, then we fill it below. r = theDataFileMgr.fast_oplog_insert( nsdetails( logNS ), logNS, len); } append_O_Obj(r->data(), partial, obj); context.getClient()->setLastOp( ts ); LOG( 6 ) << "logging op:" << BSONObj::make(r) << endl; }
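The record length computed above splices a single extra BSON element (field name "o") into the prebuilt "partial" oplog entry. A small arithmetic sketch of that sizing, with an illustrative function name:

#include <cstdint>

// One BSON type byte, the field name "o" plus its NUL terminator (2 bytes),
// and the payload object itself, on top of the partial document's size.
int32_t splicedOplogEntrySize(int32_t partialObjSize, int32_t oObjSize) {
    const int32_t kTypeByte = 1;
    const int32_t kFieldName = 2;  // 'o' + '\0'
    return partialObjSize + kTypeByte + kFieldName + oObjSize;
}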
/* Prepare to build an index. Does not actually build it (except for a special _id case). - We validate that the params are good - That the index does not already exist - Creates the source collection if it DNE example of 'io': { ns : 'test.foo', name : 'z', key : { z : 1 } } throws DBException @param sourceNS - source NS we are indexing @param sourceCollection - its details ptr @return true if ok to continue. when false we stop/fail silently (index already exists) */ bool prepareToBuildIndex(const BSONObj& io, bool god, string& sourceNS, NamespaceDetails *&sourceCollection, BSONObj& fixedIndexObject ) { sourceCollection = 0; // logical name of the index. todo: get rid of the name, we don't need it! const char *name = io.getStringField("name"); uassert(12523, "no index name specified", *name); // the collection for which we are building an index sourceNS = io.getStringField("ns"); uassert(10096, "invalid ns to index", sourceNS.find( '.' ) != string::npos); uassert(10097, "bad table to index name on add index attempt", cc().database()->name == nsToDatabase(sourceNS.c_str())); BSONObj key = io.getObjectField("key"); uassert(12524, "index key pattern too large", key.objsize() <= 2048); if( !validKeyPattern(key) ) { string s = string("bad index key pattern ") + key.toString(); uasserted(10098 , s.c_str()); } if ( sourceNS.empty() || key.isEmpty() ) { log(2) << "bad add index attempt name:" << (name?name:"") << "\n ns:" << sourceNS << "\n idxobj:" << io.toString() << endl; string s = "bad add index attempt " + sourceNS + " key:" + key.toString(); uasserted(12504, s); } sourceCollection = nsdetails(sourceNS.c_str()); if( sourceCollection == 0 ) { // try to create it string err; if ( !userCreateNS(sourceNS.c_str(), BSONObj(), err, false) ) { problem() << "ERROR: failed to create collection while adding its index. " << sourceNS << endl; return false; } sourceCollection = nsdetails(sourceNS.c_str()); tlog() << "info: creating collection " << sourceNS << " on add index" << endl; assert( sourceCollection ); } if ( sourceCollection->findIndexByName(name) >= 0 ) { // index already exists. return false; } if( sourceCollection->findIndexByKeyPattern(key) >= 0 ) { log(2) << "index already exists with diff name " << name << ' ' << key.toString() << endl; return false; } if ( sourceCollection->nIndexes >= NamespaceDetails::NIndexesMax ) { stringstream ss; ss << "add index fails, too many indexes for " << sourceNS << " key:" << key.toString(); string s = ss.str(); log() << s << '\n'; uasserted(12505,s); } /* we can't build a new index for the ns if a build is already in progress in the background - EVEN IF this is a foreground build. */ uassert(12588, "cannot add index with a background operation in progress", !BackgroundOperation::inProgForNs(sourceNS.c_str())); /* this is because we want key patterns like { _id : 1 } and { _id : <someobjid> } to all be treated as the same pattern. */ if ( IndexDetails::isIdIndexPattern(key) ) { if( !god ) { ensureHaveIdIndex( sourceNS.c_str() ); return false; } } else { /* is buildIndexes:false set for this replica set member? if so we don't build any indexes except _id */ if( theReplSet && !theReplSet->buildIndexes() ) return false; } string pluginName = IndexPlugin::findPluginName( key ); IndexPlugin * plugin = pluginName.size() ? 
IndexPlugin::get( pluginName ) : 0; if ( plugin ) { fixedIndexObject = plugin->adjustIndexSpec( io ); } else if ( io["v"].eoo() ) { // add "v" if it doesn't exist // if it does - leave whatever value was there // this is for testing and replication BSONObjBuilder b( io.objsize() + 32 ); b.appendElements( io ); b.append( "v" , 0 ); fixedIndexObject = b.obj(); } return true; }
bool run(OperationContext* txn, const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { if (!cmdObj["start"].eoo()) { errmsg = "using deprecated 'start' argument to geoNear"; return false; } const NamespaceString nss(parseNs(dbname, cmdObj)); AutoGetCollectionForRead ctx(txn, nss); Collection* collection = ctx.getCollection(); if ( !collection ) { errmsg = "can't find ns"; return false; } IndexCatalog* indexCatalog = collection->getIndexCatalog(); // cout << "raw cmd " << cmdObj.toString() << endl; // We seek to populate this. string nearFieldName; bool using2DIndex = false; if (!getFieldName(txn, collection, indexCatalog, &nearFieldName, &errmsg, &using2DIndex)) { return false; } PointWithCRS point; uassert(17304, "'near' field must be point", GeoParser::parseQueryPoint(cmdObj["near"], &point).isOK()); bool isSpherical = cmdObj["spherical"].trueValue(); if (!using2DIndex) { uassert(17301, "2dsphere index must have spherical: true", isSpherical); } // Build the $near expression for the query. BSONObjBuilder nearBob; if (isSpherical) { nearBob.append("$nearSphere", cmdObj["near"].Obj()); } else { nearBob.append("$near", cmdObj["near"].Obj()); } if (!cmdObj["maxDistance"].eoo()) { uassert(17299, "maxDistance must be a number",cmdObj["maxDistance"].isNumber()); nearBob.append("$maxDistance", cmdObj["maxDistance"].number()); } if (!cmdObj["minDistance"].eoo()) { uassert(17298, "minDistance doesn't work on 2d index", !using2DIndex); uassert(17300, "minDistance must be a number",cmdObj["minDistance"].isNumber()); nearBob.append("$minDistance", cmdObj["minDistance"].number()); } if (!cmdObj["uniqueDocs"].eoo()) { warning() << nss << ": ignoring deprecated uniqueDocs option in geoNear command"; } // And, build the full query expression. BSONObjBuilder queryBob; queryBob.append(nearFieldName, nearBob.obj()); if (!cmdObj["query"].eoo() && cmdObj["query"].isABSONObj()) { queryBob.appendElements(cmdObj["query"].Obj()); } BSONObj rewritten = queryBob.obj(); // cout << "rewritten query: " << rewritten.toString() << endl; long long numWanted = 100; const char* limitName = !cmdObj["num"].eoo() ? "num" : "limit"; BSONElement eNumWanted = cmdObj[limitName]; if (!eNumWanted.eoo()) { uassert(17303, "limit must be number", eNumWanted.isNumber()); numWanted = eNumWanted.safeNumberLong(); uassert(17302, "limit must be >=0", numWanted >= 0); } bool includeLocs = false; if (!cmdObj["includeLocs"].eoo()) { includeLocs = cmdObj["includeLocs"].trueValue(); } double distanceMultiplier = 1.0; BSONElement eDistanceMultiplier = cmdObj["distanceMultiplier"]; if (!eDistanceMultiplier.eoo()) { uassert(17296, "distanceMultiplier must be a number", eDistanceMultiplier.isNumber()); distanceMultiplier = eDistanceMultiplier.number(); uassert(17297, "distanceMultiplier must be non-negative", distanceMultiplier >= 0); } BSONObj projObj = BSON("$pt" << BSON("$meta" << LiteParsedQuery::metaGeoNearPoint) << "$dis" << BSON("$meta" << LiteParsedQuery::metaGeoNearDistance)); CanonicalQuery* cq; const WhereCallbackReal whereCallback(txn, nss.db()); if (!CanonicalQuery::canonicalize(nss, rewritten, BSONObj(), projObj, 0, numWanted, BSONObj(), &cq, whereCallback).isOK()) { errmsg = "Can't parse filter / create query"; return false; } // Prevent chunks from being cleaned up during yields - this allows us to only check the // version on initial entry into geoNear. 
RangePreserver preserver(collection); PlanExecutor* rawExec; if (!getExecutor(txn, collection, cq, PlanExecutor::YIELD_AUTO, &rawExec, 0).isOK()) { errmsg = "can't get query executor"; return false; } scoped_ptr<PlanExecutor> exec(rawExec); double totalDistance = 0; BSONObjBuilder resultBuilder(result.subarrayStart("results")); double farthestDist = 0; BSONObj currObj; long long results = 0; while ((results < numWanted) && PlanExecutor::ADVANCED == exec->getNext(&currObj, NULL)) { // Come up with the correct distance. double dist = currObj["$dis"].number() * distanceMultiplier; totalDistance += dist; if (dist > farthestDist) { farthestDist = dist; } // Strip out '$dis' and '$pt' from the result obj. The rest gets added as 'obj' // in the command result. BSONObjIterator resIt(currObj); BSONObjBuilder resBob; while (resIt.more()) { BSONElement elt = resIt.next(); if (!mongoutils::str::equals("$pt", elt.fieldName()) && !mongoutils::str::equals("$dis", elt.fieldName())) { resBob.append(elt); } } BSONObj resObj = resBob.obj(); // Don't make a too-big result object. if (resultBuilder.len() + resObj.objsize()> BSONObjMaxUserSize) { warning() << "Too many geoNear results for query " << rewritten.toString() << ", truncating output."; break; } // Add the next result to the result builder. BSONObjBuilder oneResultBuilder( resultBuilder.subobjStart(BSONObjBuilder::numStr(results))); oneResultBuilder.append("dis", dist); if (includeLocs) { oneResultBuilder.appendAs(currObj["$pt"], "loc"); } oneResultBuilder.append("obj", resObj); oneResultBuilder.done(); ++results; } resultBuilder.done(); // Fill out the stats subobj. BSONObjBuilder stats(result.subobjStart("stats")); // Fill in nscanned from the explain. PlanSummaryStats summary; Explain::getSummaryStats(exec.get(), &summary); stats.appendNumber("nscanned", summary.totalKeysExamined); stats.appendNumber("objectsLoaded", summary.totalDocsExamined); stats.append("avgDistance", totalDistance / results); stats.append("maxDistance", farthestDist); stats.append("time", txn->getCurOp()->elapsedMillis()); stats.done(); return true; }
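A simplified, self-contained sketch of the distance bookkeeping in the geoNear result loop above: each raw distance is scaled by distanceMultiplier, the farthest value is tracked, and the average is taken over the returned results. The struct and function names are illustrative, not the command's actual types.

#include <vector>

struct GeoNearStats {
    double avgDistance = 0.0;
    double maxDistance = 0.0;
};

// Scale each raw distance, track the maximum, and average over the results,
// as the "stats" subobject of the command reply does.
GeoNearStats summarizeDistances(const std::vector<double>& rawDistances, double distanceMultiplier) {
    GeoNearStats stats;
    double total = 0.0;
    for (double raw : rawDistances) {
        double dist = raw * distanceMultiplier;
        total += dist;
        if (dist > stats.maxDistance)
            stats.maxDistance = dist;
    }
    if (!rawDistances.empty())
        stats.avgDistance = total / rawDistances.size();
    return stats;
}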
bool ShardedClientCursor::sendNextBatch( Request& r , int ntoreturn , BufBuilder& buffer, int& docCount ) { uassert( 10191 , "cursor already done" , ! _done ); int maxSize = 1024 * 1024; if ( _totalSent > 0 ) maxSize *= 3; docCount = 0; // If ntoreturn is negative, it means that we should send up to -ntoreturn results // back to the client, and that we should only send a *single batch*. An ntoreturn of // 1 is also a special case which means "return up to 1 result in a single batch" (so // that +1 actually has the same meaning of -1). For all other values of ntoreturn, we // may have to return multiple batches. const bool sendMoreBatches = ntoreturn == 0 || ntoreturn > 1; ntoreturn = abs( ntoreturn ); bool cursorHasMore = true; while ( ( cursorHasMore = _cursor->more() ) ) { BSONObj o = _cursor->next(); buffer.appendBuf( (void*)o.objdata() , o.objsize() ); docCount++; // Ensure that the next batch will never wind up requesting more docs from the shard // than are remaining to satisfy the initial ntoreturn. if (ntoreturn != 0) { _cursor->setBatchSize(ntoreturn - docCount); } if ( buffer.len() > maxSize ) { break; } if ( docCount == ntoreturn ) { // soft limit aka batch size break; } if ( ntoreturn == 0 && _totalSent == 0 && docCount >= 100 ) { // first batch should be max 100 unless batch size specified break; } } // We need to request another batch if the following two conditions hold: // // 1. ntoreturn is positive and not equal to 1 (see the comment above). This condition // is stored in 'sendMoreBatches'. // // 2. The last call to _cursor->more() was true (i.e. we never explicitly got a false // value from _cursor->more()). This condition is stored in 'cursorHasMore'. If the server // hits EOF while executing a query or a getmore, it will pass a cursorId of 0 in the // query response to indicate that there are no more results. In this case, _cursor->more() // will be explicitly false, and we know for sure that we do not have to send more batches. // // On the other hand, if _cursor->more() is true there may or may not be more results. // Suppose that the mongod generates enough results to fill this batch. In this case it // does not know whether not there are more, because doing so would require requesting an // extra result and seeing whether we get EOF. The mongod sends a valid cursorId to // indicate that there may be more. We do the same here: we indicate that there may be // more results to retrieve by setting 'hasMoreBatches' to true. bool hasMoreBatches = sendMoreBatches && cursorHasMore; LOG(5) << "\t hasMoreBatches: " << hasMoreBatches << " sendMoreBatches: " << sendMoreBatches << " cursorHasMore: " << cursorHasMore << " ntoreturn: " << ntoreturn << " num: " << docCount << " id:" << getId() << " totalSent: " << _totalSent << endl; _totalSent += docCount; _done = ! hasMoreBatches; return hasMoreBatches; }
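The comment block above describes how ntoreturn is interpreted; a minimal sketch of that decision, with illustrative names:

#include <cstdlib>
#include <utility>

// Negative ntoreturn: a single batch of at most |ntoreturn| documents.
// ntoreturn == 1: a single document (same effect as -1).
// ntoreturn == 0 or > 1: a soft batch limit, with more batches possibly to follow.
std::pair<int, bool> interpretNToReturn(int ntoreturn) {
    const bool sendMoreBatches = ntoreturn == 0 || ntoreturn > 1;
    const int batchLimit = std::abs(ntoreturn);  // 0 means "no explicit limit"
    return {batchLimit, sendMoreBatches};
}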
/** * Also called by db/ops/query.cpp. This is the new getMore entry point. */ QueryResult* newGetMore(const char* ns, int ntoreturn, long long cursorid, CurOp& curop, int pass, bool& exhaust, bool* isCursorAuthorized) { exhaust = false; int bufSize = 512 + sizeof(QueryResult) + MaxBytesToReturnToClientAtOnce; BufBuilder bb(bufSize); bb.skip(sizeof(QueryResult)); // This is a read lock. TODO: There is a cursor flag for not needing this. Do we care? Client::ReadContext ctx(ns); // TODO: Document. replVerifyReadsOk(); ClientCursorPin ccPin(cursorid); ClientCursor* cc = ccPin.c(); // These are set in the QueryResult msg we return. int resultFlags = ResultFlag_AwaitCapable; int numResults = 0; int startingResult = 0; if (NULL == cc) { cursorid = 0; resultFlags = ResultFlag_CursorNotFound; } else { // Quote: check for spoofing of the ns such that it does not match the one originally // there for the cursor uassert(17011, "auth error", str::equals(ns, cc->ns().c_str())); *isCursorAuthorized = true; // TODO: fail point? // If the operation that spawned this cursor had a time limit set, apply leftover // time to this getmore. curop.setMaxTimeMicros(cc->getLeftoverMaxTimeMicros()); // TODO: // curop.debug().query = BSONForQuery // curop.setQuery(curop.debug().query); // TODO: What is pass? if (0 == pass) { cc->updateSlaveLocation(curop); } CollectionMetadataPtr collMetadata = cc->getCollMetadata(); // If we're replaying the oplog, we save the last time that we read. OpTime slaveReadTill; startingResult = cc->pos(); Runner* runner = cc->getRunner(); const ParsedQuery& pq = runner->getQuery().getParsed(); // Get results out of the runner. // TODO: There may be special handling required for tailable cursors? runner->restoreState(); BSONObj obj; // TODO: Differentiate EOF from error. while (runner->getNext(&obj)) { // If we're sharded make sure that we don't return any data that hasn't been // migrated off of our shard yet. if (collMetadata) { KeyPattern kp(collMetadata->getKeyPattern()); if (!collMetadata->keyBelongsToMe(kp.extractSingleKey(obj))) { continue; } } // Add result to output buffer. bb.appendBuf((void*)obj.objdata(), obj.objsize()); // Count the result. ++numResults; // Possibly note slave's position in the oplog. if (pq.hasOption(QueryOption_OplogReplay)) { BSONElement e = obj["ts"]; if (Date == e.type() || Timestamp == e.type()) { slaveReadTill = e._opTime(); } } if ((numResults && numResults >= ntoreturn) || bb.len() > MaxBytesToReturnToClientAtOnce) { break; } } cc->incPos(numResults); runner->saveState(); // Possibly note slave's position in the oplog. if (pq.hasOption(QueryOption_OplogReplay) && !slaveReadTill.isNull()) { cc->slaveReadTill(slaveReadTill); } exhaust = pq.hasOption(QueryOption_Exhaust); // If the getmore had a time limit, remaining time is "rolled over" back to the // cursor (for use by future getmore ops). cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() ); } QueryResult* qr = reinterpret_cast<QueryResult*>(bb.buf()); qr->len = bb.len(); qr->setOperation(opReply); qr->_resultFlags() = resultFlags; qr->cursorId = cursorid; qr->startingFrom = startingResult; qr->nReturned = numResults; bb.decouple(); return qr; }
/** * This is called by db/ops/query.cpp. This is the entry point for answering a query. */ string newRunQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result) { // This is a read lock. Client::ReadContext ctx(q.ns, dbpath); // Parse, canonicalize, plan, transcribe, and get a runner. Runner* rawRunner; Status status = getRunner(q, &rawRunner); if (!status.isOK()) { uasserted(17007, "Couldn't process query " + q.query.toString() + " why: " + status.reason()); } verify(NULL != rawRunner); auto_ptr<Runner> runner(rawRunner); // We freak out later if this changes before we're done with the query. const ChunkVersion shardingVersionAtStart = shardingState.getVersion(q.ns); // We use this a lot below. const ParsedQuery& pq = runner->getQuery().getParsed(); // TODO: Document why we do this. replVerifyReadsOk(&pq); // If this exists, the collection is sharded. // If it doesn't exist, we can assume we're not sharded. // If we're sharded, we might encounter data that is not consistent with our sharding state. // We must ignore this data. CollectionMetadataPtr collMetadata; if (!shardingState.needCollectionMetadata(pq.ns())) { collMetadata = CollectionMetadataPtr(); } else { collMetadata = shardingState.getCollectionMetadata(pq.ns()); } // Run the query. BufBuilder bb(32768); bb.skip(sizeof(QueryResult)); // How many results have we obtained from the runner? int numResults = 0; // If we're replaying the oplog, we save the last time that we read. OpTime slaveReadTill; // Do we save the Runner in a ClientCursor for getMore calls later? bool saveClientCursor = false; BSONObj obj; // TODO: Differentiate EOF from error. while (runner->getNext(&obj)) { // If we're sharded make sure that we don't return any data that hasn't been migrated // off of our shard yet. if (collMetadata) { // This information can change if we yield and as such we must make sure to re-fetch // it if we yield. KeyPattern kp(collMetadata->getKeyPattern()); // This performs excessive BSONObj creation but that's OK for now. if (!collMetadata->keyBelongsToMe(kp.extractSingleKey(obj))) { continue; } } // Add result to output buffer. bb.appendBuf((void*)obj.objdata(), obj.objsize()); // Count the result. ++numResults; // Possibly note slave's position in the oplog. if (pq.hasOption(QueryOption_OplogReplay)) { BSONElement e = obj["ts"]; if (Date == e.type() || Timestamp == e.type()) { slaveReadTill = e._opTime(); } } // TODO: only one type of 2d search doesn't support this. We need a way to pull it out // of CanonicalQuery. :( const bool supportsGetMore = true; const bool isExplain = pq.isExplain(); if (isExplain && pq.enoughForExplain(numResults)) { break; } else if (!supportsGetMore && (pq.enough(numResults) || bb.len() >= MaxBytesToReturnToClientAtOnce)) { break; } else if (pq.enoughForFirstBatch(numResults, bb.len())) { // If only one result requested assume it's a findOne() and don't save the cursor. if (pq.wantMore() && 1 != pq.getNumToReturn()) { saveClientCursor = true; } break; } } // TODO: Stage creation can set tailable depending on what's in the parsed query. We have // the full parsed query available during planning...set it there. // // TODO: If we're tailable we want to save the client cursor. Make sure we do this later. //if (pq.hasOption(QueryOption_CursorTailable) && pq.getNumToReturn() != 1) { ... } // TODO(greg): This will go away soon. 
if (!shardingState.getVersion(pq.ns()).isWriteCompatibleWith(shardingVersionAtStart)) { // if the version changed during the query we might be missing some data and it's safe to // send this as mongos can resend at this point throw SendStaleConfigException(pq.ns(), "version changed during initial query", shardingVersionAtStart, shardingState.getVersion(pq.ns())); } long long ccId = 0; if (saveClientCursor) { // Allocate a new ClientCursor. ClientCursorHolder ccHolder; ccHolder.reset(new ClientCursor(runner.get())); ccId = ccHolder->cursorid(); // We won't use the runner until it's getMore'd. runner->saveState(); // ClientCursor takes ownership of runner. Release to make sure it's not deleted. runner.release(); if (pq.hasOption(QueryOption_OplogReplay) && !slaveReadTill.isNull()) { ccHolder->slaveReadTill(slaveReadTill); } if (pq.hasOption(QueryOption_Exhaust)) { curop.debug().exhaust = true; } // Set attributes for getMore. ccHolder->setCollMetadata(collMetadata); ccHolder->setPos(numResults); // If the query had a time limit, remaining time is "rolled over" to the cursor (for // use by future getmore ops). ccHolder->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros()); // Give up our reference to the CC. ccHolder.release(); } // Add the results from the query into the output buffer. result.appendData(bb.buf(), bb.len()); bb.decouple(); // Fill out the output buffer's header. QueryResult* qr = static_cast<QueryResult*>(result.header()); qr->cursorId = ccId; curop.debug().cursorid = (0 == ccId ? -1 : ccId); qr->setResultFlagsToOk(); qr->setOperation(opReply); qr->startingFrom = 0; qr->nReturned = numResults; // TODO: nscanned is bogus. // curop.debug().nscanned = ( cursor ? cursor->nscanned() : 0LL ); curop.debug().ntoskip = pq.getSkip(); curop.debug().nreturned = numResults; // curop.debug().exhaust is set above. return curop.debug().exhaust ? pq.ns() : ""; }
int run() { if ( ! hasParam( "from" ) ) { log() << "need to specify --from" << endl; return -1; } Client::initThread( "oplogreplay" ); log() << "going to connect" << endl; OplogReader r(false); r.setTailingQueryOptions( QueryOption_SlaveOk | QueryOption_AwaitData ); r.connect( getParam( "from" ) ); log() << "connected" << endl; OpTime start( time(0) - getParam( "seconds" , 86400 ) , 0 ); log() << "starting from " << start.toStringPretty() << endl; string ns = getParam( "oplogns" ); r.tailingQueryGTE( ns.c_str() , start ); bool legacyApplyOps = (versionCmp(mongodVersion(), "2.2.0") < 0); int num = 0; while ( r.more() ) { BSONObj o = r.next(); LOG(2) << o << endl; if ( o["$err"].type() ) { log() << "error getting oplog" << endl; log() << o << endl; return -1; } bool print = ++num % 100000 == 0; if ( print ) cout << num << "\t" << o << endl; if ( o["op"].String() == "n" ) continue; string dbname = legacyApplyOps? nsToDatabase(o["ns"].String()) : "admin"; BSONObjBuilder b( o.objsize() + 32 ); BSONArrayBuilder updates( b.subarrayStart( "applyOps" ) ); updates.append( o ); updates.done(); BSONObj c = b.obj(); BSONObj res; bool ok = conn().runCommand( dbname , c , res ); if ( print || ! ok ) log() << res << endl; } return 0; }
void _insert( Request& r , DbMessage& d, ChunkManagerPtr manager ) { const int flags = d.reservedField(); bool keepGoing = flags & InsertOption_KeepGoing; // modified before assertion if should abort while ( d.moreJSObjs() ) { try { BSONObj o = d.nextJsObj(); if ( ! manager->hasShardKey( o ) ) { bool bad = true; if ( manager->getShardKey().partOfShardKey( "_id" ) ) { BSONObjBuilder b; b.appendOID( "_id" , 0 , true ); b.appendElements( o ); o = b.obj(); bad = ! manager->hasShardKey( o ); } if ( bad ) { log() << "tried to insert object without shard key: " << r.getns() << " " << o << endl; uasserted( 8011 , "tried to insert object without shard key" ); } } // Many operations benefit from having the shard key early in the object o = manager->getShardKey().moveToFront(o); const int maxTries = 30; bool gotThrough = false; for ( int i=0; i<maxTries; i++ ) { try { ChunkPtr c = manager->findChunk( o ); log(4) << " server:" << c->getShard().toString() << " " << o << endl; insert( c->getShard() , r.getns() , o , flags); r.gotInsert(); if ( r.getClientInfo()->autoSplitOk() ) c->splitIfShould( o.objsize() ); gotThrough = true; break; } catch ( StaleConfigException& e ) { int logLevel = i < ( maxTries / 2 ); LOG( logLevel ) << "retrying insert because of StaleConfigException: " << e << " object: " << o << endl; r.reset(); unsigned long long old = manager->getSequenceNumber(); manager = r.getChunkManager(); LOG( logLevel ) << " sequence number - old: " << old << " new: " << manager->getSequenceNumber() << endl; if (!manager) { keepGoing = false; uasserted(14804, "collection no longer sharded"); } } sleepmillis( i * 20 ); } assert( inShutdown() || gotThrough ); // not caught below } catch (const UserException&){ if (!keepGoing || !d.moreJSObjs()){ throw; } // otherwise ignore and keep going } } }
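A generic sketch of the bounded retry pattern the insert path uses for StaleConfigException: retry up to maxTries times, refreshing routing state and sleeping i*20 ms after each failed attempt. The exception type and the callbacks are stand-ins, not the sharding API.

#include <chrono>
#include <functional>
#include <stdexcept>
#include <thread>

bool retryOnStaleConfig(const std::function<void()>& attempt,
                        const std::function<void()>& refreshRouting,
                        int maxTries = 30) {
    for (int i = 0; i < maxTries; ++i) {
        try {
            attempt();
            return true;  // got through
        } catch (const std::runtime_error&) {  // stand-in for StaleConfigException
            refreshRouting();                  // e.g. reload the chunk manager
        }
        std::this_thread::sleep_for(std::chrono::milliseconds(i * 20));  // linear backoff
    }
    return false;  // retries exhausted
}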
void Collection::_compactExtent(const DiskLoc diskloc, int extentNumber, vector<IndexAccessMethod*>& indexesToInsertTo, const CompactOptions* compactOptions, CompactStats* stats ) { log() << "compact begin extent #" << extentNumber << " for namespace " << _ns << " " << diskloc; unsigned oldObjSize = 0; // we'll report what the old padding was unsigned oldObjSizeWithPadding = 0; Extent *e = diskloc.ext(); e->assertOk(); verify( e->validates(diskloc) ); { // the next/prev pointers within the extent might not be in order so we first // page the whole thing in sequentially log() << "compact paging in len=" << e->length/1000000.0 << "MB" << endl; Timer t; size_t length = e->length; touch_pages( reinterpret_cast<const char*>(e), length ); int ms = t.millis(); if( ms > 1000 ) log() << "compact end paging in " << ms << "ms " << e->length/1000000.0/t.seconds() << "MB/sec" << endl; } { log() << "compact copying records" << endl; long long datasize = 0; long long nrecords = 0; DiskLoc L = e->firstRecord; if( !L.isNull() ) { while( 1 ) { Record *recOld = L.rec(); L = getExtentManager()->getNextRecordInExtent(L); BSONObj objOld = BSONObj::make(recOld); if ( compactOptions->validateDocuments && !objOld.valid() ) { // object is corrupt! log() << "compact skipping corrupt document!"; stats->corruptDocuments++; } else { unsigned docSize = objOld.objsize(); nrecords++; oldObjSize += docSize; oldObjSizeWithPadding += recOld->netLength(); unsigned lenWHdr = docSize + Record::HeaderSize; unsigned lenWPadding = lenWHdr; switch( compactOptions->paddingMode ) { case CompactOptions::NONE: if ( details()->isUserFlagSet(NamespaceDetails::Flag_UsePowerOf2Sizes) ) lenWPadding = details()->quantizePowerOf2AllocationSpace(lenWPadding); break; case CompactOptions::PRESERVE: // if we are preserving the padding, the record should not change size lenWPadding = recOld->lengthWithHeaders(); break; case CompactOptions::MANUAL: lenWPadding = compactOptions->computeRecordSize(lenWPadding); if (lenWPadding < lenWHdr || lenWPadding > BSONObjMaxUserSize / 2 ) { lenWPadding = lenWHdr; } break; } CompactDocWriter writer( objOld, lenWPadding ); StatusWith<DiskLoc> status = _recordStore->insertRecord( &writer, 0 ); uassertStatusOK( status.getStatus() ); datasize += _recordStore->recordFor( status.getValue() )->netLength(); InsertDeleteOptions options; options.logIfError = false; options.dupsAllowed = true; // in compact we should be doing no checking for ( size_t i = 0; i < indexesToInsertTo.size(); i++ ) { Status idxStatus = indexesToInsertTo[i]->insert( objOld, status.getValue(), options, NULL ); uassertStatusOK( idxStatus ); } } if( L.isNull() ) { // we just did the very last record from the old extent. 
it's still pointed to // by the old extent ext, but that will be fixed below after this loop break; } // remove the old records (orphan them) periodically so our commit block doesn't get too large bool stopping = false; RARELY stopping = *killCurrentOp.checkForInterruptNoAssert() != 0; if( stopping || getDur().aCommitIsNeeded() ) { e->firstRecord.writing() = L; Record *r = L.rec(); getDur().writingInt(r->prevOfs()) = DiskLoc::NullOfs; getDur().commitIfNeeded(); killCurrentOp.checkForInterrupt(false); } } } // if !L.isNull() verify( details()->firstExtent() == diskloc ); verify( details()->lastExtent() != diskloc ); DiskLoc newFirst = e->xnext; details()->firstExtent().writing() = newFirst; newFirst.ext()->xprev.writing().Null(); getDur().writing(e)->markEmpty(); getExtentManager()->freeExtents( diskloc, diskloc ); getDur().commitIfNeeded(); { double op = 1.0; if( oldObjSize ) op = static_cast<double>(oldObjSizeWithPadding)/oldObjSize; log() << "compact finished extent #" << extentNumber << " containing " << nrecords << " documents (" << datasize/1000000.0 << "MB)" << " oldPadding: " << op << ' ' << static_cast<unsigned>(op*100.0)/100; } } }
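A sketch of the padded-record-size selection inside _compactExtent. The power-of-two rounding stands in for quantizePowerOf2AllocationSpace, which the real code applies only when the collection's power-of-two user flag is set; all names here are illustrative.

#include <cstdint>

enum class PaddingMode { None, Preserve, Manual };

// Illustrative stand-in for power-of-two allocation quantization.
uint32_t roundUpToPowerOfTwo(uint32_t n) {
    uint32_t p = 1;
    while (p < n)
        p <<= 1;
    return p;
}

// Choose the on-disk size for a compacted record.
uint32_t paddedRecordSize(PaddingMode mode,
                          uint32_t lenWithHeader,      // document size + record header
                          uint32_t oldLenWithHeaders,  // record's previous on-disk size
                          uint32_t manualLen,          // caller-computed size (Manual mode)
                          uint32_t maxLen) {           // upper bound on a sane record size
    switch (mode) {
        case PaddingMode::None:
            return roundUpToPowerOfTwo(lenWithHeader);
        case PaddingMode::Preserve:
            return oldLenWithHeaders;  // keep the record exactly as large as before
        case PaddingMode::Manual:
            // Fall back to the unpadded size if the requested size is out of range.
            return (manualLen < lenWithHeader || manualLen > maxLen) ? lenWithHeader : manualLen;
    }
    return lenWithHeader;
}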
/** * Run a query -- includes checking for and running a Command. * @return points to ns if exhaust mode. 0=normal mode * @locks the db mutex for reading (and potentially for writing temporarily to create a new db). * @yields the db mutex periodically after acquiring it. * @asserts on scan and order memory exhaustion and other cases. */ const char *runQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result) { shared_ptr<ParsedQuery> pq_shared( new ParsedQuery(q) ); ParsedQuery& pq( *pq_shared ); BSONObj jsobj = q.query; int queryOptions = q.queryOptions; const char *ns = q.ns; if( logLevel >= 2 ) log() << "runQuery called " << ns << " " << jsobj << endl; curop.debug().ns = ns; curop.debug().ntoreturn = pq.getNumToReturn(); curop.debug().query = jsobj; curop.setQuery(jsobj); // Run a command. if ( pq.couldBeCommand() ) { BufBuilder bb; bb.skip(sizeof(QueryResult)); BSONObjBuilder cmdResBuf; if ( runCommands(ns, jsobj, curop, bb, cmdResBuf, false, queryOptions) ) { curop.debug().iscommand = true; curop.debug().query = jsobj; curop.markCommand(); auto_ptr< QueryResult > qr; qr.reset( (QueryResult *) bb.buf() ); bb.decouple(); qr->setResultFlagsToOk(); qr->len = bb.len(); curop.debug().responseLength = bb.len(); qr->setOperation(opReply); qr->cursorId = 0; qr->startingFrom = 0; qr->nReturned = 1; result.setData( qr.release(), true ); } else { uasserted(13530, "bad or malformed command request?"); } return 0; } bool explain = pq.isExplain(); BSONObj order = pq.getOrder(); BSONObj query = pq.getFilter(); /* The ElemIter will not be happy if this isn't really an object. So throw exception here when that is true. (Which may indicate bad data from client.) */ if ( query.objsize() == 0 ) { out() << "Bad query object?\n jsobj:"; out() << jsobj.toString() << "\n query:"; out() << query.toString() << endl; uassert( 10110 , "bad query object", false); } Client::ReadContext ctx( ns , dbpath ); // read locks const ConfigVersion shardingVersionAtStart = shardingState.getVersion( ns ); replVerifyReadsOk(&pq); if ( pq.hasOption( QueryOption_CursorTailable ) ) { NamespaceDetails *d = nsdetails( ns ); uassert( 13051, "tailable cursor requested on non capped collection", d && d->isCapped() ); const BSONObj nat1 = BSON( "$natural" << 1 ); if ( order.isEmpty() ) { order = nat1; } else { uassert( 13052, "only {$natural:1} order allowed for tailable cursor", order == nat1 ); } } // Run a simple id query. if ( ! (explain || pq.showDiskLoc()) && isSimpleIdQuery( query ) && !pq.hasOption( QueryOption_CursorTailable ) ) { int n = 0; bool nsFound = false; bool indexFound = false; BSONObj resObject; Client& c = cc(); bool found = Helpers::findById( c, ns , query , resObject , &nsFound , &indexFound ); if ( nsFound == false || indexFound == true ) { if ( shardingState.needShardChunkManager( ns ) ) { ShardChunkManagerPtr m = shardingState.getShardChunkManager( ns ); if ( m && ! 
m->belongsToMe( resObject ) ) { // I have something this _id // but it doesn't belong to me // so return nothing resObject = BSONObj(); found = false; } } BufBuilder bb(sizeof(QueryResult)+resObject.objsize()+32); bb.skip(sizeof(QueryResult)); curop.debug().idhack = true; if ( found ) { n = 1; fillQueryResultFromObj( bb , pq.getFields() , resObject ); } auto_ptr< QueryResult > qr; qr.reset( (QueryResult *) bb.buf() ); bb.decouple(); qr->setResultFlagsToOk(); qr->len = bb.len(); curop.debug().responseLength = bb.len(); qr->setOperation(opReply); qr->cursorId = 0; qr->startingFrom = 0; qr->nReturned = n; result.setData( qr.release(), true ); return NULL; } } // Run a regular query. BSONObj oldPlan; if ( explain && ! pq.hasIndexSpecifier() ) { MultiPlanScanner mps( ns, query, order ); if ( mps.usingCachedPlan() ) { oldPlan = mps.oldExplain().firstElement().embeddedObject() .firstElement().embeddedObject().getOwned(); } } // In some cases the query may be retried if there is an in memory sort size assertion. for( int retry = 0; retry < 2; ++retry ) { try { return queryWithQueryOptimizer( m, queryOptions, ns, jsobj, curop, query, order, pq_shared, oldPlan, shardingVersionAtStart, result ); } catch ( const QueryRetryException & ) { verify( retry == 0 ); } } verify( false ); return 0; }
Status cloneCollectionAsCapped( OperationContext* txn, Database* db, const string& shortFrom, const string& shortTo, double size, bool temp, bool logForReplication ) { string fromNs = db->name() + "." + shortFrom; string toNs = db->name() + "." + shortTo; Collection* fromCollection = db->getCollection( txn, fromNs ); if ( !fromCollection ) return Status( ErrorCodes::NamespaceNotFound, str::stream() << "source collection " << fromNs << " does not exist" ); if ( db->getCollection( toNs ) ) return Status( ErrorCodes::NamespaceExists, "to collection already exists" ); // create new collection { Client::Context ctx( toNs ); BSONObjBuilder spec; spec.appendBool( "capped", true ); spec.append( "size", size ); if ( temp ) spec.appendBool( "temp", true ); Status status = userCreateNS( txn, ctx.db(), toNs, spec.done(), logForReplication ); if ( !status.isOK() ) return status; } Collection* toCollection = db->getCollection( txn, toNs ); invariant( toCollection ); // we created above // how much data to ignore because it won't fit anyway // datasize and extentSize can't be compared exactly, so add some padding to 'size' long long excessSize = static_cast<long long>( fromCollection->dataSize() - ( toCollection->getRecordStore()->storageSize() * 2 ) ); scoped_ptr<Runner> runner( InternalPlanner::collectionScan(fromNs, fromCollection, InternalPlanner::FORWARD ) ); while ( true ) { BSONObj obj; Runner::RunnerState state = runner->getNext(&obj, NULL); switch( state ) { case Runner::RUNNER_EOF: return Status::OK(); case Runner::RUNNER_DEAD: db->dropCollection( txn, toNs ); return Status( ErrorCodes::InternalError, "runner turned dead while iterating" ); case Runner::RUNNER_ERROR: return Status( ErrorCodes::InternalError, "runner error while iterating" ); case Runner::RUNNER_ADVANCED: if ( excessSize > 0 ) { excessSize -= ( 4 * obj.objsize() ); // 4x is for padding, power of 2, etc... continue; } toCollection->insertDocument( txn, obj, true ); if ( logForReplication ) replset::logOp(txn, "i", toNs.c_str(), obj); txn->recoveryUnit()->commitIfNeeded(); } } invariant( false ); // unreachable }
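A standalone sketch of the excessSize heuristic above: documents at the front of the source scan are skipped until roughly the amount of data that cannot fit in the new capped collection has been consumed, charging four times each object's size to allow for padding and power-of-two allocation. Names and the size-only representation are illustrative.

#include <cstdint>
#include <vector>

// Given source document sizes in scan order, return the sizes of the
// documents that would actually be copied into the capped collection.
std::vector<int64_t> selectDocsForCappedClone(const std::vector<int64_t>& docSizes,
                                              int64_t sourceDataSize,
                                              int64_t targetStorageSize) {
    // datasize and storage size can't be compared exactly, so pad the target.
    int64_t excessSize = sourceDataSize - targetStorageSize * 2;
    std::vector<int64_t> kept;
    for (int64_t size : docSizes) {
        if (excessSize > 0) {
            excessSize -= 4 * size;  // 4x is for padding, power of 2, etc.
            continue;                // skip this document entirely
        }
        kept.push_back(size);
    }
    return kept;
}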
std::string newRunQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result) { // Validate the namespace. const char *ns = q.ns; uassert(16332, "can't have an empty ns", ns[0]); const NamespaceString nsString(ns); uassert(16256, str::stream() << "Invalid ns [" << ns << "]", nsString.isValid()); // Set curop information. curop.debug().ns = ns; curop.debug().ntoreturn = q.ntoreturn; curop.debug().query = q.query; curop.setQuery(q.query); // If the query is really a command, run it. if (nsString.isCommand()) { int nToReturn = q.ntoreturn; uassert(16979, str::stream() << "bad numberToReturn (" << nToReturn << ") for $cmd type ns - can only be 1 or -1", nToReturn == 1 || nToReturn == -1); curop.markCommand(); BufBuilder bb; bb.skip(sizeof(QueryResult)); BSONObjBuilder cmdResBuf; if (!runCommands(ns, q.query, curop, bb, cmdResBuf, false, q.queryOptions)) { uasserted(13530, "bad or malformed command request?"); } curop.debug().iscommand = true; // TODO: Does this get overwritten/do we really need to set this twice? curop.debug().query = q.query; QueryResult* qr = reinterpret_cast<QueryResult*>(bb.buf()); bb.decouple(); qr->setResultFlagsToOk(); qr->len = bb.len(); curop.debug().responseLength = bb.len(); qr->setOperation(opReply); qr->cursorId = 0; qr->startingFrom = 0; qr->nReturned = 1; result.setData(qr, true); return ""; } // This is a read lock. We require this because if we're parsing a $where, the // where-specific parsing code assumes we have a lock and creates execution machinery that // requires it. Client::ReadContext ctx(q.ns); // Parse the qm into a CanonicalQuery. CanonicalQuery* cq; Status canonStatus = CanonicalQuery::canonicalize(q, &cq); if (!canonStatus.isOK()) { uasserted(17287, str::stream() << "Can't canonicalize query: " << canonStatus.toString()); } verify(cq); QLOG() << "Running query on new system: " << cq->toString(); // Parse, canonicalize, plan, transcribe, and get a runner. Runner* rawRunner = NULL; // We use this a lot below. const LiteParsedQuery& pq = cq->getParsed(); // We'll now try to get the query runner that will execute this query for us. There // are a few cases in which we know upfront which runner we should get and, therefore, // we shortcut the selection process here. // // (a) If the query is over a collection that doesn't exist, we get a special runner // that doesn't return any results, the EOFRunner. // // (b) If the query is a replication's initial sync one, we get a SingleSolutionRunner // that uses a specifically designed stage that skips extents faster (see details in // exec/oplogstart.h) // // Otherwise we go through the selection of which runner is most suited to the // query + run-time context at hand. Status status = Status::OK(); if (ctx.ctx().db()->getCollection(cq->ns()) == NULL) { rawRunner = new EOFRunner(cq, cq->ns()); } else if (pq.hasOption(QueryOption_OplogReplay)) { status = getOplogStartHack(cq, &rawRunner); } else { // Takes ownership of cq. size_t options = QueryPlannerParams::DEFAULT; if (shardingState.needCollectionMetadata(pq.ns())) { options |= QueryPlannerParams::INCLUDE_SHARD_FILTER; } status = getRunner(cq, &rawRunner, options); } if (!status.isOK()) { // NOTE: Do not access cq as getRunner has deleted it. uasserted(17007, "Unable to execute query: " + status.reason()); } verify(NULL != rawRunner); auto_ptr<Runner> runner(rawRunner); // We freak out later if this changes before we're done with the query. 
const ChunkVersion shardingVersionAtStart = shardingState.getVersion(cq->ns()); // Handle query option $maxTimeMS (not used with commands). curop.setMaxTimeMicros(static_cast<unsigned long long>(pq.getMaxTimeMS()) * 1000); killCurrentOp.checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point. // uassert if we are not on a primary, and not a secondary with SlaveOk query parameter set. replVerifyReadsOk(&pq); // If this exists, the collection is sharded. // If it doesn't exist, we can assume we're not sharded. // If we're sharded, we might encounter data that is not consistent with our sharding state. // We must ignore this data. CollectionMetadataPtr collMetadata; if (!shardingState.needCollectionMetadata(pq.ns())) { collMetadata = CollectionMetadataPtr(); } else { collMetadata = shardingState.getCollectionMetadata(pq.ns()); } // Run the query. // bb is used to hold query results // this buffer should contain either requested documents per query or // explain information, but not both BufBuilder bb(32768); bb.skip(sizeof(QueryResult)); // How many results have we obtained from the runner? int numResults = 0; // If we're replaying the oplog, we save the last time that we read. OpTime slaveReadTill; // Do we save the Runner in a ClientCursor for getMore calls later? bool saveClientCursor = false; // We turn on auto-yielding for the runner here. The runner registers itself with the // active runners list in ClientCursor. auto_ptr<ScopedRunnerRegistration> safety(new ScopedRunnerRegistration(runner.get())); runner->setYieldPolicy(Runner::YIELD_AUTO); BSONObj obj; Runner::RunnerState state; // uint64_t numMisplacedDocs = 0; // set this outside loop. we will need to use this both within loop and when deciding // to fill in explain information const bool isExplain = pq.isExplain(); while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&obj, NULL))) { // Add result to output buffer. This is unnecessary if explain info is requested if (!isExplain) { bb.appendBuf((void*)obj.objdata(), obj.objsize()); } // Count the result. ++numResults; // Possibly note slave's position in the oplog. if (pq.hasOption(QueryOption_OplogReplay)) { BSONElement e = obj["ts"]; if (Date == e.type() || Timestamp == e.type()) { slaveReadTill = e._opTime(); } } // TODO: only one type of 2d search doesn't support this. We need a way to pull it out // of CanonicalQuery. :( const bool supportsGetMore = true; if (isExplain) { if (enoughForExplain(pq, numResults)) { break; } } else if (!supportsGetMore && (enough(pq, numResults) || bb.len() >= MaxBytesToReturnToClientAtOnce)) { break; } else if (enoughForFirstBatch(pq, numResults, bb.len())) { QLOG() << "Enough for first batch, wantMore=" << pq.wantMore() << " numToReturn=" << pq.getNumToReturn() << " numResults=" << numResults << endl; // If only one result requested assume it's a findOne() and don't save the cursor. if (pq.wantMore() && 1 != pq.getNumToReturn()) { QLOG() << " runner EOF=" << runner->isEOF() << endl; saveClientCursor = !runner->isEOF(); } break; } } // If we cache the runner later, we want to deregister it as it receives notifications // anyway by virtue of being cached. // // If we don't cache the runner later, we are deleting it, so it must be deregistered. // // So, no matter what, deregister the runner. safety.reset(); // Caller expects exceptions thrown in certain cases: // * in-memory sort using too much RAM. 
if (Runner::RUNNER_ERROR == state) { uasserted(17144, "Runner error, memory limit for sort probably exceeded"); } // Why save a dead runner? if (Runner::RUNNER_DEAD == state) { saveClientCursor = false; } else if (pq.hasOption(QueryOption_CursorTailable)) { // If we're tailing a capped collection, we don't bother saving the cursor if the // collection is empty. Otherwise, the semantics of the tailable cursor is that the // client will keep trying to read from it. So we'll keep it around. Collection* collection = ctx.ctx().db()->getCollection(cq->ns()); if (collection && collection->numRecords() != 0 && pq.getNumToReturn() != 1) { saveClientCursor = true; } } // TODO(greg): This will go away soon. if (!shardingState.getVersion(pq.ns()).isWriteCompatibleWith(shardingVersionAtStart)) { // if the version changed during the query we might be missing some data and it's safe to // send this as mongos can resend at this point throw SendStaleConfigException(pq.ns(), "version changed during initial query", shardingVersionAtStart, shardingState.getVersion(pq.ns())); } // Append explain information to query results by asking the runner to produce them. if (isExplain) { TypeExplain* bareExplain; Status res = runner->getExplainPlan(&bareExplain); if (!res.isOK()) { error() << "could not produce explain of query '" << pq.getFilter() << "', error: " << res.reason(); // If numResults and the data in bb don't correspond, we'll crash later when rooting // through the reply msg. BSONObj emptyObj; bb.appendBuf((void*)emptyObj.objdata(), emptyObj.objsize()); // The explain output is actually a result. numResults = 1; // TODO: we can fill out millis etc. here just fine even if the plan screwed up. } else { boost::scoped_ptr<TypeExplain> explain(bareExplain); // Fill in the missing run-time fields in explain, starting with properties of // the process running the query. std::string server = mongoutils::str::stream() << getHostNameCached() << ":" << serverGlobalParams.port; explain->setServer(server); // We might have skipped some results due to chunk migration etc. so our count is // correct. explain->setN(numResults); // Clock the whole operation. explain->setMillis(curop.elapsedMillis()); BSONObj explainObj = explain->toBSON(); bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize()); // The explain output is actually a result. numResults = 1; } } long long ccId = 0; if (saveClientCursor) { // We won't use the runner until it's getMore'd. runner->saveState(); // Allocate a new ClientCursor. We don't have to worry about leaking it as it's // inserted into a global map by its ctor. ClientCursor* cc = new ClientCursor(runner.get(), cq->getParsed().getOptions(), cq->getParsed().getFilter()); ccId = cc->cursorid(); QLOG() << "caching runner with cursorid " << ccId << " after returning " << numResults << " results" << endl; // ClientCursor takes ownership of runner. Release to make sure it's not deleted. runner.release(); // TODO document if (pq.hasOption(QueryOption_OplogReplay) && !slaveReadTill.isNull()) { cc->slaveReadTill(slaveReadTill); } // TODO document if (pq.hasOption(QueryOption_Exhaust)) { curop.debug().exhaust = true; } // Set attributes for getMore. cc->setCollMetadata(collMetadata); cc->setPos(numResults); // If the query had a time limit, remaining time is "rolled over" to the cursor (for // use by future getmore ops). 
cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros()); } else { QLOG() << "not caching runner but returning " << numResults << " results\n"; } // Add the results from the query into the output buffer. result.appendData(bb.buf(), bb.len()); bb.decouple(); // Fill out the output buffer's header. QueryResult* qr = static_cast<QueryResult*>(result.header()); qr->cursorId = ccId; curop.debug().cursorid = (0 == ccId ? -1 : ccId); qr->setResultFlagsToOk(); qr->setOperation(opReply); qr->startingFrom = 0; qr->nReturned = numResults; curop.debug().ntoskip = pq.getSkip(); curop.debug().nreturned = numResults; // curop.debug().exhaust is set above. return curop.debug().exhaust ? pq.ns() : ""; }
std::string runQuery(OperationContext* opCtx, QueryMessage& q, const NamespaceString& nss, Message& result) { CurOp& curOp = *CurOp::get(opCtx); curOp.ensureStarted(); uassert(ErrorCodes::InvalidNamespace, str::stream() << "Invalid ns [" << nss.ns() << "]", nss.isValid()); invariant(!nss.isCommand()); // Set CurOp information. const auto upconvertedQuery = upconvertQueryEntry(q.query, nss, q.ntoreturn, q.ntoskip); beginQueryOp(opCtx, nss, upconvertedQuery, q.ntoreturn, q.ntoskip); // Parse the qm into a CanonicalQuery. const boost::intrusive_ptr<ExpressionContext> expCtx; auto statusWithCQ = CanonicalQuery::canonicalize(opCtx, q, expCtx, ExtensionsCallbackReal(opCtx, &nss), MatchExpressionParser::kAllowAllSpecialFeatures); if (!statusWithCQ.isOK()) { uasserted(17287, str::stream() << "Can't canonicalize query: " << statusWithCQ.getStatus().toString()); } unique_ptr<CanonicalQuery> cq = std::move(statusWithCQ.getValue()); invariant(cq.get()); LOG(5) << "Running query:\n" << redact(cq->toString()); LOG(2) << "Running query: " << redact(cq->toStringShort()); // Parse, canonicalize, plan, transcribe, and get a plan executor. AutoGetCollectionForReadCommand ctx(opCtx, nss, AutoGetCollection::ViewMode::kViewsForbidden); Collection* const collection = ctx.getCollection(); { const QueryRequest& qr = cq->getQueryRequest(); // Allow the query to run on secondaries if the read preference permits it. If no read // preference was specified, allow the query to run iff slaveOk has been set. const bool slaveOK = qr.hasReadPref() ? uassertStatusOK(ReadPreferenceSetting::fromContainingBSON(q.query)) .canRunOnSecondary() : qr.isSlaveOk(); uassertStatusOK( repl::ReplicationCoordinator::get(opCtx)->checkCanServeReadsFor(opCtx, nss, slaveOK)); } // We have a parsed query. Time to get the execution plan for it. auto exec = uassertStatusOK(getExecutorLegacyFind(opCtx, collection, nss, std::move(cq))); const QueryRequest& qr = exec->getCanonicalQuery()->getQueryRequest(); // If it's actually an explain, do the explain and return rather than falling through // to the normal query execution loop. if (qr.isExplain()) { BufBuilder bb; bb.skip(sizeof(QueryResult::Value)); BSONObjBuilder explainBob; Explain::explainStages( exec.get(), collection, ExplainOptions::Verbosity::kExecAllPlans, &explainBob); // Add the resulting object to the return buffer. BSONObj explainObj = explainBob.obj(); bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize()); // Set query result fields. QueryResult::View qr = bb.buf(); qr.setResultFlagsToOk(); qr.msgdata().setLen(bb.len()); curOp.debug().responseLength = bb.len(); qr.msgdata().setOperation(opReply); qr.setCursorId(0); qr.setStartingFrom(0); qr.setNReturned(1); result.setData(bb.release()); return ""; } // Handle query option $maxTimeMS (not used with commands). if (qr.getMaxTimeMS() > 0) { uassert(40116, "Illegal attempt to set operation deadline within DBDirectClient", !opCtx->getClient()->isInDirectClient()); opCtx->setDeadlineAfterNowBy(Milliseconds{qr.getMaxTimeMS()}); } opCtx->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point. // Run the query. // bb is used to hold query results // this buffer should contain either requested documents per query or // explain information, but not both BufBuilder bb(FindCommon::kInitReplyBufferSize); bb.skip(sizeof(QueryResult::Value)); // How many results have we obtained from the executor? int numResults = 0; BSONObj obj; PlanExecutor::ExecState state; // Get summary info about which plan the executor is using. 
{ stdx::lock_guard<Client> lk(*opCtx->getClient()); curOp.setPlanSummary_inlock(Explain::getPlanSummary(exec.get())); } while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) { // If we can't fit this result inside the current batch, then we stash it for later. if (!FindCommon::haveSpaceForNext(obj, numResults, bb.len())) { exec->enqueue(obj); break; } // Add result to output buffer. bb.appendBuf((void*)obj.objdata(), obj.objsize()); // Count the result. ++numResults; if (FindCommon::enoughForFirstBatch(qr, numResults)) { LOG(5) << "Enough for first batch, wantMore=" << qr.wantMore() << " ntoreturn=" << qr.getNToReturn().value_or(0) << " numResults=" << numResults; break; } } // Caller expects exceptions thrown in certain cases. if (PlanExecutor::FAILURE == state || PlanExecutor::DEAD == state) { error() << "Plan executor error during find: " << PlanExecutor::statestr(state) << ", stats: " << redact(Explain::getWinningPlanStats(exec.get())); uassertStatusOKWithContext(WorkingSetCommon::getMemberObjectStatus(obj), "Executor error during OP_QUERY find"); MONGO_UNREACHABLE; } // Before saving the cursor, ensure that whatever plan we established happened with the expected // collection version auto css = CollectionShardingState::get(opCtx, nss); css->checkShardVersionOrThrow(opCtx); // Fill out CurOp based on query results. If we have a cursorid, we will fill out CurOp with // this cursorid later. long long ccId = 0; if (shouldSaveCursor(opCtx, collection, state, exec.get())) { // We won't use the executor until it's getMore'd. exec->saveState(); exec->detachFromOperationContext(); // Allocate a new ClientCursor and register it with the cursor manager. ClientCursorPin pinnedCursor = collection->getCursorManager()->registerCursor( opCtx, {std::move(exec), nss, AuthorizationSession::get(opCtx->getClient())->getAuthenticatedUserNames(), opCtx->recoveryUnit()->getReadConcernLevel(), upconvertedQuery}); ccId = pinnedCursor.getCursor()->cursorid(); LOG(5) << "caching executor with cursorid " << ccId << " after returning " << numResults << " results"; // TODO document if (qr.isExhaust()) { curOp.debug().exhaust = true; } pinnedCursor.getCursor()->setPos(numResults); // We assume that cursors created through a DBDirectClient are always used from their // original OperationContext, so we do not need to move time to and from the cursor. if (!opCtx->getClient()->isInDirectClient()) { // If the query had a time limit, remaining time is "rolled over" to the cursor (for // use by future getmore ops). pinnedCursor.getCursor()->setLeftoverMaxTimeMicros(opCtx->getRemainingMaxTimeMicros()); } endQueryOp(opCtx, collection, *pinnedCursor.getCursor()->getExecutor(), numResults, ccId); } else { LOG(5) << "Not caching executor but returning " << numResults << " results."; endQueryOp(opCtx, collection, *exec, numResults, ccId); } // Fill out the output buffer's header. QueryResult::View queryResultView = bb.buf(); queryResultView.setCursorId(ccId); queryResultView.setResultFlagsToOk(); queryResultView.msgdata().setLen(bb.len()); queryResultView.msgdata().setOperation(opReply); queryResultView.setStartingFrom(0); queryResultView.setNReturned(numResults); // Add the results from the query into the output buffer. result.setData(bb.release()); // curOp.debug().exhaust is set above. return curOp.debug().exhaust ? nss.ns() : ""; }
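A minimal sketch of the first-batch cutoff applied in the result loops above: stop appending documents once the next one would exceed the reply's byte budget or the requested batch size has been reached. The names and parameters are illustrative, not the FindCommon helpers themselves.

#include <cstddef>
#include <vector>

// Returns how many documents fit into the first reply batch.
std::size_t buildFirstBatch(const std::vector<std::size_t>& docSizes,
                            std::size_t nToReturn,       // 0 means no explicit limit
                            std::size_t maxReplyBytes) {
    std::size_t numResults = 0;
    std::size_t replyBytes = 0;
    for (std::size_t size : docSizes) {
        if (replyBytes + size > maxReplyBytes)
            break;  // the next document would not fit; leave it for a getMore
        replyBytes += size;
        ++numResults;
        if (nToReturn != 0 && numResults >= nToReturn)
            break;  // requested first-batch size reached
    }
    return numResults;
}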
Status IndexBuildInterceptor::drainWritesIntoIndex(OperationContext* opCtx, const InsertDeleteOptions& options, RecoveryUnit::ReadSource readSource) { invariant(!opCtx->lockState()->inAWriteUnitOfWork()); // Callers may request to read at a specific timestamp so that no drained writes are timestamped // earlier than their original write timestamp. Also ensure that leaving this function resets // the ReadSource to its original value. auto resetReadSourceGuard = makeGuard([ opCtx, prevReadSource = opCtx->recoveryUnit()->getTimestampReadSource() ] { opCtx->recoveryUnit()->abandonSnapshot(); opCtx->recoveryUnit()->setTimestampReadSource(prevReadSource); }); if (readSource != RecoveryUnit::ReadSource::kUnset) { opCtx->recoveryUnit()->abandonSnapshot(); opCtx->recoveryUnit()->setTimestampReadSource(readSource); } else { resetReadSourceGuard.dismiss(); } // These are used for logging only. int64_t totalDeleted = 0; int64_t totalInserted = 0; Timer timer; const int64_t appliedAtStart = _numApplied; // Set up the progress meter. This will never be completely accurate, because more writes can be // read from the side writes table than are observed before draining. static const char* curopMessage = "Index Build: draining writes received during build"; ProgressMeterHolder progress; { stdx::unique_lock<Client> lk(*opCtx->getClient()); progress.set(CurOp::get(opCtx)->setProgress_inlock(curopMessage)); } // Force the progress meter to log at the end of every batch. By default, the progress meter // only logs after a large number of calls to hit(), but since we batch inserts by up to // 1000 records, progress would rarely be displayed. progress->reset(_sideWritesCounter.load() - appliedAtStart /* total */, 3 /* secondsBetween */, 1 /* checkInterval */); // Buffer operations into batches to insert per WriteUnitOfWork. Impose an upper limit on the // number of documents and the total size of the batch. const int32_t kBatchMaxSize = 1000; const int64_t kBatchMaxBytes = BSONObjMaxInternalSize; int64_t batchSizeBytes = 0; std::vector<SideWriteRecord> batch; batch.reserve(kBatchMaxSize); // Hold on to documents that would exceed the per-batch memory limit. Always insert this first // into the next batch. boost::optional<SideWriteRecord> stashed; auto cursor = _sideWritesTable->rs()->getCursor(opCtx); bool atEof = false; while (!atEof) { opCtx->checkForInterrupt(); // Stashed records should be inserted into a batch first. if (stashed) { invariant(batch.empty()); batch.push_back(std::move(stashed.get())); stashed.reset(); } auto record = cursor->next(); if (record) { RecordId currentRecordId = record->id; BSONObj docOut = record->data.toBson().getOwned(); // If the total batch size in bytes would be too large, stash this document and let the // current batch insert. int objSize = docOut.objsize(); if (batchSizeBytes + objSize > kBatchMaxBytes) { invariant(!stashed); // Stash this document to be inserted in the next batch. stashed.emplace(currentRecordId, std::move(docOut)); } else { batchSizeBytes += objSize; batch.emplace_back(currentRecordId, std::move(docOut)); // Continue if there is more room in the batch. if (batch.size() < kBatchMaxSize) { continue; } } } else { atEof = true; if (batch.empty()) break; } invariant(!batch.empty()); cursor->save(); // If we are here, either we have reached the end of the table or the batch is full, so // insert everything in one WriteUnitOfWork, and delete each inserted document from the side // writes table. 
auto status = writeConflictRetry(opCtx, "index build drain", _indexCatalogEntry->ns(), [&] { WriteUnitOfWork wuow(opCtx); for (auto& operation : batch) { auto status = _applyWrite(opCtx, operation.second, options, &totalInserted, &totalDeleted); if (!status.isOK()) { return status; } // Delete the document from the table as soon as it has been inserted into the // index. This ensures that no key is ever inserted twice and no keys are skipped. _sideWritesTable->rs()->deleteRecord(opCtx, operation.first); } // For rollback to work correctly, these writes need to be timestamped. The actual time // is not important, as long as it is not older than the most recent visible side write. IndexTimestampHelper::setGhostCommitTimestampForWrite( opCtx, NamespaceString(_indexCatalogEntry->ns())); wuow.commit(); return Status::OK(); }); if (!status.isOK()) { return status; } progress->hit(batch.size()); // Lock yielding will only happen if we are holding intent locks. _tryYield(opCtx); cursor->restore(); // Account for more writes coming in during a batch. progress->setTotalWhileRunning(_sideWritesCounter.loadRelaxed() - appliedAtStart); _numApplied += batch.size(); batch.clear(); batchSizeBytes = 0; } progress->finished(); int logLevel = (_numApplied - appliedAtStart > 0) ? 0 : 1; LOG(logLevel) << "index build: drain applied " << (_numApplied - appliedAtStart) << " side writes (inserted: " << totalInserted << ", deleted: " << totalDeleted << ") for '" << _indexCatalogEntry->descriptor()->indexName() << "' in " << timer.millis() << " ms"; return Status::OK(); }
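/* Editor's sketch (illustrative only, not the MongoDB implementation): drainWritesIntoIndex()
   above applies queued side writes in batches bounded both by document count (kBatchMaxSize)
   and by total bytes (kBatchMaxBytes); a record that would overflow the byte budget is stashed
   and becomes the first entry of the next batch. The names Record, drainInBatches() and
   applyBatch below are made up for this example. */
#include <cstddef>
#include <deque>
#include <functional>
#include <optional>
#include <string>
#include <vector>

struct Record {
    long long id;
    std::string doc;
};

void drainInBatches(std::deque<Record> sideWrites,
                    const std::function<void(const std::vector<Record>&)>& applyBatch,
                    std::size_t kBatchMaxSize = 1000,
                    std::size_t kBatchMaxBytes = 16 * 1024 * 1024) {
    std::vector<Record> batch;
    std::size_t batchBytes = 0;
    std::optional<Record> stashed;

    while (!sideWrites.empty() || stashed || !batch.empty()) {
        // A stashed record is always inserted into the next batch first.
        if (stashed) {
            batchBytes += stashed->doc.size();
            batch.push_back(std::move(*stashed));
            stashed.reset();
        }
        const bool atEof = sideWrites.empty();
        if (!atEof) {
            Record next = std::move(sideWrites.front());
            sideWrites.pop_front();
            if (!batch.empty() && batchBytes + next.doc.size() > kBatchMaxBytes) {
                stashed = std::move(next);  // too big for this batch; flush what we have
            } else {
                batchBytes += next.doc.size();
                batch.push_back(std::move(next));
                if (batch.size() < kBatchMaxSize)
                    continue;  // still room; keep accumulating
            }
        }
        if (batch.empty())
            break;  // reached the end with nothing left to apply
        applyBatch(batch);  // one WriteUnitOfWork per batch in the real code
        batch.clear();
        batchBytes = 0;
    }
}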
/* Prepare to build an index. Does not actually build it (except for a special _id case). - We validate that the params are good - That the index does not already exist - Creates the source collection if it DNE example of 'io': { ns : 'test.foo', name : 'z', key : { z : 1 } } throws DBException @param sourceNS - source NS we are indexing @param sourceCollection - its details ptr @return true if ok to continue. when false we stop/fail silently (index already exists) */ bool prepareToBuildIndex(const BSONObj& io, bool god, string& sourceNS, NamespaceDetails *&sourceCollection, BSONObj& fixedIndexObject ) { sourceCollection = 0; // logical name of the index. todo: get rid of the name, we don't need it! const char *name = io.getStringField("name"); uassert(12523, "no index name specified", *name); // the collection for which we are building an index sourceNS = io.getStringField("ns"); uassert(10096, "invalid ns to index", sourceNS.find( '.' ) != string::npos); massert(10097, str::stream() << "bad table to index name on add index attempt current db: " << cc().database()->name << " source: " << sourceNS , cc().database()->name == nsToDatabase(sourceNS.c_str())); BSONObj key = io.getObjectField("key"); uassert(12524, "index key pattern too large", key.objsize() <= 2048); if( !validKeyPattern(key) ) { string s = string("bad index key pattern ") + key.toString(); uasserted(10098 , s.c_str()); } if ( sourceNS.empty() || key.isEmpty() ) { log(2) << "bad add index attempt name:" << (name?name:"") << "\n ns:" << sourceNS << "\n idxobj:" << io.toString() << endl; string s = "bad add index attempt " + sourceNS + " key:" + key.toString(); uasserted(12504, s); } sourceCollection = nsdetails(sourceNS.c_str()); if( sourceCollection == 0 ) { // try to create it string err; if ( !userCreateNS(sourceNS.c_str(), BSONObj(), err, false) ) { problem() << "ERROR: failed to create collection while adding its index. " << sourceNS << endl; return false; } sourceCollection = nsdetails(sourceNS.c_str()); tlog() << "info: creating collection " << sourceNS << " on add index" << endl; verify( sourceCollection ); } if ( sourceCollection->findIndexByName(name) >= 0 ) { // index already exists. return false; } if( sourceCollection->findIndexByKeyPattern(key) >= 0 ) { log(2) << "index already exists with diff name " << name << ' ' << key.toString() << endl; return false; } if ( sourceCollection->nIndexes >= NamespaceDetails::NIndexesMax ) { stringstream ss; ss << "add index fails, too many indexes for " << sourceNS << " key:" << key.toString(); string s = ss.str(); log() << s << '\n'; uasserted(12505,s); } /* we can't build a new index for the ns if a build is already in progress in the background - EVEN IF this is a foreground build. */ uassert(12588, "cannot add index with a background operation in progress", !BackgroundOperation::inProgForNs(sourceNS.c_str())); /* this is because we want key patterns like { _id : 1 } and { _id : <someobjid> } to all be treated as the same pattern. */ if ( IndexDetails::isIdIndexPattern(key) ) { if( !god ) { ensureHaveIdIndex( sourceNS.c_str() ); return false; } } else { /* is buildIndexes:false set for this replica set member? if so we don't build any indexes except _id */ if( theReplSet && !theReplSet->buildIndexes() ) return false; } string pluginName = IndexPlugin::findPluginName( key ); IndexPlugin * plugin = pluginName.size() ? 
IndexPlugin::get( pluginName ) : 0; { BSONObj o = io; if ( plugin ) { o = plugin->adjustIndexSpec(o); } BSONObjBuilder b; int v = DefaultIndexVersionNumber; if( !o["v"].eoo() ) { double vv = o["v"].Number(); // note: one day we may be able to build fewer index versions than we can read; // isASupportedIndexVersionNumber() determines which versions we can use uassert(14803, str::stream() << "this version of mongod cannot build new indexes of version number " << vv, vv == 0 || vv == 1); v = (int) vv; } // the idea is to put frequently used fields earlier in the object b.append("v", v); b.append(o["key"]); if( o["unique"].trueValue() ) b.appendBool("unique", true); // normalize to bool true in case it was int 1 or something similar b.append(o["ns"]); { // stripping _id BSONObjIterator i(o); while ( i.more() ) { BSONElement e = i.next(); string s = e.fieldName(); if( s != "_id" && s != "v" && s != "ns" && s != "unique" && s != "key" ) b.append(e); } } fixedIndexObject = b.obj(); } return true; }
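/* Editor's note (derived from the builder code above): the "fixed" index spec is normalized so
   that the most frequently consulted fields come first and user-supplied noise is dropped:
   { v: <index version>, key: <pattern>, [unique: true], ns: <namespace>, ...remaining fields },
   with any "_id" field stripped and "unique" coerced to a real boolean. */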
BSONObj Helpers::toKeyFormat( const BSONObj& o ) { BSONObjBuilder keyObj( o.objsize() ); BSONForEach( e , o ) { keyObj.appendAs( e , "" ); } return keyObj.obj(); }
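/* Editor's note (illustrative usage, not in the original source): toKeyFormat() re-appends every
   element under the empty field name, which is the layout used for index keys. Assuming mongo's
   BSON headers are available:
       BSONObj doc = BSON( "a" << 1 << "b" << "x" );
       BSONObj key = Helpers::toKeyFormat( doc );   // serializes as { "" : 1, "" : "x" } */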
/* * Runs the command object cmdobj on the db with name dbname and puts result in result. * @param dbname, name of db * @param cmdobj, object that contains entire command * @param options * @param errmsg, reference to error message * @param result, reference to builder for result * @param fromRepl * @return true if successful, false otherwise */ bool FTSCommand::_run(OperationContext* txn, const string& dbname, BSONObj& cmdObj, int cmdOptions, const string& ns, const string& searchString, string language, // "" for not-set int limit, BSONObj& filter, BSONObj& projection, string& errmsg, BSONObjBuilder& result ) { Timer comm; // Rewrite the cmd as a normal query. BSONObjBuilder queryBob; queryBob.appendElements(filter); BSONObjBuilder textBob; textBob.append("$search", searchString); if (!language.empty()) { textBob.append("$language", language); } queryBob.append("$text", textBob.obj()); // This is the query we exec. BSONObj queryObj = queryBob.obj(); // We sort by the score. BSONObj sortSpec = BSON("$s" << BSON("$meta" << LiteParsedQuery::metaTextScore)); // We also project the score into the document and strip it out later during the reformatting // of the results. BSONObjBuilder projBob; projBob.appendElements(projection); projBob.appendElements(sortSpec); BSONObj projObj = projBob.obj(); Client::ReadContext ctx(txn, ns); CanonicalQuery* cq; Status canonicalizeStatus = CanonicalQuery::canonicalize(ns, queryObj, sortSpec, projObj, 0, limit, BSONObj(), &cq, WhereCallbackReal(txn, StringData(dbname))); if (!canonicalizeStatus.isOK()) { errmsg = canonicalizeStatus.reason(); return false; } Runner* rawRunner; Status getRunnerStatus = getRunner(txn, ctx.ctx().db()->getCollection(txn, ns), cq, &rawRunner); if (!getRunnerStatus.isOK()) { errmsg = getRunnerStatus.reason(); return false; } auto_ptr<Runner> runner(rawRunner); BSONArrayBuilder resultBuilder(result.subarrayStart("results")); // Quoth: "leave a mb for other things" int resultSize = 1024 * 1024; int numReturned = 0; BSONObj obj; while (Runner::RUNNER_ADVANCED == runner->getNext(&obj, NULL)) { if ((resultSize + obj.objsize()) >= BSONObjMaxUserSize) { break; } // We return an array of results. Add another element. BSONObjBuilder oneResultBuilder(resultBuilder.subobjStart()); oneResultBuilder.append("score", obj["$s"].number()); // Strip out the score from the returned obj. BSONObjIterator resIt(obj); BSONObjBuilder resBob; while (resIt.more()) { BSONElement elt = resIt.next(); if (!mongoutils::str::equals("$s", elt.fieldName())) { resBob.append(elt); } } oneResultBuilder.append("obj", resBob.obj()); BSONObj addedArrayObj = oneResultBuilder.done(); resultSize += addedArrayObj.objsize(); numReturned++; } resultBuilder.done(); // returns some stats to the user BSONObjBuilder stats(result.subobjStart("stats")); // Fill in nscanned from the explain. TypeExplain* bareExplain; Status res = runner->getInfo(&bareExplain, NULL); if (res.isOK()) { auto_ptr<TypeExplain> explain(bareExplain); stats.append("nscanned", explain->getNScanned()); stats.append("nscannedObjects", explain->getNScannedObjects()); } stats.appendNumber( "n" , numReturned ); stats.append( "timeMicros", (int)comm.micros() ); stats.done(); return true; }
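/* Editor's note (illustrative, derived from the builder code above): for a request such as
   { search: "coffee", language: "english", filter: { inStock: true }, projection: { name: 1 } }
   the command effectively canonicalizes and runs the query below; the score is projected in as
   "$s" and stripped back out while reformatting each result.
       query:      { inStock: true, $text: { $search: "coffee", $language: "english" } }
       sort:       { $s: { $meta: "textScore" } }
       projection: { name: 1, $s: { $meta: "textScore" } } */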
static void _logOpRS(const char *opstr, const char *ns, const char *logNS, const BSONObj& obj, BSONObj *o2, bool *bb, bool fromMigrate ) { Lock::DBWrite lk1("local"); if ( strncmp(ns, "local.", 6) == 0 ) { if ( strncmp(ns, "local.slaves", 12) == 0 ) resetSlaveCache(); return; } mutex::scoped_lock lk2(OpTime::m); const OpTime ts = OpTime::now(lk2); long long hashNew; if( theReplSet ) { massert(13312, "replSet error : logOp() but not primary?", theReplSet->box.getState().primary()); hashNew = (theReplSet->lastH * 131 + ts.asLL()) * 17 + theReplSet->selfId(); } else { // must be initiation verify( *ns == 0 ); hashNew = 0; } /* we jump through a bunch of hoops here to avoid copying the obj buffer twice -- instead we do a single copy to the destination position in the memory mapped file. */ logopbufbuilder.reset(); BSONObjBuilder b(logopbufbuilder); b.appendTimestamp("ts", ts.asDate()); b.append("h", hashNew); b.append("v", OPLOG_VERSION); b.append("op", opstr); b.append("ns", ns); if (fromMigrate) b.appendBool("fromMigrate", true); if ( bb ) b.appendBool("b", *bb); if ( o2 ) b.append("o2", *o2); BSONObj partial = b.done(); int posz = partial.objsize(); int len = posz + obj.objsize() + 1 + 2 /*o:*/; Record *r; DEV verify( logNS == 0 ); { const char *logns = rsoplog; if ( rsOplogDetails == 0 ) { Client::Context ctx( logns , dbpath, false); localDB = ctx.db(); verify( localDB ); rsOplogDetails = nsdetails(logns); massert(13347, "local.oplog.rs missing. did you drop it? if so restart server", rsOplogDetails); } Client::Context ctx( logns , localDB, false ); r = theDataFileMgr.fast_oplog_insert(rsOplogDetails, logns, len); /* todo: now() has code to handle clock skew. but if the skew server to server is large it will get unhappy. this code (or code in now() maybe) should be improved. */ if( theReplSet ) { if( !(theReplSet->lastOpTimeWritten<ts) ) { log() << "replSet ERROR possible failover clock skew issue? " << theReplSet->lastOpTimeWritten << ' ' << ts << rsLog; log() << "replSet " << theReplSet->isPrimary() << rsLog; } theReplSet->lastOpTimeWritten = ts; theReplSet->lastH = hashNew; ctx.getClient()->setLastOp( ts ); } } append_O_Obj(r->data(), partial, obj); if ( logLevel >= 6 ) { LOG( 6 ) << "logOp:" << BSONObj::make(r) << endl; } }
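/* Editor's sketch (illustrative; assumes mongo's BSON builders and OpTime are available, and
   makeReplSetOplogEntry() is a made-up name): a replica-set oplog entry written by _logOpRS()
   has the shape { ts, h, v, op, ns, [fromMigrate], [b], [o2], o }. The real code appends "o"
   last, directly into the preallocated oplog record, which is why it builds a "partial" object
   and splices obj in afterwards with append_O_Obj(). */
BSONObj makeReplSetOplogEntry(const OpTime& ts, long long hash, const char* opstr,
                              const char* ns, const BSONObj& o) {
    BSONObjBuilder b;
    b.appendTimestamp("ts", ts.asDate());  // logical time of the operation
    b.append("h", hash);                   // per-entry hash used for replica set rollback/election
    b.append("v", OPLOG_VERSION);          // oplog format version
    b.append("op", opstr);                 // "i", "u", "d", "c", "n"
    b.append("ns", ns);                    // target namespace
    b.append("o", o);                      // the operation document itself
    return b.obj();
}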
/** object cannot be represented in compact format. so store in traditional bson format with a leading sentinel byte IsBSON to indicate it's in that format. Given that the KeyV1Owned constructor already grabbed a bufbuilder, we reuse it here so that we don't have to do an extra malloc. */ void KeyV1Owned::traditional(const BSONObj& obj) { b.reset(); b.appendUChar(IsBSON); b.appendBuf(obj.objdata(), obj.objsize()); _keyData = (const unsigned char *) b.buf(); }
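/* Editor's note: when a key cannot be stored in the compact KeyV1 encoding, traditional() above
   falls back to [ IsBSON sentinel byte ][ raw BSON bytes of the key ], so a reader can tell the
   two formats apart from the first byte alone. A minimal decoding sketch, assuming the same
   IsBSON constant (decodeCompactKeyV1() is a hypothetical name for the compact-format path):
       BSONObj asBson(const unsigned char* keyData) {
           if (*keyData == IsBSON)
               return BSONObj(reinterpret_cast<const char*>(keyData + 1));  // plain BSON follows
           return decodeCompactKeyV1(keyData);
       } */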
/** @param fromRepl false if from ApplyOpsCmd @return true if this was an update that should have happened but the target document does not exist; see the replset initial sync code. */ bool applyOperation_inlock(const BSONObj& op, bool fromRepl, bool convertUpdateToUpsert) { LOG(3) << "applying op: " << op << endl; bool failedUpdate = false; OpCounters * opCounters = fromRepl ? &replOpCounters : &globalOpCounters; const char *names[] = { "o", "ns", "op", "b" }; BSONElement fields[4]; op.getFields(4, names, fields); BSONObj o; if( fields[0].isABSONObj() ) o = fields[0].embeddedObject(); const char *ns = fields[1].valuestrsafe(); Lock::assertWriteLocked(ns); NamespaceDetails *nsd = nsdetails(ns); // operation type -- see logOp() comments for types const char *opType = fields[2].valuestrsafe(); if ( *opType == 'i' ) { opCounters->gotInsert(); const char *p = strchr(ns, '.'); if ( p && strcmp(p, ".system.indexes") == 0 ) { // updates aren't allowed for indexes -- so we will do a regular insert. if index already // exists, that is ok. theDataFileMgr.insert(ns, (void*) o.objdata(), o.objsize()); } else { // do upserts for inserts as we might get replayed more than once OpDebug debug; BSONElement _id; if( !o.getObjectID(_id) ) { /* No _id. This will be very slow. */ Timer t; updateObjectsForReplication(ns, o, o, true, false, false, debug, false, QueryPlanSelectionPolicy::idElseNatural() ); if( t.millis() >= 2 ) { RARELY OCCASIONALLY log() << "warning, repl doing slow updates (no _id field) for " << ns << endl; } } else { // probably don't need this since all replicated colls have _id indexes now // but keep it just in case RARELY if ( nsd && !nsd->isCapped() ) { ensureHaveIdIndex(ns); } /* todo : it may be better to do an insert here, and then catch the dup key exception and do update then. very few upserts will not be inserts... */ BSONObjBuilder b; b.append(_id); updateObjectsForReplication(ns, o, b.done(), true, false, false , debug, false, QueryPlanSelectionPolicy::idElseNatural() ); } } }
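/* Editor's note: for an 'i' (insert) oplog entry the code above deliberately replays the insert
   as an upsert keyed on the document's _id (except for system.indexes, which gets a plain
   insert), so re-applying the same entry during initial sync or rollback is idempotent instead
   of failing with a duplicate key. Entries with no _id fall back to a full-document match, which
   is why the slow-update warning is logged. */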
void operator()( DBClientCursorBatchIterator &i ) { Lock::GlobalWrite lk; context.relocked(); bool createdCollection = false; Collection* collection = NULL; while( i.moreInCurrentBatch() ) { if ( numSeen % 128 == 127 /*yield some*/ ) { collection = NULL; time_t now = time(0); if( now - lastLog >= 60 ) { // report progress if( lastLog ) log() << "clone " << to_collection << ' ' << numSeen << endl; lastLog = now; } mayInterrupt( _mayBeInterrupted ); dbtempreleaseif t( _mayYield ); } if ( isindex == false && collection == NULL ) { collection = context.db()->getCollection( to_collection ); if ( !collection ) { massert( 17321, str::stream() << "collection dropped during clone [" << to_collection << "]", !createdCollection ); createdCollection = true; collection = context.db()->createCollection( txn, to_collection ); verify( collection ); } } BSONObj tmp = i.nextSafe(); /* assure object is valid. note this will slow us down a little. */ const Status status = validateBSON(tmp.objdata(), tmp.objsize()); if (!status.isOK()) { out() << "Cloner: skipping corrupt object from " << from_collection << ": " << status.reason(); continue; } ++numSeen; BSONObj js = tmp; if ( isindex ) { verify(nsToCollectionSubstring(from_collection) == "system.indexes"); js = fixindex(context.db()->name(), tmp); indexesToBuild->push_back( js.getOwned() ); continue; } verify(nsToCollectionSubstring(from_collection) != "system.indexes"); StatusWith<DiskLoc> loc = collection->insertDocument( txn, js, true ); if ( !loc.isOK() ) { error() << "error: exception cloning object in " << from_collection << ' ' << loc.toString() << " obj:" << js; } uassertStatusOK( loc.getStatus() ); if ( logForRepl ) logOp(txn, "i", to_collection, js); txn->commitIfNeeded(); RARELY if ( time( 0 ) - saveLast > 60 ) { log() << numSeen << " objects cloned so far from collection " << from_collection; saveLast = time( 0 ); } } }
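/* Editor's note: the clone loop above trades a little speed for safety: every document received
   from the remote side is checked with validateBSON() and corrupt objects are skipped (with a
   log line) rather than aborting the clone; index specs from system.indexes are only collected
   here and built after the data copy; and the loop periodically yields and logs progress at most
   once a minute. */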
/** * Returns true if we need to keep a ClientCursor saved for this pipeline (for future getMore * requests). Otherwise, returns false. */ static bool handleCursorCommand(OperationContext* txn, const string& ns, ClientCursorPin* pin, PlanExecutor* exec, const BSONObj& cmdObj, BSONObjBuilder& result) { ClientCursor* cursor = pin ? pin->c() : NULL; if (pin) { invariant(cursor); invariant(cursor->getExecutor() == exec); invariant(cursor->isAggCursor()); } const long long defaultBatchSize = 101; // Same as query. long long batchSize; uassertStatusOK(Command::parseCommandCursorOptions(cmdObj, defaultBatchSize, &batchSize)); // can't use result BSONObjBuilder directly since it won't handle exceptions correctly. BSONArrayBuilder resultsArray; const int byteLimit = MaxBytesToReturnToClientAtOnce; BSONObj next; for (int objCount = 0; objCount < batchSize; objCount++) { // The initial getNext() on a PipelineProxyStage may be very expensive so we don't // do it when batchSize is 0 since that indicates a desire for a fast return. if (exec->getNext(&next, NULL) != PlanExecutor::ADVANCED) { // make it an obvious error to use cursor or executor after this point cursor = NULL; exec = NULL; break; } if (resultsArray.len() + next.objsize() > byteLimit) { // Get the pipeline proxy stage wrapped by this PlanExecutor. PipelineProxyStage* proxy = static_cast<PipelineProxyStage*>(exec->getRootStage()); // too big. next will be the first doc in the second batch proxy->pushBack(next); break; } resultsArray.append(next); } // NOTE: exec->isEOF() can have side effects such as writing by $out. However, it should // be relatively quick since if there was no pin then the input is empty. Also, this // violates the contract for batchSize==0. Sharding requires a cursor to be returned in that // case. This is ok for now however, since you can't have a sharded collection that doesn't // exist. const bool canReturnMoreBatches = pin; if (!canReturnMoreBatches && exec && !exec->isEOF()) { // msgasserting since this shouldn't be possible to trigger from today's aggregation // language. The wording assumes that the only reason pin would be null is if the // collection doesn't exist. msgasserted(17391, str::stream() << "Aggregation has more results than fit in initial batch, but can't " << "create cursor since collection " << ns << " doesn't exist"); } if (cursor) { // If a time limit was set on the pipeline, remaining time is "rolled over" to the // cursor (for use by future getmore ops). cursor->setLeftoverMaxTimeMicros( txn->getCurOp()->getRemainingMaxTimeMicros() ); if (txn->getClient()->isInDirectClient()) { cursor->setUnownedRecoveryUnit(txn->recoveryUnit()); } else { // We stash away the RecoveryUnit in the ClientCursor. It's used for subsequent // getMore requests. The calling OpCtx gets a fresh RecoveryUnit. txn->recoveryUnit()->commitAndRestart(); cursor->setOwnedRecoveryUnit(txn->releaseRecoveryUnit()); StorageEngine* storageEngine = getGlobalServiceContext()->getGlobalStorageEngine(); txn->setRecoveryUnit(storageEngine->newRecoveryUnit()); } // Cursor needs to be in a saved state while we yield locks for getmore. State // will be restored in getMore(). exec->saveState(); } const long long cursorId = cursor ? cursor->cursorid() : 0LL; appendCursorResponseObject(cursorId, ns, resultsArray.arr(), &result); return static_cast<bool>(cursor); }
std::string DBHashCmd::hashCollection(OperationContext* opCtx, Database* db, const std::string& fullCollectionName, bool* fromCache) { boost::unique_lock<boost::mutex> cachedHashedLock(_cachedHashedMutex, boost::defer_lock); if ( isCachable( fullCollectionName ) ) { cachedHashedLock.lock(); string hash = _cachedHashed[fullCollectionName]; if ( hash.size() > 0 ) { *fromCache = true; return hash; } } *fromCache = false; Collection* collection = db->getCollection( fullCollectionName ); if ( !collection ) return ""; IndexDescriptor* desc = collection->getIndexCatalog()->findIdIndex( opCtx ); auto_ptr<PlanExecutor> exec; if ( desc ) { exec.reset(InternalPlanner::indexScan(opCtx, collection, desc, BSONObj(), BSONObj(), false, InternalPlanner::FORWARD, InternalPlanner::IXSCAN_FETCH)); } else if ( collection->isCapped() ) { exec.reset(InternalPlanner::collectionScan(opCtx, fullCollectionName, collection)); } else { log() << "can't find _id index for: " << fullCollectionName << endl; return "no _id _index"; } md5_state_t st; md5_init(&st); long long n = 0; PlanExecutor::ExecState state; BSONObj c; verify(NULL != exec.get()); while (PlanExecutor::ADVANCED == (state = exec->getNext(&c, NULL))) { md5_append( &st , (const md5_byte_t*)c.objdata() , c.objsize() ); n++; } if (PlanExecutor::IS_EOF != state) { warning() << "error while hashing, db dropped? ns=" << fullCollectionName << endl; } md5digest d; md5_finish(&st, d); string hash = digestToString( d ); if (cachedHashedLock.owns_lock()) { _cachedHashed[fullCollectionName] = hash; } return hash; }
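/* Editor's sketch (illustrative; reuses the md5_* helpers and digestToString() visible above,
   hashDocuments() is a made-up name): the collection hash is simply an MD5 over the raw BSON
   bytes of every document, visited in _id-index order (or natural order for capped collections),
   so two nodes produce the same hash iff they hold byte-identical documents in the same order. */
std::string hashDocuments(const std::vector<BSONObj>& docs) {
    md5_state_t st;
    md5_init(&st);
    for (const BSONObj& d : docs) {
        md5_append(&st, (const md5_byte_t*)d.objdata(), d.objsize());
    }
    md5digest digest;
    md5_finish(&st, digest);
    return digestToString(digest);  // hex string, as in hashCollection() above
}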
/** * Returns true if we need to keep a ClientCursor saved for this pipeline (for future getMore * requests). Otherwise, returns false. */ static bool handleCursorCommand(OperationContext* txn, const string& ns, ClientCursorPin* pin, PlanExecutor* exec, const BSONObj& cmdObj, BSONObjBuilder& result) { ClientCursor* cursor = pin ? pin->c() : NULL; if (pin) { invariant(cursor); invariant(cursor->getExecutor() == exec); invariant(cursor->isAggCursor()); } const long long defaultBatchSize = 101; // Same as query. long long batchSize; uassertStatusOK(Command::parseCommandCursorOptions(cmdObj, defaultBatchSize, &batchSize)); // can't use result BSONObjBuilder directly since it won't handle exceptions correctly. BSONArrayBuilder resultsArray; const int byteLimit = FindCommon::kMaxBytesToReturnToClientAtOnce; BSONObj next; for (int objCount = 0; objCount < batchSize; objCount++) { // The initial getNext() on a PipelineProxyStage may be very expensive so we don't // do it when batchSize is 0 since that indicates a desire for a fast return. if (exec->getNext(&next, NULL) != PlanExecutor::ADVANCED) { // make it an obvious error to use cursor or executor after this point cursor = NULL; exec = NULL; break; } // If adding this object will cause us to exceed the BSON size limit, then we stash it for // later. if (resultsArray.len() + next.objsize() > byteLimit) { exec->enqueue(next); break; } resultsArray.append(next); } // NOTE: exec->isEOF() can have side effects such as writing by $out. However, it should // be relatively quick since if there was no pin then the input is empty. Also, this // violates the contract for batchSize==0. Sharding requires a cursor to be returned in that // case. This is ok for now however, since you can't have a sharded collection that doesn't // exist. const bool canReturnMoreBatches = pin; if (!canReturnMoreBatches && exec && !exec->isEOF()) { // msgasserting since this shouldn't be possible to trigger from today's aggregation // language. The wording assumes that the only reason pin would be null is if the // collection doesn't exist. msgasserted( 17391, str::stream() << "Aggregation has more results than fit in initial batch, but can't " << "create cursor since collection " << ns << " doesn't exist"); } if (cursor) { // If a time limit was set on the pipeline, remaining time is "rolled over" to the // cursor (for use by future getmore ops). cursor->setLeftoverMaxTimeMicros(CurOp::get(txn)->getRemainingMaxTimeMicros()); CurOp::get(txn)->debug().cursorid = cursor->cursorid(); // Cursor needs to be in a saved state while we yield locks for getmore. State // will be restored in getMore(). exec->saveState(); exec->detachFromOperationContext(); } const long long cursorId = cursor ? cursor->cursorid() : 0LL; appendCursorResponseObject(cursorId, ns, resultsArray.arr(), &result); return static_cast<bool>(cursor); }
/** * Also called by db/ops/query.cpp. This is the new getMore entry point. */ QueryResult* newGetMore(const char* ns, int ntoreturn, long long cursorid, CurOp& curop, int pass, bool& exhaust, bool* isCursorAuthorized) { exhaust = false; int bufSize = 512 + sizeof(QueryResult) + MaxBytesToReturnToClientAtOnce; BufBuilder bb(bufSize); bb.skip(sizeof(QueryResult)); // This is a read lock. scoped_ptr<Client::ReadContext> ctx(new Client::ReadContext(ns)); QLOG() << "running getMore in new system, cursorid " << cursorid << endl; // This checks to make sure the operation is allowed on a replicated node. Since we are not // passing in a query object (necessary to check SlaveOK query option), the only state where // reads are allowed is PRIMARY (or master in master/slave). This function uasserts if // reads are not okay. replVerifyReadsOk(); // A pin performs a CC lookup and if there is a CC, increments the CC's pin value so it // doesn't time out. Also informs ClientCursor that there is somebody actively holding the // CC, so don't delete it. ClientCursorPin ccPin(cursorid); ClientCursor* cc = ccPin.c(); // These are set in the QueryResult msg we return. int resultFlags = ResultFlag_AwaitCapable; int numResults = 0; int startingResult = 0; if (NULL == cc) { cursorid = 0; resultFlags = ResultFlag_CursorNotFound; } else { // Quote: check for spoofing of the ns such that it does not match the one originally // there for the cursor uassert(17011, "auth error", str::equals(ns, cc->ns().c_str())); *isCursorAuthorized = true; // Reset timeout timer on the cursor since the cursor is still in use. cc->setIdleTime(0); // TODO: fail point? // If the operation that spawned this cursor had a time limit set, apply leftover // time to this getmore. curop.setMaxTimeMicros(cc->getLeftoverMaxTimeMicros()); killCurrentOp.checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point. // TODO: // curop.debug().query = BSONForQuery // curop.setQuery(curop.debug().query); // TODO: What is pass? if (0 == pass) { cc->updateSlaveLocation(curop); } if (cc->isAggCursor) { // Agg cursors handle their own locking internally. ctx.reset(); // unlocks } CollectionMetadataPtr collMetadata = cc->getCollMetadata(); // If we're replaying the oplog, we save the last time that we read. OpTime slaveReadTill; // What number result are we starting at? Used to fill out the reply. startingResult = cc->pos(); // What gives us results. Runner* runner = cc->getRunner(); const int queryOptions = cc->queryOptions(); // Get results out of the runner. runner->restoreState(); BSONObj obj; Runner::RunnerState state; while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&obj, NULL))) { // Add result to output buffer. bb.appendBuf((void*)obj.objdata(), obj.objsize()); // Count the result. ++numResults; // Possibly note slave's position in the oplog. if (queryOptions & QueryOption_OplogReplay) { BSONElement e = obj["ts"]; if (Date == e.type() || Timestamp == e.type()) { slaveReadTill = e._opTime(); } } if ((ntoreturn && numResults >= ntoreturn) || bb.len() > MaxBytesToReturnToClientAtOnce) { break; } } if (Runner::RUNNER_EOF == state && 0 == numResults && (queryOptions & QueryOption_CursorTailable) && (queryOptions & QueryOption_AwaitData) && (pass < 1000)) { // If the cursor is tailable we don't kill it if it's eof. We let it try to get // data some # of times first. return 0; } bool saveClientCursor = false; if (Runner::RUNNER_DEAD == state || Runner::RUNNER_ERROR == state) { // If we're dead there's no way to get more results. 
saveClientCursor = false; // In the old system tailable capped cursors would be killed off at the // cursorid level. If a tailable capped cursor is nuked the cursorid // would vanish. // // In the new system they die and are cleaned up later (or time out). // So this is where we get to remove the cursorid. if (0 == numResults) { resultFlags = ResultFlag_CursorNotFound; } } else if (Runner::RUNNER_EOF == state) { // EOF is also end of the line unless it's tailable. saveClientCursor = queryOptions & QueryOption_CursorTailable; } else { verify(Runner::RUNNER_ADVANCED == state); saveClientCursor = true; } if (!saveClientCursor) { ccPin.deleteUnderlying(); // cc is now invalid, as is the runner cursorid = 0; cc = NULL; QLOG() << "getMore NOT saving client cursor, ended w/state " << Runner::statestr(state) << endl; } else { // Continue caching the ClientCursor. cc->incPos(numResults); runner->saveState(); QLOG() << "getMore saving client cursor ended w/state " << Runner::statestr(state) << endl; // Possibly note slave's position in the oplog. if ((queryOptions & QueryOption_OplogReplay) && !slaveReadTill.isNull()) { cc->slaveReadTill(slaveReadTill); } exhaust = (queryOptions & QueryOption_Exhaust); // If the getmore had a time limit, remaining time is "rolled over" back to the // cursor (for use by future getmore ops). cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() ); } } QueryResult* qr = reinterpret_cast<QueryResult*>(bb.buf()); qr->len = bb.len(); qr->setOperation(opReply); qr->_resultFlags() = resultFlags; qr->cursorId = cursorid; qr->startingFrom = startingResult; qr->nReturned = numResults; bb.decouple(); QLOG() << "getMore returned " << numResults << " results\n"; return qr; }
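/* Editor's note (derived from the code above): the legacy getMore reply reuses the OP_REPLY wire
   format. After the result documents have been appended to the buffer, the QueryResult header at
   the front of that same buffer is filled in with: len (total message length, bb.len()),
   operation (opReply), resultFlags (e.g. ResultFlag_AwaitCapable or ResultFlag_CursorNotFound),
   cursorId (0 once the cursor is exhausted or deleted, otherwise the saved cursor's id),
   startingFrom (the cursor position before this batch), and nReturned (documents in this batch). */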
void insert() { Client::Context ctx( cappedNs() ); BSONObj o = BSON(GENOID << "x" << 456); DiskLoc loc = theDataFileMgr.insert( cappedNs().c_str(), o.objdata(), o.objsize(), false ); verify(!loc.isNull()); }
static void handleCursorCommand(const string& ns, ClientCursorPin* pin, PipelineRunner* runner, const BSONObj& cmdObj, BSONObjBuilder& result) { ClientCursor* cursor = pin ? pin->c() : NULL; if (pin) { invariant(cursor); invariant(cursor->getRunner() == runner); invariant(cursor->isAggCursor); } BSONElement batchSizeElem = cmdObj.getFieldDotted("cursor.batchSize"); const long long batchSize = batchSizeElem.isNumber() ? batchSizeElem.numberLong() : 101; // same as query // can't use result BSONObjBuilder directly since it won't handle exceptions correctly. BSONArrayBuilder resultsArray; const int byteLimit = MaxBytesToReturnToClientAtOnce; BSONObj next; for (int objCount = 0; objCount < batchSize; objCount++) { // The initial getNext() on a PipelineRunner may be very expensive so we don't // do it when batchSize is 0 since that indicates a desire for a fast return. if (runner->getNext(&next, NULL) != Runner::RUNNER_ADVANCED) { if (pin) pin->deleteUnderlying(); // make it an obvious error to use cursor or runner after this point cursor = NULL; runner = NULL; break; } if (resultsArray.len() + next.objsize() > byteLimit) { // too big. next will be the first doc in the second batch runner->pushBack(next); break; } resultsArray.append(next); } // NOTE: runner->isEOF() can have side effects such as writing by $out. However, it should // be relatively quick since if there was no pin then the input is empty. Also, this // violates the contract for batchSize==0. Sharding requires a cursor to be returned in that // case. This is ok for now however, since you can't have a sharded collection that doesn't // exist. const bool canReturnMoreBatches = pin; if (!canReturnMoreBatches && runner && !runner->isEOF()) { // msgasserting since this shouldn't be possible to trigger from today's aggregation // language. The wording assumes that the only reason pin would be null is if the // collection doesn't exist. msgasserted(17391, str::stream() << "Aggregation has more results than fit in initial batch, but can't " << "create cursor since collection " << ns << " doesn't exist"); } if (cursor) { // If a time limit was set on the pipeline, remaining time is "rolled over" to the // cursor (for use by future getmore ops). cursor->setLeftoverMaxTimeMicros( cc().curop()->getRemainingMaxTimeMicros() ); } BSONObjBuilder cursorObj(result.subobjStart("cursor")); cursorObj.append("id", cursor ? cursor->cursorid() : 0LL); cursorObj.append("ns", ns); cursorObj.append("firstBatch", resultsArray.arr()); cursorObj.done(); }
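/* Editor's sketch (illustrative; appendCursorReply() is a made-up helper name): every variant of
   handleCursorCommand above ultimately reports its first batch in the standard command-cursor
   shape { cursor: { id: <cursorId>, ns: <namespace>, firstBatch: [ ... ] } }, where id 0 means
   the result set was exhausted within the first batch. Assuming mongo's BSON builders: */
void appendCursorReply(long long cursorId, const std::string& ns,
                       const BSONArray& firstBatch, BSONObjBuilder* result) {
    BSONObjBuilder cursorObj(result->subobjStart("cursor"));
    cursorObj.append("id", cursorId);
    cursorObj.append("ns", ns);
    cursorObj.append("firstBatch", firstBatch);
    cursorObj.done();
}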