void Strategy::queryOp(OperationContext* txn, Request& request) { verify(!NamespaceString(request.getns()).isCommand()); Timer queryTimer; globalOpCounters.gotQuery(); QueryMessage q(request.d()); NamespaceString ns(q.ns); ClientBasic* client = txn->getClient(); AuthorizationSession* authSession = AuthorizationSession::get(client); Status status = authSession->checkAuthForFind(ns, false); audit::logQueryAuthzCheck(client, ns, q.query, status.code()); uassertStatusOK(status); LOG(3) << "query: " << q.ns << " " << q.query << " ntoreturn: " << q.ntoreturn << " options: " << q.queryOptions; if (q.ntoreturn == 1 && strstr(q.ns, ".$cmd")) throw UserException(8010, "something is wrong, shouldn't see a command here"); if (q.queryOptions & QueryOption_Exhaust) { uasserted(18526, string("the 'exhaust' query option is invalid for mongos queries: ") + q.ns + " " + q.query.toString()); } // Spigot which controls whether OP_QUERY style find on mongos uses the new ClusterClientCursor // code path. // TODO: Delete the spigot and always use the new code. if (useClusterClientCursor) { ReadPreferenceSetting readPreference(ReadPreference::PrimaryOnly, TagSet::primaryOnly()); BSONElement rpElem; auto readPrefExtractStatus = bsonExtractTypedField( q.query, LiteParsedQuery::kWrappedReadPrefField, mongo::Object, &rpElem); if (readPrefExtractStatus.isOK()) { auto parsedRps = ReadPreferenceSetting::fromBSON(rpElem.Obj()); uassertStatusOK(parsedRps.getStatus()); readPreference = parsedRps.getValue(); } else if (readPrefExtractStatus != ErrorCodes::NoSuchKey) { uassertStatusOK(readPrefExtractStatus); } auto canonicalQuery = CanonicalQuery::canonicalize(q, WhereCallbackNoop()); uassertStatusOK(canonicalQuery.getStatus()); // If the $explain flag was set, we must run the operation on the shards as an explain // command rather than a find command. 
if (canonicalQuery.getValue()->getParsed().isExplain()) { const LiteParsedQuery& lpq = canonicalQuery.getValue()->getParsed(); BSONObj findCommand = lpq.asFindCommand(); // We default to allPlansExecution verbosity. auto verbosity = ExplainCommon::EXEC_ALL_PLANS; const bool secondaryOk = (readPreference.pref != ReadPreference::PrimaryOnly); rpc::ServerSelectionMetadata metadata(secondaryOk, readPreference); BSONObjBuilder explainBuilder; uassertStatusOK(ClusterFind::runExplain( txn, findCommand, lpq, verbosity, metadata, &explainBuilder)); BSONObj explainObj = explainBuilder.done(); replyToQuery(0, // query result flags request.p(), request.m(), static_cast<const void*>(explainObj.objdata()), explainObj.objsize(), 1, // numResults 0, // startingFrom CursorId(0)); return; } // Do the work to generate the first batch of results. This blocks waiting to get responses // from the shard(s). std::vector<BSONObj> batch; // 0 means the cursor is exhausted and // otherwise we assume that a cursor with the returned id can be retrieved via the // ClusterCursorManager auto cursorId = ClusterFind::runQuery(txn, *canonicalQuery.getValue(), readPreference, &batch); uassertStatusOK(cursorId.getStatus()); // TODO: this constant should be shared between mongos and mongod, and should // not be inside ShardedClientCursor. BufBuilder buffer(ShardedClientCursor::INIT_REPLY_BUFFER_SIZE); // Fill out the response buffer. int numResults = 0; for (const auto& obj : batch) { buffer.appendBuf((void*)obj.objdata(), obj.objsize()); numResults++; } replyToQuery(0, // query result flags request.p(), request.m(), buffer.buf(), buffer.len(), numResults, 0, // startingFrom cursorId.getValue()); return; } QuerySpec qSpec((string)q.ns, q.query, q.fields, q.ntoskip, q.ntoreturn, q.queryOptions); // Parse "$maxTimeMS". 
StatusWith<int> maxTimeMS = LiteParsedQuery::parseMaxTimeMSQuery(q.query); uassert(17233, maxTimeMS.getStatus().reason(), maxTimeMS.isOK()); if (_isSystemIndexes(q.ns) && doShardedIndexQuery(txn, request, qSpec)) { return; } ParallelSortClusteredCursor* cursor = new ParallelSortClusteredCursor(qSpec, CommandInfo()); verify(cursor); // TODO: Move out to Request itself, not strategy based try { cursor->init(txn); if (qSpec.isExplain()) { BSONObjBuilder explain_builder; cursor->explain(explain_builder); explain_builder.appendNumber("executionTimeMillis", static_cast<long long>(queryTimer.millis())); BSONObj b = explain_builder.obj(); replyToQuery(0, request.p(), request.m(), b); delete (cursor); return; } } catch (...) { delete cursor; throw; } // TODO: Revisit all of this when we revisit the sharded cursor cache if (cursor->getNumQueryShards() != 1) { // More than one shard (or zero), manage with a ShardedClientCursor // NOTE: We may also have *zero* shards here when the returnPartial flag is set. // Currently the code in ShardedClientCursor handles this. ShardedClientCursorPtr cc(new ShardedClientCursor(q, cursor)); BufBuilder buffer(ShardedClientCursor::INIT_REPLY_BUFFER_SIZE); int docCount = 0; const int startFrom = cc->getTotalSent(); bool hasMore = cc->sendNextBatch(q.ntoreturn, buffer, docCount); if (hasMore) { LOG(5) << "storing cursor : " << cc->getId(); int cursorLeftoverMillis = maxTimeMS.getValue() - queryTimer.millis(); if (maxTimeMS.getValue() == 0) { // 0 represents "no limit". cursorLeftoverMillis = kMaxTimeCursorNoTimeLimit; } else if (cursorLeftoverMillis <= 0) { cursorLeftoverMillis = kMaxTimeCursorTimeLimitExpired; } cursorCache.store(cc, cursorLeftoverMillis); } replyToQuery(0, request.p(), request.m(), buffer.buf(), buffer.len(), docCount, startFrom, hasMore ? cc->getId() : 0); } else { // Only one shard is used // Remote cursors are stored remotely, we shouldn't need this around. 
unique_ptr<ParallelSortClusteredCursor> cursorDeleter(cursor); ShardPtr shard = grid.shardRegistry()->getShard(txn, cursor->getQueryShardId()); verify(shard.get()); DBClientCursorPtr shardCursor = cursor->getShardCursor(shard->getId()); // Implicitly stores the cursor in the cache request.reply(*(shardCursor->getMessage()), shardCursor->originalHost()); // We don't want to kill the cursor remotely if there's still data left shardCursor->decouple(); } }
/**
 * This is called by db/ops/query.cpp.  This is the entry point for answering a query.
 *
 * Obtains a Runner for 'cq', pulls results from it into a reply buffer (or, for an explain
 * query, builds a single explain document instead), optionally parks the runner in a
 * ClientCursor for later getMore calls, and fills 'result' with the OP_REPLY payload.
 *
 * @param cq      the canonical query to run; ownership is passed to getRunner().
 * @param curop   the current operation; receives the time limit and debug statistics.
 * @param result  output message that receives the reply header and documents.
 * @return the query namespace when curop.debug().exhaust is set, otherwise "".
 *
 * Throws (uasserted / SendStaleConfigException) when no runner can be built, when the runner
 * reports an error, or when the sharding version changed while the query ran.
 */
std::string newRunQuery(CanonicalQuery* cq, CurOp& curop, Message &result) {
    // This is a read lock.
    Client::ReadContext ctx(cq->ns(), storageGlobalParams.dbpath);

    // Parse, canonicalize, plan, transcribe, and get a runner.
    Runner* rawRunner;

    // Takes ownership of cq.
    Status status = getRunner(cq, &rawRunner);
    if (!status.isOK()) {
        uasserted(17007, "Couldn't process query " + cq->toString()
                         + " why: " + status.reason());
    }
    verify(NULL != rawRunner);
    // From here on the runner is deleted automatically unless it is released to a ClientCursor.
    auto_ptr<Runner> runner(rawRunner);

    QLOG() << "Running query on new system: " << cq->toString();

    // We freak out later if this changes before we're done with the query.
    const ChunkVersion shardingVersionAtStart = shardingState.getVersion(cq->ns());

    // We use this a lot below.
    const LiteParsedQuery& pq = cq->getParsed();

    // TODO: Remove when impl'd
    if (pq.hasOption(QueryOption_OplogReplay)) {
        warning() << "haven't implemented findingstartcursor yet\n";
    }

    // Handle query option $maxTimeMS (not used with commands).
    // The value is in milliseconds; CurOp takes microseconds.
    curop.setMaxTimeMicros(static_cast<unsigned long long>(pq.getMaxTimeMS()) * 1000);
    killCurrentOp.checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

    // uassert if we are not on a primary, and not a secondary with SlaveOk query parameter set.
    replVerifyReadsOk(&pq);

    // If this exists, the collection is sharded.
    // If it doesn't exist, we can assume we're not sharded.
    // If we're sharded, we might encounter data that is not consistent with our sharding state.
    // We must ignore this data.
    CollectionMetadataPtr collMetadata;
    if (!shardingState.needCollectionMetadata(pq.ns())) {
        collMetadata = CollectionMetadataPtr();
    }
    else {
        collMetadata = shardingState.getCollectionMetadata(pq.ns());
    }

    // Run the query.
    // bb is used to hold query results
    // this buffer should contain either requested documents per query or
    // explain information, but not both
    BufBuilder bb(32768);
    // Reserve room for the reply header; it is filled in at the end of this function.
    bb.skip(sizeof(QueryResult));

    // How many results have we obtained from the runner?
    int numResults = 0;

    // If we're replaying the oplog, we save the last time that we read.
    OpTime slaveReadTill;

    // Do we save the Runner in a ClientCursor for getMore calls later?
    bool saveClientCursor = false;

    // We turn on auto-yielding for the runner here.  The runner registers itself with the
    // active runners list in ClientCursor.
    ClientCursor::registerRunner(runner.get());
    runner->setYieldPolicy(Runner::YIELD_AUTO);
    // Guard object tied to the registration above; it is reset explicitly after the loop.
    auto_ptr<DeregisterEvenIfUnderlyingCodeThrows> safety(
        new DeregisterEvenIfUnderlyingCodeThrows(runner.get()));

    BSONObj obj;
    Runner::RunnerState state;
    // Number of documents skipped because their shard key does not belong to this shard.
    uint64_t numMisplacedDocs = 0;

    // set this outside loop. we will need to use this both within loop and when deciding
    // to fill in explain information
    const bool isExplain = pq.isExplain();

    while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&obj, NULL))) {
        // If we're sharded make sure that we don't return any data that hasn't been migrated
        // off of our shard yet.
        if (collMetadata) {
            // This information can change if we yield and as such we must make sure to re-fetch
            // it if we yield.
            KeyPattern kp(collMetadata->getKeyPattern());
            // This performs excessive BSONObj creation but that's OK for now.
            if (!collMetadata->keyBelongsToMe(kp.extractSingleKey(obj))) {
                ++numMisplacedDocs;
                continue;
            }
        }

        // Add result to output buffer. This is unnecessary if explain info is requested
        if (!isExplain) {
            bb.appendBuf((void*)obj.objdata(), obj.objsize());
        }

        // Count the result.
        ++numResults;

        // Possibly note slave's position in the oplog.
        if (pq.hasOption(QueryOption_OplogReplay)) {
            BSONElement e = obj["ts"];
            if (Date == e.type() || Timestamp == e.type()) {
                slaveReadTill = e._opTime();
            }
        }

        // TODO: only one type of 2d search doesn't support this.  We need a way to pull it out
        // of CanonicalQuery. :(
        // Because this is hard-coded to true, the '!supportsGetMore' branch below is never taken.
        const bool supportsGetMore = true;
        if (isExplain) {
            if (enoughForExplain(pq, numResults)) {
                break;
            }
        }
        else if (!supportsGetMore && (enough(pq, numResults)
                                      || bb.len() >= MaxBytesToReturnToClientAtOnce)) {
            break;
        }
        else if (enoughForFirstBatch(pq, numResults, bb.len())) {
            // If only one result requested assume it's a findOne() and don't save the cursor.
            if (pq.wantMore() && 1 != pq.getNumToReturn()) {
                saveClientCursor = true;
            }
            break;
        }
    }

    // If we cache the runner later, we want to deregister it as it receives notifications
    // anyway by virtue of being cached.
    //
    // If we don't cache the runner later, we are deleting it, so it must be deregistered.
    //
    // So, no matter what, deregister the runner.
    safety.reset();

    // Caller expects exceptions thrown in certain cases:
    // * in-memory sort using too much RAM.
    if (Runner::RUNNER_ERROR == state) {
        uasserted(17144, "Runner error, memory limit for sort probably exceeded");
    }

    // Why save a dead runner?
    if (Runner::RUNNER_DEAD == state) {
        saveClientCursor = false;
    }
    else if (pq.hasOption(QueryOption_CursorTailable) && (1 != pq.getNumToReturn())) {
        // If pq.hasOption(tailable) the only plan the planner will output is a collscan with
        // tailable set.
        saveClientCursor = true;
    }

    // TODO(greg): This will go away soon.
    if (!shardingState.getVersion(pq.ns()).isWriteCompatibleWith(shardingVersionAtStart)) {
        // if the version changed during the query we might be missing some data and it's safe
        // to send this as mongos can resend at this point
        throw SendStaleConfigException(pq.ns(), "version changed during initial query",
                                       shardingVersionAtStart,
                                       shardingState.getVersion(pq.ns()));
    }

    // Append explain information to query results by asking the runner to produce them.
    if (isExplain) {
        TypeExplain* bareExplain;
        Status res = runner->getExplainPlan(&bareExplain);

        if (!res.isOK()) {
            error() << "could not produce explain of query '" << pq.getFilter()
                    << "', error: " << res.reason();
            // If numResults and the data in bb don't correspond, we'll crash later when rooting
            // through the reply msg.
            BSONObj emptyObj;
            bb.appendBuf((void*)emptyObj.objdata(), emptyObj.objsize());
            // The explain output is actually a result.
            numResults = 1;
            // TODO: we can fill out millis etc. here just fine even if the plan screwed up.
        }
        else {
            // Takes over the explain object produced by the runner so it is freed on scope exit.
            boost::scoped_ptr<TypeExplain> explain(bareExplain);

            // Fill in the missing run-time fields in explain, starting with properties of
            // the process running the query.
            std::string server = mongoutils::str::stream()
                << getHostNameCached() << ":" << serverGlobalParams.port;
            explain->setServer(server);

            // Fill in the number of documents consumed that were involved in an ongoing
            // (or aborted) migration.
            explain->setNChunkSkips(numMisplacedDocs);

            // We might have skipped some results due to chunk migration etc. so our count is
            // correct and explain's is not.
            explain->setN(numResults);

            // Clock the whole operation.
            explain->setMillis(curop.elapsedMillis());

            BSONObj explainObj = explain->toBSON();
            bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize());

            // The explain output is actually a result.
            numResults = 1;
        }
    }

    // A cursor id of 0 in the reply means no cursor was saved.
    long long ccId = 0;
    if (saveClientCursor) {
        // We won't use the runner until it's getMore'd.
        runner->saveState();

        // Allocate a new ClientCursor.  We don't have to worry about leaking it as it's
        // inserted into a global map by its ctor.
        ClientCursor* cc = new ClientCursor(runner.get(), cq->getParsed().getOptions(),
                                            cq->getParsed().getFilter());
        ccId = cc->cursorid();

        QLOG() << "caching runner with cursorid " << ccId
               << " after returning " << numResults << " results" << endl;

        // ClientCursor takes ownership of runner.  Release to make sure it's not deleted.
        runner.release();

        // TODO document
        if (pq.hasOption(QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
            cc->slaveReadTill(slaveReadTill);
        }

        // TODO document
        if (pq.hasOption(QueryOption_Exhaust)) {
            curop.debug().exhaust = true;
        }

        // Set attributes for getMore.
        cc->setCollMetadata(collMetadata);
        cc->setPos(numResults);

        // If the query had a time limit, remaining time is "rolled over" to the cursor (for
        // use by future getmore ops).
        cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros());
    }

    // Add the results from the query into the output buffer.
    result.appendData(bb.buf(), bb.len());
    // Detach the buffer from bb after it has been handed to 'result'.
    bb.decouple();

    // Fill out the output buffer's header.
    QueryResult* qr = static_cast<QueryResult*>(result.header());
    qr->cursorId = ccId;
    // The debug record uses -1, not 0, to mean "no cursor".
    curop.debug().cursorid = (0 == ccId ? -1 : ccId);
    qr->setResultFlagsToOk();
    qr->setOperation(opReply);
    qr->startingFrom = 0;
    qr->nReturned = numResults;

    // TODO: nscanned is bogus.
    // curop.debug().nscanned = ( cursor ? cursor->nscanned() : 0LL );
    curop.debug().ntoskip = pq.getSkip();
    curop.debug().nreturned = numResults;

    // curop.debug().exhaust is set above.
    return curop.debug().exhaust ? pq.ns() : "";
}
static bool receivedQuery(Client& c, DbResponse& dbresponse, Message& m ) { bool ok = true; MSGID responseTo = m.header()->id; DbMessage d(m); QueryMessage q(d); auto_ptr< Message > resp( new Message() ); CurOp& op = *(c.curop()); shared_ptr<AssertionException> ex; try { dbresponse.exhaust = runQuery(m, q, op, *resp); assert( !resp->empty() ); } catch ( SendStaleConfigException& e ){ ex.reset( new SendStaleConfigException( e.getns(), e.getInfo().msg ) ); ok = false; } catch ( AssertionException& e ) { ex.reset( new AssertionException( e.getInfo().msg, e.getCode() ) ); ok = false; } if( ex ){ op.debug().exceptionInfo = ex->getInfo(); LOGWITHRATELIMIT { log() << "assertion " << ex->toString() << " ns:" << q.ns << " query:" << (q.query.valid() ? q.query.toString() : "query object is corrupt") << endl; if( q.ntoskip || q.ntoreturn ) log() << " ntoskip:" << q.ntoskip << " ntoreturn:" << q.ntoreturn << endl; } SendStaleConfigException* scex = NULL; if ( ex->getCode() == SendStaleConfigCode ) scex = static_cast<SendStaleConfigException*>( ex.get() ); BSONObjBuilder err; ex->getInfo().append( err ); if( scex ) err.append( "ns", scex->getns() ); BSONObj errObj = err.done(); log() << errObj << endl; BufBuilder b; b.skip(sizeof(QueryResult)); b.appendBuf((void*) errObj.objdata(), errObj.objsize()); // todo: call replyToQuery() from here instead of this!!! see dbmessage.h QueryResult * msgdata = (QueryResult *) b.buf(); b.decouple(); QueryResult *qr = msgdata; qr->_resultFlags() = ResultFlag_ErrSet; if( scex ) qr->_resultFlags() |= ResultFlag_ShardConfigStale; qr->len = b.len(); qr->setOperation(opReply); qr->cursorId = 0; qr->startingFrom = 0; qr->nReturned = 1; resp.reset( new Message() ); resp->setData( msgdata, true ); } op.debug().responseLength = resp->header()->dataLen(); dbresponse.response = resp.release(); dbresponse.responseTo = responseTo; return ok; }
/**
 * Consumes the current batch of documents from the remote cursor and copies each valid one
 * into the local database.
 *
 * Index specs (when 'isindex' is set) are rewritten with fixindex() and queued in
 * 'indexesToBuild' instead of being inserted. Corrupt documents are reported and skipped.
 * Runs under the global write lock, with a periodic interrupt check and optional yield.
 */
void operator()( DBClientCursorBatchIterator &i ) {
    Lock::GlobalWrite globalWriteLock;
    context.relocked();

    bool createdCollection = false;
    Collection* collection = NULL;

    for ( ; i.moreInCurrentBatch(); ) {
        // Every 128th document: report progress, allow interruption and maybe yield.
        const bool timeToYield = ( numSeen % 128 == 127 );
        if ( timeToYield ) {
            // The collection pointer is looked up again after a possible yield.
            collection = NULL;

            time_t currentTime = time(0);
            if ( currentTime - lastLog >= 60 ) {
                // report progress
                if ( lastLog ) {
                    log() << "clone " << to_collection << ' ' << numSeen << endl;
                }
                lastLog = currentTime;
            }
            mayInterrupt( _mayBeInterrupted );
            dbtempreleaseif yieldScope( _mayYield );
        }

        // Resolve (or create, once) the destination collection for regular documents.
        if ( !isindex && !collection ) {
            collection = context.db()->getCollection( to_collection );
            if ( !collection ) {
                massert( 17321,
                         str::stream()
                         << "collection dropped during clone ["
                         << to_collection << "]",
                         !createdCollection );
                createdCollection = true;
                collection = context.db()->createCollection( txn, to_collection );
                verify( collection );
            }
        }

        BSONObj fetched = i.nextSafe();

        /* assure object is valid.  note this will slow us down a little. */
        const Status validity = validateBSON(fetched.objdata(), fetched.objsize());
        if ( !validity.isOK() ) {
            out() << "Cloner: skipping corrupt object from " << from_collection
                  << ": " << validity.reason();
            continue;
        }

        ++numSeen;

        if ( isindex ) {
            // Index specs are collected for a later build rather than inserted here.
            verify(nsToCollectionSubstring(from_collection) == "system.indexes");
            BSONObj fixedSpec = fixindex(context.db()->name(), fetched);
            indexesToBuild->push_back( fixedSpec.getOwned() );
            continue;
        }

        verify(nsToCollectionSubstring(from_collection) != "system.indexes");

        BSONObj doc = fetched;
        StatusWith<DiskLoc> insertResult = collection->insertDocument( txn, doc, true );
        if ( !insertResult.isOK() ) {
            error() << "error: exception cloning object in " << from_collection
                    << ' ' << insertResult.toString() << " obj:" << doc;
        }
        uassertStatusOK( insertResult.getStatus() );

        if ( logForRepl ) {
            logOp(txn, "i", to_collection, doc);
        }

        getDur().commitIfNeeded();

        RARELY if ( time( 0 ) - saveLast > 60 ) {
            log() << numSeen << " objects cloned so far from collection " << from_collection;
            saveLast = time( 0 );
        }
    }
}
void Strategy::queryOp(OperationContext* txn, Request& request) { verify(!NamespaceString(request.getns()).isCommand()); globalOpCounters.gotQuery(); QueryMessage q(request.d()); NamespaceString ns(q.ns); ClientBasic* client = txn->getClient(); AuthorizationSession* authSession = AuthorizationSession::get(client); Status status = authSession->checkAuthForFind(ns, false); audit::logQueryAuthzCheck(client, ns, q.query, status.code()); uassertStatusOK(status); LOG(3) << "query: " << q.ns << " " << q.query << " ntoreturn: " << q.ntoreturn << " options: " << q.queryOptions; if (q.ntoreturn == 1 && strstr(q.ns, ".$cmd")) throw UserException(8010, "something is wrong, shouldn't see a command here"); if (q.queryOptions & QueryOption_Exhaust) { uasserted(18526, string("the 'exhaust' query option is invalid for mongos queries: ") + q.ns + " " + q.query.toString()); } // Determine the default read preference mode based on the value of the slaveOk flag. ReadPreference readPreferenceOption = (q.queryOptions & QueryOption_SlaveOk) ? ReadPreference::SecondaryPreferred : ReadPreference::PrimaryOnly; ReadPreferenceSetting readPreference(readPreferenceOption, TagSet()); BSONElement rpElem; auto readPrefExtractStatus = bsonExtractTypedField( q.query, LiteParsedQuery::kWrappedReadPrefField, mongo::Object, &rpElem); if (readPrefExtractStatus.isOK()) { auto parsedRps = ReadPreferenceSetting::fromBSON(rpElem.Obj()); uassertStatusOK(parsedRps.getStatus()); readPreference = parsedRps.getValue(); } else if (readPrefExtractStatus != ErrorCodes::NoSuchKey) { uassertStatusOK(readPrefExtractStatus); } auto canonicalQuery = CanonicalQuery::canonicalize(q, ExtensionsCallbackNoop()); uassertStatusOK(canonicalQuery.getStatus()); // If the $explain flag was set, we must run the operation on the shards as an explain command // rather than a find command. 
if (canonicalQuery.getValue()->getParsed().isExplain()) { const LiteParsedQuery& lpq = canonicalQuery.getValue()->getParsed(); BSONObj findCommand = lpq.asFindCommand(); // We default to allPlansExecution verbosity. auto verbosity = ExplainCommon::EXEC_ALL_PLANS; const bool secondaryOk = (readPreference.pref != ReadPreference::PrimaryOnly); rpc::ServerSelectionMetadata metadata(secondaryOk, readPreference); BSONObjBuilder explainBuilder; uassertStatusOK( Strategy::explainFind(txn, findCommand, lpq, verbosity, metadata, &explainBuilder)); BSONObj explainObj = explainBuilder.done(); replyToQuery(0, // query result flags request.p(), request.m(), static_cast<const void*>(explainObj.objdata()), explainObj.objsize(), 1, // numResults 0, // startingFrom CursorId(0)); return; } // Do the work to generate the first batch of results. This blocks waiting to get responses from // the shard(s). std::vector<BSONObj> batch; // 0 means the cursor is exhausted. Otherwise we assume that a cursor with the returned id can // be retrieved via the ClusterCursorManager. auto cursorId = ClusterFind::runQuery(txn, *canonicalQuery.getValue(), readPreference, &batch); uassertStatusOK(cursorId.getStatus()); // Fill out the response buffer. int numResults = 0; OpQueryReplyBuilder reply; for (auto&& obj : batch) { obj.appendSelfToBufBuilder(reply.bufBuilderForResults()); numResults++; } reply.send(request.p(), 0, // query result flags request.m(), numResults, 0, // startingFrom cursorId.getValue()); }
/* Fast path for updating the single document matched through the _id index.

   note: this is only (as-is) called for
         - not multi
         - not mods is indexed
         - not upsert

   Returns UpdateResult(0,0,0) when no document matches, UpdateResult(1,1,1) after an
   operator ($-style) update and UpdateResult(1,0,1) after a full-document replacement.
   May throw PageFaultException when the record is unlikely to be in physical memory and the
   current client allows it.
*/
static UpdateResult _updateById(bool isOperatorUpdate,
                                int idIdxNo,
                                ModSet* mods,
                                NamespaceDetails* d,
                                NamespaceDetailsTransient *nsdt,
                                bool su,
                                const char* ns,
                                const BSONObj& updateobj,
                                BSONObj patternOrig,
                                bool logop,
                                OpDebug& debug,
                                bool fromMigrate = false) {

    // Locate the target through the _id index.
    IndexDetails& idIndex = d->idx(idIdxNo);
    BSONObj idKey = idIndex.getKeyFromQuery( patternOrig );
    DiskLoc loc = QueryRunner::fastFindSingle(idIndex, idKey);
    if ( loc.isNull() ) {
        // no upsert support in _updateById yet, so we are done.
        return UpdateResult( 0 , 0 , 0 , BSONObj() );
    }

    Record* rec = loc.rec();

    if ( cc().allowedToThrowPageFaultException() && ! rec->likelyInPhysicalMemory() ) {
        throw PageFaultException( rec );
    }

    if ( ! isOperatorUpdate ) {
        // regular update: the stored document is replaced by 'updateobj' as a whole.
        BSONElementManipulator::lookForTimestamps( updateobj );
        checkNoMods( updateobj );
        verify(nsdt);
        theDataFileMgr.updateRecord(ns, d, nsdt, rec, loc,
                                    updateobj.objdata(), updateobj.objsize(), debug );
        if ( logop ) {
            logOp("u", ns, updateobj, &patternOrig, 0, fromMigrate, &updateobj );
        }
        return UpdateResult( 1 , 0 , 1 , BSONObj() );
    }

    /* look for $inc etc.  note as listed here, all fields to inc must be this type, you can't
       set some regular ones at the moment. */
    const BSONObj& stored = loc.obj();
    auto_ptr<ModSetState> modState = mods->prepare( stored, false /* not an insertion */ );

    BSONObj newObj;
    if ( ! modState->canApplyInPlace() ) {
        // The modified document has to be rebuilt and written back as a new record image.
        newObj = modState->createNewFromMods();
        checkTooLarge(newObj);
        verify(nsdt);
        theDataFileMgr.updateRecord(ns, d, nsdt, rec, loc,
                                    newObj.objdata(), newObj.objsize(), debug);
    }
    else {
        modState->applyModsInPlace(true);
        debug.fastmod = true;
        DEBUGUPDATE( "\t\t\t updateById doing in place update" );
        newObj = stored;
    }

    if ( logop ) {
        DEV verify( mods->size() );
        BSONObj pattern = patternOrig;
        BSONObj logObj = modState->getOpLogRewrite();
        DEBUGUPDATE( "\t rewrite update: " << logObj );

        // The whole mod set may have been a no-op for this document, which yields an empty
        // log record. Logging an empty record would replicate as "clear this record", so in
        // that case nothing is logged and the replica sees a no-op as well.
        if ( logObj.nFields() ) {
            logOp("u", ns, logObj, &pattern, 0, fromMigrate, &newObj );
        }
    }
    return UpdateResult( 1 , 1 , 1 , BSONObj() );
}
// PD_TRACE_DECLARE_FUNCTION ( SDB__DMSROUNIT_INSRCD, "_dmsReorgUnit::insertRecord" )
// Appends one record holding 'obj' to the current extent of the reorg unit.
//
// obj:        the document to store.
// cb:         passed through to dmsCompress() when compression is requested.
// attributes: bit flags; DMS_MB_ATTR_COMPRESSED requests compression of the record.
//
// Returns SDB_OK on success, SDB_CORRUPTED_RECORD when the document plus record metadata
// would exceed DMS_RECORD_MAX_SZ, or the error code from compression, extent allocation or
// extent flushing.
INT32 _dmsReorgUnit::insertRecord ( BSONObj &obj, _pmdEDUCB *cb,
                                    UINT32 attributes )
{
   INT32 rc                     = SDB_OK ;
   PD_TRACE_ENTRY ( SDB__DMSROUNIT_INSRCD );
   UINT32 dmsrecordSize         = 0 ;
   ossValuePtr recordPtr        = 0 ;
   ossValuePtr prevPtr          = 0 ;
   dmsOffset offset             = DMS_INVALID_OFFSET ;
   dmsOffset recordOffset       = DMS_INVALID_OFFSET ;
   dmsExtent *currentExtent     = (dmsExtent*)_pCurrentExtent ;
   BOOLEAN isCompressed         = FALSE ;
   const CHAR *compressedData   = NULL ;
   INT32 compressedDataSize     = 0 ;

   // Reject documents that cannot fit in a single record even before overflow padding.
   if ( obj.objsize() + DMS_RECORD_METADATA_SZ > DMS_RECORD_MAX_SZ )
   {
      rc = SDB_CORRUPTED_RECORD ;
      goto error ;
   }

   if ( OSS_BIT_TEST ( attributes, DMS_MB_ATTR_COMPRESSED ) )
   {
      rc = dmsCompress ( cb, obj, NULL, 0, &compressedData,
                         &compressedDataSize ) ;
      PD_RC_CHECK ( rc, PDERROR,
                    "Failed to compress record, rc = %d: %s",
                    rc, obj.toString().c_str() ) ;
      // Compressed payload plus an extra INT32 (presumably a length prefix -- TODO confirm).
      dmsrecordSize = compressedDataSize + sizeof(INT32) ;
      // Keep the compressed form only when it is not larger than the raw document.
      if ( dmsrecordSize > (UINT32)(obj.objsize()) )
      {
         dmsrecordSize = obj.objsize() ;
      }
      else
      {
         isCompressed = TRUE ;
      }
   }
   else
   {
      dmsrecordSize = obj.objsize() ;
   }

   // Final record size: payload + metadata, scaled by the overflow ratio, aligned to 4 and
   // capped at the maximum record size.
   dmsrecordSize += DMS_RECORD_METADATA_SZ ;
   dmsrecordSize *= DMS_RECORD_OVERFLOW_RATIO ;
   dmsrecordSize = OSS_MIN(DMS_RECORD_MAX_SZ, ossAlignX(dmsrecordSize,4)) ;

alloc:
   // Make sure there is a current extent; control returns here after a flush.
   if ( !_pCurrentExtent )
   {
      rc = _allocateExtent ( dmsrecordSize <<
                             DMS_RECORDS_PER_EXTENT_SQUARE ) ;
      if ( rc )
      {
         PD_LOG ( PDERROR, "Failed to allocate new extent in reorg file, "
                  "rc = %d", rc ) ;
         goto error ;
      }
      currentExtent = (dmsExtent*)_pCurrentExtent ;
   }

   // Not enough room left: flush the extent and retry from the allocation step.
   if ( dmsrecordSize > (UINT32)currentExtent->_freeSpace )
   {
      rc = _flushExtent () ;
      if ( rc )
      {
         PD_LOG ( PDERROR, "Failed to flush extent, rc = %d", rc ) ;
         goto error ;
      }
      goto alloc ;
   }

   // The new record goes at the start of the extent's free space.
   recordOffset = _currentExtentSize - currentExtent->_freeSpace ;
   recordPtr = ((ossValuePtr)currentExtent) + recordOffset ;

   // If the space left after this record would be smaller than a minimal record, let this
   // record take all of the remaining free space (as long as that stays within the maximum
   // record size).
   if ( currentExtent->_freeSpace - (INT32)dmsrecordSize <
        (INT32)DMS_MIN_RECORD_SZ &&
        currentExtent->_freeSpace <= (INT32)DMS_RECORD_MAX_SZ )
   {
      dmsrecordSize = (UINT32)currentExtent->_freeSpace ;
   }

   // Write the record header and payload.
   DMS_RECORD_SETSTATE ( recordPtr, DMS_RECORD_FLAG_NORMAL ) ;
   DMS_RECORD_RESETATTR ( recordPtr ) ;
   DMS_RECORD_SETMYOFFSET ( recordPtr, recordOffset ) ;
   DMS_RECORD_SETSIZE ( recordPtr, dmsrecordSize ) ;
   if ( isCompressed )
   {
      DMS_RECORD_SETATTR ( recordPtr, DMS_RECORD_FLAG_COMPRESSED ) ;
      DMS_RECORD_SETDATA ( recordPtr, compressedData, compressedDataSize ) ;
   }
   else
   {
      DMS_RECORD_SETDATA ( recordPtr, obj.objdata(), obj.objsize() ) ;
   }
   DMS_RECORD_SETNEXTOFFSET ( recordPtr, DMS_INVALID_OFFSET ) ;
   DMS_RECORD_SETPREVOFFSET ( recordPtr, DMS_INVALID_OFFSET ) ;

   // Update extent bookkeeping and link the record after the previous last record.
   currentExtent->_recCount ++ ;
   currentExtent->_freeSpace -= dmsrecordSize ;
   offset = currentExtent->_lastRecordOffset ;
   if ( DMS_INVALID_OFFSET != offset )
   {
      prevPtr = ((ossValuePtr)currentExtent) + offset ;
      DMS_RECORD_SETNEXTOFFSET ( prevPtr, recordOffset ) ;
      DMS_RECORD_SETPREVOFFSET ( recordPtr, offset ) ;
   }
   currentExtent->_lastRecordOffset = recordOffset ;

   // The first record written to an extent also becomes its first-record offset.
   offset = currentExtent->_firstRecordOffset ;
   if ( DMS_INVALID_OFFSET == offset )
   {
      currentExtent->_firstRecordOffset = recordOffset ;
   }

done :
   PD_TRACE_EXITRC ( SDB__DMSROUNIT_INSRCD, rc );
   return rc ;
error :
   goto done ;
}
// Dispatches an incoming message to the handler that matches its opCode.
//
// Unknown opCodes are logged and answered directly with an error reply carrying
// SDB_UNKNOWN_MESSAGE. Any other non-zero handler result is logged as a warning.
// Returns the handler's result code, or SDB_UNKNOWN_MESSAGE for an unknown opCode.
INT32 catMainController::_processMsg( const NET_HANDLE &handle,
                                      MsgHeader *pMsg )
{
   INT32 rc = SDB_OK ;

   switch ( pMsg->opCode )
   {
      // Generic query / context handling
      case MSG_BS_QUERY_REQ:
         rc = _processQueryMsg( handle, pMsg );
         break;
      case MSG_BS_GETMORE_REQ :
         rc = _processGetMoreMsg( handle, pMsg ) ;
         break ;
      case MSG_BS_KILL_CONTEXT_REQ:
         rc = _processKillContext( handle, pMsg ) ;
         break;
      case MSG_BS_INTERRUPTE :
         rc = _processInterruptMsg( handle, pMsg ) ;
         break ;
      case MSG_BS_DISCONNECT :
         rc = _processDisconnectMsg( handle, pMsg ) ;
         break ;

      // Catalog queries
      case MSG_CAT_QUERY_DATA_GRP_REQ :
         rc = _processQueryDataGrp( handle, pMsg ) ;
         break ;
      case MSG_CAT_QUERY_COLLECTIONS_REQ :
         rc = _processQueryCollections( handle, pMsg ) ;
         break ;
      case MSG_CAT_QUERY_COLLECTIONSPACES_REQ :
         rc = _processQueryCollectionSpaces ( handle, pMsg ) ;
         break ;

      // Authentication; user creation and deletion first set the image-command flag
      case MSG_AUTH_VERIFY_REQ :
         rc = _processAuthenticate( handle, pMsg ) ;
         break ;
      case MSG_AUTH_CRTUSR_REQ :
         _pCatCB->getCatDCMgr()->setImageCommand( TRUE ) ;
         rc = _processAuthCrt( handle, pMsg ) ;
         break ;
      case MSG_AUTH_DELUSR_REQ :
         _pCatCB->getCatDCMgr()->setImageCommand( TRUE ) ;
         rc = _processAuthDel( handle, pMsg ) ;
         break ;

      case MSG_COOR_CHECK_ROUTEID_REQ :
         rc = _processCheckRouteID( handle, pMsg ) ;
         break;

      default :
      {
         PD_LOG( PDERROR, "Recieve unknow msg[opCode:(%d)%d, len: %d, "
                 "tid: %d, reqID: %lld, nodeID: %u.%u.%u]",
                 IS_REPLY_TYPE(pMsg->opCode),
                 GET_REQUEST_TYPE(pMsg->opCode),
                 pMsg->messageLength, pMsg->TID, pMsg->requestID,
                 pMsg->routeID.columns.groupID,
                 pMsg->routeID.columns.nodeID,
                 pMsg->routeID.columns.serviceID ) ;
         rc = SDB_UNKNOWN_MESSAGE ;

         // Answer the sender right away with an error document.
         BSONObj errObj = utilGetErrorBson( rc,
                                            _pEDUCB->getInfo( EDU_INFO_ERROR ) ) ;
         MsgOpReply errReply ;
         errReply.header.opCode = MAKE_REPLY_TYPE( pMsg->opCode ) ;
         errReply.header.messageLength = sizeof( MsgOpReply ) + errObj.objsize() ;
         errReply.header.requestID = pMsg->requestID ;
         errReply.header.routeID.value = 0 ;
         errReply.header.TID = pMsg->TID ;
         errReply.flags = rc ;
         errReply.contextID = -1 ;
         errReply.numReturned = 1 ;
         errReply.startFrom = 0 ;

         _pCatCB->netWork()->syncSend( handle, (MsgHeader*)&errReply,
                                       (void*)errObj.objdata(),
                                       errObj.objsize() ) ;
         break ;
      }
   }

   // The unknown-message case has already been logged above.
   if ( 0 != rc && SDB_UNKNOWN_MESSAGE != rc )
   {
      PD_LOG( PDWARNING, "Process msg[opCode:(%d)%d, len: %d, tid: %d, "
              "reqID: %lld, nodeID: %u.%u.%u] failed, rc: %d",
              IS_REPLY_TYPE(pMsg->opCode), GET_REQUEST_TYPE(pMsg->opCode),
              pMsg->messageLength, pMsg->TID, pMsg->requestID,
              pMsg->routeID.columns.groupID, pMsg->routeID.columns.nodeID,
              pMsg->routeID.columns.serviceID, rc ) ;
   }
   return rc ;
}
std::string runQuery(OperationContext* opCtx, QueryMessage& q, const NamespaceString& nss, Message& result) { CurOp& curOp = *CurOp::get(opCtx); curOp.ensureStarted(); uassert(ErrorCodes::InvalidNamespace, str::stream() << "Invalid ns [" << nss.ns() << "]", nss.isValid()); invariant(!nss.isCommand()); // Set CurOp information. const auto upconvertedQuery = upconvertQueryEntry(q.query, nss, q.ntoreturn, q.ntoskip); beginQueryOp(opCtx, nss, upconvertedQuery, q.ntoreturn, q.ntoskip); // Parse the qm into a CanonicalQuery. const boost::intrusive_ptr<ExpressionContext> expCtx; auto cq = uassertStatusOKWithContext( CanonicalQuery::canonicalize(opCtx, q, expCtx, ExtensionsCallbackReal(opCtx, &nss), MatchExpressionParser::kAllowAllSpecialFeatures), "Can't canonicalize query"); invariant(cq.get()); LOG(5) << "Running query:\n" << redact(cq->toString()); LOG(2) << "Running query: " << redact(cq->toStringShort()); // Parse, canonicalize, plan, transcribe, and get a plan executor. AutoGetCollectionForReadCommand ctx(opCtx, nss, AutoGetCollection::ViewMode::kViewsForbidden); Collection* const collection = ctx.getCollection(); { const QueryRequest& qr = cq->getQueryRequest(); // Allow the query to run on secondaries if the read preference permits it. If no read // preference was specified, allow the query to run iff slaveOk has been set. const bool slaveOK = qr.hasReadPref() ? uassertStatusOK(ReadPreferenceSetting::fromContainingBSON(q.query)) .canRunOnSecondary() : qr.isSlaveOk(); uassertStatusOK( repl::ReplicationCoordinator::get(opCtx)->checkCanServeReadsFor(opCtx, nss, slaveOK)); } // We have a parsed query. Time to get the execution plan for it. auto exec = uassertStatusOK(getExecutorLegacyFind(opCtx, collection, nss, std::move(cq))); const QueryRequest& qr = exec->getCanonicalQuery()->getQueryRequest(); // If it's actually an explain, do the explain and return rather than falling through // to the normal query execution loop. 
if (qr.isExplain()) { BufBuilder bb; bb.skip(sizeof(QueryResult::Value)); BSONObjBuilder explainBob; Explain::explainStages( exec.get(), collection, ExplainOptions::Verbosity::kExecAllPlans, &explainBob); // Add the resulting object to the return buffer. BSONObj explainObj = explainBob.obj(); bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize()); // Set query result fields. QueryResult::View qr = bb.buf(); qr.setResultFlagsToOk(); qr.msgdata().setLen(bb.len()); curOp.debug().responseLength = bb.len(); qr.msgdata().setOperation(opReply); qr.setCursorId(0); qr.setStartingFrom(0); qr.setNReturned(1); result.setData(bb.release()); return ""; } // Handle query option $maxTimeMS (not used with commands). if (qr.getMaxTimeMS() > 0) { uassert(40116, "Illegal attempt to set operation deadline within DBDirectClient", !opCtx->getClient()->isInDirectClient()); opCtx->setDeadlineAfterNowBy(Milliseconds{qr.getMaxTimeMS()}); } opCtx->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point. // Run the query. // bb is used to hold query results // this buffer should contain either requested documents per query or // explain information, but not both BufBuilder bb(FindCommon::kInitReplyBufferSize); bb.skip(sizeof(QueryResult::Value)); // How many results have we obtained from the executor? int numResults = 0; BSONObj obj; PlanExecutor::ExecState state; // Get summary info about which plan the executor is using. { stdx::lock_guard<Client> lk(*opCtx->getClient()); curOp.setPlanSummary_inlock(Explain::getPlanSummary(exec.get())); } while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) { // If we can't fit this result inside the current batch, then we stash it for later. if (!FindCommon::haveSpaceForNext(obj, numResults, bb.len())) { exec->enqueue(obj); break; } // Add result to output buffer. bb.appendBuf((void*)obj.objdata(), obj.objsize()); // Count the result. 
++numResults; if (FindCommon::enoughForFirstBatch(qr, numResults)) { LOG(5) << "Enough for first batch, wantMore=" << qr.wantMore() << " ntoreturn=" << qr.getNToReturn().value_or(0) << " numResults=" << numResults; break; } } // Caller expects exceptions thrown in certain cases. if (PlanExecutor::FAILURE == state || PlanExecutor::DEAD == state) { error() << "Plan executor error during find: " << PlanExecutor::statestr(state) << ", stats: " << redact(Explain::getWinningPlanStats(exec.get())); uassertStatusOKWithContext(WorkingSetCommon::getMemberObjectStatus(obj), "Executor error during OP_QUERY find"); MONGO_UNREACHABLE; } // Before saving the cursor, ensure that whatever plan we established happened with the expected // collection version auto css = CollectionShardingState::get(opCtx, nss); css->checkShardVersionOrThrow(opCtx); // Fill out CurOp based on query results. If we have a cursorid, we will fill out CurOp with // this cursorid later. long long ccId = 0; if (shouldSaveCursor(opCtx, collection, state, exec.get())) { // We won't use the executor until it's getMore'd. exec->saveState(); exec->detachFromOperationContext(); // Allocate a new ClientCursor and register it with the cursor manager. ClientCursorPin pinnedCursor = collection->getCursorManager()->registerCursor( opCtx, {std::move(exec), nss, AuthorizationSession::get(opCtx->getClient())->getAuthenticatedUserNames(), opCtx->recoveryUnit()->getReadConcernLevel(), upconvertedQuery}); ccId = pinnedCursor.getCursor()->cursorid(); LOG(5) << "caching executor with cursorid " << ccId << " after returning " << numResults << " results"; // TODO document if (qr.isExhaust()) { curOp.debug().exhaust = true; } pinnedCursor.getCursor()->setPos(numResults); // We assume that cursors created through a DBDirectClient are always used from their // original OperationContext, so we do not need to move time to and from the cursor. 
if (!opCtx->getClient()->isInDirectClient()) { // If the query had a time limit, remaining time is "rolled over" to the cursor (for // use by future getmore ops). pinnedCursor.getCursor()->setLeftoverMaxTimeMicros(opCtx->getRemainingMaxTimeMicros()); } endQueryOp(opCtx, collection, *pinnedCursor.getCursor()->getExecutor(), numResults, ccId); } else { LOG(5) << "Not caching executor but returning " << numResults << " results."; endQueryOp(opCtx, collection, *exec, numResults, ccId); } // Fill out the output buffer's header. QueryResult::View queryResultView = bb.buf(); queryResultView.setCursorId(ccId); queryResultView.setResultFlagsToOk(); queryResultView.msgdata().setLen(bb.len()); queryResultView.msgdata().setOperation(opReply); queryResultView.setStartingFrom(0); queryResultView.setNReturned(numResults); // Add the results from the query into the output buffer. result.setData(bb.release()); // curOp.debug().exhaust is set above. return curOp.debug().exhaust ? nss.ns() : ""; }
// PD_TRACE_DECLARE_FUNCTION ( SDB__DMSSTORAGELOADEXT__IMPRTBLOCK, "dmsStorageLoadOp::pushToTempDataBlock" ) INT32 dmsStorageLoadOp::pushToTempDataBlock ( dmsMBContext *mbContext, pmdEDUCB *cb, BSONObj &record, BOOLEAN isLast, BOOLEAN isAsynchr ) { INT32 rc = SDB_OK ; PD_TRACE_ENTRY ( SDB__DMSSTORAGELOADEXT__IMPRTBLOCK ); UINT32 dmsrecordSize = 0 ; dmsRecord *pRecord = NULL ; dmsRecord *pPreRecord = NULL ; dmsOffset offset = DMS_INVALID_OFFSET ; dmsOffset recordOffset = DMS_INVALID_OFFSET ; _IDToInsert oid ; idToInsertEle oidEle((CHAR*)(&oid)) ; CHAR *pNewRecordData = NULL ; dmsRecordData recordData ; dmsCompressorEntry *compressorEntry = NULL ; SDB_ASSERT( mbContext, "mb context can't be NULL" ) ; compressorEntry = _su->data()->getCompressorEntry( mbContext->mbID() ) ; /* For concurrency protection with drop CL and set compresor. */ dmsCompressorGuard compGuard( compressorEntry, SHARED ) ; try { recordData.setData( record.objdata(), record.objsize(), FALSE, TRUE ) ; /* (0) */ BSONElement ele = record.getField ( DMS_ID_KEY_NAME ) ; const CHAR *pCheckErr = "" ; if ( !dmsIsRecordIDValid( ele, TRUE, &pCheckErr ) ) { PD_LOG( PDERROR, "Record[%s] _id is error: %s", record.toString().c_str(), pCheckErr ) ; rc = SDB_INVALIDARG ; goto error ; } if ( ele.eoo() ) { oid._oid.init() ; rc = cb->allocBuff( oidEle.size() + record.objsize(), &pNewRecordData ) ; if ( rc ) { PD_LOG( PDERROR, "Alloc memory[size:%u] failed, rc: %d", oidEle.size() + record.objsize(), rc ) ; goto error ; } *(UINT32*)pNewRecordData = oidEle.size() + record.objsize() ; ossMemcpy( pNewRecordData + sizeof(UINT32), oidEle.rawdata(), oidEle.size() ) ; ossMemcpy( pNewRecordData + sizeof(UINT32) + oidEle.size(), record.objdata() + sizeof(UINT32), record.objsize() - sizeof(UINT32) ) ; recordData.setData( pNewRecordData, oidEle.size() + record.objsize(), FALSE, TRUE ) ; record = BSONObj( pNewRecordData ) ; } dmsrecordSize = recordData.len() ; if ( recordData.len() + DMS_RECORD_METADATA_SZ > DMS_RECORD_USER_MAX_SZ 
) { rc = SDB_DMS_RECORD_TOO_BIG ; goto error ; } if ( compressorEntry->ready() ) { const CHAR *compressedData = NULL ; INT32 compressedDataSize = 0 ; UINT8 compressRatio = 0 ; rc = dmsCompress( cb, compressorEntry, recordData.data(), recordData.len(), &compressedData, &compressedDataSize, compressRatio ) ; if ( SDB_OK == rc && compressedDataSize + sizeof(UINT32) < recordData.orgLen() && compressRatio < DMS_COMPRESS_RATIO_THRESHOLD ) { dmsrecordSize = compressedDataSize + sizeof(UINT32) ; recordData.setData( compressedData, compressedDataSize, TRUE, FALSE ) ; } else if ( rc ) { if ( SDB_UTIL_COMPRESS_ABORT == rc ) { PD_LOG( PDINFO, "Record compression aborted. " "Insert the original data. rc: %d", rc ) ; } else { PD_LOG( PDWARNING, "Record compression failed. " "Insert the original data. rc: %d", rc ) ; } rc = SDB_OK ; } } /* * Release the guard to avoid deadlock with truncate/drop collection. */ compGuard.release() ; dmsrecordSize *= DMS_RECORD_OVERFLOW_RATIO ; dmsrecordSize += DMS_RECORD_METADATA_SZ ; dmsrecordSize = OSS_MIN( DMS_RECORD_MAX_SZ, ossAlignX ( dmsrecordSize, 4 ) ) ; INT32 expandSize = dmsrecordSize << DMS_RECORDS_PER_EXTENT_SQUARE ; if ( expandSize > DMS_BEST_UP_EXTENT_SZ ) { expandSize = expandSize < DMS_BEST_UP_EXTENT_SZ ? 
DMS_BEST_UP_EXTENT_SZ : expandSize ; } if ( !_pCurrentExtent ) { rc = _allocateExtent ( expandSize ) ; if ( rc ) { PD_LOG ( PDERROR, "Failed to allocate new extent in reorg file, " "rc = %d", rc ) ; goto error ; } _currentExtent = (dmsExtent*)_pCurrentExtent ; } if ( dmsrecordSize > (UINT32)_currentExtent->_freeSpace || isLast ) { rc = mbContext->mbLock( EXCLUSIVE ) ; if ( rc ) { PD_LOG ( PDERROR, "Failed to lock collection, rc=%d", rc ) ; goto error ; } if ( !isAsynchr ) { _currentExtent->_firstRecordOffset = DMS_INVALID_OFFSET ; _currentExtent->_lastRecordOffset = DMS_INVALID_OFFSET ; } rc = _su->loadExtentA( mbContext, _pCurrentExtent, _currentExtentSize / _pageSize, TRUE ) ; mbContext->mbUnlock() ; if ( rc ) { PD_LOG ( PDERROR, "Failed to load extent, rc = %d", rc ) ; goto error ; } if ( isLast ) { goto done ; } rc = _allocateExtent ( expandSize ) ; if ( rc ) { PD_LOG ( PDERROR, "Failed to allocate new extent in reorg file, " "rc = %d", rc ) ; goto error ; } } recordOffset = _currentExtentSize - _currentExtent->_freeSpace ; pRecord = ( dmsRecord* )( (const CHAR*)_currentExtent + recordOffset ) ; if ( _currentExtent->_freeSpace - (INT32)dmsrecordSize < (INT32)DMS_MIN_RECORD_SZ && _currentExtent->_freeSpace <= (INT32)DMS_RECORD_MAX_SZ ) { dmsrecordSize = _currentExtent->_freeSpace ; } pRecord->setNormal() ; pRecord->setMyOffset( recordOffset ) ; pRecord->setSize( dmsrecordSize ) ; pRecord->setData( recordData ) ; pRecord->setNextOffset( DMS_INVALID_OFFSET ) ; pRecord->setPrevOffset( DMS_INVALID_OFFSET ) ; if ( isAsynchr ) { _currentExtent->_recCount++ ; } _currentExtent->_freeSpace -= dmsrecordSize ; offset = _currentExtent->_lastRecordOffset ; if ( DMS_INVALID_OFFSET != offset ) { pPreRecord = (dmsRecord*)( (const CHAR*)_currentExtent + offset ) ; pPreRecord->setNextOffset( recordOffset ) ; pRecord->setPrevOffset( offset ) ; } _currentExtent->_lastRecordOffset = recordOffset ; offset = _currentExtent->_firstRecordOffset ; if ( DMS_INVALID_OFFSET == offset ) { 
_currentExtent->_firstRecordOffset = recordOffset ; } } catch( std::exception &e ) { PD_LOG( PDERROR, "Occur exception: %s", e.what() ) ; rc = SDB_SYS ; goto error ; } done: PD_TRACE_EXITRC ( SDB__DMSSTORAGELOADEXT__IMPRTBLOCK, rc ); return rc ; error: goto done ; }
// Exercises the oplog-specific behaviour of a record store created on an oplog namespace:
// inserts are keyed by the document's "ts" value, oplogStartHack() finds the greatest record
// at or before a position, and capped-truncate / truncate remove the expected records.
TEST(RocksRecordStoreTest, OplogHack) {
    RocksRecordStoreHarnessHelper harnessHelper;
    scoped_ptr<RecordStore> rs(harnessHelper.newNonCappedRecordStore("local.oplog.foo"));

    // --- inserts -------------------------------------------------------------------------
    {
        scoped_ptr<OperationContext> opCtx(harnessHelper.newOperationContext());

        // always illegal
        ASSERT_EQ(insertBSON(opCtx, rs, Timestamp(2,-1)).getStatus(),
                  ErrorCodes::BadValue);

        {
            // A document without a "ts" field is rejected.
            const BSONObj missingTs = BSON("not_ts" << Timestamp(2,1));
            const Status missingTsStatus =
                rs->insertRecord(opCtx.get(),
                                 missingTs.objdata(),
                                 missingTs.objsize(),
                                 false).getStatus();
            ASSERT_EQ(missingTsStatus, ErrorCodes::BadValue);

            // A "ts" field of the wrong type is rejected as well.
            const BSONObj wrongTypeTs = BSON( "ts" << "not an Timestamp" );
            const Status wrongTypeStatus =
                rs->insertRecord(opCtx.get(),
                                 wrongTypeTs.objdata(),
                                 wrongTypeTs.objsize(),
                                 false).getStatus();
            ASSERT_EQ(wrongTypeStatus, ErrorCodes::BadValue);
        }

        // currently dasserts
        // ASSERT_EQ(insertBSON(opCtx, rs, BSON("ts" << Timestamp(-2,1))).getStatus(),
        //           ErrorCodes::BadValue);

        // success cases
        ASSERT_EQ(insertBSON(opCtx, rs, Timestamp(1,1)).getValue(),
                  RecordId(1,1));

        ASSERT_EQ(insertBSON(opCtx, rs, Timestamp(1,2)).getValue(),
                  RecordId(1,2));

        ASSERT_EQ(insertBSON(opCtx, rs, Timestamp(2,2)).getValue(),
                  RecordId(2,2));
    }

    // --- find start ------------------------------------------------------------------------
    {
        scoped_ptr<OperationContext> opCtx(harnessHelper.newOperationContext());

        // nothing <=
        ASSERT_EQ(rs->oplogStartHack(opCtx.get(), RecordId(0,1)), RecordId());
        // between
        ASSERT_EQ(rs->oplogStartHack(opCtx.get(), RecordId(2,1)), RecordId(1,2));
        // ==
        ASSERT_EQ(rs->oplogStartHack(opCtx.get(), RecordId(2,2)), RecordId(2,2));
        // > highest
        ASSERT_EQ(rs->oplogStartHack(opCtx.get(), RecordId(2,3)), RecordId(2,2));
    }

    // --- truncate after the newest record: no-op ---------------------------------------------
    {
        scoped_ptr<OperationContext> opCtx(harnessHelper.newOperationContext());
        rs->temp_cappedTruncateAfter(opCtx.get(), RecordId(2,2), false);
    }

    {
        scoped_ptr<OperationContext> opCtx(harnessHelper.newOperationContext());
        ASSERT_EQ(rs->oplogStartHack(opCtx.get(), RecordId(2,3)), RecordId(2,2));
    }

    // --- exclusive truncate after (1,2): deletes 2,2 ----------------------------------------
    {
        scoped_ptr<OperationContext> opCtx(harnessHelper.newOperationContext());
        rs->temp_cappedTruncateAfter(opCtx.get(), RecordId(1,2), false);
    }

    {
        scoped_ptr<OperationContext> opCtx(harnessHelper.newOperationContext());
        ASSERT_EQ(rs->oplogStartHack(opCtx.get(), RecordId(2,3)), RecordId(1,2));
    }

    // --- inclusive truncate after (1,2): deletes 1,2 ----------------------------------------
    {
        scoped_ptr<OperationContext> opCtx(harnessHelper.newOperationContext());
        rs->temp_cappedTruncateAfter(opCtx.get(), RecordId(1,2), true);
    }

    {
        scoped_ptr<OperationContext> opCtx(harnessHelper.newOperationContext());
        ASSERT_EQ(rs->oplogStartHack(opCtx.get(), RecordId(2,3)), RecordId(1,1));
    }

    // --- full truncate: deletes 1,1 and leaves collection empty ------------------------------
    {
        scoped_ptr<OperationContext> opCtx(harnessHelper.newOperationContext());
        WriteUnitOfWork wuow(opCtx.get());
        ASSERT_OK(rs->truncate(opCtx.get()));
        wuow.commit();
    }

    {
        scoped_ptr<OperationContext> opCtx(harnessHelper.newOperationContext());
        ASSERT_EQ(rs->oplogStartHack(opCtx.get(), RecordId(2,3)), RecordId());
    }
}
/**
 * This is called by db/ops/query.cpp.  This is the entry point for answering a query.
 *
 * Takes a read context on q.ns, builds a Runner for the query, copies the first batch of
 * results into 'result' (an OP_REPLY with its header filled in), and, when more results may be
 * wanted, parks the Runner in a new ClientCursor for later getMore calls.
 *
 * @param m       the incoming message; not referenced in this function.
 * @param q       the parsed query message.
 * @param curop   operation bookkeeping; its debug() fields are updated here.
 * @param result  receives the reply data.
 * @return the query namespace if the exhaust option was recorded on 'curop', otherwise "".
 * @throws uasserts with 17007 when no runner can be built, and throws
 *         SendStaleConfigException when the shard version changed while the query ran.
 */
string newRunQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result) {
    // Keeps a runner registered with ClientCursor's active-runner list for exactly the lifetime
    // of this object.  Using a scope guard (instead of a manual deregister call after the fetch
    // loop) guarantees the runner is deregistered even when the loop throws; otherwise the
    // auto_ptr below would delete a runner that is still registered.
    class ScopedRunnerRegistration {
    public:
        explicit ScopedRunnerRegistration(Runner* runner) : _runner(runner) {
            ClientCursor::registerRunner(_runner);
        }
        ~ScopedRunnerRegistration() {
            ClientCursor::deregisterRunner(_runner);
        }
    private:
        // Not copyable: a copy would deregister twice.
        ScopedRunnerRegistration(const ScopedRunnerRegistration&);
        ScopedRunnerRegistration& operator=(const ScopedRunnerRegistration&);
        Runner* _runner;
    };

    log() << "Running query on new system: " << q.query.toString() << endl;

    // This is a read lock.
    Client::ReadContext ctx(q.ns, dbpath);

    // Parse, canonicalize, plan, transcribe, and get a runner.
    Runner* rawRunner;
    CanonicalQuery* cq;
    Status status = getRunner(q, &rawRunner, &cq);
    if (!status.isOK()) {
        uasserted(17007, "Couldn't process query " + q.query.toString()
                         + " why: " + status.reason());
    }
    verify(NULL != rawRunner);
    auto_ptr<Runner> runner(rawRunner);

    // We freak out later if this changes before we're done with the query.
    const ChunkVersion shardingVersionAtStart = shardingState.getVersion(q.ns);

    // We use this a lot below.
    const LiteParsedQuery& pq = cq->getParsed();

    // TODO: Document why we do this.
    // TODO: do this when we can pass in our own parsed query
    //replVerifyReadsOk(&pq);

    // If this exists, the collection is sharded.
    // If it doesn't exist, we can assume we're not sharded.
    // If we're sharded, we might encounter data that is not consistent with our sharding state.
    // We must ignore this data.
    CollectionMetadataPtr collMetadata;
    if (shardingState.needCollectionMetadata(pq.ns())) {
        collMetadata = shardingState.getCollectionMetadata(pq.ns());
    }

    // Run the query.  Space for the reply header is reserved at the front of the buffer.
    BufBuilder bb(32768);
    bb.skip(sizeof(QueryResult));

    // How many results have we obtained from the runner?
    int numResults = 0;

    // If we're replaying the oplog, we save the last time that we read.
    OpTime slaveReadTill;

    // Do we save the Runner in a ClientCursor for getMore calls later?
    bool saveClientCursor = false;

    BSONObj obj;
    Runner::RunnerState state;

    {
        // We turn on auto-yielding for the runner here, so we must register it with the active
        // runners list in ClientCursor.
        //
        // If we cache the runner later, we want to deregister it as it receives notifications
        // anyway by virtue of being cached.  If we don't cache the runner later, we are
        // deleting it, so it must be deregistered.  So, no matter what, the runner is
        // deregistered when this scope ends -- including when it ends via an exception.
        ScopedRunnerRegistration registration(runner.get());
        runner->setYieldPolicy(Runner::YIELD_AUTO);

        while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&obj, NULL))) {
            // If we're sharded make sure that we don't return any data that hasn't been
            // migrated off of our shared yet.
            if (collMetadata) {
                // This information can change if we yield and as such we must make sure to
                // re-fetch it if we yield.
                KeyPattern kp(collMetadata->getKeyPattern());
                // This performs excessive BSONObj creation but that's OK for now.
                if (!collMetadata->keyBelongsToMe(kp.extractSingleKey(obj))) { continue; }
            }

            // Add result to output buffer.
            bb.appendBuf((void*)obj.objdata(), obj.objsize());

            // Count the result.
            ++numResults;

            // Possibly note slave's position in the oplog.
            if (pq.hasOption(QueryOption_OplogReplay)) {
                BSONElement e = obj["ts"];
                if (Date == e.type() || Timestamp == e.type()) {
                    slaveReadTill = e._opTime();
                }
            }

            // TODO: only one type of 2d search doesn't support this.  We need a way to pull it
            // out of CanonicalQuery. :(
            const bool supportsGetMore = true;
            const bool isExplain = pq.isExplain();
            if (isExplain && enoughForExplain(pq, numResults)) {
                break;
            }
            else if (!supportsGetMore && (enough(pq, numResults)
                                          || bb.len() >= MaxBytesToReturnToClientAtOnce)) {
                break;
            }
            else if (enoughForFirstBatch(pq, numResults, bb.len())) {
                // If only one result requested assume it's a findOne() and don't save the
                // cursor.
                if (pq.wantMore() && 1 != pq.getNumToReturn()) {
                    saveClientCursor = true;
                }
                break;
            }
        }
    }

    // Why save a dead runner?
    if (Runner::RUNNER_DEAD == state) {
        saveClientCursor = false;
    }

    // TODO: Stage creation can set tailable depending on what's in the parsed query.  We have
    // the full parsed query available during planning...set it there.
    //
    // TODO: If we're tailable we want to save the client cursor.  Make sure we do this later.
    //if (pq.hasOption(QueryOption_CursorTailable) && pq.getNumToReturn() != 1) { ... }

    // TODO(greg): This will go away soon.
    if (!shardingState.getVersion(pq.ns()).isWriteCompatibleWith(shardingVersionAtStart)) {
        // if the version changed during the query we might be missing some data and its safe to
        // send this as mongos can resend at this point
        throw SendStaleConfigException(pq.ns(), "version changed during initial query",
                                       shardingVersionAtStart,
                                       shardingState.getVersion(pq.ns()));
    }

    long long ccId = 0;
    if (saveClientCursor) {
        // We won't use the runner until it's getMore'd.
        runner->saveState();

        // Allocate a new ClientCursor.  We don't have to worry about leaking it as it's
        // inserted into a global map by its ctor.
        ClientCursor* cc = new ClientCursor(runner.get(), cq->getParsed().getOptions(),
                                            cq->getParsed().getFilter());
        ccId = cc->cursorid();

        log() << "caching runner with cursorid " << ccId << endl;

        // ClientCursor takes ownership of runner.  Release to make sure it's not deleted.
        runner.release();

        // TODO document
        if (pq.hasOption(QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
            cc->slaveReadTill(slaveReadTill);
        }

        // TODO document
        if (pq.hasOption(QueryOption_Exhaust)) {
            curop.debug().exhaust = true;
        }

        // Set attributes for getMore.
        cc->setCollMetadata(collMetadata);
        cc->setPos(numResults);

        // If the query had a time limit, remaining time is "rolled over" to the cursor (for
        // use by future getmore ops).
        cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros());
    }

    // Add the results from the query into the output buffer.
    result.appendData(bb.buf(), bb.len());
    bb.decouple();

    // Fill out the output buffer's header.
    QueryResult* qr = static_cast<QueryResult*>(result.header());
    qr->cursorId = ccId;
    curop.debug().cursorid = (0 == ccId ? -1 : ccId);
    qr->setResultFlagsToOk();
    qr->setOperation(opReply);
    qr->startingFrom = 0;
    qr->nReturned = numResults;

    // TODO: nscanned is bogus.
    // curop.debug().nscanned = ( cursor ? cursor->nscanned() : 0LL );
    curop.debug().ntoskip = pq.getSkip();
    curop.debug().nreturned = numResults;

    // curop.debug().exhaust is set above.
    return curop.debug().exhaust ? pq.ns() : "";
}
/** @return number of skipped (invalid) documents */ unsigned compactExtent(const char *ns, NamespaceDetails *d, const DiskLoc diskloc, int n, int nidx, bool validate, double pf, int pb, bool useDefaultPadding) { log() << "compact begin extent #" << n << " for namespace " << ns << endl; unsigned oldObjSize = 0; // we'll report what the old padding was unsigned oldObjSizeWithPadding = 0; Extent *e = diskloc.ext(); e->assertOk(); verify( e->validates(diskloc) ); unsigned skipped = 0; Database* db = cc().database(); { // the next/prev pointers within the extent might not be in order so we first // page the whole thing in sequentially log() << "compact paging in len=" << e->length/1000000.0 << "MB" << endl; Timer t; DataFile* mdf = db->getFile( diskloc.a() ); HANDLE fd = mdf->getFd(); int offset = diskloc.getOfs(); Extent* ext = diskloc.ext(); size_t length = ext->length; touch_pages(fd, offset, length, ext); int ms = t.millis(); if( ms > 1000 ) log() << "compact end paging in " << ms << "ms " << e->length/1000000.0/ms << "MB/sec" << endl; } { log() << "compact copying records" << endl; long long datasize = 0; long long nrecords = 0; DiskLoc L = e->firstRecord; if( !L.isNull() ) { while( 1 ) { Record *recOld = L.rec(); L = db->getExtentManager().getNextRecordInExtent(L); BSONObj objOld = BSONObj::make(recOld); if( !validate || objOld.valid() ) { nrecords++; unsigned sz = objOld.objsize(); oldObjSize += sz; oldObjSizeWithPadding += recOld->netLength(); unsigned lenWHdr = sz + Record::HeaderSize; unsigned lenWPadding = lenWHdr; // maintain UsePowerOf2Sizes if no padding values were passed in if (d->isUserFlagSet(NamespaceDetails::Flag_UsePowerOf2Sizes) && useDefaultPadding) { lenWPadding = d->quantizePowerOf2AllocationSpace(lenWPadding); } // otherwise use the padding values (pf and pb) that were passed in else { lenWPadding = static_cast<unsigned>(pf*lenWPadding); lenWPadding += pb; lenWPadding = lenWPadding & quantizeMask(lenWPadding); } if (lenWPadding < lenWHdr || 
lenWPadding > BSONObjMaxUserSize / 2 ) { lenWPadding = lenWHdr; } DiskLoc loc = allocateSpaceForANewRecord(ns, d, lenWPadding, false); uassert(14024, "compact error out of space during compaction", !loc.isNull()); Record *recNew = loc.rec(); datasize += recNew->netLength(); recNew = (Record *) getDur().writingPtr(recNew, lenWHdr); addRecordToRecListInExtent(recNew, loc); memcpy(recNew->data(), objOld.objdata(), sz); } else { if( ++skipped <= 10 ) log() << "compact skipping invalid object" << endl; } if( L.isNull() ) { // we just did the very last record from the old extent. it's still pointed to // by the old extent ext, but that will be fixed below after this loop break; } // remove the old records (orphan them) periodically so our commit block doesn't get too large bool stopping = false; RARELY stopping = *killCurrentOp.checkForInterruptNoAssert() != 0; if( stopping || getDur().aCommitIsNeeded() ) { e->firstRecord.writing() = L; Record *r = L.rec(); getDur().writingInt(r->prevOfs()) = DiskLoc::NullOfs; getDur().commitIfNeeded(); killCurrentOp.checkForInterrupt(false); } } } // if !L.isNull() verify( d->firstExtent() == diskloc ); verify( d->lastExtent() != diskloc ); DiskLoc newFirst = e->xnext; d->firstExtent().writing() = newFirst; newFirst.ext()->xprev.writing().Null(); getDur().writing(e)->markEmpty(); cc().database()->getExtentManager().freeExtents( diskloc, diskloc ); // update datasize/record count for this namespace's extent d->incrementStats( datasize, nrecords ); getDur().commitIfNeeded(); { double op = 1.0; if( oldObjSize ) op = static_cast<double>(oldObjSizeWithPadding)/oldObjSize; log() << "compact finished extent #" << n << " containing " << nrecords << " documents (" << datasize/1000000.0 << "MB)" << " oldPadding: " << op << ' ' << static_cast<unsigned>(op*100.0)/100 << endl; } } return skipped; }
bool _compact(const char *ns, NamespaceDetails *d, string& errmsg, bool validate, BSONObjBuilder& result, double pf, int pb, bool useDefaultPadding) { // this is a big job, so might as well make things tidy before we start just to be nice. getDur().commitIfNeeded(); list<DiskLoc> extents; for( DiskLoc L = d->firstExtent(); !L.isNull(); L = L.ext()->xnext ) extents.push_back(L); log() << "compact " << extents.size() << " extents" << endl; ProgressMeterHolder pm(cc().curop()->setMessage("compact extent", "Extent Compacting Progress", extents.size())); // same data, but might perform a little different after compact? Collection* collection = cc().database()->getCollection( ns ); verify( collection ); collection->infoCache()->addedIndex(); verify( d->getCompletedIndexCount() == d->getTotalIndexCount() ); int nidx = d->getCompletedIndexCount(); scoped_array<BSONObj> indexSpecs( new BSONObj[nidx] ); { NamespaceDetails::IndexIterator ii = d->ii(); // For each existing index... for( int idxNo = 0; ii.more(); ++idxNo ) { // Build a new index spec based on the old index spec. BSONObjBuilder b; BSONObj::iterator i(ii.next().info.obj()); while( i.more() ) { BSONElement e = i.next(); if ( str::equals( e.fieldName(), "v" ) ) { // Drop any preexisting index version spec. The default index version will // be used instead for the new index. continue; } if ( str::equals( e.fieldName(), "background" ) ) { // Create the new index in the foreground. continue; } // Pass the element through to the new index spec. b.append(e); } indexSpecs[idxNo] = b.obj().getOwned(); } } log() << "compact orphan deleted lists" << endl; d->orphanDeletedList(); // Start over from scratch with our extent sizing and growth d->setLastExtentSize( 0 ); // before dropping indexes, at least make sure we can allocate one extent! 
uassert(14025, "compact error no space available to allocate", !allocateSpaceForANewRecord(ns, d, Record::HeaderSize+1, false).isNull()); // note that the drop indexes call also invalidates all clientcursors for the namespace, which is important and wanted here log() << "compact dropping indexes" << endl; Status status = collection->getIndexCatalog()->dropAllIndexes( true ); if ( !status.isOK() ) { errmsg = str::stream() << "compact drop indexes failed: " << status.toString(); log() << status.toString() << endl; return false; } getDur().commitIfNeeded(); long long skipped = 0; int n = 0; // reset data size and record counts to 0 for this namespace // as we're about to tally them up again for each new extent d->setStats( 0, 0 ); for( list<DiskLoc>::iterator i = extents.begin(); i != extents.end(); i++ ) { skipped += compactExtent(ns, d, *i, n++, nidx, validate, pf, pb, useDefaultPadding); pm.hit(); } if( skipped ) { result.append("invalidObjects", skipped); } verify( d->firstExtent().ext()->xprev.isNull() ); // indexes will do their own progress meter? pm.finished(); // build indexes NamespaceString s(ns); string si = s.db().toString() + ".system.indexes"; for( int i = 0; i < nidx; i++ ) { killCurrentOp.checkForInterrupt(false); BSONObj info = indexSpecs[i]; log() << "compact create index " << info["key"].Obj().toString() << endl; theDataFileMgr.insert(si.c_str(), info.objdata(), info.objsize()); } return true; }
/**
 * Applies 'updateobj' to the documents of 'ns' that match 'patternOrig'.
 *
 * An update whose first field name starts with '$' is treated as an operator (modifier)
 * update and is applied through a ModSet; any other update replaces the matched document
 * wholesale.  The function first tries an _id fast path (_updateById) when the plan policy
 * allows it; otherwise it walks a cursor from getOptimizedCursor(), yielding through a
 * ClientCursor between documents unless the matcher is atomic.
 *
 * @param su             forwarded to insertWithObjMod()/updateRecord(); per the comment near
 *                       the end of the loop, such updates are not expected to be logged.
 * @param ns             namespace to update.
 * @param updateobj      the update specification (operators or a replacement document).
 * @param patternOrig    the query selecting documents to update.
 * @param upsert         insert a document when nothing matched.
 * @param multi          update every match instead of stopping at the first; only valid for
 *                       operator updates (uasserts 10158 / 10159 otherwise).
 * @param logop          when true, logOp() is called for each change.
 * @param debug          OpDebug record that is filled in as the update proceeds.
 * @param rs             not referenced in this function body.
 * @param fromMigrate    forwarded to logOp() and _updateById().
 * @param planPolicy     query plan selection policy; also gates the _id fast path.
 * @param forReplication forwarded to the ModSet constructor.
 * @return an UpdateResult built from the outcome.
 *         NOTE(review): the meaning of UpdateResult's constructor arguments is not visible
 *         here -- confirm against its declaration.
 */
UpdateResult _updateObjects( bool su,
                             const char* ns,
                             const BSONObj& updateobj,
                             const BSONObj& patternOrig,
                             bool upsert,
                             bool multi,
                             bool logop ,
                             OpDebug& debug,
                             RemoveSaver* rs,
                             bool fromMigrate,
                             const QueryPlanSelectionPolicy& planPolicy,
                             bool forReplication ) {

    DEBUGUPDATE( "update: " << ns
                 << " update: " << updateobj
                 << " query: " << patternOrig
                 << " upsert: " << upsert
                 << " multi: " << multi );

    Client& client = cc();

    debug.updateobj = updateobj;

    // The idea with these here it to make them loop invariant for
    // multi updates, and thus be a bit faster for that case.  The
    // pointers may be left invalid on a failed or terminal yield
    // recovery.
    NamespaceDetails* d = nsdetails(ns); // can be null if an upsert...
    NamespaceDetailsTransient* nsdt = &NamespaceDetailsTransient::get(ns);

    auto_ptr<ModSet> mods;
    // An update is an operator update iff its first field name begins with '$'.
    bool isOperatorUpdate = updateobj.firstElementFieldName()[0] == '$';
    int modsIsIndexed = false; // really the # of indexes
    if ( isOperatorUpdate ) {
        mods.reset( new ModSet(updateobj, nsdt->indexKeys(), forReplication) );
        modsIsIndexed = mods->maxNumIndexUpdated();
    }

    // Fast path: single-document update by _id, taken only when the collection exists and the
    // mods touch no indexed field.
    if( planPolicy.permitOptimalIdPlan() && !multi && isSimpleIdQuery(patternOrig) && d &&
       !modsIsIndexed ) {
        int idxNo = d->findIdIndex();
        if( idxNo >= 0 ) {
            debug.idhack = true;

            UpdateResult result = _updateById( isOperatorUpdate,
                                               idxNo,
                                               mods.get(),
                                               d,
                                               nsdt,
                                               su,
                                               ns,
                                               updateobj,
                                               patternOrig,
                                               logop,
                                               debug,
                                               fromMigrate);
            if ( result.existing || ! upsert ) {
                return result;
            }
            else if ( upsert && ! isOperatorUpdate ) {
                // this handles repl inserts
                checkNoMods( updateobj );
                debug.upsert = true;
                BSONObj no = updateobj;
                theDataFileMgr.insertWithObjMod(ns, no, false, su);
                if ( logop )
                    logOp( "i", ns, no, 0, 0, fromMigrate, &no );

                return UpdateResult( 0 , 0 , 1 , no );
            }
            // An operator upsert that found nothing falls through to the general path below.
        }
    }

    int numModded = 0;
    debug.nscanned = 0;
    shared_ptr<Cursor> c = getOptimizedCursor( ns, patternOrig, BSONObj(), planPolicy );
    // Re-fetch the namespace pointers after obtaining the cursor.
    d = nsdetails(ns);
    nsdt = &NamespaceDetailsTransient::get(ns);
    bool autoDedup = c->autoDedup();

    if( c->ok() ) {
        // Locations already updated (or moved to), used to avoid updating a document twice.
        set<DiskLoc> seenObjects;
        MatchDetails details;
        // Created lazily, only once yielding becomes necessary.
        auto_ptr<ClientCursor> cc;
        do {

            // Before any ClientCursor exists, a record that is likely not in memory is
            // reported via PageFaultException (when the client permits it).
            if ( cc.get() == 0 &&
                 client.allowedToThrowPageFaultException() &&
                 ! c->currLoc().isNull() &&
                 ! c->currLoc().rec()->likelyInPhysicalMemory() ) {
                throw PageFaultException( c->currLoc().rec() );
            }

            bool atomic = c->matcher() && c->matcher()->docMatcher().atomic();

            if ( ! atomic && debug.nscanned > 0 ) {
                // we need to use a ClientCursor to yield
                if ( cc.get() == 0 ) {
                    shared_ptr< Cursor > cPtr = c;
                    cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , cPtr , ns ) );
                }

                bool didYield;
                if ( ! cc->yieldSometimes( ClientCursor::WillNeed, &didYield ) ) {
                    // Give up ownership without deleting the ClientCursor, then stop.
                    cc.release();
                    break;
                }
                if ( !c->ok() ) {
                    break;
                }

                if ( didYield ) {
                    // Pointers may be stale after a yield: re-fetch them and refresh the
                    // mods' view of which fields are indexed.
                    d = nsdetails(ns);
                    if ( ! d )
                        break;
                    nsdt = &NamespaceDetailsTransient::get(ns);
                    if ( mods.get() ) {
                        mods->setIndexedStatus( nsdt->indexKeys() );
                        modsIsIndexed = mods->maxNumIndexUpdated();
                    }

                }

            } // end yielding block

            debug.nscanned++;

            if ( mods.get() && mods->hasDynamicArray() ) {
                details.requestElemMatchKey();
            }

            if ( !c->currentMatches( &details ) ) {
                c->advance();
                continue;
            }

            Record* r = c->_current();
            DiskLoc loc = c->currLoc();

            if ( c->getsetdup( loc ) && autoDedup ) {
                c->advance();
                continue;
            }

            BSONObj js = BSONObj::make(r);

            BSONObj pattern = patternOrig;

            if ( logop ) {
                BSONObjBuilder idPattern;
                BSONElement id;
                // NOTE: If the matching object lacks an id, we'll log
                // with the original pattern.  This isn't replay-safe.
                // It might make sense to suppress the log instead
                // if there's no id.
                if ( js.getObjectID( id ) ) {
                    idPattern.append( id );
                    pattern = idPattern.obj();
                }
                else {
                    uassert( 10157 ,  "multi-update requires all modified objects to have an _id" , ! multi );
                }
            }

            /* look for $inc etc.  note as listed here, all fields to inc must be this type, you can't set some
                regular ones at the moment. */
            if ( isOperatorUpdate ) {

                if ( multi ) {
                    // go to next record in case this one moves
                    c->advance();

                    // Update operations are deduped for cursors that implement their own
                    // deduplication.  In particular, some geo cursors are excluded.
                    if ( autoDedup ) {

                        if ( seenObjects.count( loc ) ) {
                            continue;
                        }

                        // SERVER-5198 Advance past the document to be modified, provided
                        // deduplication is enabled, but see SERVER-5725.
                        while( c->ok() && loc == c->currLoc() ) {
                            c->advance();
                        }
                    }
                }

                const BSONObj& onDisk = loc.obj();

                // useMods is either the shared ModSet or a per-document copy (owned by
                // mymodset) with the positional array element resolved.
                ModSet* useMods = mods.get();

                auto_ptr<ModSet> mymodset;
                if ( details.hasElemMatchKey() && mods->hasDynamicArray() ) {
                    useMods = mods->fixDynamicArray( details.elemMatchKey() );
                    mymodset.reset( useMods );
                }

                auto_ptr<ModSetState> mss = useMods->prepare( onDisk,
                                                              false /* not an insertion */ );

                bool willAdvanceCursor = multi && c->ok() && ( modsIsIndexed || ! mss->canApplyInPlace() );

                if ( willAdvanceCursor ) {
                    if ( cc.get() ) {
                        cc->setDoingDeletes( true );
                    }
                    c->prepareToTouchEarlierIterate();
                }

                // If we've made it this far, "ns" must contain a valid collection name, and so
                // is of the form "db.collection".  Therefore, the following expression must
                // always be valid.  "system.users" updates must never be done in place, in
                // order to ensure that they are validated inside DataFileMgr::updateRecord(.).
                bool isSystemUsersMod = nsToCollectionSubstring(ns) == "system.users";

                BSONObj newObj;
                if ( !mss->isUpdateIndexed() && mss->canApplyInPlace() && !isSystemUsersMod ) {
                    mss->applyModsInPlace( true );// const_cast<BSONObj&>(onDisk) );

                    DEBUGUPDATE( "\t\t\t doing in place update" );
                    if ( !multi )
                        debug.fastmod = true;

                    if ( modsIsIndexed ) {
                        seenObjects.insert( loc );
                    }
                    newObj = loc.obj();
                    d->paddingFits();
                }
                else {
                    // Not applicable in place: build the new document and rewrite the record.
                    newObj = mss->createNewFromMods();
                    checkTooLarge(newObj);
                    DiskLoc newLoc = theDataFileMgr.updateRecord(ns,
                                                                 d,
                                                                 nsdt,
                                                                 r,
                                                                 loc,
                                                                 newObj.objdata(),
                                                                 newObj.objsize(),
                                                                 debug);

                    if ( newLoc != loc || modsIsIndexed ){
                        // log() << "Moved obj " << newLoc.obj()["_id"] << " from " << loc << " to " << newLoc << endl;
                        // object moved, need to make sure we don' get again
                        seenObjects.insert( newLoc );
                    }

                }

                if ( logop ) {
                    DEV verify( mods->size() );
                    BSONObj logObj = mss->getOpLogRewrite();
                    DEBUGUPDATE( "\t rewrite update: " << logObj );

                    // It is possible that the entire mod set was a no-op over this
                    // document.  We would have an empty log record in that case. If we
                    // call logOp, with an empty record, that would be replicated as "clear
                    // this record", which is not what we want. Therefore, to get a no-op
                    // in the replica, we simply don't log.
                    if ( logObj.nFields() ) {
                        logOp("u", ns, logObj , &pattern, 0, fromMigrate, &newObj );
                    }
                }
                numModded++;
                if ( ! multi )
                    return UpdateResult( 1 , 1 , numModded , BSONObj() );
                if ( willAdvanceCursor )
                    c->recoverFromTouchingEarlierIterate();

                getDur().commitIfNeeded();

                // The cursor was already advanced above for multi updates.
                continue;
            }

            // Replacement (non-operator) update: rewrite the first match and return.
            uassert( 10158 ,  "multi update only works with $ operators" , ! multi );

            BSONElementManipulator::lookForTimestamps( updateobj );
            checkNoMods( updateobj );
            theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , updateobj.objdata(), updateobj.objsize(), debug, su);
            if ( logop ) {
                DEV wassert( !su ); // super used doesn't get logged, this would be bad.
                logOp("u", ns, updateobj, &pattern, 0, fromMigrate, &updateobj );
            }
            return UpdateResult( 1 , 0 , 1 , BSONObj() );
        } while ( c->ok() );
    } // endif

    if ( numModded )
        return UpdateResult( 1 , 1 , numModded , BSONObj() );

    // Nothing was modified: insert if this is an upsert.
    if ( upsert ) {
        if ( updateobj.firstElementFieldName()[0] == '$' ) {
            // upsert of an $operation. build a default object
            BSONObj newObj = mods->createNewFromQuery( patternOrig );
            checkNoMods( newObj );
            debug.fastmodinsert = true;
            theDataFileMgr.insertWithObjMod(ns, newObj, false, su);
            if ( logop )
                logOp( "i", ns, newObj, 0, 0, fromMigrate, &newObj );

            return UpdateResult( 0 , 1 , 1 , newObj );
        }
        uassert( 10159 ,  "multi update only works with $ operators" , ! multi );
        checkNoMods( updateobj );
        debug.upsert = true;
        BSONObj no = updateobj;
        theDataFileMgr.insertWithObjMod(ns, no, false, su);
        if ( logop )
            logOp( "i", ns, no, 0, 0, fromMigrate, &no );
        return UpdateResult( 0 , 0 , 1 , no );
    }

    return UpdateResult( 0 , isOperatorUpdate , 0 , BSONObj() );
}
std::string DBHashCmd::hashCollection(OperationContext* opCtx, Database* db, const std::string& fullCollectionName, bool* fromCache) { stdx::unique_lock<stdx::mutex> cachedHashedLock(_cachedHashedMutex, stdx::defer_lock); if (isCachable(fullCollectionName)) { cachedHashedLock.lock(); string hash = _cachedHashed[fullCollectionName]; if (hash.size() > 0) { *fromCache = true; return hash; } } *fromCache = false; Collection* collection = db->getCollection(fullCollectionName); if (!collection) return ""; IndexDescriptor* desc = collection->getIndexCatalog()->findIdIndex(opCtx); unique_ptr<PlanExecutor> exec; if (desc) { exec.reset(InternalPlanner::indexScan(opCtx, collection, desc, BSONObj(), BSONObj(), false, InternalPlanner::FORWARD, InternalPlanner::IXSCAN_FETCH)); } else if (collection->isCapped()) { exec.reset(InternalPlanner::collectionScan(opCtx, fullCollectionName, collection)); } else { log() << "can't find _id index for: " << fullCollectionName << endl; return "no _id _index"; } md5_state_t st; md5_init(&st); long long n = 0; PlanExecutor::ExecState state; BSONObj c; verify(NULL != exec.get()); while (PlanExecutor::ADVANCED == (state = exec->getNext(&c, NULL))) { md5_append(&st, (const md5_byte_t*)c.objdata(), c.objsize()); n++; } if (PlanExecutor::IS_EOF != state) { warning() << "error while hashing, db dropped? ns=" << fullCollectionName << endl; } md5digest d; md5_finish(&st, d); string hash = digestToString(d); if (cachedHashedLock.owns_lock()) { _cachedHashed[fullCollectionName] = hash; } return hash; }
/**
 * Applies the update expression 'updateobj' to the documents of 'ns' produced by the cursor for
 * 'patternOrig', using UpdateDriver and mutablebson.
 *
 * Every matching document is updated (only the first one when 'multi' is false). If no document
 * was updated and 'upsert' is set, a document is built from the query and the update expression
 * and inserted instead.
 *
 * @param su           forwarded to DataFileMgr::insertWithObjMod() on the upsert path.
 * @param ns           namespace to update.
 * @param updateobj    update expression; parsed by the UpdateDriver.
 * @param patternOrig  query selecting the documents to update.
 * @param upsert       insert a new document when nothing was updated.
 * @param multi        keep updating after the first match.
 * @param logop        when true, writes are reported through logOp().
 * @param debug        receives 'updateobj', 'nscanned' and the fastmod/fastmodinsert/upsert flags.
 * @param rs           not used by this implementation.
 * @param fromMigrate  forwarded to logOp().
 * @param planPolicy   forwarded to getOptimizedCursor().
 * @param forReplication not used by this implementation.
 * @return UpdateResult( updated-existing?, $mod-mode?, number of documents written, upserted
 *         object or empty ). The first field is true only if at least one existing document was
 *         updated.
 *
 * uasserts 16840 (parse failure), 16835/16836 (upsert failure) and 16837 (update failure).
 */
UpdateResult _updateObjectsNEW( bool su,
                                const char* ns,
                                const BSONObj& updateobj,
                                const BSONObj& patternOrig,
                                bool upsert,
                                bool multi,
                                bool logop ,
                                OpDebug& debug,
                                RemoveSaver* rs,
                                bool fromMigrate,
                                const QueryPlanSelectionPolicy& planPolicy,
                                bool forReplication ) {

    // TODO
    // + Separate UpdateParser from UpdateRunner (the latter should be "stage-y")
    //   + All the yield and deduplicate logic would move to the query stage
    //     portion of it
    //
    // + Replication related
    //   + fast path for update for query by _id
    //   + support for relaxing viable path constraint in replication
    //
    // + Field Management
    //   + Force all upsert to contain _id
    //   + Prevent changes to immutable fields (_id, and those mentioned by sharding)
    //
    // + Yielding related
    //   + $atomic support (or better, support proper yielding if not)
    //   + page fault support

    debug.updateobj = updateobj;

    NamespaceDetails* d = nsdetails( ns );
    NamespaceDetailsTransient* nsdt = &NamespaceDetailsTransient::get( ns );

    UpdateDriver::Options opts;
    opts.multi = multi;
    opts.upsert = upsert;
    opts.logOp = logop;
    UpdateDriver driver( opts );
    Status status = driver.parse( nsdt->indexKeys(), updateobj );
    if ( !status.isOK() ) {
        uasserted( 16840, status.reason() );
    }

    shared_ptr<Cursor> cursor = getOptimizedCursor( ns, patternOrig, BSONObj(), planPolicy );

    // The 'cursor' the optimizer gave us may contain query plans that generate duplicate
    // diskloc's. We set up here the mechanisms that will prevent us from processing those
    // twice if we see them. We also set up a 'ClientCursor' so that we can support
    // yielding.
    const bool dedupHere = cursor->autoDedup();
    shared_ptr<Cursor> cPtr = cursor;
    auto_ptr<ClientCursor> clientCursor( new ClientCursor( QueryOption_NoCursorTimeout,
                                                           cPtr,
                                                           ns ) );

    //
    // Update existing documents.
    //

    // We record that this will not be an upsert, in case a mod doesn't want to be applied
    // when in strict update mode.
    driver.setContext( ModifierInterface::ExecInfo::UPDATE_CONTEXT );

    // Let's fetch each of them and pipe them through the update expression, making sure to
    // keep track of the necessary stats. Recall that we'll be pulling documents out of
    // cursors and some of them do not deduplicate the entries they generate. We have
    // deduping logic in here, too -- for now.
    unordered_set<DiskLoc, DiskLoc::Hasher> seenLocs;
    int numUpdated = 0;
    debug.nscanned = 0;
    while ( cursor->ok() ) {

        // Let's fetch the next candidate object for this update.
        Record* r = cursor->_current();
        DiskLoc loc = cursor->currLoc();
        const BSONObj oldObj = loc.obj();

        // We count how many documents we scanned even though we may skip those that are
        // deemed duplicated. The final 'numUpdated' and 'nscanned' numbers may differ for
        // that reason.
        debug.nscanned++;

        // Skips this document if it:
        // a) doesn't match the query portion of the update
        // b) was deemed duplicate by the underlying cursor machinery
        //
        // Now, if we are going to update the document,
        // c) we don't want to do so while the cursor is at it, as that may invalidate
        //    the cursor. So, we advance to next document, before issuing the update.
        MatchDetails matchDetails;
        matchDetails.requestElemMatchKey();
        if ( !cursor->currentMatches( &matchDetails ) ) {
            // a)
            cursor->advance();
            continue;
        }
        else if ( cursor->getsetdup( loc ) && dedupHere ) {
            // b)
            cursor->advance();
            continue;
        }
        else if (driver.dollarModMode() && multi) {
            // c)
            cursor->advance();
            if ( dedupHere ) {
                if ( seenLocs.count( loc ) ) {
                    continue;
                }
            }

            // There are certain kind of cursors that hold multiple pointers to data
            // underneath. $or cursors is one example. In a $or cursor, it may be the case
            // that when we did the last advance(), we finished consuming documents from
            // one of $or child and started consuming the next one. In that case, it is
            // possible that the last document of the previous child is the same as the
            // first document of the next (see SERVER-5198 and jstests/orp.js).
            //
            // So we advance the cursor here until we see a new diskloc.
            //
            // Note that we won't be yielding, and we may not do so for a while if we find
            // a particularly duplicated sequence of loc's. That is highly unlikely,
            // though.  (See SERVER-5725, if curious, but "stage" based $or will make that
            // ticket moot).
            while( cursor->ok() && loc == cursor->currLoc() ) {
                cursor->advance();
            }
        }

        // For some (unfortunate) historical reasons, not all cursors would be valid after
        // a write simply because we advanced them to a document not affected by the write.
        // To protect in those cases, not only we engaged in the advance() logic above, but
        // we also tell the cursor we're about to write a document that we've just seen.
        // prepareToTouchEarlierIterate() requires calling later
        // recoverFromTouchingEarlierIterate(), so we make a note here to do so.
        bool touchPreviousDoc = multi && cursor->ok();
        if ( touchPreviousDoc ) {
            clientCursor->setDoingDeletes( true );
            cursor->prepareToTouchEarlierIterate();
        }

        // Ask the driver to apply the mods. It may be that the driver can apply those "in
        // place", that is, some values of the old document just get adjusted without any
        // change to the binary layout on the bson layer. It may be that a whole new
        // document is needed to accommodate the new bson layout of the resulting document.
        mutablebson::Document doc( oldObj, mutablebson::Document::kInPlaceEnabled );
        BSONObj logObj;
        StringData matchedField = matchDetails.hasElemMatchKey() ?
                                            matchDetails.elemMatchKey():
                                            StringData();
        status = driver.update( matchedField, &doc, &logObj );
        if ( !status.isOK() ) {
            uasserted( 16837, status.reason() );
        }

        // If the driver applied the mods in place, we can ask the mutable for what
        // changed. We call those changes "damages". :) We use the damages to inform the
        // journal what was changed, and then apply them to the original document
        // ourselves. If, however, the driver applied the mods out of place, we ask it to
        // generate a new, modified document for us. In that case, the file manager will
        // take care of the journaling details for us.
        //
        // This code flow is admittedly odd. But, right now, journaling is baked in the file
        // manager. And if we aren't using the file manager, we have to do journaling
        // ourselves.
        BSONObj newObj;
        const char* source = NULL;
        mutablebson::DamageVector damages;
        bool inPlace = doc.getInPlaceUpdates(&damages, &source);
        if ( inPlace && !driver.modsAffectIndices() ) {

            // All updates were in place. Apply them via durability and writing pointer.
            mutablebson::DamageVector::const_iterator where = damages.begin();
            const mutablebson::DamageVector::const_iterator end = damages.end();
            for( ; where != end; ++where ) {
                const char* sourcePtr = source + where->sourceOffset;
                void* targetPtr = getDur().writingPtr(
                    const_cast<char*>(oldObj.objdata()) + where->targetOffset,
                    where->size);
                std::memcpy(targetPtr, sourcePtr, where->size);
            }
            newObj = oldObj;
            debug.fastmod = true;
        }
        else {

            // The updates were not in place. Apply them through the file manager.
            newObj = doc.getObject();
            DiskLoc newLoc = theDataFileMgr.updateRecord(ns,
                                                         d,
                                                         nsdt,
                                                         r,
                                                         loc,
                                                         newObj.objdata(),
                                                         newObj.objsize(),
                                                         debug);

            // If we've moved this object to a new location, make sure we don't apply
            // that update again if our traversal picks the object again.
            //
            // We also take note of the diskloc if the updates are affecting indices.
            // Chances are that we're traversing one of them and they may be multi key and
            // therefore duplicate disklocs.
            if ( newLoc != loc || driver.modsAffectIndices() ) {
                seenLocs.insert( newLoc );
            }
        }

        // Log Obj. An empty 'logObj' is not passed to logOp().
        if ( logop ) {
            if ( !logObj.isEmpty() ) {
                BSONObj pattern = patternOrig;
                logOp("u", ns, logObj , &pattern, 0, fromMigrate, &newObj );
            }
        }

        // One more document updated.
        numUpdated++;

        if (!multi) {
            break;
        }

        // If we used the cursor mechanism that prepares an earlier seen document for a
        // write we need to tell such mechanisms that the write is over.
        if ( touchPreviousDoc ) {
            cursor->recoverFromTouchingEarlierIterate();
        }

        getDur().commitIfNeeded();
    }

    // Either we updated something, or we updated nothing and may not upsert. Report
    // "updated existing" truthfully: it is false when no document was touched.
    if ( numUpdated > 0 || !upsert ) {
        return UpdateResult( numUpdated > 0 /* updated existing object(s) */,
                             driver.dollarModMode() /* $mod or obj replacement */,
                             numUpdated /* # of documents updated */,
                             BSONObj() );
    }

    //
    // Upsert Logic
    //
    // No existing document was updated but upserts are allowed. Note that this is decided
    // after the scan: a cursor can be positioned on documents that all fail the matcher, so
    // cursor->ok() before the scan does not tell us whether anything will match.
    //

    // If this is a $mod base update, we need to generate a document by examining the
    // query and the mods. Otherwise, we can use the object replacement sent by the user
    // update command that was parsed by the driver before.
    BSONObj oldObj;
    if ( *updateobj.firstElementFieldName() == '$' ) {
        if ( !driver.createFromQuery( patternOrig, &oldObj ) ) {
            uasserted( 16835, "cannot create object to update" );
        }
        debug.fastmodinsert = true;
    }
    else {
        debug.upsert = true;
    }

    // Since this is an upsert, we will be oplogging it as an insert. We don't
    // need the driver's help to build the oplog record, then. We also set the
    // context of the update driver to an "upsert". Some mods may only work in that
    // context (e.g. $setOnInsert).
    driver.setLogOp( false );
    driver.setContext( ModifierInterface::ExecInfo::INSERT_CONTEXT );

    mutablebson::Document doc( oldObj, mutablebson::Document::kInPlaceDisabled );
    status = driver.update( StringData(), &doc, NULL /* no oplog record */);
    if ( !status.isOK() ) {
        uasserted( 16836, status.reason() );
    }

    BSONObj newObj = doc.getObject();
    theDataFileMgr.insertWithObjMod( ns, newObj, false, su );
    if ( logop ) {
        logOp( "i", ns, newObj, 0, 0, fromMigrate, &newObj );
    }

    return UpdateResult( false /* updated a non existing document */,
                         driver.dollarModMode() /* $mod or obj replacement? */,
                         1 /* count of updated documents */,
                         newObj /* object that was upserted */ );
}
void applyOperation_inlock(const BSONObj& op , bool fromRepl ) { OpCounters * opCounters = fromRepl ? &replOpCounters : &globalOpCounters; if( logLevel >= 6 ) log() << "applying op: " << op << endl; assertInWriteLock(); OpDebug debug; BSONObj o = op.getObjectField("o"); const char *ns = op.getStringField("ns"); // operation type -- see logOp() comments for types const char *opType = op.getStringField("op"); if ( *opType == 'i' ) { opCounters->gotInsert(); const char *p = strchr(ns, '.'); if ( p && strcmp(p, ".system.indexes") == 0 ) { // updates aren't allowed for indexes -- so we will do a regular insert. if index already // exists, that is ok. theDataFileMgr.insert(ns, (void*) o.objdata(), o.objsize()); } else { // do upserts for inserts as we might get replayed more than once BSONElement _id; if( !o.getObjectID(_id) ) { /* No _id. This will be very slow. */ Timer t; updateObjects(ns, o, o, true, false, false , debug ); if( t.millis() >= 2 ) { RARELY OCCASIONALLY log() << "warning, repl doing slow updates (no _id field) for " << ns << endl; } } else { BSONObjBuilder b; b.append(_id); /* erh 10/16/2009 - this is probably not relevant any more since its auto-created, but not worth removing */ RARELY ensureHaveIdIndex(ns); // otherwise updates will be slow /* todo : it may be better to do an insert here, and then catch the dup key exception and do update then. very few upserts will not be inserts... 
*/ updateObjects(ns, o, b.done(), true, false, false , debug ); } } } else if ( *opType == 'u' ) { opCounters->gotUpdate(); RARELY ensureHaveIdIndex(ns); // otherwise updates will be super slow updateObjects(ns, o, op.getObjectField("o2"), /*upsert*/ op.getBoolField("b"), /*multi*/ false, /*logop*/ false , debug ); } else if ( *opType == 'd' ) { opCounters->gotDelete(); if ( opType[1] == 0 ) deleteObjects(ns, o, op.getBoolField("b")); else assert( opType[1] == 'b' ); // "db" advertisement } else if ( *opType == 'n' ) { // no op } else if ( *opType == 'c' ) { opCounters->gotCommand(); BufBuilder bb; BSONObjBuilder ob; _runCommands(ns, o, bb, ob, true, 0); } else { stringstream ss; ss << "unknown opType [" << opType << "]"; throw MsgAssertionException( 13141 , ss.str() ); } }
bool ShardedClientCursor::sendNextBatch(int batchSize, BufBuilder& buffer, int& docCount) { uassert(10191, "cursor already done", !_done); int maxSize = 1024 * 1024; if (_totalSent > 0) maxSize *= 3; docCount = 0; // If batchSize is negative, it means that we should send up to -batchSize results // back to the client, and that we should only send a *single batch*. An batchSize of // 1 is also a special case which means "return up to 1 result in a single batch" (so // that +1 actually has the same meaning of -1). For all other values of batchSize, we // may have to return multiple batches. const bool sendMoreBatches = batchSize == 0 || batchSize > 1; batchSize = abs(batchSize); // Set the initial batch size to 101, just like mongoD. if (batchSize == 0 && _totalSent == 0) batchSize = 101; // Set batch size to batchSize requested by the current operation unconditionally. This is // necessary because if the loop exited due to docCount == batchSize then setBatchSize(0) was // called, so the next _cusor->more() will be called with a batch size of 0 if the cursor // buffer was drained the previous run. Unconditionally setting the batch size ensures that // we don't ask for a batch size of zero as a side effect. _cursor->setBatchSize(batchSize); bool cursorHasMore = true; while ((cursorHasMore = _cursor->more())) { BSONObj o = _cursor->next(); buffer.appendBuf((void*)o.objdata(), o.objsize()); ++docCount; // Ensure that the next batch will never wind up requesting more docs from the shard // than are remaining to satisfy the initial batchSize. if (batchSize != 0) { if (docCount == batchSize) break; _cursor->setBatchSize(batchSize - docCount); } if (buffer.len() > maxSize) { break; } } // We need to request another batch if the following two conditions hold: // // 1. batchSize is positive and not equal to 1 (see the comment above). This condition // is stored in 'sendMoreBatches'. // // 2. The last call to _cursor->more() was true (i.e. 
we never explicitly got a false // value from _cursor->more()). This condition is stored in 'cursorHasMore'. If the server // hits EOF while executing a query or a getmore, it will pass a cursorId of 0 in the // query response to indicate that there are no more results. In this case, _cursor->more() // will be explicitly false, and we know for sure that we do not have to send more batches. // // On the other hand, if _cursor->more() is true there may or may not be more results. // Suppose that the mongod generates enough results to fill this batch. In this case it // does not know whether not there are more, because doing so would require requesting an // extra result and seeing whether we get EOF. The mongod sends a valid cursorId to // indicate that there may be more. We do the same here: we indicate that there may be // more results to retrieve by setting 'hasMoreBatches' to true. bool hasMoreBatches = sendMoreBatches && cursorHasMore; LOG(5) << "\t hasMoreBatches: " << hasMoreBatches << " sendMoreBatches: " << sendMoreBatches << " cursorHasMore: " << cursorHasMore << " batchSize: " << batchSize << " num: " << docCount << " id:" << getId() << " totalSent: " << _totalSent << endl; _totalSent += docCount; _done = !hasMoreBatches; return hasMoreBatches; }
// PD_TRACE_DECLARE_FUNCTION ( SDB_RTNINSERT2, "rtnInsert" ) INT32 rtnInsert ( const CHAR *pCollectionName, BSONObj &objs, INT32 objNum, INT32 flags, pmdEDUCB *cb, SDB_DMSCB *dmsCB, SDB_DPSCB *dpsCB, INT16 w ) { INT32 rc = SDB_OK ; PD_TRACE_ENTRY ( SDB_RTNINSERT2 ) ; SDB_ASSERT ( pCollectionName, "collection name can't be NULL" ) ; SDB_ASSERT ( cb, "educb can't be NULL" ) ; SDB_ASSERT ( dmsCB, "dmsCB can't be NULL" ) ; dmsStorageUnit *su = NULL ; dmsStorageUnitID suID = DMS_INVALID_CS ; const CHAR *pCollectionShortName = NULL ; UINT32 insertCount = 0 ; BOOLEAN writable = FALSE ; ossValuePtr pDataPos = 0 ; rc = dmsCB->writable( cb ) ; if ( rc ) { PD_LOG ( PDERROR, "Database is not writable, rc = %d", rc ) ; goto error; } writable = TRUE; rc = rtnResolveCollectionNameAndLock ( pCollectionName, dmsCB, &su, &pCollectionShortName, suID ) ; if ( rc ) { PD_LOG ( PDERROR, "Failed to resolve collection name %s", pCollectionName ) ; goto error ; } if ( objs.isEmpty () ) { PD_LOG ( PDERROR, "Insert record can't be empty" ) ; rc = SDB_INVALIDARG ; goto error ; } pDataPos = (ossValuePtr)objs.objdata() ; for ( INT32 i = 0 ; i < objNum ; ++i ) { if ( ++insertCount > RTN_INSERT_ONCE_NUM ) { insertCount = 0 ; if ( cb->isInterrupted() ) { rc = SDB_APP_INTERRUPT ; goto error ; } } try { BSONObj record ( (const CHAR*)pDataPos ) ; rc = su->insertRecord ( pCollectionShortName, record, cb, dpsCB ) ; if ( rc ) { if ( ( SDB_IXM_DUP_KEY == rc ) && ( FLG_INSERT_CONTONDUP & flags ) ) { rc = SDB_OK ; } else { PD_LOG ( PDERROR, "Failed to insert record %s into " "collection: %s, rc: %d", record.toString().c_str(), pCollectionName, rc ) ; goto error ; } } pDataPos += ossAlignX ( (ossValuePtr)record.objsize(), 4 ) ; } catch ( std::exception &e ) { PD_LOG ( PDERROR, "Failed to convert to BSON and insert to " "collection: %s", e.what() ) ; rc = SDB_INVALIDARG ; goto error ; } } done : if ( DMS_INVALID_CS != suID ) { dmsCB->suUnlock ( suID ) ; } if ( writable ) { dmsCB->writeDown( cb ); } if ( cb ) 
{ if ( SDB_OK == rc && dpsCB ) { rc = dpsCB->completeOpr( cb, w ) ; } } PD_TRACE_EXITRC ( SDB_RTNINSERT2, rc ) ; return rc ; error : goto done ; }
bool _handlePossibleShardedMessage( Message &m, DbResponse* dbresponse ) { DEV verify( shardingState.enabled() ); int op = m.operation(); if ( op < 2000 || op >= 3000 || op == dbGetMore // cursors are weird ) return false; DbMessage d(m); const char *ns = d.getns(); string errmsg; // We don't care about the version here, since we're returning it later in the writeback ChunkVersion received, wanted; if ( shardVersionOk( ns , errmsg, received, wanted ) ) { return false; } bool getsAResponse = doesOpGetAResponse( op ); LOG(1) << "connection sharding metadata does not match for collection " << ns << ", will retry (wanted : " << wanted << ", received : " << received << ")" << ( getsAResponse ? "" : " (queuing writeback)" ) << endl; if( getsAResponse ){ verify( dbresponse ); BufBuilder b( 32768 ); b.skip( sizeof( QueryResult ) ); { BSONObjBuilder bob; bob.append( "$err", errmsg ); bob.append( "ns", ns ); wanted.addToBSON( bob, "vWanted" ); received.addToBSON( bob, "vReceived" ); BSONObj obj = bob.obj(); b.appendBuf( obj.objdata() , obj.objsize() ); } QueryResult *qr = (QueryResult*)b.buf(); qr->_resultFlags() = ResultFlag_ErrSet | ResultFlag_ShardConfigStale; qr->len = b.len(); qr->setOperation( opReply ); qr->cursorId = 0; qr->startingFrom = 0; qr->nReturned = 1; b.decouple(); Message * resp = new Message(); resp->setData( qr , true ); dbresponse->response = resp; dbresponse->responseTo = m.header()->id; return true; } uassert(9517, "cannot queue a writeback operation to the writeback queue", (d.reservedField() & Reserved_FromWriteback) == 0); const OID& clientID = ShardedConnectionInfo::get(false)->getID(); massert( 10422 , "write with bad shard config and no server id!" , clientID.isSet() ); // We need to check this here, since otherwise we'll get errors wrapping the writeback - // not just here, but also when returning as a command result. // We choose 1/2 the overhead of the internal maximum so that we can still handle ops of // 16MB exactly. 
massert( 16437, "data size of operation is too large to queue for writeback", m.dataSize() < BSONObjMaxInternalSize - (8 * 1024)); LOG(1) << "writeback queued for " << m.toString() << endl; BSONObjBuilder b; b.appendBool( "writeBack" , true ); b.append( "ns" , ns ); b.append( "connectionId" , cc().getConnectionId() ); b.append( "instanceIdent" , prettyHostName() ); wanted.addToBSON( b ); received.addToBSON( b, "yourVersion" ); b.appendBinData( "msg" , m.header()->len , bdtCustom , (char*)(m.singleData()) ); LOG(2) << "writing back msg with len: " << m.header()->len << " op: " << m.operation() << endl; // we pass the builder to queueWriteBack so that it can select the writebackId // this is important so that the id is guaranteed to be ascending // that is important since mongos assumes if its seen a greater writeback // that all former have been processed OID writebackID = writeBackManager.queueWriteBack( clientID.str() , b ); lastError.getSafe()->writeback( writebackID ); return true; }
/**
 * Replaces the document stored at 'oldLocation' with 'objNew' and brings the indexes up to
 * date.
 *
 * Steps, in order: reject a change of _id; validate writes to "system.users"; validate the
 * update against every index (collecting one UpdateTicket per index); rewrite the record; then
 * either index the document at its new location (if the record moved) or apply the collected
 * tickets and invalidate cursors for the old location.
 *
 * @param txn           operation context forwarded to the record store and index catalog.
 * @param oldLocation   current location of the document.
 * @param objNew        replacement document.
 * @param enforceQuota  passed through _enforceQuota() to the record store.
 * @param debug         optional; 'nmoved' and 'keyUpdates' are updated when non-NULL.
 * @return the location of the document after the update, or the first failing Status.
 */
StatusWith<DiskLoc> Collection::updateDocument( OperationContext* txn,
                                                const DiskLoc& oldLocation,
                                                const BSONObj& objNew,
                                                bool enforceQuota,
                                                OpDebug* debug ) {

    BSONObj objOld = _recordStore->dataFor( txn, oldLocation ).toBson();

    // An existing _id may not be changed by an update.
    if ( objOld.hasElement( "_id" ) && objOld["_id"] != objNew["_id"] ) {
        return StatusWith<DiskLoc>( ErrorCodes::InternalError,
                                    "in Collection::updateDocument _id mismatch",
                                    13596 );
    }

    if ( ns().coll() == "system.users" ) {
        // XXX - andy and spencer think this should go away now
        V2UserDocumentParser parser;
        Status userDocStatus = parser.checkValidUserDocument(objNew);
        if ( !userDocStatus.isOK() )
            return StatusWith<DiskLoc>( userDocStatus );
    }

    /* Duplicate key check. The btree is descended twice - once here for validation and once
       further below for the actual changes. That is suboptimal, but doing it in one pass
       without rollbacks is pretty complicated.
    */

    OwnedPointerMap<IndexDescriptor*,UpdateTicket> updateTickets;
    IndexCatalog::IndexIterator validateIter = _indexCatalog.getIndexIterator( txn, true );
    while ( validateIter.more() ) {
        IndexDescriptor* descriptor = validateIter.next();
        IndexAccessMethod* iam = _indexCatalog.getIndex( descriptor );

        InsertDeleteOptions options;
        options.logIfError = false;
        // Duplicates are allowed unless this is the _id index or a unique index - except when
        // the replication coordinator says the uniqueness of this index is to be ignored.
        options.dupsAllowed =
            ( !KeyPattern::isIdKeyPattern(descriptor->keyPattern()) && !descriptor->unique() )
            || repl::getGlobalReplicationCoordinator()->shouldIgnoreUniqueIndex(descriptor);

        // The map takes ownership of the ticket.
        UpdateTicket* ticket = new UpdateTicket();
        updateTickets.mutableMap()[descriptor] = ticket;

        Status validation = iam->validateUpdate(txn, objOld, objNew, oldLocation, options, ticket );
        if ( !validation.isOK() ) {
            return StatusWith<DiskLoc>( validation );
        }
    }

    // this can callback into Collection::recordStoreGoingToMove
    StatusWith<DiskLoc> newLocation = _recordStore->updateRecord( txn,
                                                                  oldLocation,
                                                                  objNew.objdata(),
                                                                  objNew.objsize(),
                                                                  _enforceQuota( enforceQuota ),
                                                                  this );
    if ( !newLocation.isOK() ) {
        return newLocation;
    }

    _infoCache.notifyOfWriteOp();

    if ( newLocation.getValue() != oldLocation ) {
        // The record moved: count the move and index the document at its new location. The
        // tickets collected above are not applied on this path.
        if ( debug ) {
            // 'nmoved' defaults to -1 rather than 0.
            debug->nmoved = ( debug->nmoved == -1 ) ? 1 : debug->nmoved + 1;
        }

        _indexCatalog.indexRecord(txn, objNew, newLocation.getValue());

        return newLocation;
    }

    // The record stayed where it was: apply the validated index changes.
    if ( debug )
        debug->keyUpdates = 0;

    IndexCatalog::IndexIterator applyIter = _indexCatalog.getIndexIterator( txn, true );
    while ( applyIter.more() ) {
        IndexDescriptor* descriptor = applyIter.next();
        IndexAccessMethod* iam = _indexCatalog.getIndex( descriptor );

        int64_t updatedKeys;
        Status applied = iam->update(txn, *updateTickets.mutableMap()[descriptor], &updatedKeys);
        if ( !applied.isOK() )
            return StatusWith<DiskLoc>( applied );
        if ( debug )
            debug->keyUpdates += updatedKeys;
    }

    // Broadcast the mutation so that query results stay correct.
    _cursorCache.invalidateDocument(oldLocation, INVALIDATION_MUTATION);

    return newLocation;
}
bool _compact(const char *ns, NamespaceDetails *d, string& errmsg, bool validate, BSONObjBuilder& result) { //int les = d->lastExtentSize; // this is a big job, so might as well make things tidy before we start just to be nice. getDur().commitNow(); list<DiskLoc> extents; for( DiskLoc L = d->firstExtent; !L.isNull(); L = L.ext()->xnext ) extents.push_back(L); log() << "compact " << extents.size() << " extents" << endl; ProgressMeterHolder pm( cc().curop()->setMessage( "compact extent" , extents.size() ) ); // same data, but might perform a little different after compact? NamespaceDetailsTransient::get_w(ns).clearQueryCache(); int nidx = d->nIndexes; scoped_array<IndexSpec> indexSpecs( new IndexSpec[nidx] ); scoped_array<SortPhaseOne> phase1( new SortPhaseOne[nidx] ); { NamespaceDetails::IndexIterator ii = d->ii(); int x = 0; while( ii.more() ) { BSONObjBuilder b; BSONObj::iterator i(ii.next().info.obj()); while( i.more() ) { BSONElement e = i.next(); if( !str::equals(e.fieldName(), "v") && !str::equals(e.fieldName(), "background") ) { b.append(e); } } BSONObj o = b.obj().getOwned(); phase1[x].sorter.reset( new BSONObjExternalSorter( o.getObjectField("key") ) ); phase1[x].sorter->hintNumObjects( d->stats.nrecords ); indexSpecs[x++].reset(o); } } log() << "compact orphan deleted lists" << endl; for( int i = 0; i < Buckets; i++ ) { d->deletedList[i].writing().Null(); } // before dropping indexes, at least make sure we can allocate one extent! 
uassert(14025, "compact error no space available to allocate", !allocateSpaceForANewRecord(ns, d, Record::HeaderSize+1).isNull()); // note that the drop indexes call also invalidates all clientcursors for the namespace, which is important and wanted here log() << "compact dropping indexes" << endl; BSONObjBuilder b; if( !dropIndexes(d, ns, "*", errmsg, b, true) ) { errmsg = "compact drop indexes failed"; log() << errmsg << endl; return false; } getDur().commitNow(); long long skipped = 0; int n = 0; for( list<DiskLoc>::iterator i = extents.begin(); i != extents.end(); i++ ) { skipped += compactExtent(ns, d, *i, n++, indexSpecs, phase1, nidx, validate); pm.hit(); } if( skipped ) { result.append("invalidObjects", skipped); } assert( d->firstExtent.ext()->xprev.isNull() ); // indexes will do their own progress meter? pm.finished(); // build indexes NamespaceString s(ns); string si = s.db + ".system.indexes"; for( int i = 0; i < nidx; i++ ) { killCurrentOp.checkForInterrupt(false); BSONObj info = indexSpecs[i].info; log() << "compact create index " << info["key"].Obj().toString() << endl; try { precalced = &phase1[i]; theDataFileMgr.insert(si.c_str(), info.objdata(), info.objsize()); } catch(...) { precalced = 0; throw; } precalced = 0; } return true; }
/**
 * Inserts 'o' into the namespace returned by ns() through theDataFileMgr.
 * 'god' is forwarded unchanged to the data file manager's insert.
 */
static void insert( const BSONObj &o, bool god = false ) {
    // Both guards stay alive until the insert below has returned.
    dblock writeLock;
    Client::Context nsContext( ns() );

    theDataFileMgr.insert( ns(), o.objdata(), o.objsize(), god );
}
/**
 * Also called by db/ops/query.cpp.  This is the new getMore entry point.
 *
 * Looks up the cursor 'cursorid', pulls the next batch of results out of its Runner and
 * packs them into a freshly built QueryResult message.
 *
 * @param ns                  namespace named by the getMore; must equal the cursor's own
 *                            namespace or the call uasserts (17011).
 * @param ntoreturn           requested batch size; 0 means no explicit batch size, in which
 *                            case only the byte limit ends the batch.
 * @param cursorid            id of the cursor to continue.
 * @param curop               current operation; receives and hands back leftover max time.
 * @param pass                how many times the caller has retried this getMore.
 * @param exhaust             out: set when the cursor stays open and has the exhaust option.
 * @param isCursorAuthorized  out: set to true once the namespace check has passed.
 * @return the reply message, or 0 when a tailable/awaitData cursor is at EOF with no results
 *         and pass < 1000.  The buffer is decoupled from its builder before it is returned,
 *         so presumably the caller takes ownership -- confirm at the call site.
 */
QueryResult* newGetMore(const char* ns, int ntoreturn, long long cursorid, CurOp& curop,
                        int pass, bool& exhaust, bool* isCursorAuthorized) {
    exhaust = false;
    int bufSize = 512 + sizeof(QueryResult) + MaxBytesToReturnToClientAtOnce;

    BufBuilder bb(bufSize);
    // Leave room for the QueryResult header, which is filled in at the end.
    bb.skip(sizeof(QueryResult));

    // This is a read lock.  TODO: There is a cursor flag for not needing this.  Do we care?
    Client::ReadContext ctx(ns);

    QLOG() << "running getMore in new system, cursorid " << cursorid << endl;

    // This checks to make sure the operation is allowed on a replicated node.  Since we are not
    // passing in a query object (necessary to check SlaveOK query option), the only state where
    // reads are allowed is PRIMARY (or master in master/slave).  This function uasserts if
    // reads are not okay.
    replVerifyReadsOk();

    // A pin performs a CC lookup and if there is a CC, increments the CC's pin value so it
    // doesn't time out.  Also informs ClientCursor that there is somebody actively holding the
    // CC, so don't delete it.
    ClientCursorPin ccPin(cursorid);
    ClientCursor* cc = ccPin.c();

    // These are set in the QueryResult msg we return.
    int resultFlags = ResultFlag_AwaitCapable;

    int numResults = 0;
    int startingResult = 0;

    if (NULL == cc) {
        // Unknown cursor: reply with cursor id 0 and the CursorNotFound flag.
        cursorid = 0;
        resultFlags = ResultFlag_CursorNotFound;
    }
    else {
        // Check for spoofing of the ns such that it does not match the one originally
        // there for the cursor.
        uassert(17011, "auth error", str::equals(ns, cc->ns().c_str()));
        *isCursorAuthorized = true;

        // TODO: fail point?

        // If the operation that spawned this cursor had a time limit set, apply leftover
        // time to this getmore.
        curop.setMaxTimeMicros(cc->getLeftoverMaxTimeMicros());
        killCurrentOp.checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

        // TODO:
        // curop.debug().query = BSONForQuery
        // curop.setQuery(curop.debug().query);

        // TODO: What is pass?
        if (0 == pass) { cc->updateSlaveLocation(curop); }

        CollectionMetadataPtr collMetadata = cc->getCollMetadata();

        // If we're replaying the oplog, we save the last time that we read.
        OpTime slaveReadTill;

        // What number result are we starting at?  Used to fill out the reply.
        startingResult = cc->pos();

        // What gives us results.
        Runner* runner = cc->getRunner();
        const int queryOptions = cc->queryOptions();

        // Get results out of the runner.
        runner->restoreState();

        BSONObj obj;
        Runner::RunnerState state;
        while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&obj, NULL))) {
            // If we're sharded make sure that we don't return any data that hasn't been
            // migrated off of our shard yet.
            if (collMetadata) {
                KeyPattern kp(collMetadata->getKeyPattern());
                if (!collMetadata->keyBelongsToMe(kp.extractSingleKey(obj))) { continue; }
            }

            // Add result to output buffer.
            bb.appendBuf((void*)obj.objdata(), obj.objsize());

            // Count the result.
            ++numResults;

            // Possibly note slave's position in the oplog.
            if (queryOptions & QueryOption_OplogReplay) {
                BSONElement e = obj["ts"];
                if (Date == e.type() || Timestamp == e.type()) {
                    slaveReadTill = e._opTime();
                }
            }

            // The count limit applies only when the client asked for a batch size.
            // Guarding on numResults (always non-zero here, having just been incremented)
            // ended every batch after a single document whenever ntoreturn was 0.
            if ((ntoreturn && numResults >= ntoreturn)
                || bb.len() > MaxBytesToReturnToClientAtOnce) {
                break;
            }
        }

        if (Runner::RUNNER_EOF == state && 0 == numResults
            && (queryOptions & QueryOption_CursorTailable)
            && (queryOptions & QueryOption_AwaitData) && (pass < 1000)) {
            // If the cursor is tailable we don't kill it if it's eof.  We let it try to get
            // data some # of times first.
            return 0;
        }
        else if (Runner::RUNNER_DEAD == state || Runner::RUNNER_EOF == state) {
            ccPin.free();
            // cc is now invalid, as is the runner
            cursorid = 0;
            cc = NULL;
        }
        else {
            // Continue caching the ClientCursor.
            cc->incPos(numResults);
            runner->saveState();

            // Possibly note slave's position in the oplog.
            if ((queryOptions & QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
                cc->slaveReadTill(slaveReadTill);
            }

            exhaust = (queryOptions & QueryOption_Exhaust);

            // If the getmore had a time limit, remaining time is "rolled over" back to the
            // cursor (for use by future getmore ops).
            cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() );
        }
    }

    // Fill in the header that was skipped over at the start of the buffer.
    QueryResult* qr = reinterpret_cast<QueryResult*>(bb.buf());
    qr->len = bb.len();
    qr->setOperation(opReply);
    qr->_resultFlags() = resultFlags;
    qr->cursorId = cursorid;
    qr->startingFrom = startingResult;
    qr->nReturned = numResults;
    bb.decouple();
    return qr;
}
/**
 * Fallback for objects that cannot be represented in the compact key format: the key is
 * stored as ordinary BSON preceded by a single IsBSON sentinel byte that marks the format.
 *
 * The member buffer builder is reset and reused here; per the original authors' note the
 * KeyV1Owned constructor already obtained it, so reuse avoids an extra malloc.
 */
void KeyV1Owned::traditional(const BSONObj& obj) {
    // Start from an empty buffer: sentinel first, then the raw BSON bytes.
    b.reset();
    b.appendUChar(IsBSON);

    const char* const bsonBytes = obj.objdata();
    const int bsonLength = obj.objsize();
    b.appendBuf(bsonBytes, bsonLength);

    // The key data now points into the builder's buffer.
    _keyData = reinterpret_cast<const unsigned char*>(b.buf());
}
std::string runQuery(OperationContext* txn, QueryMessage& q, const NamespaceString& nss, Message& result) { CurOp& curop = *CurOp::get(txn); // Validate the namespace. uassert(16256, str::stream() << "Invalid ns [" << nss.ns() << "]", nss.isValid()); invariant(!nss.isCommand()); // Set curop information. beginQueryOp(txn, nss, q.query, q.ntoreturn, q.ntoskip); // Parse the qm into a CanonicalQuery. auto statusWithCQ = CanonicalQuery::canonicalize(q, ExtensionsCallbackReal(txn, &nss)); if (!statusWithCQ.isOK()) { uasserted( 17287, str::stream() << "Can't canonicalize query: " << statusWithCQ.getStatus().toString()); } unique_ptr<CanonicalQuery> cq = std::move(statusWithCQ.getValue()); invariant(cq.get()); LOG(5) << "Running query:\n" << cq->toString(); LOG(2) << "Running query: " << cq->toStringShort(); // Parse, canonicalize, plan, transcribe, and get a plan executor. AutoGetCollectionForRead ctx(txn, nss); Collection* collection = ctx.getCollection(); const int dbProfilingLevel = ctx.getDb() ? ctx.getDb()->getProfilingLevel() : serverGlobalParams.defaultProfile; // We have a parsed query. Time to get the execution plan for it. std::unique_ptr<PlanExecutor> exec = uassertStatusOK( getExecutorFind(txn, collection, nss, std::move(cq), PlanExecutor::YIELD_AUTO)); const LiteParsedQuery& pq = exec->getCanonicalQuery()->getParsed(); // If it's actually an explain, do the explain and return rather than falling through // to the normal query execution loop. if (pq.isExplain()) { BufBuilder bb; bb.skip(sizeof(QueryResult::Value)); BSONObjBuilder explainBob; Explain::explainStages(exec.get(), ExplainCommon::EXEC_ALL_PLANS, &explainBob); // Add the resulting object to the return buffer. BSONObj explainObj = explainBob.obj(); bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize()); // TODO: Does this get overwritten/do we really need to set this twice? curop.debug().query = q.query; // Set query result fields. 
QueryResult::View qr = bb.buf(); bb.decouple(); qr.setResultFlagsToOk(); qr.msgdata().setLen(bb.len()); curop.debug().responseLength = bb.len(); qr.msgdata().setOperation(opReply); qr.setCursorId(0); qr.setStartingFrom(0); qr.setNReturned(1); result.setData(qr.view2ptr(), true); return ""; } ShardingState* const shardingState = ShardingState::get(txn); // We freak out later if this changes before we're done with the query. const ChunkVersion shardingVersionAtStart = shardingState->getVersion(nss.ns()); // Handle query option $maxTimeMS (not used with commands). curop.setMaxTimeMicros(static_cast<unsigned long long>(pq.getMaxTimeMS()) * 1000); txn->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point. // uassert if we are not on a primary, and not a secondary with SlaveOk query parameter set. bool slaveOK = pq.isSlaveOk() || pq.hasReadPref(); Status serveReadsStatus = repl::getGlobalReplicationCoordinator()->checkCanServeReadsFor(txn, nss, slaveOK); uassertStatusOK(serveReadsStatus); // Run the query. // bb is used to hold query results // this buffer should contain either requested documents per query or // explain information, but not both BufBuilder bb(FindCommon::kInitReplyBufferSize); bb.skip(sizeof(QueryResult::Value)); // How many results have we obtained from the executor? int numResults = 0; // If we're replaying the oplog, we save the last time that we read. Timestamp slaveReadTill; BSONObj obj; PlanExecutor::ExecState state; // uint64_t numMisplacedDocs = 0; // Get summary info about which plan the executor is using. { stdx::lock_guard<Client> lk(*txn->getClient()); curop.setPlanSummary_inlock(Explain::getPlanSummary(exec.get())); } while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) { // If we can't fit this result inside the current batch, then we stash it for later. if (!FindCommon::haveSpaceForNext(obj, numResults, bb.len())) { exec->enqueue(obj); break; } // Add result to output buffer. 
bb.appendBuf((void*)obj.objdata(), obj.objsize()); // Count the result. ++numResults; // Possibly note slave's position in the oplog. if (pq.isOplogReplay()) { BSONElement e = obj["ts"]; if (Date == e.type() || bsonTimestamp == e.type()) { slaveReadTill = e.timestamp(); } } if (FindCommon::enoughForFirstBatch(pq, numResults)) { LOG(5) << "Enough for first batch, wantMore=" << pq.wantMore() << " ntoreturn=" << pq.getNToReturn().value_or(0) << " numResults=" << numResults << endl; break; } } // If we cache the executor later, we want to deregister it as it receives notifications // anyway by virtue of being cached. // // If we don't cache the executor later, we are deleting it, so it must be deregistered. // // So, no matter what, deregister the executor. exec->deregisterExec(); // Caller expects exceptions thrown in certain cases. if (PlanExecutor::FAILURE == state || PlanExecutor::DEAD == state) { const unique_ptr<PlanStageStats> stats(exec->getStats()); error() << "Plan executor error during find: " << PlanExecutor::statestr(state) << ", stats: " << Explain::statsToBSON(*stats); uasserted(17144, "Executor error: " + WorkingSetCommon::toStatusString(obj)); } // TODO: Currently, chunk ranges are kept around until all ClientCursors created while the // chunk belonged on this node are gone. Separating chunk lifetime management from // ClientCursor should allow this check to go away. if (!shardingState->getVersion(nss.ns()).isWriteCompatibleWith(shardingVersionAtStart)) { // if the version changed during the query we might be missing some data and its safe to // send this as mongos can resend at this point throw SendStaleConfigException(nss.ns(), "version changed during initial query", shardingVersionAtStart, shardingState->getVersion(nss.ns())); } // Fill out curop based on query results. If we have a cursorid, we will fill out curop with // this cursorid later. 
long long ccId = 0; if (shouldSaveCursor(txn, collection, state, exec.get())) { // We won't use the executor until it's getMore'd. exec->saveState(); exec->detachFromOperationContext(); // Allocate a new ClientCursor. We don't have to worry about leaking it as it's // inserted into a global map by its ctor. ClientCursor* cc = new ClientCursor(collection->getCursorManager(), exec.release(), nss.ns(), txn->recoveryUnit()->isReadingFromMajorityCommittedSnapshot(), pq.getOptions(), pq.getFilter()); ccId = cc->cursorid(); LOG(5) << "caching executor with cursorid " << ccId << " after returning " << numResults << " results" << endl; // TODO document if (pq.isOplogReplay() && !slaveReadTill.isNull()) { cc->slaveReadTill(slaveReadTill); } // TODO document if (pq.isExhaust()) { curop.debug().exhaust = true; } cc->setPos(numResults); // If the query had a time limit, remaining time is "rolled over" to the cursor (for // use by future getmore ops). cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros()); endQueryOp(txn, collection, *cc->getExecutor(), dbProfilingLevel, numResults, ccId); } else { LOG(5) << "Not caching executor but returning " << numResults << " results.\n"; endQueryOp(txn, collection, *exec, dbProfilingLevel, numResults, ccId); } // Add the results from the query into the output buffer. result.appendData(bb.buf(), bb.len()); bb.decouple(); // Fill out the output buffer's header. QueryResult::View qr = result.header().view2ptr(); qr.setCursorId(ccId); qr.setResultFlagsToOk(); qr.msgdata().setOperation(opReply); qr.setStartingFrom(0); qr.setNReturned(numResults); // curop.debug().exhaust is set above. return curop.debug().exhaust ? nss.ns() : ""; }
/**
 * Replaces the document at 'oldLocation' with 'newDoc' and brings the indexes up to date.
 *
 * @param txn              operation context
 * @param oldLocation      record id of the document being replaced
 * @param oldDoc           the current document together with its snapshot id
 * @param newDoc           the replacement document
 * @param enforceQuota     forwarded through _enforceQuota() to the record store
 * @param indexesAffected  when false, no per-index validation or update is performed
 * @param debug            optional; nmoved and keyUpdates are maintained when non-null
 * @param args             oplog arguments; args.ns is filled in before onUpdate() is called
 * @return the record id returned by the record store, or the first failing status
 *         (validation, _id mismatch, index validation, record store or index update).
 */
StatusWith<RecordId> Collection::updateDocument(OperationContext* txn,
                                                const RecordId& oldLocation,
                                                const Snapshotted<BSONObj>& oldDoc,
                                                const BSONObj& newDoc,
                                                bool enforceQuota,
                                                bool indexesAffected,
                                                OpDebug* debug,
                                                oplogUpdateEntryArgs& args) {
    {
        // A new document that fails validation is rejected outright in strict mode. In
        // moderate mode it is rejected only when the old document was valid (good -> bad);
        // bad -> bad is let through.
        auto newDocStatus = checkValidation(txn, newDoc);
        if (!newDocStatus.isOK()) {
            if (_validationLevel == STRICT_V || checkValidation(txn, oldDoc.value()).isOK()) {
                return newDocStatus;
            }
        }
    }

    dassert(txn->lockState()->isCollectionLockedForMode(ns().toString(), MODE_IX));
    invariant(oldDoc.snapshotId() == txn->recoveryUnit()->getSnapshotId());

    // Remembered so the snapshot can be checked again just before the op observer runs.
    SnapshotId sid = txn->recoveryUnit()->getSnapshotId();

    // An existing _id must not be changed by the update.
    BSONElement oldId = oldDoc.value()["_id"];
    if (!oldId.eoo() && (oldId != newDoc["_id"])) {
        return StatusWith<RecordId>(
            ErrorCodes::InternalError, "in Collection::updateDocument _id mismatch", 13596);
    }

    // Validate the change against every index first. This yields one UpdateTicket per
    // index describing the index work implied by going from oldDoc to newDoc.
    OwnedPointerMap<IndexDescriptor*, UpdateTicket> updateTickets;
    if (indexesAffected) {
        IndexCatalog::IndexIterator indexes = _indexCatalog.getIndexIterator(txn, true);
        while (indexes.more()) {
            IndexDescriptor* descriptor = indexes.next();
            IndexCatalogEntry* entry = indexes.catalogEntry(descriptor);
            IndexAccessMethod* iam = indexes.accessMethod(descriptor);

            // Duplicates are allowed unless the index is the _id index or unique, and also
            // whenever the replication coordinator says to ignore the unique constraint.
            const bool enforcesUniqueness =
                KeyPattern::isIdKeyPattern(descriptor->keyPattern()) || descriptor->unique();

            InsertDeleteOptions options;
            options.logIfError = false;
            options.dupsAllowed = !enforcesUniqueness ||
                repl::getGlobalReplicationCoordinator()->shouldIgnoreUniqueIndex(descriptor);

            // The map takes the ticket before validation runs.
            UpdateTicket* ticket = new UpdateTicket();
            updateTickets.mutableMap()[descriptor] = ticket;

            Status validateStatus = iam->validateUpdate(txn,
                                                        oldDoc.value(),
                                                        newDoc,
                                                        oldLocation,
                                                        options,
                                                        ticket,
                                                        entry->getFilterExpression());
            if (!validateStatus.isOK()) {
                return StatusWith<RecordId>(validateStatus);
            }
        }
    }

    // This can call back into Collection::recordStoreGoingToMove. If that happens, the old
    // object is removed from all indexes.
    StatusWith<RecordId> newLocation = _recordStore->updateRecord(
        txn, oldLocation, newDoc.objdata(), newDoc.objsize(), _enforceQuota(enforceQuota), this);
    if (!newLocation.isOK()) {
        return newLocation;
    }

    const bool moved = newLocation.getValue() != oldLocation;
    if (moved) {
        // The document moved: count the move and index it at its new location.
        if (debug) {
            // nmoved starts out as -1 rather than 0.
            debug->nmoved = (debug->nmoved == -1) ? 1 : debug->nmoved + 1;
        }

        Status indexStatus = _indexCatalog.indexRecord(txn, newDoc, newLocation.getValue());
        if (!indexStatus.isOK()) {
            return StatusWith<RecordId>(indexStatus);
        }
    } else {
        // The document stayed in place: apply each index's UpdateTicket.
        if (debug) {
            debug->keyUpdates = 0;
        }

        if (indexesAffected) {
            IndexCatalog::IndexIterator indexes = _indexCatalog.getIndexIterator(txn, true);
            while (indexes.more()) {
                IndexDescriptor* descriptor = indexes.next();
                IndexAccessMethod* iam = indexes.accessMethod(descriptor);

                int64_t updatedKeys;
                Status updateStatus =
                    iam->update(txn, *updateTickets.mutableMap()[descriptor], &updatedKeys);
                if (!updateStatus.isOK()) {
                    return StatusWith<RecordId>(updateStatus);
                }
                if (debug) {
                    debug->keyUpdates += updatedKeys;
                }
            }
        }
    }

    // Both paths finish the same way: verify the snapshot and notify the op observer.
    invariant(sid == txn->recoveryUnit()->getSnapshotId());
    args.ns = ns().ns();
    getGlobalServiceContext()->getOpObserver()->onUpdate(txn, args);

    return newLocation;
}
/**
 * Wraps 'metadata' and 'date' into a metadata document via
 * FTDCBSONUtil::createBSONMetadataDocument() and writes that document's raw bytes with
 * writeArchiveFileBuffer(), returning its status.
 */
Status FTDCFileWriter::writeMetadata(const BSONObj& metadata, Date_t date) {
    const BSONObj wrapped = FTDCBSONUtil::createBSONMetadataDocument(metadata, date);

    // objsize() is converted explicitly to size_t for the buffer length.
    const auto length = static_cast<size_t>(wrapped.objsize());
    return writeArchiveFileBuffer({wrapped.objdata(), length});
}
bool ShardedClientCursor::sendNextBatch(int ntoreturn, BufBuilder& buffer, int& docCount) { uassert( 10191 , "cursor already done" , ! _done ); int maxSize = 1024 * 1024; if ( _totalSent > 0 ) maxSize *= 3; docCount = 0; // If ntoreturn is negative, it means that we should send up to -ntoreturn results // back to the client, and that we should only send a *single batch*. An ntoreturn of // 1 is also a special case which means "return up to 1 result in a single batch" (so // that +1 actually has the same meaning of -1). For all other values of ntoreturn, we // may have to return multiple batches. const bool sendMoreBatches = ntoreturn == 0 || ntoreturn > 1; ntoreturn = abs( ntoreturn ); bool cursorHasMore = true; while ( ( cursorHasMore = _cursor->more() ) ) { BSONObj o = _cursor->next(); buffer.appendBuf( (void*)o.objdata() , o.objsize() ); docCount++; // Ensure that the next batch will never wind up requesting more docs from the shard // than are remaining to satisfy the initial ntoreturn. if (ntoreturn != 0) { _cursor->setBatchSize(ntoreturn - docCount); } if ( buffer.len() > maxSize ) { break; } if ( docCount == ntoreturn ) { // soft limit aka batch size break; } if ( ntoreturn == 0 && _totalSent == 0 && docCount >= 100 ) { // first batch should be max 100 unless batch size specified break; } } // We need to request another batch if the following two conditions hold: // // 1. ntoreturn is positive and not equal to 1 (see the comment above). This condition // is stored in 'sendMoreBatches'. // // 2. The last call to _cursor->more() was true (i.e. we never explicitly got a false // value from _cursor->more()). This condition is stored in 'cursorHasMore'. If the server // hits EOF while executing a query or a getmore, it will pass a cursorId of 0 in the // query response to indicate that there are no more results. In this case, _cursor->more() // will be explicitly false, and we know for sure that we do not have to send more batches. 
// // On the other hand, if _cursor->more() is true there may or may not be more results. // Suppose that the mongod generates enough results to fill this batch. In this case it // does not know whether not there are more, because doing so would require requesting an // extra result and seeing whether we get EOF. The mongod sends a valid cursorId to // indicate that there may be more. We do the same here: we indicate that there may be // more results to retrieve by setting 'hasMoreBatches' to true. bool hasMoreBatches = sendMoreBatches && cursorHasMore; LOG(5) << "\t hasMoreBatches: " << hasMoreBatches << " sendMoreBatches: " << sendMoreBatches << " cursorHasMore: " << cursorHasMore << " ntoreturn: " << ntoreturn << " num: " << docCount << " id:" << getId() << " totalSent: " << _totalSent << endl; _totalSent += docCount; _done = ! hasMoreBatches; return hasMoreBatches; }