intrusive_ptr<DocumentSource> DocumentSourceGroup::createFromBson( BSONElement elem, const intrusive_ptr<ExpressionContext> &pExpCtx) { uassert(15947, "a group's fields must be specified in an object", elem.type() == Object); intrusive_ptr<DocumentSourceGroup> pGroup( DocumentSourceGroup::create(pExpCtx)); BSONObj groupObj(elem.Obj()); BSONObjIterator groupIterator(groupObj); VariablesIdGenerator idGenerator; VariablesParseState vps(&idGenerator); while(groupIterator.more()) { BSONElement groupField(groupIterator.next()); const char *pFieldName = groupField.fieldName(); if (str::equals(pFieldName, "_id")) { uassert(15948, "a group's _id may only be specified once", pGroup->_idExpressions.empty()); pGroup->parseIdExpression(groupField, vps); invariant(!pGroup->_idExpressions.empty()); } else if (str::equals(pFieldName, "$doingMerge")) { massert(17030, "$doingMerge should be true if present", groupField.Bool()); pGroup->setDoingMerge(true); } else { /* Treat as a projection field with the additional ability to add aggregation operators. */ uassert(16414, str::stream() << "the group aggregate field name '" << pFieldName << "' cannot be used because $group's field names cannot contain '.'", !str::contains(pFieldName, '.') ); uassert(15950, str::stream() << "the group aggregate field name '" << pFieldName << "' cannot be an operator name", pFieldName[0] != '$'); uassert(15951, str::stream() << "the group aggregate field '" << pFieldName << "' must be defined as an expression inside an object", groupField.type() == Object); BSONObj subField(groupField.Obj()); BSONObjIterator subIterator(subField); size_t subCount = 0; for(; subIterator.more(); ++subCount) { BSONElement subElement(subIterator.next()); /* look for the specified operator */ GroupOpDesc key; key.name = subElement.fieldName(); const GroupOpDesc *pOp = (const GroupOpDesc *)bsearch( &key, GroupOpTable, NGroupOp, sizeof(GroupOpDesc), GroupOpDescCmp); uassert(15952, str::stream() << "unknown group operator '" << key.name << "'", pOp); intrusive_ptr<Expression> pGroupExpr; BSONType elementType = subElement.type(); if (elementType == Object) { Expression::ObjectCtx oCtx(Expression::ObjectCtx::DOCUMENT_OK); pGroupExpr = Expression::parseObject(subElement.Obj(), &oCtx, vps); } else if (elementType == Array) { uasserted(15953, str::stream() << "aggregating group operators are unary (" << key.name << ")"); } else { /* assume its an atomic single operand */ pGroupExpr = Expression::parseOperand(subElement, vps); } pGroup->addAccumulator(pFieldName, pOp->factory, pGroupExpr); } uassert(15954, str::stream() << "the computed aggregate '" << pFieldName << "' must specify exactly one operator", subCount == 1); } } uassert(15955, "a group specification must include an _id", !pGroup->_idExpressions.empty()); pGroup->_variables.reset(new Variables(idGenerator.getIdCount())); return pGroup; }
bool group( OperationContext* txn, Database* db, const std::string& ns, const BSONObj& query, BSONObj keyPattern, const std::string& keyFunctionCode, const std::string& reduceCode, const char * reduceScope, BSONObj initial, const std::string& finalize, string& errmsg, BSONObjBuilder& result ) { const string userToken = ClientBasic::getCurrent()->getAuthorizationSession() ->getAuthenticatedUserNamesToken(); auto_ptr<Scope> s = globalScriptEngine->getPooledScope(db->name(), "group" + userToken); if ( reduceScope ) s->init( reduceScope ); s->setObject( "$initial" , initial , true ); s->exec( "$reduce = " + reduceCode , "$group reduce setup" , false , true , true , 100 ); s->exec( "$arr = [];" , "$group reduce setup 2" , false , true , true , 100 ); ScriptingFunction f = s->createFunction( "function(){ " " if ( $arr[n] == null ){ " " next = {}; " " Object.extend( next , $key ); " " Object.extend( next , $initial , true ); " " $arr[n] = next; " " next = null; " " } " " $reduce( obj , $arr[n] ); " "}" ); ScriptingFunction keyFunction = 0; if ( keyFunctionCode.size() ) { keyFunction = s->createFunction( keyFunctionCode.c_str() ); } double keysize = keyPattern.objsize() * 3; double keynum = 1; Collection* collection = db->getCollection( txn, ns ); const WhereCallbackReal whereCallback(txn, StringData(db->name())); map<BSONObj,int,BSONObjCmp> map; list<BSONObj> blah; if (collection) { CanonicalQuery* cq; if (!CanonicalQuery::canonicalize(ns, query, &cq, whereCallback).isOK()) { uasserted(17212, "Can't canonicalize query " + query.toString()); return 0; } Runner* rawRunner; if (!getRunner(txn,collection, cq, &rawRunner).isOK()) { uasserted(17213, "Can't get runner for query " + query.toString()); return 0; } auto_ptr<Runner> runner(rawRunner); const ScopedRunnerRegistration safety(runner.get()); BSONObj obj; Runner::RunnerState state; while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&obj, NULL))) { BSONObj key = getKey(obj , keyPattern , keyFunction , keysize / keynum, s.get() ); keysize += key.objsize(); keynum++; int& n = map[key]; if ( n == 0 ) { n = map.size(); s->setObject( "$key" , key , true ); uassert(17203, "group() can't handle more than 20000 unique keys", n <= 20000 ); } s->setObject( "obj" , obj , true ); s->setNumber( "n" , n - 1 ); if ( s->invoke( f , 0, 0 , 0 , true ) ) { throw UserException(17214, (string)"reduce invoke failed: " + s->getError()); } } } if (!finalize.empty()) { s->exec( "$finalize = " + finalize , "$group finalize define" , false , true , true , 100 ); ScriptingFunction g = s->createFunction( "function(){ " " for(var i=0; i < $arr.length; i++){ " " var ret = $finalize($arr[i]); " " if (ret !== undefined) " " $arr[i] = ret; " " } " "}" ); s->invoke( g , 0, 0 , 0 , true ); } result.appendArray( "retval" , s->getObject( "$arr" ) ); result.append( "count" , keynum - 1 ); result.append( "keys" , (int)(map.size()) ); s->exec( "$arr = [];" , "$group reduce setup 2" , false , true , true , 100 ); s->gc(); return true; }
DocumentSource::GetNextResult Exchange::getNext(OperationContext* opCtx, size_t consumerId) { // Grab a lock. stdx::unique_lock<stdx::mutex> lk(_mutex); for (;;) { // Execute only in case we have not encountered an error. if (!_errorInLoadNextBatch.isOK()) { uasserted(ErrorCodes::ExchangePassthrough, "Exchange failed due to an error on different thread."); } // Check if we have a document. if (!_consumers[consumerId]->isEmpty()) { auto doc = _consumers[consumerId]->getNext(); unblockLoading(consumerId); return doc; } // There is not any document so try to load more from the source. if (_loadingThreadId == kInvalidThreadId) { LOG(3) << "A consumer " << consumerId << " begins loading"; try { // This consumer won the race and will fill the buffers. _loadingThreadId = consumerId; _pipeline->reattachToOperationContext(opCtx); // This will return when some exchange buffer is full and we cannot make any forward // progress anymore. // The return value is an index of a full consumer buffer. size_t fullConsumerId = loadNextBatch(); if (MONGO_FAIL_POINT(exchangeFailLoadNextBatch)) { log() << "exchangeFailLoadNextBatch fail point enabled."; uasserted(ErrorCodes::FailPointEnabled, "Asserting on loading the next batch due to failpoint."); } _pipeline->detachFromOperationContext(); // The loading cannot continue until the consumer with the full buffer consumes some // documents. _loadingThreadId = fullConsumerId; // Wake up everybody and try to make some progress. _haveBufferSpace.notify_all(); } catch (const DBException& ex) { _errorInLoadNextBatch = ex.toStatus(); // We have to wake up all other blocked threads so they can detect the error and // fail too. They can be woken up only after _errorInLoadNextBatch has been set. _haveBufferSpace.notify_all(); throw; } } else { // Some other consumer is already loading the buffers. There is nothing else we can do // but wait. _haveBufferSpace.wait(lk); } } }
void uasserted(int msgid , const string &msg) { uasserted(msgid, msg.c_str()); }
void WiredTigerRecoveryUnit::_txnOpen() { invariant(!_isActive(), toString(_state)); invariant(!_isCommittingOrAborting(), str::stream() << "commit or rollback handler reopened transaction: " << toString(_state)); _ensureSession(); // Only start a timer for transaction's lifetime if we're going to log it. if (shouldLog(kSlowTransactionSeverity)) { _timer.reset(new Timer()); } WT_SESSION* session = _session->getSession(); switch (_timestampReadSource) { case ReadSource::kUnset: case ReadSource::kNoTimestamp: { WiredTigerBeginTxnBlock txnOpen(session, _ignorePrepared); if (_isOplogReader) { auto status = txnOpen.setTimestamp(Timestamp(_oplogManager->getOplogReadTimestamp()), WiredTigerBeginTxnBlock::RoundToOldest::kRound); fassert(50771, status); } txnOpen.done(); break; } case ReadSource::kMajorityCommitted: { // We reset _majorityCommittedSnapshot to the actual read timestamp used when the // transaction was started. _majorityCommittedSnapshot = _sessionCache->snapshotManager().beginTransactionOnCommittedSnapshot( session, _ignorePrepared); break; } case ReadSource::kLastApplied: { if (_sessionCache->snapshotManager().getLocalSnapshot()) { _readAtTimestamp = _sessionCache->snapshotManager().beginTransactionOnLocalSnapshot( session, _ignorePrepared); } else { WiredTigerBeginTxnBlock(session, _ignorePrepared).done(); } break; } case ReadSource::kAllCommittedSnapshot: { if (_readAtTimestamp.isNull()) { _readAtTimestamp = _beginTransactionAtAllCommittedTimestamp(session); break; } // Intentionally continue to the next case to read at the _readAtTimestamp. } case ReadSource::kLastAppliedSnapshot: { // Only ever read the last applied timestamp once, and continue reusing it for // subsequent transactions. if (_readAtTimestamp.isNull()) { _readAtTimestamp = _sessionCache->snapshotManager().beginTransactionOnLocalSnapshot( session, _ignorePrepared); break; } // Intentionally continue to the next case to read at the _readAtTimestamp. } case ReadSource::kProvided: { WiredTigerBeginTxnBlock txnOpen(session, _ignorePrepared); auto status = txnOpen.setTimestamp(_readAtTimestamp); if (!status.isOK() && status.code() == ErrorCodes::BadValue) { uasserted(ErrorCodes::SnapshotTooOld, str::stream() << "Read timestamp " << _readAtTimestamp.toString() << " is older than the oldest available timestamp."); } uassertStatusOK(status); txnOpen.done(); break; } } LOG(3) << "WT begin_transaction for snapshot id " << _mySnapshotId; }
Config::Config( const string& _dbname , const BSONObj& cmdObj ) { dbname = _dbname; ns = dbname + "." + cmdObj.firstElement().valuestr(); verbose = cmdObj["verbose"].trueValue(); uassert( 13602 , "outType is no longer a valid option" , cmdObj["outType"].eoo() ); if ( cmdObj["out"].type() == String ) { finalShort = cmdObj["out"].String(); outType = REPLACE; } else if ( cmdObj["out"].type() == Object ) { BSONObj o = cmdObj["out"].embeddedObject(); BSONElement e = o.firstElement(); string t = e.fieldName(); if ( t == "normal" || t == "replace" ) { outType = REPLACE; finalShort = e.String(); } else if ( t == "merge" ) { outType = MERGE; finalShort = e.String(); } else if ( t == "reduce" ) { outType = REDUCE; finalShort = e.String(); } else if ( t == "inline" ) { outType = INMEMORY; } else { uasserted( 13522 , str::stream() << "unknown out specifier [" << t << "]" ); } if (o.hasElement("db")) { outDB = o["db"].String(); } } else { uasserted( 13606 , "'out' has to be a string or an object" ); } if ( outType != INMEMORY ) { // setup names tempLong = str::stream() << (outDB.empty() ? dbname : outDB) << ".tmp.mr." << cmdObj.firstElement().String() << "_" << finalShort << "_" << JOB_NUMBER++; incLong = tempLong + "_inc"; finalLong = str::stream() << (outDB.empty() ? dbname : outDB) << "." << finalShort; } { // scope and code if ( cmdObj["scope"].type() == Object ) scopeSetup = cmdObj["scope"].embeddedObjectUserCheck(); mapper.reset( new JSMapper( cmdObj["map"] ) ); reducer.reset( new JSReducer( cmdObj["reduce"] ) ); if ( cmdObj["finalize"].type() && cmdObj["finalize"].trueValue() ) finalizer.reset( new JSFinalizer( cmdObj["finalize"] ) ); if ( cmdObj["mapparams"].type() == Array ) { mapParams = cmdObj["mapparams"].embeddedObjectUserCheck(); } } { // query options BSONElement q = cmdObj["query"]; if ( q.type() == Object ) filter = q.embeddedObjectUserCheck(); else uassert( 13608 , "query has to be blank or an Object" , ! q.trueValue() ); BSONElement s = cmdObj["sort"]; if ( s.type() == Object ) sort = s.embeddedObjectUserCheck(); else uassert( 13609 , "sort has to be blank or an Object" , ! s.trueValue() ); if ( cmdObj["limit"].isNumber() ) limit = cmdObj["limit"].numberLong(); else limit = 0; } }
unsigned long long addExistingToIndex( OperationContext* txn, Collection* collection, const IndexDescriptor* descriptor, IndexAccessMethod* accessMethod, bool canBeKilled ) { string ns = collection->ns().ns(); // our copy for sanity bool dupsAllowed = !descriptor->unique(); bool dropDups = descriptor->dropDups(); string curopMessage; { stringstream ss; ss << "Index Build"; if ( canBeKilled ) ss << "(background)"; curopMessage = ss.str(); } ProgressMeter* progress = txn->setMessage(curopMessage.c_str(), curopMessage, collection->numRecords()); unsigned long long n = 0; unsigned long long numDropped = 0; auto_ptr<Runner> runner(InternalPlanner::collectionScan(ns,collection)); std::string idxName = descriptor->indexName(); // After this yields in the loop, idx may point at a different index (if indexes get // flipped, see insert_makeIndex) or even an empty IndexDetails, so nothing below should // depend on idx. idxNo should be recalculated after each yield. BSONObj js; DiskLoc loc; while (Runner::RUNNER_ADVANCED == runner->getNext(&js, &loc)) { try { if ( !dupsAllowed && dropDups ) { LastError::Disabled led( lastError.get() ); addKeysToIndex(txn, collection, descriptor, accessMethod, js, loc); } else { addKeysToIndex(txn, collection, descriptor, accessMethod, js, loc); } } catch( AssertionException& e ) { if (ErrorCodes::isInterruption(DBException::convertExceptionCode(e.getCode()))) { txn->checkForInterrupt(); } // TODO: Does exception really imply dropDups exception? if (dropDups) { bool runnerEOF = runner->isEOF(); runner->saveState(); BSONObj toDelete; collection->deleteDocument( txn, loc, false, true, &toDelete ); repl::logOp(txn, "d", ns.c_str(), toDelete); if (!runner->restoreState(txn)) { // Runner got killed somehow. This probably shouldn't happen. if (runnerEOF) { // Quote: "We were already at the end. Normal. // TODO: Why is this normal? } else { uasserted(ErrorCodes::CursorNotFound, "cursor gone during bg index; dropDups"); } break; } // We deleted a record, but we didn't actually yield the dblock. // TODO: Why did the old code assume we yielded the lock? numDropped++; } else { log() << "background addExistingToIndex exception " << e.what() << endl; throw; } } n++; progress->hit(); txn->recoveryUnit()->commitIfNeeded(); if (canBeKilled) { // Checking for interrupt here is necessary because the bg index // interruptors can only interrupt this index build while they hold // a write lock, and yieldAndCheckIfOK only checks for // interrupt prior to yielding our write lock. We need to check the kill flag // here before another iteration of the loop. txn->checkForInterrupt(); } progress->setTotalWhileRunning( collection->numRecords() ); } progress->finished(); if ( dropDups && numDropped ) log() << "\t index build dropped: " << numDropped << " dups"; return n; }
UpdateNode::ApplyResult RenameNode::apply(ApplyParams applyParams) const { // It would make sense to store fromFieldRef and toFieldRef as members during // RenameNode::init(), but FieldRef is not copyable. auto fromFieldRef = std::make_shared<FieldRef>(_val.fieldName()); FieldRef toFieldRef(_val.valueStringData()); mutablebson::Document& document = applyParams.element.getDocument(); size_t fromIdxFound; mutablebson::Element fromElement(document.end()); auto status = pathsupport::findLongestPrefix(*fromFieldRef, document.root(), &fromIdxFound, &fromElement); if (!status.isOK() || !fromElement.ok() || fromIdxFound != (fromFieldRef->numParts() - 1)) { // We could safely remove this restriction (thereby treating a rename with a non-viable // source path as a no-op), but most updates fail on an attempt to update a non-viable path, // so we throw an error for consistency. if (status == ErrorCodes::PathNotViable) { uassertStatusOK(status); MONGO_UNREACHABLE; // The previous uassertStatusOK should always throw. } // The element we want to rename does not exist. When that happens, we treat the operation // as a no-op. The attempted from/to paths are still considered modified. if (applyParams.modifiedPaths) { applyParams.modifiedPaths->keepShortest(*fromFieldRef); applyParams.modifiedPaths->keepShortest(toFieldRef); } return ApplyResult::noopResult(); } // Renaming through an array is prohibited. Check that our source path does not contain an // array. (The element being renamed may be an array, however.) for (auto currentElement = fromElement.parent(); currentElement != document.root(); currentElement = currentElement.parent()) { invariant(currentElement.ok()); if (BSONType::Array == currentElement.getType()) { auto idElem = mutablebson::findFirstChildNamed(document.root(), "_id"); uasserted(ErrorCodes::BadValue, str::stream() << "The source field cannot be an array element, '" << fromFieldRef->dottedField() << "' in doc with " << (idElem.ok() ? idElem.toString() : "no id") << " has an array field called '" << currentElement.getFieldName() << "'"); } } // Check that our destination path does not contain an array. (If the rename will overwrite an // existing element, that element may be an array. Iff pathToCreate is empty, "element" // represents an element that we are going to overwrite.) for (auto currentElement = applyParams.pathToCreate->empty() ? applyParams.element.parent() : applyParams.element; currentElement != document.root(); currentElement = currentElement.parent()) { invariant(currentElement.ok()); if (BSONType::Array == currentElement.getType()) { auto idElem = mutablebson::findFirstChildNamed(document.root(), "_id"); uasserted(ErrorCodes::BadValue, str::stream() << "The destination field cannot be an array element, '" << toFieldRef.dottedField() << "' in doc with " << (idElem.ok() ? idElem.toString() : "no id") << " has an array field called '" << currentElement.getFieldName() << "'"); } } // Once we've determined that the rename is valid and found the source element, the actual work // gets broken out into a $set operation and an $unset operation. Note that, generally, we // should call the init() method of a ModifierNode before calling its apply() method, but the // init() methods of SetElementNode and UnsetNode don't do anything, so we can skip them. SetElementNode setElement(fromElement); auto setElementApplyResult = setElement.apply(applyParams); ApplyParams unsetParams(applyParams); unsetParams.element = fromElement; unsetParams.pathToCreate = std::make_shared<FieldRef>(); unsetParams.pathTaken = fromFieldRef; UnsetNode unsetElement; auto unsetElementApplyResult = unsetElement.apply(unsetParams); // Determine the final result based on the results of the $set and $unset. ApplyResult applyResult; applyResult.indexesAffected = setElementApplyResult.indexesAffected || unsetElementApplyResult.indexesAffected; // The $unset would only be a no-op if the source element did not exist, in which case we would // have exited early with a no-op result. invariant(!unsetElementApplyResult.noop); return applyResult; }
OpMsg OpMsg::parse(const Message& message) try { // It is the caller's responsibility to call the correct parser for a given message type. invariant(!message.empty()); invariant(message.operation() == dbMsg); const uint32_t flags = OpMsg::flags(message); uassert(ErrorCodes::IllegalOpMsgFlag, str::stream() << "Message contains illegal flags value: Ob" << std::bitset<32>(flags).to_string(), !containsUnknownRequiredFlags(flags)); constexpr int kCrc32Size = 4; const bool haveChecksum = flags & kChecksumPresent; const int checksumSize = haveChecksum ? kCrc32Size : 0; // The sections begin after the flags and before the checksum (if present). BufReader sectionsBuf(message.singleData().data() + sizeof(flags), message.dataSize() - sizeof(flags) - checksumSize); // TODO some validation may make more sense in the IDL parser. I've tagged them with comments. bool haveBody = false; OpMsg msg; while (!sectionsBuf.atEof()) { const auto sectionKind = sectionsBuf.read<Section>(); switch (sectionKind) { case Section::kBody: { uassert(40430, "Multiple body sections in message", !haveBody); haveBody = true; msg.body = sectionsBuf.read<Validated<BSONObj>>(); break; } case Section::kDocSequence: { // We use an O(N^2) algorithm here and an O(N*M) algorithm below. These are fastest // for the current small values of N, but would be problematic if it is large. // If we need more document sequences, raise the limit and use a better algorithm. uassert(ErrorCodes::TooManyDocumentSequences, "Too many document sequences in OP_MSG", msg.sequences.size() < 2); // Limit is <=2 since we are about to add one. // The first 4 bytes are the total size, including themselves. const auto remainingSize = sectionsBuf.read<LittleEndian<int32_t>>() - sizeof(int32_t); BufReader seqBuf(sectionsBuf.skip(remainingSize), remainingSize); const auto name = seqBuf.readCStr(); uassert(40431, str::stream() << "Duplicate document sequence: " << name, !msg.getSequence(name)); // TODO IDL msg.sequences.push_back({name.toString()}); while (!seqBuf.atEof()) { msg.sequences.back().objs.push_back(seqBuf.read<Validated<BSONObj>>()); } break; } default: // Using uint32_t so we append as a decimal number rather than as a char. uasserted(40432, str::stream() << "Unknown section kind " << uint32_t(sectionKind)); } } uassert(40587, "OP_MSG messages must have a body", haveBody); // Detect duplicates between doc sequences and body. TODO IDL // Technically this is O(N*M) but N is at most 2. for (const auto& docSeq : msg.sequences) { const char* name = docSeq.name.c_str(); // Pointer is redirected by next call. auto inBody = !dotted_path_support::extractElementAtPathOrArrayAlongPath(msg.body, name).eoo(); uassert(40433, str::stream() << "Duplicate field between body and document sequence " << docSeq.name, !inBody); } return msg; } catch (const DBException& ex) { LOG(1) << "invalid message: " << ex.code() << " " << redact(ex) << " -- " << redact(hexdump(message.singleData().view2ptr(), message.size())); throw; }
UpdateResult update(const UpdateRequest& request, OpDebug* opDebug, UpdateDriver* driver) { LOG(3) << "processing update : " << request; const NamespaceString& nsString = request.getNamespaceString(); validateUpdate( nsString.ns().c_str(), request.getUpdates(), request.getQuery() ); NamespaceDetails* nsDetails = nsdetails( nsString.ns() ); NamespaceDetailsTransient* nsDetailsTransient = &NamespaceDetailsTransient::get( nsString.ns().c_str() ); // TODO: This seems a bit circuitious. opDebug->updateobj = request.getUpdates(); driver->refreshIndexKeys( nsDetailsTransient->indexKeys() ); shared_ptr<Cursor> cursor = getOptimizedCursor( nsString.ns(), request.getQuery(), BSONObj(), request.getQueryPlanSelectionPolicy() ); // If the update was marked with '$isolated' (a.k.a '$atomic'), we are not allowed to // yield while evaluating the update loop below. // // TODO: Old code checks this repeatedly within the update loop. Is that necessary? It seems // that once atomic should be always atomic. const bool isolated = cursor->ok() && cursor->matcher() && cursor->matcher()->docMatcher().atomic(); // The 'cursor' the optimizer gave us may contain query plans that generate duplicate // diskloc's. We set up here the mechanims that will prevent us from processing those // twice if we see them. We also set up a 'ClientCursor' so that we can support // yielding. // // TODO: Is it valid to call this on a non-ok cursor? const bool dedupHere = cursor->autoDedup(); // // We'll start assuming we have one or more documents for this update. (Othwerwise, // we'll fallback to upserting.) // // We record that this will not be an upsert, in case a mod doesn't want to be applied // when in strict update mode. driver->setContext( ModifierInterface::ExecInfo::UPDATE_CONTEXT ); // Let's fetch each of them and pipe them through the update expression, making sure to // keep track of the necessary stats. Recall that we'll be pulling documents out of // cursors and some of them do not deduplicate the entries they generate. We have // deduping logic in here, too -- for now. unordered_set<DiskLoc, DiskLoc::Hasher> seenLocs; int numMatched = 0; // Reset these counters on each call. We might re-enter this function to retry this // update if we throw a page fault exception below, and we rely on these counters // reflecting only the actions taken locally. In particlar, we must have the no-op // counter reset so that we can meaningfully comapre it with numMatched above. opDebug->nscanned = 0; opDebug->nupdateNoops = 0; Client& client = cc(); mutablebson::Document doc; mutablebson::DamageVector damages; // If we are going to be yielding, we will need a ClientCursor scoped to this loop. We // only loop as long as the underlying cursor is OK. for ( auto_ptr<ClientCursor> clientCursor; cursor->ok(); ) { // If we haven't constructed a ClientCursor, and if the client allows us to throw // page faults, and if we are referring to a location that is likely not in // physical memory, then throw a PageFaultException. The entire operation will be // restarted. if ( clientCursor.get() == NULL && client.allowedToThrowPageFaultException() && !cursor->currLoc().isNull() && !cursor->currLoc().rec()->likelyInPhysicalMemory() ) { // We should never throw a PFE if we have already updated items. The numMatched // variable includes no-ops, which do not prevent us from raising a PFE, so if // numMatched is non-zero, we are still OK to throw as long all matched items // resulted in a no-op. dassert((numMatched == 0) || (numMatched == opDebug->nupdateNoops)); throw PageFaultException( cursor->currLoc().rec() ); } if ( !isolated && opDebug->nscanned != 0 ) { // We are permitted to yield. To do so we need a ClientCursor, so create one // now if we have not yet done so. if ( !clientCursor.get() ) clientCursor.reset( new ClientCursor( QueryOption_NoCursorTimeout, cursor, nsString.ns() ) ); // Ask the client cursor to yield. We get two bits of state back: whether or not // we yielded, and whether or not we correctly recovered from yielding. bool yielded = false; const bool recovered = clientCursor->yieldSometimes( ClientCursor::WillNeed, &yielded ); if ( !recovered ) { // If we failed to recover from the yield, then the ClientCursor is already // gone. Release it so we don't destroy it a second time. clientCursor.release(); break; } if ( !cursor->ok() ) { // If the cursor died while we were yielded, just get out of the update loop. break; } if ( yielded ) { // We yielded and recovered OK, and our cursor is still good. Details about // our namespace may have changed while we were yielded, so we re-acquire // them here. If we can't do so, escape the update loop. Otherwise, refresh // the driver so that it knows about what is currently indexed. nsDetails = nsdetails( nsString.ns() ); if ( !nsDetails ) break; nsDetailsTransient = &NamespaceDetailsTransient::get( nsString.ns().c_str() ); // TODO: This copies the index keys, but it may not need to do so. driver->refreshIndexKeys( nsDetailsTransient->indexKeys() ); } } // Let's fetch the next candidate object for this update. Record* record = cursor->_current(); DiskLoc loc = cursor->currLoc(); const BSONObj oldObj = loc.obj(); // We count how many documents we scanned even though we may skip those that are // deemed duplicated. The final 'numUpdated' and 'nscanned' numbers may differ for // that reason. opDebug->nscanned++; // Skips this document if it: // a) doesn't match the query portion of the update // b) was deemed duplicate by the underlying cursor machinery // // Now, if we are going to update the document, // c) we don't want to do so while the cursor is at it, as that may invalidate // the cursor. So, we advance to next document, before issuing the update. MatchDetails matchDetails; matchDetails.requestElemMatchKey(); if ( !cursor->currentMatches( &matchDetails ) ) { // a) cursor->advance(); continue; } else if ( cursor->getsetdup( loc ) && dedupHere ) { // b) cursor->advance(); continue; } else if (!driver->isDocReplacement() && request.isMulti()) { // c) cursor->advance(); if ( dedupHere ) { if ( seenLocs.count( loc ) ) { continue; } } // There are certain kind of cursors that hold multiple pointers to data // underneath. $or cursors is one example. In a $or cursor, it may be the case // that when we did the last advance(), we finished consuming documents from // one of $or child and started consuming the next one. In that case, it is // possible that the last document of the previous child is the same as the // first document of the next (see SERVER-5198 and jstests/orp.js). // // So we advance the cursor here until we see a new diskloc. // // Note that we won't be yielding, and we may not do so for a while if we find // a particularly duplicated sequence of loc's. That is highly unlikely, // though. (See SERVER-5725, if curious, but "stage" based $or will make that // ticket moot). while( cursor->ok() && loc == cursor->currLoc() ) { cursor->advance(); } } // For some (unfortunate) historical reasons, not all cursors would be valid after // a write simply because we advanced them to a document not affected by the write. // To protect in those cases, not only we engaged in the advance() logic above, but // we also tell the cursor we're about to write a document that we've just seen. // prepareToTouchEarlierIterate() requires calling later // recoverFromTouchingEarlierIterate(), so we make a note here to do so. bool touchPreviousDoc = request.isMulti() && cursor->ok(); if ( touchPreviousDoc ) { if ( clientCursor.get() ) clientCursor->setDoingDeletes( true ); cursor->prepareToTouchEarlierIterate(); } // Found a matching document numMatched++; // Ask the driver to apply the mods. It may be that the driver can apply those "in // place", that is, some values of the old document just get adjusted without any // change to the binary layout on the bson layer. It may be that a whole new // document is needed to accomodate the new bson layout of the resulting document. doc.reset( oldObj, mutablebson::Document::kInPlaceEnabled ); BSONObj logObj; // If there was a matched field, obtain it. string matchedField; if (matchDetails.hasElemMatchKey()) matchedField = matchDetails.elemMatchKey(); Status status = driver->update( matchedField, &doc, &logObj ); if ( !status.isOK() ) { uasserted( 16837, status.reason() ); } // If the driver applied the mods in place, we can ask the mutable for what // changed. We call those changes "damages". :) We use the damages to inform the // journal what was changed, and then apply them to the original document // ourselves. If, however, the driver applied the mods out of place, we ask it to // generate a new, modified document for us. In that case, the file manager will // take care of the journaling details for us. // // This code flow is admittedly odd. But, right now, journaling is baked in the file // manager. And if we aren't using the file manager, we have to do jounaling // ourselves. bool objectWasChanged = false; BSONObj newObj; const char* source = NULL; bool inPlace = doc.getInPlaceUpdates(&damages, &source); if ( inPlace && !driver->modsAffectIndices() ) { // If a set of modifiers were all no-ops, we are still 'in place', but there is // no work to do, in which case we want to consider the object unchanged. if (!damages.empty() ) { nsDetails->paddingFits(); // All updates were in place. Apply them via durability and writing pointer. mutablebson::DamageVector::const_iterator where = damages.begin(); const mutablebson::DamageVector::const_iterator end = damages.end(); for( ; where != end; ++where ) { const char* sourcePtr = source + where->sourceOffset; void* targetPtr = getDur().writingPtr( const_cast<char*>(oldObj.objdata()) + where->targetOffset, where->size); std::memcpy(targetPtr, sourcePtr, where->size); } objectWasChanged = true; opDebug->fastmod = true; } newObj = oldObj; } else { // The updates were not in place. Apply them through the file manager. newObj = doc.getObject(); DiskLoc newLoc = theDataFileMgr.updateRecord(nsString.ns().c_str(), nsDetails, nsDetailsTransient, record, loc, newObj.objdata(), newObj.objsize(), *opDebug); // If we've moved this object to a new location, make sure we don't apply // that update again if our traversal picks the objecta again. // // We also take note that the diskloc if the updates are affecting indices. // Chances are that we're traversing one of them and they may be multi key and // therefore duplicate disklocs. if ( newLoc != loc || driver->modsAffectIndices() ) { seenLocs.insert( newLoc ); } objectWasChanged = true; } // Log Obj if ( request.shouldUpdateOpLog() ) { if ( driver->isDocReplacement() || !logObj.isEmpty() ) { BSONObj idQuery = driver->makeOplogEntryQuery(newObj, request.isMulti()); logOp("u", nsString.ns().c_str(), logObj , &idQuery, NULL, request.isFromMigration(), &newObj); } } // If it was noop since the document didn't change, record that. if (!objectWasChanged) opDebug->nupdateNoops++; if (!request.isMulti()) { break; } // If we used the cursor mechanism that prepares an earlier seen document for a // write we need to tell such mechanisms that the write is over. if ( touchPreviousDoc ) { cursor->recoverFromTouchingEarlierIterate(); } getDur().commitIfNeeded(); } // TODO: Can this be simplified? if ((numMatched > 0) || (numMatched == 0 && !request.isUpsert()) ) { opDebug->nupdated = numMatched; return UpdateResult( numMatched > 0 /* updated existing object(s) */, !driver->isDocReplacement() /* $mod or obj replacement */, numMatched /* # of docments update, even no-ops */, BSONObj() ); } // // We haven't found any existing document so an insert is done // (upsert is true). // opDebug->upsert = true; // Since this is an insert (no docs found and upsert:true), we will be logging it // as an insert in the oplog. We don't need the driver's help to build the // oplog record, then. We also set the context of the update driver to the INSERT_CONTEXT. // Some mods may only work in that context (e.g. $setOnInsert). driver->setLogOp( false ); driver->setContext( ModifierInterface::ExecInfo::INSERT_CONTEXT ); BSONObj baseObj; // Reset the document we will be writing to doc.reset( baseObj, mutablebson::Document::kInPlaceDisabled ); if ( request.getQuery().hasElement("_id") ) { uassertStatusOK(doc.root().appendElement(request.getQuery().getField("_id"))); } // If this is a $mod base update, we need to generate a document by examining the // query and the mods. Otherwise, we can use the object replacement sent by the user // update command that was parsed by the driver before. // In the following block we handle the query part, and then do the regular mods after. if ( *request.getUpdates().firstElementFieldName() == '$' ) { uassertStatusOK(UpdateDriver::createFromQuery(request.getQuery(), doc)); opDebug->fastmodinsert = true; } // Apply the update modifications and then log the update as an insert manually. Status status = driver->update( StringData(), &doc, NULL /* no oplog record */); if ( !status.isOK() ) { uasserted( 16836, status.reason() ); } BSONObj newObj = doc.getObject(); theDataFileMgr.insertWithObjMod( nsString.ns().c_str(), newObj, false, request.isGod() ); if ( request.shouldUpdateOpLog() ) { logOp( "i", nsString.ns().c_str(), newObj, NULL, NULL, request.isFromMigration(), &newObj ); } opDebug->nupdated = 1; return UpdateResult( false /* updated a non existing document */, !driver->isDocReplacement() /* $mod or obj replacement? */, 1 /* count of updated documents */, newObj /* object that was upserted */ ); }
void _insert( Request& r , DbMessage& d, ChunkManagerPtr manager ) { const int flags = d.reservedField() | InsertOption_ContinueOnError; // ContinueOnError is always on when using sharding. map<ChunkPtr, vector<BSONObj> > insertsForChunk; // Group bulk insert for appropriate shards try { while ( d.moreJSObjs() ) { BSONObj o = d.nextJsObj(); if ( ! manager->hasShardKey( o ) ) { bool bad = true; if ( manager->getShardKey().partOfShardKey( "_id" ) ) { BSONObjBuilder b; b.appendOID( "_id" , 0 , true ); b.appendElements( o ); o = b.obj(); bad = ! manager->hasShardKey( o ); } if ( bad ) { log() << "tried to insert object with no valid shard key: " << r.getns() << " " << o << endl; uasserted( 8011 , "tried to insert object with no valid shard key" ); } } // Many operations benefit from having the shard key early in the object o = manager->getShardKey().moveToFront(o); insertsForChunk[manager->findChunk(o)].push_back(o); } for (map<ChunkPtr, vector<BSONObj> >::iterator it = insertsForChunk.begin(); it != insertsForChunk.end(); ++it) { ChunkPtr c = it->first; vector<BSONObj> objs = it->second; const int maxTries = 30; bool gotThrough = false; for ( int i=0; i<maxTries; i++ ) { try { LOG(4) << " server:" << c->getShard().toString() << " bulk insert " << objs.size() << " documents" << endl; insert( c->getShard() , r.getns() , objs , flags); int bytesWritten = 0; for (vector<BSONObj>::iterator vecIt = objs.begin(); vecIt != objs.end(); ++vecIt) { r.gotInsert(); // Record the correct number of individual inserts bytesWritten += (*vecIt).objsize(); } if ( r.getClientInfo()->autoSplitOk() ) c->splitIfShould( bytesWritten ); gotThrough = true; break; } catch ( StaleConfigException& e ) { int logLevel = i < ( maxTries / 2 ); LOG( logLevel ) << "retrying bulk insert of " << objs.size() << " documents because of StaleConfigException: " << e << endl; r.reset(); manager = r.getChunkManager(); if( ! manager ) { uasserted(14804, "collection no longer sharded"); } unsigned long long old = manager->getSequenceNumber(); LOG( logLevel ) << " sequence number - old: " << old << " new: " << manager->getSequenceNumber() << endl; } sleepmillis( i * 20 ); } assert( inShutdown() || gotThrough ); // not caught below } } catch (const UserException&){ if (!d.moreJSObjs()){ throw; } // Ignore and keep going. ContinueOnError is implied with sharding. } }
MigrationSourceManager::MigrationSourceManager(OperationContext* txn, MoveChunkRequest request) : _args(std::move(request)), _startTime() { invariant(!txn->lockState()->isLocked()); const auto& oss = OperationShardingState::get(txn); if (!oss.hasShardVersion()) { uasserted(ErrorCodes::InvalidOptions, "collection version is missing"); } // Even though the moveChunk command transmits a value in the operation's shardVersion field, // this value does not actually contain the shard version, but the global collection version. const ChunkVersion expectedCollectionVersion = oss.getShardVersion(_args.getNss()); log() << "Starting chunk migration for " << ChunkRange(_args.getMinKey(), _args.getMaxKey()).toString() << " with expected collection version " << expectedCollectionVersion; // Now that the collection is locked, snapshot the metadata and fetch the latest versions ShardingState* const shardingState = ShardingState::get(txn); ChunkVersion shardVersion; Status refreshStatus = shardingState->refreshMetadataNow(txn, _args.getNss().ns(), &shardVersion); if (!refreshStatus.isOK()) { uasserted(refreshStatus.code(), str::stream() << "cannot start migrate of chunk " << ChunkRange(_args.getMinKey(), _args.getMaxKey()).toString() << " due to " << refreshStatus.toString()); } if (shardVersion.majorVersion() == 0) { // If the major version is zero, this means we do not have any chunks locally to migrate in // the first place uasserted(ErrorCodes::IncompatibleShardingMetadata, str::stream() << "cannot start migrate of chunk " << ChunkRange(_args.getMinKey(), _args.getMaxKey()).toString() << " with zero shard version"); } // Snapshot the committed metadata from the time the migration starts { ScopedTransaction scopedXact(txn, MODE_IS); AutoGetCollection autoColl(txn, _args.getNss(), MODE_IS); auto css = CollectionShardingState::get(txn, _args.getNss()); _committedMetadata = css->getMetadata(); } const ChunkVersion collectionVersion = _committedMetadata->getCollVersion(); if (expectedCollectionVersion.epoch() != collectionVersion.epoch()) { throw SendStaleConfigException( _args.getNss().ns(), str::stream() << "cannot move chunk " << ChunkRange(_args.getMinKey(), _args.getMaxKey()).toString() << " because collection may have been dropped. " << "current epoch: " << collectionVersion.epoch() << ", cmd epoch: " << expectedCollectionVersion.epoch(), expectedCollectionVersion, collectionVersion); } // With nonzero shard version, we must have a coll version >= our shard version invariant(collectionVersion >= shardVersion); // With nonzero shard version, we must have a shard key invariant(!_committedMetadata->getKeyPattern().isEmpty()); ChunkType origChunk; if (!_committedMetadata->getNextChunk(_args.getMinKey(), &origChunk)) { // If this assertion is hit, it means that whoever called the shard moveChunk command // (mongos or the CSRS balancer) did not check whether the chunk actually belongs to this // shard. It is a benign error and does not indicate data corruption. uasserted(40145, str::stream() << "Chunk with bounds " << ChunkRange(_args.getMinKey(), _args.getMaxKey()).toString() << " is not owned by this shard."); } uassert(40146, str::stream() << "Unable to find chunk with the exact bounds " << ChunkRange(_args.getMinKey(), _args.getMaxKey()).toString() << " at collection version " << collectionVersion.toString() << ". This indicates corrupted metadata.", origChunk.getMin().woCompare(_args.getMinKey()) == 0 && origChunk.getMax().woCompare(_args.getMaxKey()) == 0); }
UpdateNode::ApplyResult UpdateArrayNode::apply(ApplyParams applyParams) const { if (!applyParams.pathToCreate->empty()) { for (size_t i = 0; i < applyParams.pathToCreate->numParts(); ++i) { applyParams.pathTaken->appendPart(applyParams.pathToCreate->getPart(i)); } uasserted(ErrorCodes::BadValue, str::stream() << "The path '" << applyParams.pathTaken->dottedField() << "' must exist in the document in order to apply array updates."); } uassert(ErrorCodes::BadValue, str::stream() << "Cannot apply array updates to non-array element " << applyParams.element.toString(), applyParams.element.getType() == BSONType::Array); // Construct a map from the array index to the set of updates that should be applied to the // array element at that index. We do not apply the updates yet because we need to know how many // array elements will be updated in order to know whether to pass 'logBuilder' on to the // UpdateNode children. std::map<size_t, std::vector<UpdateNode*>> matchingElements; size_t i = 0; for (auto childElement = applyParams.element.leftChild(); childElement.ok(); childElement = childElement.rightSibling()) { // 'childElement' will always be serialized because no updates have been performed on the // array yet, and when we populate an upserted document with equality fields from the query, // arrays can only be added in entirety. invariant(childElement.hasValue()); auto arrayElement = childElement.getValue(); for (const auto& update : _children) { if (update.first.empty()) { // If the identifier is the empty string (e.g. came from 'a.$[].b'), the update // should be applied to all array elements. matchingElements[i].push_back(update.second.get()); } else { auto filter = _arrayFilters.find(update.first); invariant(filter != _arrayFilters.end()); if (filter->second->matchesBSONElement(arrayElement)) { matchingElements[i].push_back(update.second.get()); } } } ++i; } // If at most one array element will be updated, pass 'logBuilder' to the UpdateNode child when // applying it to that element. const bool childrenShouldLogThemselves = matchingElements.size() <= 1; // Keep track of which array elements were actually modified (non-noop updates) for logging // purposes. We only need to keep track of one element, since if more than one element is // modified, we log the whole array. boost::optional<mutablebson::Element> modifiedElement; size_t nModified = 0; // Update array elements. auto applyResult = ApplyResult::noopResult(); i = 0; for (auto childElement = applyParams.element.leftChild(); childElement.ok(); childElement = childElement.rightSibling()) { auto updates = matchingElements.find(i); if (updates != matchingElements.end()) { // Merge all of the updates for this array element. invariant(updates->second.size() > 0); auto mergedChild = updates->second[0]; FieldRefTempAppend tempAppend(*(applyParams.pathTaken), childElement.getFieldName()); for (size_t j = 1; j < updates->second.size(); ++j) { // Use the cached merge result, if it is available. const auto& cachedResult = _mergedChildrenCache[mergedChild][updates->second[j]]; if (cachedResult.get()) { mergedChild = cachedResult.get(); continue; } // The cached merge result is not available, so perform the merge and cache the // result. _mergedChildrenCache[mergedChild][updates->second[j]] = UpdateNode::createUpdateNodeByMerging( *mergedChild, *updates->second[j], applyParams.pathTaken.get()); mergedChild = _mergedChildrenCache[mergedChild][updates->second[j]].get(); } auto childApplyParams = applyParams; childApplyParams.element = childElement; if (!childrenShouldLogThemselves) { childApplyParams.logBuilder = nullptr; } auto childApplyResult = mergedChild->apply(childApplyParams); applyResult.indexesAffected = applyResult.indexesAffected || childApplyResult.indexesAffected; applyResult.noop = applyResult.noop && childApplyResult.noop; if (!childApplyResult.noop) { modifiedElement = childElement; ++nModified; } } ++i; } // If no elements match the array filter, report the path to the array itself as modified. if (applyParams.modifiedPaths && matchingElements.size() == 0) { applyParams.modifiedPaths->keepShortest(*applyParams.pathTaken); } // If the child updates have not been logged, log the updated array elements. if (!childrenShouldLogThemselves && applyParams.logBuilder) { if (nModified > 1) { // Log the entire array. auto logElement = applyParams.logBuilder->getDocument().makeElementWithNewFieldName( applyParams.pathTaken->dottedField(), applyParams.element); invariant(logElement.ok()); uassertStatusOK(applyParams.logBuilder->addToSets(logElement)); } else if (nModified == 1) { // Log the modified array element. invariant(modifiedElement); FieldRefTempAppend tempAppend(*(applyParams.pathTaken), modifiedElement->getFieldName()); auto logElement = applyParams.logBuilder->getDocument().makeElementWithNewFieldName( applyParams.pathTaken->dottedField(), *modifiedElement); invariant(logElement.ok()); uassertStatusOK(applyParams.logBuilder->addToSets(logElement)); } } return applyResult; }
static void assertParallelArrays(const char* first, const char* second) { std::stringstream ss; ss << "cannot index parallel arrays [" << first << "] [" << second << "]"; uasserted(ErrorCodes::CannotIndexParallelArrays, ss.str()); }
std::string runQuery(OperationContext* txn, QueryMessage& q, const NamespaceString& nss, CurOp& curop, Message &result) { // Validate the namespace. uassert(16256, str::stream() << "Invalid ns [" << nss.ns() << "]", nss.isValid()); invariant(!nss.isCommand()); // Set curop information. beginQueryOp(nss, q.query, q.ntoreturn, q.ntoskip, &curop); // Parse the qm into a CanonicalQuery. std::auto_ptr<CanonicalQuery> cq; { CanonicalQuery* cqRaw; Status canonStatus = CanonicalQuery::canonicalize(q, &cqRaw, WhereCallbackReal(txn, nss.db())); if (!canonStatus.isOK()) { uasserted(17287, str::stream() << "Can't canonicalize query: " << canonStatus.toString()); } cq.reset(cqRaw); } invariant(cq.get()); LOG(5) << "Running query:\n" << cq->toString(); LOG(2) << "Running query: " << cq->toStringShort(); // Parse, canonicalize, plan, transcribe, and get a plan executor. AutoGetCollectionForRead ctx(txn, nss); Collection* collection = ctx.getCollection(); const int dbProfilingLevel = ctx.getDb() ? ctx.getDb()->getProfilingLevel() : serverGlobalParams.defaultProfile; // We have a parsed query. Time to get the execution plan for it. std::unique_ptr<PlanExecutor> exec; { PlanExecutor* rawExec; Status execStatus = getExecutorFind(txn, collection, nss, cq.release(), PlanExecutor::YIELD_AUTO, &rawExec); uassertStatusOK(execStatus); exec.reset(rawExec); } const LiteParsedQuery& pq = exec->getCanonicalQuery()->getParsed(); // If it's actually an explain, do the explain and return rather than falling through // to the normal query execution loop. if (pq.isExplain()) { BufBuilder bb; bb.skip(sizeof(QueryResult::Value)); BSONObjBuilder explainBob; Explain::explainStages(exec.get(), ExplainCommon::EXEC_ALL_PLANS, &explainBob); // Add the resulting object to the return buffer. BSONObj explainObj = explainBob.obj(); bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize()); // TODO: Does this get overwritten/do we really need to set this twice? curop.debug().query = q.query; // Set query result fields. QueryResult::View qr = bb.buf(); bb.decouple(); qr.setResultFlagsToOk(); qr.msgdata().setLen(bb.len()); curop.debug().responseLength = bb.len(); qr.msgdata().setOperation(opReply); qr.setCursorId(0); qr.setStartingFrom(0); qr.setNReturned(1); result.setData(qr.view2ptr(), true); return ""; } // We freak out later if this changes before we're done with the query. const ChunkVersion shardingVersionAtStart = shardingState.getVersion(nss.ns()); // Handle query option $maxTimeMS (not used with commands). curop.setMaxTimeMicros(static_cast<unsigned long long>(pq.getMaxTimeMS()) * 1000); txn->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point. // uassert if we are not on a primary, and not a secondary with SlaveOk query parameter set. bool slaveOK = pq.isSlaveOk() || pq.hasReadPref(); Status serveReadsStatus = repl::getGlobalReplicationCoordinator()->checkCanServeReadsFor( txn, nss, slaveOK); uassertStatusOK(serveReadsStatus); // Run the query. // bb is used to hold query results // this buffer should contain either requested documents per query or // explain information, but not both BufBuilder bb(32768); bb.skip(sizeof(QueryResult::Value)); // How many results have we obtained from the executor? int numResults = 0; // If we're replaying the oplog, we save the last time that we read. Timestamp slaveReadTill; BSONObj obj; PlanExecutor::ExecState state; // uint64_t numMisplacedDocs = 0; // Get summary info about which plan the executor is using. curop.debug().planSummary = Explain::getPlanSummary(exec.get()); while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) { // Add result to output buffer. bb.appendBuf((void*)obj.objdata(), obj.objsize()); // Count the result. ++numResults; // Possibly note slave's position in the oplog. if (pq.isOplogReplay()) { BSONElement e = obj["ts"]; if (Date == e.type() || bsonTimestamp == e.type()) { slaveReadTill = e.timestamp(); } } if (enoughForFirstBatch(pq, numResults, bb.len())) { LOG(5) << "Enough for first batch, wantMore=" << pq.wantMore() << " numToReturn=" << pq.getNumToReturn() << " numResults=" << numResults << endl; break; } } // If we cache the executor later, we want to deregister it as it receives notifications // anyway by virtue of being cached. // // If we don't cache the executor later, we are deleting it, so it must be deregistered. // // So, no matter what, deregister the executor. exec->deregisterExec(); // Caller expects exceptions thrown in certain cases. if (PlanExecutor::FAILURE == state) { scoped_ptr<PlanStageStats> stats(exec->getStats()); error() << "Plan executor error, stats: " << Explain::statsToBSON(*stats); uasserted(17144, "Executor error: " + WorkingSetCommon::toStatusString(obj)); } // TODO: Currently, chunk ranges are kept around until all ClientCursors created while the // chunk belonged on this node are gone. Separating chunk lifetime management from // ClientCursor should allow this check to go away. if (!shardingState.getVersion(nss.ns()).isWriteCompatibleWith(shardingVersionAtStart)) { // if the version changed during the query we might be missing some data and its safe to // send this as mongos can resend at this point throw SendStaleConfigException(nss.ns(), "version changed during initial query", shardingVersionAtStart, shardingState.getVersion(nss.ns())); } // Fill out curop based on query results. If we have a cursorid, we will fill out curop with // this cursorid later. long long ccId = 0; if (shouldSaveCursor(txn, collection, state, exec.get())) { // We won't use the executor until it's getMore'd. exec->saveState(); // Allocate a new ClientCursor. We don't have to worry about leaking it as it's // inserted into a global map by its ctor. ClientCursor* cc = new ClientCursor(collection->getCursorManager(), exec.release(), nss.ns(), pq.getOptions(), pq.getFilter()); ccId = cc->cursorid(); if (txn->getClient()->isInDirectClient()) { cc->setUnownedRecoveryUnit(txn->recoveryUnit()); } else if (state == PlanExecutor::IS_EOF && pq.isTailable()) { // Don't stash the RU for tailable cursors at EOF, let them get a new RU on their // next getMore. } else { // We stash away the RecoveryUnit in the ClientCursor. It's used for subsequent // getMore requests. The calling OpCtx gets a fresh RecoveryUnit. txn->recoveryUnit()->abandonSnapshot(); cc->setOwnedRecoveryUnit(txn->releaseRecoveryUnit()); StorageEngine* storageEngine = getGlobalServiceContext()->getGlobalStorageEngine(); invariant(txn->setRecoveryUnit(storageEngine->newRecoveryUnit(), OperationContext::kNotInUnitOfWork) == OperationContext::kNotInUnitOfWork); } LOG(5) << "caching executor with cursorid " << ccId << " after returning " << numResults << " results" << endl; // TODO document if (pq.isOplogReplay() && !slaveReadTill.isNull()) { cc->slaveReadTill(slaveReadTill); } // TODO document if (pq.isExhaust()) { curop.debug().exhaust = true; } cc->setPos(numResults); // If the query had a time limit, remaining time is "rolled over" to the cursor (for // use by future getmore ops). cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros()); endQueryOp(cc->getExecutor(), dbProfilingLevel, numResults, ccId, &curop); } else { LOG(5) << "Not caching executor but returning " << numResults << " results.\n"; endQueryOp(exec.get(), dbProfilingLevel, numResults, ccId, &curop); } // Add the results from the query into the output buffer. result.appendData(bb.buf(), bb.len()); bb.decouple(); // Fill out the output buffer's header. QueryResult::View qr = result.header().view2ptr(); qr.setCursorId(ccId); qr.setResultFlagsToOk(); qr.msgdata().setOperation(opReply); qr.setStartingFrom(0); qr.setNReturned(numResults); // curop.debug().exhaust is set above. return curop.debug().exhaust ? nss.ns() : ""; }
long long DeleteExecutor::execute(OperationContext* txn, Database* db) { uassertStatusOK(prepare()); uassert(17417, mongoutils::str::stream() << "DeleteExecutor::prepare() failed to parse query " << _request->getQuery(), _isQueryParsed); const bool logop = _request->shouldCallLogOp(); const NamespaceString& ns(_request->getNamespaceString()); if (!_request->isGod()) { if (ns.isSystem()) { uassert(12050, "cannot delete from system namespace", legalClientSystemNS(ns.ns(), true)); } if (ns.ns().find('$') != string::npos) { log() << "cannot delete from collection with reserved $ in name: " << ns << endl; uasserted( 10100, "cannot delete from collection with reserved $ in name" ); } } Collection* collection = db->getCollection(txn, ns.ns()); if (NULL == collection) { return 0; } uassert(10101, str::stream() << "cannot remove from a capped collection: " << ns.ns(), !collection->isCapped()); uassert(ErrorCodes::NotMaster, str::stream() << "Not primary while removing from " << ns.ns(), !logop || repl::isMasterNs(ns.ns().c_str())); long long nDeleted = 0; Runner* rawRunner; if (_canonicalQuery.get()) { uassertStatusOK(getRunner(collection, _canonicalQuery.release(), &rawRunner)); } else { CanonicalQuery* ignored; uassertStatusOK(getRunner(collection, ns.ns(), _request->getQuery(), &rawRunner, &ignored)); } auto_ptr<Runner> runner(rawRunner); ScopedRunnerRegistration safety(runner.get()); DiskLoc rloc; Runner::RunnerState state; CurOp* curOp = txn->getCurOp(); int oldYieldCount = curOp->numYields(); while (Runner::RUNNER_ADVANCED == (state = runner->getNext(NULL, &rloc))) { if (oldYieldCount != curOp->numYields()) { uassert(ErrorCodes::NotMaster, str::stream() << "No longer primary while removing from " << ns.ns(), !logop || repl::isMasterNs(ns.ns().c_str())); oldYieldCount = curOp->numYields(); } BSONObj toDelete; // TODO: do we want to buffer docs and delete them in a group rather than // saving/restoring state repeatedly? runner->saveState(); collection->deleteDocument(txn, rloc, false, false, logop ? &toDelete : NULL ); runner->restoreState(txn); nDeleted++; if (logop) { if ( toDelete.isEmpty() ) { log() << "Deleted object without id in collection " << collection->ns() << ", not logging."; } else { bool replJustOne = true; repl::logOp(txn, "d", ns.ns().c_str(), toDelete, 0, &replJustOne); } } if (!_request->isMulti()) { break; } if (!_request->isGod()) { txn->recoveryUnit()->commitIfNeeded(); } if (debug && _request->isGod() && nDeleted == 100) { log() << "warning high number of deletes with god=true " << " which could use significant memory b/c we don't commit journal"; } } return nDeleted; }
/** * actually applies a reduce, to a list of tuples (key, value). * After the call, tuples will hold a single tuple {"0": key, "1": value} */ void JSReducer::_reduce( const BSONList& tuples , BSONObj& key , int& endSizeEstimate ) { uassert( 10074 , "need values" , tuples.size() ); int sizeEstimate = ( tuples.size() * tuples.begin()->getField( "value" ).size() ) + 128; // need to build the reduce args: ( key, [values] ) BSONObjBuilder reduceArgs( sizeEstimate ); boost::scoped_ptr<BSONArrayBuilder> valueBuilder; int sizeSoFar = 0; unsigned n = 0; for ( ; n<tuples.size(); n++ ) { BSONObjIterator j(tuples[n]); BSONElement keyE = j.next(); if ( n == 0 ) { reduceArgs.append( keyE ); key = keyE.wrap(); sizeSoFar = 5 + keyE.size(); valueBuilder.reset(new BSONArrayBuilder( reduceArgs.subarrayStart( "tuples" ) )); } BSONElement ee = j.next(); uassert( 13070 , "value too large to reduce" , ee.size() < ( BSONObjMaxUserSize / 2 ) ); if ( sizeSoFar + ee.size() > BSONObjMaxUserSize ) { assert( n > 1 ); // if not, inf. loop break; } valueBuilder->append( ee ); sizeSoFar += ee.size(); } assert(valueBuilder); valueBuilder->done(); BSONObj args = reduceArgs.obj(); Scope * s = _func.scope(); s->invokeSafe( _func.func() , args ); if ( s->type( "return" ) == Array ) { uasserted( 10075 , "reduce -> multiple not supported yet"); return; } endSizeEstimate = key.objsize() + ( args.objsize() / tuples.size() ); if ( n == tuples.size() ) return; // the input list was too large, add the rest of elmts to new tuples and reduce again // note: would be better to use loop instead of recursion to avoid stack overflow BSONList x; for ( ; n < tuples.size(); n++ ) { x.push_back( tuples[n] ); } BSONObjBuilder temp( endSizeEstimate ); temp.append( key.firstElement() ); s->append( temp , "1" , "return" ); x.push_back( temp.obj() ); _reduce( x , key , endSizeEstimate ); }
// ran at startup. static void repairDatabasesAndCheckVersion(bool shouldClearNonLocalTmpCollections) { // LastError * le = lastError.get( true ); LOG(1) << "enter repairDatabases (to check pdfile version #)" << endl; Lock::GlobalWrite lk; vector< string > dbNames; getDatabaseNames( dbNames ); for ( vector< string >::iterator i = dbNames.begin(); i != dbNames.end(); ++i ) { string dbName = *i; LOG(1) << "\t" << dbName << endl; Client::Context ctx( dbName ); DataFile *p = ctx.db()->getExtentManager().getFile( 0 ); DataFileHeader *h = p->getHeader(); if ( replSettings.usingReplSets() ) { // we only care about the _id index if we are in a replset checkForIdIndexes(ctx.db()); } if (shouldClearNonLocalTmpCollections || dbName == "local") ctx.db()->clearTmpCollections(); if (!h->isCurrentVersion() || mongodGlobalParams.repair) { if( h->version <= 0 ) { uasserted(14026, str::stream() << "db " << dbName << " appears corrupt pdfile version: " << h->version << " info: " << h->versionMinor << ' ' << h->fileLength); } if ( !h->isCurrentVersion() ) { log() << "****" << endl; log() << "****" << endl; log() << "need to upgrade database " << dbName << " " << "with pdfile version " << h->version << "." << h->versionMinor << ", " << "new version: " << PDFILE_VERSION << "." << PDFILE_VERSION_MINOR_22_AND_OLDER << endl; } if (mongodGlobalParams.upgrade) { // QUESTION: Repair even if file format is higher version than code? doDBUpgrade( dbName, h ); } else { log() << "\t Not upgrading, exiting" << endl; log() << "\t run --upgrade to upgrade dbs, then start again" << endl; log() << "****" << endl; dbexit( EXIT_NEED_UPGRADE ); mongodGlobalParams.upgrade = 1; return; } } else { const string systemIndexes = cc().database()->name() + ".system.indexes"; auto_ptr<Runner> runner(InternalPlanner::collectionScan(systemIndexes)); BSONObj index; Runner::RunnerState state; while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&index, NULL))) { const BSONObj key = index.getObjectField("key"); const string plugin = IndexNames::findPluginName(key); if (h->versionMinor == PDFILE_VERSION_MINOR_22_AND_OLDER) { if (IndexNames::existedBefore24(plugin)) continue; log() << "Index " << index << " claims to be of type '" << plugin << "', " << "which is either invalid or did not exist before v2.4. " << "See the upgrade section: " << "http://dochub.mongodb.org/core/upgrade-2.4" << startupWarningsLog; } const Status keyStatus = validateKeyPattern(key); if (!keyStatus.isOK()) { log() << "Problem with index " << index << ": " << keyStatus.reason() << " This index can still be used however it cannot be rebuilt." << " For more info see" << " http://dochub.mongodb.org/core/index-validation" << startupWarningsLog; } } if (Runner::RUNNER_EOF != state) { warning() << "Internal error while reading collection " << systemIndexes; } Database::closeDatabase(dbName.c_str(), storageGlobalParams.dbpath); } } LOG(1) << "done repairDatabases" << endl; if (mongodGlobalParams.upgrade) { log() << "finished checking dbs" << endl; cc().shutdown(); dbexit( EXIT_CLEAN ); } }
/** Note: if the object shrinks a lot, we don't free up space, we leave extra at end of the record. */ const DiskLoc DataFileMgr::updateRecord( const char *ns, Collection* collection, Record *toupdate, const DiskLoc& dl, const char *_buf, int _len, OpDebug& debug, bool god) { dassert( toupdate == dl.rec() ); BSONObj objOld = BSONObj::make(toupdate); BSONObj objNew(_buf); DEV verify( objNew.objsize() == _len ); DEV verify( objNew.objdata() == _buf ); if( !objNew.hasElement("_id") && objOld.hasElement("_id") ) { /* add back the old _id value if the update removes it. Note this implementation is slow (copies entire object multiple times), but this shouldn't happen often, so going for simple code, not speed. */ BSONObjBuilder b; BSONElement e; verify( objOld.getObjectID(e) ); b.append(e); // put _id first, for best performance b.appendElements(objNew); objNew = b.obj(); } NamespaceString nsstring(ns); if (nsstring.coll() == "system.users") { V2UserDocumentParser parser; uassertStatusOK(parser.checkValidUserDocument(objNew)); } uassert( 13596 , str::stream() << "cannot change _id of a document old:" << objOld << " new:" << objNew, objNew["_id"] == objOld["_id"]); /* duplicate key check. we descend the btree twice - once for this check, and once for the actual inserts, further below. that is suboptimal, but it's pretty complicated to do it the other way without rollbacks... */ OwnedPointerVector<UpdateTicket> updateTickets; updateTickets.mutableVector().resize(collection->details()->getTotalIndexCount()); for (int i = 0; i < collection->details()->getTotalIndexCount(); ++i) { auto_ptr<IndexDescriptor> descriptor(CatalogHack::getDescriptor(collection->details(), i)); auto_ptr<IndexAccessMethod> iam(CatalogHack::getIndex(descriptor.get())); InsertDeleteOptions options; options.logIfError = false; options.dupsAllowed = !(KeyPattern::isIdKeyPattern(descriptor->keyPattern()) || descriptor->unique()) || ignoreUniqueIndex(descriptor->getOnDisk()); updateTickets.mutableVector()[i] = new UpdateTicket(); Status ret = iam->validateUpdate(objOld, objNew, dl, options, updateTickets.mutableVector()[i]); if (Status::OK() != ret) { uasserted(ASSERT_ID_DUPKEY, "Update validation failed: " + ret.toString()); } } if ( toupdate->netLength() < objNew.objsize() ) { // doesn't fit. reallocate ----------------------------------------------------- moveCounter.increment(); uassert( 10003, "failing update: objects in a capped ns cannot grow", !(collection && collection->details()->isCapped())); collection->details()->paddingTooSmall(); deleteRecord(ns, toupdate, dl); DiskLoc res = insert(ns, objNew.objdata(), objNew.objsize(), false, god); if (debug.nmoved == -1) // default of -1 rather than 0 debug.nmoved = 1; else debug.nmoved += 1; return res; } collection->infoCache()->notifyOfWriteOp(); collection->details()->paddingFits(); debug.keyUpdates = 0; for (int i = 0; i < collection->details()->getTotalIndexCount(); ++i) { auto_ptr<IndexDescriptor> descriptor(CatalogHack::getDescriptor(collection->details(), i)); auto_ptr<IndexAccessMethod> iam(CatalogHack::getIndex(descriptor.get())); int64_t updatedKeys; Status ret = iam->update(*updateTickets.vector()[i], &updatedKeys); if (Status::OK() != ret) { // This shouldn't happen unless something disastrous occurred. massert(16799, "update failed: " + ret.toString(), false); } debug.keyUpdates += updatedKeys; } // update in place int sz = objNew.objsize(); memcpy(getDur().writingPtr(toupdate->data(), sz), objNew.objdata(), sz); return dl; }
void Strategy::queryOp(OperationContext* txn, Request& request) { verify(!NamespaceString(request.getns()).isCommand()); Timer queryTimer; globalOpCounters.gotQuery(); QueryMessage q(request.d()); NamespaceString ns(q.ns); ClientBasic* client = txn->getClient(); AuthorizationSession* authSession = AuthorizationSession::get(client); Status status = authSession->checkAuthForFind(ns, false); audit::logQueryAuthzCheck(client, ns, q.query, status.code()); uassertStatusOK(status); LOG(3) << "query: " << q.ns << " " << q.query << " ntoreturn: " << q.ntoreturn << " options: " << q.queryOptions; if (q.ntoreturn == 1 && strstr(q.ns, ".$cmd")) throw UserException(8010, "something is wrong, shouldn't see a command here"); if (q.queryOptions & QueryOption_Exhaust) { uasserted(18526, string("the 'exhaust' query option is invalid for mongos queries: ") + q.ns + " " + q.query.toString()); } // Spigot which controls whether OP_QUERY style find on mongos uses the new ClusterClientCursor // code path. // TODO: Delete the spigot and always use the new code. if (useClusterClientCursor) { // Determine the default read preference mode based on the value of the slaveOk flag. ReadPreference readPreferenceOption = (q.queryOptions & QueryOption_SlaveOk) ? ReadPreference::SecondaryPreferred : ReadPreference::PrimaryOnly; ReadPreferenceSetting readPreference(readPreferenceOption, TagSet()); BSONElement rpElem; auto readPrefExtractStatus = bsonExtractTypedField( q.query, LiteParsedQuery::kWrappedReadPrefField, mongo::Object, &rpElem); if (readPrefExtractStatus.isOK()) { auto parsedRps = ReadPreferenceSetting::fromBSON(rpElem.Obj()); uassertStatusOK(parsedRps.getStatus()); readPreference = parsedRps.getValue(); } else if (readPrefExtractStatus != ErrorCodes::NoSuchKey) { uassertStatusOK(readPrefExtractStatus); } auto canonicalQuery = CanonicalQuery::canonicalize(q, WhereCallbackNoop()); uassertStatusOK(canonicalQuery.getStatus()); // If the $explain flag was set, we must run the operation on the shards as an explain // command rather than a find command. if (canonicalQuery.getValue()->getParsed().isExplain()) { const LiteParsedQuery& lpq = canonicalQuery.getValue()->getParsed(); BSONObj findCommand = lpq.asFindCommand(); // We default to allPlansExecution verbosity. auto verbosity = ExplainCommon::EXEC_ALL_PLANS; const bool secondaryOk = (readPreference.pref != ReadPreference::PrimaryOnly); rpc::ServerSelectionMetadata metadata(secondaryOk, readPreference); BSONObjBuilder explainBuilder; uassertStatusOK(ClusterFind::runExplain( txn, findCommand, lpq, verbosity, metadata, &explainBuilder)); BSONObj explainObj = explainBuilder.done(); replyToQuery(0, // query result flags request.p(), request.m(), static_cast<const void*>(explainObj.objdata()), explainObj.objsize(), 1, // numResults 0, // startingFrom CursorId(0)); return; } // Do the work to generate the first batch of results. This blocks waiting to get responses // from the shard(s). std::vector<BSONObj> batch; // 0 means the cursor is exhausted and // otherwise we assume that a cursor with the returned id can be retrieved via the // ClusterCursorManager auto cursorId = ClusterFind::runQuery(txn, *canonicalQuery.getValue(), readPreference, &batch); uassertStatusOK(cursorId.getStatus()); // TODO: this constant should be shared between mongos and mongod, and should // not be inside ShardedClientCursor. BufBuilder buffer(ShardedClientCursor::INIT_REPLY_BUFFER_SIZE); // Fill out the response buffer. int numResults = 0; for (const auto& obj : batch) { buffer.appendBuf((void*)obj.objdata(), obj.objsize()); numResults++; } replyToQuery(0, // query result flags request.p(), request.m(), buffer.buf(), buffer.len(), numResults, 0, // startingFrom cursorId.getValue()); return; } QuerySpec qSpec((string)q.ns, q.query, q.fields, q.ntoskip, q.ntoreturn, q.queryOptions); // Parse "$maxTimeMS". StatusWith<int> maxTimeMS = LiteParsedQuery::parseMaxTimeMSQuery(q.query); uassert(17233, maxTimeMS.getStatus().reason(), maxTimeMS.isOK()); if (_isSystemIndexes(q.ns) && doShardedIndexQuery(txn, request, qSpec)) { return; } ParallelSortClusteredCursor* cursor = new ParallelSortClusteredCursor(qSpec, CommandInfo()); verify(cursor); // TODO: Move out to Request itself, not strategy based try { cursor->init(txn); if (qSpec.isExplain()) { BSONObjBuilder explain_builder; cursor->explain(explain_builder); explain_builder.appendNumber("executionTimeMillis", static_cast<long long>(queryTimer.millis())); BSONObj b = explain_builder.obj(); replyToQuery(0, request.p(), request.m(), b); delete (cursor); return; } } catch (...) { delete cursor; throw; } // TODO: Revisit all of this when we revisit the sharded cursor cache if (cursor->getNumQueryShards() != 1) { // More than one shard (or zero), manage with a ShardedClientCursor // NOTE: We may also have *zero* shards here when the returnPartial flag is set. // Currently the code in ShardedClientCursor handles this. ShardedClientCursorPtr cc(new ShardedClientCursor(q, cursor)); BufBuilder buffer(ShardedClientCursor::INIT_REPLY_BUFFER_SIZE); int docCount = 0; const int startFrom = cc->getTotalSent(); bool hasMore = cc->sendNextBatch(q.ntoreturn, buffer, docCount); if (hasMore) { LOG(5) << "storing cursor : " << cc->getId(); int cursorLeftoverMillis = maxTimeMS.getValue() - queryTimer.millis(); if (maxTimeMS.getValue() == 0) { // 0 represents "no limit". cursorLeftoverMillis = kMaxTimeCursorNoTimeLimit; } else if (cursorLeftoverMillis <= 0) { cursorLeftoverMillis = kMaxTimeCursorTimeLimitExpired; } cursorCache.store(cc, cursorLeftoverMillis); } replyToQuery(0, request.p(), request.m(), buffer.buf(), buffer.len(), docCount, startFrom, hasMore ? cc->getId() : 0); } else { // Only one shard is used // Remote cursors are stored remotely, we shouldn't need this around. unique_ptr<ParallelSortClusteredCursor> cursorDeleter(cursor); ShardPtr shard = grid.shardRegistry()->getShard(txn, cursor->getQueryShardId()); verify(shard.get()); DBClientCursorPtr shardCursor = cursor->getShardCursor(shard->getId()); // Implicitly stores the cursor in the cache request.reply(*(shardCursor->getMessage()), shardCursor->originalHost()); // We don't want to kill the cursor remotely if there's still data left shardCursor->decouple(); } }
void _quotaExceeded() { uasserted(12501, "quota exceeded"); }
void Strategy::clientCommandOp(OperationContext* txn, Request& request) { QueryMessage q(request.d()); LOG(3) << "command: " << q.ns << " " << q.query << " ntoreturn: " << q.ntoreturn << " options: " << q.queryOptions; if (q.queryOptions & QueryOption_Exhaust) { uasserted(18527, string("the 'exhaust' query option is invalid for mongos commands: ") + q.ns + " " + q.query.toString()); } NamespaceString nss(request.getns()); // Regular queries are handled in strategy_shard.cpp verify(nss.isCommand() || nss.isSpecialCommand()); if (handleSpecialNamespaces(txn, request, q)) return; int loops = 5; bool cmChangeAttempted = false; while (true) { BSONObjBuilder builder; try { BSONObj cmdObj = q.query; { BSONElement e = cmdObj.firstElement(); if (e.type() == Object && (e.fieldName()[0] == '$' ? str::equals("query", e.fieldName() + 1) : str::equals("query", e.fieldName()))) { // Extract the embedded query object. if (cmdObj.hasField(Query::ReadPrefField.name())) { // The command has a read preference setting. We don't want // to lose this information so we copy this to a new field // called $queryOptions.$readPreference BSONObjBuilder finalCmdObjBuilder; finalCmdObjBuilder.appendElements(e.embeddedObject()); BSONObjBuilder queryOptionsBuilder( finalCmdObjBuilder.subobjStart("$queryOptions")); queryOptionsBuilder.append(cmdObj[Query::ReadPrefField.name()]); queryOptionsBuilder.done(); cmdObj = finalCmdObjBuilder.obj(); } else { cmdObj = e.embeddedObject(); } } } Command::runAgainstRegistered(txn, q.ns, cmdObj, builder, q.queryOptions); BSONObj x = builder.done(); replyToQuery(0, request.p(), request.m(), x); return; } catch (const StaleConfigException& e) { if (loops <= 0) throw e; loops--; log() << "retrying command: " << q.query; // For legacy reasons, ns may not actually be set in the exception :-( string staleNS = e.getns(); if (staleNS.size() == 0) staleNS = q.ns; ShardConnection::checkMyConnectionVersions(txn, staleNS); if (loops < 4) versionManager.forceRemoteCheckShardVersionCB(txn, staleNS); } catch (const DBException& e) { if (e.getCode() == ErrorCodes::IncompatibleCatalogManager) { fassert(28791, !cmChangeAttempted); cmChangeAttempted = true; grid.forwardingCatalogManager()->waitForCatalogManagerChange(txn); } else { Command::appendCommandStatus(builder, e.toStatus()); BSONObj x = builder.done(); replyToQuery(0, request.p(), request.m(), x); return; } } } }
bool MessagingPort::recv(Message& m) { try { #ifdef MONGO_CONFIG_SSL again: #endif // mmm( log() << "* recv() sock:" << this->sock << endl; ) MSGHEADER::Value header; int headerLen = sizeof(MSGHEADER::Value); psock->recv((char*)&header, headerLen); int len = header.constView().getMessageLength(); if (len == 542393671) { // an http GET string msg = "It looks like you are trying to access MongoDB over HTTP on the native driver " "port.\n"; LOG(psock->getLogLevel()) << msg; std::stringstream ss; ss << "HTTP/1.0 200 OK\r\nConnection: close\r\nContent-Type: " "text/plain\r\nContent-Length: " << msg.size() << "\r\n\r\n" << msg; string s = ss.str(); send(s.c_str(), s.size(), "http"); return false; } // If responseTo is not 0 or -1 for first packet assume SSL else if (psock->isAwaitingHandshake()) { #ifndef MONGO_CONFIG_SSL if (header.constView().getResponseTo() != 0 && header.constView().getResponseTo() != -1) { uasserted(17133, "SSL handshake requested, SSL feature not available in this build"); } #else if (header.constView().getResponseTo() != 0 && header.constView().getResponseTo() != -1) { uassert(17132, "SSL handshake received but server is started without SSL support", sslGlobalParams.sslMode.load() != SSLParams::SSLMode_disabled); setX509SubjectName( psock->doSSLHandshake(reinterpret_cast<const char*>(&header), sizeof(header))); psock->setHandshakeReceived(); goto again; } uassert(17189, "The server is configured to only allow SSL connections", sslGlobalParams.sslMode.load() != SSLParams::SSLMode_requireSSL); #endif // MONGO_CONFIG_SSL } if (static_cast<size_t>(len) < sizeof(MSGHEADER::Value) || static_cast<size_t>(len) > MaxMessageSizeBytes) { LOG(0) << "recv(): message len " << len << " is invalid. " << "Min " << sizeof(MSGHEADER::Value) << " Max: " << MaxMessageSizeBytes; return false; } psock->setHandshakeReceived(); int z = (len + 1023) & 0xfffffc00; verify(z >= len); MsgData::View md = reinterpret_cast<char*>(mongolMalloc(z)); ScopeGuard guard = MakeGuard(free, md.view2ptr()); verify(md.view2ptr()); memcpy(md.view2ptr(), &header, headerLen); int left = len - headerLen; psock->recv(md.data(), left); guard.Dismiss(); m.setData(md.view2ptr(), true); return true; } catch (const SocketException& e) { logger::LogSeverity severity = psock->getLogLevel(); if (!e.shouldPrint()) severity = severity.lessSevere(); LOG(severity) << "SocketException: remote: " << remote() << " error: " << e; m.reset(); return false; } }
void Strategy::getMore(OperationContext* txn, Request& request) { Timer getMoreTimer; const char* ns = request.getns(); const int ntoreturn = request.d().pullInt(); const long long id = request.d().pullInt64(); // TODO: Handle stale config exceptions here from coll being dropped or sharded during op // for now has same semantics as legacy request const NamespaceString nss(ns); auto statusGetDb = grid.catalogCache()->getDatabase(txn, nss.db().toString()); if (statusGetDb == ErrorCodes::DatabaseNotFound) { cursorCache.remove(id); replyToQuery(ResultFlag_CursorNotFound, request.p(), request.m(), 0, 0, 0); return; } uassertStatusOK(statusGetDb); // Spigot which controls whether OP_QUERY style find on mongos uses the new ClusterClientCursor // code path. // // TODO: Delete the spigot and always use the new code. if (useClusterClientCursor) { boost::optional<long long> batchSize; if (ntoreturn) { batchSize = ntoreturn; } GetMoreRequest getMoreRequest(NamespaceString(ns), id, batchSize, boost::none); auto cursorResponse = ClusterFind::runGetMore(txn, getMoreRequest); if (cursorResponse == ErrorCodes::CursorNotFound) { replyToQuery(ResultFlag_CursorNotFound, request.p(), request.m(), 0, 0, 0); return; } uassertStatusOK(cursorResponse.getStatus()); // Build the response document. // // TODO: this constant should be shared between mongos and mongod, and should not be inside // ShardedClientCursor. BufBuilder buffer(ShardedClientCursor::INIT_REPLY_BUFFER_SIZE); int numResults = 0; for (const auto& obj : cursorResponse.getValue().batch) { buffer.appendBuf((void*)obj.objdata(), obj.objsize()); ++numResults; } replyToQuery(0, request.p(), request.m(), buffer.buf(), buffer.len(), numResults, cursorResponse.getValue().numReturnedSoFar.value_or(0), cursorResponse.getValue().cursorId); return; } shared_ptr<DBConfig> config = statusGetDb.getValue(); ShardPtr primary; ChunkManagerPtr info; config->getChunkManagerOrPrimary(txn, ns, info, primary); // // TODO: Cleanup cursor cache, consolidate into single codepath // const string host = cursorCache.getRef(id); ShardedClientCursorPtr cursor = cursorCache.get(id); int cursorMaxTimeMS = cursorCache.getMaxTimeMS(id); // Cursor ids should not overlap between sharded and unsharded cursors massert(17012, str::stream() << "duplicate sharded and unsharded cursor id " << id << " detected for " << ns << ", duplicated on host " << host, NULL == cursorCache.get(id).get() || host.empty()); ClientBasic* client = ClientBasic::getCurrent(); NamespaceString nsString(ns); AuthorizationSession* authSession = AuthorizationSession::get(client); Status status = authSession->checkAuthForGetMore(nsString, id, false); audit::logGetMoreAuthzCheck(client, nsString, id, status.code()); uassertStatusOK(status); if (!host.empty()) { LOG(3) << "single getmore: " << ns; // we used ScopedDbConnection because we don't get about config versions // not deleting data is handled elsewhere // and we don't want to call setShardVersion ScopedDbConnection conn(host); Message response; bool ok = conn->callRead(request.m(), response); uassert(10204, "dbgrid: getmore: error calling db", ok); bool hasMore = (response.singleData().getCursor() != 0); if (!hasMore) { cursorCache.removeRef(id); } request.reply(response, "" /*conn->getServerAddress() */); conn.done(); return; } else if (cursor) { if (cursorMaxTimeMS == kMaxTimeCursorTimeLimitExpired) { cursorCache.remove(id); uasserted(ErrorCodes::ExceededTimeLimit, "operation exceeded time limit"); } // TODO: Try to match logic of mongod, where on subsequent getMore() we pull lots more data? BufBuilder buffer(ShardedClientCursor::INIT_REPLY_BUFFER_SIZE); int docCount = 0; const int startFrom = cursor->getTotalSent(); bool hasMore = cursor->sendNextBatch(ntoreturn, buffer, docCount); if (hasMore) { // still more data cursor->accessed(); if (cursorMaxTimeMS != kMaxTimeCursorNoTimeLimit) { // Update remaining amount of time in cursor cache. int cursorLeftoverMillis = cursorMaxTimeMS - getMoreTimer.millis(); if (cursorLeftoverMillis <= 0) { cursorLeftoverMillis = kMaxTimeCursorTimeLimitExpired; } cursorCache.updateMaxTimeMS(id, cursorLeftoverMillis); } } else { // we've exhausted the cursor cursorCache.remove(id); } replyToQuery(0, request.p(), request.m(), buffer.buf(), buffer.len(), docCount, startFrom, hasMore ? cursor->getId() : 0); return; } else { LOG(3) << "could not find cursor " << id << " in cache for " << ns; replyToQuery(ResultFlag_CursorNotFound, request.p(), request.m(), 0, 0, 0); return; } }
void _update( Request& r , DbMessage& d, ChunkManagerPtr manager ){ int flags = d.pullInt(); BSONObj query = d.nextJsObj(); uassert( 10201 , "invalid update" , d.moreJSObjs() ); BSONObj toupdate = d.nextJsObj(); BSONObj chunkFinder = query; bool upsert = flags & UpdateOption_Upsert; bool multi = flags & UpdateOption_Multi; uassert( 10202 , "can't mix multi and upsert and sharding" , ! ( upsert && multi ) ); if ( upsert && !(manager->hasShardKey(toupdate) || (toupdate.firstElement().fieldName()[0] == '$' && manager->hasShardKey(query)))) { throw UserException( 8012 , "can't upsert something without shard key" ); } bool save = false; if ( ! manager->hasShardKey( query ) ){ if ( multi ){ } else if ( strcmp( query.firstElement().fieldName() , "_id" ) || query.nFields() != 1 ){ throw UserException( 8013 , "can't do non-multi update with query that doesn't have the shard key" ); } else { save = true; chunkFinder = toupdate; } } if ( ! save ){ if ( toupdate.firstElement().fieldName()[0] == '$' ){ BSONObjIterator ops(toupdate); while(ops.more()){ BSONElement op(ops.next()); if (op.type() != Object) continue; BSONObjIterator fields(op.embeddedObject()); while(fields.more()){ const string field = fields.next().fieldName(); uassert(13123, "Can't modify shard key's value", ! manager->getShardKey().partOfShardKey(field)); } } } else if ( manager->hasShardKey( toupdate ) ){ uassert( 8014, "change would move shards!", manager->getShardKey().compare( query , toupdate ) == 0 ); } else { uasserted(12376, "shard key must be in update object"); } } if ( multi ){ set<Shard> shards; manager->getShardsForQuery( shards , chunkFinder ); int * x = (int*)(r.d().afterNS()); x[0] |= UpdateOption_Broadcast; for ( set<Shard>::iterator i=shards.begin(); i!=shards.end(); i++){ doWrite( dbUpdate , r , *i , false ); } } else { int left = 5; while ( true ){ try { ChunkPtr c = manager->findChunk( chunkFinder ); doWrite( dbUpdate , r , c->getShard() ); c->splitIfShould( d.msg().header()->dataLen() ); break; } catch ( StaleConfigException& e ){ if ( left <= 0 ) throw e; left--; log() << "update failed b/c of StaleConfigException, retrying " << " left:" << left << " ns: " << r.getns() << " query: " << query << endl; r.reset( false ); manager = r.getChunkManager(); } } } }
void acquirePathLock(bool doingRepair) { string name = ( boost::filesystem::path( dbpath ) / "mongod.lock" ).string(); bool oldFile = false; if ( boost::filesystem::exists( name ) && boost::filesystem::file_size( name ) > 0 ) { oldFile = true; } #ifdef _WIN32 lockFileHandle = CreateFileA( name.c_str(), GENERIC_READ | GENERIC_WRITE, 0 /* do not allow anyone else access */, NULL, OPEN_ALWAYS /* success if fh can open */, 0, NULL ); if (lockFileHandle == INVALID_HANDLE_VALUE) { DWORD code = GetLastError(); char *msg; FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM, NULL, code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&msg, 0, NULL); string m = msg; str::stripTrailing(m, "\r\n"); uasserted( 13627 , str::stream() << "Unable to create/open lock file: " << name << ' ' << m << ". Is a mongod instance already running?" ); } lockFile = _open_osfhandle((intptr_t)lockFileHandle, 0); #else lockFile = open( name.c_str(), O_RDWR | O_CREAT , S_IRWXU | S_IRWXG | S_IRWXO ); if( lockFile <= 0 ) { uasserted( 10309 , str::stream() << "Unable to create/open lock file: " << name << ' ' << errnoWithDescription() << " Is a mongod instance already running?" ); } if (flock( lockFile, LOCK_EX | LOCK_NB ) != 0) { close ( lockFile ); lockFile = 0; uassert( 10310 , "Unable to lock file: " + name + ". Is a mongod instance already running?", 0 ); } #endif if ( oldFile ) { // we check this here because we want to see if we can get the lock // if we can't, then its probably just another mongod running string errmsg; if (doingRepair && dur::haveJournalFiles()) { errmsg = "************** \n" "You specified --repair but there are dirty journal files. Please\n" "restart without --repair to allow the journal files to be replayed.\n" "If you wish to repair all databases, please shutdown cleanly and\n" "run with --repair again.\n" "**************"; } else if (cmdLine.dur) { if (!dur::haveJournalFiles(/*anyFiles=*/true)) { // Passing anyFiles=true as we are trying to protect against starting in an // unclean state with the journal directory unmounted. If there are any files, // even prealloc files, then it means that it is mounted so we can continue. // Previously there was an issue (SERVER-5056) where we would fail to start up // if killed during prealloc. vector<string> dbnames; getDatabaseNames( dbnames ); if ( dbnames.size() == 0 ) { // this means that mongod crashed // between initial startup and when journaling was initialized // it is safe to continue } else { errmsg = str::stream() << "************** \n" << "old lock file: " << name << ". probably means unclean shutdown,\n" << "but there are no journal files to recover.\n" << "this is likely human error or filesystem corruption.\n" << "please make sure that your journal directory is mounted.\n" << "found " << dbnames.size() << " dbs.\n" << "see: http://dochub.mongodb.org/core/repair for more information\n" << "*************"; } } } else { if (!dur::haveJournalFiles() && !doingRepair) { errmsg = str::stream() << "************** \n" << "Unclean shutdown detected.\n" << "Please visit http://dochub.mongodb.org/core/repair for recovery instructions.\n" << "*************"; } } if (!errmsg.empty()) { cout << errmsg << endl; #ifdef _WIN32 CloseHandle( lockFileHandle ); #else close ( lockFile ); #endif lockFile = 0; uassert( 12596 , "old lock file" , 0 ); } } // Not related to lock file, but this is where we handle unclean shutdown if( !cmdLine.dur && dur::haveJournalFiles() ) { cout << "**************" << endl; cout << "Error: journal files are present in journal directory, yet starting without journaling enabled." << endl; cout << "It is recommended that you start with journaling enabled so that recovery may occur." << endl; cout << "**************" << endl; uasserted(13597, "can't start without --journal enabled when journal/ files are present"); } #ifdef _WIN32 uassert( 13625, "Unable to truncate lock file", _chsize(lockFile, 0) == 0); writePid( lockFile ); _commit( lockFile ); #else uassert( 13342, "Unable to truncate lock file", ftruncate(lockFile, 0) == 0); writePid( lockFile ); fsync( lockFile ); flushMyDirectory(name); #endif }
/** * This is called by db/ops/query.cpp. This is the entry point for answering a query. */ std::string newRunQuery(CanonicalQuery* cq, CurOp& curop, Message &result) { QLOG() << "Running query on new system: " << cq->toString(); // This is a read lock. Client::ReadContext ctx(cq->ns(), storageGlobalParams.dbpath); // Parse, canonicalize, plan, transcribe, and get a runner. Runner* rawRunner = NULL; // We use this a lot below. const LiteParsedQuery& pq = cq->getParsed(); // Need to call cq->toString() now, since upon error getRunner doesn't guarantee // cq is in a consistent state. string cqStr = cq->toString(); // We'll now try to get the query runner that will execute this query for us. There // are a few cases in which we know upfront which runner we should get and, therefore, // we shortcut the selection process here. // // (a) If the query is over a collection that doesn't exist, we get a special runner // that's is so (a runner) which doesn't return results, the EOFRunner. // // (b) if the query is a replication's initial sync one, we get a SingleSolutinRunner // that uses a specifically designed stage that skips extents faster (see details in // exec/oplogstart.h) // // Otherwise we go through the selection of which runner is most suited to the // query + run-time context at hand. Status status = Status::OK(); if (ctx.ctx().db()->getCollection(cq->ns()) == NULL) { rawRunner = new EOFRunner(cq, cq->ns()); } else if (pq.hasOption(QueryOption_OplogReplay)) { status = getOplogStartHack(cq, &rawRunner); } else { // Takes ownership of cq. size_t options = QueryPlannerParams::DEFAULT; if (shardingState.needCollectionMetadata(pq.ns())) { options |= QueryPlannerParams::INCLUDE_SHARD_FILTER; } status = getRunner(cq, &rawRunner, options); } if (!status.isOK()) { uasserted(17007, "Couldn't get runner for query because: " + status.reason() + " query is " + cqStr); } verify(NULL != rawRunner); auto_ptr<Runner> runner(rawRunner); // We freak out later if this changes before we're done with the query. const ChunkVersion shardingVersionAtStart = shardingState.getVersion(cq->ns()); // Handle query option $maxTimeMS (not used with commands). curop.setMaxTimeMicros(static_cast<unsigned long long>(pq.getMaxTimeMS()) * 1000); killCurrentOp.checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point. // uassert if we are not on a primary, and not a secondary with SlaveOk query parameter set. replVerifyReadsOk(&pq); // If this exists, the collection is sharded. // If it doesn't exist, we can assume we're not sharded. // If we're sharded, we might encounter data that is not consistent with our sharding state. // We must ignore this data. CollectionMetadataPtr collMetadata; if (!shardingState.needCollectionMetadata(pq.ns())) { collMetadata = CollectionMetadataPtr(); } else { collMetadata = shardingState.getCollectionMetadata(pq.ns()); } // Run the query. // bb is used to hold query results // this buffer should contain either requested documents per query or // explain information, but not both BufBuilder bb(32768); bb.skip(sizeof(QueryResult)); // How many results have we obtained from the runner? int numResults = 0; // If we're replaying the oplog, we save the last time that we read. OpTime slaveReadTill; // Do we save the Runner in a ClientCursor for getMore calls later? bool saveClientCursor = false; // We turn on auto-yielding for the runner here. The runner registers itself with the // active runners list in ClientCursor. ClientCursor::registerRunner(runner.get()); runner->setYieldPolicy(Runner::YIELD_AUTO); auto_ptr<DeregisterEvenIfUnderlyingCodeThrows> safety( new DeregisterEvenIfUnderlyingCodeThrows(runner.get())); BSONObj obj; Runner::RunnerState state; // uint64_t numMisplacedDocs = 0; // set this outside loop. we will need to use this both within loop and when deciding // to fill in explain information const bool isExplain = pq.isExplain(); while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&obj, NULL))) { // Add result to output buffer. This is unnecessary if explain info is requested if (!isExplain) { bb.appendBuf((void*)obj.objdata(), obj.objsize()); } // Count the result. ++numResults; // Possibly note slave's position in the oplog. if (pq.hasOption(QueryOption_OplogReplay)) { BSONElement e = obj["ts"]; if (Date == e.type() || Timestamp == e.type()) { slaveReadTill = e._opTime(); } } // TODO: only one type of 2d search doesn't support this. We need a way to pull it out // of CanonicalQuery. :( const bool supportsGetMore = true; if (isExplain) { if (enoughForExplain(pq, numResults)) { break; } } else if (!supportsGetMore && (enough(pq, numResults) || bb.len() >= MaxBytesToReturnToClientAtOnce)) { break; } else if (enoughForFirstBatch(pq, numResults, bb.len())) { QLOG() << "Enough for first batch, wantMore=" << pq.wantMore() << " numToReturn=" << pq.getNumToReturn() << " numResults=" << numResults << endl; // If only one result requested assume it's a findOne() and don't save the cursor. if (pq.wantMore() && 1 != pq.getNumToReturn()) { QLOG() << " runner EOF=" << runner->isEOF() << endl; saveClientCursor = !runner->isEOF(); } break; } } // If we cache the runner later, we want to deregister it as it receives notifications // anyway by virtue of being cached. // // If we don't cache the runner later, we are deleting it, so it must be deregistered. // // So, no matter what, deregister the runner. safety.reset(); // Caller expects exceptions thrown in certain cases: // * in-memory sort using too much RAM. if (Runner::RUNNER_ERROR == state) { uasserted(17144, "Runner error, memory limit for sort probably exceeded"); } // Why save a dead runner? if (Runner::RUNNER_DEAD == state) { saveClientCursor = false; } else if (pq.hasOption(QueryOption_CursorTailable)) { // If we're tailing a capped collection, we don't bother saving the cursor if the // collection is empty. Otherwise, the semantics of the tailable cursor is that the // client will keep trying to read from it. So we'll keep it around. Collection* collection = ctx.ctx().db()->getCollection(cq->ns()); if (collection && collection->numRecords() != 0 && pq.getNumToReturn() != 1) { saveClientCursor = true; } } // TODO(greg): This will go away soon. if (!shardingState.getVersion(pq.ns()).isWriteCompatibleWith(shardingVersionAtStart)) { // if the version changed during the query we might be missing some data and its safe to // send this as mongos can resend at this point throw SendStaleConfigException(pq.ns(), "version changed during initial query", shardingVersionAtStart, shardingState.getVersion(pq.ns())); } // Append explain information to query results by asking the runner to produce them. if (isExplain) { TypeExplain* bareExplain; Status res = runner->getExplainPlan(&bareExplain); if (!res.isOK()) { error() << "could not produce explain of query '" << pq.getFilter() << "', error: " << res.reason(); // If numResults and the data in bb don't correspond, we'll crash later when rooting // through the reply msg. BSONObj emptyObj; bb.appendBuf((void*)emptyObj.objdata(), emptyObj.objsize()); // The explain output is actually a result. numResults = 1; // TODO: we can fill out millis etc. here just fine even if the plan screwed up. } else { boost::scoped_ptr<TypeExplain> explain(bareExplain); // Fill in the missing run-time fields in explain, starting with propeties of // the process running the query. std::string server = mongoutils::str::stream() << getHostNameCached() << ":" << serverGlobalParams.port; explain->setServer(server); // We might have skipped some results due to chunk migration etc. so our count is // correct. explain->setN(numResults); // Clock the whole operation. explain->setMillis(curop.elapsedMillis()); BSONObj explainObj = explain->toBSON(); bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize()); // The explain output is actually a result. numResults = 1; } } long long ccId = 0; if (saveClientCursor) { // We won't use the runner until it's getMore'd. runner->saveState(); // Allocate a new ClientCursor. We don't have to worry about leaking it as it's // inserted into a global map by its ctor. ClientCursor* cc = new ClientCursor(runner.get(), cq->getParsed().getOptions(), cq->getParsed().getFilter()); ccId = cc->cursorid(); QLOG() << "caching runner with cursorid " << ccId << " after returning " << numResults << " results" << endl; // ClientCursor takes ownership of runner. Release to make sure it's not deleted. runner.release(); // TODO document if (pq.hasOption(QueryOption_OplogReplay) && !slaveReadTill.isNull()) { cc->slaveReadTill(slaveReadTill); } // TODO document if (pq.hasOption(QueryOption_Exhaust)) { curop.debug().exhaust = true; } // Set attributes for getMore. cc->setCollMetadata(collMetadata); cc->setPos(numResults); // If the query had a time limit, remaining time is "rolled over" to the cursor (for // use by future getmore ops). cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros()); } else { QLOG() << "not caching runner but returning " << numResults << " results\n"; } // Add the results from the query into the output buffer. result.appendData(bb.buf(), bb.len()); bb.decouple(); // Fill out the output buffer's header. QueryResult* qr = static_cast<QueryResult*>(result.header()); qr->cursorId = ccId; curop.debug().cursorid = (0 == ccId ? -1 : ccId); qr->setResultFlagsToOk(); qr->setOperation(opReply); qr->startingFrom = 0; qr->nReturned = numResults; curop.debug().ntoskip = pq.getSkip(); curop.debug().nreturned = numResults; // curop.debug().exhaust is set above. return curop.debug().exhaust ? pq.ns() : ""; }
/** * Called by db/instance.cpp. This is the getMore entry point. * * pass - when QueryOption_AwaitData is in use, the caller will make repeated calls * when this method returns an empty result, incrementing pass on each call. * Thus, pass == 0 indicates this is the first "attempt" before any 'awaiting'. */ QueryResult::View getMore(OperationContext* txn, const char* ns, int ntoreturn, long long cursorid, CurOp& curop, int pass, bool& exhaust, bool* isCursorAuthorized) { // For testing, we may want to fail if we receive a getmore. if (MONGO_FAIL_POINT(failReceivedGetmore)) { invariant(0); } exhaust = false; const NamespaceString nss(ns); // Depending on the type of cursor being operated on, we hold locks for the whole getMore, // or none of the getMore, or part of the getMore. The three cases in detail: // // 1) Normal cursor: we lock with "ctx" and hold it for the whole getMore. // 2) Cursor owned by global cursor manager: we don't lock anything. These cursors don't // own any collection state. // 3) Agg cursor: we lock with "ctx", then release, then relock with "unpinDBLock" and // "unpinCollLock". This is because agg cursors handle locking internally (hence the // release), but the pin and unpin of the cursor must occur under the collection lock. // We don't use our AutoGetCollectionForRead "ctx" to relock, because // AutoGetCollectionForRead checks the sharding version (and we want the relock for the // unpin to succeed even if the sharding version has changed). // // Note that we declare our locks before our ClientCursorPin, in order to ensure that the // pin's destructor is called before the lock destructors (so that the unpin occurs under // the lock). boost::scoped_ptr<AutoGetCollectionForRead> ctx; boost::scoped_ptr<Lock::DBLock> unpinDBLock; boost::scoped_ptr<Lock::CollectionLock> unpinCollLock; CursorManager* cursorManager; CursorManager* globalCursorManager = CursorManager::getGlobalCursorManager(); if (globalCursorManager->ownsCursorId(cursorid)) { cursorManager = globalCursorManager; } else { ctx.reset(new AutoGetCollectionForRead(txn, nss)); Collection* collection = ctx->getCollection(); uassert( 17356, "collection dropped between getMore calls", collection ); cursorManager = collection->getCursorManager(); } LOG(5) << "Running getMore, cursorid: " << cursorid << endl; // This checks to make sure the operation is allowed on a replicated node. Since we are not // passing in a query object (necessary to check SlaveOK query option), the only state where // reads are allowed is PRIMARY (or master in master/slave). This function uasserts if // reads are not okay. Status status = repl::getGlobalReplicationCoordinator()->checkCanServeReadsFor( txn, nss, true); uassertStatusOK(status); // A pin performs a CC lookup and if there is a CC, increments the CC's pin value so it // doesn't time out. Also informs ClientCursor that there is somebody actively holding the // CC, so don't delete it. ClientCursorPin ccPin(cursorManager, cursorid); ClientCursor* cc = ccPin.c(); // If we're not being called from DBDirectClient we want to associate the RecoveryUnit // used to create the execution machinery inside the cursor with our OperationContext. // If we throw or otherwise exit this method in a disorderly fashion, we must ensure // that further calls to getMore won't fail, and that the provided OperationContext // has a valid RecoveryUnit. As such, we use RAII to accomplish this. // // This must be destroyed before the ClientCursor is destroyed. std::auto_ptr<ScopedRecoveryUnitSwapper> ruSwapper; // These are set in the QueryResult msg we return. int resultFlags = ResultFlag_AwaitCapable; int numResults = 0; int startingResult = 0; const int InitialBufSize = 512 + sizeof(QueryResult::Value) + MaxBytesToReturnToClientAtOnce; BufBuilder bb(InitialBufSize); bb.skip(sizeof(QueryResult::Value)); if (NULL == cc) { cursorid = 0; resultFlags = ResultFlag_CursorNotFound; } else { // Check for spoofing of the ns such that it does not match the one originally // there for the cursor. uassert(ErrorCodes::Unauthorized, str::stream() << "Requested getMore on namespace " << ns << ", but cursor " << cursorid << " belongs to namespace " << cc->ns(), ns == cc->ns()); *isCursorAuthorized = true; // Restore the RecoveryUnit if we need to. if (txn->getClient()->isInDirectClient()) { if (cc->hasRecoveryUnit()) invariant(txn->recoveryUnit() == cc->getUnownedRecoveryUnit()); } else { if (!cc->hasRecoveryUnit()) { // Start using a new RecoveryUnit cc->setOwnedRecoveryUnit( getGlobalServiceContext()->getGlobalStorageEngine()->newRecoveryUnit()); } // Swap RecoveryUnit(s) between the ClientCursor and OperationContext. ruSwapper.reset(new ScopedRecoveryUnitSwapper(cc, txn)); } // Reset timeout timer on the cursor since the cursor is still in use. cc->setIdleTime(0); // If the operation that spawned this cursor had a time limit set, apply leftover // time to this getmore. curop.setMaxTimeMicros(cc->getLeftoverMaxTimeMicros()); txn->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point. if (0 == pass) { cc->updateSlaveLocation(txn); } if (cc->isAggCursor()) { // Agg cursors handle their own locking internally. ctx.reset(); // unlocks } // If we're replaying the oplog, we save the last time that we read. Timestamp slaveReadTill; // What number result are we starting at? Used to fill out the reply. startingResult = cc->pos(); // What gives us results. PlanExecutor* exec = cc->getExecutor(); const int queryOptions = cc->queryOptions(); // Get results out of the executor. exec->restoreState(txn); BSONObj obj; PlanExecutor::ExecState state; while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) { // Add result to output buffer. bb.appendBuf((void*)obj.objdata(), obj.objsize()); // Count the result. ++numResults; // Possibly note slave's position in the oplog. if (queryOptions & QueryOption_OplogReplay) { BSONElement e = obj["ts"]; if (Date == e.type() || bsonTimestamp == e.type()) { slaveReadTill = e.timestamp(); } } if (enoughForGetMore(ntoreturn, numResults, bb.len())) { break; } } if (PlanExecutor::DEAD == state || PlanExecutor::FAILURE == state) { // Propagate this error to caller. if (PlanExecutor::FAILURE == state) { scoped_ptr<PlanStageStats> stats(exec->getStats()); error() << "Plan executor error, stats: " << Explain::statsToBSON(*stats); uasserted(17406, "getMore executor error: " + WorkingSetCommon::toStatusString(obj)); } // In the old system tailable capped cursors would be killed off at the // cursorid level. If a tailable capped cursor is nuked the cursorid // would vanish. // // In the new system they die and are cleaned up later (or time out). // So this is where we get to remove the cursorid. if (0 == numResults) { resultFlags = ResultFlag_CursorNotFound; } } const bool shouldSaveCursor = shouldSaveCursorGetMore(state, exec, isCursorTailable(cc)); // In order to deregister a cursor, we need to be holding the DB + collection lock and // if the cursor is aggregation, we release these locks. if (cc->isAggCursor()) { invariant(NULL == ctx.get()); unpinDBLock.reset(new Lock::DBLock(txn->lockState(), nss.db(), MODE_IS)); unpinCollLock.reset(new Lock::CollectionLock(txn->lockState(), nss.ns(), MODE_IS)); } // Our two possible ClientCursorPin cleanup paths are: // 1) If the cursor is not going to be saved, we call deleteUnderlying() on the pin. // 2) If the cursor is going to be saved, we simply let the pin go out of scope. In // this case, the pin's destructor will be invoked, which will call release() on the // pin. Because our ClientCursorPin is declared after our lock is declared, this // will happen under the lock. if (!shouldSaveCursor) { ruSwapper.reset(); ccPin.deleteUnderlying(); // cc is now invalid, as is the executor cursorid = 0; cc = NULL; curop.debug().cursorExhausted = true; LOG(5) << "getMore NOT saving client cursor, ended with state " << PlanExecutor::statestr(state) << endl; } else { // Continue caching the ClientCursor. cc->incPos(numResults); exec->saveState(); LOG(5) << "getMore saving client cursor ended with state " << PlanExecutor::statestr(state) << endl; if (PlanExecutor::IS_EOF == state && (queryOptions & QueryOption_CursorTailable)) { if (!txn->getClient()->isInDirectClient()) { // Don't stash the RU. Get a new one on the next getMore. ruSwapper->dismiss(); } if ((queryOptions & QueryOption_AwaitData) && (numResults == 0) && (pass < 1000)) { // Bubble up to the AwaitData handling code in receivedGetMore which will // try again. return NULL; } } // Possibly note slave's position in the oplog. if ((queryOptions & QueryOption_OplogReplay) && !slaveReadTill.isNull()) { cc->slaveReadTill(slaveReadTill); } exhaust = (queryOptions & QueryOption_Exhaust); // If the getmore had a time limit, remaining time is "rolled over" back to the // cursor (for use by future getmore ops). cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() ); } } QueryResult::View qr = bb.buf(); qr.msgdata().setLen(bb.len()); qr.msgdata().setOperation(opReply); qr.setResultFlags(resultFlags); qr.setCursorId(cursorid); qr.setStartingFrom(startingResult); qr.setNReturned(numResults); bb.decouple(); LOG(5) << "getMore returned " << numResults << " results\n"; return qr; }
/* called on a reconfig AND on initiate throws @param initial true when initiating */ void checkMembersUpForConfigChange(const ReplSetConfig& cfg, bool initial) { int failures = 0, majority = 0; int me = 0; stringstream selfs; for( vector<ReplSetConfig::MemberCfg>::const_iterator i = cfg.members.begin(); i != cfg.members.end(); i++ ) { if( i->h.isSelf() ) { me++; if( me > 1 ) selfs << ','; selfs << i->h.toString(); if( !i->potentiallyHot() ) { uasserted(13420, "initiation and reconfiguration of a replica set must be sent to a node that can become primary"); } majority += i->votes; } } majority = (majority / 2) + 1; uassert(13278, "bad config: isSelf is true for multiple hosts: " + selfs.str(), me <= 1); // dups? if( me != 1 ) { stringstream ss; ss << "can't find self in the replset config"; if( !cmdLine.isDefaultPort() ) ss << " my port: " << cmdLine.port; if( me != 0 ) ss << " found: " << me; uasserted(13279, ss.str()); } for( vector<ReplSetConfig::MemberCfg>::const_iterator i = cfg.members.begin(); i != cfg.members.end(); i++ ) { // we know we're up if (i->h.isSelf()) { continue; } BSONObj res; { bool ok = false; try { int theirVersion = -1000; ok = requestHeartbeat(cfg._id, "", i->h.toString(), res, -1, theirVersion, initial/*check if empty*/); if( theirVersion >= cfg.version ) { stringstream ss; ss << "replSet member " << i->h.toString() << " has too new a config version (" << theirVersion << ") to reconfigure"; uasserted(13259, ss.str()); } } catch(DBException& e) { log() << "replSet cmufcc requestHeartbeat " << i->h.toString() << " : " << e.toString() << rsLog; } catch(...) { log() << "replSet cmufcc error exception in requestHeartbeat?" << rsLog; } if( res.getBoolField("mismatch") ) uasserted(13145, "set name does not match the set name host " + i->h.toString() + " expects"); if( *res.getStringField("set") ) { if( cfg.version <= 1 ) { // this was to be initiation, no one shoudl be initiated already. uasserted(13256, "member " + i->h.toString() + " is already initiated"); } else { // Assure no one has a newer config. if( res["v"].Int() >= cfg.version ) { uasserted(13341, "member " + i->h.toString() + " has a config version >= to the new cfg version; cannot change config"); } } } if( !ok && !res["rs"].trueValue() ) { if( !res.isEmpty() ) { /* strange. got a response, but not "ok". log it. */ log() << "replSet warning " << i->h.toString() << " replied: " << res.toString() << rsLog; } bool allowFailure = false; failures += i->votes; if( res.isEmpty() && !initial && failures >= majority ) { const Member* m = theReplSet->findById( i->_id ); if( m ) { assert( m->h().toString() == i->h.toString() ); } // it's okay if the down member isn't part of the config, // we might be adding a new member that isn't up yet allowFailure = true; } if( !allowFailure ) { string msg = string("need all members up to initiate, not ok : ") + i->h.toString(); if( !initial ) msg = string("need most members up to reconfigure, not ok : ") + i->h.toString(); uasserted(13144, msg); } } } if( initial ) { bool hasData = res["hasData"].Bool(); uassert(13311, "member " + i->h.toString() + " has data already, cannot initiate set. All members except initiator must be empty.", !hasData || i->h.isSelf()); } } }
NOINLINE_DECL void DataFile::badOfs(int ofs) const { stringstream ss; ss << "bad offset:" << ofs << " accessing file: " << mmf.filename() << " - consider repairing database"; uasserted(13440, ss.str()); }