Status CmdAuthenticate::_authenticateCR(OperationContext* opCtx,
                                        const UserName& user,
                                        const BSONObj& cmdObj) {
    if (user == internalSecurity.user->getName() &&
        serverGlobalParams.clusterAuthMode.load() == ServerGlobalParams::ClusterAuthMode_x509) {
        return Status(ErrorCodes::AuthenticationFailed,
                      "Mechanism x509 is required for internal cluster authentication");
    }

    if (_isCRAuthDisabled) {
        // SERVER-8461, MONGODB-CR must be enabled for authenticating the internal user, so that
        // cluster members may communicate with each other.
        if (user != internalSecurity.user->getName()) {
            return Status(ErrorCodes::BadValue, _nonceAuthenticationDisabledMessage);
        }
    }

    string key = cmdObj.getStringField("key");
    string received_nonce = cmdObj.getStringField("nonce");

    if (user.getUser().empty() || key.empty() || received_nonce.empty()) {
        sleepmillis(10);
        return Status(ErrorCodes::ProtocolError,
                      "field missing/wrong type in received authenticate command");
    }

    stringstream digestBuilder;

    {
        Client* client = Client::getCurrent();
        std::unique_ptr<AuthenticationSession> session;
        AuthenticationSession::swap(client, session);

        if (!session || session->getType() != AuthenticationSession::SESSION_TYPE_MONGO) {
            sleepmillis(30);
            return Status(ErrorCodes::ProtocolError, "No pending nonce");
        } else {
            nonce64 nonce = static_cast<MongoAuthenticationSession*>(session.get())->getNonce();
            digestBuilder << hex << nonce;
            if (digestBuilder.str() != received_nonce) {
                sleepmillis(30);
                return Status(ErrorCodes::AuthenticationFailed, "Received wrong nonce.");
            }
        }
    }

    User* userObj;
    Status status =
        getGlobalAuthorizationManager()->acquireUserForInitialAuth(opCtx, user, &userObj);
    if (!status.isOK()) {
        // Failure to find the privilege document indicates no-such-user, a fact that we do not
        // wish to reveal to the client. So, we return AuthenticationFailed rather than passing
        // through the returned status.
        return Status(ErrorCodes::AuthenticationFailed, status.toString());
    }
    string pwd = userObj->getCredentials().password;
    getGlobalAuthorizationManager()->releaseUser(userObj);

    if (pwd.empty()) {
        return Status(ErrorCodes::AuthenticationFailed,
                      "MONGODB-CR credentials missing in the user document");
    }

    md5digest d;
    {
        digestBuilder << user.getUser() << pwd;
        string done = digestBuilder.str();

        md5_state_t st;
        md5_init(&st);
        md5_append(&st, (const md5_byte_t*)done.c_str(), done.size());
        md5_finish(&st, d);
    }

    string computed = digestToString(d);

    if (key != computed) {
        return Status(ErrorCodes::AuthenticationFailed, "key mismatch");
    }

    AuthorizationSession* authorizationSession = AuthorizationSession::get(Client::getCurrent());
    status = authorizationSession->addAndAuthorizeUser(opCtx, user);
    if (!status.isOK()) {
        return status;
    }

    return Status::OK();
}
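// For reference, the "key" compared above is derived on the client from the same inputs the
// server hashes: md5hex(nonce + username + md5hex(username + ":mongo:" + password)). The sketch
// below is illustrative only; md5hex is an assumed caller-supplied digest helper (any MD5
// implementation returning lowercase hex will do), not part of this codebase.
#include <string>

using Md5HexFn = std::string (*)(const std::string&);

std::string computeMongoCRKey(Md5HexFn md5hex,
                              const std::string& nonceHex,
                              const std::string& username,
                              const std::string& clearTextPassword) {
    // The stored MONGODB-CR credential is md5hex(username + ":mongo:" + password).
    const std::string pwdDigest = md5hex(username + ":mongo:" + clearTextPassword);
    // The response key hashes the hex nonce, the username, and the password digest --
    // the same byte sequence _authenticateCR() feeds into digestBuilder above.
    return md5hex(nonceHex + username + pwdDigest);
}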
Status ShardingCatalogClientImpl::applyChunkOpsDeprecated(OperationContext* opCtx,
                                                          const BSONArray& updateOps,
                                                          const BSONArray& preCondition,
                                                          const NamespaceString& nss,
                                                          const ChunkVersion& lastChunkVersion,
                                                          const WriteConcernOptions& writeConcern,
                                                          repl::ReadConcernLevel readConcern) {
    invariant(serverGlobalParams.clusterRole == ClusterRole::ConfigServer ||
              (readConcern == repl::ReadConcernLevel::kMajorityReadConcern &&
               writeConcern.wMode == WriteConcernOptions::kMajority));
    BSONObj cmd = BSON("applyOps" << updateOps << "preCondition" << preCondition
                                  << WriteConcernOptions::kWriteConcernField
                                  << writeConcern.toBSON());

    auto response =
        Grid::get(opCtx)->shardRegistry()->getConfigShard()->runCommandWithFixedRetryAttempts(
            opCtx,
            ReadPreferenceSetting{ReadPreference::PrimaryOnly},
            "config",
            cmd,
            Shard::RetryPolicy::kIdempotent);

    if (!response.isOK()) {
        return response.getStatus();
    }

    Status status = response.getValue().commandStatus.isOK()
        ? std::move(response.getValue().writeConcernStatus)
        : std::move(response.getValue().commandStatus);

    // TODO (Dianna) This fail point needs to be reexamined when CommitChunkMigration is in:
    // migrations will no longer be able to exercise it, so split or merge will need to do so.
    // SERVER-22659.
    if (MONGO_FAIL_POINT(failApplyChunkOps)) {
        status = Status(ErrorCodes::InternalError, "Failpoint 'failApplyChunkOps' generated error");
    }

    if (!status.isOK()) {
        string errMsg;

        // This could be a blip in the network connectivity. Check if the commit request made it.
        //
        // If all the updates were successfully written to the chunks collection, the last
        // document in the list of updates should be returned from a query to the chunks
        // collection. The last chunk can be identified by namespace and version number.

        warning() << "chunk operation commit failed and metadata will be revalidated"
                  << causedBy(redact(status));

        // Look for the chunk in this shard whose version got bumped. We assume that if that
        // mod made it to the config server, then transaction was successful.
        BSONObjBuilder query;
        lastChunkVersion.appendLegacyWithField(&query, ChunkType::lastmod());
        query.append(ChunkType::ns(), nss.ns());
        auto chunkWithStatus = getChunks(opCtx, query.obj(), BSONObj(), 1, nullptr, readConcern);

        if (!chunkWithStatus.isOK()) {
            errMsg = str::stream()
                << "getChunks function failed, unable to validate chunk "
                << "operation metadata: " << chunkWithStatus.getStatus().toString()
                << ". applyChunkOpsDeprecated failed to get confirmation "
                << "of commit. Unable to save chunk ops. Command: " << cmd
                << ". Result: " << response.getValue().response;
            return status.withContext(errMsg);
        }

        const auto& newestChunk = chunkWithStatus.getValue();

        if (newestChunk.empty()) {
            errMsg = str::stream()
                << "chunk operation commit failed: version " << lastChunkVersion.toString()
                << " doesn't exist in namespace: " << nss.ns()
                << ". Unable to save chunk ops. Command: " << cmd
                << ". Result: " << response.getValue().response;
            return status.withContext(errMsg);
        }

        invariant(newestChunk.size() == 1);
        return Status::OK();
    }

    return Status::OK();
}
// ran at startup.
static void repairDatabasesAndCheckVersion(bool shouldClearNonLocalTmpCollections) {
    LOG(1) << "enter repairDatabases (to check pdfile version #)" << endl;

    OperationContextImpl txn;
    Lock::GlobalWrite lk(txn.lockState());
    WriteUnitOfWork wunit(txn.recoveryUnit());

    vector<string> dbNames;
    globalStorageEngine->listDatabases(&dbNames);

    for (vector<string>::iterator i = dbNames.begin(); i != dbNames.end(); ++i) {
        string dbName = *i;
        LOG(1) << "\t" << dbName << endl;

        Client::Context ctx(&txn, dbName);

        if (repl::replSettings.usingReplSets()) {
            // we only care about the _id index if we are in a replset
            checkForIdIndexes(&txn, ctx.db());
        }

        if (shouldClearNonLocalTmpCollections || dbName == "local")
            ctx.db()->clearTmpCollections(&txn);

        if (mongodGlobalParams.repair) {
            fassert(18506, globalStorageEngine->repairDatabase(&txn, dbName));
        }
        else if (!ctx.db()->getDatabaseCatalogEntry()->currentFilesCompatible(&txn)) {
            log() << "****";
            log() << "cannot do this upgrade without an upgrade in the middle";
            log() << "please do a --repair with 2.6 and then start this version";
            dbexit(EXIT_NEED_UPGRADE);
            invariant(false);
            return;
        }
        else {
            // major versions match, check indexes
            const string systemIndexes = ctx.db()->name() + ".system.indexes";
            Collection* coll = ctx.db()->getCollection(&txn, systemIndexes);
            auto_ptr<Runner> runner(InternalPlanner::collectionScan(&txn, systemIndexes, coll));
            BSONObj index;
            Runner::RunnerState state;
            while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&index, NULL))) {
                const BSONObj key = index.getObjectField("key");
                const string plugin = IndexNames::findPluginName(key);

                if (ctx.db()->getDatabaseCatalogEntry()->isOlderThan24(&txn)) {
                    if (IndexNames::existedBefore24(plugin))
                        continue;

                    log() << "Index " << index << " claims to be of type '" << plugin << "', "
                          << "which is either invalid or did not exist before v2.4. "
                          << "See the upgrade section: "
                          << "http://dochub.mongodb.org/core/upgrade-2.4"
                          << startupWarningsLog;
                }

                const Status keyStatus = validateKeyPattern(key);
                if (!keyStatus.isOK()) {
                    log() << "Problem with index " << index << ": " << keyStatus.reason()
                          << " This index can still be used however it cannot be rebuilt."
                          << " For more info see"
                          << " http://dochub.mongodb.org/core/index-validation"
                          << startupWarningsLog;
                }
            }

            if (Runner::RUNNER_EOF != state) {
                warning() << "Internal error while reading collection " << systemIndexes;
            }

            Database::closeDatabase(&txn, dbName.c_str());
        }
    }

    wunit.commit();

    LOG(1) << "done repairDatabases" << endl;
}
StatusWith<ForwardingCatalogManager::ScopedDistLock*>
ChunkMoveOperationState::acquireMoveMetadata() {
    // Get the distributed lock
    const string whyMessage(stream() << "migrating chunk [" << _minKey << ", " << _maxKey
                                     << ") in " << _nss.ns());
    _distLockStatus = grid.forwardingCatalogManager()->distLock(_txn, _nss.ns(), whyMessage);

    if (!_distLockStatus->isOK()) {
        const string msg = stream() << "could not acquire collection lock for " << _nss.ns()
                                    << " to migrate chunk [" << _minKey << "," << _maxKey << ")"
                                    << causedBy(_distLockStatus->getStatus());
        warning() << msg;
        return Status(_distLockStatus->getStatus().code(), msg);
    }

    ShardingState* const shardingState = ShardingState::get(_txn);

    // Snapshot the metadata
    Status refreshStatus = shardingState->refreshMetadataNow(_txn, _nss.ns(), &_shardVersion);
    if (!refreshStatus.isOK()) {
        const string msg = stream() << "moveChunk cannot start migrate of chunk "
                                    << "[" << _minKey << "," << _maxKey << ")"
                                    << causedBy(refreshStatus.reason());
        warning() << msg;
        return Status(refreshStatus.code(), msg);
    }

    if (_shardVersion.majorVersion() == 0) {
        // It makes no sense to migrate if our version is zero and we have no chunks
        const string msg = stream() << "moveChunk cannot start migrate of chunk "
                                    << "[" << _minKey << "," << _maxKey << ")"
                                    << " with zero shard version";
        warning() << msg;
        return Status(ErrorCodes::IncompatibleShardingMetadata, msg);
    }

    {
        // Mongos >= v3.2 sends the full version, v3.0 only sends the epoch.
        // TODO(SERVER-20742): Stop parsing epoch separately after 3.2.
        auto& operationVersion = OperationShardVersion::get(_txn);
        if (operationVersion.hasShardVersion()) {
            _collectionVersion = operationVersion.getShardVersion(_nss);
            _collectionEpoch = _collectionVersion.epoch();
        }  // else the epoch will already be set from the parsing of the ChunkMoveOperationState

        if (_collectionEpoch != _shardVersion.epoch()) {
            const string msg = stream() << "moveChunk cannot move chunk "
                                        << "[" << _minKey << "," << _maxKey << "), "
                                        << "collection may have been dropped. "
                                        << "current epoch: " << _shardVersion.epoch()
                                        << ", cmd epoch: " << _collectionEpoch;
            warning() << msg;
            throw SendStaleConfigException(_nss.toString(), msg, _collectionVersion, _shardVersion);
        }
    }

    _collMetadata = shardingState->getCollectionMetadata(_nss.ns());

    // With nonzero shard version, we must have a coll version >= our shard version
    invariant(_collMetadata->getCollVersion() >= _shardVersion);

    // With nonzero shard version, we must have a shard key
    invariant(!_collMetadata->getKeyPattern().isEmpty());

    ChunkType origChunk;
    if (!_collMetadata->getNextChunk(_minKey, &origChunk) ||
        origChunk.getMin().woCompare(_minKey) || origChunk.getMax().woCompare(_maxKey)) {
        // Our boundaries are different from those passed in
        const string msg = stream() << "moveChunk cannot find chunk "
                                    << "[" << _minKey << "," << _maxKey << ")"
                                    << " to migrate, the chunk boundaries may be stale";
        warning() << msg;
        throw SendStaleConfigException(_nss.toString(), msg, _collectionVersion, _shardVersion);
    }

    return &_distLockStatus->getValue();
}
StatusWith<bool> SaslPLAINServerConversation::step(const StringData& inputData,
                                                   std::string* outputData) {
    // Expecting user input on the form: user\0user\0pwd
    std::string input = inputData.toString();
    std::string pwd = "";

    try {
        _user = input.substr(0, inputData.find('\0'));
        pwd = input.substr(inputData.find('\0', _user.size() + 1) + 1);
    } catch (std::out_of_range& exception) {
        return StatusWith<bool>(ErrorCodes::AuthenticationFailed,
                                mongoutils::str::stream()
                                    << "Incorrectly formatted PLAIN client message");
    }

    User* userObj;
    // The authentication database is also the source database for the user.
    Status status =
        _saslAuthSession->getAuthorizationSession()->getAuthorizationManager().acquireUser(
            _saslAuthSession->getOpCtxt(),
            UserName(_user, _saslAuthSession->getAuthenticationDatabase()),
            &userObj);

    if (!status.isOK()) {
        return StatusWith<bool>(status);
    }

    const User::CredentialData creds = userObj->getCredentials();
    _saslAuthSession->getAuthorizationSession()->getAuthorizationManager().releaseUser(userObj);

    std::string authDigest = createPasswordDigest(_user, pwd);

    if (!creds.password.empty()) {
        // Handle schemaVersion26Final (MONGODB-CR/SCRAM mixed mode)
        if (authDigest != creds.password) {
            return StatusWith<bool>(ErrorCodes::AuthenticationFailed,
                                    mongoutils::str::stream()
                                        << "Incorrect user name or password");
        }
    } else {
        // Handle schemaVersion28SCRAM (SCRAM only mode)
        unsigned char storedKey[scram::hashSize];
        unsigned char serverKey[scram::hashSize];

        scram::generateSecrets(
            authDigest,
            reinterpret_cast<const unsigned char*>(base64::decode(creds.scram.salt).c_str()),
            16,
            creds.scram.iterationCount,
            storedKey,
            serverKey);

        if (creds.scram.storedKey !=
            base64::encode(reinterpret_cast<const char*>(storedKey), scram::hashSize)) {
            return StatusWith<bool>(ErrorCodes::AuthenticationFailed,
                                    mongoutils::str::stream()
                                        << "Incorrect user name or password");
        }
    }

    *outputData = "";

    return StatusWith<bool>(true);
}
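// RFC 4616 defines the PLAIN message as three NUL-separated fields: authzid \0 authcid \0
// passwd (the code above treats the first field as the user name). A minimal standalone
// sketch of that parse, using a hypothetical PlainMessage holder type:
#include <stdexcept>
#include <string>

struct PlainMessage {
    std::string authzid;   // authorization identity (may be empty)
    std::string authcid;   // authentication identity
    std::string password;
};

PlainMessage parsePlain(const std::string& input) {
    const size_t firstNul = input.find('\0');
    const size_t secondNul =
        firstNul == std::string::npos ? std::string::npos : input.find('\0', firstNul + 1);
    if (secondNul == std::string::npos)
        throw std::invalid_argument("Incorrectly formatted PLAIN client message");
    return PlainMessage{input.substr(0, firstNul),
                        input.substr(firstNul + 1, secondNul - firstNul - 1),
                        input.substr(secondNul + 1)};
}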
StatusWith<LocksType> LocksType::fromBSON(const BSONObj& source) {
    LocksType lock;

    {
        std::string lockName;
        Status status = bsonExtractStringField(source, name.name(), &lockName);
        if (!status.isOK())
            return status;
        lock._name = lockName;
    }

    {
        long long lockStateInt;
        Status status = bsonExtractIntegerField(source, state.name(), &lockStateInt);
        if (!status.isOK())
            return status;
        lock._state = static_cast<State>(lockStateInt);
    }

    if (source.hasField(process.name())) {
        std::string lockProcess;
        Status status = bsonExtractStringField(source, process.name(), &lockProcess);
        if (!status.isOK())
            return status;
        lock._process = lockProcess;
    }

    if (source.hasField(lockID.name())) {
        BSONElement lockIDElem;
        Status status = bsonExtractTypedField(source, lockID.name(), BSONType::jstOID, &lockIDElem);
        if (!status.isOK())
            return status;
        lock._lockID = lockIDElem.OID();
    }

    if (source.hasField(who.name())) {
        std::string lockWho;
        Status status = bsonExtractStringField(source, who.name(), &lockWho);
        if (!status.isOK())
            return status;
        lock._who = lockWho;
    }

    if (source.hasField(why.name())) {
        std::string lockWhy;
        Status status = bsonExtractStringField(source, why.name(), &lockWhy);
        if (!status.isOK())
            return status;
        lock._why = lockWhy;
    }

    return lock;
}
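// The function above repeats one pattern six times: required fields fail the whole parse when
// missing, optional fields are read only when present. A condensed sketch of that shape over a
// toy document type (Doc stands in for BSONObj; both names here are hypothetical):
#include <map>
#include <optional>
#include <string>

using Doc = std::map<std::string, std::string>;

std::optional<std::string> getField(const Doc& doc, const std::string& key) {
    auto it = doc.find(key);
    if (it == doc.end())
        return std::nullopt;
    return it->second;
}

bool parseLock(const Doc& doc, std::string* name, std::optional<std::string>* who) {
    auto n = getField(doc, "name");  // required: absence is a parse error
    if (!n)
        return false;
    *name = *n;
    *who = getField(doc, "who");     // optional: absence simply leaves it unset
    return true;
}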
Status parseCreateOrUpdateUserCommands(const BSONObj& cmdObj,
                                       const StringData& cmdName,
                                       const std::string& dbname,
                                       CreateOrUpdateUserArgs* parsedArgs) {
    unordered_set<std::string> validFieldNames;
    validFieldNames.insert(cmdName.toString());
    validFieldNames.insert("customData");
    validFieldNames.insert("pwd");
    validFieldNames.insert("roles");
    validFieldNames.insert("writeConcern");

    Status status = _checkNoExtraFields(cmdObj, cmdName, validFieldNames);
    if (!status.isOK()) {
        return status;
    }

    status = _extractWriteConcern(cmdObj, &parsedArgs->writeConcern);
    if (!status.isOK()) {
        return status;
    }

    BSONObjBuilder userObjBuilder;

    // Parse user name
    std::string userName;
    status = bsonExtractStringField(cmdObj, cmdName, &userName);
    if (!status.isOK()) {
        return status;
    }

    parsedArgs->userName = UserName(userName, dbname);

    // Parse password
    if (cmdObj.hasField("pwd")) {
        std::string clearTextPassword;
        status = bsonExtractStringField(cmdObj, "pwd", &clearTextPassword);
        if (!status.isOK()) {
            return status;
        }

        parsedArgs->hashedPassword = auth::createPasswordDigest(userName, clearTextPassword);
        parsedArgs->hasHashedPassword = true;
    }

    // Parse custom data
    if (cmdObj.hasField("customData")) {
        BSONElement element;
        status = bsonExtractTypedField(cmdObj, "customData", Object, &element);
        if (!status.isOK()) {
            return status;
        }
        parsedArgs->customData = element.Obj();
        parsedArgs->hasCustomData = true;
    }

    // Parse roles
    if (cmdObj.hasField("roles")) {
        BSONElement rolesElement;
        status = bsonExtractTypedField(cmdObj, "roles", Array, &rolesElement);
        if (!status.isOK()) {
            return status;
        }
        status = _extractRoleDataFromBSONArray(rolesElement, dbname, &parsedArgs->roles);
        if (!status.isOK()) {
            return status;
        }
        parsedArgs->hasRoles = true;
    }

    return Status::OK();
}
bool wrappedRun(OperationContext* txn,
                const string& dbname,
                BSONObj& jsobj,
                string& errmsg,
                BSONObjBuilder& anObjBuilder) {
    BSONElement e = jsobj.firstElement();
    const string toDeleteNs = dbname + '.' + e.valuestr();
    if (!serverGlobalParams.quiet) {
        LOG(0) << "CMD: dropIndexes " << toDeleteNs << endl;
    }

    Client::Context ctx(toDeleteNs);
    Database* db = ctx.db();

    Collection* collection = db->getCollection( txn, toDeleteNs );
    if ( !collection ) {
        errmsg = "ns not found";
        return false;
    }

    stopIndexBuilds(txn, db, jsobj);

    IndexCatalog* indexCatalog = collection->getIndexCatalog();
    anObjBuilder.appendNumber("nIndexesWas", indexCatalog->numIndexesTotal() );

    BSONElement f = jsobj.getField("index");
    if ( f.type() == String ) {
        string indexToDelete = f.valuestr();

        if ( indexToDelete == "*" ) {
            Status s = indexCatalog->dropAllIndexes(txn, false);
            if ( !s.isOK() ) {
                appendCommandStatus( anObjBuilder, s );
                return false;
            }
            anObjBuilder.append("msg", "non-_id indexes dropped for collection");
            return true;
        }

        IndexDescriptor* desc = collection->getIndexCatalog()->findIndexByName( indexToDelete );
        if ( desc == NULL ) {
            errmsg = str::stream() << "index not found with name [" << indexToDelete << "]";
            return false;
        }

        if ( desc->isIdIndex() ) {
            errmsg = "cannot drop _id index";
            return false;
        }

        Status s = indexCatalog->dropIndex(txn, desc);
        if ( !s.isOK() ) {
            appendCommandStatus( anObjBuilder, s );
            return false;
        }

        return true;
    }

    if ( f.type() == Object ) {
        IndexDescriptor* desc =
            collection->getIndexCatalog()->findIndexByKeyPattern( f.embeddedObject() );
        if ( desc == NULL ) {
            errmsg = "can't find index with key:";
            errmsg += f.embeddedObject().toString();
            return false;
        }

        if ( desc->isIdIndex() ) {
            errmsg = "cannot drop _id index";
            return false;
        }

        Status s = indexCatalog->dropIndex(txn, desc);
        if ( !s.isOK() ) {
            appendCommandStatus( anObjBuilder, s );
            return false;
        }

        return true;
    }

    errmsg = "invalid index name spec";
    return false;
}
bool run(OperationContext* txn,
         const string& dbname,
         BSONObj& jsobj,
         int,
         string& errmsg,
         BSONObjBuilder& result,
         bool /*fromRepl*/) {
    DBDirectClient db;

    BSONElement e = jsobj.firstElement();
    string toDeleteNs = dbname + '.' + e.valuestr();

    LOG(0) << "CMD: reIndex " << toDeleteNs << endl;

    Lock::DBWrite dbXLock(txn->lockState(), dbname);
    Client::Context ctx(toDeleteNs);

    Collection* collection = ctx.db()->getCollection( txn, toDeleteNs );

    if ( !collection ) {
        errmsg = "ns not found";
        return false;
    }

    BackgroundOperation::assertNoBgOpInProgForNs( toDeleteNs );

    std::vector<BSONObj> indexesInProg = stopIndexBuilds(txn, ctx.db(), jsobj);

    list<BSONObj> all;
    auto_ptr<DBClientCursor> i = db.query( dbname + ".system.indexes" ,
                                           BSON( "ns" << toDeleteNs ) ,
                                           0 , 0 , 0 ,
                                           QueryOption_SlaveOk );
    BSONObjBuilder b;
    while ( i->more() ) {
        const BSONObj spec = i->next().removeField("v").getOwned();
        const BSONObj key = spec.getObjectField("key");
        const Status keyStatus = validateKeyPattern(key);
        if (!keyStatus.isOK()) {
            errmsg = str::stream()
                << "Cannot rebuild index " << spec << ": " << keyStatus.reason()
                << " For more info see http://dochub.mongodb.org/core/index-validation";
            return false;
        }

        b.append( BSONObjBuilder::numStr( all.size() ) , spec );
        all.push_back( spec );
    }

    result.appendNumber( "nIndexesWas", collection->getIndexCatalog()->numIndexesTotal() );

    Status s = collection->getIndexCatalog()->dropAllIndexes(txn, true);
    if ( !s.isOK() ) {
        errmsg = "dropIndexes failed";
        return appendCommandStatus( result, s );
    }

    for ( list<BSONObj>::iterator i=all.begin(); i!=all.end(); i++ ) {
        BSONObj o = *i;
        LOG(1) << "reIndex ns: " << toDeleteNs << " index: " << o << endl;
        Status s = collection->getIndexCatalog()->createIndex(txn, o, false);
        if ( !s.isOK() )
            return appendCommandStatus( result, s );
    }

    result.append( "nIndexes" , (int)all.size() );
    result.appendArray( "indexes" , b.obj() );

    IndexBuilder::restoreIndexes(indexesInProg);
    return true;
}
Status MetadataLoader::initChunks(CatalogManager* catalogManager,
                                  const string& ns,
                                  const string& shard,
                                  const CollectionMetadata* oldMetadata,
                                  CollectionMetadata* metadata) const {
    map<string, ChunkVersion> versionMap;

    // Preserve the epoch
    versionMap[shard] = metadata->_shardVersion;
    OID epoch = metadata->getCollVersion().epoch();
    bool fullReload = true;

    // Check to see if we should use the old version or not.
    if ( oldMetadata ) {
        // If our epochs are compatible, it's useful to use the old metadata for diffs
        if ( oldMetadata->getCollVersion().hasEqualEpoch( epoch ) ) {
            fullReload = false;
            invariant( oldMetadata->isValid() );

            versionMap[shard] = oldMetadata->_shardVersion;
            metadata->_collVersion = oldMetadata->_collVersion;

            // TODO: This could be made more efficient if copying not required, but
            // not as frequently reloaded as in mongos.
            metadata->_chunksMap = oldMetadata->_chunksMap;

            LOG( 2 ) << "loading new chunks for collection " << ns
                     << " using old metadata w/ version " << oldMetadata->getShardVersion()
                     << " and " << metadata->_chunksMap.size() << " chunks" << endl;
        }
        else {
            warning() << "reloading collection metadata for " << ns << " with new epoch "
                      << epoch.toString() << ", the current epoch is "
                      << oldMetadata->getCollVersion().epoch().toString() << endl;
        }
    }

    // Exposes the new metadata's range map and version to the "differ," who
    // would ultimately be responsible of filling them up.
    SCMConfigDiffTracker differ( shard );
    differ.attach( ns, metadata->_chunksMap, metadata->_collVersion, versionMap );

    try {
        std::vector<ChunkType> chunks;
        Status status = catalogManager->getChunks(differ.configDiffQuery(), &chunks);
        if (!status.isOK()) {
            if (status == ErrorCodes::HostUnreachable) {
                // Make our metadata invalid
                metadata->_collVersion = ChunkVersion( 0, 0, OID() );
                metadata->_chunksMap.clear();
            }
            return status;
        }

        //
        // The diff tracker should always find at least one chunk (the highest chunk we saw
        // last time). If not, something has changed on the config server (potentially between
        // when we read the collection data and when we read the chunks data).
        //

        int diffsApplied = differ.calculateConfigDiff(chunks);
        if ( diffsApplied > 0 ) {
            // Chunks found, return ok
            LOG(2) << "loaded " << diffsApplied << " chunks into new metadata for " << ns
                   << " with version " << metadata->_collVersion << endl;

            metadata->_shardVersion = versionMap[shard];
            metadata->fillRanges();

            invariant( metadata->isValid() );
            return Status::OK();
        }
        else if ( diffsApplied == 0 ) {
            // No chunks found, the collection is dropping or we're confused
            // If this is a full reload, assume it is a drop for backwards compatibility
            // TODO: drop the config.collections entry *before* the chunks and eliminate this
            // ambiguity

            string errMsg = str::stream()
                << "no chunks found when reloading " << ns << ", previous version was "
                << metadata->_collVersion.toString() << ( fullReload ? ", this is a drop" : "" );

            warning() << errMsg << endl;

            metadata->_collVersion = ChunkVersion( 0, 0, OID() );
            metadata->_chunksMap.clear();

            return fullReload ? Status( ErrorCodes::NamespaceNotFound, errMsg )
                              : Status( ErrorCodes::RemoteChangeDetected, errMsg );
        }
        else {
            // Invalid chunks found, our epoch may have changed because we dropped/recreated
            // the collection.

            string errMsg = str::stream()
                << "invalid chunks found when reloading " << ns << ", previous version was "
                << metadata->_collVersion.toString() << ", this should be rare";

            warning() << errMsg << endl;

            metadata->_collVersion = ChunkVersion( 0, 0, OID() );
            metadata->_chunksMap.clear();

            return Status( ErrorCodes::RemoteChangeDetected, errMsg );
        }
    }
    catch ( const DBException& e ) {
        string errMsg = str::stream() << "problem querying chunks metadata" << causedBy( e );

        // We deliberately do not return connPtr to the pool, since it was involved
        // with the error here.

        return Status( ErrorCodes::HostUnreachable, errMsg );
    }
}
// Construct a subscription if needed
// =============================================================================================
Status SubscriptionFactory::acquireSubscription(const SubscriptionSettings& subscriptionSettings,
                                                Subscription*& subscription) {
    logger_->debug("Acquiring subscription with the following settings:");
    logger_->debug(subscriptionSettings.toString());

    Status ret;

    subscription = 0;

    // lock the mutex to make sure the subscriptionMap_ is not being manipulated
    UaMutexLocker locker(&subscriptionMapMutex_);

    // we'll try to find a similar subscription that can be re-used,
    // unless we're creating an "unique" subscription
    // (one that is only created for -and used by- the current request)
    if (subscriptionSettings.unique) {
        logger_->debug("The requested subscription must be unique");
    } else {
        // loop through the subscriptions ...
        for (SubscriptionMap::const_iterator it = subscriptionMap_.begin();
             it != subscriptionMap_.end();
             ++it) {
            // ... until a suitable one is found
            if (it->second->subscriptionSettings() == subscriptionSettings) {
                subscription = it->second;

                logger_->debug("A suitable subscription (ClientSubscriptionHandle=%d) already exists",
                               subscription->clientSubscriptionHandle());

                // get the ClientSubscriptionHandle of the subscription
                ClientSubscriptionHandle handle = subscription->clientSubscriptionHandle();

                // now increment the activity count of the subscription
                activityMapMutex_.lock();
                activityMap_[handle] = activityMap_[handle] + 1;
                activityMapMutex_.unlock();

                ret.setGood();

                break;
            }
        }
    }

    // if no subscription exists yet, we create one
    if (subscription == 0) {
        ClientSubscriptionHandle clientSubscriptionHandle;
        clientSubscriptionHandle = database_->createUniqueClientSubscriptionHandle();

        logger_->debug("We create a new subscription with clientSubscriptionHandle %d",
                       clientSubscriptionHandle);

        // create a new subscription instance
        subscription = new Subscription(logger_->loggerFactory(),
                                        subscriptionSettings,
                                        clientSubscriptionHandle,
                                        clientConnectionId_,
                                        uaSession_,
                                        this,
                                        clientInterface_,
                                        database_);

        // store the new subscription instance in the subscriptionMap
        subscriptionMap_[clientSubscriptionHandle] = subscription;

        logger_->debug("The new subscription has been created");

        // create an activity count for the subscription
        activityMapMutex_.lock();
        activityMap_[clientSubscriptionHandle] = 1;
        activityMapMutex_.unlock();

        // create the subscription on the server
        ret = subscription->createSubscription();
    }

    // 'subscription' now points to a valid Subscription instance (i.e. a valid memory
    // location), and 'ret' reflects whether it could be re-used or created on the server.

    // add some diagnostics
    if (ret.isGood()) {
        activityMapMutex_.lock();
        logger_->debug("The requested subscription is acquired (#activities: %d)",
                       activityMap_[subscription->clientSubscriptionHandle()]);
        activityMapMutex_.unlock();
    } else {
        logger_->error("The requested subscription could not be acquired");
        ret.addDiagnostic("The requested subscription could not be acquired");
    }

    return ret;
}
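// The acquire path above is a reuse-or-create pattern with an activity (reference) count per
// resource. A minimal standalone sketch of that shape, with hypothetical Resource/Factory types
// and std primitives in place of the Ua* wrappers:
#include <map>
#include <mutex>

struct Resource { int settings = 0; };

class Factory {
public:
    Resource* acquire(int settings) {
        std::lock_guard<std::mutex> guard(mutex_);
        for (auto& entry : resources_) {
            if (entry.second.settings == settings) {
                ++activity_[entry.first];          // reuse: one more client is interested
                return &entry.second;
            }
        }
        int handle = nextHandle_++;
        resources_[handle] = Resource{settings};   // create: first client of this resource
        activity_[handle] = 1;
        return &resources_[handle];
    }

private:
    std::mutex mutex_;
    std::map<int, Resource> resources_;
    std::map<int, int> activity_;
    int nextHandle_ = 0;
};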
// ran at startup.
static void repairDatabasesAndCheckVersion(bool shouldClearNonLocalTmpCollections) {
    //        LastError * le = lastError.get( true );
    LOG(1) << "enter repairDatabases (to check pdfile version #)" << endl;

    Lock::GlobalWrite lk;
    DurTransaction txn;
    vector< string > dbNames;
    getDatabaseNames( dbNames );
    for ( vector< string >::iterator i = dbNames.begin(); i != dbNames.end(); ++i ) {
        string dbName = *i;
        LOG(1) << "\t" << dbName << endl;

        Client::Context ctx( dbName );
        DataFile *p = ctx.db()->getExtentManager().getFile( 0 );
        DataFileHeader *h = p->getHeader();

        if ( replSettings.usingReplSets() ) {
            // we only care about the _id index if we are in a replset
            checkForIdIndexes(ctx.db());
        }

        if (shouldClearNonLocalTmpCollections || dbName == "local")
            ctx.db()->clearTmpCollections(&txn);

        if (!h->isCurrentVersion() || mongodGlobalParams.repair) {

            if( h->version <= 0 ) {
                uasserted(14026,
                          str::stream() << "db " << dbName << " appears corrupt pdfile version: "
                                        << h->version << " info: " << h->versionMinor << ' '
                                        << h->fileLength);
            }

            if ( !h->isCurrentVersion() ) {
                log() << "****" << endl;
                log() << "****" << endl;
                log() << "need to upgrade database " << dbName << " "
                      << "with pdfile version " << h->version << "." << h->versionMinor << ", "
                      << "new version: "
                      << PDFILE_VERSION << "." << PDFILE_VERSION_MINOR_22_AND_OLDER << endl;
            }

            if (mongodGlobalParams.upgrade) {
                // QUESTION: Repair even if file format is higher version than code?
                doDBUpgrade( dbName, h );
            }
            else {
                log() << "\t Not upgrading, exiting" << endl;
                log() << "\t run --upgrade to upgrade dbs, then start again" << endl;
                log() << "****" << endl;
                dbexit( EXIT_NEED_UPGRADE );
                mongodGlobalParams.upgrade = 1;
                return;
            }
        }
        else {
            const string systemIndexes = ctx.db()->name() + ".system.indexes";
            Collection* coll = ctx.db()->getCollection( systemIndexes );
            auto_ptr<Runner> runner(InternalPlanner::collectionScan(systemIndexes,coll));
            BSONObj index;
            Runner::RunnerState state;
            while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&index, NULL))) {
                const BSONObj key = index.getObjectField("key");
                const string plugin = IndexNames::findPluginName(key);

                if (h->versionMinor == PDFILE_VERSION_MINOR_22_AND_OLDER) {
                    if (IndexNames::existedBefore24(plugin))
                        continue;

                    log() << "Index " << index << " claims to be of type '" << plugin << "', "
                          << "which is either invalid or did not exist before v2.4. "
                          << "See the upgrade section: "
                          << "http://dochub.mongodb.org/core/upgrade-2.4"
                          << startupWarningsLog;
                }

                const Status keyStatus = validateKeyPattern(key);
                if (!keyStatus.isOK()) {
                    log() << "Problem with index " << index << ": " << keyStatus.reason()
                          << " This index can still be used however it cannot be rebuilt."
                          << " For more info see"
                          << " http://dochub.mongodb.org/core/index-validation"
                          << startupWarningsLog;
                }
            }

            if (Runner::RUNNER_EOF != state) {
                warning() << "Internal error while reading collection " << systemIndexes;
            }

            Database::closeDatabase(dbName.c_str(), storageGlobalParams.dbpath);
        }
    }

    LOG(1) << "done repairDatabases" << endl;

    if (mongodGlobalParams.upgrade) {
        log() << "finished checking dbs" << endl;
        cc().shutdown();
        dbexit( EXIT_CLEAN );
    }
}
StatusWith<BSONObj> fixDocumentForInsert( const BSONObj& doc ) {
    if ( doc.objsize() > BSONObjMaxUserSize )
        return StatusWith<BSONObj>( ErrorCodes::BadValue,
                                    str::stream() << "object to insert too large"
                                                  << doc.objsize() );

    bool firstElementIsId = doc.firstElement().fieldNameStringData() == "_id";
    bool hasTimestampToFix = false;
    {
        BSONObjIterator i( doc );
        while ( i.more() ) {
            BSONElement e = i.next();

            if ( e.type() == Timestamp && e.timestampValue() == 0 ) {
                // we replace Timestamp(0,0) at the top level with a correct value
                // in the fast pass, we just mark that we want to swap
                hasTimestampToFix = true;
                break;
            }

            const char* fieldName = e.fieldName();

            if ( fieldName[0] == '$' ) {
                return StatusWith<BSONObj>( ErrorCodes::BadValue,
                                            str::stream()
                                                << "Document can't have $ prefixed field names: "
                                                << e.fieldName() );
            }

            // check no regexp for _id (SERVER-9502)
            // also, disallow undefined and arrays
            if ( str::equals( fieldName, "_id") ) {
                if ( e.type() == RegEx ) {
                    return StatusWith<BSONObj>( ErrorCodes::BadValue,
                                                "can't use a regex for _id" );
                }
                if ( e.type() == Undefined ) {
                    return StatusWith<BSONObj>( ErrorCodes::BadValue,
                                                "can't use an undefined for _id" );
                }
                if ( e.type() == Array ) {
                    return StatusWith<BSONObj>( ErrorCodes::BadValue,
                                                "can't use an array for _id" );
                }
                if ( e.type() == Object ) {
                    BSONObj o = e.Obj();
                    Status s = o.storageValidEmbedded();
                    if ( !s.isOK() )
                        return StatusWith<BSONObj>( s );
                }
            }
        }
    }

    if ( firstElementIsId && !hasTimestampToFix )
        return StatusWith<BSONObj>( BSONObj() );

    bool hadId = firstElementIsId;

    BSONObjIterator i( doc );

    BSONObjBuilder b( doc.objsize() + 16 );
    if ( firstElementIsId ) {
        b.append( doc.firstElement() );
        i.next();
    }
    else {
        BSONElement e = doc["_id"];
        if ( e.type() ) {
            b.append( e );
            hadId = true;
        }
        else {
            b.appendOID( "_id", NULL, true );
        }
    }

    while ( i.more() ) {
        BSONElement e = i.next();
        if ( hadId && e.fieldNameStringData() == "_id" ) {
            // no-op
        }
        else if ( e.type() == Timestamp && e.timestampValue() == 0 ) {
            mutex::scoped_lock lk(OpTime::m);
            b.append( e.fieldName(), OpTime::now(lk) );
        }
        else {
            b.append( e );
        }
    }

    return StatusWith<BSONObj>( b.obj() );
}
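// The rewrite loop above does two things: hoist (or create) _id as the first field and fill in
// zero timestamp placeholders while copying everything else through. A standalone sketch of
// that shape over a toy document type (Field/Doc are hypothetical stand-ins for
// BSONElement/BSONObj):
#include <string>
#include <vector>

struct Field { std::string name; long long value; };
using ToyDoc = std::vector<Field>;

ToyDoc fixForInsert(const ToyDoc& doc, long long now, long long newId) {
    ToyDoc out;
    bool hadId = false;
    for (const Field& f : doc) {                 // first: hoist an existing _id to the front
        if (f.name == "_id") {
            out.push_back(f);
            hadId = true;
            break;
        }
    }
    if (!hadId)
        out.push_back(Field{"_id", newId});      // no _id supplied: generate one
    for (const Field& f : doc) {
        if (f.name == "_id")
            continue;                            // already emitted first
        if (f.name == "ts" && f.value == 0)
            out.push_back(Field{f.name, now});   // zero placeholder: stamp the current time
        else
            out.push_back(f);                    // everything else copies through unchanged
    }
    return out;
}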
StatusWith<CompactStats> compactCollection(OperationContext* opCtx,
                                           Collection* collection,
                                           const CompactOptions* compactOptions) {
    dassert(opCtx->lockState()->isCollectionLockedForMode(collection->ns(), MODE_X));

    DisableDocumentValidation validationDisabler(opCtx);

    auto recordStore = collection->getRecordStore();
    auto indexCatalog = collection->getIndexCatalog();

    if (!recordStore->compactSupported())
        return StatusWith<CompactStats>(ErrorCodes::CommandNotSupported,
                                        str::stream()
                                            << "cannot compact collection with record store: "
                                            << recordStore->name());

    if (recordStore->compactsInPlace()) {
        CompactStats stats;
        Status status = recordStore->compact(opCtx);
        if (!status.isOK())
            return StatusWith<CompactStats>(status);

        // Compact all indexes (not including unfinished indexes)
        status = indexCatalog->compactIndexes(opCtx);
        if (!status.isOK())
            return StatusWith<CompactStats>(status);

        return StatusWith<CompactStats>(stats);
    }

    if (indexCatalog->numIndexesInProgress(opCtx))
        return StatusWith<CompactStats>(ErrorCodes::BadValue,
                                        "cannot compact when indexes in progress");

    std::vector<BSONObj> indexSpecs;
    {
        std::unique_ptr<IndexCatalog::IndexIterator> ii(
            indexCatalog->getIndexIterator(opCtx, false));
        while (ii->more()) {
            const IndexDescriptor* descriptor = ii->next()->descriptor();

            // Compact always creates the new index in the foreground.
            const BSONObj spec =
                descriptor->infoObj().removeField(IndexDescriptor::kBackgroundFieldName);
            const BSONObj key = spec.getObjectField("key");
            const Status keyStatus =
                index_key_validate::validateKeyPattern(key, descriptor->version());
            if (!keyStatus.isOK()) {
                return StatusWith<CompactStats>(
                    ErrorCodes::CannotCreateIndex,
                    str::stream() << "Cannot compact collection due to invalid index " << spec
                                  << ": " << keyStatus.reason() << " For more info see"
                                  << " http://dochub.mongodb.org/core/index-validation");
            }
            indexSpecs.push_back(spec);
        }
    }

    // Give a chance to be interrupted *before* we drop all indexes.
    opCtx->checkForInterrupt();

    {
        // note that the drop indexes call also invalidates all clientcursors for the namespace,
        // which is important and wanted here
        WriteUnitOfWork wunit(opCtx);
        log() << "compact dropping indexes";
        indexCatalog->dropAllIndexes(opCtx, true);
        wunit.commit();
    }

    CompactStats stats;

    MultiIndexBlock indexer;
    indexer.ignoreUniqueConstraint();  // in compact we should be doing no checking

    // The 'indexer' could throw, so ensure build cleanup occurs.
    ON_BLOCK_EXIT([&] { indexer.cleanUpAfterBuild(opCtx, collection); });

    Status status =
        indexer.init(opCtx, collection, indexSpecs, MultiIndexBlock::kNoopOnInitFn).getStatus();
    if (!status.isOK())
        return StatusWith<CompactStats>(status);

    status = recordStore->compact(opCtx);
    if (!status.isOK())
        return StatusWith<CompactStats>(status);

    log() << "starting index commits";
    status = indexer.dumpInsertsFromBulk(opCtx);
    if (!status.isOK())
        return StatusWith<CompactStats>(status);

    {
        WriteUnitOfWork wunit(opCtx);
        status = indexer.commit(opCtx,
                                collection,
                                MultiIndexBlock::kNoopOnCreateEachFn,
                                MultiIndexBlock::kNoopOnCommitFn);
        if (!status.isOK()) {
            return StatusWith<CompactStats>(status);
        }
        wunit.commit();
    }

    return StatusWith<CompactStats>(stats);
}
void SyncSourceFeedback::run() {
    Client::initThread("SyncSourceFeedbackThread");
    OperationContextImpl txn;

    bool positionChanged = false;
    bool handshakeNeeded = false;
    ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();
    while (!inShutdown()) {  // TODO(spencer): Remove once legacy repl coordinator is gone.
        {
            boost::unique_lock<boost::mutex> lock(_mtx);
            while (!_positionChanged && !_handshakeNeeded && !_shutdownSignaled) {
                _cond.wait(lock);
            }

            if (_shutdownSignaled) {
                break;
            }

            positionChanged = _positionChanged;
            handshakeNeeded = _handshakeNeeded;
            _positionChanged = false;
            _handshakeNeeded = false;
        }

        MemberState state = replCoord->getCurrentMemberState();
        if (state.primary() || state.startup()) {
            _resetConnection();
            continue;
        }
        const HostAndPort target = BackgroundSync::get()->getSyncTarget();
        if (_syncTarget != target) {
            _resetConnection();
            _syncTarget = target;
        }
        if (!hasConnection()) {
            // fix connection if need be
            if (target.empty()) {
                sleepmillis(500);
                continue;
            }
            if (!_connect(&txn, target)) {
                sleepmillis(500);
                continue;
            }
            handshakeNeeded = true;
        }
        if (handshakeNeeded) {
            positionChanged = true;
            if (!replHandshake(&txn)) {
                boost::unique_lock<boost::mutex> lock(_mtx);
                _handshakeNeeded = true;
                continue;
            }
        }
        if (positionChanged) {
            Status status = updateUpstream(&txn);
            if (!status.isOK()) {
                boost::unique_lock<boost::mutex> lock(_mtx);
                _positionChanged = true;
                if (status == ErrorCodes::NodeNotFound) {
                    _handshakeNeeded = true;
                }
            }
        }
    }
    cc().shutdown();
}
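// The loop above uses a classic condition-variable idiom: sleep until some flag is raised,
// then snapshot-and-clear the flags while holding the lock so the actual work happens outside
// the critical section and no signal is lost. A minimal standalone sketch with std primitives
// (FlagWaiter is a hypothetical type, reduced to a single flag):
#include <condition_variable>
#include <mutex>

class FlagWaiter {
public:
    void notifyPositionChanged() {
        { std::lock_guard<std::mutex> lk(mtx_); positionChanged_ = true; }
        cond_.notify_all();
    }
    void shutdown() {
        { std::lock_guard<std::mutex> lk(mtx_); shutdown_ = true; }
        cond_.notify_all();
    }
    // Returns false on shutdown; otherwise returns with the flag snapshotted and cleared.
    bool waitForWork(bool* positionChanged) {
        std::unique_lock<std::mutex> lk(mtx_);
        cond_.wait(lk, [this] { return positionChanged_ || shutdown_; });
        if (shutdown_)
            return false;
        *positionChanged = positionChanged_;  // snapshot under the lock...
        positionChanged_ = false;             // ...and clear, so new signals aren't lost
        return true;
    }

private:
    std::mutex mtx_;
    std::condition_variable cond_;
    bool positionChanged_ = false;
    bool shutdown_ = false;
};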
PlanStage* parseQuery(OperationContext* txn,
                      Collection* collection,
                      BSONObj obj,
                      WorkingSet* workingSet,
                      OwnedPointerVector<MatchExpression>* exprs) {
    BSONElement firstElt = obj.firstElement();
    if (!firstElt.isABSONObj()) {
        return NULL;
    }
    BSONObj paramObj = firstElt.Obj();

    MatchExpression* matcher = NULL;
    BSONObj nodeArgs;

    // Every node has these two fields.
    const string filterTag = "filter";
    const string argsTag = "args";

    BSONObjIterator it(paramObj);
    while (it.more()) {
        BSONElement e = it.next();
        if (!e.isABSONObj()) {
            return NULL;
        }
        BSONObj argObj = e.Obj();
        if (filterTag == e.fieldName()) {
            StatusWithMatchExpression statusWithMatcher =
                MatchExpressionParser::parse(argObj,
                                             ExtensionsCallbackReal(txn, &collection->ns()));
            if (!statusWithMatcher.isOK()) {
                return NULL;
            }
            std::unique_ptr<MatchExpression> me = std::move(statusWithMatcher.getValue());
            // exprs is what will wind up deleting this.
            matcher = me.release();
            verify(NULL != matcher);
            exprs->mutableVector().push_back(matcher);
        } else if (argsTag == e.fieldName()) {
            nodeArgs = argObj;
        } else {
            uasserted(16910,
                      "Unknown fieldname " + string(e.fieldName()) + " in query node " +
                          obj.toString());
            return NULL;
        }
    }

    string nodeName = firstElt.fieldName();

    if ("ixscan" == nodeName) {
        // This'll throw if it's not an obj but that's OK.
        BSONObj keyPatternObj = nodeArgs["keyPattern"].Obj();

        IndexDescriptor* desc =
            collection->getIndexCatalog()->findIndexByKeyPattern(txn, keyPatternObj);
        uassert(16890, "Can't find index: " + keyPatternObj.toString(), desc);

        IndexScanParams params;
        params.descriptor = desc;
        params.bounds.isSimpleRange = true;
        params.bounds.startKey = stripFieldNames(nodeArgs["startKey"].Obj());
        params.bounds.endKey = stripFieldNames(nodeArgs["endKey"].Obj());
        params.bounds.endKeyInclusive = nodeArgs["endKeyInclusive"].Bool();
        params.direction = nodeArgs["direction"].numberInt();

        return new IndexScan(txn, params, workingSet, matcher);
    } else if ("andHash" == nodeName) {
        uassert(16921, "Nodes argument must be provided to AND", nodeArgs["nodes"].isABSONObj());

        auto andStage = make_unique<AndHashStage>(txn, workingSet, collection);

        int nodesAdded = 0;
        BSONObjIterator it(nodeArgs["nodes"].Obj());
        while (it.more()) {
            BSONElement e = it.next();
            uassert(16922, "node of AND isn't an obj?: " + e.toString(), e.isABSONObj());

            PlanStage* subNode = parseQuery(txn, collection, e.Obj(), workingSet, exprs);
            uassert(16923, "Can't parse sub-node of AND: " + e.Obj().toString(), NULL != subNode);
            // takes ownership
            andStage->addChild(subNode);
            ++nodesAdded;
        }

        uassert(16927, "AND requires more than one child", nodesAdded >= 2);

        return andStage.release();
    } else if ("andSorted" == nodeName) {
        uassert(16924, "Nodes argument must be provided to AND", nodeArgs["nodes"].isABSONObj());

        auto andStage = make_unique<AndSortedStage>(txn, workingSet, collection);

        int nodesAdded = 0;
        BSONObjIterator it(nodeArgs["nodes"].Obj());
        while (it.more()) {
            BSONElement e = it.next();
            uassert(16925, "node of AND isn't an obj?: " + e.toString(), e.isABSONObj());

            PlanStage* subNode = parseQuery(txn, collection, e.Obj(), workingSet, exprs);
            uassert(16926, "Can't parse sub-node of AND: " + e.Obj().toString(), NULL != subNode);
            // takes ownership
            andStage->addChild(subNode);
            ++nodesAdded;
        }

        uassert(16928, "AND requires more than one child", nodesAdded >= 2);

        return andStage.release();
    } else if ("or" == nodeName) {
        uassert(16934, "Nodes argument must be provided to AND", nodeArgs["nodes"].isABSONObj());
        uassert(16935, "Dedup argument must be provided to OR", !nodeArgs["dedup"].eoo());
        BSONObjIterator it(nodeArgs["nodes"].Obj());
        auto orStage = make_unique<OrStage>(txn, workingSet, nodeArgs["dedup"].Bool(), matcher);
        while (it.more()) {
            BSONElement e = it.next();
            if (!e.isABSONObj()) {
                return NULL;
            }
            PlanStage* subNode = parseQuery(txn, collection, e.Obj(), workingSet, exprs);
            uassert(16936, "Can't parse sub-node of OR: " + e.Obj().toString(), NULL != subNode);
            // takes ownership
            orStage->addChild(subNode);
        }

        return orStage.release();
    } else if ("fetch" == nodeName) {
        uassert(16929, "Node argument must be provided to fetch", nodeArgs["node"].isABSONObj());
        PlanStage* subNode = parseQuery(txn, collection, nodeArgs["node"].Obj(), workingSet, exprs);
        uassert(28731,
                "Can't parse sub-node of FETCH: " + nodeArgs["node"].Obj().toString(),
                NULL != subNode);
        return new FetchStage(txn, workingSet, subNode, matcher, collection);
    } else if ("limit" == nodeName) {
        uassert(16937, "Limit stage doesn't have a filter (put it on the child)", NULL == matcher);
        uassert(16930, "Node argument must be provided to limit", nodeArgs["node"].isABSONObj());
        uassert(16931, "Num argument must be provided to limit", nodeArgs["num"].isNumber());
        PlanStage* subNode = parseQuery(txn, collection, nodeArgs["node"].Obj(), workingSet, exprs);
        uassert(28732,
                "Can't parse sub-node of LIMIT: " + nodeArgs["node"].Obj().toString(),
                NULL != subNode);
        return new LimitStage(txn, nodeArgs["num"].numberInt(), workingSet, subNode);
    } else if ("skip" == nodeName) {
        uassert(16938, "Skip stage doesn't have a filter (put it on the child)", NULL == matcher);
        uassert(16932, "Node argument must be provided to skip", nodeArgs["node"].isABSONObj());
        uassert(16933, "Num argument must be provided to skip", nodeArgs["num"].isNumber());
        PlanStage* subNode = parseQuery(txn, collection, nodeArgs["node"].Obj(), workingSet, exprs);
        uassert(28733,
                "Can't parse sub-node of SKIP: " + nodeArgs["node"].Obj().toString(),
                NULL != subNode);
        return new SkipStage(txn, nodeArgs["num"].numberInt(), workingSet, subNode);
    } else if ("cscan" == nodeName) {
        CollectionScanParams params;
        params.collection = collection;

        // What direction?
        uassert(16963,
                "Direction argument must be specified and be a number",
                nodeArgs["direction"].isNumber());
        if (1 == nodeArgs["direction"].numberInt()) {
            params.direction = CollectionScanParams::FORWARD;
        } else {
            params.direction = CollectionScanParams::BACKWARD;
        }

        return new CollectionScan(txn, params, workingSet, matcher);
    }
// sort is disabled for now.
#if 0
    else if ("sort" == nodeName) {
        uassert(16969, "Node argument must be provided to sort",
                nodeArgs["node"].isABSONObj());
        uassert(16970, "Pattern argument must be provided to sort",
                nodeArgs["pattern"].isABSONObj());
        PlanStage* subNode = parseQuery(txn, db, nodeArgs["node"].Obj(), workingSet, exprs);
        SortStageParams params;
        params.pattern = nodeArgs["pattern"].Obj();
        return new SortStage(params, workingSet, subNode);
    }
#endif
    else if ("mergeSort" == nodeName) {
        uassert(16971, "Nodes argument must be provided to sort", nodeArgs["nodes"].isABSONObj());
        uassert(16972,
                "Pattern argument must be provided to sort",
                nodeArgs["pattern"].isABSONObj());

        MergeSortStageParams params;
        params.pattern = nodeArgs["pattern"].Obj();
        // Dedup is true by default.

        auto mergeStage = make_unique<MergeSortStage>(txn, params, workingSet, collection);

        BSONObjIterator it(nodeArgs["nodes"].Obj());
        while (it.more()) {
            BSONElement e = it.next();
            uassert(16973, "node of mergeSort isn't an obj?: " + e.toString(), e.isABSONObj());

            PlanStage* subNode = parseQuery(txn, collection, e.Obj(), workingSet, exprs);
            uassert(16974,
                    "Can't parse sub-node of mergeSort: " + e.Obj().toString(),
                    NULL != subNode);
            // takes ownership
            mergeStage->addChild(subNode);
        }
        return mergeStage.release();
    } else if ("text" == nodeName) {
        string search = nodeArgs["search"].String();

        vector<IndexDescriptor*> idxMatches;
        collection->getIndexCatalog()->findIndexByType(txn, "text", idxMatches);
        uassert(17194, "Expected exactly one text index", idxMatches.size() == 1);

        IndexDescriptor* index = idxMatches[0];
        FTSAccessMethod* fam =
            dynamic_cast<FTSAccessMethod*>(collection->getIndexCatalog()->getIndex(index));
        TextStageParams params(fam->getSpec());
        params.index = index;

        // TODO: Deal with non-empty filters. This is a hack to put in covering information
        // that can only be checked for equality. We ignore this now.
        Status s = fam->getSpec().getIndexPrefix(BSONObj(), &params.indexPrefix);
        if (!s.isOK()) {
            // errmsg = s.toString();
            return NULL;
        }

        params.spec = fam->getSpec();

        params.query.setQuery(search);
        params.query.setLanguage(fam->getSpec().defaultLanguage().str());
        params.query.setCaseSensitive(TextMatchExpressionBase::kCaseSensitiveDefault);
        params.query.setDiacriticSensitive(TextMatchExpressionBase::kDiacriticSensitiveDefault);
        if (!params.query.parse(fam->getSpec().getTextIndexVersion()).isOK()) {
            return NULL;
        }

        return new TextStage(txn, params, workingSet, matcher);
    } else if ("delete" == nodeName) {
        uassert(18636, "Delete stage doesn't have a filter (put it on the child)", NULL == matcher);
        uassert(18637, "node argument must be provided to delete", nodeArgs["node"].isABSONObj());
        uassert(18638,
                "isMulti argument must be provided to delete",
                nodeArgs["isMulti"].type() == Bool);
        PlanStage* subNode = parseQuery(txn, collection, nodeArgs["node"].Obj(), workingSet, exprs);
        uassert(28734,
                "Can't parse sub-node of DELETE: " + nodeArgs["node"].Obj().toString(),
                NULL != subNode);
        DeleteStageParams params;
        params.isMulti = nodeArgs["isMulti"].Bool();
        return new DeleteStage(txn, params, workingSet, collection, subNode);
    } else {
        return NULL;
    }
}
Status ABISysV_ppc::SetReturnValueObject(lldb::StackFrameSP &frame_sp,
                                         lldb::ValueObjectSP &new_value_sp) {
  Status error;
  if (!new_value_sp) {
    error.SetErrorString("Empty value object for return value.");
    return error;
  }

  CompilerType compiler_type = new_value_sp->GetCompilerType();
  if (!compiler_type) {
    error.SetErrorString("Null clang type for return value.");
    return error;
  }

  Thread *thread = frame_sp->GetThread().get();

  bool is_signed;
  uint32_t count;
  bool is_complex;

  RegisterContext *reg_ctx = thread->GetRegisterContext().get();

  bool set_it_simple = false;
  if (compiler_type.IsIntegerOrEnumerationType(is_signed) ||
      compiler_type.IsPointerType()) {
    const RegisterInfo *reg_info = reg_ctx->GetRegisterInfoByName("r3", 0);

    DataExtractor data;
    Status data_error;
    size_t num_bytes = new_value_sp->GetData(data, data_error);
    if (data_error.Fail()) {
      error.SetErrorStringWithFormat(
          "Couldn't convert return value to raw data: %s",
          data_error.AsCString());
      return error;
    }
    lldb::offset_t offset = 0;
    if (num_bytes <= 8) {
      uint64_t raw_value = data.GetMaxU64(&offset, num_bytes);

      if (reg_ctx->WriteRegisterFromUnsigned(reg_info, raw_value))
        set_it_simple = true;
    } else {
      error.SetErrorString("We don't support returning longer than 64 bit "
                           "integer values at present.");
    }
  } else if (compiler_type.IsFloatingPointType(count, is_complex)) {
    if (is_complex)
      error.SetErrorString(
          "We don't support returning complex values at present");
    else {
      size_t bit_width = compiler_type.GetBitSize(frame_sp.get());
      if (bit_width <= 64) {
        DataExtractor data;
        Status data_error;
        size_t num_bytes = new_value_sp->GetData(data, data_error);
        if (data_error.Fail()) {
          error.SetErrorStringWithFormat(
              "Couldn't convert return value to raw data: %s",
              data_error.AsCString());
          return error;
        }

        unsigned char buffer[16];
        ByteOrder byte_order = data.GetByteOrder();

        data.CopyByteOrderedData(0, num_bytes, buffer, 16, byte_order);
        set_it_simple = true;
      } else {
        // FIXME - don't know how to do 80 bit long doubles yet.
        error.SetErrorString(
            "We don't support returning float values > 64 bits at present");
      }
    }
  }

  if (!set_it_simple) {
    // Okay we've got a structure or something that doesn't fit in a simple
    // register.
    // We should figure out where it really goes, but we don't support this yet.
    error.SetErrorString("We only support setting simple integer and float "
                         "return types at present.");
  }

  return error;
}
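// Standalone sketch of what DataExtractor::GetMaxU64 does for the integer path above: widen
// the first num_bytes of the value's raw bytes into a uint64_t suitable for
// WriteRegisterFromUnsigned. This variant assumes a little-endian buffer; the real
// DataExtractor honors the extractor's byte order (PowerPC is typically big-endian).
#include <cstddef>
#include <cstdint>

uint64_t maxU64FromLittleEndian(const uint8_t *bytes, size_t num_bytes) {
  uint64_t value = 0;
  // Accumulate least-significant byte first; num_bytes must be <= 8.
  for (size_t i = 0; i < num_bytes; ++i)
    value |= static_cast<uint64_t>(bytes[i]) << (8 * i);
  return value;
}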
Status MMAPV1Engine::repairDatabase( OperationContext* txn,
                                     const std::string& dbName,
                                     bool preserveClonedFilesOnFailure,
                                     bool backupOriginalFiles ) {
    // We must hold some form of lock here
    invariant(txn->lockState()->threadState());
    invariant( dbName.find( '.' ) == string::npos );

    scoped_ptr<RepairFileDeleter> repairFileDeleter;
    doingRepair dr;

    log() << "repairDatabase " << dbName << endl;

    BackgroundOperation::assertNoBgOpInProgForDb(dbName);

    txn->recoveryUnit()->syncDataAndTruncateJournal(); // Must be done before and after repair

    intmax_t totalSize = dbSize( dbName );
    intmax_t freeSize = File::freeSpace(storageGlobalParams.repairpath);

    if ( freeSize > -1 && freeSize < totalSize ) {
        return Status( ErrorCodes::OutOfDiskSpace,
                       str::stream() << "Cannot repair database " << dbName
                                     << " having size: " << totalSize
                                     << " (bytes) because free disk space is: " << freeSize
                                     << " (bytes)" );
    }

    txn->checkForInterrupt();

    Path reservedPath =
        uniqueReservedPath( ( preserveClonedFilesOnFailure || backupOriginalFiles )
                            ? "backup" : "_tmp" );
    MONGO_ASSERT_ON_EXCEPTION( boost::filesystem::create_directory( reservedPath ) );
    string reservedPathString = reservedPath.string();

    if ( !preserveClonedFilesOnFailure )
        repairFileDeleter.reset( new RepairFileDeleter( txn,
                                                        dbName,
                                                        reservedPathString,
                                                        reservedPath ) );

    {
        Database* originalDatabase = dbHolder().get(txn, dbName);
        if (originalDatabase == NULL) {
            return Status(ErrorCodes::NamespaceNotFound, "database does not exist to repair");
        }

        scoped_ptr<MMAPV1DatabaseCatalogEntry> dbEntry;
        scoped_ptr<Database> tempDatabase;
        {
            dbEntry.reset( new MMAPV1DatabaseCatalogEntry( txn,
                                                           dbName,
                                                           reservedPathString,
                                                           storageGlobalParams.directoryperdb,
                                                           true ) );
            invariant( !dbEntry->exists() );
            tempDatabase.reset( new Database( txn, dbName, dbEntry.get() ) );
        }

        map<string,CollectionOptions> namespacesToCopy;
        {
            string ns = dbName + ".system.namespaces";
            Client::Context ctx(txn, ns );
            Collection* coll = originalDatabase->getCollection( txn, ns );
            if ( coll ) {
                scoped_ptr<RecordIterator> it( coll->getIterator( txn,
                                                                  DiskLoc(),
                                                                  false,
                                                                  CollectionScanParams::FORWARD ) );
                while ( !it->isEOF() ) {
                    DiskLoc loc = it->getNext();
                    BSONObj obj = coll->docFor( loc );

                    string ns = obj["name"].String();

                    NamespaceString nss( ns );
                    if ( nss.isSystem() ) {
                        if ( nss.isSystemDotIndexes() )
                            continue;
                        if ( nss.coll() == "system.namespaces" )
                            continue;
                    }

                    if ( !nss.isNormal() )
                        continue;

                    CollectionOptions options;
                    if ( obj["options"].isABSONObj() ) {
                        Status status = options.parse( obj["options"].Obj() );
                        if ( !status.isOK() )
                            return status;
                    }
                    namespacesToCopy[ns] = options;
                }
            }
        }

        for ( map<string,CollectionOptions>::const_iterator i = namespacesToCopy.begin();
              i != namespacesToCopy.end();
              ++i ) {
            string ns = i->first;
            CollectionOptions options = i->second;

            Collection* tempCollection = NULL;
            {
                Client::Context tempContext(txn, ns, tempDatabase );
                tempCollection = tempDatabase->createCollection( txn, ns, options, true, false );
            }

            Client::Context readContext(txn, ns, originalDatabase);
            Collection* originalCollection = originalDatabase->getCollection( txn, ns );
            invariant( originalCollection );

            // data

            MultiIndexBlock indexBlock(txn, tempCollection );
            {
                vector<BSONObj> indexes;
                IndexCatalog::IndexIterator ii =
                    originalCollection->getIndexCatalog()->getIndexIterator( false );
                while ( ii.more() ) {
                    IndexDescriptor* desc = ii.next();
                    indexes.push_back( desc->infoObj() );
                }

                Client::Context tempContext(txn, ns, tempDatabase);
                Status status = indexBlock.init( indexes );
                if ( !status.isOK() )
                    return status;
            }

            scoped_ptr<RecordIterator> iterator(
                originalCollection->getIterator( txn,
                                                 DiskLoc(),
                                                 false,
                                                 CollectionScanParams::FORWARD ));
            while ( !iterator->isEOF() ) {
                DiskLoc loc = iterator->getNext();
                invariant( !loc.isNull() );

                BSONObj doc = originalCollection->docFor( loc );

                Client::Context tempContext(txn, ns, tempDatabase);
                StatusWith<DiskLoc> result = tempCollection->insertDocument( txn, doc, indexBlock );
                if ( !result.isOK() )
                    return result.getStatus();

                txn->recoveryUnit()->commitIfNeeded();
                txn->checkForInterrupt(false);
            }

            {
                Client::Context tempContext(txn, ns, tempDatabase);
                Status status = indexBlock.commit();
                if ( !status.isOK() )
                    return status;
            }
        }

        txn->recoveryUnit()->syncDataAndTruncateJournal();
        globalStorageEngine->flushAllFiles(true); // need both in case journaling is disabled

        txn->checkForInterrupt(false);
    }

    // at this point if we abort, we don't want to delete new files
    // as they might be the only copies

    if ( repairFileDeleter.get() )
        repairFileDeleter->success();

    dbHolder().close( txn, dbName );

    if ( backupOriginalFiles ) {
        _renameForBackup( dbName, reservedPath );
    }
    else {
        // first make new directory before deleting data
        Path newDir = Path(storageGlobalParams.dbpath) / dbName;
        MONGO_ASSERT_ON_EXCEPTION(boost::filesystem::create_directory(newDir));

        // this deletes old files
        _deleteDataFiles( dbName );

        if ( !boost::filesystem::exists(newDir) ) {
            // we deleted because of directoryperdb
            // re-create
            MONGO_ASSERT_ON_EXCEPTION(boost::filesystem::create_directory(newDir));
        }
    }

    _replaceWithRecovered( dbName, reservedPathString.c_str() );

    if ( !backupOriginalFiles )
        MONGO_ASSERT_ON_EXCEPTION( boost::filesystem::remove_all( reservedPath ) );

    return Status::OK();
}
Status MMAPV1DatabaseCatalogEntry::renameCollection( OperationContext* txn,
                                                     const StringData& fromNS,
                                                     const StringData& toNS,
                                                     bool stayTemp ) {
    Status s = _renameSingleNamespace( txn, fromNS, toNS, stayTemp );
    if ( !s.isOK() )
        return s;

    NamespaceDetails* details = _namespaceIndex.details( toNS );
    invariant( details );

    RecordStoreV1Base* systemIndexRecordStore = _getIndexRecordStore( txn );
    scoped_ptr<RecordIterator> it( systemIndexRecordStore->getIterator() );

    while ( !it->isEOF() ) {
        DiskLoc loc = it->getNext();
        const Record* rec = it->recordFor( loc );
        BSONObj oldIndexSpec( rec->data() );
        if ( fromNS != oldIndexSpec["ns"].valuestrsafe() )
            continue;

        BSONObj newIndexSpec;
        {
            BSONObjBuilder b;
            BSONObjIterator i( oldIndexSpec );
            while( i.more() ) {
                BSONElement e = i.next();
                if ( strcmp( e.fieldName(), "ns" ) != 0 )
                    b.append( e );
                else
                    b << "ns" << toNS;
            }
            newIndexSpec = b.obj();
        }

        StatusWith<DiskLoc> newIndexSpecLoc =
            systemIndexRecordStore->insertRecord( txn,
                                                  newIndexSpec.objdata(),
                                                  newIndexSpec.objsize(),
                                                  -1 );
        if ( !newIndexSpecLoc.isOK() )
            return newIndexSpecLoc.getStatus();

        const string& indexName = oldIndexSpec.getStringField( "name" );

        {
            // fix IndexDetails pointer
            NamespaceDetailsCollectionCatalogEntry ce( toNS,
                                                       details,
                                                       _getIndexRecordStore( txn ),
                                                       this );
            int indexI = ce._findIndexNumber( indexName );

            IndexDetails& indexDetails = details->idx(indexI);
            *txn->recoveryUnit()->writing(&indexDetails.info) = newIndexSpecLoc.getValue(); // XXX: dur
        }

        {
            // move underlying namespace
            string oldIndexNs = IndexDescriptor::makeIndexNamespace( fromNS, indexName );
            string newIndexNs = IndexDescriptor::makeIndexNamespace( toNS, indexName );

            Status s = _renameSingleNamespace( txn, oldIndexNs, newIndexNs, false );
            if ( !s.isOK() )
                return s;
        }

        systemIndexRecordStore->deleteRecord( txn, loc );
    }

    return Status::OK();
}
Status WriteCmd::explain(OperationContext* txn, const std::string& dbname, const BSONObj& cmdObj, ExplainCommon::Verbosity verbosity, BSONObjBuilder* out) const { // For now we only explain update and delete write commands. if ( BatchedCommandRequest::BatchType_Update != _writeType && BatchedCommandRequest::BatchType_Delete != _writeType ) { return Status( ErrorCodes::IllegalOperation, "Only update and delete write ops can be explained" ); } // Parse the batch request. BatchedCommandRequest request( _writeType ); std::string errMsg; if ( !request.parseBSON( cmdObj, &errMsg ) || !request.isValid( &errMsg ) ) { return Status( ErrorCodes::FailedToParse, errMsg ); } // Note that this is a runCommmand, and therefore, the database and the collection name // are in different parts of the grammar for the command. But it's more convenient to // work with a NamespaceString. We built it here and replace it in the parsed command. // Internally, everything work with the namespace string as opposed to just the // collection name. NamespaceString nsString(dbname, request.getNS()); request.setNS(nsString.ns()); // Do the validation of the batch that is shared with non-explained write batches. Status isValid = WriteBatchExecutor::validateBatch( request ); if (!isValid.isOK()) { return isValid; } // Explain must do one additional piece of validation: For now we only explain // singleton batches. if ( request.sizeWriteOps() != 1u ) { return Status( ErrorCodes::InvalidLength, "explained write batches must be of size 1" ); } // Get a reference to the singleton batch item (it's the 0th item in the batch). BatchItemRef batchItem( &request, 0 ); if ( BatchedCommandRequest::BatchType_Update == _writeType ) { // Create the update request. UpdateRequest updateRequest( txn, nsString ); updateRequest.setQuery( batchItem.getUpdate()->getQuery() ); updateRequest.setUpdates( batchItem.getUpdate()->getUpdateExpr() ); updateRequest.setMulti( batchItem.getUpdate()->getMulti() ); updateRequest.setUpsert( batchItem.getUpdate()->getUpsert() ); updateRequest.setUpdateOpLog( true ); UpdateLifecycleImpl updateLifecycle( true, updateRequest.getNamespaceString() ); updateRequest.setLifecycle( &updateLifecycle ); updateRequest.setExplain(); // Explained updates can yield. updateRequest.setYieldPolicy(PlanExecutor::YIELD_AUTO); // Use the request to create an UpdateExecutor, and from it extract the // plan tree which will be used to execute this update. UpdateExecutor updateExecutor( &updateRequest, &txn->getCurOp()->debug() ); Status prepStatus = updateExecutor.prepare(); if ( !prepStatus.isOK() ) { return prepStatus; } // Explains of write commands are read-only, but we take an exclusive lock so // that timing info is more accurate. Lock::DBLock dlk(txn->lockState(), nsString.db(), MODE_IX); Client::Context ctx(txn, nsString); Status prepInLockStatus = updateExecutor.prepareInLock(ctx.db()); if ( !prepInLockStatus.isOK() ) { return prepInLockStatus; } // Executor registration and yield policy is handled internally by the update executor. PlanExecutor* exec = updateExecutor.getPlanExecutor(); // Explain the plan tree. return Explain::explainStages( exec, verbosity, out ); } else { invariant( BatchedCommandRequest::BatchType_Delete == _writeType ); // Create the delete request. 
DeleteRequest deleteRequest( txn, nsString ); deleteRequest.setQuery( batchItem.getDelete()->getQuery() ); deleteRequest.setMulti( batchItem.getDelete()->getLimit() != 1 ); deleteRequest.setUpdateOpLog(true); deleteRequest.setGod( false ); deleteRequest.setExplain(); // Explained deletes can yield. deleteRequest.setYieldPolicy(PlanExecutor::YIELD_AUTO); // Use the request to create a DeleteExecutor, and from it extract the // plan tree which will be used to execute this delete. DeleteExecutor deleteExecutor( &deleteRequest ); Status prepStatus = deleteExecutor.prepare(); if ( !prepStatus.isOK() ) { return prepStatus; } // Explains of write commands are read-only, but we take a write lock so that timing // info is more accurate. Lock::DBLock dlk(txn->lockState(), nsString.db(), MODE_IX); Client::Context ctx(txn, nsString); Status prepInLockStatus = deleteExecutor.prepareInLock(ctx.db()); if ( !prepInLockStatus.isOK()) { return prepInLockStatus; } // Executor registration and yield policy are handled internally by the delete executor. PlanExecutor* exec = deleteExecutor.getPlanExecutor(); // Explain the plan tree. return Explain::explainStages( exec, verbosity, out ); } }
DiskLoc DataFileMgr::insert(const char* ns, const void* obuf, int32_t len, bool mayInterrupt, bool god, bool mayAddIndex, bool* addedID) { Database* database = cc().database(); bool wouldAddIndex = false; massert( 10093 , "cannot insert into reserved $ collection", god || NamespaceString::normal( ns ) ); uassert( 10094 , str::stream() << "invalid ns: " << ns , isValidNS( ns ) ); { const char *sys = strstr(ns, "system."); if ( sys ) { if ( !insert_checkSys(sys, ns, wouldAddIndex, obuf, god) ) return DiskLoc(); if ( mayAddIndex && wouldAddIndex ) { // TODO: this should be handled above this function BSONObj spec( static_cast<const char*>( obuf ) ); string collectionToIndex = spec.getStringField( "ns" ); uassert(10096, "invalid ns to index", collectionToIndex.find( '.' ) != string::npos); massert(10097, str::stream() << "trying to create index on wrong db " << " db: " << database->name() << " collection: " << collectionToIndex, database->ownsNS( collectionToIndex ) ); Collection* collection = database->getCollection( collectionToIndex ); if ( !collection ) { collection = database->createCollection( collectionToIndex, false, NULL, true ); verify( collection ); if ( !god ) ensureIdIndexForNewNs( collection ); } Status status = collection->getIndexCatalog()->createIndex( spec, mayInterrupt ); if ( status.code() == ErrorCodes::IndexAlreadyExists ) return DiskLoc(); uassertStatusOK( status ); return DiskLoc(); } } } Collection* collection = database->getCollection( ns ); if ( collection == NULL ) { collection = database->createCollection( ns, false, NULL, false ); int ies = Extent::initialSize(len); if( str::contains(ns, '$') && len + Record::HeaderSize >= BtreeData_V1::BucketSize - 256 && len + Record::HeaderSize <= BtreeData_V1::BucketSize + 256 ) { // probably an index. so we pick a value here for the first extent instead of using // initialExtentSize() which is more for user collections. // TODO: we could look at the # of records in the parent collection to be smarter here. ies = (32+4) * 1024; } collection->increaseStorageSize( ies, false); if ( !god ) ensureIdIndexForNewNs( collection ); } NamespaceDetails* d = collection->details(); IDToInsert idToInsert; // only initialized if needed if( !god ) { /* Check if we have an _id field. If we don't, we'll add it. Note that btree buckets which we insert aren't BSONObj's, but in that case god==true. */ BSONObj io((const char *) obuf); BSONElement idField = io.getField( "_id" ); uassert( 10099 , "_id cannot be an array", idField.type() != Array ); // we don't add _id for capped collections in local as they don't have an _id index if( idField.eoo() && !wouldAddIndex && nsToDatabase( ns ) != "local" && d->haveIdIndex() ) { if( addedID ) *addedID = true; idToInsert.init(); len += idToInsert.size(); } BSONElementManipulator::lookForTimestamps( io ); } int lenWHdr = d->getRecordAllocationSize( len + Record::HeaderSize ); fassert( 16440, lenWHdr >= ( len + Record::HeaderSize ) ); // If the collection is capped, check if the new object will violate a unique index // constraint before allocating space. 
if ( d->isCapped() && !god) { BSONObj temp = BSONObj( reinterpret_cast<const char *>( obuf ) ); Status ret = collection->getIndexCatalog()->checkNoIndexConflicts( temp ); uassert(12582, "duplicate key insert for unique index of capped collection", ret.isOK() ); } DiskLoc loc = allocateSpaceForANewRecord(ns, d, lenWHdr, god); if ( loc.isNull() ) { string errmsg = str::stream() << "insert: couldn't alloc space for object ns:" << ns << " capped:" << d->isCapped(); log() << errmsg; uasserted( 17248, errmsg ); } Record *r = loc.rec(); { verify( r->lengthWithHeaders() >= lenWHdr ); r = (Record*) getDur().writingPtr(r, lenWHdr); if( idToInsert.needed() ) { /* a little effort was made here to avoid a double copy when we add an ID */ int originalSize = *((int*) obuf); ((int&)*r->data()) = originalSize + idToInsert.size(); memcpy(r->data()+4, idToInsert.rawdata(), idToInsert.size()); memcpy(r->data()+4+idToInsert.size(), ((char*)obuf)+4, originalSize-4); } else { if( obuf ) // obuf can be null from internal callers memcpy(r->data(), obuf, len); } } addRecordToRecListInExtent(r, loc); d->incrementStats( r->netLength(), 1 ); // we don't bother resetting query optimizer stats for the god tables - also god is true when adding a btree bucket if ( !god ) collection->infoCache()->notifyOfWriteOp(); /* add this record to our indexes */ if ( d->getTotalIndexCount() > 0 ) { try { BSONObj obj(r->data()); collection->getIndexCatalog()->indexRecord(obj, loc); } catch( AssertionException& e ) { // should be a dup key error on _id index if( d->isCapped() ) { massert( 12583, "unexpected index insertion failure on capped collection", !d->isCapped() ); string s = e.toString(); s += " : on addIndex/capped - collection and its index will not match"; setLastError(0, s.c_str()); error() << s << endl; } else { // normal case -- we can roll back _deleteRecord(d, ns, r, loc); throw; } } } d->paddingFits(); return loc; }
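// Self-contained sketch (an assumption-level illustration, not MongoDB internals) of
// the splice performed at the memcpy site above: a BSON object starts with a
// little-endian int32 total length, so an extra field can be inserted right after
// that prefix with two copies and a length patch, avoiding a full rebuild of the
// object. Assumes a little-endian host for brevity.
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

std::vector<char> spliceFieldAfterHeader(const char* obj, const char* field, int fieldLen) {
    int32_t originalSize;
    std::memcpy(&originalSize, obj, 4);  // read the length prefix
    std::vector<char> out(originalSize + fieldLen);
    int32_t newSize = originalSize + fieldLen;
    std::memcpy(out.data(), &newSize, 4);                               // patched length prefix
    std::memcpy(out.data() + 4, field, fieldLen);                       // spliced-in field bytes
    std::memcpy(out.data() + 4 + fieldLen, obj + 4, originalSize - 4);  // rest of the object
    return out;
}

int main() {
    // Fake 8-byte "object": a 4-byte length (8) followed by 4 payload bytes.
    char obj[8] = {8, 0, 0, 0, 'a', 'b', 'c', 'd'};
    std::vector<char> out = spliceFieldAfterHeader(obj, "ID", 2);
    std::cout << "new size: " << out.size() << "\n";  // prints 10
    return 0;
}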
void CursorCache::gotKillCursors(Message& m) { LastError::get(cc()).disable(); DbMessage dbmessage(m); int n = dbmessage.pullInt(); if (n > 2000) { (n < 30000 ? warning() : error()) << "receivedKillCursors, n=" << n << endl; } uassert(13286, "sent 0 cursors to kill", n >= 1); uassert(13287, "too many cursors to kill", n < 30000); massert(18632, str::stream() << "bad kill cursors size: " << m.dataSize(), m.dataSize() == 8 + (8 * n)); ConstDataCursor cursors(dbmessage.getArray(n)); ClientBasic* client = ClientBasic::getCurrent(); AuthorizationSession* authSession = AuthorizationSession::get(client); for (int i = 0; i < n; i++) { long long id = cursors.readAndAdvance<LittleEndian<int64_t>>(); LOG(_myLogLevel) << "CursorCache::gotKillCursors id: " << id << endl; if (!id) { warning() << " got cursor id of 0 to kill" << endl; continue; } string server; { stdx::lock_guard<stdx::mutex> lk(_mutex); MapSharded::iterator i = _cursors.find(id); if (i != _cursors.end()) { Status authorizationStatus = authSession->checkAuthForKillCursors(NamespaceString(i->second->getNS()), id); audit::logKillCursorsAuthzCheck( client, NamespaceString(i->second->getNS()), id, authorizationStatus.isOK() ? ErrorCodes::OK : ErrorCodes::Unauthorized); if (authorizationStatus.isOK()) { _cursorsMaxTimeMS.erase(i->second->getId()); _cursors.erase(i); } continue; } MapNormal::iterator refsIt = _refs.find(id); MapNormal::iterator refsNSIt = _refsNS.find(id); if (refsIt == _refs.end()) { warning() << "can't find cursor: " << id << endl; continue; } verify(refsNSIt != _refsNS.end()); Status authorizationStatus = authSession->checkAuthForKillCursors(NamespaceString(refsNSIt->second), id); audit::logKillCursorsAuthzCheck(client, NamespaceString(refsNSIt->second), id, authorizationStatus.isOK() ? ErrorCodes::OK : ErrorCodes::Unauthorized); if (!authorizationStatus.isOK()) { continue; } server = refsIt->second; _refs.erase(refsIt); _refsNS.erase(refsNSIt); cursorStatsSingleTarget.decrement(); } LOG(_myLogLevel) << "CursorCache::found gotKillCursors id: " << id << " server: " << server << endl; verify(server.size()); ScopedDbConnection conn(server); conn->killCursor(id); conn.done(); } }
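// Hedged sketch of the killCursors body layout implied by the size check above
// (dataSize == 8 + 8 * n): 4 reserved bytes, a 4-byte little-endian count, then n
// 8-byte little-endian cursor ids. This is an illustration, not the wire-protocol
// parser itself, and it assumes a little-endian host.
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

std::vector<int64_t> parseKillCursorsBody(const char* data, size_t len) {
    if (len < 8)
        return {};
    int32_t n;
    std::memcpy(&n, data + 4, 4);  // the count follows 4 reserved bytes
    if (n < 1 || len != 8 + 8 * static_cast<size_t>(n))
        return {};                 // enforce the same size invariant as above
    std::vector<int64_t> ids(n);
    std::memcpy(ids.data(), data + 8, 8 * static_cast<size_t>(n));
    return ids;
}

int main() {
    char body[16] = {};
    int32_t n = 1;
    std::memcpy(body + 4, &n, 4);
    int64_t id = 42;
    std::memcpy(body + 8, &id, 8);
    for (int64_t cursorId : parseKillCursorsBody(body, sizeof(body)))
        std::cout << "kill cursor " << cursorId << "\n";
    return 0;
}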
Status ChunkMoveOperationState::commitMigration(const MigrationSessionId& sessionId) { invariant(_distLockStatus.is_initialized()); invariant(_distLockStatus->isOK()); log() << "About to enter migrate critical section"; // We're under the collection distributed lock here, so no other migrate can change maxVersion // or CollectionMetadata state. ShardingState* const shardingState = ShardingState::get(_txn); Status startStatus = ShardingStateRecovery::startMetadataOp(_txn); if (!startStatus.isOK()) { warning() << "Failed to write sharding state recovery document" << causedBy(startStatus); return startStatus; } shardingState->migrationSourceManager()->setInCriticalSection(true); const ChunkVersion originalCollVersion = getCollMetadata()->getCollVersion(); ChunkVersion myVersion = originalCollVersion; myVersion.incMajor(); { ScopedTransaction transaction(_txn, MODE_IX); Lock::DBLock lk(_txn->lockState(), _nss.db(), MODE_IX); Lock::CollectionLock collLock(_txn->lockState(), _nss.ns(), MODE_X); invariant(myVersion > shardingState->getVersion(_nss.ns())); // Bump the metadata's version up and "forget" about the chunk being moved. This is // not the commit point, but in practice the state in this shard won't change until // the commit is done. shardingState->donateChunk(_txn, _nss.ns(), _minKey, _maxKey, myVersion); } log() << "moveChunk setting version to: " << myVersion << migrateLog; // We're under the collection lock here, too, so we can undo the chunk donation because // no other state change could be ongoing. BSONObj res; Status recvChunkCommitStatus{ErrorCodes::InternalError, "status not set"}; try { ScopedDbConnection connTo(_toShardCS, 35.0); connTo->runCommand("admin", createRecvChunkCommitRequest(sessionId), res); connTo.done(); recvChunkCommitStatus = getStatusFromCommandResult(res); } catch (const DBException& e) { const string msg = stream() << "moveChunk could not contact TO-shard " << _toShard << " to commit transfer" << causedBy(e); warning() << msg; recvChunkCommitStatus = Status(e.toStatus().code(), msg); } if (MONGO_FAIL_POINT(failMigrationCommit) && recvChunkCommitStatus.isOK()) { recvChunkCommitStatus = Status(ErrorCodes::InternalError, "Failing _recvChunkCommit due to failpoint."); } if (!recvChunkCommitStatus.isOK()) { log() << "moveChunk migrate commit not accepted by TO-shard: " << res << " resetting shard version to: " << getShardVersion() << migrateLog; { ScopedTransaction transaction(_txn, MODE_IX); Lock::DBLock dbLock(_txn->lockState(), _nss.db(), MODE_IX); Lock::CollectionLock collLock(_txn->lockState(), _nss.ns(), MODE_X); log() << "moveChunk collection lock acquired to reset shard version from " "failed migration"; // Revert the chunk manager back to the state before "forgetting" about the chunk. shardingState->undoDonateChunk(_txn, _nss.ns(), getCollMetadata()); } log() << "Shard version successfully reset to clean up failed migration"; return Status(recvChunkCommitStatus.code(), stream() << "_recvChunkCommit failed: " << causedBy(recvChunkCommitStatus)); } log() << "moveChunk migrate commit accepted by TO-shard: " << res << migrateLog; BSONArrayBuilder updates; { // Update for the chunk being moved BSONObjBuilder op; op.append("op", "u"); op.appendBool("b", false); // No upserting op.append("ns", ChunkType::ConfigNS); BSONObjBuilder n(op.subobjStart("o")); n.append(ChunkType::name(), Chunk::genID(_nss.ns(), _minKey)); myVersion.addToBSON(n, ChunkType::DEPRECATED_lastmod()); n.append(ChunkType::ns(), _nss.ns()); n.append(ChunkType::min(), _minKey); 
n.append(ChunkType::max(), _maxKey); n.append(ChunkType::shard(), _toShard); n.done(); BSONObjBuilder q(op.subobjStart("o2")); q.append(ChunkType::name(), Chunk::genID(_nss.ns(), _minKey)); q.done(); updates.append(op.obj()); } // Version at which the next highest lastmod will be set. If the chunk being moved is the last // in the shard, nextVersion is that chunk's lastmod; otherwise, the highest version comes from // the chunk being bumped on the FROM-shard. ChunkVersion nextVersion = myVersion; // If we have chunks left on the FROM shard, update the version of one of them as well. We can // figure that out by grabbing the metadata as it has been changed. const std::shared_ptr<CollectionMetadata> bumpedCollMetadata( shardingState->getCollectionMetadata(_nss.ns())); if (bumpedCollMetadata->getNumChunks() > 0) { // get another chunk on that shard ChunkType bumpChunk; invariant(bumpedCollMetadata->getNextChunk(bumpedCollMetadata->getMinKey(), &bumpChunk)); BSONObj bumpMin = bumpChunk.getMin(); BSONObj bumpMax = bumpChunk.getMax(); dassert(bumpMin.woCompare(_minKey) != 0); BSONObjBuilder op; op.append("op", "u"); op.appendBool("b", false); op.append("ns", ChunkType::ConfigNS); nextVersion.incMinor(); // same as used on donateChunk BSONObjBuilder n(op.subobjStart("o")); n.append(ChunkType::name(), Chunk::genID(_nss.ns(), bumpMin)); nextVersion.addToBSON(n, ChunkType::DEPRECATED_lastmod()); n.append(ChunkType::ns(), _nss.ns()); n.append(ChunkType::min(), bumpMin); n.append(ChunkType::max(), bumpMax); n.append(ChunkType::shard(), _fromShard); n.done(); BSONObjBuilder q(op.subobjStart("o2")); q.append(ChunkType::name(), Chunk::genID(_nss.ns(), bumpMin)); q.done(); updates.append(op.obj()); log() << "moveChunk updating self version to: " << nextVersion << " through " << bumpMin << " -> " << bumpMax << " for collection '" << _nss.ns() << "'" << migrateLog; } else { log() << "moveChunk moved last chunk out for collection '" << _nss.ns() << "'" << migrateLog; } BSONArrayBuilder preCond; { BSONObjBuilder b; b.append("ns", ChunkType::ConfigNS); b.append("q", BSON("query" << BSON(ChunkType::ns(_nss.ns())) << "orderby" << BSON(ChunkType::DEPRECATED_lastmod() << -1))); { BSONObjBuilder bb(b.subobjStart("res")); // TODO: For backwards compatibility, we can't yet require an epoch here bb.appendTimestamp(ChunkType::DEPRECATED_lastmod(), originalCollVersion.toLong()); bb.done(); } preCond.append(b.obj()); } Status applyOpsStatus{Status::OK()}; try { // For testing migration failures if (MONGO_FAIL_POINT(failMigrationConfigWritePrepare)) { throw DBException("mock migration failure before config write", ErrorCodes::PrepareConfigsFailed); } applyOpsStatus = grid.catalogManager(_txn)->applyChunkOpsDeprecated(_txn, updates.arr(), preCond.arr()); if (MONGO_FAIL_POINT(failMigrationApplyOps)) { throw SocketException(SocketException::RECV_ERROR, shardingState->getConfigServer(_txn).toString()); } } catch (const DBException& ex) { applyOpsStatus = ex.toStatus(); } if (applyOpsStatus == ErrorCodes::PrepareConfigsFailed) { // In the process of issuing the migrate commit, the SyncClusterConnection checks that // the config servers are reachable. If they are not, we are sure that the applyOps // command was not sent to any of the configs, so we can safely back out of the // migration here, by resetting the shard version that we bumped up to in the // donateChunk() call above. 
log() << "About to acquire moveChunk coll lock to reset shard version from " << "failed migration"; { ScopedTransaction transaction(_txn, MODE_IX); Lock::DBLock dbLock(_txn->lockState(), _nss.db(), MODE_IX); Lock::CollectionLock collLock(_txn->lockState(), _nss.ns(), MODE_X); // Revert the metadata back to the state before "forgetting" about the chunk shardingState->undoDonateChunk(_txn, _nss.ns(), getCollMetadata()); } log() << "Shard version successfully reset to clean up failed migration"; const string msg = stream() << "Failed to send migrate commit to configs " << causedBy(applyOpsStatus); return Status(applyOpsStatus.code(), msg); } else if (!applyOpsStatus.isOK()) { // This could be a blip in the connectivity. Wait out a few seconds and check if the // commit request made it. // // If the commit made it to the config, we'll see the chunk in the new shard and // there's no further action to be done. // // If the commit did not make it, currently the only way to fix this state is to // bounce the mongod so that the old state (before migrating) is brought in. warning() << "moveChunk commit failed and metadata will be revalidated" << causedBy(applyOpsStatus) << migrateLog; sleepsecs(10); // Look for the chunk in this shard whose version got bumped. We assume that if that // mod made it to the config server, then applyOps was successful. try { std::vector<ChunkType> newestChunk; Status status = grid.catalogManager(_txn)->getChunks(_txn, BSON(ChunkType::ns(_nss.ns())), BSON(ChunkType::DEPRECATED_lastmod() << -1), 1, &newestChunk, nullptr); uassertStatusOK(status); ChunkVersion checkVersion; if (!newestChunk.empty()) { invariant(newestChunk.size() == 1); checkVersion = newestChunk[0].getVersion(); } if (checkVersion.equals(nextVersion)) { log() << "moveChunk commit confirmed" << migrateLog; } else { error() << "moveChunk commit failed: version is at " << checkVersion << " instead of " << nextVersion << migrateLog; error() << "TERMINATING" << migrateLog; dbexit(EXIT_SHARDING_ERROR); } } catch (...) { error() << "moveChunk failed to get confirmation of commit" << migrateLog; error() << "TERMINATING" << migrateLog; dbexit(EXIT_SHARDING_ERROR); } } MONGO_FAIL_POINT_PAUSE_WHILE_SET(hangBeforeLeavingCriticalSection); shardingState->migrationSourceManager()->setInCriticalSection(false); ShardingStateRecovery::endMetadataOp(_txn); // Migration is done, just log some diagnostics information BSONObj chunkInfo = BSON("min" << _minKey << "max" << _maxKey << "from" << _fromShard << "to" << _toShard); BSONObjBuilder commitInfo; commitInfo.appendElements(chunkInfo); if (res["counts"].type() == Object) { commitInfo.appendElements(res["counts"].Obj()); } grid.catalogManager(_txn)->logChange(_txn, "moveChunk.commit", _nss.ns(), commitInfo.obj()); shardingState->migrationSourceManager()->done(_txn); _isRunning = false; return Status::OK(); }
void BackgroundSync::_produce( OperationContext* txn, ReplicationCoordinatorExternalState* replicationCoordinatorExternalState) { // this oplog reader does not do a handshake because we don't want the server it's syncing // from to track how far it has synced { stdx::unique_lock<stdx::mutex> lock(_mutex); if (_lastOpTimeFetched.isNull()) { // then we're initial syncing and we're still waiting for this to be set lock.unlock(); sleepsecs(1); // if there is no one to sync from return; } if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary() || inShutdownStrict()) { return; } } while (MONGO_FAIL_POINT(rsBgSyncProduce)) { sleepmillis(0); } // find a target to sync from the last optime fetched OpTime lastOpTimeFetched; HostAndPort source; { stdx::unique_lock<stdx::mutex> lock(_mutex); lastOpTimeFetched = _lastOpTimeFetched; _syncSourceHost = HostAndPort(); } SyncSourceResolverResponse syncSourceResp = _syncSourceResolver.findSyncSource(txn, lastOpTimeFetched); if (syncSourceResp.syncSourceStatus == ErrorCodes::OplogStartMissing) { // All (accessible) sync sources were too stale. error() << "too stale to catch up -- entering maintenance mode"; log() << "Our newest OpTime : " << lastOpTimeFetched; log() << "Earliest OpTime available is " << syncSourceResp.earliestOpTimeSeen; log() << "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember"; StorageInterface::get(txn) ->setMinValid(txn, {lastOpTimeFetched, syncSourceResp.earliestOpTimeSeen}); auto status = _replCoord->setMaintenanceMode(true); if (!status.isOK()) { warning() << "Failed to transition into maintenance mode."; } bool worked = _replCoord->setFollowerMode(MemberState::RS_RECOVERING); if (!worked) { warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING) << ". Current state: " << _replCoord->getMemberState(); } return; } else if (syncSourceResp.isOK() && !syncSourceResp.getSyncSource().empty()) { stdx::lock_guard<stdx::mutex> lock(_mutex); _syncSourceHost = syncSourceResp.getSyncSource(); source = _syncSourceHost; } else { if (!syncSourceResp.isOK()) { log() << "failed to find sync source, received error " << syncSourceResp.syncSourceStatus.getStatus(); } // No sync source found. sleepsecs(1); return; } long long lastHashFetched; { stdx::lock_guard<stdx::mutex> lock(_mutex); if (_stopped) { return; } lastOpTimeFetched = _lastOpTimeFetched; lastHashFetched = _lastFetchedHash; _replCoord->signalUpstreamUpdater(); } // "lastFetched" not used. Already set in _enqueueDocuments. 
Status fetcherReturnStatus = Status::OK(); DataReplicatorExternalStateBackgroundSync dataReplicatorExternalState( _replCoord, replicationCoordinatorExternalState, this); OplogFetcher* oplogFetcher; try { auto config = _replCoord->getConfig(); auto onOplogFetcherShutdownCallbackFn = [&fetcherReturnStatus](const Status& status, const OpTimeWithHash& lastFetched) { fetcherReturnStatus = status; }; stdx::lock_guard<stdx::mutex> lock(_mutex); _oplogFetcher = stdx::make_unique<OplogFetcher>(&_threadPoolTaskExecutor, OpTimeWithHash(lastHashFetched, lastOpTimeFetched), source, NamespaceString(rsOplogName), config, &dataReplicatorExternalState, stdx::bind(&BackgroundSync::_enqueueDocuments, this, stdx::placeholders::_1, stdx::placeholders::_2, stdx::placeholders::_3, stdx::placeholders::_4), onOplogFetcherShutdownCallbackFn); oplogFetcher = _oplogFetcher.get(); } catch (const mongo::DBException& ex) { fassertFailedWithStatus(34440, exceptionToStatus()); } LOG(1) << "scheduling fetcher to read remote oplog on " << _syncSourceHost << " starting at " << oplogFetcher->getCommandObject_forTest()["filter"]; auto scheduleStatus = oplogFetcher->startup(); if (!scheduleStatus.isOK()) { warning() << "unable to schedule fetcher to read remote oplog on " << source << ": " << scheduleStatus; return; } oplogFetcher->join(); LOG(1) << "fetcher stopped reading remote oplog on " << source; // If the background sync is stopped after the fetcher is started, we need to // re-evaluate our sync source and oplog common point. if (isStopped()) { return; } if (fetcherReturnStatus.code() == ErrorCodes::OplogOutOfOrder) { // This is bad because it means that our source // has not returned oplog entries in ascending ts order, and they need to be. warning() << fetcherReturnStatus.toString(); // Do not blacklist the server here, it will be blacklisted when we try to reuse it, // if it can't return a matching oplog start from the last fetch oplog ts field. return; } else if (fetcherReturnStatus.code() == ErrorCodes::OplogStartMissing || fetcherReturnStatus.code() == ErrorCodes::RemoteOplogStale) { // Rollback is a synchronous operation that uses the task executor and may not be // executed inside the fetcher callback. const int messagingPortTags = 0; ConnectionPool connectionPool(messagingPortTags); std::unique_ptr<ConnectionPool::ConnectionPtr> connection; auto getConnection = [&connection, &connectionPool, source]() -> DBClientBase* { if (!connection.get()) { connection.reset(new ConnectionPool::ConnectionPtr( &connectionPool, source, Date_t::now(), kOplogSocketTimeout)); }; return connection->get(); }; { stdx::lock_guard<stdx::mutex> lock(_mutex); lastOpTimeFetched = _lastOpTimeFetched; } log() << "Starting rollback due to " << fetcherReturnStatus; // Wait till all buffered oplog entries have drained and been applied. 
auto lastApplied = _replCoord->getMyLastAppliedOpTime(); if (lastApplied != lastOpTimeFetched) { log() << "Waiting for all operations from " << lastApplied << " until " << lastOpTimeFetched << " to be applied before starting rollback."; while (lastOpTimeFetched > (lastApplied = _replCoord->getMyLastAppliedOpTime())) { sleepmillis(10); if (isStopped() || inShutdown()) { return; } } } // check that we are at minvalid, otherwise we cannot roll back as we may be in an // inconsistent state BatchBoundaries boundaries = StorageInterface::get(txn)->getMinValid(txn); if (!boundaries.start.isNull() || boundaries.end > lastApplied) { fassertNoTrace(18750, Status(ErrorCodes::UnrecoverableRollbackError, str::stream() << "need to rollback, but in inconsistent state. " << "minvalid: " << boundaries.end.toString() << " > our last optime: " << lastApplied.toString())); } _rollback(txn, source, getConnection); stop(); } else if (fetcherReturnStatus == ErrorCodes::InvalidBSON) { Seconds blacklistDuration(60); warning() << "Fetcher got invalid BSON while querying oplog. Blacklisting sync source " << source << " for " << blacklistDuration << "."; _replCoord->blacklistSyncSource(source, Date_t::now() + blacklistDuration); } else if (!fetcherReturnStatus.isOK()) { warning() << "Fetcher error querying oplog: " << fetcherReturnStatus.toString(); } }
bool ShardingCatalogClientImpl::runUserManagementWriteCommand(OperationContext* opCtx, const std::string& commandName, const std::string& dbname, const BSONObj& cmdObj, BSONObjBuilder* result) { BSONObj cmdToRun = cmdObj; { // Make sure that if the command has a write concern, it is w:1 or w:majority, and // convert w:1 or no write concern to w:majority before sending. WriteConcernOptions writeConcern; writeConcern.reset(); BSONElement writeConcernElement = cmdObj[WriteConcernOptions::kWriteConcernField]; bool initialCmdHadWriteConcern = !writeConcernElement.eoo(); if (initialCmdHadWriteConcern) { Status status = writeConcern.parse(writeConcernElement.Obj()); if (!status.isOK()) { return CommandHelpers::appendCommandStatusNoThrow(*result, status); } if (!(writeConcern.wNumNodes == 1 || writeConcern.wMode == WriteConcernOptions::kMajority)) { return CommandHelpers::appendCommandStatusNoThrow( *result, {ErrorCodes::InvalidOptions, str::stream() << "Invalid replication write concern. User management write " "commands may only use w:1 or w:'majority', got: " << writeConcern.toBSON()}); } } writeConcern.wMode = WriteConcernOptions::kMajority; writeConcern.wNumNodes = 0; BSONObjBuilder modifiedCmd; if (!initialCmdHadWriteConcern) { modifiedCmd.appendElements(cmdObj); } else { BSONObjIterator cmdObjIter(cmdObj); while (cmdObjIter.more()) { BSONElement e = cmdObjIter.next(); if (WriteConcernOptions::kWriteConcernField == e.fieldName()) { continue; } modifiedCmd.append(e); } } modifiedCmd.append(WriteConcernOptions::kWriteConcernField, writeConcern.toBSON()); cmdToRun = modifiedCmd.obj(); } auto response = Grid::get(opCtx)->shardRegistry()->getConfigShard()->runCommandWithFixedRetryAttempts( opCtx, ReadPreferenceSetting{ReadPreference::PrimaryOnly}, dbname, cmdToRun, Shard::kDefaultConfigCommandTimeout, Shard::RetryPolicy::kNotIdempotent); if (!response.isOK()) { return CommandHelpers::appendCommandStatusNoThrow(*result, response.getStatus()); } if (!response.getValue().commandStatus.isOK()) { return CommandHelpers::appendCommandStatusNoThrow(*result, response.getValue().commandStatus); } if (!response.getValue().writeConcernStatus.isOK()) { return CommandHelpers::appendCommandStatusNoThrow(*result, response.getValue().writeConcernStatus); } CommandHelpers::filterCommandReplyForPassthrough(response.getValue().response, result); return true; }
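// Minimal sketch of the write-concern upgrade rule enforced above, with a
// hypothetical WC struct standing in for WriteConcernOptions (illustration only):
// accept w:1 or w:"majority" from the client, reject anything else, and always
// forward w:"majority" to the config servers.
#include <iostream>
#include <optional>
#include <string>

struct WC {
    int wNumNodes = 0;
    std::string wMode;
};

std::optional<WC> upgradeUserManagementWC(const std::optional<WC>& requested) {
    if (requested) {
        bool ok = requested->wNumNodes == 1 || requested->wMode == "majority";
        if (!ok)
            return std::nullopt;  // reject: only w:1 or w:'majority' is allowed
    }
    return WC{0, "majority"};     // always send w:'majority' upstream
}

int main() {
    auto upgraded = upgradeUserManagementWC(WC{1, ""});
    std::cout << (upgraded ? upgraded->wMode : "rejected") << "\n";  // prints: majority
    auto rejected = upgradeUserManagementWC(WC{3, ""});
    std::cout << (rejected ? rejected->wMode : "rejected") << "\n";  // prints: rejected
    return 0;
}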
Status enforceLegacyWriteConcern(MultiCommandDispatch* dispatcher, StringData dbName, const BSONObj& options, const HostOpTimeMap& hostOpTimes, vector<LegacyWCResponse>* legacyWCResponses) { if (hostOpTimes.empty()) { return Status::OK(); } for (HostOpTimeMap::const_iterator it = hostOpTimes.begin(); it != hostOpTimes.end(); ++it) { const ConnectionString& shardEndpoint = it->first; const HostOpTime hot = it->second; const Timestamp& opTime = hot.opTime; const OID& electionId = hot.electionId; LOG(3) << "enforcing write concern " << options << " on " << shardEndpoint.toString() << " at opTime " << opTime.toStringPretty() << " with electionID " << electionId; BSONObj gleCmd = buildGLECmdWithOpTime(options, opTime, electionId); dispatcher->addCommand(shardEndpoint, dbName, gleCmd); } dispatcher->sendAll(); vector<Status> failedStatuses; while (dispatcher->numPending() > 0) { ConnectionString shardEndpoint; RawBSONSerializable gleResponseSerial; Status dispatchStatus = dispatcher->recvAny(&shardEndpoint, &gleResponseSerial); if (!dispatchStatus.isOK()) { // We need to get all responses before returning failedStatuses.push_back(dispatchStatus); continue; } BSONObj gleResponse = stripNonWCInfo(gleResponseSerial.toBSON()); // Use the downconversion tools to determine if this GLE response is ok, a // write concern error, or an unknown error we should immediately abort for. GLEErrors errors; Status extractStatus = extractGLEErrors(gleResponse, &errors); if (!extractStatus.isOK()) { failedStatuses.push_back(extractStatus); continue; } LegacyWCResponse wcResponse; wcResponse.shardHost = shardEndpoint.toString(); wcResponse.gleResponse = gleResponse; if (errors.wcError.get()) { wcResponse.errToReport = errors.wcError->getErrMessage(); } legacyWCResponses->push_back(wcResponse); } if (failedStatuses.empty()) { return Status::OK(); } StringBuilder builder; builder << "could not enforce write concern"; for (vector<Status>::const_iterator it = failedStatuses.begin(); it != failedStatuses.end(); ++it) { const Status& failedStatus = *it; if (it == failedStatuses.begin()) { builder << causedBy(failedStatus.toString()); } else { builder << ":: and ::" << failedStatus.toString(); } } return Status(failedStatuses.size() == 1u ? failedStatuses.front().code() : ErrorCodes::MultipleErrorsOccurred, builder.str()); }
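// Self-contained sketch of the error-aggregation convention used above: the first
// failure is reported as the cause, later ones are chained with ":: and ::". The
// causedBy() rendering here is a stand-in, not the real helper.
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

std::string aggregateFailures(const std::vector<std::string>& failures) {
    std::ostringstream builder;
    builder << "could not enforce write concern";
    for (size_t i = 0; i < failures.size(); ++i) {
        if (i == 0)
            builder << " :: caused by :: " << failures[i];
        else
            builder << ":: and ::" << failures[i];
    }
    return builder.str();
}

int main() {
    std::cout << aggregateFailures({"host A timed out", "host B refused connection"}) << "\n";
    return 0;
}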
Status ShardingCatalogClientImpl::insertConfigDocument(OperationContext* opCtx, const NamespaceString& nss, const BSONObj& doc, const WriteConcernOptions& writeConcern) { invariant(nss.db() == NamespaceString::kAdminDb || nss.db() == NamespaceString::kConfigDb); const BSONElement idField = doc.getField("_id"); invariant(!idField.eoo()); BatchedCommandRequest request([&] { write_ops::Insert insertOp(nss); insertOp.setDocuments({doc}); return insertOp; }()); request.setWriteConcern(writeConcern.toBSON()); auto configShard = Grid::get(opCtx)->shardRegistry()->getConfigShard(); for (int retry = 1; retry <= kMaxWriteRetry; retry++) { auto response = configShard->runBatchWriteCommand( opCtx, Shard::kDefaultConfigCommandTimeout, request, Shard::RetryPolicy::kNoRetry); Status status = response.toStatus(); if (retry < kMaxWriteRetry && configShard->isRetriableError(status.code(), Shard::RetryPolicy::kIdempotent)) { // Pretend like the operation is idempotent because we're handling DuplicateKey errors // specially continue; } // If we get DuplicateKey error on the first attempt to insert, this definitively means that // we are trying to insert the same entry a second time, so error out. If it happens on a // retry attempt though, it is not clear whether we are actually inserting a duplicate key // or it is because we failed to wait for write concern on the first attempt. In order to // differentiate, fetch the entry and check. if (retry > 1 && status == ErrorCodes::DuplicateKey) { LOG(1) << "Insert retry failed because of duplicate key error, rechecking."; auto fetchDuplicate = _exhaustiveFindOnConfig(opCtx, ReadPreferenceSetting{ReadPreference::PrimaryOnly}, repl::ReadConcernLevel::kMajorityReadConcern, nss, idField.wrap(), BSONObj(), boost::none); if (!fetchDuplicate.isOK()) { return fetchDuplicate.getStatus(); } auto existingDocs = fetchDuplicate.getValue().value; if (existingDocs.empty()) { return {status.withContext( stream() << "DuplicateKey error was returned after a retry attempt, but no " "documents were found. This means a concurrent change occurred " "together with the retries.")}; } invariant(existingDocs.size() == 1); BSONObj existing = std::move(existingDocs.front()); if (existing.woCompare(doc) == 0) { // Documents match, so treat the operation as success return Status::OK(); } } return status; } MONGO_UNREACHABLE; }
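// Hedged sketch of the retry/disambiguation pattern above, with toy stand-ins
// (tryInsert/fetchById and the in-memory map are hypothetical, not MongoDB calls):
// a DuplicateKey on the very first attempt is a real conflict, but after a retry it
// may only mean the previous attempt already committed, so fetch and compare.
#include <iostream>
#include <map>
#include <string>

enum class Err { OK, Transient, DuplicateKey };

std::map<int, std::string> store;  // toy "collection"
int failuresToSimulate = 1;        // first write lands, but its acknowledgment is "lost"

Err tryInsert(int id, const std::string& doc) {
    if (store.count(id))
        return Err::DuplicateKey;
    store[id] = doc;  // the write itself succeeds...
    if (failuresToSimulate > 0) {
        --failuresToSimulate;
        return Err::Transient;  // ...but the caller sees a transient failure
    }
    return Err::OK;
}

const std::string* fetchById(int id) {
    auto it = store.find(id);
    return it == store.end() ? nullptr : &it->second;
}

Err insertWithRetry(int id, const std::string& doc, int maxRetry) {
    for (int retry = 1; retry <= maxRetry; ++retry) {
        Err status = tryInsert(id, doc);
        if (retry < maxRetry && status == Err::Transient)
            continue;  // retry transient failures
        if (retry > 1 && status == Err::DuplicateKey) {
            const std::string* existing = fetchById(id);
            if (existing && *existing == doc)
                return Err::OK;  // our earlier attempt actually committed
        }
        return status;
    }
    return Err::Transient;  // not reached: the loop always returns
}

int main() {
    std::cout << (insertWithRetry(7, "doc", 3) == Err::OK ? "ok" : "error") << "\n";  // ok
    return 0;
}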
int main(int argc, char** argv) { string impl = "printer"; const char* script = NULL; for (int32_t i = 1; i < argc; i++) { if (string(argv[i]) == "-j") { impl = "jit"; } else { script = argv[i]; } } Translator* translator = Translator::create(impl); if (translator == 0) { cout << "TODO: Implement translator factory in translator.cpp!!!!" << endl; return 1; } const char* expr = "double x; double y;" "x += 8.0; y = 2.0;" "print('Hello, x=',x,' y=',y,'\n');" ; bool isDefaultExpr = true; if (script != NULL) { expr = loadFile(script); if (expr == 0) { printf("Cannot read file: %s\n", script); return 1; } isDefaultExpr = false; } Code* code = 0; Status* translateStatus = translator->translate(expr, &code); if (translateStatus->isError()) { uint32_t position = translateStatus->getPosition(); uint32_t line = 0, offset = 0; positionToLineOffset(expr, position, line, offset); fprintf(stderr, "Cannot translate expression: expression at %d,%d; " "error '%s'\n", line, offset, translateStatus->getError().c_str()); } else { assert(code != 0); vector<Var*> vars; if (isDefaultExpr) { Var* xVar = new Var(VT_DOUBLE, "x"); Var* yVar = new Var(VT_DOUBLE, "y"); vars.push_back(xVar); vars.push_back(yVar); xVar->setDoubleValue(42.0); } Status* execStatus = code->execute(vars); if (execStatus->isError()) { fprintf(stderr, "Cannot execute expression: error: %s\n", execStatus->getError().c_str()); } else { if (isDefaultExpr) { printf("x evaluated to %f\n", vars[0]->getDoubleValue()); for (uint32_t i = 0; i < vars.size(); i++) { delete vars[i]; } } } delete code; delete execStatus; } delete translateStatus; delete translator; if (!isDefaultExpr) { delete [] expr; } return 0; }
/// Add or update an entry in the cache. /// /// @param[in] path Asset path. /// @param[in] subDataIndex Sub-data index associated with the cached data. /// @param[in] pData Data to cache. /// @param[in] timestamp Timestamp value to associate with the entry in the cache. /// @param[in] size Number of bytes to cache. /// /// @return True if the cache was updated successfully, false if not. bool Cache::CacheEntry( AssetPath path, uint32_t subDataIndex, const void* pData, int64_t timestamp, uint32_t size ) { HELIUM_ASSERT( pData || size == 0 ); Status status; status.Read( m_cacheFileName.GetData() ); int64_t cacheFileSize = status.m_Size; uint64_t entryOffset = ( cacheFileSize == -1 ? 0 : static_cast< uint64_t >( cacheFileSize ) ); HELIUM_ASSERT( m_pEntryPool ); Entry* pEntryUpdate = m_pEntryPool->Allocate(); HELIUM_ASSERT( pEntryUpdate ); pEntryUpdate->offset = entryOffset; pEntryUpdate->timestamp = timestamp; pEntryUpdate->path = path; pEntryUpdate->subDataIndex = subDataIndex; pEntryUpdate->size = size; uint64_t originalOffset = 0; int64_t originalTimestamp = 0; uint32_t originalSize = 0; EntryKey key; key.path = path; key.subDataIndex = subDataIndex; EntryMapType::Accessor entryAccessor; bool bNewEntry = m_entryMap.Insert( entryAccessor, KeyValue< EntryKey, Entry* >( key, pEntryUpdate ) ); if( bNewEntry ) { HELIUM_TRACE( TraceLevels::Info, TXT( "Cache: Adding \"%s\" to cache \"%s\".\n" ), *path.ToString(), *m_cacheFileName ); m_entries.Push( pEntryUpdate ); } else { HELIUM_TRACE( TraceLevels::Info, TXT( "Cache: Updating \"%s\" in cache \"%s\".\n" ), *path.ToString(), *m_cacheFileName ); m_pEntryPool->Release( pEntryUpdate ); pEntryUpdate = entryAccessor->Second(); HELIUM_ASSERT( pEntryUpdate ); originalOffset = pEntryUpdate->offset; originalTimestamp = pEntryUpdate->timestamp; originalSize = pEntryUpdate->size; if( originalSize < size ) { pEntryUpdate->offset = entryOffset; } else { entryOffset = originalOffset; } pEntryUpdate->timestamp = timestamp; pEntryUpdate->size = size; } AsyncLoader& rLoader = AsyncLoader::GetStaticInstance(); rLoader.Lock(); bool bCacheSuccess = true; FileStream* pCacheStream = FileStream::OpenFileStream( m_cacheFileName, FileStream::MODE_WRITE, false ); if( !pCacheStream ) { HELIUM_TRACE( TraceLevels::Error, TXT( "Cache: Failed to open cache \"%s\" for writing.\n" ), *m_cacheFileName ); bCacheSuccess = false; } else { HELIUM_TRACE( TraceLevels::Info, TXT( "Cache: Caching \"%s\" to \"%s\" (%" ) PRIu32 TXT( " bytes @ offset %" ) PRIu64 TXT( ").\n" ), *path.ToString(), *m_cacheFileName, size, entryOffset ); uint64_t seekOffset = static_cast< uint64_t >( pCacheStream->Seek( static_cast< int64_t >( entryOffset ), SeekOrigins::Begin ) ); if( seekOffset != entryOffset ) { HELIUM_TRACE( TraceLevels::Error, TXT( "Cache: Cache file offset seek failed.\n" ) ); if( bNewEntry ) { m_entries.Pop(); m_entryMap.Remove( entryAccessor ); m_pEntryPool->Release( pEntryUpdate ); } else { pEntryUpdate->offset = originalOffset; pEntryUpdate->timestamp = originalTimestamp; pEntryUpdate->size = originalSize; } bCacheSuccess = false; } else { size_t writeSize = pCacheStream->Write( pData, 1, size ); if( writeSize != size ) { HELIUM_TRACE( TraceLevels::Error, ( TXT( "Cache: Failed to write %" ) PRIu32 TXT( " bytes to cache \"%s\" (%" ) PRIuSZ TXT( " bytes written).\n" ) ), size, *m_cacheFileName, writeSize ); if( bNewEntry ) { m_entries.Pop(); m_entryMap.Remove( entryAccessor ); m_pEntryPool->Release( pEntryUpdate ); } else { pEntryUpdate->offset = originalOffset; pEntryUpdate->timestamp = 
originalTimestamp; pEntryUpdate->size = originalSize; } bCacheSuccess = false; } else { HELIUM_TRACE( TraceLevels::Info, TXT( "Cache: Rewriting TOC file \"%s\".\n" ), *m_tocFileName ); FileStream* pTocStream = FileStream::OpenFileStream( m_tocFileName, FileStream::MODE_WRITE, true ); if( !pTocStream ) { HELIUM_TRACE( TraceLevels::Error, TXT( "Cache: Failed to open TOC \"%s\" for writing.\n" ), *m_tocFileName ); } else { BufferedStream* pBufferedStream = new BufferedStream( pTocStream ); HELIUM_ASSERT( pBufferedStream ); pBufferedStream->Write( &TOC_MAGIC, sizeof( TOC_MAGIC ), 1 ); pBufferedStream->Write( &sm_Version, sizeof( sm_Version ), 1 ); uint32_t entryCount = static_cast< uint32_t >( m_entries.GetSize() ); pBufferedStream->Write( &entryCount, sizeof( entryCount ), 1 ); String entryPath; uint_fast32_t entryCountFast = entryCount; for( uint_fast32_t entryIndex = 0; entryIndex < entryCountFast; ++entryIndex ) { Entry* pEntry = m_entries[ entryIndex ]; HELIUM_ASSERT( pEntry ); pEntry->path.ToString( entryPath ); HELIUM_ASSERT( entryPath.GetSize() < UINT16_MAX ); uint16_t pathSize = static_cast< uint16_t >( entryPath.GetSize() ); pBufferedStream->Write( &pathSize, sizeof( pathSize ), 1 ); pBufferedStream->Write( *entryPath, sizeof( char ), pathSize ); pBufferedStream->Write( &pEntry->subDataIndex, sizeof( pEntry->subDataIndex ), 1 ); pBufferedStream->Write( &pEntry->offset, sizeof( pEntry->offset ), 1 ); pBufferedStream->Write( &pEntry->timestamp, sizeof( pEntry->timestamp ), 1 ); pBufferedStream->Write( &pEntry->size, sizeof( pEntry->size ), 1 ); } delete pBufferedStream; delete pTocStream; } } } delete pCacheStream; } rLoader.Unlock(); return bCacheSuccess; }
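// Sketch of a reader for the TOC layout the code above writes. The field widths are
// inferred from the Write() calls (magic and version assumed to be uint32) and
// should be treated as assumptions, as should the illustrative magic value below.
#include <cstdint>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

struct TocEntry {
    std::string path;
    uint32_t subDataIndex = 0;
    uint64_t offset = 0;
    int64_t timestamp = 0;
    uint32_t size = 0;
};

template <typename T>
bool readPod(std::ifstream& in, T& out) {
    return static_cast<bool>(in.read(reinterpret_cast<char*>(&out), sizeof(T)));
}

std::vector<TocEntry> readToc(const std::string& file, uint32_t expectedMagic) {
    std::ifstream in(file, std::ios::binary);
    uint32_t magic = 0, version = 0, count = 0;
    if (!readPod(in, magic) || magic != expectedMagic)
        return {};
    if (!readPod(in, version) || !readPod(in, count))
        return {};
    std::vector<TocEntry> entries;
    for (uint32_t i = 0; i < count; ++i) {
        TocEntry e;
        uint16_t pathSize = 0;
        if (!readPod(in, pathSize))
            return entries;
        e.path.resize(pathSize);
        if (pathSize != 0 && !in.read(&e.path[0], pathSize))
            return entries;
        if (!readPod(in, e.subDataIndex) || !readPod(in, e.offset) ||
            !readPod(in, e.timestamp) || !readPod(in, e.size))
            return entries;
        entries.push_back(e);
    }
    return entries;
}

int main() {
    for (const TocEntry& e : readToc("cache.toc", 0x43544F43u))  // magic value is illustrative
        std::cout << e.path << " @ " << e.offset << " (" << e.size << " bytes)\n";
    return 0;
}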
/** * This is called by db/ops/query.cpp. This is the entry point for answering a query. */ string newRunQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result) { // This is a read lock. Client::ReadContext ctx(q.ns, dbpath); // Parse, canonicalize, plan, transcribe, and get a runner. Runner* rawRunner; CanonicalQuery* cq; Status status = getRunner(q, &rawRunner, &cq); if (!status.isOK()) { uasserted(17007, "Couldn't process query " + q.query.toString() + " why: " + status.reason()); } verify(NULL != rawRunner); auto_ptr<Runner> runner(rawRunner); log() << "Running query on new system: " << cq->toString(); // We freak out later if this changes before we're done with the query. const ChunkVersion shardingVersionAtStart = shardingState.getVersion(q.ns); // We use this a lot below. const LiteParsedQuery& pq = cq->getParsed(); // TODO: Remove when impl'd if (pq.hasOption(QueryOption_OplogReplay)) { warning() << "haven't implemented findingstartcursor yet\n"; } // Handle query option $maxTimeMS (not used with commands). curop.setMaxTimeMicros(static_cast<unsigned long long>(pq.getMaxTimeMS()) * 1000); killCurrentOp.checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point. // uassert if we are not on a primary, and not a secondary with SlaveOk query parameter set. replVerifyReadsOk(&pq); // If this exists, the collection is sharded. // If it doesn't exist, we can assume we're not sharded. // If we're sharded, we might encounter data that is not consistent with our sharding state. // We must ignore this data. CollectionMetadataPtr collMetadata; if (!shardingState.needCollectionMetadata(pq.ns())) { collMetadata = CollectionMetadataPtr(); } else { collMetadata = shardingState.getCollectionMetadata(pq.ns()); } // Run the query. // bb is used to hold query results. // This buffer should contain either requested documents per query or // explain information, but not both. BufBuilder bb(32768); bb.skip(sizeof(QueryResult)); // How many results have we obtained from the runner? int numResults = 0; // If we're replaying the oplog, we save the last time that we read. OpTime slaveReadTill; // Do we save the Runner in a ClientCursor for getMore calls later? bool saveClientCursor = false; // We turn on auto-yielding for the runner here. The runner registers itself with the // active runners list in ClientCursor. ClientCursor::registerRunner(runner.get()); runner->setYieldPolicy(Runner::YIELD_AUTO); auto_ptr<DeregisterEvenIfUnderlyingCodeThrows> safety( new DeregisterEvenIfUnderlyingCodeThrows(runner.get())); BSONObj obj; Runner::RunnerState state; // set this outside loop. we will need to use this both within loop and when deciding // to fill in explain information const bool isExplain = pq.isExplain(); while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&obj, NULL))) { // If we're sharded, make sure that we don't return any data that hasn't been migrated // off of our shard yet. if (collMetadata) { // This information can change if we yield and as such we must make sure to re-fetch // it if we yield. KeyPattern kp(collMetadata->getKeyPattern()); // This performs excessive BSONObj creation but that's OK for now. if (!collMetadata->keyBelongsToMe(kp.extractSingleKey(obj))) { continue; } } // Add result to output buffer. This is unnecessary if explain info is requested. if (!isExplain) { bb.appendBuf((void*)obj.objdata(), obj.objsize()); } // Count the result. ++numResults; // Possibly note slave's position in the oplog. 
if (pq.hasOption(QueryOption_OplogReplay)) { BSONElement e = obj["ts"]; if (Date == e.type() || Timestamp == e.type()) { slaveReadTill = e._opTime(); } } // TODO: only one type of 2d search doesn't support this. We need a way to pull it out // of CanonicalQuery. :( const bool supportsGetMore = true; if (isExplain) { if (enoughForExplain(pq, numResults)) { break; } } else if (!supportsGetMore && (enough(pq, numResults) || bb.len() >= MaxBytesToReturnToClientAtOnce)) { break; } else if (enoughForFirstBatch(pq, numResults, bb.len())) { // If only one result requested assume it's a findOne() and don't save the cursor. if (pq.wantMore() && 1 != pq.getNumToReturn()) { saveClientCursor = true; } break; } } // If we cache the runner later, we want to deregister it as it receives notifications // anyway by virtue of being cached. // // If we don't cache the runner later, we are deleting it, so it must be deregistered. // // So, no matter what, deregister the runner. safety.reset(); // Caller expects exceptions thrown in certain cases: // * in-memory sort using too much RAM. if (Runner::RUNNER_ERROR == state) { uasserted(17144, "Runner error, memory limit for sort probably exceeded"); } // Why save a dead runner? if (Runner::RUNNER_DEAD == state) { saveClientCursor = false; } else if (pq.hasOption(QueryOption_CursorTailable) && (1 != pq.getNumToReturn())) { // If pq.hasOption(tailable) the only plan the planner will output is a collscan with // tailable set. saveClientCursor = true; } // TODO(greg): This will go away soon. if (!shardingState.getVersion(pq.ns()).isWriteCompatibleWith(shardingVersionAtStart)) { // If the version changed during the query, we might be missing some data, and it's safe // to send this as mongos can resend at this point. throw SendStaleConfigException(pq.ns(), "version changed during initial query", shardingVersionAtStart, shardingState.getVersion(pq.ns())); } long long ccId = 0; if (saveClientCursor) { // We won't use the runner until it's getMore'd. runner->saveState(); // Allocate a new ClientCursor. We don't have to worry about leaking it as it's // inserted into a global map by its ctor. ClientCursor* cc = new ClientCursor(runner.get(), cq->getParsed().getOptions(), cq->getParsed().getFilter()); ccId = cc->cursorid(); log() << "caching runner with cursorid " << ccId << " after returning " << numResults << " results" << endl; // ClientCursor takes ownership of runner. Release to make sure it's not deleted. runner.release(); // TODO document if (pq.hasOption(QueryOption_OplogReplay) && !slaveReadTill.isNull()) { cc->slaveReadTill(slaveReadTill); } // TODO document if (pq.hasOption(QueryOption_Exhaust)) { curop.debug().exhaust = true; } // Set attributes for getMore. cc->setCollMetadata(collMetadata); cc->setPos(numResults); // If the query had a time limit, remaining time is "rolled over" to the cursor (for // use by future getmore ops). cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros()); } // append explain information to query results if (isExplain) { BSONObjBuilder bob; bob.append("n", numResults); BSONObj obj = bob.done(); bb.appendBuf((void*)obj.objdata(), obj.objsize()); // The explain output is actually a result. numResults = 1; } // Add the results from the query into the output buffer. result.appendData(bb.buf(), bb.len()); bb.decouple(); // Fill out the output buffer's header. QueryResult* qr = static_cast<QueryResult*>(result.header()); qr->cursorId = ccId; curop.debug().cursorid = (0 == ccId ? 
-1 : ccId); qr->setResultFlagsToOk(); qr->setOperation(opReply); qr->startingFrom = 0; qr->nReturned = numResults; // TODO: nscanned is bogus. // curop.debug().nscanned = ( cursor ? cursor->nscanned() : 0LL ); curop.debug().ntoskip = pq.getSkip(); curop.debug().nreturned = numResults; // curop.debug().exhaust is set above. return curop.debug().exhaust ? pq.ns() : ""; }
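// Illustrative sketch of the batch cut-off logic driving the getNext loop above:
// keep appending results until the requested count or a byte budget for the first
// batch is reached. The budget constant is made up; the real thresholds live in
// enough()/enoughForFirstBatch() in the server source.
#include <iostream>
#include <vector>

bool firstBatchIsFull(int numWanted, int numAppended, int bytesBuffered) {
    const int kMaxFirstBatchBytes = 1024 * 1024;  // assumed budget, not the server's
    if (numWanted > 0 && numAppended >= numWanted)
        return true;
    return bytesBuffered >= kMaxFirstBatchBytes;
}

int main() {
    std::vector<int> docSizes(100000, 512);  // pretend result sizes in bytes
    int numResults = 0, bytes = 0;
    for (int size : docSizes) {
        bytes += size;
        ++numResults;
        if (firstBatchIsFull(/*numWanted=*/0, numResults, bytes))
            break;
    }
    std::cout << "first batch: " << numResults << " docs, " << bytes << " bytes\n";
    return 0;
}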