Example #1
Status Strategy::commandOpUnsharded(OperationContext* txn,
                                    const std::string& db,
                                    const BSONObj& command,
                                    int options,
                                    const std::string& versionedNS,
                                    CommandResult* cmdResult) {
    // Note that this implementation will not handle targeting retries and fails when the
    // sharding metadata is too stale
    auto status = grid.catalogCache()->getDatabase(txn, db);
    if (!status.isOK()) {
        mongoutils::str::stream ss;
        ss << "Passthrough command failed: " << command.toString() << " on ns " << versionedNS
           << ". Caused by " << causedBy(status.getStatus());
        return Status(ErrorCodes::IllegalOperation, ss);
    }

    shared_ptr<DBConfig> conf = status.getValue();
    if (conf->isSharded(versionedNS)) {
        mongoutils::str::stream ss;
        ss << "Passthrough command failed: " << command.toString() << " on ns " << versionedNS
           << ". Cannot run on sharded namespace.";
        return Status(ErrorCodes::IllegalOperation, ss);
    }

    const auto primaryShard = grid.shardRegistry()->getShard(txn, conf->getPrimaryId());

    BSONObj shardResult;
    try {
        ShardConnection conn(primaryShard->getConnString(), "");

        // TODO: this can throw a stale config when mongos is not up-to-date -- fix.
        if (!conn->runCommand(db, command, shardResult, options)) {
            conn.done();
            return Status(ErrorCodes::OperationFailed,
                          str::stream() << "Passthrough command failed: " << command << " on ns "
                                        << versionedNS << "; result: " << shardResult);
        }
        conn.done();
    } catch (const DBException& ex) {
        return ex.toStatus();
    }

    // Fill out the command result.
    cmdResult->shardTargetId = conf->getPrimaryId();
    cmdResult->result = shardResult;
    cmdResult->target = primaryShard->getConnString();

    return Status::OK();
}
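A minimal caller sketch for the passthrough helper above; the database name, namespace, and command below are hypothetical, and only the CommandResult fields shown in the example are used.

// Hedged caller sketch (not from the MongoDB tree); db, ns and command are made up.
Strategy::CommandResult cmdResult;
Status status = Strategy::commandOpUnsharded(
    txn, "test", BSON("collStats" << "foo"), 0, "test.foo", &cmdResult);
if (status.isOK()) {
    log() << "passthrough ran on shard " << cmdResult.shardTargetId << " against "
          << cmdResult.target.toString() << "; reply: " << cmdResult.result;
} else {
    warning() << "passthrough failed: " << status;
}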
Example #2
StatusWith<boost::optional<executor::RemoteCommandRequest>>
ShardingNetworkConnectionHook::makeRequest(const HostAndPort& remoteHost) {
    if (serverGlobalParams.clusterRole == ClusterRole::ConfigServer) {
        // TODO: SERVER-23973 Temporary crutch until we decide where to get the config server
        // connection string.
        return {boost::none};
    }

    auto shard = grid.shardRegistry()->getShardForHostNoReload(remoteHost);
    if (!shard) {
        return {ErrorCodes::ShardNotFound,
                str::stream() << "No shard found for host: " << remoteHost.toString()};
    }
    if (shard->isConfig()) {
        // No need to initialize sharding metadata if talking to a config server
        return {boost::none};
    }

    SetShardVersionRequest ssv = SetShardVersionRequest::makeForInitNoPersist(
        grid.shardRegistry()->getConfigServerConnectionString(),
        shard->getId(),
        shard->getConnString());
    executor::RemoteCommandRequest request;
    request.dbname = "admin";
    request.target = remoteHost;
    request.timeout = stdx::chrono::seconds{30};
    request.cmdObj = ssv.toBSON();

    return {request};
}
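A short sketch of how a caller might consume the StatusWith<boost::optional<...>> returned above; the hook instance and target host are assumptions, and actually sending the request over the new connection is left out.

// Hedged sketch; 'hook' is an assumed, already-constructed instance and the host is hypothetical.
Status primeConnection(ShardingNetworkConnectionHook& hook, const HostAndPort& host) {
    auto swRequest = hook.makeRequest(host);
    if (!swRequest.isOK()) {
        return swRequest.getStatus();  // e.g. ShardNotFound for a host not in the registry
    }
    if (swRequest.getValue()) {
        // Non-none: a setShardVersion init command must be sent on the new connection.
        executor::RemoteCommandRequest initRequest = std::move(*swRequest.getValue());
        // ... run initRequest against 'host' before using the connection ...
    }
    // boost::none: no sharding initialization is needed (e.g. the peer is a config server).
    return Status::OK();
}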
Example #3
void ParallelSortClusteredCursor::setupVersionAndHandleSlaveOk(
    OperationContext* txn,
    PCStatePtr state,
    const ShardId& shardId,
    std::shared_ptr<Shard> primary,
    const NamespaceString& ns,
    const string& vinfo,
    std::shared_ptr<ChunkManager> manager) {
    if (manager) {
        state->manager = manager;
    } else if (primary) {
        state->primary = primary;
    }

    verify(!primary || shardId == primary->getId());

    // Setup conn
    if (!state->conn) {
        const auto shard = grid.shardRegistry()->getShard(txn, shardId);
        state->conn.reset(new ShardConnection(shard->getConnString(), ns.ns(), manager));
    }

    const DBClientBase* rawConn = state->conn->getRawConn();
    bool allowShardVersionFailure = rawConn->type() == ConnectionString::SET &&
        DBClientReplicaSet::isSecondaryQuery(_qSpec.ns(), _qSpec.query(), _qSpec.options());
    bool connIsDown = rawConn->isFailed();
    if (allowShardVersionFailure && !connIsDown) {
        // If the replica set connection believes that it has a valid primary that is up,
        // confirm that the replica set monitor agrees that the suspected primary is indeed up.
        const DBClientReplicaSet* replConn = dynamic_cast<const DBClientReplicaSet*>(rawConn);
        invariant(replConn);
        ReplicaSetMonitorPtr rsMonitor = ReplicaSetMonitor::get(replConn->getSetName());
        if (!rsMonitor->isHostUp(replConn->getSuspectedPrimaryHostAndPort())) {
            connIsDown = true;
        }
    }

    if (allowShardVersionFailure && connIsDown) {
        // If we're doing a secondary-allowed query and the primary is down, don't attempt to
        // set the shard version.

        state->conn->donotCheckVersion();

        // A side effect of this short-circuiting is that the mongos will not be able to figure
        // out on its own that the primary is now up, and it has to rely on other threads to
        // refresh node states.

        OCCASIONALLY {
            const DBClientReplicaSet* repl = dynamic_cast<const DBClientReplicaSet*>(rawConn);
            dassert(repl);
            warning() << "Primary for " << repl->getServerAddress()
                      << " was down before, bypassing setShardVersion."
                      << " The local replica set view and targeting may be stale.";
        }
    } else {
Example #4
    BSONObj Shard::runCommand( const string& db , const BSONObj& cmd , bool internal ) const {
        scoped_ptr<ScopedDbConnection> conn;

        if ( internal ) {
            conn.reset( ScopedDbConnection::getInternalScopedDbConnection( getConnString() ) );
        } else {
            conn.reset( ScopedDbConnection::getScopedDbConnection( getConnString() ) );
        }
        BSONObj res;
        bool ok = conn->get()->runCommand( db , cmd , res );
        if ( ! ok ) {
            stringstream ss;
            ss << "runCommand (" << cmd << ") on shard (" << _name << ") failed : " << res;
            conn->done();
            throw UserException( 13136 , ss.str() );
        }
        res = res.getOwned();
        conn->done();
        return res;
    }
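Because the legacy Shard::runCommand above reports failure by throwing UserException 13136 rather than returning a Status, callers typically wrap it in a try/catch; this sketch is illustrative, and the shard instance and command are assumptions.

// Illustrative caller; 'shard' is an assumed Shard instance.
try {
    BSONObj res = shard.runCommand("admin", BSON("listDatabases" << 1), false /* internal */);
    // use res ...
} catch (const UserException& ex) {
    // the exception message carries the command, shard name and raw result
    warning() << "runCommand on shard failed: " << ex.what();
}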
Example #5
void ParallelSortClusteredCursor::setupVersionAndHandleSlaveOk(
    OperationContext* txn,
    PCStatePtr state,
    const ShardId& shardId,
    std::shared_ptr<Shard> primary,
    const NamespaceString& ns,
    const string& vinfo,
    std::shared_ptr<ChunkManager> manager) {
    if (manager) {
        state->manager = manager;
    } else if (primary) {
        state->primary = primary;
    }

    verify(!primary || shardId == primary->getId());

    // Setup conn
    if (!state->conn) {
        const auto shard = uassertStatusOK(Grid::get(txn)->shardRegistry()->getShard(txn, shardId));
        state->conn.reset(new ShardConnection(shard->getConnString(), ns.ns(), manager));
    }

    const DBClientBase* rawConn = state->conn->getRawConn();
    bool allowShardVersionFailure = rawConn->type() == ConnectionString::SET &&
        DBClientReplicaSet::isSecondaryQuery(_qSpec.ns(), _qSpec.query(), _qSpec.options());

    // Skip shard version checking if primary is known to be down.
    if (allowShardVersionFailure) {
        const DBClientReplicaSet* replConn = dynamic_cast<const DBClientReplicaSet*>(rawConn);
        invariant(replConn);
        ReplicaSetMonitorPtr rsMonitor = ReplicaSetMonitor::get(replConn->getSetName());
        uassert(16388,
                str::stream() << "cannot access unknown replica set: " << replConn->getSetName(),
                rsMonitor != nullptr);
        if (!rsMonitor->isKnownToHaveGoodPrimary()) {
            state->conn->donotCheckVersion();

            // A side effect of this short-circuiting is that the mongos will not be able to
            // figure out on its own that the primary is now up, and it has to rely on other
            // threads to refresh node states.

            OCCASIONALLY {
                const DBClientReplicaSet* repl = dynamic_cast<const DBClientReplicaSet*>(rawConn);
                dassert(repl);
                warning() << "Primary for " << repl->getServerAddress()
                          << " was down before, bypassing setShardVersion."
                          << " The local replica set view and targeting may be stale.";
            }

            return;
        }
    }
Example #6
Status MigrationChunkClonerSourceLegacy::startClone(OperationContext* txn) {
    invariant(!txn->lockState()->isLocked());
    auto scopedGuard = MakeGuard([&] { cancelClone(txn); });

    // Resolve the donor and recipient shards and their connection string

    {
        auto donorShard = grid.shardRegistry()->getShard(txn, _args.getFromShardId());
        _donorCS = donorShard->getConnString();
    }

    {
        auto recipientShard = grid.shardRegistry()->getShard(txn, _args.getToShardId());
        auto shardHostStatus = recipientShard->getTargeter()->findHost(
            ReadPreferenceSetting{ReadPreference::PrimaryOnly});
        if (!shardHostStatus.isOK()) {
            return shardHostStatus.getStatus();
        }

        _recipientHost = std::move(shardHostStatus.getValue());
    }

    // Prepare the currently available documents
    Status status = _storeCurrentLocs(txn);
    if (!status.isOK()) {
        return status;
    }

    // Tell the recipient shard to start cloning
    BSONObjBuilder cmdBuilder;
    StartChunkCloneRequest::appendAsCommand(&cmdBuilder,
                                            _args.getNss(),
                                            _sessionId,
                                            _args.getConfigServerCS(),
                                            _donorCS,
                                            _args.getToShardId(),
                                            _args.getMinKey(),
                                            _args.getMaxKey(),
                                            _shardKeyPattern.toBSON(),
                                            _args.getSecondaryThrottle());

    auto responseStatus = _callRecipient(cmdBuilder.obj());
    if (!responseStatus.isOK()) {
        return responseStatus.getStatus();
    }

    scopedGuard.Dismiss();
    return Status::OK();
}
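startClone above relies on a dismissible scope guard so that cancelClone runs on every early-return path and is skipped only after success; here is a minimal sketch of that idiom, with stepOne/undoStepOne as hypothetical placeholders.

// Minimal sketch of the MakeGuard/Dismiss idiom used above; the helpers are made up.
Status doGuardedWork(OperationContext* txn) {
    auto scopedGuard = MakeGuard([&] { undoStepOne(txn); });

    Status status = stepOne(txn);
    if (!status.isOK()) {
        return status;  // guard fires here and rolls the work back
    }

    scopedGuard.Dismiss();  // success path: keep the work
    return Status::OK();
}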
Example #7
void ChunkManager::calcInitSplitsAndShards(const ShardId& primaryShardId,
                                           const vector<BSONObj>* initPoints,
                                           const set<ShardId>* initShardIds,
                                           vector<BSONObj>* splitPoints,
                                           vector<ShardId>* shardIds) const {
    verify(_chunkMap.size() == 0);

    unsigned long long numObjects = 0;
    Chunk c(this,
            _keyPattern.getKeyPattern().globalMin(),
            _keyPattern.getKeyPattern().globalMax(),
            primaryShardId);

    if (!initPoints || !initPoints->size()) {
        // discover split points
        {
            const auto primaryShard = grid.shardRegistry()->getShard(primaryShardId);
            // get stats to see if there is any data
            ScopedDbConnection shardConn(primaryShard->getConnString());

            numObjects = shardConn->count(getns());
            shardConn.done();
        }

        if (numObjects > 0)
            c.pickSplitVector(*splitPoints, Chunk::MaxChunkSize);

        // since docs already exist, we must use the primary shard
        shardIds->push_back(primaryShardId);
    } else {
        // make sure points are unique and ordered
        set<BSONObj> orderedPts;
        for (unsigned i = 0; i < initPoints->size(); ++i) {
            BSONObj pt = (*initPoints)[i];
            orderedPts.insert(pt);
        }
        for (set<BSONObj>::iterator it = orderedPts.begin(); it != orderedPts.end(); ++it) {
            splitPoints->push_back(*it);
        }

        if (!initShardIds || !initShardIds->size()) {
            // If not specified, only use the primary shard (note that it's not safe for mongos
            // to put initial chunks on other shards without the primary mongod knowing).
            shardIds->push_back(primaryShardId);
        } else {
            std::copy(initShardIds->begin(), initShardIds->end(), std::back_inserter(*shardIds));
        }
    }
}
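In the else branch above, caller-supplied split points are deduplicated and ordered before use; the tiny sketch below, assuming a shard key of {x: 1}, shows the effect.

// Hedged illustration: duplicate and out-of-order pre-split points collapse into a
// sorted, unique list, exactly as in the orderedPts loop above.
std::vector<BSONObj> initPoints = {BSON("x" << 20), BSON("x" << 5), BSON("x" << 20)};
std::set<BSONObj> orderedPts(initPoints.begin(), initPoints.end());
std::vector<BSONObj> splitPoints(orderedPts.begin(), orderedPts.end());
// splitPoints is now [{x: 5}, {x: 20}]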
Example #8
StatusWith<boost::optional<executor::RemoteCommandRequest>>
ShardingNetworkConnectionHook::makeRequest(const HostAndPort& remoteHost) {
    auto shard = grid.shardRegistry()->getShardForHostNoReload(remoteHost);
    if (!shard) {
        return {ErrorCodes::ShardNotFound,
                str::stream() << "No shard found for host: " << remoteHost.toString()};
    }
    if (shard->isConfig()) {
        // No need to initialize sharding metadata if talking to a config server
        return {boost::none};
    }

    SetShardVersionRequest ssv = SetShardVersionRequest::makeForInitNoPersist(
        grid.shardRegistry()->getConfigServerConnectionString(),
        shard->getId(),
        shard->getConnString());
    executor::RemoteCommandRequest request;
    request.dbname = "admin";
    request.target = remoteHost;
    request.timeout = Seconds{30};
    request.cmdObj = ssv.toBSON();

    return {request};
}
Example #9
/**
 * Updates the remote cached version on the remote shard host (primary, in the case of replica
 * sets) if needed with a fully-qualified shard version for the given namespace:
 *   config server(s) + shard name + shard version
 *
 * If no remote cached version has ever been set, an initial shard version is sent.
 *
 * If the namespace is empty and no version has ever been sent, the config server + shard name
 * is sent to the remote shard host to initialize the connection as coming from mongos.
 * NOTE: This initialization is *best-effort only*.  Operations which wish to correctly version
 * must send the namespace.
 *
 * Config servers are special and are not kept up to date with this protocol (unless they are
 * also shards).  This is safe so long as config servers only contain unversioned collections.
 *
 * It is an error to call checkShardVersion with an unversionable connection (isVersionableCB).
 *
 * @return true if we contacted the remote host
 */
bool checkShardVersion(DBClientBase* conn_in,
                       const string& ns,
                       ChunkManagerPtr refManager,
                       bool authoritative,
                       int tryNumber) {
    // TODO: cache, optimize, etc...

    // Empty namespaces are special - we require initialization but not versioning
    if (ns.size() == 0) {
        return initShardVersionEmptyNS(conn_in);
    }

    auto status = grid.catalogCache()->getDatabase(nsToDatabase(ns));
    if (!status.isOK()) {
        return false;
    }

    shared_ptr<DBConfig> conf = status.getValue();

    DBClientBase* conn = getVersionable(conn_in);
    verify(conn);  // errors thrown above

    unsigned long long officialSequenceNumber = 0;

    ShardPtr primary;
    ChunkManagerPtr manager;
    if (authoritative)
        conf->getChunkManagerIfExists(ns, true);

    conf->getChunkManagerOrPrimary(ns, manager, primary);

    if (manager) {
        officialSequenceNumber = manager->getSequenceNumber();
    }

    const auto shard = grid.shardRegistry()->getShard(conn->getServerAddress());
    uassert(ErrorCodes::ShardNotFound,
            str::stream() << conn->getServerAddress() << " is not recognized as a shard",
            shard);

    // Check this manager against the reference manager
    if (manager) {
        if (refManager && !refManager->compatibleWith(*manager, shard->getId())) {
            const ChunkVersion refVersion(refManager->getVersion(shard->getId()));
            const ChunkVersion currentVersion(manager->getVersion(shard->getId()));

            string msg(str::stream()
                       << "manager (" << currentVersion.toString() << " : "
                       << manager->getSequenceNumber() << ") "
                       << "not compatible with reference manager (" << refVersion.toString()
                       << " : " << refManager->getSequenceNumber() << ") "
                       << "on shard " << shard->getId() << " (" << shard->getConnString().toString()
                       << ")");

            throw SendStaleConfigException(ns, msg, refVersion, currentVersion);
        }
    } else if (refManager) {
        string msg(str::stream() << "not sharded ("
                                 << ((manager.get() == 0) ? string("<none>") : str::stream()
                                             << manager->getSequenceNumber())
                                 << ") but has reference manager ("
                                 << refManager->getSequenceNumber() << ") "
                                 << "on conn " << conn->getServerAddress() << " ("
                                 << conn_in->getServerAddress() << ")");

        throw SendStaleConfigException(
            ns, msg, refManager->getVersion(shard->getId()), ChunkVersion::UNSHARDED());
    }

    // Do not send setShardVersion to collections on the config servers - this causes problems
    // when config servers are also shards and get SSV with conflicting names.
    // TODO: Make config servers regular shards
    if (primary && primary->getId() == "config") {
        return false;
    }

    // Has the ChunkManager been reloaded since the last time we updated the shard version over
    // this connection?  If we've never updated the shard version, do so now.
    unsigned long long sequenceNumber = 0;
    if (connectionShardStatus.getSequence(conn, ns, &sequenceNumber)) {
        if (sequenceNumber == officialSequenceNumber) {
            return false;
        }
    }

    ChunkVersion version = ChunkVersion(0, 0, OID());
    if (manager) {
        version = manager->getVersion(shard->getId());
    }

    LOG(1) << "setting shard version of " << version << " for " << ns << " on shard "
           << shard->toString();

    LOG(3) << "last version sent with chunk manager iteration " << sequenceNumber
           << ", current chunk manager iteration is " << officialSequenceNumber;

    BSONObj result;
    if (setShardVersion(*conn,
                        ns,
                        grid.catalogManager()->connectionString().toString(),
                        version,
                        manager.get(),
                        authoritative,
                        result)) {
        LOG(1) << "      setShardVersion success: " << result;
        connectionShardStatus.setSequence(conn, ns, officialSequenceNumber);
        return true;
    }

    LOG(1) << "       setShardVersion failed!\n" << result << endl;

    if (result["need_authoritative"].trueValue())
        massert(10428, "need_authoritative set but in authoritative mode already", !authoritative);

    if (!authoritative) {
        // use the original connection and get a fresh versionable connection
        // since conn can be invalidated (or worse, freed) after the failure
        checkShardVersion(conn_in, ns, refManager, 1, tryNumber + 1);
        return true;
    }

    if (result["reloadConfig"].trueValue()) {
        if (result["version"].timestampTime() == Date_t()) {
            warning() << "reloading full configuration for " << conf->name()
                      << ", connection state indicates significant version changes";

            // reload db
            conf->reload();
        } else {
            // reload config
            conf->getChunkManager(ns, true);
        }
    }

    const int maxNumTries = 7;
    if (tryNumber < maxNumTries) {
        LOG(tryNumber < (maxNumTries / 2) ? 1 : 0)
            << "going to retry checkShardVersion shard: " << shard->toString() << " " << result;
        sleepmillis(10 * tryNumber);
        // use the original connection and get a fresh versionable connection
        // since conn can be invalidated (or worse, freed) after the failure
        checkShardVersion(conn_in, ns, refManager, true, tryNumber + 1);
        return true;
    }

    string errmsg = str::stream() << "setShardVersion failed shard: " << shard->toString() << " "
                                  << result;
    log() << "     " << errmsg << endl;
    massert(10429, errmsg, 0);
    return true;
}
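Per the block comment above, an empty namespace only initializes the connection while a non-empty one fully versions it; the sketch below shows both call shapes, with 'conn' as an assumed versionable DBClientBase* and a hypothetical namespace.

// Hedged sketch; a non-null reference manager would additionally be cross-checked, and a
// versioned call may throw SendStaleConfigException or retry internally before returning.
checkShardVersion(conn, "", ChunkManagerPtr(), false /* authoritative */, 1);        // best-effort init only
checkShardVersion(conn, "test.foo", ChunkManagerPtr(), false /* authoritative */, 1); // fully versioned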
Example #10
Status ClusterAggregate::runAggregate(OperationContext* txn,
                                      const Namespaces& namespaces,
                                      BSONObj cmdObj,
                                      int options,
                                      BSONObjBuilder* result) {
    auto dbname = namespaces.executionNss.db().toString();
    auto status = grid.catalogCache()->getDatabase(txn, dbname);
    if (!status.isOK()) {
        appendEmptyResultSet(*result, status.getStatus(), namespaces.requestedNss.ns());
        return Status::OK();
    }

    std::shared_ptr<DBConfig> conf = status.getValue();

    if (!conf->isShardingEnabled()) {
        return aggPassthrough(txn, namespaces, conf, cmdObj, result, options);
    }

    auto request = AggregationRequest::parseFromBSON(namespaces.executionNss, cmdObj);
    if (!request.isOK()) {
        return request.getStatus();
    }

    // Determine the appropriate collation and 'resolve' involved namespaces to make the
    // ExpressionContext.

    // We won't try to execute anything on a mongos, but we still have to populate this map so that
    // any $lookups, etc. will be able to have a resolved view definition. It's okay that this is
    // incorrect, we will repopulate the real resolved namespace map on the mongod. Note that we
    // need to check if any involved collections are sharded before forwarding an aggregation
    // command on an unsharded collection.
    StringMap<ExpressionContext::ResolvedNamespace> resolvedNamespaces;
    LiteParsedPipeline liteParsedPipeline(request.getValue());
    for (auto&& ns : liteParsedPipeline.getInvolvedNamespaces()) {
        uassert(28769, str::stream() << ns.ns() << " cannot be sharded", !conf->isSharded(ns.ns()));
        resolvedNamespaces[ns.coll()] = {ns, std::vector<BSONObj>{}};
    }

    if (!conf->isSharded(namespaces.executionNss.ns())) {
        return aggPassthrough(txn, namespaces, conf, cmdObj, result, options);
    }
    auto chunkMgr = conf->getChunkManager(txn, namespaces.executionNss.ns());

    std::unique_ptr<CollatorInterface> collation;
    if (!request.getValue().getCollation().isEmpty()) {
        collation = uassertStatusOK(CollatorFactoryInterface::get(txn->getServiceContext())
                                        ->makeFromBSON(request.getValue().getCollation()));
    } else if (chunkMgr->getDefaultCollator()) {
        collation = chunkMgr->getDefaultCollator()->clone();
    }

    boost::intrusive_ptr<ExpressionContext> mergeCtx = new ExpressionContext(
        txn, request.getValue(), std::move(collation), std::move(resolvedNamespaces));
    mergeCtx->inRouter = true;
    // explicitly *not* setting mergeCtx->tempDir

    // Parse and optimize the pipeline specification.
    auto pipeline = Pipeline::parse(request.getValue().getPipeline(), mergeCtx);
    if (!pipeline.isOK()) {
        return pipeline.getStatus();
    }

    pipeline.getValue()->optimizePipeline();

    // If the first $match stage is an exact match on the shard key (with a simple collation or
    // no string matching), we only have to send it to one shard, so send the command to that
    // shard.
    BSONObj firstMatchQuery = pipeline.getValue()->getInitialQuery();
    BSONObj shardKeyMatches;
    shardKeyMatches = uassertStatusOK(
        chunkMgr->getShardKeyPattern().extractShardKeyFromQuery(txn, firstMatchQuery));
    bool singleShard = false;
    if (!shardKeyMatches.isEmpty()) {
        auto chunk = chunkMgr->findIntersectingChunk(
            txn, shardKeyMatches, request.getValue().getCollation());
        if (chunk.isOK()) {
            singleShard = true;
        }
    }

    // Don't need to split pipeline if the first $match is an exact match on shard key, unless
    // there is a stage that needs to be run on the primary shard.
    const bool needPrimaryShardMerger = pipeline.getValue()->needsPrimaryShardMerger();
    const bool needSplit = !singleShard || needPrimaryShardMerger;

    // Split the pipeline into pieces for mongod(s) and this mongos. If needSplit is true,
    // 'pipeline' will become the merger side.
    boost::intrusive_ptr<Pipeline> shardPipeline(needSplit ? pipeline.getValue()->splitForSharded()
                                                           : pipeline.getValue());

    // Create the command for the shards. The 'fromRouter' field means produce output to be
    // merged.
    MutableDocument commandBuilder(request.getValue().serializeToCommandObj());
    commandBuilder[AggregationRequest::kPipelineName] = Value(shardPipeline->serialize());
    if (needSplit) {
        commandBuilder[AggregationRequest::kFromRouterName] = Value(true);
        commandBuilder[AggregationRequest::kCursorName] =
            Value(DOC(AggregationRequest::kBatchSizeName << 0));
    }

    // These fields are not part of the AggregationRequest since they are not handled by the
    // aggregation subsystem, so we serialize them separately.
    const std::initializer_list<StringData> fieldsToPropagateToShards = {
        "$queryOptions", "readConcern", QueryRequest::cmdOptionMaxTimeMS,
    };
    for (auto&& field : fieldsToPropagateToShards) {
        commandBuilder[field] = Value(cmdObj[field]);
    }

    BSONObj shardedCommand = commandBuilder.freeze().toBson();
    BSONObj shardQuery = shardPipeline->getInitialQuery();

    // Run the command on the shards
    // TODO need to make sure cursors are killed if a retry is needed
    std::vector<Strategy::CommandResult> shardResults;
    Strategy::commandOp(txn,
                        dbname,
                        shardedCommand,
                        options,
                        namespaces.executionNss.ns(),
                        shardQuery,
                        request.getValue().getCollation(),
                        &shardResults);

    if (mergeCtx->isExplain) {
        // This must be checked before we start modifying result.
        uassertAllShardsSupportExplain(shardResults);

        if (needSplit) {
            *result << "needsPrimaryShardMerger" << needPrimaryShardMerger << "splitPipeline"
                    << DOC("shardsPart" << shardPipeline->writeExplainOps() << "mergerPart"
                                        << pipeline.getValue()->writeExplainOps());
        } else {
            *result << "splitPipeline" << BSONNULL;
        }

        BSONObjBuilder shardExplains(result->subobjStart("shards"));
        for (size_t i = 0; i < shardResults.size(); i++) {
            shardExplains.append(shardResults[i].shardTargetId,
                                 BSON("host" << shardResults[i].target.toString() << "stages"
                                             << shardResults[i].result["stages"]));
        }

        return Status::OK();
    }

    if (!needSplit) {
        invariant(shardResults.size() == 1);
        invariant(shardResults[0].target.getServers().size() == 1);
        auto executorPool = grid.getExecutorPool();
        const BSONObj reply =
            uassertStatusOK(storePossibleCursor(shardResults[0].target.getServers()[0],
                                                shardResults[0].result,
                                                namespaces.requestedNss,
                                                executorPool->getArbitraryExecutor(),
                                                grid.getCursorManager()));
        result->appendElements(reply);
        return getStatusFromCommandResult(reply);
    }

    pipeline.getValue()->addInitialSource(
        DocumentSourceMergeCursors::create(parseCursors(shardResults), mergeCtx));

    MutableDocument mergeCmd(request.getValue().serializeToCommandObj());
    mergeCmd["pipeline"] = Value(pipeline.getValue()->serialize());
    mergeCmd["cursor"] = Value(cmdObj["cursor"]);

    if (cmdObj.hasField("$queryOptions")) {
        mergeCmd["$queryOptions"] = Value(cmdObj["$queryOptions"]);
    }

    if (cmdObj.hasField(QueryRequest::cmdOptionMaxTimeMS)) {
        mergeCmd[QueryRequest::cmdOptionMaxTimeMS] =
            Value(cmdObj[QueryRequest::cmdOptionMaxTimeMS]);
    }

    mergeCmd.setField("writeConcern", Value(cmdObj["writeConcern"]));
    mergeCmd.setField("readConcern", Value(cmdObj["readConcern"]));

    // If the user didn't specify a collation already, make sure there's a collation attached to
    // the merge command, since the merging shard may not have the collection metadata.
    if (mergeCmd.peek()["collation"].missing()) {
        mergeCmd.setField("collation",
                          mergeCtx->getCollator()
                              ? Value(mergeCtx->getCollator()->getSpec().toBSON())
                              : Value(Document{CollationSpec::kSimpleSpec}));
    }

    std::string outputNsOrEmpty;
    if (DocumentSourceOut* out =
            dynamic_cast<DocumentSourceOut*>(pipeline.getValue()->getSources().back().get())) {
        outputNsOrEmpty = out->getOutputNs().ns();
    }

    // Run merging command on random shard, unless a stage needs the primary shard. Need to use
    // ShardConnection so that the merging mongod is sent the config servers on connection init.
    auto& prng = txn->getClient()->getPrng();
    const auto& mergingShardId = (needPrimaryShardMerger || internalQueryAlwaysMergeOnPrimaryShard)
        ? conf->getPrimaryId()
        : shardResults[prng.nextInt32(shardResults.size())].shardTargetId;
    const auto mergingShard = uassertStatusOK(grid.shardRegistry()->getShard(txn, mergingShardId));

    ShardConnection conn(mergingShard->getConnString(), outputNsOrEmpty);
    BSONObj mergedResults =
        aggRunCommand(conn.get(), namespaces, mergeCmd.freeze().toBson(), options);
    conn.done();

    if (auto wcErrorElem = mergedResults["writeConcernError"]) {
        appendWriteConcernErrorToCmdResponse(mergingShardId, wcErrorElem, *result);
    }

    // Copy output from merging (primary) shard to the output object from our command.
    // Also, propagates errmsg and code if ok == false.
    result->appendElementsUnique(mergedResults);

    return getStatusFromCommandResult(result->asTempObj());
}
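The single-shard fast path in the aggregation code above hinges on whether the pipeline's leading $match can be turned into an exact shard-key match; a small sketch, assuming a shard key of {x: 1} and reusing chunkMgr from the example:

// Assumes shard key {x: 1}; firstMatchQuery stands in for the pipeline's initial $match.
BSONObj firstMatchQuery = BSON("x" << 42);
BSONObj shardKeyMatches = uassertStatusOK(
    chunkMgr->getShardKeyPattern().extractShardKeyFromQuery(txn, firstMatchQuery));
// shardKeyMatches == {x: 42}, so the aggregation can target the one shard owning that chunk;
// an empty result would force the broadcast-and-merge path (needSplit == true).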
Example #11
bool appendRawResponses(OperationContext* opCtx,
                        std::string* errmsg,
                        BSONObjBuilder* output,
                        std::vector<AsyncRequestsSender::Response> shardResponses,
                        std::set<ErrorCodes::Error> ignoredErrors) {
    // Always include ShardNotFound as an ignored error, since this node may not have realized a
    // shard has been removed.
    ignoredErrors.insert(ErrorCodes::ShardNotFound);

    BSONObjBuilder subobj;  // Stores raw responses by ConnectionString

    // Stores all errors; we will remove ignoredErrors later if some shard returned success.
    std::vector<std::pair<std::string, Status>> errors;  // Stores errors by ConnectionString

    BSONElement wcErrorElem;  // Stores the first writeConcern error we encounter
    ShardId wcErrorShardId;   // Stores the shardId for the first writeConcern error we encounter
    bool hasWCError = false;  // Whether we have encountered a writeConcern error yet

    for (const auto& shardResponse : shardResponses) {
        // Get the Shard object in order to get the shard's ConnectionString.
        const auto swShard =
            Grid::get(opCtx)->shardRegistry()->getShard(opCtx, shardResponse.shardId);
        if (ErrorCodes::ShardNotFound == swShard.getStatus().code()) {
            // Store the error by ShardId, since we cannot know the shard connection string, and it
            // is only used for reporting.
            errors.push_back(std::make_pair(shardResponse.shardId.toString(), swShard.getStatus()));
            continue;
        }
        const auto shard = uassertStatusOK(swShard);
        const auto shardConnStr = shard->getConnString().toString();

        Status sendStatus = shardResponse.swResponse.getStatus();
        if (!sendStatus.isOK()) {
            // Convert the error status back into the form of a command result and append it as the
            // raw response.
            BSONObjBuilder statusObjBob;
            CommandHelpers::appendCommandStatusNoThrow(statusObjBob, sendStatus);
            subobj.append(shardConnStr, statusObjBob.obj());

            errors.push_back(std::make_pair(shardConnStr, sendStatus));
            continue;
        }

        // Got a response from the shard.

        auto& resObj = shardResponse.swResponse.getValue().data;

        // Append the shard's raw response.
        subobj.append(shardConnStr, CommandHelpers::filterCommandReplyForPassthrough(resObj));

        auto commandStatus = getStatusFromCommandResult(resObj);
        if (!commandStatus.isOK()) {
            errors.push_back(std::make_pair(shardConnStr, std::move(commandStatus)));
        }

        // Report the first writeConcern error we see.
        if (!hasWCError && (wcErrorElem = resObj["writeConcernError"])) {
            wcErrorShardId = shardResponse.shardId;
            hasWCError = true;
        }
    }

    output->append("raw", subobj.done());

    if (hasWCError) {
        appendWriteConcernErrorToCmdResponse(wcErrorShardId, wcErrorElem, *output);
    }

    // If any shard returned success, filter out ignored errors
    bool someShardReturnedOK = (errors.size() != shardResponses.size());

    BSONObjBuilder errorBob;
    int commonErrCode = -1;
    auto it = errors.begin();
    while (it != errors.end()) {
        if (someShardReturnedOK && ignoredErrors.find(it->second.code()) != ignoredErrors.end()) {
            // Ignore the error.
            it = errors.erase(it);
        } else {
            errorBob.append(it->first, it->second.reason());
            if (commonErrCode == -1) {
                commonErrCode = it->second.code();
            } else if (commonErrCode != it->second.code()) {
                commonErrCode = 0;
            }
            ++it;
        }
    }
    BSONObj errobj = errorBob.obj();

    if (!errobj.isEmpty()) {
        *errmsg = errobj.toString();

        // If every error has a code, and the code for all errors is the same, then add
        // a top-level field "code" with this value to the output object.
        if (commonErrCode > 0) {
            output->append("code", commonErrCode);
            output->append("codeName", ErrorCodes::errorString(ErrorCodes::Error(commonErrCode)));
            if (errors.size() == 1) {
                // Only propagate extra info if there was a single error object.
                if (auto extraInfo = errors.begin()->second.extraInfo()) {
                    extraInfo->serialize(output);
                }
            }
        }
        return false;
    }
    return true;
}
Example #12
bool RunOnAllShardsCommand::run(OperationContext* txn,
                                const std::string& dbName,
                                BSONObj& cmdObj,
                                int options,
                                std::string& errmsg,
                                BSONObjBuilder& output) {
    LOG(1) << "RunOnAllShardsCommand db: " << dbName << " cmd:" << redact(cmdObj);

    if (_implicitCreateDb) {
        uassertStatusOK(ScopedShardDatabase::getOrCreate(txn, dbName));
    }

    std::vector<ShardId> shardIds;
    getShardIds(txn, dbName, cmdObj, shardIds);

    std::list<std::shared_ptr<Future::CommandResult>> futures;
    for (const ShardId& shardId : shardIds) {
        const auto shard = grid.shardRegistry()->getShard(txn, shardId);
        if (!shard) {
            continue;
        }

        futures.push_back(Future::spawnCommand(
            shard->getConnString().toString(), dbName, cmdObj, 0, NULL, _useShardConn));
    }

    std::vector<ShardAndReply> results;
    BSONObjBuilder subobj(output.subobjStart("raw"));
    BSONObjBuilder errors;
    int commonErrCode = -1;

    std::list<std::shared_ptr<Future::CommandResult>>::iterator futuresit;
    std::vector<ShardId>::const_iterator shardIdsIt;

    BSONElement wcErrorElem;
    ShardId wcErrorShardId;
    bool hasWCError = false;

    // We iterate over the set of shard ids and their corresponding futures in parallel.
    // TODO: replace with zip iterator if we ever decide to use one from Boost or elsewhere
    for (futuresit = futures.begin(), shardIdsIt = shardIds.cbegin();
         futuresit != futures.end() && shardIdsIt != shardIds.end();
         ++futuresit, ++shardIdsIt) {
        std::shared_ptr<Future::CommandResult> res = *futuresit;

        if (res->join(txn)) {
            // success :)
            BSONObj result = res->result();
            results.emplace_back(shardIdsIt->toString(), result);
            subobj.append(res->getServer(), result);

            if (!hasWCError) {
                if ((wcErrorElem = result["writeConcernError"])) {
                    wcErrorShardId = *shardIdsIt;
                    hasWCError = true;
                }
            }
            continue;
        }

        BSONObj result = res->result();

        if (!hasWCError) {
            if ((wcErrorElem = result["writeConcernError"])) {
                wcErrorShardId = *shardIdsIt;
                hasWCError = true;
            }
        }

        if (result["errmsg"].type() || result["code"].numberInt() != 0) {
            result = specialErrorHandler(res->getServer(), dbName, cmdObj, result);

            BSONElement errmsgObj = result["errmsg"];
            if (errmsgObj.eoo() || errmsgObj.String().empty()) {
                // it was fixed!
                results.emplace_back(shardIdsIt->toString(), result);
                subobj.append(res->getServer(), result);
                continue;
            }
        }

        // Handle "errmsg".
        if (!result["errmsg"].eoo()) {
            errors.appendAs(result["errmsg"], res->getServer());
        } else {
            // Can happen if message is empty, for some reason
            errors.append(res->getServer(),
                          str::stream() << "result without error message returned : " << result);
        }

        // Handle "code".
        int errCode = result["code"].numberInt();
        if (commonErrCode == -1) {
            commonErrCode = errCode;
        } else if (commonErrCode != errCode) {
            commonErrCode = 0;
        }
        results.emplace_back(shardIdsIt->toString(), result);
        subobj.append(res->getServer(), result);
    }

    subobj.done();

    if (hasWCError) {
        appendWriteConcernErrorToCmdResponse(wcErrorShardId, wcErrorElem, output);
    }

    BSONObj errobj = errors.done();

    if (!errobj.isEmpty()) {
        errmsg = errobj.toString();

        // If every error has a code, and the code for all errors is the same, then add
        // a top-level field "code" with this value to the output object.
        if (commonErrCode > 0) {
            output.append("code", commonErrCode);
        }

        return false;
    }

    aggregateResults(results, output);
    return true;
}
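Example #11 and Example #12 above fold per-shard error codes into a single top-level code with the same small state machine; here is a standalone sketch of it with made-up error codes.

// Hedged sketch: -1 = no error seen yet, 0 = shards disagree, anything else = the shared code.
int commonErrCode = -1;
for (int errCode : {11000, 11000, 11000}) {  // hypothetical per-shard codes
    if (commonErrCode == -1) {
        commonErrCode = errCode;
    } else if (commonErrCode != errCode) {
        commonErrCode = 0;
    }
}
// commonErrCode == 11000, so a top-level "code": 11000 would be appended to the output.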
Example #13
Status CatalogManagerReplicaSet::shardCollection(OperationContext* txn,
                                                 const string& ns,
                                                 const ShardKeyPattern& fieldsAndOrder,
                                                 bool unique,
                                                 const vector<BSONObj>& initPoints,
                                                 const set<ShardId>& initShardIds) {
    // Lock the collection globally so that no other mongos can try to shard or drop the collection
    // at the same time.
    auto scopedDistLock = getDistLockManager()->lock(ns, "shardCollection");
    if (!scopedDistLock.isOK()) {
        return scopedDistLock.getStatus();
    }

    auto status = getDatabase(txn, nsToDatabase(ns));
    if (!status.isOK()) {
        return status.getStatus();
    }

    ShardId dbPrimaryShardId = status.getValue().value.getPrimary();
    const auto primaryShard = grid.shardRegistry()->getShard(txn, dbPrimaryShardId);

    {
        // In 3.0 and prior we include this extra safety check that the collection is not getting
        // sharded concurrently by two different mongos instances. It is not 100%-proof, but it
        // reduces the chance that two invocations of shard collection will step on each other's
        // toes.  Now we take the distributed lock so going forward this check won't be necessary
        // but we leave it around for compatibility with other mongoses from 3.0.
        // TODO(spencer): Remove this after 3.2 ships.
        auto countStatus = _runCountCommandOnConfig(
            txn, NamespaceString(ChunkType::ConfigNS), BSON(ChunkType::ns(ns)));
        if (!countStatus.isOK()) {
            return countStatus.getStatus();
        }
        if (countStatus.getValue() > 0) {
            return Status(ErrorCodes::AlreadyInitialized,
                          str::stream() << "collection " << ns << " already sharded with "
                                        << countStatus.getValue() << " chunks.");
        }
    }

    // Record start in changelog
    {
        BSONObjBuilder collectionDetail;
        collectionDetail.append("shardKey", fieldsAndOrder.toBSON());
        collectionDetail.append("collection", ns);
        collectionDetail.append("primary", primaryShard->toString());

        {
            BSONArrayBuilder initialShards(collectionDetail.subarrayStart("initShards"));
            for (const ShardId& shardId : initShardIds) {
                initialShards.append(shardId);
            }
        }

        collectionDetail.append("numChunks", static_cast<int>(initPoints.size() + 1));

        logChange(txn,
                  txn->getClient()->clientAddress(true),
                  "shardCollection.start",
                  ns,
                  collectionDetail.obj());
    }

    shared_ptr<ChunkManager> manager(new ChunkManager(ns, fieldsAndOrder, unique));
    manager->createFirstChunks(txn, dbPrimaryShardId, &initPoints, &initShardIds);
    manager->loadExistingRanges(txn, nullptr);

    CollectionInfo collInfo;
    collInfo.useChunkManager(manager);
    collInfo.save(txn, ns);
    manager->reload(txn, true);

    // Tell the primary mongod to refresh its data
    // TODO:  Think the real fix here is for mongos to just
    //        assume that all collections are sharded, when we get there
    SetShardVersionRequest ssv = SetShardVersionRequest::makeForVersioningNoPersist(
        grid.shardRegistry()->getConfigServerConnectionString(),
        dbPrimaryShardId,
        primaryShard->getConnString(),
        NamespaceString(ns),
        manager->getVersion(),
        true);

    auto ssvStatus = grid.shardRegistry()->runCommandWithNotMasterRetries(
        txn, dbPrimaryShardId, "admin", ssv.toBSON());
    if (!ssvStatus.isOK()) {
        warning() << "could not update initial version of " << ns << " on shard primary "
                  << dbPrimaryShardId << ssvStatus.getStatus();
    }

    logChange(txn,
              txn->getClient()->clientAddress(true),
              "shardCollection",
              ns,
              BSON("version" << manager->getVersion().toString()));

    return Status::OK();
}