Example #1
0
void ClusterWriter::write(OperationContext* opCtx,
                          const BatchedCommandRequest& request,
                          BatchWriteExecStats* stats,
                          BatchedCommandResponse* response) {
    const NamespaceString& nss = request.getNS();

    LastError::Disabled disableLastError(&LastError::get(opCtx->getClient()));

    // Config writes and shard writes are done differently
    if (nss.db() == NamespaceString::kConfigDb || nss.db() == NamespaceString::kAdminDb) {
        Grid::get(opCtx)->catalogClient()->writeConfigServerDirect(opCtx, request, response);
    } else {
        TargeterStats targeterStats;

        {
            ChunkManagerTargeter targeter(request.getTargetingNS(), &targeterStats);

            Status targetInitStatus = targeter.init(opCtx);
            if (!targetInitStatus.isOK()) {
                toBatchError({targetInitStatus.code(),
                              str::stream() << "unable to target"
                                            << (request.isInsertIndexRequest() ? " index" : "")
                                            << " write op for collection "
                                            << request.getTargetingNS().ns()
                                            << causedBy(targetInitStatus)},
                             response);
                return;
            }

            BatchWriteExec::executeBatch(opCtx, targeter, request, response, stats);
        }

        splitIfNeeded(opCtx, request.getNS(), targeterStats);
    }
}
Example #2
0
    void ClusterWriter::shardWrite( const BatchedCommandRequest& request,
                                    BatchedCommandResponse* response ) {

        ChunkManagerTargeter targeter;
        Status targetInitStatus = targeter.init( NamespaceString( request.getTargetingNS() ) );

        if ( !targetInitStatus.isOK() ) {

            warning() << "could not initialize targeter for"
                      << ( request.isInsertIndexRequest() ? " index" : "" )
                      << " write op in collection " << request.getTargetingNS() << endl;

            // Errors will be reported in response if we are unable to target
        }

        DBClientShardResolver resolver;
        DBClientMultiCommand dispatcher;
        BatchWriteExec exec( &targeter, &resolver, &dispatcher );
        exec.executeBatch( request, response );

        if ( _autoSplit )
            splitIfNeeded( request.getNS(), *targeter.getStats() );

        _stats->setShardStats( exec.releaseStats() );
    }
Example #3
0
BatchedCommandResponse Shard::runBatchWriteCommandOnConfig(
    OperationContext* txn, const BatchedCommandRequest& batchRequest, RetryPolicy retryPolicy) {
    invariant(isConfig());

    const std::string dbname = batchRequest.getNS().db().toString();
    invariant(batchRequest.sizeWriteOps() == 1);

    const BSONObj cmdObj = batchRequest.toBSON();

    for (int retry = 1; retry <= kOnErrorNumRetries; ++retry) {
        auto response = _runCommand(txn,
                                    ReadPreferenceSetting{ReadPreference::PrimaryOnly},
                                    dbname,
                                    kDefaultConfigCommandTimeout,
                                    cmdObj);

        BatchedCommandResponse batchResponse;
        Status writeStatus =
            CommandResponse::processBatchWriteResponse(response.commandResponse, &batchResponse);

        if (!writeStatus.isOK() && response.host) {
            updateReplSetMonitor(response.host.get(), writeStatus);
        }

        if (retry < kOnErrorNumRetries && isRetriableError(writeStatus.code(), retryPolicy)) {
            LOG(2) << "Batch write command failed with retriable error and will be retried"
                   << causedBy(redact(writeStatus));
            continue;
        }

        return batchResponse;
    }
    MONGO_UNREACHABLE;
}
Example #4
0
BatchedCommandResponse Shard::runBatchWriteCommand(OperationContext* opCtx,
                                                   const Milliseconds maxTimeMS,
                                                   const BatchedCommandRequest& batchRequest,
                                                   RetryPolicy retryPolicy) {
    const std::string dbname = batchRequest.getNS().db().toString();

    const BSONObj cmdObj = batchRequest.toBSON();

    for (int retry = 1; retry <= kOnErrorNumRetries; ++retry) {
        // Note: write commands can only be issued against a primary.
        auto swResponse = _runCommand(
            opCtx, ReadPreferenceSetting{ReadPreference::PrimaryOnly}, dbname, maxTimeMS, cmdObj);

        BatchedCommandResponse batchResponse;
        auto writeStatus = CommandResponse::processBatchWriteResponse(swResponse, &batchResponse);
        if (retry < kOnErrorNumRetries && isRetriableError(writeStatus.code(), retryPolicy)) {
            LOG(2) << "Batch write command to " << getId()
                   << " failed with retriable error and will be retried"
                   << causedBy(redact(writeStatus));
            continue;
        }

        return batchResponse;
    }
    MONGO_UNREACHABLE;
}
Example #5
0
void Strategy::writeOp(OperationContext* txn, int op, Request& request) {
    // make sure we have a last error
    dassert(&LastError::get(cc()));

    OwnedPointerVector<BatchedCommandRequest> commandRequestsOwned;
    vector<BatchedCommandRequest*>& commandRequests = commandRequestsOwned.mutableVector();

    msgToBatchRequests(request.m(), &commandRequests);

    for (vector<BatchedCommandRequest*>::iterator it = commandRequests.begin();
         it != commandRequests.end();
         ++it) {
        // Multiple commands registered to last error as multiple requests
        if (it != commandRequests.begin())
            LastError::get(cc()).startRequest();

        BatchedCommandRequest* commandRequest = *it;

        // Adjust namespaces for command
        NamespaceString fullNS(commandRequest->getNS());
        string cmdNS = fullNS.getCommandNS();
        // We only pass in collection name to command
        commandRequest->setNS(fullNS);

        BSONObjBuilder builder;
        BSONObj requestBSON = commandRequest->toBSON();

        {
            // Disable the last error object for the duration of the write cmd
            LastError::Disabled disableLastError(&LastError::get(cc()));
            Command::runAgainstRegistered(txn, cmdNS.c_str(), requestBSON, builder, 0);
        }

        BatchedCommandResponse commandResponse;
        bool parsed = commandResponse.parseBSON(builder.done(), NULL);
        (void)parsed;  // for compile
        dassert(parsed && commandResponse.isValid(NULL));

        // Populate the lastError object based on the write response
        LastError::get(cc()).reset();
        bool hadError =
            batchErrorToLastError(*commandRequest, commandResponse, &LastError::get(cc()));

        // Check if this is an ordered batch and we had an error which should stop processing
        if (commandRequest->getOrdered() && hadError)
            break;
    }
}
    void clusterWrite( const BatchedCommandRequest& request,
                       BatchedCommandResponse* response,
                       bool autoSplit ) {

        // App-level validation of a create index insert
        if ( request.isInsertIndexRequest() ) {
            if ( request.sizeWriteOps() != 1 || request.isWriteConcernSet() ) {

                // Invalid request to create index
                response->setOk( false );
                response->setErrCode( ErrorCodes::InvalidOptions );
                response->setErrMessage( "invalid batch request for index creation" );

                dassert( response->isValid( NULL ) );
                return;
            }
        }

        // Config writes and shard writes are done differently
        string dbName = NamespaceString( request.getNS() ).db().toString();
        if ( dbName == "config" || dbName == "admin" ) {

            bool verboseWC = request.isVerboseWC();

            // We only support batch sizes of one and {w:0} write concern for config writes
            if ( request.sizeWriteOps() != 1 || ( verboseWC && request.isWriteConcernSet() ) ) {
                // Invalid config server write
                response->setOk( false );
                response->setErrCode( ErrorCodes::InvalidOptions );
                response->setErrMessage( "invalid batch request for config write" );

                dassert( response->isValid( NULL ) );
                return;
            }

            // We need to support "best-effort" writes for pings to the config server.
            // {w:0} (!verbose) writes are interpreted as best-effort in this case - they may still
            // error, but do not do the initial fsync check.
            configWrite( request, response, verboseWC );
        }
        else {
            shardWrite( request, response, autoSplit );
        }
    }
void CatalogManagerReplicaSet::writeConfigServerDirect(const BatchedCommandRequest& batchRequest,
                                                       BatchedCommandResponse* batchResponse) {
    std::string dbname = batchRequest.getNS().db().toString();
    invariant(dbname == "config" || dbname == "admin");
    const BSONObj cmdObj = batchRequest.toBSON();

    auto response = _runConfigServerCommandWithNotMasterRetries(dbname, cmdObj);
    if (!response.isOK()) {
        _toBatchError(response.getStatus(), batchResponse);
        return;
    }

    string errmsg;
    if (!batchResponse->parseBSON(response.getValue(), &errmsg)) {
        _toBatchError(Status(ErrorCodes::FailedToParse,
                             str::stream() << "Failed to parse config server response: " << errmsg),
                      batchResponse);
    }
}
Example #8
0
    // static
    Status WriteBatchExecutor::validateBatch( const BatchedCommandRequest& request ) {

        // Validate namespace
        const NamespaceString nss = NamespaceString( request.getNS() );
        if ( !nss.isValid() ) {
            return Status( ErrorCodes::InvalidNamespace,
                           nss.ns() + " is not a valid namespace" );
        }

        // Make sure we can write to the namespace
        Status allowedStatus = userAllowedWriteNS( nss );
        if ( !allowedStatus.isOK() ) {
            return allowedStatus;
        }

        // Validate insert index requests
        // TODO: Push insert index requests through createIndex once all upgrade paths support it
        string errMsg;
        if ( request.isInsertIndexRequest() && !request.isValidIndexRequest( &errMsg ) ) {
            return Status( ErrorCodes::InvalidOptions, errMsg );
        }

        return Status::OK();
    }
    void BatchSafeWriter::safeWriteBatch( DBClientBase* conn,
                                          const BatchedCommandRequest& request,
                                          BatchedCommandResponse* response ) {

        const NamespaceString nss( request.getNS() );

        // N starts at zero, and we add to it for each item
        response->setN( 0 );

        // GLE path always sets nModified to -1 (sentinel) to indicate we should omit it later.
        response->setNModified(-1);

        for ( size_t i = 0; i < request.sizeWriteOps(); ++i ) {

            // Break on first error if we're ordered
            if ( request.getOrdered() && response->isErrDetailsSet() )
                break;

            BatchItemRef itemRef( &request, static_cast<int>( i ) );

            BSONObj gleResult;
            GLEErrors errors;
            Status status = _safeWriter->safeWrite( conn,
                                                    itemRef,
                                                    WriteConcernOptions::Acknowledged,
                                                    &gleResult );

            if ( status.isOK() ) {
                status = extractGLEErrors( gleResult, &errors );
            }

            if ( !status.isOK() ) {
                response->clear();
                response->setOk( false );
                response->setErrCode( ErrorCodes::RemoteResultsUnavailable );
                
                StringBuilder builder;
                builder << "could not get write error from safe write";
                builder << causedBy( status.toString() );
                response->setErrMessage( builder.str() );
                return;
            }

            if ( errors.wcError.get() ) {
                response->setWriteConcernError( errors.wcError.release() );
            }

            //
            // STATS HANDLING
            //

            GLEStats stats;
            extractGLEStats( gleResult, &stats );

            // Special case for making legacy "n" field result for insert match the write
            // command result.
            if ( request.getBatchType() == BatchedCommandRequest::BatchType_Insert
                 && !errors.writeError.get() ) {
                // n is always 0 for legacy inserts.
                dassert( stats.n == 0 );
                stats.n = 1;
            }

            response->setN( response->getN() + stats.n );

            if ( !stats.upsertedId.isEmpty() ) {
                BatchedUpsertDetail* upsertedId = new BatchedUpsertDetail;
                upsertedId->setIndex( i );
                upsertedId->setUpsertedID( stats.upsertedId );
                response->addToUpsertDetails( upsertedId );
            }

            response->setLastOp( stats.lastOp );

            // Save write error
            if ( errors.writeError.get() ) {
                errors.writeError->setIndex( i );
                response->addToErrDetails( errors.writeError.release() );
            }
        }

        //
        // WRITE CONCERN ERROR HANDLING
        //

        // The last write is weird, since we enforce write concern and check the error through
        // the same GLE if possible.  If the last GLE was an error, the write concern may not
        // have been enforced in that same GLE, so we need to send another after resetting the
        // error.

        BSONObj writeConcern;
        if ( request.isWriteConcernSet() ) {
            writeConcern = request.getWriteConcern();
            // Pre-2.4.2 mongods react badly to 'w' being set on config servers
            if ( nss.db() == "config" )
                writeConcern = fixWCForConfig( writeConcern );
        }

        bool needToEnforceWC = WriteConcernOptions::Acknowledged.woCompare(writeConcern) != 0 &&
                WriteConcernOptions::Unacknowledged.woCompare(writeConcern) != 0;

        if ( needToEnforceWC &&
                ( !response->isErrDetailsSet() ||
                        ( !request.getOrdered() &&
                                // Not all errored. Note: implicit response->isErrDetailsSet().
                                response->sizeErrDetails() < request.sizeWriteOps() ))) {

            // Might have gotten a write concern validity error earlier, these are
            // enforced even if the wc isn't applied, so we ignore.
            response->unsetWriteConcernError();

            const string dbName( nss.db().toString() );

            Status status( Status::OK() );

            if ( response->isErrDetailsSet() ) {
                const WriteErrorDetail* lastError = response->getErrDetails().back();

                // If last write op was an error.
                if ( lastError->getIndex() == static_cast<int>( request.sizeWriteOps() - 1 )) {
                    // Reset previous errors so we can apply the write concern no matter what
                    // as long as it is valid.
                    status = _safeWriter->clearErrors( conn, dbName );
                }
            }

            BSONObj gleResult;
            if ( status.isOK() ) {
                status = _safeWriter->enforceWriteConcern( conn,
                                                           dbName,
                                                           writeConcern,
                                                           &gleResult );
            }

            GLEErrors errors;
            if ( status.isOK() ) {
                status = extractGLEErrors( gleResult, &errors );
            }
            
            if ( !status.isOK() ) {
                auto_ptr<WCErrorDetail> wcError( new WCErrorDetail );
                wcError->setErrCode( status.code() );
                wcError->setErrMessage( status.reason() );
                response->setWriteConcernError( wcError.release() ); 
            }
            else if ( errors.wcError.get() ) {
                response->setWriteConcernError( errors.wcError.release() );
            }
        }

        response->setOk( true );
        dassert( response->isValid( NULL ) );
    }
Example #10
0
    void WriteBatchExecutor::executeBatch( const BatchedCommandRequest& request,
                                           BatchedCommandResponse* response ) {

        // Validate namespace
        const NamespaceString nss = NamespaceString( request.getNS() );
        if ( !nss.isValid() ) {
            toBatchError( Status( ErrorCodes::InvalidNamespace,
                                  nss.ns() + " is not a valid namespace" ),
                          response );
            return;
        }

        // Make sure we can write to the namespace
        Status allowedStatus = userAllowedWriteNS( nss );
        if ( !allowedStatus.isOK() ) {
            toBatchError( allowedStatus, response );
            return;
        }

        // Validate insert index requests
        // TODO: Push insert index requests through createIndex once all upgrade paths support it
        string errMsg;
        if ( request.isInsertIndexRequest() && !request.isValidIndexRequest( &errMsg ) ) {
            toBatchError( Status( ErrorCodes::InvalidOptions, errMsg ), response );
            return;
        }

        // Validate write concern
        // TODO: Lift write concern parsing out of this entirely
        WriteConcernOptions writeConcern;

        BSONObj wcDoc;
        if ( request.isWriteConcernSet() ) {
            wcDoc = request.getWriteConcern();
        }

        Status wcStatus = Status::OK();
        if ( wcDoc.isEmpty() ) {

            // The default write concern if empty is w : 1
            // Specifying w : 0 is/was allowed, but is interpreted identically to w : 1

            wcStatus = writeConcern.parse(
                _defaultWriteConcern.isEmpty() ?
                    WriteConcernOptions::Acknowledged : _defaultWriteConcern );

            if ( writeConcern.wNumNodes == 0 && writeConcern.wMode.empty() ) {
                writeConcern.wNumNodes = 1;
            }
        }
        else {
            wcStatus = writeConcern.parse( wcDoc );
        }

        if ( wcStatus.isOK() ) {
            wcStatus = validateWriteConcern( writeConcern );
        }

        if ( !wcStatus.isOK() ) {
            toBatchError( wcStatus, response );
            return;
        }

        if ( request.sizeWriteOps() == 0u ) {
            toBatchError( Status( ErrorCodes::InvalidLength,
                                  "no write ops were included in the batch" ),
                          response );
            return;
        }

        // Validate batch size
        if ( request.sizeWriteOps() > BatchedCommandRequest::kMaxWriteBatchSize ) {
            toBatchError( Status( ErrorCodes::InvalidLength,
                                  stream() << "exceeded maximum write batch size of "
                                           << BatchedCommandRequest::kMaxWriteBatchSize ),
                          response );
            return;
        }

        //
        // End validation
        //

        bool silentWC = writeConcern.wMode.empty() && writeConcern.wNumNodes == 0
                        && writeConcern.syncMode == WriteConcernOptions::NONE;

        Timer commandTimer;

        OwnedPointerVector<WriteErrorDetail> writeErrorsOwned;
        vector<WriteErrorDetail*>& writeErrors = writeErrorsOwned.mutableVector();

        OwnedPointerVector<BatchedUpsertDetail> upsertedOwned;
        vector<BatchedUpsertDetail*>& upserted = upsertedOwned.mutableVector();

        //
        // Apply each batch item, possibly bulking some items together in the write lock.
        // Stops on error if batch is ordered.
        //

        bulkExecute( request, &upserted, &writeErrors );

        //
        // Try to enforce the write concern if everything succeeded (unordered or ordered)
        // OR if something succeeded and we're unordered.
        //

        auto_ptr<WCErrorDetail> wcError;
        bool needToEnforceWC = writeErrors.empty()
                               || ( !request.getOrdered()
                                    && writeErrors.size() < request.sizeWriteOps() );

        if ( needToEnforceWC ) {

            _client->curop()->setMessage( "waiting for write concern" );

            WriteConcernResult res;
            Status status = waitForWriteConcern( _txn, writeConcern, _client->getLastOp(), &res );

            if ( !status.isOK() ) {
                wcError.reset( toWriteConcernError( status, res ) );
            }
        }

        //
        // Refresh metadata if needed
        //

        bool staleBatch = !writeErrors.empty()
                          && writeErrors.back()->getErrCode() == ErrorCodes::StaleShardVersion;

        if ( staleBatch ) {

            const BatchedRequestMetadata* requestMetadata = request.getMetadata();
            dassert( requestMetadata );

            // Make sure our shard name is set or is the same as what was set previously
            if ( shardingState.setShardName( requestMetadata->getShardName() ) ) {

                //
                // First, we refresh metadata if we need to based on the requested version.
                //

                ChunkVersion latestShardVersion;
                shardingState.refreshMetadataIfNeeded( request.getTargetingNS(),
                                                       requestMetadata->getShardVersion(),
                                                       &latestShardVersion );

                // Report if we're still changing our metadata
                // TODO: Better reporting per-collection
                if ( shardingState.inCriticalMigrateSection() ) {
                    noteInCriticalSection( writeErrors.back() );
                }

                if ( queueForMigrationCommit ) {

                    //
                    // Queue up for migration to end - this allows us to be sure that clients will
                    // not repeatedly try to refresh metadata that is not yet written to the config
                    // server.  Not necessary for correctness.
                    // Exposed as optional parameter to allow testing of queuing behavior with
                    // different network timings.
                    //

                    const ChunkVersion& requestShardVersion = requestMetadata->getShardVersion();

                    //
                    // Only wait if we're an older version (in the current collection epoch) and
                    // we're not write compatible, implying that the current migration is affecting
                    // writes.
                    //

                    if ( requestShardVersion.isOlderThan( latestShardVersion ) &&
                         !requestShardVersion.isWriteCompatibleWith( latestShardVersion ) ) {

                        while ( shardingState.inCriticalMigrateSection() ) {

                            log() << "write request to old shard version "
                                  << requestMetadata->getShardVersion().toString()
                                  << " waiting for migration commit" << endl;

                            shardingState.waitTillNotInCriticalSection( 10 /* secs */);
                        }
                    }
                }
            }
            else {
                // If our shard name is stale, our version must have been stale as well
                dassert( writeErrors.size() == request.sizeWriteOps() );
            }
        }

        //
        // Construct response
        //

        response->setOk( true );

        if ( !silentWC ) {

            if ( upserted.size() ) {
                response->setUpsertDetails( upserted );
            }

            if ( writeErrors.size() ) {
                response->setErrDetails( writeErrors );
            }

            if ( wcError.get() ) {
                response->setWriteConcernError( wcError.release() );
            }

            const repl::ReplicationCoordinator::Mode replMode =
                    repl::getGlobalReplicationCoordinator()->getReplicationMode();
            if (replMode != repl::ReplicationCoordinator::modeNone) {
                response->setLastOp( _client->getLastOp() );
                if (replMode == repl::ReplicationCoordinator::modeReplSet) {
                    response->setElectionId(repl::theReplSet->getElectionId());
                }
            }

            // Set the stats for the response
            response->setN( _stats->numInserted + _stats->numUpserted + _stats->numMatched
                            + _stats->numDeleted );
            if ( request.getBatchType() == BatchedCommandRequest::BatchType_Update )
                response->setNModified( _stats->numModified );
        }

        dassert( response->isValid( NULL ) );
    }
Example #11
0
    void BatchWriteExec::executeBatch( const BatchedCommandRequest& clientRequest,
                                       BatchedCommandResponse* clientResponse ) {

        BatchWriteOp batchOp;
        batchOp.initClientRequest( &clientRequest );

        // Current batch status
        bool refreshedTargeter = false;
        int rounds = 0;
        int numCompletedOps = 0;
        int numRoundsWithoutProgress = 0;

        while ( !batchOp.isFinished() ) {

            //
            // Get child batches to send using the targeter
            //
            // Targeting errors can be caused by remote metadata changing (the collection could have
            // been dropped and recreated, for example with a new shard key).  If a remote metadata
            // change occurs *before* a client sends us a batch, we need to make sure that we don't
            // error out just because we're staler than the client - otherwise mongos will be have
            // unpredictable behavior.
            //
            // (If a metadata change happens *during* or *after* a client sends us a batch, however,
            // we make no guarantees about delivery.)
            //
            // For this reason, we don't record targeting errors until we've refreshed our targeting
            // metadata at least once *after* receiving the client batch - at that point, we know:
            //
            // 1) our new metadata is the same as the metadata when the client sent a batch, and so
            //    targeting errors are real.
            // OR
            // 2) our new metadata is a newer version than when the client sent a batch, and so
            //    the metadata must have changed after the client batch was sent.  We don't need to
            //    deliver in this case, since for all the client knows we may have gotten the batch
            //    exactly when the metadata changed.
            //

            vector<TargetedWriteBatch*> childBatches;

            // If we've already had a targeting error, we've refreshed the metadata once and can
            // record target errors definitively.
            bool recordTargetErrors = refreshedTargeter;
            Status targetStatus = batchOp.targetBatch( *_targeter,
                                                       recordTargetErrors,
                                                       &childBatches );
            if ( !targetStatus.isOK() ) {
                // Don't do anything until a targeter refresh
                _targeter->noteCouldNotTarget();
                refreshedTargeter = true;
                ++_stats->numTargetErrors;
                dassert( childBatches.size() == 0u );
            }

            //
            // Send all child batches
            //

            size_t numSent = 0;
            size_t numToSend = childBatches.size();
            bool remoteMetadataChanging = false;
            while ( numSent != numToSend ) {

                // Collect batches out on the network, mapped by endpoint
                HostBatchMap pendingBatches;

                //
                // Send side
                //

                // Get as many batches as we can at once
                for ( vector<TargetedWriteBatch*>::iterator it = childBatches.begin();
                    it != childBatches.end(); ++it ) {

                    //
                    // Collect the info needed to dispatch our targeted batch
                    //

                    TargetedWriteBatch* nextBatch = *it;
                    // If the batch is NULL, we sent it previously, so skip
                    if ( nextBatch == NULL ) continue;

                    // Figure out what host we need to dispatch our targeted batch
                    ConnectionString shardHost;
                    Status resolveStatus = _resolver->chooseWriteHost( nextBatch->getEndpoint()
                                                                           .shardName,
                                                                       &shardHost );
                    if ( !resolveStatus.isOK() ) {

                        ++_stats->numResolveErrors;

                        // Record a resolve failure
                        // TODO: It may be necessary to refresh the cache if stale, or maybe just
                        // cancel and retarget the batch
                        WriteErrorDetail error;
                        buildErrorFrom( resolveStatus, &error );
                        batchOp.noteBatchError( *nextBatch, error );

                        // We're done with this batch
                        *it = NULL;
                        --numToSend;
                        continue;
                    }

                    // If we already have a batch for this host, wait until the next time
                    HostBatchMap::iterator pendingIt = pendingBatches.find( shardHost );
                    if ( pendingIt != pendingBatches.end() ) continue;

                    //
                    // We now have all the info needed to dispatch the batch
                    //

                    BatchedCommandRequest request( clientRequest.getBatchType() );
                    batchOp.buildBatchRequest( *nextBatch, &request );

                    // Internally we use full namespaces for request/response, but we send the
                    // command to a database with the collection name in the request.
                    NamespaceString nss( request.getNS() );
                    request.setNS( nss.coll() );

                    _dispatcher->addCommand( shardHost, nss.db(), request );

                    // Indicate we're done by setting the batch to NULL
                    // We'll only get duplicate hostEndpoints if we have broadcast and non-broadcast
                    // endpoints for the same host, so this should be pretty efficient without
                    // moving stuff around.
                    *it = NULL;

                    // Recv-side is responsible for cleaning up the nextBatch when used
                    pendingBatches.insert( make_pair( shardHost, nextBatch ) );
                }

                // Send them all out
                _dispatcher->sendAll();
                numSent += pendingBatches.size();

                //
                // Recv side
                //

                while ( _dispatcher->numPending() > 0 ) {

                    // Get the response
                    ConnectionString shardHost;
                    BatchedCommandResponse response;
                    Status dispatchStatus = _dispatcher->recvAny( &shardHost, &response );

                    // Get the TargetedWriteBatch to find where to put the response
                    dassert( pendingBatches.find( shardHost ) != pendingBatches.end() );
                    TargetedWriteBatch* batchRaw = pendingBatches.find( shardHost )->second;
                    scoped_ptr<TargetedWriteBatch> batch( batchRaw );

                    if ( dispatchStatus.isOK() ) {

                        TrackedErrors trackedErrors;
                        trackedErrors.startTracking( ErrorCodes::StaleShardVersion );

                        // Dispatch was ok, note response
                        batchOp.noteBatchResponse( *batch, response, &trackedErrors );

                        // Note if anything was stale
                        const vector<ShardError*>& staleErrors =
                            trackedErrors.getErrors( ErrorCodes::StaleShardVersion );

                        if ( staleErrors.size() > 0 ) {
                            noteStaleResponses( staleErrors, _targeter );
                            ++_stats->numStaleBatches;
                        }

                        // Remember if the shard is actively changing metadata right now
                        if ( isShardMetadataChanging( staleErrors ) ) {
                            remoteMetadataChanging = true;
                        }

                        // Remember that we successfully wrote to this shard
                        // NOTE: This will record lastOps for shards where we actually didn't update
                        // or delete any documents, which preserves old behavior but is conservative
                        _stats->noteWriteAt( shardHost,
                                             response.isLastOpSet() ? 
                                             response.getLastOp() : OpTime(),
                                             response.isElectionIdSet() ?
                                             response.getElectionId() : OID());
                    }
                    else {

                        // Error occurred dispatching, note it
                        WriteErrorDetail error;
                        buildErrorFrom( dispatchStatus, &error );
                        batchOp.noteBatchError( *batch, error );
                    }
                }
            }

            ++rounds;
            ++_stats->numRounds;

            // If we're done, get out
            if ( batchOp.isFinished() )
                break;

            // MORE WORK TO DO

            //
            // Refresh the targeter if we need to (no-op if nothing stale)
            //

            bool targeterChanged = false;
            Status refreshStatus = _targeter->refreshIfNeeded( &targeterChanged );

            if ( !refreshStatus.isOK() ) {

                // It's okay if we can't refresh, we'll just record errors for the ops if
                // needed.
                warning() << "could not refresh targeter" << causedBy( refreshStatus.reason() )
                          << endl;
            }

            //
            // Ensure progress is being made toward completing the batch op
            //

            int currCompletedOps = batchOp.numWriteOpsIn( WriteOpState_Completed );
            if ( currCompletedOps == numCompletedOps && !targeterChanged
                 && !remoteMetadataChanging ) {
                ++numRoundsWithoutProgress;
            }
            else {
                numRoundsWithoutProgress = 0;
            }
            numCompletedOps = currCompletedOps;

            if ( numRoundsWithoutProgress > kMaxRoundsWithoutProgress ) {

                stringstream msg;
                msg << "no progress was made executing batch write op in " << clientRequest.getNS()
                    << " after " << kMaxRoundsWithoutProgress << " rounds (" << numCompletedOps
                    << " ops completed in " << rounds << " rounds total)";

                WriteErrorDetail error;
                buildErrorFrom( Status( ErrorCodes::NoProgressMade, msg.str() ), &error );
                batchOp.setBatchError( error );
                break;
            }
        }

        batchOp.buildClientResponse( clientResponse );
    }
Example #12
0
void WriteBatchExecutor::execInserts( const BatchedCommandRequest& request,
                                      std::vector<WriteErrorDetail*>* errors ) {

    // Bulk insert is a bit different from other bulk operations in that multiple request docs
    // can be processed at once inside the write lock.

    const NamespaceString nss( request.getTargetingNS() );
    scoped_ptr<BatchItemRef> currInsertItem( new BatchItemRef( &request, 0 ) );

    // Go through our request and do some preprocessing on insert documents outside the lock to
    // validate and put them in a normalized form - i.e. put _id in front and fill in
    // timestamps.  The insert document may also be invalid.
    // TODO:  Might be more efficient to do in batches.
    vector<StatusWith<BSONObj> > normalInserts;
    normalizeInserts( request, &normalInserts );

    while ( currInsertItem->getItemIndex() < static_cast<int>( request.sizeWriteOps() ) ) {

        WriteOpResult currResult;

        // Don't (re-)acquire locks and create database until it's necessary
        if ( !normalInserts[currInsertItem->getItemIndex()].isOK() ) {
            currResult.error =
                toWriteError( normalInserts[currInsertItem->getItemIndex()].getStatus() );
        }
        else {

            PageFaultRetryableSection pFaultSection;

            ////////////////////////////////////
            Lock::DBWrite writeLock( nss.ns() );
            ////////////////////////////////////

            // Check version inside of write lock

            if ( checkIsMasterForCollection( nss, &currResult.error )
                    && checkShardVersion( &shardingState, request, &currResult.error )
                    && checkIndexConstraints( &shardingState, request, &currResult.error ) ) {

                //
                // Get the collection for the insert
                //

                scoped_ptr<Client::Context> writeContext;
                Collection* collection = NULL;

                try {
                    // Context once we're locked, to set more details in currentOp()
                    // TODO: better constructor?
                    writeContext.reset( new Client::Context( request.getNS(),
                                        storageGlobalParams.dbpath,
                                        false /* don't check version */) );

                    Database* database = writeContext->db();
                    dassert( database );
                    collection = database->getCollection( nss.ns() );

                    if ( !collection ) {
                        // Implicitly create if it doesn't exist
                        collection = database->createCollection( nss.ns() );
                        if ( !collection ) {
                            currResult.error =
                                toWriteError( Status( ErrorCodes::InternalError,
                                                      "could not create collection" ) );
                        }
                    }
                }
                catch ( const DBException& ex ) {
                    Status status(ex.toStatus());
                    if (ErrorCodes::isInterruption(status.code())) {
                        throw;
                    }
                    currResult.error = toWriteError(status);
                }

                //
                // Perform writes inside write lock
                //

                while ( collection
                        && currInsertItem->getItemIndex()
                        < static_cast<int>( request.sizeWriteOps() ) ) {

                    //
                    // BEGIN CURRENT OP
                    //

                    scoped_ptr<CurOp> currentOp( beginCurrentOp( _client, *currInsertItem ) );
                    incOpStats( *currInsertItem );

                    // Get the actual document we want to write, assuming it's valid
                    const StatusWith<BSONObj>& normalInsert = //
                        normalInserts[currInsertItem->getItemIndex()];

                    const BSONObj& normalInsertDoc =
                        normalInsert.getValue().isEmpty() ?
                        currInsertItem->getDocument() : normalInsert.getValue();

                    if ( !normalInsert.isOK() ) {
                        // This insert failed on preprocessing
                        currResult.error = toWriteError( normalInsert.getStatus() );
                    }
                    else if ( !request.isInsertIndexRequest() ) {
                        // Try the insert
                        singleInsert( *currInsertItem,
                                      normalInsertDoc,
                                      collection,
                                      &currResult );
                    }
                    else {
                        // Try the create index
                        singleCreateIndex( *currInsertItem,
                                           normalInsertDoc,
                                           collection,
                                           &currResult );
                    }

                    //
                    // END CURRENT OP
                    //

                    finishCurrentOp( _client, currentOp.get(), currResult.error );

                    // Faults release the write lock
                    if ( currResult.fault )
                        break;

                    // In general, we might have stats and errors
                    incWriteStats( *currInsertItem,
                                   currResult.stats,
                                   currResult.error,
                                   currentOp.get() );

                    // Errors release the write lock
                    if ( currResult.error )
                        break;

                    // Increment in the write lock and reset the stats for next time
                    currInsertItem.reset( new BatchItemRef( &request,
                                                            currInsertItem->getItemIndex()
                                                            + 1 ) );
                    currResult.reset();

                    // Destruct curop so that our parent curop is restored, so that we
                    // record the yield count in the parent.
                    currentOp.reset(NULL);

                    // yield sometimes
                    int micros = ClientCursor::suggestYieldMicros();
                    if (micros > 0) {
                        ClientCursor::staticYield(micros, "", NULL);
                    }
                }
            }

        } // END WRITE LOCK

        //
        // Store the current error if it exists
        //

        if ( currResult.error ) {

            errors->push_back( currResult.releaseError() );
            errors->back()->setIndex( currInsertItem->getItemIndex() );

            // Break early for ordered batches
            if ( request.getOrdered() )
                break;
        }

        //
        // Fault or increment
        //

        if ( currResult.fault ) {
            // Check page fault out of lock
            currResult.fault->touch();
        }
        else {
            // Increment if not a fault
            currInsertItem.reset( new BatchItemRef( &request,
                                                    currInsertItem->getItemIndex() + 1 ) );
        }
    }

}
Example #13
0
    /**
     * The core config write functionality.
     *
     * Config writes run in two passes - the first is a quick check to ensure the config servers
     * are all reachable, the second runs the actual write.
     *
     * TODO: Upgrade and move this logic to the config servers, a state machine implementation
     * is probably the next step.
     */
    void ConfigCoordinator::executeBatch( const BatchedCommandRequest& clientRequest,
                                          BatchedCommandResponse* clientResponse,
                                          bool fsyncCheck ) {

        NamespaceString nss( clientRequest.getNS() );
        dassert( nss.db() == "config" || nss.db() == "admin" );
        dassert( clientRequest.sizeWriteOps() == 1u );

        if ( fsyncCheck ) {

            //
            // Sanity check that all configs are still reachable using fsync, preserving legacy
            // behavior
            //

            OwnedPointerVector<ConfigFsyncResponse> fsyncResponsesOwned;
            vector<ConfigFsyncResponse*>& fsyncResponses = fsyncResponsesOwned.mutableVector();

            //
            // Send side
            //

            for ( vector<ConnectionString>::iterator it = _configHosts.begin();
                it != _configHosts.end(); ++it ) {
                ConnectionString& configHost = *it;
                FsyncRequest fsyncRequest;
                _dispatcher->addCommand( configHost, "admin", fsyncRequest );
            }

            _dispatcher->sendAll();

            //
            // Recv side
            //

            bool fsyncError = false;
            while ( _dispatcher->numPending() > 0 ) {

                fsyncResponses.push_back( new ConfigFsyncResponse() );
                ConfigFsyncResponse& fsyncResponse = *fsyncResponses.back();
                Status dispatchStatus = _dispatcher->recvAny( &fsyncResponse.configHost,
                                                              &fsyncResponse.response );

                // We've got to recv everything, no matter what
                if ( !dispatchStatus.isOK() ) {
                    fsyncError = true;
                    buildFsyncErrorFrom( dispatchStatus, &fsyncResponse.response );
                }
                else if ( !fsyncResponse.response.getOk() ) {
                    fsyncError = true;
                }
            }

            if ( fsyncError ) {
                combineFsyncErrors( fsyncResponses, clientResponse );
                return;
            }
            else {
                fsyncResponsesOwned.clear();
            }
        }

        //
        // Do the actual writes
        //

        BatchedCommandRequest configRequest( clientRequest.getBatchType() );
        clientRequest.cloneTo( &configRequest );
        configRequest.setNS( nss.coll() );

        OwnedPointerVector<ConfigResponse> responsesOwned;
        vector<ConfigResponse*>& responses = responsesOwned.mutableVector();

        //
        // Send the actual config writes
        //

        // Get as many batches as we can at once
        for ( vector<ConnectionString>::iterator it = _configHosts.begin();
            it != _configHosts.end(); ++it ) {
            ConnectionString& configHost = *it;
            _dispatcher->addCommand( configHost, nss.db(), configRequest );
        }

        // Send them all out
        _dispatcher->sendAll();

        //
        // Recv side
        //

        while ( _dispatcher->numPending() > 0 ) {

            // Get the response
            responses.push_back( new ConfigResponse() );
            ConfigResponse& configResponse = *responses.back();
            Status dispatchStatus = _dispatcher->recvAny( &configResponse.configHost,
                                                          &configResponse.response );

            if ( !dispatchStatus.isOK() ) {
                buildErrorFrom( dispatchStatus, &configResponse.response );
            }
        }

        combineResponses( responses, clientResponse );
    }
Example #14
0
bool WriteBatchExecutor::applyWriteItem( const BatchedCommandRequest& request,
        int index,
        WriteStats* stats,
        BatchedErrorDetail* error ) {
    const string& ns = request.getNS();

    // Clear operation's LastError before starting.
    _le->reset( true );

    //uint64_t itemTimeMicros = 0;
    bool opSuccess = true;

    // Each write operation executes in its own PageFaultRetryableSection.  This means that
    // a single batch can throw multiple PageFaultException's, which is not the case for
    // other operations.
    PageFaultRetryableSection s;
    while ( true ) {
        try {
            // Execute the write item as a child operation of the current operation.
            CurOp childOp( _client, _client->curop() );

            // TODO Modify CurOp "wrapped" constructor to take an opcode, so calling .reset()
            // is unneeded
            childOp.reset( _client->getRemote(), getOpCode( request.getBatchType() ) );

            childOp.ensureStarted();
            OpDebug& opDebug = childOp.debug();
            opDebug.ns = ns;
            {
                Client::WriteContext ctx( ns );

                switch ( request.getBatchType() ) {
                case BatchedCommandRequest::BatchType_Insert:
                    opSuccess =
                        applyInsert( ns,
                                     request.getInsertRequest()->getDocumentsAt( index ),
                                     &childOp,
                                     stats,
                                     error );
                    break;
                case BatchedCommandRequest::BatchType_Update:
                    opSuccess = applyUpdate( ns,
                                             *request.getUpdateRequest()->getUpdatesAt( index ),
                                             &childOp,
                                             stats,
                                             error );
                    break;
                default:
                    dassert( request.getBatchType() ==
                             BatchedCommandRequest::BatchType_Delete );
                    opSuccess = applyDelete( ns,
                                             *request.getDeleteRequest()->getDeletesAt( index ),
                                             &childOp,
                                             stats,
                                             error );
                    break;
                }
            }
            childOp.done();
            //itemTimeMicros = childOp.totalTimeMicros();

            opDebug.executionTime = childOp.totalTimeMillis();
            opDebug.recordStats();

            // Log operation if running with at least "-v", or if exceeds slow threshold.
            if ( logger::globalLogDomain()->shouldLog( logger::LogSeverity::Debug( 1 ) )
                    || opDebug.executionTime > cmdLine.slowMS + childOp.getExpectedLatencyMs() ) {

                MONGO_TLOG(1) << opDebug.report( childOp ) << endl;
            }

            // TODO Log operation if logLevel >= 3 and assertion thrown (as assembleResponse()
            // does).

            // Save operation to system.profile if shouldDBProfile().
            if ( childOp.shouldDBProfile( opDebug.executionTime ) ) {
                profile( *_client, getOpCode( request.getBatchType() ), childOp );
            }
            break;
        }
        catch ( PageFaultException& e ) {
            e.touch();
        }
    }

    return opSuccess;
}
Example #15
0
    /**
     * The core config write functionality.
     *
     * Config writes run in two passes - the first is a quick check to ensure the config servers
     * are all reachable, the second runs the actual write.
     *
     * TODO: Upgrade and move this logic to the config servers, a state machine implementation
     * is probably the next step.
     */
    void ConfigCoordinator::executeBatch(const BatchedCommandRequest& clientRequest,
                                         BatchedCommandResponse* clientResponse) {

        const NamespaceString nss(clientRequest.getNS());

        // Should never use it for anything other than DBs residing on the config server
        dassert(nss.db() == "config" || nss.db() == "admin");
        dassert(clientRequest.sizeWriteOps() == 1u);

        // This is an opportunistic check that all config servers look healthy by calling
        // getLastError on each one of them. If there was some form of write/journaling error, get
        // last error would fail.
        {
            for (vector<ConnectionString>::iterator it = _configHosts.begin();
                 it != _configHosts.end();
                 ++it) {

                _dispatcher->addCommand(*it,
                                        "admin",
                                        RawBSONSerializable(BSON("getLastError" << true <<
                                                                 "fsync" << true)));
            }

            _dispatcher->sendAll();

            bool error = false;
            while (_dispatcher->numPending()) {
                ConnectionString host;
                RawBSONSerializable response;

                Status status = _dispatcher->recvAny(&host, &response);
                if (status.isOK()) {
                    BSONObj obj = response.toBSON();

                    LOG(3) << "Response " << obj.toString();

                    // If the ok field is anything other than 1, count it as error
                    if (!obj["ok"].trueValue()) {
                        error = true;
                        log() << "Config server check for host " << host
                              << " returned error: " << response;
                    }
                }
                else {
                    error = true;
                    log() << "Config server check for host " << host
                          << " failed with status: " << status;
                }
            }

            // All responses should have been gathered by this point
            if (error) {
                clientResponse->setOk(false);
                clientResponse->setErrCode(ErrorCodes::RemoteValidationError);
                clientResponse->setErrMessage("Could not verify that config servers were active"
                                              " and reachable before write");
                return;
            }
        }

        if (!_checkConfigString(clientResponse)) {
            return;
        }

        //
        // Do the actual writes
        //

        BatchedCommandRequest configRequest( clientRequest.getBatchType() );
        clientRequest.cloneTo( &configRequest );
        configRequest.setNS( nss.coll() );

        OwnedPointerVector<ConfigResponse> responsesOwned;
        vector<ConfigResponse*>& responses = responsesOwned.mutableVector();

        //
        // Send the actual config writes
        //

        // Get as many batches as we can at once
        for (vector<ConnectionString>::const_iterator it = _configHosts.begin();
             it != _configHosts.end();
             ++it) {

            const ConnectionString& configHost = *it;
            _dispatcher->addCommand(configHost, nss.db(), configRequest);
        }

        // Send them all out
        _dispatcher->sendAll();

        //
        // Recv side
        //

        while (_dispatcher->numPending() > 0) {
            // Get the response
            responses.push_back(new ConfigResponse());

            ConfigResponse& configResponse = *responses.back();
            Status dispatchStatus = _dispatcher->recvAny(&configResponse.configHost,
                                                         &configResponse.response);

            if (!dispatchStatus.isOK()) {
                buildErrorFrom(dispatchStatus, &configResponse.response);
            }
        }

        combineResponses(responses, clientResponse);
    }
Example #16
0
    void ClusterWriter::write( const BatchedCommandRequest& request,
                               BatchedCommandResponse* response ) {

        const NamespaceString nss = NamespaceString( request.getNS() );
        if ( !nss.isValid() ) {
            toBatchError( Status( ErrorCodes::InvalidNamespace,
                                  nss.ns() + " is not a valid namespace" ),
                          response );
            return;
        }

        if ( !NamespaceString::validCollectionName( nss.coll() ) ) {
            toBatchError( Status( ErrorCodes::BadValue,
                                  str::stream() << "invalid collection name " << nss.coll() ),
                          response );
            return;
        }

        if ( request.sizeWriteOps() > BatchedCommandRequest::kMaxWriteBatchSize ) {
            toBatchError( Status( ErrorCodes::FailedToParse,
                                  str::stream() << "exceeded maximum write batch size of "
                                                << BatchedCommandRequest::kMaxWriteBatchSize ),
                          response );
            return;
        }

        string errMsg;
        if ( request.isInsertIndexRequest() && !request.isValidIndexRequest( &errMsg ) ) {
            toBatchError( Status( ErrorCodes::InvalidOptions, errMsg ), response );
            return;
        }

        // Config writes and shard writes are done differently
        string dbName = nss.db().toString();
        if ( dbName == "config" || dbName == "admin" ) {

            bool verboseWC = request.isVerboseWC();

            // We only support batch sizes of one for config writes
            if ( request.sizeWriteOps() != 1 ) {
                toBatchError( Status( ErrorCodes::InvalidOptions,
                                      mongoutils::str::stream() << "Writes to config servers must "
                                              "have batch size of 1, found "
                                              << request.sizeWriteOps() ),
                              response );
                return;
            }

            // We only support {w: 0}, {w: 1}, and {w: 'majority'} write concern for config writes
            if ( request.isWriteConcernSet() && !validConfigWC( request.getWriteConcern() )) {
                toBatchError( Status( ErrorCodes::InvalidOptions,
                                      mongoutils::str::stream() << "Invalid write concern for write"
				              " to config servers: " << request.getWriteConcern() ),
                              response );
                return;
            }

            // We need to support "best-effort" writes for pings to the config server.
            // {w:0} (!verbose) writes are interpreted as best-effort in this case - they may still
            // error, but do not do the initial fsync check.
            configWrite( request, response, verboseWC );
        }
        else {
            shardWrite( request, response );
        }
    }
Example #17
0
    void WriteBatchExecutor::executeBatch( const BatchedCommandRequest& request,
                                           BatchedCommandResponse* response ) {

        Timer commandTimer;

        WriteStats stats;
        std::auto_ptr<BatchedErrorDetail> error( new BatchedErrorDetail );
        BSONObj upsertedID = BSONObj();
        bool batchSuccess = true;
        bool staleBatch = false;

        // Apply each batch item, stopping on an error if we were asked to apply the batch
        // sequentially.
        size_t numBatchOps = request.sizeWriteOps();
        bool verbose = verboseResponse( request );
        for ( size_t i = 0; i < numBatchOps; i++ ) {

            if ( applyWriteItem( BatchItemRef( &request, i ),
                                 &stats,
                                 &upsertedID,
                                 error.get() ) ) {

                // In case updates turned out to be upserts, the callers may be interested
                // in learning what _id was used for that document.
                if ( !upsertedID.isEmpty() ) {
                    if ( numBatchOps == 1 ) {
                        response->setSingleUpserted(upsertedID);
                    }
                    else if ( verbose ) {
                        std::auto_ptr<BatchedUpsertDetail> upsertDetail(new BatchedUpsertDetail);
                        upsertDetail->setIndex(i);
                        upsertDetail->setUpsertedID(upsertedID);
                        response->addToUpsertDetails(upsertDetail.release());
                    }
                    upsertedID = BSONObj();
                }

            }
            else {

                // The applyWriteItem did not go thgrou
                // If the error is sharding related, we'll have to investigate whether we
                // have a stale view of sharding state.
                if ( error->getErrCode() == ErrorCodes::StaleShardVersion ) staleBatch = true;

                // Don't bother recording if the user doesn't want a verbose answer. We want to
                // keep the error if this is a one-item batch, since we already compact the
                // response for those.
                if (verbose || numBatchOps == 1) {
                    error->setIndex( static_cast<int>( i ) );
                    response->addToErrDetails( error.release() );
                }

                batchSuccess = false;

                if ( request.getOrdered() ) break;

                error.reset( new BatchedErrorDetail );
            }
        }

        // So far, we may have failed some of the batch's items. So we record
        // that. Rergardless, we still need to apply the write concern.  If that generates a
        // more specific error, we'd replace for the intermediate error here. Note that we
        // "compatct" the error messge if this is an one-item batch. (See rationale later in
        // this file.)
        if ( !batchSuccess ) {

            if (numBatchOps > 1) {
                // TODO
                // Define the final error code here.
                // Might be used as a final error, depending on write concern success.
                response->setErrCode( 99999 );
                response->setErrMessage( "batch op errors occurred" );
            }
            else {
                // Promote the single error.
                const BatchedErrorDetail* error = response->getErrDetailsAt( 0 );
                response->setErrCode( error->getErrCode() );
                if ( error->isErrInfoSet() ) response->setErrInfo( error->getErrInfo() );
                response->setErrMessage( error->getErrMessage() );
                response->unsetErrDetails();
                error = NULL;
            }
        }

        // Apply write concern. Note, again, that we're only assembling a full response if the
        // user is interested in it.
        BSONObj writeConcern;
        if ( request.isWriteConcernSet() ) {
            writeConcern = request.getWriteConcern();
        }
        else {
            writeConcern = _defaultWriteConcern;
        }

        string errMsg;
        BSONObjBuilder wcResultsB;
        if ( !waitForWriteConcern( writeConcern, !batchSuccess, &wcResultsB, &errMsg ) ) {

            // TODO Revisit when user visible family error codes are set
            response->setErrCode( ErrorCodes::WriteConcernFailed );
            response->setErrMessage( errMsg );

            if ( verbose ) {
                response->setErrInfo( wcResultsB.obj() );
            }
        }

        // TODO: Audit where we want to queue here
        if ( staleBatch ) {
            ChunkVersion latestShardVersion;
            shardingState.refreshMetadataIfNeeded( request.getNS(),
                                                   request.getShardVersion(),
                                                   &latestShardVersion );
        }

        // Set the main body of the response. We assume that, if there was an error, the error
        // code would already be set.
        response->setOk( !response->isErrCodeSet() );
        response->setN( stats.numUpdated + stats.numInserted + stats.numDeleted );
    }
Example #18
0
    void BatchSafeWriter::safeWriteBatch( DBClientBase* conn,
                                          const BatchedCommandRequest& request,
                                          BatchedCommandResponse* response ) {

        const NamespaceString nss( request.getNS() );

        // N starts at zero, and we add to it for each item
        response->setN( 0 );

        for ( size_t i = 0; i < request.sizeWriteOps(); ++i ) {

            // Break on first error if we're ordered
            if ( request.getOrdered() && response->isErrDetailsSet() )
                break;

            BatchItemRef itemRef( &request, static_cast<int>( i ) );
            bool isLastItem = ( i == request.sizeWriteOps() - 1 );

            BSONObj writeConcern;
            if ( isLastItem && request.isWriteConcernSet() ) {
                writeConcern = request.getWriteConcern();
                // Pre-2.4.2 mongods react badly to 'w' being set on config servers
                if ( nss.db() == "config" )
                    writeConcern = fixWCForConfig( writeConcern );
            }

            BSONObj gleResult;
            GLEErrors errors;
            Status status = _safeWriter->safeWrite( conn, itemRef, writeConcern, &gleResult );
            if ( status.isOK() ) {
                status = extractGLEErrors( gleResult, &errors );
            }

            if ( !status.isOK() ) {
                response->clear();
                response->setOk( false );
                response->setErrCode( status.code() );
                response->setErrMessage( status.reason() );
                return;
            }

            //
            // STATS HANDLING
            //

            GLEStats stats;
            extractGLEStats( gleResult, &stats );

            // Special case for making legacy "n" field result for insert match the write
            // command result.
            if ( request.getBatchType() == BatchedCommandRequest::BatchType_Insert
                 && !errors.writeError.get() ) {
                // n is always 0 for legacy inserts.
                dassert( stats.n == 0 );
                stats.n = 1;
            }

            response->setN( response->getN() + stats.n );

            if ( !stats.upsertedId.isEmpty() ) {
                BatchedUpsertDetail* upsertedId = new BatchedUpsertDetail;
                upsertedId->setIndex( i );
                upsertedId->setUpsertedID( stats.upsertedId );
                response->addToUpsertDetails( upsertedId );
            }

            response->setLastOp( stats.lastOp );

            //
            // WRITE ERROR HANDLING
            //

            // If any error occurs (except stale config) the previous GLE was not enforced
            bool enforcedWC = !errors.writeError.get()
                              || errors.writeError->getErrCode() == ErrorCodes::StaleShardVersion;

            // Save write error
            if ( errors.writeError.get() ) {
                errors.writeError->setIndex( i );
                response->addToErrDetails( errors.writeError.release() );
            }

            //
            // WRITE CONCERN ERROR HANDLING
            //

            // The last write is weird, since we enforce write concern and check the error through
            // the same GLE if possible.  If the last GLE was an error, the write concern may not
            // have been enforced in that same GLE, so we need to send another after resetting the
            // error.
            if ( isLastItem ) {

                // Try to enforce the write concern if everything succeeded (unordered or ordered)
                // OR if something succeeded and we're unordered.
                bool needToEnforceWC =
                    !response->isErrDetailsSet()
                    || ( !request.getOrdered()
                         && response->sizeErrDetails() < request.sizeWriteOps() );

                if ( !enforcedWC && needToEnforceWC ) {
                    dassert( !errors.writeError.get() ); // emptied above

                    // Might have gotten a write concern validity error earlier, these are
                    // enforced even if the wc isn't applied, so we ignore.
                    errors.wcError.reset();

                    Status status = _safeWriter->enforceWriteConcern( conn,
                                                                      nss.db().toString(),
                                                                      writeConcern,
                                                                      &gleResult );

                    if ( status.isOK() ) {
                        status = extractGLEErrors( gleResult, &errors );
                    }

                    if ( !status.isOK() ) {
                        response->clear();
                        response->setOk( false );
                        response->setErrCode( status.code() );
                        response->setErrMessage( status.reason() );
                        return;
                    }
                }
                // END Write concern retry

                if ( errors.wcError.get() ) {
                    response->setWriteConcernError( errors.wcError.release() );
                }
            }
        }

        response->setOk( true );
        dassert( response->isValid( NULL ) );
    }