Exemplo n.º 1
0
    /**
     * The core config write functionality.
     *
     * Config writes run in two passes - the first is a quick check to ensure the config servers
     * are all reachable, the second runs the actual write.
     *
     * TODO: Upgrade and move this logic to the config servers, a state machine implementation
     * is probably the next step.
     */
    void ConfigCoordinator::executeBatch( const BatchedCommandRequest& clientRequest,
                                          BatchedCommandResponse* clientResponse,
                                          bool fsyncCheck ) {

        NamespaceString nss( clientRequest.getNS() );
        dassert( nss.db() == "config" || nss.db() == "admin" );
        dassert( clientRequest.sizeWriteOps() == 1u );

        if ( fsyncCheck ) {

            //
            // Sanity check that all configs are still reachable using fsync, preserving legacy
            // behavior
            //

            OwnedPointerVector<ConfigFsyncResponse> fsyncResponsesOwned;
            vector<ConfigFsyncResponse*>& fsyncResponses = fsyncResponsesOwned.mutableVector();

            //
            // Send side
            //

            for ( vector<ConnectionString>::iterator it = _configHosts.begin();
                it != _configHosts.end(); ++it ) {
                ConnectionString& configHost = *it;
                FsyncRequest fsyncRequest;
                _dispatcher->addCommand( configHost, "admin", fsyncRequest );
            }

            _dispatcher->sendAll();

            //
            // Recv side
            //

            bool fsyncError = false;
            while ( _dispatcher->numPending() > 0 ) {

                fsyncResponses.push_back( new ConfigFsyncResponse() );
                ConfigFsyncResponse& fsyncResponse = *fsyncResponses.back();
                Status dispatchStatus = _dispatcher->recvAny( &fsyncResponse.configHost,
                                                              &fsyncResponse.response );

                // We've got to recv everything, no matter what
                if ( !dispatchStatus.isOK() ) {
                    fsyncError = true;
                    buildFsyncErrorFrom( dispatchStatus, &fsyncResponse.response );
                }
                else if ( !fsyncResponse.response.getOk() ) {
                    fsyncError = true;
                }
            }

            if ( fsyncError ) {
                combineFsyncErrors( fsyncResponses, clientResponse );
                return;
            }
            else {
                fsyncResponsesOwned.clear();
            }
        }

        //
        // Do the actual writes
        //

        BatchedCommandRequest configRequest( clientRequest.getBatchType() );
        clientRequest.cloneTo( &configRequest );
        configRequest.setNS( nss.coll() );

        OwnedPointerVector<ConfigResponse> responsesOwned;
        vector<ConfigResponse*>& responses = responsesOwned.mutableVector();

        //
        // Send the actual config writes
        //

        // Get as many batches as we can at once
        for ( vector<ConnectionString>::iterator it = _configHosts.begin();
            it != _configHosts.end(); ++it ) {
            ConnectionString& configHost = *it;
            _dispatcher->addCommand( configHost, nss.db(), configRequest );
        }

        // Send them all out
        _dispatcher->sendAll();

        //
        // Recv side
        //

        while ( _dispatcher->numPending() > 0 ) {

            // Get the response
            responses.push_back( new ConfigResponse() );
            ConfigResponse& configResponse = *responses.back();
            Status dispatchStatus = _dispatcher->recvAny( &configResponse.configHost,
                                                          &configResponse.response );

            if ( !dispatchStatus.isOK() ) {
                buildErrorFrom( dispatchStatus, &configResponse.response );
            }
        }

        combineResponses( responses, clientResponse );
    }
Exemplo n.º 2
0
    /**
     * Validates and executes a batch of write operations, filling in 'response'.
     *
     * Steps, in order:
     *  1. Validation: namespace validity, the userAllowedWriteNS() check, insert-index request
     *     shape, write concern parsing/validation, and batch size (non-empty and at most
     *     BatchedCommandRequest::kMaxWriteBatchSize).  Any failure is turned into a batch-level
     *     error via toBatchError() and the function returns without executing any write.
     *  2. bulkExecute() applies the write ops, collecting upsert details and write errors.
     *  3. The write concern is waited on when no op failed, or when the batch is unordered and
     *     fewer ops failed than were submitted.
     *  4. If the last write error is StaleShardVersion, sharding metadata is refreshed and,
     *     when 'queueForMigrationCommit' is set, the call may wait for an in-progress
     *     migration critical section to end.
     *  5. The response is assembled from the collected details and '_stats'.
     *
     * @param request  the batch to execute
     * @param response output; always has ok set to true once the writes have been attempted
     */
    void WriteBatchExecutor::executeBatch( const BatchedCommandRequest& request,
                                           BatchedCommandResponse* response ) {

        // Validate namespace
        const NamespaceString nss = NamespaceString( request.getNS() );
        if ( !nss.isValid() ) {
            toBatchError( Status( ErrorCodes::InvalidNamespace,
                                  nss.ns() + " is not a valid namespace" ),
                          response );
            return;
        }

        // Make sure we can write to the namespace
        Status allowedStatus = userAllowedWriteNS( nss );
        if ( !allowedStatus.isOK() ) {
            toBatchError( allowedStatus, response );
            return;
        }

        // Validate insert index requests
        // TODO: Push insert index requests through createIndex once all upgrade paths support it
        string errMsg;
        if ( request.isInsertIndexRequest() && !request.isValidIndexRequest( &errMsg ) ) {
            toBatchError( Status( ErrorCodes::InvalidOptions, errMsg ), response );
            return;
        }

        // Validate write concern
        // TODO: Lift write concern parsing out of this entirely
        WriteConcernOptions writeConcern;

        // Stays empty when the request carries no write concern
        BSONObj wcDoc;
        if ( request.isWriteConcernSet() ) {
            wcDoc = request.getWriteConcern();
        }

        Status wcStatus = Status::OK();
        if ( wcDoc.isEmpty() ) {

            // The default write concern if empty is w : 1
            // Specifying w : 0 is/was allowed, but is interpreted identically to w : 1

            wcStatus = writeConcern.parse(
                _defaultWriteConcern.isEmpty() ?
                    WriteConcernOptions::Acknowledged : _defaultWriteConcern );

            if ( writeConcern.wNumNodes == 0 && writeConcern.wMode.empty() ) {
                writeConcern.wNumNodes = 1;
            }
        }
        else {
            wcStatus = writeConcern.parse( wcDoc );
        }

        // Only run semantic validation on a write concern that parsed cleanly
        if ( wcStatus.isOK() ) {
            wcStatus = validateWriteConcern( writeConcern );
        }

        if ( !wcStatus.isOK() ) {
            toBatchError( wcStatus, response );
            return;
        }

        if ( request.sizeWriteOps() == 0u ) {
            toBatchError( Status( ErrorCodes::InvalidLength,
                                  "no write ops were included in the batch" ),
                          response );
            return;
        }

        // Validate batch size
        if ( request.sizeWriteOps() > BatchedCommandRequest::kMaxWriteBatchSize ) {
            toBatchError( Status( ErrorCodes::InvalidLength,
                                  stream() << "exceeded maximum write batch size of "
                                           << BatchedCommandRequest::kMaxWriteBatchSize ),
                          response );
            return;
        }

        //
        // End validation
        //

        // "Silent" write concern: no mode, zero nodes and no sync mode.  When set, the response
        // built at the end carries only the ok flag.
        bool silentWC = writeConcern.wMode.empty() && writeConcern.wNumNodes == 0
                        && writeConcern.syncMode == WriteConcernOptions::NONE;

        Timer commandTimer;

        OwnedPointerVector<WriteErrorDetail> writeErrorsOwned;
        vector<WriteErrorDetail*>& writeErrors = writeErrorsOwned.mutableVector();

        OwnedPointerVector<BatchedUpsertDetail> upsertedOwned;
        vector<BatchedUpsertDetail*>& upserted = upsertedOwned.mutableVector();

        //
        // Apply each batch item, possibly bulking some items together in the write lock.
        // Stops on error if batch is ordered.
        //

        bulkExecute( request, &upserted, &writeErrors );

        //
        // Try to enforce the write concern if everything succeeded (unordered or ordered)
        // OR if something succeeded and we're unordered.
        //

        auto_ptr<WCErrorDetail> wcError;
        bool needToEnforceWC = writeErrors.empty()
                               || ( !request.getOrdered()
                                    && writeErrors.size() < request.sizeWriteOps() );

        if ( needToEnforceWC ) {

            _client->curop()->setMessage( "waiting for write concern" );

            WriteConcernResult res;
            Status status = waitForWriteConcern( _txn, writeConcern, _client->getLastOp(), &res );

            // A write concern failure is recorded separately; it does not fail the batch itself
            if ( !status.isOK() ) {
                wcError.reset( toWriteConcernError( status, res ) );
            }
        }

        //
        // Refresh metadata if needed
        //

        // Only the last recorded write error is inspected for staleness
        bool staleBatch = !writeErrors.empty()
                          && writeErrors.back()->getErrCode() == ErrorCodes::StaleShardVersion;

        if ( staleBatch ) {

            const BatchedRequestMetadata* requestMetadata = request.getMetadata();
            dassert( requestMetadata );

            // Make sure our shard name is set or is the same as what was set previously
            if ( shardingState.setShardName( requestMetadata->getShardName() ) ) {

                //
                // First, we refresh metadata if we need to based on the requested version.
                //

                ChunkVersion latestShardVersion;
                shardingState.refreshMetadataIfNeeded( request.getTargetingNS(),
                                                       requestMetadata->getShardVersion(),
                                                       &latestShardVersion );

                // Report if we're still changing our metadata
                // TODO: Better reporting per-collection
                if ( shardingState.inCriticalMigrateSection() ) {
                    noteInCriticalSection( writeErrors.back() );
                }

                if ( queueForMigrationCommit ) {

                    //
                    // Queue up for migration to end - this allows us to be sure that clients will
                    // not repeatedly try to refresh metadata that is not yet written to the config
                    // server.  Not necessary for correctness.
                    // Exposed as optional parameter to allow testing of queuing behavior with
                    // different network timings.
                    //

                    const ChunkVersion& requestShardVersion = requestMetadata->getShardVersion();

                    //
                    // Only wait if we're an older version (in the current collection epoch) and
                    // we're not write compatible, implying that the current migration is affecting
                    // writes.
                    //

                    if ( requestShardVersion.isOlderThan( latestShardVersion ) &&
                         !requestShardVersion.isWriteCompatibleWith( latestShardVersion ) ) {

                        // Re-check after every wait; each pass logs once and then waits with a
                        // 10 second argument
                        while ( shardingState.inCriticalMigrateSection() ) {

                            log() << "write request to old shard version "
                                  << requestMetadata->getShardVersion().toString()
                                  << " waiting for migration commit" << endl;

                            shardingState.waitTillNotInCriticalSection( 10 /* secs */);
                        }
                    }
                }
            }
            else {
                // If our shard name is stale, our version must have been stale as well
                dassert( writeErrors.size() == request.sizeWriteOps() );
            }
        }

        //
        // Construct response
        //

        // The batch command itself is reported as ok; per-op and write concern failures are
        // attached as details below
        response->setOk( true );

        if ( !silentWC ) {

            if ( upserted.size() ) {
                response->setUpsertDetails( upserted );
            }

            if ( writeErrors.size() ) {
                response->setErrDetails( writeErrors );
            }

            // release() hands the pointer over to setWriteConcernError()
            if ( wcError.get() ) {
                response->setWriteConcernError( wcError.release() );
            }

            // lastOp is reported whenever replication is enabled; the election id only in
            // replica set mode
            const repl::ReplicationCoordinator::Mode replMode =
                    repl::getGlobalReplicationCoordinator()->getReplicationMode();
            if (replMode != repl::ReplicationCoordinator::modeNone) {
                response->setLastOp( _client->getLastOp() );
                if (replMode == repl::ReplicationCoordinator::modeReplSet) {
                    response->setElectionId(repl::theReplSet->getElectionId());
                }
            }

            // Set the stats for the response
            response->setN( _stats->numInserted + _stats->numUpserted + _stats->numMatched
                            + _stats->numDeleted );
            if ( request.getBatchType() == BatchedCommandRequest::BatchType_Update )
                response->setNModified( _stats->numModified );
        }

        dassert( response->isValid( NULL ) );
    }
Exemplo n.º 3
0
// Notification that the given CLuaMain is being destroyed.
// The dassert flags the case where pLuaMain is still present in m_LuaMainStack; the
// ListRemove call that follows then takes it out of the stack so no pointer to the
// destroyed object is kept there.
// NOTE(review): presumably dassert is a debug-only check that does not abort release
// builds - confirm.
void CScriptDebugging::OnLuaMainDestroy ( CLuaMain* pLuaMain )
{
    dassert ( !ListContains ( m_LuaMainStack, pLuaMain ) );
    ListRemove ( m_LuaMainStack, pLuaMain );
}
Exemplo n.º 4
0
// Returns a reference to the cached _indexedPaths.
// Preconditions checked here: the collection's namespace passes
// isCollectionLockedForMode(..., MODE_IS) on opCtx's lock state (dassert), and
// _keysComputed is set (invariant).
const UpdateIndexData& CollectionInfoCache::getIndexKeys(OperationContext* opCtx) const {
    // This requires "some" lock, and MODE_IS is an expression for that, for now.
    dassert(opCtx->lockState()->isCollectionLockedForMode(_collection->ns().ns(), MODE_IS));
    invariant(_keysComputed);
    return _indexedPaths;
}
Exemplo n.º 5
0
        // Completion handler for the "get file size" request of a remote copy.
        //
        // 'context' is the user_request that issued the call. On a transport error (err) or
        // an error reported inside the response (resp.error), the user's nfs_task is
        // enqueued with that error, the user_request is deleted, and nothing is scheduled.
        //
        // On success, one file_context is created per entry of resp.size_list and the file
        // is cut into copy requests of at most _opts.nfs_copy_block_bytes bytes each. Every
        // request is recorded on its file_context and pushed onto _copy_requests, after
        // which continue_copy(0) is called.
        void nfs_client_impl::end_get_file_size(
            ::dsn::error_code err,
            const ::dsn::service::get_file_size_response& resp,
            void* context)
        {
            user_request* request = static_cast<user_request*>(context);

            // Failure of the rpc itself
            if (err != ::dsn::ERR_OK)
            {
                derror("remote copy request failed");
                request->nfs_task->enqueue(err, 0, request->nfs_task->node());
                delete request;
                return;
            }

            // Failure reported by the remote side inside the response
            err.set(resp.error);
            if (err != ::dsn::ERR_OK)
            {
                derror("remote copy request failed");
                error_code remote_err;
                remote_err.set(resp.error);
                request->nfs_task->enqueue(remote_err, 0, request->nfs_task->node());
                delete request;
                return;
            }

            const size_t file_count = resp.size_list.size();
            for (size_t i = 0; i < file_count; i++)
            {
                const auto& file_name = resp.file_list[i];

                file_context* file = new file_context(request, file_name, resp.size_list[i]);
                std::string map_key = request->file_size_req.dst_dir + file_name;
                request->file_context_map.insert(
                    std::pair<std::string, file_context*>(map_key, file));

                // Split the file into block-sized copy requests; the final request carries
                // whatever is left over (possibly zero bytes for an empty file).
                uint64_t remaining = resp.size_list[i];
                uint64_t offset = 0;

                for (int round = 0; ; round++)
                {
                    uint32_t chunk_bytes;
                    if (remaining > _opts.nfs_copy_block_bytes)
                    {
                        chunk_bytes = _opts.nfs_copy_block_bytes;
                    }
                    else
                    {
                        chunk_bytes = static_cast<uint32_t>(remaining);
                    }

                    auto copy = boost::intrusive_ptr<copy_request_ex>(
                        new copy_request_ex(file, round));
                    file->copy_requests.push_back(copy);

                    {
                        zauto_lock l(_copy_requests_lock);
                        _copy_requests.push(copy);
                    }

                    copy->copy_req.source = request->file_size_req.source;
                    copy->copy_req.file_name = file_name;
                    copy->copy_req.offset = offset;
                    copy->copy_req.size = chunk_bytes;
                    copy->copy_req.dst_dir = request->file_size_req.dst_dir;
                    copy->copy_req.source_dir = request->file_size_req.source_dir;
                    copy->copy_req.overwrite = request->file_size_req.overwrite;
                    copy->copy_req.is_last = (remaining <= chunk_bytes);

                    offset += chunk_bytes;
                    remaining -= chunk_bytes;

                    if (remaining <= 0)
                    {
                        dassert(remaining == 0, "last request must read exactly the remaing size of the file");
                        break;
                    }
                }
            }

            continue_copy(0);
        }
Exemplo n.º 6
0
    /**
     * Picks the host that writes for 'shardName' should be sent to and stores it in
     * '*shardHost'.
     *
     *  - "config" and "admin" resolve to the connection string parsed from
     *    configServer.modelServer().
     *  - Any other name is looked up via Shard::findIfExists(); a MASTER-type connection
     *    string is returned unchanged, while for a SET-type one the replica set monitor is
     *    asked for the current master.
     *
     * Returns ShardNotFound for an unknown shard name, ReplicaSetNotFound when no monitor
     * exists for the set, and HostNotFound when a DBException is thrown while looking up the
     * master; Status::OK() otherwise.
     */
    Status DBClientShardResolver::chooseWriteHost( const string& shardName,
                                                   ConnectionString* shardHost ) const {

        // Shared scratch buffer for the ConnectionString::parse() calls below
        string parseErr;

        // The config and admin databases are special-cased
        const bool targetsConfigServer = ( shardName == "config" || shardName == "admin" );
        if ( targetsConfigServer ) {
            *shardHost = ConnectionString::parse( configServer.modelServer(), parseErr );
            dassert( parseErr == "" );
            return Status::OK();
        }

        // Look the shard up by name
        Shard shard = Shard::findIfExists( shardName );
        if ( shard.getName() == "" ) {
            return Status( ErrorCodes::ShardNotFound,
                           string("unknown shard name ") + shardName );
        }

        ConnectionString rawShardHost = ConnectionString::parse( shard.getConnString(),
                                                                 parseErr );
        dassert( parseErr == "" );
        dassert( rawShardHost.type() == ConnectionString::SET
                 || rawShardHost.type() == ConnectionString::MASTER );

        // A MASTER-type connection string is already the write host
        if ( rawShardHost.type() == ConnectionString::MASTER ) {
            *shardHost = rawShardHost;
            return Status::OK();
        }

        // Replica set: fetch the monitor by set name (second argument is passed as false)
        ReplicaSetMonitorPtr replMonitor = ReplicaSetMonitor::get( rawShardHost.getSetName(),
                                                                   false );
        if ( !replMonitor ) {
            return Status( ErrorCodes::ReplicaSetNotFound,
                           string("unknown replica set ") + rawShardHost.getSetName() );
        }

        try {
            // This can throw when we don't find a master!
            HostAndPort primary = replMonitor->getMaster();
            *shardHost = ConnectionString::parse( primary.toString( true ), parseErr );
            dassert( parseErr == "" );
        }
        catch ( const DBException& ) {
            return Status( ErrorCodes::HostNotFound,
                           string("could not contact primary for replica set ")
                           + replMonitor->getName() );
        }

        return Status::OK();
    }
Exemplo n.º 7
0
// Constructs an owned key from an existing KeyV1: rhs's bytes (rhs.data(), rhs.dataSize())
// are appended to the member buffer 'b', and _keyData is pointed at that buffer's storage.
KeyV1Owned::KeyV1Owned(const KeyV1& rhs) {
    b.appendBuf(rhs.data(), rhs.dataSize());
    _keyData = (const unsigned char*)b.buf();
    dassert(b.len() == dataSize());  // check datasize method is correct
    // The cNOTUSED bits of the first key byte are expected to be clear
    dassert((*_keyData & cNOTUSED) == 0);
}
Exemplo n.º 8
0
        // Registers 'handle' with the looper's kqueue (_io_queue) so that 'cb' is attached
        // as the user data of the events selected by 'events'.
        //
        // handle : file descriptor packed into a dsn_handle_t; negative values are rejected
        //          unless equal to IO_LOOPER_USER_NOTIFICATION_FD
        // cb     : callback pointer; must be non-null and have its lowest address bit clear
        // events : a filter value that must be present in _filters; EVFILT_READ_WRITE is
        //          expanded into separate EVFILT_READ and EVFILT_WRITE registrations
        // ctx    : optional ref_counter; when given, (cb, ctx) is recorded in _io_sessions
        //          and the lowest bit of the pointer handed to kqueue is set as a marker
        //
        // Returns ERR_INVALID_PARAMETERS for bad arguments, ERR_BIND_IOCP_FAILED when a
        // kevent() registration fails (earlier registrations and the session entry are
        // rolled back first), and ERR_OK otherwise.
        error_code io_looper::bind_io_handle(
            dsn_handle_t handle,
            io_loop_callback* cb,
            unsigned int events,
            ref_counter* ctx
            )
        {
            if (cb == nullptr)
            {
                derror("cb == nullptr");
                return ERR_INVALID_PARAMETERS;
            }

            int fd = (int)(intptr_t)(handle);
            if (fd < 0 && fd != IO_LOOPER_USER_NOTIFICATION_FD)
            {
                derror("The fd %d is less than 0.", fd);
                return ERR_INVALID_PARAMETERS;
            }

            if (_filters.find((short)events) == _filters.end())
            {
                derror("The filter %u is unsupported.", events);
                return ERR_INVALID_PARAMETERS;
            }

            // Switch strictly positive descriptors to non-blocking mode if they are not
            // already.
            if (fd > 0)
            {
                int fl = fcntl(fd, F_GETFL, 0);
                dassert (fl != -1, "fcntl failed, err = %s, fd = %d", strerror(errno), fd);

                if ((fl & O_NONBLOCK) == 0)
                {
                    fl = fcntl(fd, F_SETFL, fl | O_NONBLOCK);
                    dassert(fl != -1, "fcntl failed, err = %s, fd = %d", strerror(errno), fd);
                }
            }

            // The callback address doubles as the kevent user data; its lowest bit is used
            // to mark "has ref_counter".
            uintptr_t tagged_cb = (uintptr_t)cb;
            dassert((tagged_cb & 0x1) == 0, "the least one bit must be zero for the callback address");

            if (ctx)
            {
                tagged_cb |= 0x1; // has ref_counter

                utils::auto_lock<utils::ex_lock_nr_spin> l(_io_sessions_lock);
                auto inserted = _io_sessions.insert(io_sessions::value_type(cb, ctx));
                dassert(inserted.second, "the callback must not be registered before");
            }

            // Work out which kqueue filters have to be registered
            short filters[2];
            int nr_filters;
            if ((short)events == EVFILT_READ_WRITE)
            {
                filters[0] = EVFILT_READ;
                filters[1] = EVFILT_WRITE;
                nr_filters = 2;
            }
            else
            {
                filters[0] = (short)events;
                nr_filters = 1;
            }

            struct kevent e;
            for (int i = 0; i < nr_filters; i++)
            {
                EV_SET(&e, fd, filters[i], (EV_ADD | EV_ENABLE | EV_CLEAR), 0, 0, (void*)tagged_cb);

                if (kevent(_io_queue, &e, 1, nullptr, 0, nullptr) != -1)
                {
                    continue;
                }

                derror("bind io handler to kqueue failed, err = %s, fd = %d", strerror(errno), fd);

                // Roll back: forget the session entry ...
                if (ctx)
                {
                    utils::auto_lock<utils::ex_lock_nr_spin> l(_io_sessions_lock);
                    auto erased = _io_sessions.erase(cb);
                    dassert(erased > 0, "the callback must be present");
                }

                // ... and unregister the filters that were added before this one
                for (int j = 0; j < i; j++)
                {
                    EV_SET(&e, fd, filters[j], EV_DELETE, 0, 0, nullptr);
                    if (kevent(_io_queue, &e, 1, nullptr, 0, nullptr) == -1)
                    {
                        derror("Unregister kqueue failed, filter = %d, err = %s, fd = %d", filters[j], strerror(errno), fd);
                    }
                }

                return ERR_BIND_IOCP_FAILED;
            }

            return ERR_OK;
        }
Exemplo n.º 9
0
    // Command body (the error messages below name the command "listIndexes").
    //
    // The first element of 'cmdObj' must be a non-empty string naming a collection in
    // 'dbname'. The specs of all of that collection's indexes are read from its catalog
    // entry, queued in a QueuedDataStage, and returned through 'result' as a cursor
    // response: a first batch limited by the parsed batch size and by
    // FindCommon::kMaxBytesToReturnToClientAtOnce, plus a cursor id that is non-zero only
    // when the executor has not reached EOF.
    //
    // Returns true on success; on failure, the value of appendCommandStatus() for a
    // cursor-option parse error, a missing database or collection (NamespaceNotFound), or
    // a PlanExecutor creation error.
    bool run(OperationContext* txn,
             const string& dbname,
             BSONObj& cmdObj,
             int,
             string& errmsg,
             BSONObjBuilder& result) {
        BSONElement collElt = cmdObj.firstElement();
        uassert(28528,
                str::stream() << "Argument to listIndexes must be of type String, not "
                              << typeName(collElt.type()),
                collElt.type() == String);

        StringData collectionName = collElt.valueStringData();
        uassert(28529,
                str::stream() << "Argument to listIndexes must be a collection name, "
                              << "not the empty string",
                !collectionName.empty());

        const NamespaceString ns(dbname, collectionName);

        // Without an explicit batch size the first batch is bounded only by the byte limit
        const long long defaultBatchSize = std::numeric_limits<long long>::max();
        long long batchSize;
        Status cursorOptionsStatus =
            parseCommandCursorOptions(cmdObj, defaultBatchSize, &batchSize);
        if (!cursorOptionsStatus.isOK()) {
            return appendCommandStatus(result, cursorOptionsStatus);
        }

        AutoGetCollectionForRead autoColl(txn, ns);
        if (!autoColl.getDb()) {
            return appendCommandStatus(result,
                                       Status(ErrorCodes::NamespaceNotFound, "no database"));
        }

        const Collection* collection = autoColl.getCollection();
        if (!collection) {
            return appendCommandStatus(result,
                                       Status(ErrorCodes::NamespaceNotFound, "no collection"));
        }

        const CollectionCatalogEntry* cce = collection->getCatalogEntry();
        invariant(cce);

        // Collect the index names; the list is reset at the start of every retry attempt
        vector<string> indexNames;
        MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
            indexNames.clear();
            cce->getAllIndexes(txn, &indexNames);
        }
        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "listIndexes", ns.ns());

        auto ws = make_unique<WorkingSet>();
        auto root = make_unique<QueuedDataStage>(txn, ws.get());

        // Queue one owned working set member per index spec
        for (const string& indexName : indexNames) {
            BSONObj indexSpec;
            MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
                indexSpec = cce->getIndexSpec(txn, indexName);
            }
            MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "listIndexes", ns.ns());

            const WorkingSetID memberId = ws->allocate();
            WorkingSetMember* member = ws->get(memberId);
            member->keyData.clear();
            member->loc = RecordId();
            member->obj = Snapshotted<BSONObj>(SnapshotId(), indexSpec.getOwned());
            member->transitionToOwnedObj();
            root->pushBack(memberId);
        }

        std::string cursorNamespace = str::stream() << dbname << ".$cmd." << name << "."
                                                    << ns.coll();
        dassert(NamespaceString(cursorNamespace).isValid());
        dassert(NamespaceString(cursorNamespace).isListIndexesCursorNS());
        dassert(ns == NamespaceString(cursorNamespace).getTargetNSForListIndexes());

        auto statusWithPlanExecutor = PlanExecutor::make(
            txn, std::move(ws), std::move(root), cursorNamespace, PlanExecutor::YIELD_MANUAL);
        if (!statusWithPlanExecutor.isOK()) {
            return appendCommandStatus(result, statusWithPlanExecutor.getStatus());
        }
        unique_ptr<PlanExecutor> exec = std::move(statusWithPlanExecutor.getValue());

        // Fill the first batch until the count limit, the byte limit, or EOF is hit
        BSONArrayBuilder firstBatch;
        const int byteLimit = FindCommon::kMaxBytesToReturnToClientAtOnce;

        long long objCount = 0;
        while (objCount < batchSize && firstBatch.len() < byteLimit) {
            BSONObj next;
            PlanExecutor::ExecState state = exec->getNext(&next, NULL);
            if (state == PlanExecutor::IS_EOF) {
                break;
            }
            invariant(state == PlanExecutor::ADVANCED);
            firstBatch.append(next);
            objCount++;
        }

        // If results remain, hand the executor over to a ClientCursor and report its id;
        // otherwise the cursor id stays 0
        CursorId cursorId = 0LL;
        if (!exec->isEOF()) {
            exec->saveState();
            exec->detachFromOperationContext();
            ClientCursor* cursor =
                new ClientCursor(CursorManager::getGlobalCursorManager(),
                                 exec.release(),
                                 cursorNamespace,
                                 txn->recoveryUnit()->isReadingFromMajorityCommittedSnapshot());
            cursorId = cursor->cursorid();
        }

        appendCursorResponseObject(cursorId, cursorNamespace, firstBatch.arr(), &result);

        return true;
    }
Exemplo n.º 10
0
    /**
     * Loads the chunks of collection 'ns' from the config server into 'metadata'.
     *
     * If 'oldMetadata' is given and its collection version has an epoch compatible with the
     * one already set on 'metadata', the old chunk map and versions are copied over first so
     * that only a diff has to be applied; otherwise everything is loaded from scratch (a
     * "full reload").
     *
     * Returns:
     *  - OK when at least one chunk diff was applied;
     *  - HostUnreachable when the cursor cannot be opened or a DBException is thrown;
     *  - NamespaceNotFound when no chunks were found on a full reload;
     *  - RemoteChangeDetected when no chunks were found on a diff reload, or when the diff
     *    tracker reports a negative result (invalid chunks).
     *
     * On every non-OK return except the DBException path, 'metadata' is invalidated by
     * resetting its collection version and clearing its chunk map.
     */
    Status MetadataLoader::initChunks( const string& ns,
                                       const string& shard,
                                       const CollectionMetadata* oldMetadata,
                                       CollectionMetadata* metadata ) const
    {
        map<string, ChunkVersion> shardVersions;

        // Start from the version (and thus epoch) already present on the new metadata
        shardVersions[shard] = metadata->_shardVersion;
        OID epoch = metadata->getCollVersion().epoch();

        // The old metadata is only useful as a diff base when its epoch is compatible
        const bool canDiffFromOld =
            oldMetadata && oldMetadata->getCollVersion().hasCompatibleEpoch( epoch );
        const bool fullReload = !canDiffFromOld;

        if ( canDiffFromOld ) {

            dassert( oldMetadata->isValid() );

            shardVersions[shard] = oldMetadata->_shardVersion;
            metadata->_collVersion = oldMetadata->_collVersion;

            // TODO: This could be made more efficient if copying not required, but
            // not as frequently reloaded as in mongos.
            metadata->_chunksMap = oldMetadata->_chunksMap;

            LOG( 2 ) << "loading new chunks for collection " << ns
                     << " using old metadata w/ version " << oldMetadata->getShardVersion()
                     << " and " << metadata->_chunksMap.size() << " chunks" << endl;
        }
        else if ( oldMetadata ) {
            warning() << "reloading collection metadata for " << ns << " with new epoch "
                      << epoch.toString() << ", the current epoch is "
                      << oldMetadata->getCollVersion().epoch().toString() << endl;
        }

        // The differ is attached to the new metadata's chunk map and version, and to the
        // per-shard version map, and fills them in while applying the diff.
        SCMConfigDiffTracker differ( shard );
        differ.attach( ns, metadata->_chunksMap, metadata->_collVersion, shardVersions );

        try {

            ScopedDbConnection conn( _configLoc.toString(), 30 );

            auto_ptr<DBClientCursor> cursor = conn->query( ChunkType::ConfigNS,
                                                           differ.configDiffQuery() );

            if ( !cursor.get() ) {

                // Make our metadata invalid
                metadata->_collVersion = ChunkVersion( 0, 0, OID() );
                metadata->_chunksMap.clear();
                conn.done();

                return Status( ErrorCodes::HostUnreachable,
                               "problem opening chunk metadata cursor" );
            }

            //
            // The diff tracker should always find at least one chunk (the highest chunk we saw
            // last time).  If not, something has changed on the config server (potentially
            // between when we read the collection data and when we read the chunks data).
            //

            const int diffsApplied = differ.calculateConfigDiff( *cursor );

            if ( diffsApplied > 0 ) {

                // Chunks found, return ok

                LOG(2) << "loaded " << diffsApplied << " chunks into new metadata for " << ns
                           << " with version " << metadata->_collVersion << endl;

                metadata->_shardVersion = shardVersions[shard];
                metadata->fillRanges();
                conn.done();

                dassert( metadata->isValid() );
                return Status::OK();
            }

            if ( diffsApplied == 0 ) {

                // No chunks found, the collection is dropping or we're confused
                // If this is a full reload, assume it is a drop for backwards compatibility
                // TODO: drop the config.collections entry *before* the chunks and eliminate
                // this ambiguity

                // Built before the version is reset below, since it reports that version
                string errMsg =
                    str::stream() << "no chunks found when reloading " << ns
                                  << ", previous version was "
                                  << metadata->_collVersion.toString()
                                  << ( fullReload ? ", this is a drop" : "" );

                warning() << errMsg << endl;

                metadata->_collVersion = ChunkVersion( 0, 0, OID() );
                metadata->_chunksMap.clear();
                conn.done();

                if ( fullReload ) {
                    return Status( ErrorCodes::NamespaceNotFound, errMsg );
                }
                return Status( ErrorCodes::RemoteChangeDetected, errMsg );
            }

            // Negative result: invalid chunks found, our epoch may have changed because we
            // dropped/recreated the collection.

            string errMsg =
                str::stream() << "invalid chunks found when reloading " << ns
                              << ", previous version was "
                              << metadata->_collVersion.toString()
                              << ", this should be rare";

            warning() << errMsg << endl;

            metadata->_collVersion = ChunkVersion( 0, 0, OID() );
            metadata->_chunksMap.clear();
            conn.done();

            return Status( ErrorCodes::RemoteChangeDetected, errMsg );
        }
        catch ( const DBException& e ) {
            string errMsg = str::stream() << "problem querying chunks metadata" << causedBy( e );

            // We deliberately do not return conn to the pool, since it was involved
            // with the error here.

            return Status( ErrorCodes::HostUnreachable, errMsg );
        }
    }
Exemplo n.º 11
0
    /**
     * Reads the config.collections entry for 'ns' from the config server and uses it to seed
     * the key pattern and the (still chunk-less) versions of 'metadata'.
     *
     * Returns:
     *  - OK if the collection is sharded (chunks still have to be loaded by the caller) or if
     *    it is unsharded with a primary equal to 'shard';
     *  - HostUnreachable if the config server query threw;
     *  - NamespaceNotFound if the entry is missing or flagged as dropped;
     *  - FailedToParse if the entry cannot be parsed or validated;
     *  - RemoteChangeDetected if there is no shard key and the primary is not 'shard'.
     */
    Status MetadataLoader::initCollection( const string& ns,
                                           const string& shard,
                                           CollectionMetadata* metadata ) const
    {
        //
        // Step 1: fetch the collection document from the config server.
        //

        BSONObj collectionEntry;
        try {
            ScopedDbConnection configConn( _configLoc.toString(), 30 );
            collectionEntry = configConn->findOne( CollectionType::ConfigNS,
                                                   QUERY( CollectionType::ns() << ns ) );
            configConn.done();
        }
        catch ( const DBException& ex ) {
            // The connection took part in the failure, so it is deliberately not handed
            // back to the pool (done() is never reached).
            string queryErr = str::stream() << "could not query collection metadata"
                                            << causedBy( ex );

            return Status( ErrorCodes::HostUnreachable, queryErr );
        }

        //
        // Step 2: reject missing, unparsable or dropped entries.
        //

        string errMsg;

        if ( collectionEntry.isEmpty() ) {
            errMsg = str::stream() << "could not load metadata, collection " << ns << " not found";
            warning() << errMsg << endl;
            return Status( ErrorCodes::NamespaceNotFound, errMsg );
        }

        CollectionType collInfo;
        const bool entryUsable = collInfo.parseBSON( collectionEntry, &errMsg )
                                 && collInfo.isValid( &errMsg );
        if ( !entryUsable ) {
            // errMsg currently holds the parser/validator complaint; wrap it with context.
            errMsg = str::stream() << "could not parse metadata for collection " << ns
                                   << causedBy( errMsg );
            warning() << errMsg << endl;
            return Status( ErrorCodes::FailedToParse, errMsg );
        }

        if ( collInfo.isDroppedSet() && collInfo.getDropped() ) {
            errMsg = str::stream() << "could not load metadata, collection " << ns
                                   << " was dropped";
            warning() << errMsg << endl;
            return Status( ErrorCodes::NamespaceNotFound, errMsg );
        }

        //
        // Step 3: seed the metadata according to the kind of collection.
        //

        // Sharded collection: versions start at 0|0 with the collection epoch, the chunks
        // still need to be loaded.
        if ( collInfo.isKeyPatternSet() && !collInfo.getKeyPattern().isEmpty() ) {
            metadata->_keyPattern = collInfo.getKeyPattern();
            metadata->_shardVersion = ChunkVersion( 0, 0, collInfo.getEpoch() );
            metadata->_collVersion = ChunkVersion( 0, 0, collInfo.getEpoch() );
            return Status::OK();
        }

        // Unsharded collection with a non-default primary that is this shard.
        if ( collInfo.isPrimarySet() && collInfo.getPrimary() == shard ) {
            // Empty primary field not allowed if set
            dassert( collInfo.getPrimary() != "" );

            metadata->_keyPattern = BSONObj();
            metadata->_shardVersion = ChunkVersion( 1, 0, collInfo.getEpoch() );
            metadata->_collVersion = metadata->_shardVersion;
            return Status::OK();
        }

        // Neither sharded nor owned by this shard: the primary is unset or points elsewhere,
        // possibly because it changed before we loaded.
        errMsg = str::stream() << "collection " << ns << " does not have a shard key "
                               << "and primary "
                               << ( collInfo.isPrimarySet() ? collInfo.getPrimary() : "" )
                               << " does not match this shard " << shard;
        warning() << errMsg << endl;

        metadata->_collVersion = ChunkVersion( 0, 0, OID() );

        return Status( ErrorCodes::RemoteChangeDetected, errMsg );
    }
Exemplo n.º 12
0
/**
 * Fills this ReadConcernArgs from the read concern element of a command.
 *
 * Must only be called on an empty (uninitialized) object. An EOO element is accepted and
 * leaves the object untouched. Otherwise the element has to be an object whose fields are
 * limited to kAfterOpTimeFieldName, kAfterClusterTimeFieldName, kAtClusterTimeFieldName and
 * kLevelFieldName; after parsing, the combination of options is validated.
 *
 * Returns Status::OK() on success, FailedToParse for a non-object element or an unknown
 * level string, InvalidOptions for an unknown field or an illegal combination of options,
 * or the status reported by the bsonExtract*Field helper that failed.
 */
Status ReadConcernArgs::initialize(const BSONElement& readConcernElem) {
    // Only legal to call on an uninitialized object.
    invariant(isEmpty());

    // No read concern supplied: nothing to parse.
    if (readConcernElem.eoo()) {
        return Status::OK();
    }

    dassert(readConcernElem.fieldNameStringData() == kReadConcernFieldName);

    if (readConcernElem.type() != Object) {
        return Status(ErrorCodes::FailedToParse,
                      str::stream() << kReadConcernFieldName << " field should be an object");
    }

    //
    // Parse each field of the read concern document.
    //
    BSONObj spec = readConcernElem.Obj();
    for (auto&& elem : spec) {
        auto name = elem.fieldNameStringData();

        if (name == kAfterOpTimeFieldName) {
            // TODO pass field in rather than scanning again.
            OpTime parsed;
            auto status = bsonExtractOpTimeField(spec, kAfterOpTimeFieldName, &parsed);
            if (!status.isOK()) {
                return status;
            }
            _opTime = parsed;
            continue;
        }

        if (name == kAfterClusterTimeFieldName) {
            Timestamp parsed;
            auto status = bsonExtractTimestampField(spec, kAfterClusterTimeFieldName, &parsed);
            if (!status.isOK()) {
                return status;
            }
            _afterClusterTime = LogicalTime(parsed);
            continue;
        }

        if (name == kAtClusterTimeFieldName) {
            Timestamp parsed;
            auto status = bsonExtractTimestampField(spec, kAtClusterTimeFieldName, &parsed);
            if (!status.isOK()) {
                return status;
            }
            _atClusterTime = LogicalTime(parsed);
            continue;
        }

        if (name == kLevelFieldName) {
            // TODO pass field in rather than scanning again.
            std::string requested;
            auto status = bsonExtractStringField(spec, kLevelFieldName, &requested);
            if (!status.isOK()) {
                return status;
            }

            if (requested == kLocalReadConcernStr) {
                _level = ReadConcernLevel::kLocalReadConcern;
            } else if (requested == kMajorityReadConcernStr) {
                _level = ReadConcernLevel::kMajorityReadConcern;
            } else if (requested == kLinearizableReadConcernStr) {
                _level = ReadConcernLevel::kLinearizableReadConcern;
            } else if (requested == kAvailableReadConcernStr) {
                _level = ReadConcernLevel::kAvailableReadConcern;
            } else if (requested == kSnapshotReadConcernStr) {
                _level = ReadConcernLevel::kSnapshotReadConcern;
            } else {
                return Status(ErrorCodes::FailedToParse,
                              str::stream() << kReadConcernFieldName << '.' << kLevelFieldName
                                            << " must be either 'local', 'majority', "
                                               "'linearizable', 'available', or 'snapshot'");
            }
            continue;
        }

        // Anything that is not one of the known fields is rejected.
        return Status(ErrorCodes::InvalidOptions,
                      str::stream() << "Unrecognized option in " << kReadConcernFieldName << ": "
                                    << name);
    }

    //
    // Cross-field validation. The order of the checks decides which error is reported when
    // several rules are violated at once.
    //

    if (_afterClusterTime && _opTime) {
        return Status(ErrorCodes::InvalidOptions,
                      str::stream() << "Can not specify both " << kAfterClusterTimeFieldName
                                    << " and " << kAfterOpTimeFieldName);
    }

    if (_afterClusterTime && _atClusterTime) {
        return Status(ErrorCodes::InvalidOptions,
                      str::stream() << "Can not specify both " << kAfterClusterTimeFieldName
                                    << " and " << kAtClusterTimeFieldName);
    }

    // Note: 'available' should not be used with after cluster time, as cluster time can wait for
    // replication whereas the premise of 'available' is to avoid waiting. 'linearizable' should not
    // be used with after cluster time, since linearizable reads are inherently causally consistent.
    if (_afterClusterTime &&
        !(getLevel() == ReadConcernLevel::kMajorityReadConcern ||
          getLevel() == ReadConcernLevel::kLocalReadConcern ||
          getLevel() == ReadConcernLevel::kSnapshotReadConcern)) {
        return Status(ErrorCodes::InvalidOptions,
                      str::stream() << kAfterClusterTimeFieldName << " field can be set only if "
                                    << kLevelFieldName << " is equal to "
                                    << kMajorityReadConcernStr << ", " << kLocalReadConcernStr
                                    << ", or " << kSnapshotReadConcernStr);
    }

    if (_opTime && getLevel() == ReadConcernLevel::kSnapshotReadConcern) {
        return Status(ErrorCodes::InvalidOptions,
                      str::stream() << kAfterOpTimeFieldName << " field cannot be set if "
                                    << kLevelFieldName << " is equal to "
                                    << kSnapshotReadConcernStr);
    }

    if (_atClusterTime && getLevel() != ReadConcernLevel::kSnapshotReadConcern) {
        return Status(ErrorCodes::InvalidOptions,
                      str::stream() << kAtClusterTimeFieldName << " field can be set only if "
                                    << kLevelFieldName << " is equal to "
                                    << kSnapshotReadConcernStr);
    }

    // A cluster time that was supplied but equals the uninitialized value is rejected.
    if (_afterClusterTime && _afterClusterTime == LogicalTime::kUninitialized) {
        return Status(ErrorCodes::InvalidOptions,
                      str::stream() << kAfterClusterTimeFieldName << " cannot be a null timestamp");
    }

    if (_atClusterTime && _atClusterTime == LogicalTime::kUninitialized) {
        return Status(ErrorCodes::InvalidOptions,
                      str::stream() << kAtClusterTimeFieldName << " cannot be a null timestamp");
    }

    return Status::OK();
}
Exemplo n.º 13
0
// Stores 'upsert' in _isUpsert.
// In debug builds, asserts that _update is set before the flag is changed
// (presumably because the flag only applies to update-style requests -- confirm).
void FindAndModifyRequest::setUpsert(bool upsert) {
    dassert(_update);
    _isUpsert = upsert;
}
Exemplo n.º 14
0
// Stores 'shouldReturnNew' in _shouldReturnNew.
// In debug builds, asserts that _update is set before the flag is changed
// (presumably because the flag only applies to update-style requests -- confirm).
void FindAndModifyRequest::setShouldReturnNew(bool shouldReturnNew) {
    dassert(_update);
    _shouldReturnNew = shouldReturnNew;
}
Exemplo n.º 15
0
    // Applies the replica set configuration 'c' to the in-memory state of this node.
    //
    // Two paths exist once the new config has been stored in _cfg:
    //  - "additive" reconfig: only members were added (existing members differ at most in
    //    their tags), so the current member list is kept/refreshed and the new members are
    //    appended;
    //  - full (re)load: all members are orphaned, health tasks are stopped, the primary is
    //    forgotten and the member list is rebuilt from scratch.
    //
    // @param txn operation context, handed on to loadConfig() and forgetPrimary().
    // @param c the configuration to apply; a copy of it becomes the new _cfg.
    // @param reconf true if this is a reconfiguration and not an initial load of the configuration.
    // @return true if ok; throws if config really bad; false if config doesn't include self
    bool ReplSetImpl::initFromConfig(OperationContext* txn, ReplSetConfig& c, bool reconf) {
        // NOTE: haveNewConfig() writes the new config to disk before we get here.  So
        //       we cannot error out at this point, except fatally.  Check errors earlier.
        lock lk(this);

        // Take over the new getLastErrorDefaults unless both the current and the new value
        // are empty.
        if (!getLastErrorDefault.isEmpty() || !c.getLastErrorDefaults.isEmpty()) {
            getLastErrorDefault = c.getLastErrorDefaults;
        }

        // Members of 'c' that have no counterpart (by id) in the current member set.
        list<ReplSetConfig::MemberCfg*> newOnes;
        // additive short-cuts the new config setup. If we are just adding a
        // node/nodes and nothing else is changing, this is additive. If it's
        // not a reconfig, we're not adding anything
        bool additive = reconf;
        // Set when an existing member's config differs in any way (tags included).
        bool updateConfigs = false;
        {
            // Number of members in 'c' that already exist in the current set (reconfig only).
            unsigned nfound = 0;
            // Number of entries in 'c' that refer to this node.
            int me = 0;
            for (vector<ReplSetConfig::MemberCfg>::iterator i = c.members.begin();
                    i != c.members.end();
                    i++) {
                
                ReplSetConfig::MemberCfg& m = *i;
                if (isSelf(m.h)) {
                    me++;
                }
                
                if (reconf) {
                    const Member *old = findById(m._id);
                    if (old) {
                        nfound++;
                        verify((int) old->id() == m._id);
                        // Any difference other than tags rules out the additive shortcut.
                        if (!old->config().isSameIgnoringTags(m)) {
                            additive = false;
                        }
                        if (!updateConfigs && old->config() != m) {
                            updateConfigs = true;
                        }
                    }
                    else {
                        newOnes.push_back(&m);
                    }
                }
            }
            if (me == 0) { // we're not in the config -- we must have been removed
                if (state().shunned()) {
                    // already took note of our ejection from the set
                    // so just sit tight and poll again
                    return false;
                }

                _members.orphanAll();

                // kill off rsHealthPoll threads (because they Know Too Much about our past)
                endOldHealthTasks();

                // close sockets to force clients to re-evaluate this member
                MessagingPort::closeAllSockets(0);

                // take note of our ejection
                changeState(MemberState::RS_SHUNNED);

                // go into holding pattern
                log() << "replSet info self not present in the repl set configuration:" << rsLog;
                log() << c.toString() << rsLog;

                loadConfig(txn);  // redo config from scratch
                return false; 
            }
            uassert(13302, "replSet error self appears twice in the repl set configuration", me<=1);

            // if we found different members than the original config, reload everything
            // (_cfg is only replaced further below, so config() is not yet the copy of 'c')
            if (reconf && config().members.size() != nfound)
                additive = false;
        }

        // If we are changing chaining rules, we don't want this to be an additive reconfig so that
        // the primary can step down and the sync targets change.
        // TODO: This can be removed once SERVER-5208 is fixed.
        if (reconf && config().chainingAllowed() != c.chainingAllowed()) {
            additive = false;
        }

        // Install a copy of the new configuration.
        // NOTE(review): the object previously pointed to by _cfg is not released here --
        // confirm whether that is intentional.
        _cfg = new ReplSetConfig(c);
        // config() is same thing but const, so we use that when we can for clarity below
        dassert(&config() == _cfg);
        verify(config().ok());
        verify(_name.empty() || _name == config()._id);
        _name = config()._id;
        verify(!_name.empty());
        // this is a shortcut for simple changes
        if (additive) {
            log() << "replSet info : additive change to configuration" << rsLog;
            if (updateConfigs) {
                // we have new configs for existing members, so we need to repopulate _members
                // with the most recent configs
                _members.orphanAll();

                // for logging
                // NOTE(review): 'members' is built below but not written to the log in this
                // branch.
                string members = "";

                // not setting _self to 0 as other threads use _self w/o locking
                int me = 0;
                for(vector<ReplSetConfig::MemberCfg>::const_iterator i = config().members.begin();
                    i != config().members.end(); i++) {
                    const ReplSetConfig::MemberCfg& m = *i;
                    Member *mi;
                    members += (members == "" ? "" : ", ") + m.h.toString();
                    if (isSelf(m.h)) {
                        // self may appear only once; it becomes _self instead of joining
                        // _members
                        verify(me++ == 0);
                        mi = new Member(m.h, m._id, &m, true);
                        setSelfTo(mi);
                    }
                    else {
                        mi = new Member(m.h, m._id, &m, false);
                        _members.push(mi);
                    }
                }
                // trigger a handshake to update the syncSource of our writeconcern information
                syncSourceFeedback.forwardSlaveHandshake();
            }

            // add any new members
            // (the MemberCfg pointers in newOnes refer to entries of the parameter 'c')
            for (list<ReplSetConfig::MemberCfg*>::const_iterator i = newOnes.begin();
                    i != newOnes.end();
                    i++) {
                ReplSetConfig::MemberCfg *m = *i;
                Member *mi = new Member(m->h, m->_id, m, false);

                // we will indicate that new members are up() initially so that we don't relinquish
                // our primary state because we can't (transiently) see a majority. they should be
                // up as we check that new members are up before getting here on reconfig anyway.
                mi->get_hbinfo().health = 0.1;

                _members.push(mi);
                startHealthTaskFor(mi);
            }

            // if we aren't creating new members, we may have to update the
            // groups for the current ones
            _cfg->updateMembers(_members);

            return true;
        }

        //
        // Full (re)load path.
        //

        // start with no members.  if this is a reconfig, drop the old ones.
        _members.orphanAll();

        endOldHealthTasks();
        
        // Remember the id of the current primary (if any) so it can be restored on the
        // rebuilt member objects below.
        int oldPrimaryId = -1;
        {
            const Member *p = box.getPrimary();
            if (p)
                oldPrimaryId = p->id();
        }
        forgetPrimary(txn);

        // not setting _self to 0 as other threads use _self w/o locking
        int me = 0;

        // For logging
        string members = "";

        // Rebuild the member objects from the stored configuration.
        for (vector<ReplSetConfig::MemberCfg>::const_iterator i = config().members.begin();
                i != config().members.end();
                i++) {
            const ReplSetConfig::MemberCfg& m = *i;
            Member *mi;
            members += (members == "" ? "" : ", ") + m.h.toString();
            if (isSelf(m.h)) {
                verify(me++ == 0);
                mi = new Member(m.h, m._id, &m, true);
                if (!reconf) {
                    log() << "replSet I am " << m.h.toString() << rsLog;
                }
                setSelfTo(mi);

                // Re-establish the primary if it was this node.
                if ((int)mi->id() == oldPrimaryId)
                    box.setSelfPrimary(mi);
            }
            else {
                mi = new Member(m.h, m._id, &m, false);
                _members.push(mi);
                // Re-establish the primary if it was this remote member.
                if ((int)mi->id() == oldPrimaryId)
                    box.setOtherPrimary(mi);
            }
        }

        if (me == 0){
            log() << "replSet warning did not detect own host in full reconfig, members "
                  << members << " config: " << c << rsLog;
        }
        else {
            // Do this after we've found ourselves, since _self needs
            // to be set before we can start the heartbeat tasks
            for (Member *mb = _members.head(); mb; mb=mb->next()) {
                startHealthTaskFor(mb);
            }
        }
        return true;
    }
Exemplo n.º 16
0
        /*
         * Boots the deployment service.
         *
         * Steps, in order:
         *  1. read the deployment directory from [deploy.service] deploy_dir, falling back to
         *     the "services" folder under the current app data dir;
         *  2. treat every key returned for [deploy.service.clusters] as a config section,
         *     create the scheduler named by its "factory" value and store it in _clusters
         *     under its "name" value;
         *  3. register the deploy / undeploy / service_list / service_info / cluster_list
         *     CLI commands, each one forwarding to the matching on_*_cli member.
         *
         * Returns ERR_CLUSTER_ALREADY_EXIST when a cluster name is already registered,
         * ERR_OBJECT_NOT_FOUND when the factory yields no scheduler, ERR_OK otherwise.
         */
        error_code deploy_svc_service_impl::start()
        {
            const std::string default_dir = utils::filesystem::path_combine(
                dsn_get_current_app_data_dir(), "services");
            _service_dir = dsn_config_get_value_string(
                "deploy.service",
                "deploy_dir",
                default_dir.c_str(),
                "where to put temporal deployment resources");

            // load clusters
            const char* cluster_sections[100];
            int sz = 100;
            int count = dsn_config_get_all_keys("deploy.service.clusters", cluster_sections, &sz);
            dassert(count <= 100, "too many clusters");

            for (int idx = 0; idx < count; idx++)
            {
                const char* section = cluster_sections[idx];

                std::string cluster_name =
                    dsn_config_get_value_string(section, "name", "", "cluster name");

                // a name may only be registered once
                if (get_cluster(cluster_name) != nullptr)
                {
                    derror("cluster %s already defined", cluster_name.c_str());
                    return ERR_CLUSTER_ALREADY_EXIST;
                }

                std::string factory_name = dsn_config_get_value_string(
                    section,
                    "factory",
                    "",
                    "factory name to create the target cluster scheduler");

                auto sched = ::dsn::utils::factory_store<cluster_scheduler>::create(
                    factory_name.c_str(), PROVIDER_TYPE_MAIN);

                if (sched == nullptr)
                {
                    derror("cluster type %s is not defined", factory_name.c_str());
                    return ERR_OBJECT_NOT_FOUND;
                }

                // the cluster_ex entry takes over the scheduler via its 'scheduler' member
                std::shared_ptr<cluster_ex> entry(new cluster_ex);
                entry->scheduler.reset(sched);
                entry->cluster.name = cluster_name;
                entry->cluster.type = sched->type();

                _clusters[cluster_name] = entry;
            }

            // Every CLI handler receives 'this' as its context pointer and simply forwards
            // the call to the corresponding member function.
            void* self = static_cast<void*>(this);

            _cli_deploy = dsn_cli_app_register(
                "deploy",
                "deploy deploy_request(in json format)",
                "deploy an app via our deployment service",
                self,
                [](void *context, int argc, const char **argv, dsn_cli_reply *reply)
                {
                    static_cast<deploy_svc_service_impl*>(context)
                        ->on_deploy_cli(context, argc, argv, reply);
                },
                __svc_cli_freeer__);

            _cli_undeploy = dsn_cli_app_register(
                "undeploy",
                "undeploy service_name(in json format)",
                "undeploy an app via our deployment service",
                self,
                [](void *context, int argc, const char **argv, dsn_cli_reply *reply)
                {
                    static_cast<deploy_svc_service_impl*>(context)
                        ->on_undeploy_cli(context, argc, argv, reply);
                },
                __svc_cli_freeer__);

            _cli_get_service_list = dsn_cli_app_register(
                "service_list",
                "service_list package_id(in json format)",
                "get service list of a package via our deployment service",
                self,
                [](void *context, int argc, const char **argv, dsn_cli_reply *reply)
                {
                    static_cast<deploy_svc_service_impl*>(context)
                        ->on_get_service_list_cli(context, argc, argv, reply);
                },
                __svc_cli_freeer__);

            _cli_get_service_info = dsn_cli_app_register(
                "service_info",
                "service_info service_name(in json format)",
                "get service info of a service via our deployment service",
                self,
                [](void *context, int argc, const char **argv, dsn_cli_reply *reply)
                {
                    static_cast<deploy_svc_service_impl*>(context)
                        ->on_get_service_info_cli(context, argc, argv, reply);
                },
                __svc_cli_freeer__);

            _cli_get_cluster_list = dsn_cli_app_register(
                "cluster_list",
                "cluster_list format(in json format)",
                "get cluster list with a specific format via our deployment service",
                self,
                [](void *context, int argc, const char **argv, dsn_cli_reply *reply)
                {
                    static_cast<deploy_svc_service_impl*>(context)
                        ->on_get_cluster_list_cli(context, argc, argv, reply);
                },
                __svc_cli_freeer__);

            return ERR_OK;
        }
Exemplo n.º 17
0
/*
 * Picks the next thing to run.
 *
 * Loops until a worker can be released:
 *  - if any worker is ready (a continuation that is ready, or a non-continuation worker
 *    whose queue reports pending items), one of them is chosen via dsn_random32, its
 *    'runnable' semaphore is released and the function returns;
 *  - otherwise the next batch of timed events is popped from _wheel, _time_ns is advanced
 *    to the batch timestamp under _lock, the batch is shuffled and every event is fired
 *    (app tasks are enqueued under their node scope, system tasks are invoked directly);
 *  - if there is nothing to do at all, sleeps for 100ms and retries.
 *
 * _is_scheduling is set on entry and cleared right before returning.
 */
void scheduler::schedule()
{
    _is_scheduling = true;

    check(); // check before schedule

    while (true)
    {
        // run ready workers whenever possible
        std::vector<int> ready_workers;
        for (auto& s : _threads)
        {
            if ((s->in_continuation && s->is_continuation_ready)
                || (!s->in_continuation && s->worker->queue()->approx_count() > 0)
                )
            {
                ready_workers.push_back(s->index);
            }
        }

        if (ready_workers.size() > 0)
        {
            int i = dsn_random32(0, (uint32_t)ready_workers.size() - 1);
            _threads[ready_workers[i]]->runnable.release();
            
            _is_scheduling = false;
            return;
        }

        // otherwise, run the timed tasks
        uint64_t ts = 0;
        auto events = _wheel.pop_next_events(ts);
        if (events)
        {
            {
                utils::auto_lock< ::dsn::utils::ex_lock> l(_lock);
                _time_ns = ts;
            }

            // randomize the events, and see
            // NOTE(review): std::random_shuffle was removed in C++17; it is kept here because
            // the shuffle draws from dsn_random32 and replacing it would change the order.
            std::random_shuffle(events->begin(), events->end(), [](int n) { return dsn_random32(0, n - 1); });

            // iterate by reference: the batch is owned by this function, so there is no need
            // to copy each event (and its callable) before firing it
            for (auto& e : *events)
            {
                if (e.app_task != nullptr)
                {
                    task* t = e.app_task;

                    {
                        node_scoper ns(t->node());
                        t->enqueue();
                    }

                    t->release_ref(); // added by previous t->enqueue from app
                }
                else
                {
                    dassert(e.system_task != nullptr, "app and system tasks cannot be both empty");
                    e.system_task();
                }
            }

            delete events;
            continue;
        }

        // wait a moment
        std::this_thread::sleep_for(std::chrono::milliseconds(100));
    }

    // not reached: the loop above only exits through the return in the ready-worker branch
}
        /*
         * Submits the POSIX AIO read or write described by aio_tsk.
         *
         * The aiocb is filled from the task's disk context and aio_completed is installed as
         * the SIGEV_THREAD completion callback.
         *
         * @param aio_tsk task whose aio() context describes file, buffer, size and offset.
         * @param async   true: return right after submission; false: block on the context's
         *                notify_event until it is signalled (presumably by aio_completed --
         *                confirm) and return the recorded result.
         * @param pbytes  optional out parameter; in the synchronous case receives the byte
         *                count recorded in the context. May be nullptr.
         * @return ERR_IO_PENDING after a successful asynchronous submission, the context's
         *         error code after a synchronous operation, or ERR_FILE_OPERATION_FAILED when
         *         the submission itself fails (asynchronous callers are additionally notified
         *         through complete_io).
         */
        error_code native_posix_aio_provider::aio_internal(aio_task* aio_tsk, bool async, __out_param uint32_t* pbytes /*= nullptr*/)
        {
            auto aio = (posix_disk_aio_context *)aio_tsk->aio();
            // start from "failed" so that an unhandled aio type can never be mistaken for a
            // successful submission
            int r = -1;

            aio->this_ = this;
            aio->cb.aio_fildes = static_cast<int>((ssize_t)aio->file);
            aio->cb.aio_buf = aio->buffer;
            aio->cb.aio_nbytes = aio->buffer_size;
            aio->cb.aio_offset = aio->file_offset;

            // set up callback
            aio->cb.aio_sigevent.sigev_notify = SIGEV_THREAD;
            aio->cb.aio_sigevent.sigev_notify_function = aio_completed;
            aio->cb.aio_sigevent.sigev_notify_attributes = nullptr;
            aio->cb.aio_sigevent.sigev_value.sival_ptr = aio;

            if (!async)
            {
                // synchronous callers wait on this event further below
                aio->evt = new utils::notify_event();
                aio->err = ERR_OK;
                aio->bytes = 0;
            }

            switch (aio->type)
            {
            case AIO_Read:
                r = aio_read(&aio->cb);
                break;
            case AIO_Write:
                r = aio_write(&aio->cb);
                break;
            default:
                dassert (false, "unknown aio type %d", static_cast<int>(aio->type));
                // if the assert does not stop us, report the request as invalid
                errno = EINVAL;
                break;
            }

            if (r != 0)
            {
                derror("file op failed, err = %d (%s). On FreeBSD, you may need to load"
                       " aio kernel module by running 'sudo kldload aio'.", errno, strerror(errno));

                if (async)
                {
                    complete_io(aio_tsk, ERR_FILE_OPERATION_FAILED, 0);
                }
                else
                {
                    delete aio->evt;
                    aio->evt = nullptr;
                }
                return ERR_FILE_OPERATION_FAILED;
            }
            else 
            {
                if (async)
                {
                    return ERR_IO_PENDING;
                }
                else
                {
                    aio->evt->wait();
                    delete aio->evt;
                    aio->evt = nullptr;
                    // pbytes is optional (defaults to nullptr), so only write through it
                    // when the caller supplied one
                    if (pbytes != nullptr)
                    {
                        *pbytes = aio->bytes;
                    }
                    return aio->err;
                }
            }
        }
Exemplo n.º 19
0
    /**
     * Validates a rename against the document rooted at 'root' and records what was found
     * in _preparedState.
     *
     * - If the source ("from") path cannot be fully resolved, the mod is flagged as a no-op
     *   and OK is returned, except that a PathNotViable status is passed on to the caller.
     * - Neither the source element nor the destination may have an array among its
     *   ancestors; otherwise BadValue is returned.
     * - A NonExistentPath result for the destination ("to") path is fine (nothing more is
     *   done about it here); any other failure status is returned as-is.
     *
     * On success both field refs are registered in 'execInfo' and execInfo->noOp is false.
     *
     * @param root          root element of the document being modified
     * @param matchedField  positional match; must be empty since rename does not support $
     * @param execInfo      out parameter receiving the field refs and the no-op flag
     */
    Status ModifierRename::prepare(mutablebson::Element root,
                                   const StringData& matchedField,
                                   ExecInfo* execInfo) {
        // Rename doesn't work with positional fields ($)
        dassert(matchedField.empty());

        _preparedState.reset(new PreparedState(root));

        // Locate the "from" field name in 'root', which must exist.
        size_t fromIdxFound;
        Status status = pathsupport::findLongestPrefix(_fromFieldRef,
                                                       root,
                                                       &fromIdxFound,
                                                       &_preparedState->fromElemFound);

        // If we can't find the full element in the from field then we can't do anything.
        if (!status.isOK()) {
            execInfo->noOp = true;
            _preparedState->fromElemFound = root.getDocument().end();

            // TODO: remove this special case from existing behavior
            if (status.code() == ErrorCodes::PathNotViable) {
                return status;
            }

            return Status::OK();
        }

        // Ensure no array in ancestry if what we found is not at the root. The walk stops at
        // the document root (or at an invalid element), so nothing happens when the parent
        // already is the root.
        mutablebson::Element curr = _preparedState->fromElemFound.parent();
        while (curr.ok() && (curr != curr.getDocument().root())) {
            if (curr.getType() == Array)
                return Status(ErrorCodes::BadValue,
                              str::stream() << "The source field cannot be an array element, '"
                              << _fromFieldRef.dottedField() << "' in doc with "
                              << findElementNamed(root.leftChild(), "_id").toString()
                              << " has an array field called '" << curr.getFieldName() << "'");
            curr = curr.parent();
        }

        // "To" side validation below

        status = pathsupport::findLongestPrefix(_toFieldRef,
                                                root,
                                                &_preparedState->toIdxFound,
                                                &_preparedState->toElemFound);

        // FindLongestPrefix may return not viable or any other error and then we cannot proceed.
        if (status.code() == ErrorCodes::NonExistentPath) {
            // Not an error condition as we will create the "to" path as needed.
        } else if (!status.isOK()) {
            return status;
        }

        // The destination exists only if the whole "to" path was resolved.
        const bool destExists = _preparedState->toElemFound.ok() &&
                                (_preparedState->toIdxFound == (_toFieldRef.numParts()-1));

        // Ensure no array in ancestry of "to" Element
        // Set to either parent, or node depending on if the full path element was found
        curr = (destExists ? _preparedState->toElemFound.parent() : _preparedState->toElemFound);
        if (curr != curr.getDocument().root()) {
            while (curr.ok()) {
                if (curr.getType() == Array)
                    // Report the destination path here: it is the "to" field whose ancestry
                    // contains the array.
                    return Status(ErrorCodes::BadValue,
                                  str::stream()
                                  << "The destination field cannot be an array element, '"
                                  << _toFieldRef.dottedField() << "' in doc with "
                                  << findElementNamed(root.leftChild(), "_id").toString()
                                  << " has an array field called '" << curr.getFieldName() << "'");
                curr = curr.parent();
            }
        }

        // We register interest in the field name. The driver needs this info to sort out if
        // there is any conflict among mods.
        execInfo->fieldRef[0] = &_fromFieldRef;
        execInfo->fieldRef[1] = &_toFieldRef;

        execInfo->noOp = false;

        return Status::OK();
    }
Exemplo n.º 20
0
 // Returns the _hasMinimumLoggedSeverity entry stored for 'component'
 // (presumably whether a minimum severity has been configured for it -- confirm).
 // In debug builds, asserts that 'component' lies in [0, LogComponent::kNumLogComponents).
 bool LogComponentSettings::hasMinimumLogSeverity(LogComponent component) const {
     dassert(int(component) >= 0 && int(component) < LogComponent::kNumLogComponents);
     return _hasMinimumLoggedSeverity[component];
 }
Exemplo n.º 21
0
// fromBSON to Key format
//
// Walks the elements of 'obj' and appends a compact encoding of each one to the builder 'b':
// a type byte (OR-ed with cHASMORE when another element follows) and then, for most types,
// the value bytes.  Whenever an element cannot be encoded this way (unsupported type, BinData
// subtype/length without a code, string longer than 255 bytes, long not exactly representable
// as a double, NaN) the constructor hands the whole object to traditional(obj) and returns.
// NOTE(review): traditional() is defined elsewhere; per the comment in the default case it
// presumably stores the object as plain BSON and discards what was appended so far - confirm.
KeyV1Owned::KeyV1Owned(const BSONObj& obj) {
    BSONObj::iterator i(obj);
    // Flag bits OR-ed into the type byte of the element currently being written.
    unsigned char bits = 0;
    while (1) {
        BSONElement e = i.next();
        // After next(), more() tells us whether another element follows this one.
        if (i.more())
            bits |= cHASMORE;
        switch (e.type()) {
            // Types below carry no payload: the type byte alone encodes the value.
            case MinKey:
                b.appendUChar(cminkey | bits);
                break;
            case jstNULL:
                b.appendUChar(cnull | bits);
                break;
            case MaxKey:
                b.appendUChar(cmaxkey | bits);
                break;
            case Bool:
                // true and false get distinct type bytes; no payload follows.
                b.appendUChar((e.boolean() ? ctrue : cfalse) | bits);
                break;
            case jstOID:
                // Type byte followed by the OID::kOIDSize raw bytes of the OID.
                b.appendUChar(coid | bits);
                b.appendBuf(e.__oid().view().view(), OID::kOIDSize);
                break;
            case BinData: {
                int t = e.binDataType();
                // 0-7 and 0x80 to 0x87 are supported by Key
                if ((t & 0x78) == 0 && t != ByteArrayDeprecated) {
                    int len;
                    const char* d = e.binData(len);
                    if (len <= BinDataLenMax) {
                        // A negative table entry means this length has no compact code.
                        int code = BinDataLengthToCode[len];
                        if (code >= 0) {
                            // Fold subtypes 0x80-0x87 into 0x08-0x0f so that the subtype can
                            // share one byte with the length code.
                            if (t >= 128)
                                t = (t - 128) | 0x08;
                            dassert((code & t) == 0);
                            b.appendUChar(cbindata | bits);
                            b.appendUChar(code | t);
                            b.appendBuf(d, len);
                            break;
                        }
                    }
                }
                // Subtype or length not representable in the compact form.
                traditional(obj);
                return;
            }
            case Date:
                b.appendUChar(cdate | bits);
                b.appendStruct(e.date());
                break;
            case String: {
                // The type byte is appended before the length check; the fallback below
                // relies on traditional() to deal with what is already in 'b'.
                b.appendUChar(cstring | bits);
                // note we do not store the terminating null, to save space.
                unsigned x = (unsigned)e.valuestrsize() - 1;
                // The length is stored in a single byte, so longer strings cannot be encoded.
                if (x > 255) {
                    traditional(obj);
                    return;
                }
                b.appendUChar(x);
                b.appendBuf(e.valuestr(), x);
                break;
            }
            case NumberInt:
                // Stored as a double, tagged with cint.
                b.appendUChar(cint | bits);
                b.appendNum((double)e._numberInt());
                break;
            case NumberLong: {
                long long n = e._numberLong();
                // m == 2^53; only values strictly between -m and m are stored as a double.
                long long m = 2LL << 52;
                DEV {
                    // Check that the largest accepted magnitude round-trips through double.
                    long long d = m - 1;
                    verify(((long long)((double)-d)) == -d);
                }
                if (n >= m || n <= -m) {
                    // can't represent exactly as a double
                    traditional(obj);
                    return;
                }
                b.appendUChar(clong | bits);
                b.appendNum((double)n);
                break;
            }
            case NumberDouble: {
                double d = e._numberDouble();
                // NaN values are never stored in the compact form.
                if (isNaN(d)) {
                    traditional(obj);
                    return;
                }
                b.appendUChar(cdouble | bits);
                b.appendNum(d);
                break;
            }
            default:
                // if other types involved, store as traditional BSON
                traditional(obj);
                return;
        }
        if (!i.more())
            break;
        // Start the next element with a clean set of flag bits.
        bits = 0;
    }
    // Every element was encoded compactly: expose the builder's buffer as the key data.
    _keyData = (const unsigned char*)b.buf();
    dassert(b.len() == dataSize());  // check datasize method is correct
    dassert((*_keyData & cNOTUSED) == 0);
}
Exemplo n.º 22
0
 /**
  * Returns the entry of _minimumLoggedSeverity that belongs to 'component', converted to a
  * LogSeverity through LogSeverity::cast().
  *
  * The dassert checks that 'component', converted to int, lies in
  * [0, LogComponent::kNumLogComponents) before it is used as an array index.
  */
 LogSeverity LogComponentSettings::getMinimumLogSeverity(LogComponent component) const {
     dassert(int(component) >= 0 && int(component) < LogComponent::kNumLogComponents);
     return LogSeverity::cast(_minimumLoggedSeverity[component]);
 }
Exemplo n.º 23
0
/**
 * Adds every shard id held in _shardIds to the caller-supplied set 'all'.
 *
 * Ids already present in 'all' are left in place.  'all' must not be null (checked by
 * dassert).
 */
void ChunkManager::getAllShardIds(set<ShardId>* all) const {
    dassert(all);

    for (const auto& shardId : _shardIds) {
        all->insert(shardId);
    }
}
Exemplo n.º 24
0
    /**
     * Executes a client's batched write and fills in 'clientResponse' with the result.
     *
     * The request is wrapped in a BatchWriteOp and processed in rounds until the op reports
     * that it is finished.  Each round:
     *   1. targets the outstanding writes into child batches via _targeter,
     *   2. sends the child batches through _dispatcher (at most one batch per host at a
     *      time) and notes every response or dispatch error on the BatchWriteOp,
     *   3. asks the targeter to refresh if needed, and
     *   4. aborts the op with NoProgressMade if more than kMaxRoundsWithoutProgress
     *      consecutive rounds completed no additional ops, did not change the targeter and
     *      saw no shard reporting changing metadata.
     *
     * Counters in _stats (numTargetErrors, numResolveErrors, numStaleBatches, numRounds) and
     * the per-host write info are updated along the way.
     */
    void BatchWriteExec::executeBatch( const BatchedCommandRequest& clientRequest,
                                       BatchedCommandResponse* clientResponse ) {

        BatchWriteOp batchOp;
        batchOp.initClientRequest( &clientRequest );

        // Current batch status
        bool refreshedTargeter = false;
        int rounds = 0;
        int numCompletedOps = 0;
        int numRoundsWithoutProgress = 0;

        while ( !batchOp.isFinished() ) {

            //
            // Get child batches to send using the targeter
            //
            // Targeting errors can be caused by remote metadata changing (the collection could have
            // been dropped and recreated, for example with a new shard key).  If a remote metadata
            // change occurs *before* a client sends us a batch, we need to make sure that we don't
            // error out just because we're staler than the client - otherwise mongos will be have
            // unpredictable behavior.
            //
            // (If a metadata change happens *during* or *after* a client sends us a batch, however,
            // we make no guarantees about delivery.)
            //
            // For this reason, we don't record targeting errors until we've refreshed our targeting
            // metadata at least once *after* receiving the client batch - at that point, we know:
            //
            // 1) our new metadata is the same as the metadata when the client sent a batch, and so
            //    targeting errors are real.
            // OR
            // 2) our new metadata is a newer version than when the client sent a batch, and so
            //    the metadata must have changed after the client batch was sent.  We don't need to
            //    deliver in this case, since for all the client knows we may have gotten the batch
            //    exactly when the metadata changed.
            //

            OwnedPointerVector<TargetedWriteBatch> childBatchesOwned;
            vector<TargetedWriteBatch*>& childBatches = childBatchesOwned.mutableVector();

            // If we've already had a targeting error, we've refreshed the metadata once and can
            // record target errors definitively.
            bool recordTargetErrors = refreshedTargeter;
            Status targetStatus = batchOp.targetBatch( *_targeter,
                                                       recordTargetErrors,
                                                       &childBatches );
            if ( !targetStatus.isOK() ) {
                // Don't do anything until a targeter refresh
                _targeter->noteCouldNotTarget();
                refreshedTargeter = true;
                ++_stats->numTargetErrors;
                dassert( childBatches.size() == 0u );
            }

            //
            // Send all child batches
            //

            // numToSend shrinks when a batch cannot be resolved to a host; numSent grows by
            // the number of batches dispatched in each pass of the loop below.
            size_t numSent = 0;
            size_t numToSend = childBatches.size();
            bool remoteMetadataChanging = false;
            while ( numSent != numToSend ) {

                // Collect batches out on the network, mapped by endpoint
                HostBatchMap pendingBatches;

                //
                // Send side
                //

                // Get as many batches as we can at once
                for ( vector<TargetedWriteBatch*>::iterator it = childBatches.begin();
                    it != childBatches.end(); ++it ) {

                    //
                    // Collect the info needed to dispatch our targeted batch
                    //

                    TargetedWriteBatch* nextBatch = *it;
                    // If the batch is NULL, we sent it previously, so skip
                    if ( nextBatch == NULL ) continue;

                    // Figure out what host we need to dispatch our targeted batch
                    ConnectionString shardHost;
                    Status resolveStatus = _resolver->chooseWriteHost( nextBatch->getEndpoint()
                                                                           .shardName,
                                                                       &shardHost );
                    if ( !resolveStatus.isOK() ) {

                        ++_stats->numResolveErrors;

                        // Record a resolve failure
                        // TODO: It may be necessary to refresh the cache if stale, or maybe just
                        // cancel and retarget the batch
                        WriteErrorDetail error;
                        buildErrorFrom( resolveStatus, &error );
                        batchOp.noteBatchError( *nextBatch, error );

                        // We're done with this batch
                        // NOTE(review): the slot is cleared without a visible delete of
                        // nextBatch - confirm who frees it once it leaves childBatchesOwned.
                        *it = NULL;
                        --numToSend;
                        continue;
                    }

                    // If we already have a batch for this host, wait until the next time
                    HostBatchMap::iterator pendingIt = pendingBatches.find( shardHost );
                    if ( pendingIt != pendingBatches.end() ) continue;

                    //
                    // We now have all the info needed to dispatch the batch
                    //

                    BatchedCommandRequest request( clientRequest.getBatchType() );
                    batchOp.buildBatchRequest( *nextBatch, &request );

                    // Internally we use full namespaces for request/response, but we send the
                    // command to a database with the collection name in the request.
                    NamespaceString nss( request.getNS() );
                    request.setNS( nss.coll() );

                    _dispatcher->addCommand( shardHost, nss.db(), request );

                    // Indicate we're done by setting the batch to NULL
                    // We'll only get duplicate hostEndpoints if we have broadcast and non-broadcast
                    // endpoints for the same host, so this should be pretty efficient without
                    // moving stuff around.
                    *it = NULL;

                    // Recv-side is responsible for cleaning up the nextBatch when used
                    // NOTE(review): no delete of the batch is visible in the recv side of this
                    // function - confirm that noteBatchResponse/noteBatchError or HostBatchMap
                    // releases it.
                    pendingBatches.insert( make_pair( shardHost, nextBatch ) );
                }

                // Send them all out
                _dispatcher->sendAll();
                numSent += pendingBatches.size();

                //
                // Recv side
                //

                while ( _dispatcher->numPending() > 0 ) {

                    // Get the response
                    ConnectionString shardHost;
                    BatchedCommandResponse response;
                    Status dispatchStatus = _dispatcher->recvAny( &shardHost, &response );

                    // Get the TargetedWriteBatch to find where to put the response
                    dassert( pendingBatches.find( shardHost ) != pendingBatches.end() );
                    TargetedWriteBatch* batch = pendingBatches.find( shardHost )->second;

                    if ( dispatchStatus.isOK() ) {

                        TrackedErrors trackedErrors;
                        trackedErrors.startTracking( ErrorCodes::StaleShardVersion );

                        // Dispatch was ok, note response
                        batchOp.noteBatchResponse( *batch, response, &trackedErrors );

                        // Note if anything was stale
                        const vector<ShardError*>& staleErrors =
                            trackedErrors.getErrors( ErrorCodes::StaleShardVersion );

                        if ( staleErrors.size() > 0 ) {
                            noteStaleResponses( staleErrors, _targeter );
                            ++_stats->numStaleBatches;
                        }

                        // Remember if the shard is actively changing metadata right now
                        if ( isShardMetadataChanging( staleErrors ) ) {
                            remoteMetadataChanging = true;
                        }

                        // Remember that we successfully wrote to this shard
                        // NOTE: This will record lastOps for shards where we actually didn't update
                        // or delete any documents, which preserves old behavior but is conservative
                        _stats->noteWriteAt( shardHost,
                                             response.isLastOpSet() ? 
                                             response.getLastOp() : OpTime(),
                                             response.isElectionIdSet() ?
                                             response.getElectionId() : OID());
                    }
                    else {

                        // Error occurred dispatching, note it

                        stringstream msg;
                        msg << "write results unavailable from " << shardHost.toString()
                            << causedBy( dispatchStatus.toString() );

                        WriteErrorDetail error;
                        buildErrorFrom( Status( ErrorCodes::RemoteResultsUnavailable, msg.str() ),
                                        &error );
                        batchOp.noteBatchError( *batch, error );
                    }
                }
            }

            ++rounds;
            ++_stats->numRounds;

            // If we're done, get out
            if ( batchOp.isFinished() )
                break;

            // MORE WORK TO DO

            //
            // Refresh the targeter if we need to (no-op if nothing stale)
            //

            bool targeterChanged = false;
            Status refreshStatus = _targeter->refreshIfNeeded( &targeterChanged );

            if ( !refreshStatus.isOK() ) {

                // It's okay if we can't refresh, we'll just record errors for the ops if
                // needed.
                warning() << "could not refresh targeter" << causedBy( refreshStatus.reason() )
                          << endl;
            }

            //
            // Ensure progress is being made toward completing the batch op
            //

            // A round counts as "no progress" only if no additional op completed, the
            // targeter did not change and no shard reported changing metadata.
            int currCompletedOps = batchOp.numWriteOpsIn( WriteOpState_Completed );
            if ( currCompletedOps == numCompletedOps && !targeterChanged
                 && !remoteMetadataChanging ) {
                ++numRoundsWithoutProgress;
            }
            else {
                numRoundsWithoutProgress = 0;
            }
            numCompletedOps = currCompletedOps;

            if ( numRoundsWithoutProgress > kMaxRoundsWithoutProgress ) {

                // Give up: abort the remaining ops with a NoProgressMade error.
                stringstream msg;
                msg << "no progress was made executing batch write op in " << clientRequest.getNS()
                    << " after " << kMaxRoundsWithoutProgress << " rounds (" << numCompletedOps
                    << " ops completed in " << rounds << " rounds total)";

                WriteErrorDetail error;
                buildErrorFrom( Status( ErrorCodes::NoProgressMade, msg.str() ), &error );
                batchOp.abortBatch( error );
                break;
            }
        }

        batchOp.buildClientResponse( clientResponse );
    }
Exemplo n.º 25
0
    /**
     * Reports a failed send() on this socket: logs the failure at _logLevel and throws a
     * SocketException.  This function never returns normally.
     *
     * The platform error code is read first (WSAGetLastError() on Windows, errno elsewhere).
     * If it is a timeout code and _timeout is non-zero, SEND_TIMEOUT is thrown; every other
     * case throws SEND_ERROR.
     *
     * @param ret      return value of the failed send(); not examined here.
     * @param context  text inserted into the log message after "Socket ".
     */
    void Socket::handleSendError(int ret, const char* context) {

        // Grab the error code before anything else can overwrite it, and classify it.
#if defined(_WIN32)
        const int mongo_errno = WSAGetLastError();
        const bool sendTimedOut = ( mongo_errno == WSAETIMEDOUT ) && _timeout != 0;
#else
        const int mongo_errno = errno;
        const bool sendTimedOut =
            ( mongo_errno == EAGAIN || mongo_errno == EWOULDBLOCK ) && _timeout != 0;
#endif

        if ( !sendTimedOut ) {
            // Any non-timeout failure is reported with the error description.
            LOG(_logLevel) << "Socket " << context << " send() "
                           << errnoWithDescription(mongo_errno) << ' ' << remoteString() << endl;
            throw SocketException(SocketException::SEND_ERROR , remoteString());
        }

        LOG(_logLevel) << "Socket " << context << " send() timed out " << remoteString() << endl;
        throw SocketException(SocketException::SEND_TIMEOUT , remoteString());
    }

    /**
     * Reports a failed or zero-length recv() on this socket.
     *
     *  - ret == 0: logs at level 3 and throws SocketException::CLOSED.
     *  - EINTR (non-Windows, when defined): logs and returns normally so the caller can
     *    retry.  This is the only path that returns.
     *  - timeout code while _timeout > 0: throws SocketException::RECV_TIMEOUT.
     *  - anything else: throws SocketException::RECV_ERROR.
     *
     * On non-Windows platforms both EAGAIN and EWOULDBLOCK are treated as timeout codes.
     * POSIX allows recv() to report either one, and handleSendError() already accepts both;
     * checking only EAGAIN would misreport a timeout as RECV_ERROR where the values differ.
     *
     * @param ret  return value of recv(); 0 means the peer closed, otherwise assumed < 0.
     * @param len  requested length; not examined here.
     */
    void Socket::handleRecvError(int ret, int len) {
        if (ret == 0) {
            LOG(3) << "Socket recv() conn closed? " << remoteString() << endl;
            throw SocketException(SocketException::CLOSED , remoteString());
        }

        // ret < 0
#if defined(_WIN32)
        // Windows
        const int e = WSAGetLastError();
        const bool isTimeoutCode = (e == EAGAIN || e == WSAETIMEDOUT);
#else
        const int e = errno;
# if defined(EINTR)
        if (e == EINTR) {
            LOG(_logLevel) << "EINTR returned from recv(), retrying";
            return;
        }
# endif
        const bool isTimeoutCode = (e == EAGAIN || e == EWOULDBLOCK);
#endif

        if (isTimeoutCode && _timeout > 0) {
            // this is a timeout
            LOG(_logLevel) << "Socket recv() timeout  " << remoteString() <<endl;
            throw SocketException(SocketException::RECV_TIMEOUT, remoteString());
        }

        LOG(_logLevel) << "Socket recv() " <<
            errnoWithDescription(e) << " " << remoteString() <<endl;
        throw SocketException(SocketException::RECV_ERROR , remoteString());
    }

    /**
     * Forwards 'secs' and this socket's descriptor (_fd) to setSockTimeouts().
     * Nothing else is updated here; in particular _timeout is not assigned.
     * NOTE(review): the unit (seconds) is taken from the parameter name - confirm against
     * setSockTimeouts().
     */
    void Socket::setTimeout( double secs ) {
        setSockTimeouts( _fd, secs );
    }

    // Minimum idle time, in seconds, before isStillConnected() polls the socket again.
    //
    // <positive value> : secs to wait between stillConnected checks
    // 0 : always check
    // <negative value> (e.g. -1) : never check
    //
    // TODO: allow modification?
    const int Socket::errorPollIntervalSecs( 5 );

    // Patch to allow better tolerance of flaky network connections that get broken
    // while we aren't looking.
    // TODO: Remove when better async changes come.
    //
    // isStillConnected() polls the socket at max every Socket::errorPollIntervalSecs to determine
    // if any disconnection-type events have happened on the socket.
    //
    // Returns true when the check is skipped (negative interval, poll unsupported, checked too
    // recently), when poll reports no events, or when poll itself fails.  Returns false when
    // the descriptor is -1, when the notStillConnected fail point is active, or when poll
    // reports any event on the socket; the kind of event is logged before returning.
    bool Socket::isStillConnected() {

        if (MONGO_FAIL_POINT(notStillConnected)) { return false; }

        if (_fd == -1) {
            // According to the man page, poll will respond with POLLNVAL for invalid or
            // unopened descriptors, but it doesn't seem to be properly implemented in
            // some platforms - it can return 0 events and 0 for revent. Hence this workaround.
            return false;
        }

        if ( errorPollIntervalSecs < 0 ) return true;
        if ( ! isPollSupported() ) return true; // nothing we can do

        time_t now = time( 0 );
        time_t idleTimeSecs = now - _lastValidityCheckAtSecs;

        // Only check once every errorPollIntervalSecs (currently 5) secs
        if ( idleTimeSecs < errorPollIntervalSecs ) return true;
        // Reset our timer, we're checking the connection
        _lastValidityCheckAtSecs = now;

        // It's been long enough, poll to see if our socket is still connected

        pollfd pollInfo;
        pollInfo.fd = _fd;
        // We only care about reading the EOF message on clean close (and errors)
        pollInfo.events = POLLIN;

        // Poll( info[], size, timeout ) - timeout == 0 => nonblocking
        int nEvents = socketPoll( &pollInfo, 1, 0 );

        LOG( 2 ) << "polling for status of connection to " << remoteString()
                 << ", " << ( nEvents == 0 ? "no events" :
                              nEvents == -1 ? "error detected" :
                                               "event detected" ) << endl;

        if ( nEvents == 0 ) {
            // No events incoming, return still connected AFAWK
            return true;
        }
        else if ( nEvents < 0 ) {
            // Poll itself failed, this is weird, warn and log errno
            // NOTE(review): errnoWithDescription() is evaluated after the LOG( 2 ) statement
            // above - confirm that logging cannot overwrite the error code left by poll.
            warning() << "Socket poll() failed during connectivity check"
                      << " (idle " << idleTimeSecs << " secs,"
                      << " remote host " << remoteString() << ")"
                      << causedBy(errnoWithDescription()) << endl;

            // Return true since it's not clear that we're disconnected.
            return true;
        }

        // Exactly one descriptor was polled, so a positive result must be 1.
        dassert( nEvents == 1 );
        dassert( pollInfo.revents > 0 );

        // Return false at this point, some event happened on the socket, but log what the
        // actual event was.

        if ( pollInfo.revents & POLLIN ) {

            // There shouldn't really be any data to recv here, so make sure this
            // is a clean hangup.

            const int testBufLength = 1024;
            char testBuf[testBufLength];

            int recvd = ::recv( _fd, testBuf, testBufLength, portRecvFlags );

            if ( recvd < 0 ) {
                // An error occurred during recv, warn and log errno
                warning() << "Socket recv() failed during connectivity check"
                          << " (idle " << idleTimeSecs << " secs,"
                          << " remote host " << remoteString() << ")"
                          << causedBy(errnoWithDescription()) << endl;
            }
            else if ( recvd > 0 ) {
                // We got nonzero data from this socket, very weird?
                // Log and warn at runtime, log and abort at devtime
                // TODO: Dump the data to the log somehow?
                error() << "Socket found pending " << recvd
                        << " bytes of data during connectivity check"
                        << " (idle " << idleTimeSecs << " secs,"
                        << " remote host " << remoteString() << ")" << endl;
                dassert( false );
            }
            else {
                // recvd == 0, socket closed remotely, just return false
                LOG( 0 ) << "Socket closed remotely, no longer connected"
                         << " (idle " << idleTimeSecs << " secs,"
                         << " remote host " << remoteString() << ")" << endl;
            }
        }
        else if ( pollInfo.revents & POLLHUP ) {
            // A hangup has occurred on this socket
            LOG( 0 ) << "Socket hangup detected, no longer connected" << " (idle "
                         << idleTimeSecs << " secs," << " remote host " << remoteString() << ")"
                         << endl;
        }
        else if ( pollInfo.revents & POLLERR ) {
            // An error has occurred on this socket
            LOG( 0 ) << "Socket error detected, no longer connected" << " (idle "
                         << idleTimeSecs << " secs," << " remote host " << remoteString() << ")"
                         << endl;
        }
        else if ( pollInfo.revents & POLLNVAL ) {
            // Socket descriptor itself is weird
            // Log and warn at runtime, log and abort at devtime
            error() << "Socket descriptor detected as invalid"
                    << " (idle " << idleTimeSecs << " secs,"
                    << " remote host " << remoteString() << ")" << endl;
            dassert( false );
        }
        else {
            // Don't know what poll is saying here
            // Log and warn at runtime, log and abort at devtime
            error() << "Socket had unknown event (" << static_cast<int>(pollInfo.revents) << ")"
                    << " (idle " << idleTimeSecs << " secs,"
                    << " remote host " << remoteString() << ")" << endl;
            dassert( false );
        }

        return false;
    }

#if defined(_WIN32)
    // Initializes Winsock (version 2.2) at startup; logs and aborts the process if that fails.
    //
    // WSAStartup() returns its error code directly, and the Winsock documentation says
    // WSAGetLastError() should not be used after a failed WSAStartup().  The returned code is
    // therefore passed to errnoWithDescription() explicitly instead of relying on the
    // thread's last-error value.
    MONGO_INITIALIZER(SockWSAStartup)(InitializerContext * context) {
        WSADATA d;
        const int startupError = WSAStartup(MAKEWORD(2,2), &d);
        if ( startupError != 0 ) {
            log() << "ERROR: wsastartup failed " << errnoWithDescription(startupError) << endl;
            abort();
        }

        return Status::OK();
    }
Exemplo n.º 26
0
/**
 * Builds a range from the two bounds.  Both arguments are taken by value and moved into
 * _minKey and _maxKey.
 *
 * The dassert checks that _minKey orders strictly before _maxKey when the comparison is
 * evaluated with SimpleBSONObjComparator::kInstance.
 */
ChunkRange::ChunkRange(BSONObj minKey, BSONObj maxKey)
    : _minKey(std::move(minKey)), _maxKey(std::move(maxKey)) {
    dassert(SimpleBSONObjComparator::kInstance.evaluate(_minKey < _maxKey));
}
Exemplo n.º 27
0
 /**
  * Inserts every pointer in 'fields' into _fieldSet.
  *
  * The set is expected to be empty on entry (checked by dassert).
  */
 void FieldRefSet::fillFrom(const std::vector<FieldRef*>& fields) {
     dassert(_fieldSet.empty());
     for (std::vector<FieldRef*>::const_iterator fieldIt = fields.begin();
          fieldIt != fields.end();
          ++fieldIt) {
         _fieldSet.insert(*fieldIt);
     }
 }
Exemplo n.º 28
0
    UpdateResult update(
            OperationContext* txn,
            Database* db,
            const UpdateRequest& request,
            OpDebug* opDebug,
            UpdateDriver* driver,
            CanonicalQuery* cq) {

        LOG(3) << "processing update : " << request;

        std::auto_ptr<CanonicalQuery> cqHolder(cq);
        const NamespaceString& nsString = request.getNamespaceString();
        UpdateLifecycle* lifecycle = request.getLifecycle();

        Collection* collection = db->getCollection(nsString.ns());

        validateUpdate(nsString.ns().c_str(), request.getUpdates(), request.getQuery());


        // TODO: This seems a bit circuitious.
        opDebug->updateobj = request.getUpdates();

        if (lifecycle) {
            lifecycle->setCollection(collection);
            driver->refreshIndexKeys(lifecycle->getIndexKeys());
        }

        Runner* rawRunner;
        Status status = cq ?
            getRunner(collection, cqHolder.release(), &rawRunner) :
            getRunner(collection, nsString.ns(), request.getQuery(), &rawRunner, &cq);
        uassert(17243,
                "could not get runner " + request.getQuery().toString() + "; " + causedBy(status),
                status.isOK());

        // Create the runner and setup all deps.
        auto_ptr<Runner> runner(rawRunner);

        // Register Runner with ClientCursor
        const ScopedRunnerRegistration safety(runner.get());

        //
        // We'll start assuming we have one or more documents for this update. (Otherwise,
        // we'll fall-back to insert case (if upsert is true).)
        //

        // We are an update until we fall into the insert case below.
        driver->setContext(ModifierInterface::ExecInfo::UPDATE_CONTEXT);

        int numMatched = 0;

        // If the update was in-place, we may see it again.  This only matters if we're doing
        // a multi-update; if we're not doing a multi-update we stop after one update and we
        // won't see any more docs.
        //
        // For example: If we're scanning an index {x:1} and performing {$inc:{x:5}}, we'll keep
        // moving the document forward and it will continue to reappear in our index scan.
        // Unless the index is multikey, the underlying query machinery won't de-dup.
        //
        // If the update wasn't in-place we may see it again.  Our query may return the new
        // document and we wouldn't want to update that.
        //
        // So, no matter what, we keep track of where the doc wound up.
        typedef unordered_set<DiskLoc, DiskLoc::Hasher> DiskLocSet;
        const scoped_ptr<DiskLocSet> updatedLocs(request.isMulti() ? new DiskLocSet : NULL);

        // Reset these counters on each call. We might re-enter this function to retry this
        // update if we throw a page fault exception below, and we rely on these counters
        // reflecting only the actions taken locally. In particlar, we must have the no-op
        // counter reset so that we can meaningfully comapre it with numMatched above.
        opDebug->nscanned = 0;
        opDebug->nscannedObjects = 0;
        opDebug->nModified = 0;

        // Get the cached document from the update driver.
        mutablebson::Document& doc = driver->getDocument();
        mutablebson::DamageVector damages;

        // Used during iteration of docs
        BSONObj oldObj;

        // Get first doc, and location
        Runner::RunnerState state = Runner::RUNNER_ADVANCED;

        uassert(ErrorCodes::NotMaster,
                mongoutils::str::stream() << "Not primary while updating " << nsString.ns(),
                !request.shouldCallLogOp() || isMasterNs(nsString.ns().c_str()));

        while (true) {
            // Get next doc, and location
            DiskLoc loc;
            state = runner->getNext(&oldObj, &loc);

            if (state != Runner::RUNNER_ADVANCED) {
                if (state == Runner::RUNNER_EOF) {
                    // We have reached the logical end of the loop, so do yielding recovery
                    break;
                }
                else {
                    uassertStatusOK(Status(ErrorCodes::InternalError,
                                           str::stream() << " Update query failed -- "
                                                         << Runner::statestr(state)));
                }
            }

            // We fill this with the new locs of moved doc so we don't double-update.
            if (updatedLocs && updatedLocs->count(loc) > 0) {
                continue;
            }

            // We count how many documents we scanned even though we may skip those that are
            // deemed duplicated. The final 'numMatched' and 'nscanned' numbers may differ for
            // that reason.
            // TODO: Do we want to pull this out of the underlying query plan?
            opDebug->nscanned++;

            // Found a matching document
            opDebug->nscannedObjects++;
            numMatched++;

            // Ask the driver to apply the mods. It may be that the driver can apply those "in
            // place", that is, some values of the old document just get adjusted without any
            // change to the binary layout on the bson layer. It may be that a whole new
            // document is needed to accomodate the new bson layout of the resulting document.
            doc.reset(oldObj, mutablebson::Document::kInPlaceEnabled);
            BSONObj logObj;


            FieldRefSet updatedFields;

            Status status = Status::OK();
            if (!driver->needMatchDetails()) {
                // If we don't need match details, avoid doing the rematch
                status = driver->update(StringData(), &doc, &logObj, &updatedFields);
            }
            else {
                // If there was a matched field, obtain it.
                MatchDetails matchDetails;
                matchDetails.requestElemMatchKey();

                dassert(cq);
                verify(cq->root()->matchesBSON(oldObj, &matchDetails));

                string matchedField;
                if (matchDetails.hasElemMatchKey())
                    matchedField = matchDetails.elemMatchKey();

                // TODO: Right now, each mod checks in 'prepare' that if it needs positional
                // data, that a non-empty StringData() was provided. In principle, we could do
                // that check here in an else clause to the above conditional and remove the
                // checks from the mods.

                status = driver->update(matchedField, &doc, &logObj, &updatedFields);
            }

            if (!status.isOK()) {
                uasserted(16837, status.reason());
            }

            // Ensure _id exists and is first
            uassertStatusOK(ensureIdAndFirst(doc));

            // If the driver applied the mods in place, we can ask the mutable for what
            // changed. We call those changes "damages". :) We use the damages to inform the
            // journal what was changed, and then apply them to the original document
            // ourselves. If, however, the driver applied the mods out of place, we ask it to
            // generate a new, modified document for us. In that case, the file manager will
            // take care of the journaling details for us.
            //
            // This code flow is admittedly odd. But, right now, journaling is baked in the file
            // manager. And if we aren't using the file manager, we have to do jounaling
            // ourselves.
            bool docWasModified = false;
            BSONObj newObj;
            const char* source = NULL;
            bool inPlace = doc.getInPlaceUpdates(&damages, &source);

            // If something changed in the document, verify that no immutable fields were changed
            // and data is valid for storage.
            if ((!inPlace || !damages.empty()) ) {
                if (!(request.isFromReplication() || request.isFromMigration())) {
                    const std::vector<FieldRef*>* immutableFields = NULL;
                    if (lifecycle)
                        immutableFields = lifecycle->getImmutableFields();

                    uassertStatusOK(validate(oldObj,
                                             updatedFields,
                                             doc,
                                             immutableFields,
                                             driver->modOptions()) );
                }
            }

            // Save state before making changes
            runner->saveState();

            if (inPlace && !driver->modsAffectIndices()) {

                // If a set of modifiers were all no-ops, we are still 'in place', but there is
                // no work to do, in which case we want to consider the object unchanged.
                if (!damages.empty() ) {
                    collection->updateDocumentWithDamages( txn, loc, source, damages );
                    docWasModified = true;
                    opDebug->fastmod = true;
                }

                newObj = oldObj;
            }
            else {

                // The updates were not in place. Apply them through the file manager.
                newObj = doc.getObject();
                uassert(17419,
                        str::stream() << "Resulting document after update is larger than "
                                      << BSONObjMaxUserSize,
                        newObj.objsize() <= BSONObjMaxUserSize);
                StatusWith<DiskLoc> res = collection->updateDocument(txn,
                                                                     loc,
                                                                     newObj,
                                                                     true,
                                                                     opDebug);
                uassertStatusOK(res.getStatus());
                DiskLoc newLoc = res.getValue();
                docWasModified = true;

                // If the document moved, we might see it again in a collection scan (maybe it's
                // a document after our current document).
                //
                // If the document is indexed and the mod changes an indexed value, we might see it
                // again.  For an example, see the comment above near declaration of updatedLocs.
                if (updatedLocs && (newLoc != loc || driver->modsAffectIndices())) {
                    updatedLocs->insert(newLoc);
                }
            }

            // Restore state after modification
            uassert(17278,
                    "Update could not restore runner state after updating a document.",
                    runner->restoreState());

            // Call logOp if requested.
            if (request.shouldCallLogOp() && !logObj.isEmpty()) {
                BSONObj idQuery = driver->makeOplogEntryQuery(newObj, request.isMulti());
                logOp(txn, "u", nsString.ns().c_str(), logObj , &idQuery,
                      NULL, request.isFromMigration());
            }

            // Only record doc modifications if they wrote (exclude no-ops)
            if (docWasModified)
                opDebug->nModified++;

            if (!request.isMulti()) {
                break;
            }

            // Opportunity for journaling to write during the update.
            txn->recoveryUnit()->commitIfNeeded();
        }

        // TODO: Can this be simplified?
        if ((numMatched > 0) || (numMatched == 0 && !request.isUpsert()) ) {
            opDebug->nMatched = numMatched;
            return UpdateResult(numMatched > 0 /* updated existing object(s) */,
                                !driver->isDocReplacement() /* $mod or obj replacement */,
                                opDebug->nModified /* number of modified docs, no no-ops */,
                                numMatched /* # of docs matched/updated, even no-ops */,
                                BSONObj());
        }

        //
        // We haven't found any existing document so an insert is done
        // (upsert is true).
        //
        opDebug->upsert = true;

        // Since this is an insert (no docs found and upsert:true), we will be logging it
        // as an insert in the oplog. We don't need the driver's help to build the
        // oplog record, then. We also set the context of the update driver to the INSERT_CONTEXT.
        // Some mods may only work in that context (e.g. $setOnInsert).
        driver->setLogOp(false);
        driver->setContext(ModifierInterface::ExecInfo::INSERT_CONTEXT);

        // Reset the document we will be writing to
        doc.reset();

        // This remains the empty object in the case of an object replacement, but in the case
        // of an upsert where we are creating a base object from the query and applying mods,
        // we capture the query as the original so that we can detect immutable field mutations.
        BSONObj original = BSONObj();

        // Calling populateDocumentWithQueryFields will populate the 'doc' with fields from the
        // query, which creates the base of the update for the inserted doc (because upsert was true)
        if (cq) {
            uassertStatusOK(driver->populateDocumentWithQueryFields(cq, doc));
            // Validate the base doc, as taken from the query -- no fields means validate all.
            FieldRefSet noFields;
            uassertStatusOK(validate(BSONObj(), noFields, doc, NULL, driver->modOptions()));
            if (!driver->isDocReplacement()) {
                opDebug->fastmodinsert = true;
                // We need all the fields from the query to compare against for validation below.
                original = doc.getObject();
            }
            else {
                original = request.getQuery();
            }
        }
        else {
            fassert(17354, CanonicalQuery::isSimpleIdQuery(request.getQuery()));
            BSONElement idElt = request.getQuery()["_id"];
            original = idElt.wrap();
            fassert(17352, doc.root().appendElement(idElt));
        }

        // Apply the update modifications and then log the update as an insert manually.
        FieldRefSet updatedFields;
        status = driver->update(StringData(), &doc, NULL, &updatedFields);
        if (!status.isOK()) {
            uasserted(16836, status.reason());
        }

        // Ensure _id exists and is first
        uassertStatusOK(ensureIdAndFirst(doc));

        // Validate that the object replacement or modifiers resulted in a document
        // that contains all the immutable keys and can be stored.
        if (!(request.isFromReplication() || request.isFromMigration())){
            const std::vector<FieldRef*>* immutableFields = NULL;
            if (lifecycle)
                immutableFields = lifecycle->getImmutableFields();

            // This will only validate the modified fields if not a replacement.
            uassertStatusOK(validate(original,
                                     updatedFields,
                                     doc,
                                     immutableFields,
                                     driver->modOptions()) );
        }

        // Only create the collection if the doc will be inserted.
        if (!collection) {
            collection = db->getCollection(request.getNamespaceString().ns());
            if (!collection) {
                collection = db->createCollection(txn, request.getNamespaceString().ns());
            }
        }

        // Insert the doc
        BSONObj newObj = doc.getObject();
        uassert(17420,
                str::stream() << "Document to upsert is larger than " << BSONObjMaxUserSize,
                newObj.objsize() <= BSONObjMaxUserSize);

        StatusWith<DiskLoc> newLoc = collection->insertDocument(txn,
                                                                newObj,
                                                                !request.isGod() /*enforceQuota*/);
        uassertStatusOK(newLoc.getStatus());
        if (request.shouldCallLogOp()) {
            logOp(txn, "i", nsString.ns().c_str(), newObj,
                   NULL, NULL, request.isFromMigration());
        }

        opDebug->nMatched = 1;
        return UpdateResult(false /* updated a non existing document */,
                            !driver->isDocReplacement() /* $mod or obj replacement? */,
                            1 /* docs written*/,
                            1 /* count of updated documents */,
                            newObj /* object that was upserted */ );
    }
Exemplo n.º 29
0
    bool WriteCmd::run(const string& dbName,
                       BSONObj& cmdObj,
                       int options,
                       string& errMsg,
                       BSONObjBuilder& result,
                       bool fromRepl) {

        // Can't be run on secondaries (logTheOp() == false, slaveOk() == false).
        dassert( !fromRepl );
        BatchedCommandRequest request( _writeType );
        BatchedCommandResponse response;

        if ( !request.parseBSON( cmdObj, &errMsg ) || !request.isValid( &errMsg ) ) {

            // Batch parse failure
            response.setOk( false );
            response.setN( 0 );
            response.setErrCode( ErrorCodes::FailedToParse );
            response.setErrMessage( errMsg );

            dassert( response.isValid( &errMsg ) );
            result.appendElements( response.toBSON() );

            // TODO
            // There's a pending issue about how to report response here. If we use
            // the command infra-structure, we should reuse the 'errmsg' field. But
            // we have already filed that message inside the BatchCommandResponse.
            // return response.getOk();
            return true;
        }

        // Note that this is a runCommmand, and therefore, the database and the collection name
        // are in different parts of the grammar for the command. But it's more convenient to
        // work with a NamespaceString. We built it here and replace it in the parsed command.
        // Internally, everything work with the namespace string as opposed to just the
        // collection name.
        NamespaceString nss(dbName, request.getNS());
        request.setNS(nss.ns());

        Status status = userAllowedWriteNS( nss );
        if ( !status.isOK() )
            return appendCommandStatus( result, status );

        if ( cc().curop() )
            cc().curop()->setNS( nss.ns() );

        if ( request.getBatchType() == BatchedCommandRequest::BatchType_Insert ) {
            // check all docs
            BatchedInsertRequest* insertRequest = request.getInsertRequest();
            vector<BSONObj>& docsToInsert = insertRequest->getDocuments();
            for ( size_t i = 0; i < docsToInsert.size(); i++ ) {
                StatusWith<BSONObj> fixed = fixDocumentForInsert( docsToInsert[i] );
                if ( !fixed.isOK() ) {
                    // we don't return early since each doc can be handled independantly
                    continue;
                }
                if ( fixed.getValue().isEmpty() ) {
                    continue;
                }
                docsToInsert[i] = fixed.getValue();
            }
        }

        BSONObj defaultWriteConcern;
        // This is really bad - it's only safe because we leak the defaults by overriding them with
        // new defaults and because we never reset to an empty default.
        // TODO: fix this for sane behavior where we query repl set object
        if ( getLastErrorDefault ) defaultWriteConcern = *getLastErrorDefault;
        if ( defaultWriteConcern.isEmpty() ) {
            BSONObjBuilder b;
            b.append( "w", 1 );
            defaultWriteConcern = b.obj();
        }

        WriteBatchExecutor writeBatchExecutor(defaultWriteConcern,
                                              &cc(),
                                              &globalOpCounters,
                                              lastError.get());

        writeBatchExecutor.executeBatch( request, &response );

        result.appendElements( response.toBSON() );

        // TODO
        // There's a pending issue about how to report response here. If we use
        // the command infra-structure, we should reuse the 'errmsg' field. But
        // we have already filed that message inside the BatchCommandResponse.
        // return response.getOk();
        return true;
    }
Exemplo n.º 30
0
        /** we will build an output buffer ourself and then use O_DIRECT
            we could be in read lock for this
            caller handles locking 
            */
        static void PREPLOGBUFFER() { 
            assert( cmdLine.dur );
            AlignedBuilder& bb = commitJob._ab;
            bb.reset();

            unsigned lenOfs;
            // JSectHeader
            {
                bb.appendStr("\nHH\n", false);
                lenOfs = bb.skip(4);
            }

            // ops other than basic writes
            {
                for( vector< shared_ptr<DurOp> >::iterator i = commitJob.ops().begin(); i != commitJob.ops().end(); ++i ) { 
                    (*i)->serialize(bb);
                }
            }

            // write intents
            {
                scoped_lock lk(privateViews._mutex());
                string lastFilePath;
                for( vector<WriteIntent>::iterator i = commitJob.writes().begin(); i != commitJob.writes().end(); i++ ) {
                    size_t ofs;
                    MongoMMF *mmf = privateViews._find(i->p, ofs);
                    if( mmf == 0 ) {
                        string s = str::stream() << "view pointer cannot be resolved " << (size_t) i->p;
                        journalingFailure(s.c_str()); // asserts
                        return;
                    }

                    if( !mmf->willNeedRemap() ) {
                        mmf->willNeedRemap() = true; // usually it will already be dirty so don't bother writing then
                    }
                    //size_t ofs = ((char *)i->p) - ((char*)mmf->getView().p);
                    i->w_ptr = ((char*)mmf->view_write()) + ofs;
                    if( mmf->filePath() != lastFilePath ) { 
                        lastFilePath = mmf->filePath();
                        JDbContext c;
                        bb.appendStruct(c);
                        bb.appendStr(lastFilePath);
                    }
                    JEntry e;
                    e.len = i->len;
                    assert( ofs <= 0x80000000 );
                    e.ofs = (unsigned) ofs;
                    e.fileNo = mmf->fileSuffixNo();
                    bb.appendStruct(e);
                    bb.appendBuf(i->p, i->len);
                }
            }

            {
                JSectFooter f(bb.buf(), bb.len());
                bb.appendStruct(f);
            }

            {
                assert( 0xffffe000 == (~(Alignment-1)) );
                unsigned L = (bb.len() + Alignment-1) & (~(Alignment-1)); // fill to alignment
                dassert( L >= (unsigned) bb.len() );
                *((unsigned*)bb.atOfs(lenOfs)) = L;
                unsigned padding = L - bb.len();
                bb.skip(padding);
                dassert( bb.len() % Alignment == 0 );
            }

            return;
        }