Example #1
0
const S2Polygon& BigSimplePolygon::GetPolygonBorder() const {
    if (_borderPoly)
        return *_borderPoly;

    unique_ptr<S2Loop> cloned(_loop->Clone());

    // Any loop in polygon should be than a hemisphere (2*Pi).
    cloned->Normalize();

    OwnedPointerVector<S2Loop> loops;
    loops.mutableVector().push_back(cloned.release());
    _borderPoly.reset(new S2Polygon(&loops.mutableVector()));
    return *_borderPoly;
}
Example #2
0
bool BigSimplePolygon::Contains(const S2Polyline& line) const {
    //
    // A line is contained within a loop if the result of subtracting the loop from the line is
    // nothing.
    //
    // Also, a line is contained within a loop if the result of clipping the line to the
    // complement of the loop is nothing.
    //
    // If we can't subtract the loop itself using S2, we clip (intersect) to the inverse.  Every
    // point in S2 is contained in exactly one of these loops.
    //
    // TODO: Polygon borders are actually kind of weird, and this is somewhat inconsistent with
    // Intersects().  A point might Intersect() a boundary exactly, but not be Contain()ed
    // within the Polygon.  Think the right thing to do here is custom intersection functions.
    //
    const S2Polygon& polyBorder = GetPolygonBorder();

    OwnedPointerVector<S2Polyline> clippedOwned;
    vector<S2Polyline*>& clipped = clippedOwned.mutableVector();

    if (_isNormalized) {
        // Polygon border is the same as the loop
        polyBorder.SubtractFromPolyline(&line, &clipped);
        return clipped.size() == 0;
    } else {
        // Polygon border is the complement of the loop
        polyBorder.IntersectWithPolyline(&line, &clipped);
        return clipped.size() == 0;
    }
}
Example #3
0
    PlanCacheEntry* PlanCacheEntry::clone() const {
        OwnedPointerVector<QuerySolution> solutions;
        for (size_t i = 0; i < plannerData.size(); ++i) {
            QuerySolution* qs = new QuerySolution();
            qs->cacheData.reset(plannerData[i]->clone());
            solutions.mutableVector().push_back(qs);
        }
        PlanCacheEntry* entry = new PlanCacheEntry(solutions.vector(), decision->clone());

        entry->backupSoln = backupSoln;

        // Copy query shape.
        entry->query = query.getOwned();
        entry->sort = sort.getOwned();
        entry->projection = projection.getOwned();

        // Copy performance stats.
        for (size_t i = 0; i < feedback.size(); ++i) {
            PlanCacheEntryFeedback* fb = new PlanCacheEntryFeedback();
            fb->stats.reset(feedback[i]->stats->clone());
            fb->score = feedback[i]->score;
            entry->feedback.push_back(fb);
        }
        entry->averageScore = averageScore;
        entry->stddevScore = stddevScore;
        return entry;
    }
Example #4
0
IndexBounds ChunkManager::getIndexBoundsForQuery(const BSONObj& key,
                                                 const CanonicalQuery& canonicalQuery) {
    // $text is not allowed in planning since we don't have text index on mongos.
    //
    // TODO: Treat $text query as a no-op in planning. So with shard key {a: 1},
    //       the query { a: 2, $text: { ... } } will only target to {a: 2}.
    if (QueryPlannerCommon::hasNode(canonicalQuery.root(), MatchExpression::TEXT)) {
        IndexBounds bounds;
        IndexBoundsBuilder::allValuesBounds(key, &bounds);  // [minKey, maxKey]
        return bounds;
    }

    // Consider shard key as an index
    string accessMethod = IndexNames::findPluginName(key);
    dassert(accessMethod == IndexNames::BTREE || accessMethod == IndexNames::HASHED);

    // Use query framework to generate index bounds
    QueryPlannerParams plannerParams;
    // Must use "shard key" index
    plannerParams.options = QueryPlannerParams::NO_TABLE_SCAN;
    IndexEntry indexEntry(key,
                          accessMethod,
                          false /* multiKey */,
                          false /* sparse */,
                          false /* unique */,
                          "shardkey",
                          NULL /* filterExpr */,
                          BSONObj());
    plannerParams.indices.push_back(indexEntry);

    OwnedPointerVector<QuerySolution> solutions;
    Status status = QueryPlanner::plan(canonicalQuery, plannerParams, &solutions.mutableVector());
    uassert(status.code(), status.reason(), status.isOK());

    IndexBounds bounds;

    for (vector<QuerySolution*>::const_iterator it = solutions.begin();
         bounds.size() == 0 && it != solutions.end();
         it++) {
        // Try next solution if we failed to generate index bounds, i.e. bounds.size() == 0
        bounds = collapseQuerySolution((*it)->root.get());
    }

    if (bounds.size() == 0) {
        // We cannot plan the query without collection scan, so target to all shards.
        IndexBoundsBuilder::allValuesBounds(key, &bounds);  // [minKey, maxKey]
    }
    return bounds;
}
Example #5
0
void Strategy::writeOp(OperationContext* txn, int op, Request& request) {
    // make sure we have a last error
    dassert(&LastError::get(cc()));

    OwnedPointerVector<BatchedCommandRequest> commandRequestsOwned;
    vector<BatchedCommandRequest*>& commandRequests = commandRequestsOwned.mutableVector();

    msgToBatchRequests(request.m(), &commandRequests);

    for (vector<BatchedCommandRequest*>::iterator it = commandRequests.begin();
         it != commandRequests.end();
         ++it) {
        // Multiple commands registered to last error as multiple requests
        if (it != commandRequests.begin())
            LastError::get(cc()).startRequest();

        BatchedCommandRequest* commandRequest = *it;

        // Adjust namespaces for command
        NamespaceString fullNS(commandRequest->getNS());
        string cmdNS = fullNS.getCommandNS();
        // We only pass in collection name to command
        commandRequest->setNS(fullNS);

        BSONObjBuilder builder;
        BSONObj requestBSON = commandRequest->toBSON();

        {
            // Disable the last error object for the duration of the write cmd
            LastError::Disabled disableLastError(&LastError::get(cc()));
            Command::runAgainstRegistered(txn, cmdNS.c_str(), requestBSON, builder, 0);
        }

        BatchedCommandResponse commandResponse;
        bool parsed = commandResponse.parseBSON(builder.done(), NULL);
        (void)parsed;  // for compile
        dassert(parsed && commandResponse.isValid(NULL));

        // Populate the lastError object based on the write response
        LastError::get(cc()).reset();
        bool hadError =
            batchErrorToLastError(*commandRequest, commandResponse, &LastError::get(cc()));

        // Check if this is an ordered batch and we had an error which should stop processing
        if (commandRequest->getOrdered() && hadError)
            break;
    }
}
    BSONObj generateSection(OperationContext* txn, const BSONElement& configElement) const {
        RangeDeleter* deleter = getDeleter();
        if (!deleter) {
            return BSONObj();
        }

        BSONObjBuilder result;

        OwnedPointerVector<DeleteJobStats> statsList;
        deleter->getStatsHistory(&statsList.mutableVector());
        BSONArrayBuilder oldStatsBuilder;
        for (OwnedPointerVector<DeleteJobStats>::const_iterator it = statsList.begin();
             it != statsList.end();
             ++it) {
            BSONObjBuilder entryBuilder;
            entryBuilder.append("deletedDocs", (*it)->deletedDocCount);

            if ((*it)->queueEndTS > Date_t()) {
                entryBuilder.append("queueStart", (*it)->queueStartTS);
                entryBuilder.append("queueEnd", (*it)->queueEndTS);
            }

            if ((*it)->deleteEndTS > Date_t()) {
                entryBuilder.append("deleteStart", (*it)->deleteStartTS);
                entryBuilder.append("deleteEnd", (*it)->deleteEndTS);

                if ((*it)->waitForReplEndTS > Date_t()) {
                    entryBuilder.append("waitForReplStart", (*it)->waitForReplStartTS);
                    entryBuilder.append("waitForReplEnd", (*it)->waitForReplEndTS);
                }
            }

            oldStatsBuilder.append(entryBuilder.obj());
        }
        result.append("lastDeleteStats", oldStatsBuilder.arr());

        return result.obj();
    }
Example #7
0
/*static*/ int MongoFile::_flushAll(bool sync) {
    if (!sync) {
        int num = 0;
        LockMongoFilesShared lk;
        for (set<MongoFile*>::iterator i = mmfiles.begin(); i != mmfiles.end(); i++) {
            num++;
            MongoFile* mmf = *i;
            if (!mmf)
                continue;

            mmf->flush(sync);
        }
        return num;
    }

    // want to do it sync

    // get a thread-safe Flushable object for each file first in a single lock
    // so that we can iterate and flush without doing any locking here
    OwnedPointerVector<Flushable> thingsToFlushWrapper;
    vector<Flushable*>& thingsToFlush = thingsToFlushWrapper.mutableVector();
    {
        LockMongoFilesShared lk;
        for (set<MongoFile*>::iterator i = mmfiles.begin(); i != mmfiles.end(); i++) {
            MongoFile* mmf = *i;
            if (!mmf)
                continue;
            thingsToFlush.push_back(mmf->prepareFlush());
        }
    }

    for (size_t i = 0; i < thingsToFlush.size(); i++) {
        thingsToFlush[i]->flush();
    }

    return thingsToFlush.size();
}
Example #8
0
// static
Status ListFilters::list(const QuerySettings& querySettings, BSONObjBuilder* bob) {
    invariant(bob);

    // Format of BSON result:
    //
    // {
    //     hints: [
    //         {
    //             query: <query>,
    //             sort: <sort>,
    //             projection: <projection>,
    //             indexes: [<index1>, <index2>, <index3>, ...]
    //         }
    //  }
    BSONArrayBuilder hintsBuilder(bob->subarrayStart("filters"));
    OwnedPointerVector<AllowedIndexEntry> entries;
    entries.mutableVector() = querySettings.getAllAllowedIndices();
    for (vector<AllowedIndexEntry*>::const_iterator i = entries.begin();
            i != entries.end(); ++i) {
        AllowedIndexEntry* entry = *i;
        invariant(entry);

        BSONObjBuilder hintBob(hintsBuilder.subobjStart());
        hintBob.append("query", entry->query);
        hintBob.append("sort", entry->sort);
        hintBob.append("projection", entry->projection);
        BSONArrayBuilder indexesBuilder(hintBob.subarrayStart("indexes"));
        for (vector<BSONObj>::const_iterator j = entry->indexKeyPatterns.begin();
                j != entry->indexKeyPatterns.end(); ++j) {
            const BSONObj& index = *j;
            indexesBuilder.append(index);
        }
        indexesBuilder.doneFast();
    }
    hintsBuilder.doneFast();
    return Status::OK();
}
void BatchWriteExec::executeBatch(const BatchedCommandRequest& clientRequest,
                                  BatchedCommandResponse* clientResponse) {
    LOG(4) << "starting execution of write batch of size "
           << static_cast<int>(clientRequest.sizeWriteOps()) << " for " << clientRequest.getNS()
           << endl;

    BatchWriteOp batchOp;
    batchOp.initClientRequest(&clientRequest);

    // Current batch status
    bool refreshedTargeter = false;
    int rounds = 0;
    int numCompletedOps = 0;
    int numRoundsWithoutProgress = 0;

    while (!batchOp.isFinished()) {
        //
        // Get child batches to send using the targeter
        //
        // Targeting errors can be caused by remote metadata changing (the collection could have
        // been dropped and recreated, for example with a new shard key).  If a remote metadata
        // change occurs *before* a client sends us a batch, we need to make sure that we don't
        // error out just because we're staler than the client - otherwise mongos will be have
        // unpredictable behavior.
        //
        // (If a metadata change happens *during* or *after* a client sends us a batch, however,
        // we make no guarantees about delivery.)
        //
        // For this reason, we don't record targeting errors until we've refreshed our targeting
        // metadata at least once *after* receiving the client batch - at that point, we know:
        //
        // 1) our new metadata is the same as the metadata when the client sent a batch, and so
        //    targeting errors are real.
        // OR
        // 2) our new metadata is a newer version than when the client sent a batch, and so
        //    the metadata must have changed after the client batch was sent.  We don't need to
        //    deliver in this case, since for all the client knows we may have gotten the batch
        //    exactly when the metadata changed.
        //

        OwnedPointerVector<TargetedWriteBatch> childBatchesOwned;
        vector<TargetedWriteBatch*>& childBatches = childBatchesOwned.mutableVector();

        // If we've already had a targeting error, we've refreshed the metadata once and can
        // record target errors definitively.
        bool recordTargetErrors = refreshedTargeter;
        Status targetStatus = batchOp.targetBatch(*_targeter, recordTargetErrors, &childBatches);
        if (!targetStatus.isOK()) {
            // Don't do anything until a targeter refresh
            _targeter->noteCouldNotTarget();
            refreshedTargeter = true;
            ++_stats->numTargetErrors;
            dassert(childBatches.size() == 0u);
        }

        //
        // Send all child batches
        //

        size_t numSent = 0;
        size_t numToSend = childBatches.size();
        bool remoteMetadataChanging = false;
        while (numSent != numToSend) {
            // Collect batches out on the network, mapped by endpoint
            OwnedHostBatchMap ownedPendingBatches;
            OwnedHostBatchMap::MapType& pendingBatches = ownedPendingBatches.mutableMap();

            //
            // Send side
            //

            // Get as many batches as we can at once
            for (vector<TargetedWriteBatch*>::iterator it = childBatches.begin();
                 it != childBatches.end();
                 ++it) {
                //
                // Collect the info needed to dispatch our targeted batch
                //

                TargetedWriteBatch* nextBatch = *it;
                // If the batch is NULL, we sent it previously, so skip
                if (nextBatch == NULL)
                    continue;

                // Figure out what host we need to dispatch our targeted batch
                ConnectionString shardHost;
                Status resolveStatus =
                    _resolver->chooseWriteHost(nextBatch->getEndpoint().shardName, &shardHost);
                if (!resolveStatus.isOK()) {
                    ++_stats->numResolveErrors;

                    // Record a resolve failure
                    // TODO: It may be necessary to refresh the cache if stale, or maybe just
                    // cancel and retarget the batch
                    WriteErrorDetail error;
                    buildErrorFrom(resolveStatus, &error);

                    LOG(4) << "unable to send write batch to " << shardHost.toString()
                           << causedBy(resolveStatus.toString()) << endl;

                    batchOp.noteBatchError(*nextBatch, error);

                    // We're done with this batch
                    // Clean up when we can't resolve a host
                    delete *it;
                    *it = NULL;
                    --numToSend;
                    continue;
                }

                // If we already have a batch for this host, wait until the next time
                OwnedHostBatchMap::MapType::iterator pendingIt = pendingBatches.find(shardHost);
                if (pendingIt != pendingBatches.end())
                    continue;

                //
                // We now have all the info needed to dispatch the batch
                //

                BatchedCommandRequest request(clientRequest.getBatchType());
                batchOp.buildBatchRequest(*nextBatch, &request);

                // Internally we use full namespaces for request/response, but we send the
                // command to a database with the collection name in the request.
                NamespaceString nss(request.getNS());
                request.setNS(nss.coll());

                LOG(4) << "sending write batch to " << shardHost.toString() << ": "
                       << request.toString() << endl;

                _dispatcher->addCommand(shardHost, nss.db(), request);

                // Indicate we're done by setting the batch to NULL
                // We'll only get duplicate hostEndpoints if we have broadcast and non-broadcast
                // endpoints for the same host, so this should be pretty efficient without
                // moving stuff around.
                *it = NULL;

                // Recv-side is responsible for cleaning up the nextBatch when used
                pendingBatches.insert(make_pair(shardHost, nextBatch));
            }

            // Send them all out
            _dispatcher->sendAll();
            numSent += pendingBatches.size();

            //
            // Recv side
            //

            while (_dispatcher->numPending() > 0) {
                // Get the response
                ConnectionString shardHost;
                BatchedCommandResponse response;
                Status dispatchStatus = _dispatcher->recvAny(&shardHost, &response);

                // Get the TargetedWriteBatch to find where to put the response
                dassert(pendingBatches.find(shardHost) != pendingBatches.end());
                TargetedWriteBatch* batch = pendingBatches.find(shardHost)->second;

                if (dispatchStatus.isOK()) {
                    TrackedErrors trackedErrors;
                    trackedErrors.startTracking(ErrorCodes::StaleShardVersion);

                    LOG(4) << "write results received from " << shardHost.toString() << ": "
                           << response.toString() << endl;

                    // Dispatch was ok, note response
                    batchOp.noteBatchResponse(*batch, response, &trackedErrors);

                    // Note if anything was stale
                    const vector<ShardError*>& staleErrors =
                        trackedErrors.getErrors(ErrorCodes::StaleShardVersion);

                    if (staleErrors.size() > 0) {
                        noteStaleResponses(staleErrors, _targeter);
                        ++_stats->numStaleBatches;
                    }

                    // Remember if the shard is actively changing metadata right now
                    if (isShardMetadataChanging(staleErrors)) {
                        remoteMetadataChanging = true;
                    }

                    // Remember that we successfully wrote to this shard
                    // NOTE: This will record lastOps for shards where we actually didn't update
                    // or delete any documents, which preserves old behavior but is conservative
                    _stats->noteWriteAt(shardHost,
                                        response.isLastOpSet() ? response.getLastOp() : OpTime(),
                                        response.isElectionIdSet() ? response.getElectionId()
                                                                   : OID());
                } else {
                    // Error occurred dispatching, note it

                    stringstream msg;
                    msg << "write results unavailable from " << shardHost.toString()
                        << causedBy(dispatchStatus.toString());

                    WriteErrorDetail error;
                    buildErrorFrom(Status(ErrorCodes::RemoteResultsUnavailable, msg.str()), &error);

                    LOG(4) << "unable to receive write results from " << shardHost.toString()
                           << causedBy(dispatchStatus.toString()) << endl;

                    batchOp.noteBatchError(*batch, error);
                }
            }
        }

        ++rounds;
        ++_stats->numRounds;

        // If we're done, get out
        if (batchOp.isFinished())
            break;

        // MORE WORK TO DO

        //
        // Refresh the targeter if we need to (no-op if nothing stale)
        //

        bool targeterChanged = false;
        Status refreshStatus = _targeter->refreshIfNeeded(&targeterChanged);

        if (!refreshStatus.isOK()) {
            // It's okay if we can't refresh, we'll just record errors for the ops if
            // needed.
            warning() << "could not refresh targeter" << causedBy(refreshStatus.reason()) << endl;
        }

        //
        // Ensure progress is being made toward completing the batch op
        //

        int currCompletedOps = batchOp.numWriteOpsIn(WriteOpState_Completed);
        if (currCompletedOps == numCompletedOps && !targeterChanged && !remoteMetadataChanging) {
            ++numRoundsWithoutProgress;
        } else {
            numRoundsWithoutProgress = 0;
        }
        numCompletedOps = currCompletedOps;

        if (numRoundsWithoutProgress > kMaxRoundsWithoutProgress) {
            stringstream msg;
            msg << "no progress was made executing batch write op in " << clientRequest.getNS()
                << " after " << kMaxRoundsWithoutProgress << " rounds (" << numCompletedOps
                << " ops completed in " << rounds << " rounds total)";

            WriteErrorDetail error;
            buildErrorFrom(Status(ErrorCodes::NoProgressMade, msg.str()), &error);
            batchOp.abortBatch(error);
            break;
        }
    }

    batchOp.buildClientResponse(clientResponse);

    LOG(4) << "finished execution of write batch"
           << (clientResponse->isErrDetailsSet() ? " with write errors" : "")
           << (clientResponse->isErrDetailsSet() && clientResponse->isWriteConcernErrorSet()
                   ? " and"
                   : "")
           << (clientResponse->isWriteConcernErrorSet() ? " with write concern error" : "")
           << " for " << clientRequest.getNS() << endl;
}
Example #10
0
    Status WriteOp::targetWrites( const NSTargeter& targeter,
                                  std::vector<TargetedWrite*>* targetedWrites ) {

        bool isUpdate = _itemRef.getOpType() == BatchedCommandRequest::BatchType_Update;
        bool isDelete = _itemRef.getOpType() == BatchedCommandRequest::BatchType_Delete;

        // In case of error, don't leak.
        OwnedPointerVector<ShardEndpoint> endpointsOwned;
        vector<ShardEndpoint*>& endpoints = endpointsOwned.mutableVector();

        if ( isUpdate || isDelete ) {

            // Updates/deletes targeted by query

            BSONObj queryDoc =
                isUpdate ? _itemRef.getUpdate()->getQuery() : _itemRef.getDelete()->getQuery();

            Status targetStatus = targeter.targetQuery( queryDoc, &endpoints );

            if ( targetStatus.isOK() ) {
                targetStatus =
                    isUpdate ?
                        updateTargetsOk( *this, endpoints ) : deleteTargetsOk( *this, endpoints );
            }

            if ( !targetStatus.isOK() ) return targetStatus;
        }
        else {
            dassert( _itemRef.getOpType() == BatchedCommandRequest::BatchType_Insert );

            // Inserts targeted by doc itself

            ShardEndpoint* endpoint = NULL;
            Status targetStatus = targeter.targetDoc( _itemRef.getDocument(), &endpoint );

            if ( !targetStatus.isOK() ) {
                dassert( NULL == endpoint );
                return targetStatus;
            }

            dassert( NULL != endpoint );
            endpoints.push_back( endpoint );
        }

        for ( vector<ShardEndpoint*>::iterator it = endpoints.begin(); it != endpoints.end();
            ++it ) {

            ShardEndpoint* endpoint = *it;

            _childOps.push_back( new ChildWriteOp( this ) );

            WriteOpRef ref( _itemRef.getItemIndex(), _childOps.size() - 1 );

            // For now, multiple endpoints imply no versioning
            if ( endpoints.size() == 1u ) {
                targetedWrites->push_back( new TargetedWrite( *endpoint, ref ) );
            }
            else {
                ShardEndpoint broadcastEndpoint( endpoint->shardName,
                                                 ChunkVersion::IGNORED(),
                                                 endpoint->shardHost );
                targetedWrites->push_back( new TargetedWrite( broadcastEndpoint, ref ) );
            }

            _childOps.back()->pendingWrite = targetedWrites->back();
            _childOps.back()->state = WriteOpState_Pending;
        }

        _state = WriteOpState_Pending;
        return Status::OK();
    }
Example #11
0
    bool mergeChunks( OperationContext* txn,
                      const NamespaceString& nss,
                      const BSONObj& minKey,
                      const BSONObj& maxKey,
                      const OID& epoch,
                      string* errMsg ) {

        //
        // Get sharding state up-to-date
        //

        ConnectionString configLoc = ConnectionString::parse( shardingState.getConfigServer(),
                                                              *errMsg );
        if ( !configLoc.isValid() ){
            warning() << *errMsg << endl;
            return false;
        }

        //
        // Get the distributed lock
        //

        ScopedDistributedLock collLock( configLoc, nss.ns() );
        collLock.setLockMessage( stream() << "merging chunks in " << nss.ns() << " from "
                                          << minKey << " to " << maxKey );

        Status acquisitionStatus = collLock.tryAcquire();
        if (!acquisitionStatus.isOK()) {
            *errMsg = stream() << "could not acquire collection lock for " << nss.ns()
                               << " to merge chunks in [" << minKey << "," << maxKey << ")"
                               << causedBy(acquisitionStatus);

            warning() << *errMsg << endl;
            return false;
        }

        //
        // We now have the collection lock, refresh metadata to latest version and sanity check
        //

        ChunkVersion shardVersion;
        Status status = shardingState.refreshMetadataNow(txn, nss.ns(), &shardVersion);

        if ( !status.isOK() ) {

            *errMsg = str::stream() << "could not merge chunks, failed to refresh metadata for "
                                    << nss.ns() << causedBy( status.reason() );

            warning() << *errMsg << endl;
            return false;
        }

        if ( epoch.isSet() && shardVersion.epoch() != epoch ) {

            *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                               << " has changed" << " since merge was sent" << "(sent epoch : "
                               << epoch.toString()
                               << ", current epoch : " << shardVersion.epoch().toString() << ")";

            warning() << *errMsg << endl;
            return false;
        }

        CollectionMetadataPtr metadata = shardingState.getCollectionMetadata( nss.ns() );

        if ( !metadata || metadata->getKeyPattern().isEmpty() ) {

            *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                               << " is not sharded";

            warning() << *errMsg << endl;
            return false;
        }

        dassert( metadata->getShardVersion().equals( shardVersion ) );

        if ( !metadata->isValidKey( minKey ) || !metadata->isValidKey( maxKey ) ) {

            *errMsg = stream() << "could not merge chunks, the range "
                               << rangeToString( minKey, maxKey ) << " is not valid"
                               << " for collection " << nss.ns() << " with key pattern "
                               << metadata->getKeyPattern();

            warning() << *errMsg << endl;
            return false;
        }

        //
        // Get merged chunk information
        //

        ChunkVersion mergeVersion = metadata->getCollVersion();
        mergeVersion.incMinor();

        OwnedPointerVector<ChunkType> chunksToMerge;

        ChunkType itChunk;
        itChunk.setMin( minKey );
        itChunk.setMax( minKey );
        itChunk.setNS( nss.ns() );
        itChunk.setShard( shardingState.getShardName() );

        while ( itChunk.getMax().woCompare( maxKey ) < 0 &&
                metadata->getNextChunk( itChunk.getMax(), &itChunk ) ) {
            auto_ptr<ChunkType> saved( new ChunkType );
            itChunk.cloneTo( saved.get() );
            chunksToMerge.mutableVector().push_back( saved.release() );
        }

        if ( chunksToMerge.empty() ) {

            *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                               << " range starting at " << minKey
                               << " and ending at " << maxKey
                               << " does not belong to shard " << shardingState.getShardName();

            warning() << *errMsg << endl;
            return false;
        }

        //
        // Validate the range starts and ends at chunks and has no holes, error if not valid
        //

        BSONObj firstDocMin = ( *chunksToMerge.begin() )->getMin();
        BSONObj firstDocMax = ( *chunksToMerge.begin() )->getMax();
        // minKey is inclusive
        bool minKeyInRange = rangeContains( firstDocMin, firstDocMax, minKey );

        if ( !minKeyInRange ) {

            *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                               << " range starting at " << minKey
                               << " does not belong to shard " << shardingState.getShardName();

            warning() << *errMsg << endl;
            return false;
        }

        BSONObj lastDocMin = ( *chunksToMerge.rbegin() )->getMin();
        BSONObj lastDocMax = ( *chunksToMerge.rbegin() )->getMax();
        // maxKey is exclusive
        bool maxKeyInRange = lastDocMin.woCompare( maxKey ) < 0 &&
                lastDocMax.woCompare( maxKey ) >= 0;

        if ( !maxKeyInRange ) {
            *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                               << " range ending at " << maxKey
                               << " does not belong to shard " << shardingState.getShardName();

            warning() << *errMsg << endl;
            return false;
        }

        bool validRangeStartKey = firstDocMin.woCompare( minKey ) == 0;
        bool validRangeEndKey = lastDocMax.woCompare( maxKey ) == 0;

        if ( !validRangeStartKey || !validRangeEndKey ) {

            *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                               << " does not contain a chunk "
                               << ( !validRangeStartKey ? "starting at " + minKey.toString() : "" )
                               << ( !validRangeStartKey && !validRangeEndKey ? " or " : "" )
                               << ( !validRangeEndKey ? "ending at " + maxKey.toString() : "" );

            warning() << *errMsg << endl;
            return false;
        }

        if ( chunksToMerge.size() == 1 ) {

            *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                               << " already contains chunk for " << rangeToString( minKey, maxKey );

            warning() << *errMsg << endl;
            return false;
        }

        bool holeInRange = false;

        // Look for hole in range
        ChunkType* prevChunk = *chunksToMerge.begin();
        ChunkType* nextChunk = NULL;
        for ( OwnedPointerVector<ChunkType>::const_iterator it = chunksToMerge.begin();
                it != chunksToMerge.end(); ++it ) {
            if ( it == chunksToMerge.begin() ) continue;

            nextChunk = *it;
            if ( prevChunk->getMax().woCompare( nextChunk->getMin() ) != 0 ) {
                holeInRange = true;
                break;
            }
            prevChunk = nextChunk;
        }

        if ( holeInRange ) {

            dassert( NULL != nextChunk );
            *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                               << " has a hole in the range " << rangeToString( minKey, maxKey )
                               << " at " << rangeToString( prevChunk->getMax(),
                                                           nextChunk->getMin() );

            warning() << *errMsg << endl;
            return false;
        }

        //
        // Run apply ops command
        //

        BSONObj applyOpsCmd = buildApplyOpsCmd( chunksToMerge,
                                                shardVersion,
                                                mergeVersion );

        bool ok;
        BSONObj result;
        try {
            ScopedDbConnection conn( configLoc, 30.0 );
            ok = conn->runCommand( "config", applyOpsCmd, result );
            if ( !ok ) *errMsg = result.toString();
            conn.done();
        }
        catch( const DBException& ex ) {
            ok = false;
            *errMsg = ex.toString();
        }

        if ( !ok ) {
            *errMsg = stream() << "could not merge chunks for " << nss.ns()
                               << ", writing to config failed" << causedBy( errMsg );

            warning() << *errMsg << endl;
            return false;
        }

        //
        // Install merged chunk metadata
        //

        {
            Lock::DBLock writeLk(txn->lockState(), nss.db(), newlm::MODE_X);
            shardingState.mergeChunks(txn, nss.ns(), minKey, maxKey, mergeVersion);
        }

        //
        // Log change
        //

        BSONObj mergeLogEntry = buildMergeLogEntry( chunksToMerge,
                                                    shardVersion,
                                                    mergeVersion );

        configServer.logChange( "merge", nss.ns(), mergeLogEntry );

        return true;
    }
Example #12
0
    /** Note: if the object shrinks a lot, we don't free up space, we leave extra at end of the record.
     */
    const DiskLoc DataFileMgr::updateRecord(
        const char *ns,
        Collection* collection,
        Record *toupdate, const DiskLoc& dl,
        const char *_buf, int _len, OpDebug& debug,  bool god) {

        dassert( toupdate == dl.rec() );

        BSONObj objOld = BSONObj::make(toupdate);
        BSONObj objNew(_buf);
        DEV verify( objNew.objsize() == _len );
        DEV verify( objNew.objdata() == _buf );

        if( !objNew.hasElement("_id") && objOld.hasElement("_id") ) {
            /* add back the old _id value if the update removes it.  Note this implementation is slow
               (copies entire object multiple times), but this shouldn't happen often, so going for simple
               code, not speed.
            */
            BSONObjBuilder b;
            BSONElement e;
            verify( objOld.getObjectID(e) );
            b.append(e); // put _id first, for best performance
            b.appendElements(objNew);
            objNew = b.obj();
        }

        NamespaceString nsstring(ns);
        if (nsstring.coll() == "system.users") {
            V2UserDocumentParser parser;
            uassertStatusOK(parser.checkValidUserDocument(objNew));
        }

        uassert( 13596 , str::stream() << "cannot change _id of a document old:" << objOld << " new:" << objNew,
                objNew["_id"] == objOld["_id"]);

        /* duplicate key check. we descend the btree twice - once for this check, and once for the actual inserts, further
           below.  that is suboptimal, but it's pretty complicated to do it the other way without rollbacks...
        */
        OwnedPointerVector<UpdateTicket> updateTickets;
        updateTickets.mutableVector().resize(collection->details()->getTotalIndexCount());
        for (int i = 0; i < collection->details()->getTotalIndexCount(); ++i) {
            auto_ptr<IndexDescriptor> descriptor(CatalogHack::getDescriptor(collection->details(), i));
            auto_ptr<IndexAccessMethod> iam(CatalogHack::getIndex(descriptor.get()));
            InsertDeleteOptions options;
            options.logIfError = false;
            options.dupsAllowed = !(KeyPattern::isIdKeyPattern(descriptor->keyPattern())
                                    || descriptor->unique())
                                  || ignoreUniqueIndex(descriptor->getOnDisk());
            updateTickets.mutableVector()[i] = new UpdateTicket();
            Status ret = iam->validateUpdate(objOld, objNew, dl, options,
                                             updateTickets.mutableVector()[i]);

            if (Status::OK() != ret) {
                uasserted(ASSERT_ID_DUPKEY, "Update validation failed: " + ret.toString());
            }
        }

        if ( toupdate->netLength() < objNew.objsize() ) {
            // doesn't fit.  reallocate -----------------------------------------------------
            moveCounter.increment();
            uassert( 10003,
                     "failing update: objects in a capped ns cannot grow",
                     !(collection && collection->details()->isCapped()));
            collection->details()->paddingTooSmall();
            deleteRecord(ns, toupdate, dl);
            DiskLoc res = insert(ns, objNew.objdata(), objNew.objsize(), false, god);

            if (debug.nmoved == -1) // default of -1 rather than 0
                debug.nmoved = 1;
            else
                debug.nmoved += 1;

            return res;
        }

        collection->infoCache()->notifyOfWriteOp();
        collection->details()->paddingFits();

        debug.keyUpdates = 0;

        for (int i = 0; i < collection->details()->getTotalIndexCount(); ++i) {
            auto_ptr<IndexDescriptor> descriptor(CatalogHack::getDescriptor(collection->details(), i));
            auto_ptr<IndexAccessMethod> iam(CatalogHack::getIndex(descriptor.get()));
            int64_t updatedKeys;
            Status ret = iam->update(*updateTickets.vector()[i], &updatedKeys);
            if (Status::OK() != ret) {
                // This shouldn't happen unless something disastrous occurred.
                massert(16799, "update failed: " + ret.toString(), false);
            }
            debug.keyUpdates += updatedKeys;
        }

        //  update in place
        int sz = objNew.objsize();
        memcpy(getDur().writingPtr(toupdate->data(), sz), objNew.objdata(), sz);
        return dl;
    }
Example #13
0
    void WriteBatchExecutor::executeBatch( const BatchedCommandRequest& request,
                                           BatchedCommandResponse* response ) {

        // TODO: Lift write concern parsing out of this entirely.
        WriteConcernOptions writeConcern;
        Status status = Status::OK();

        BSONObj wcDoc;
        if ( request.isWriteConcernSet() ) {
            wcDoc = request.getWriteConcern();
        }

        if ( wcDoc.isEmpty() ) {
            status = writeConcern.parse( _defaultWriteConcern );
        }
        else {
            status = writeConcern.parse( wcDoc );
        }

        if ( status.isOK() ) {
            status = validateWriteConcern( writeConcern );
        }

        if ( !status.isOK() ) {
            response->setErrCode( status.code() );
            response->setErrMessage( status.reason() );
            response->setOk( false );
            dassert( response->isValid(NULL) );
            return;
        }

        bool silentWC = writeConcern.wMode.empty() && writeConcern.wNumNodes == 0
                        && writeConcern.syncMode == WriteConcernOptions::NONE;

        Timer commandTimer;

        OwnedPointerVector<WriteErrorDetail> writeErrorsOwned;
        vector<WriteErrorDetail*>& writeErrors = writeErrorsOwned.mutableVector();

        OwnedPointerVector<BatchedUpsertDetail> upsertedOwned;
        vector<BatchedUpsertDetail*>& upserted = upsertedOwned.mutableVector();

        //
        // Apply each batch item, possibly bulking some items together in the write lock.
        // Stops on error if batch is ordered.
        //

        bulkExecute( request, &upserted, &writeErrors );

        //
        // Try to enforce the write concern if everything succeeded (unordered or ordered)
        // OR if something succeeded and we're unordered.
        //

        auto_ptr<WCErrorDetail> wcError;
        bool needToEnforceWC = writeErrors.empty()
                               || ( !request.getOrdered()
                                    && writeErrors.size() < request.sizeWriteOps() );

        if ( needToEnforceWC ) {

            _client->curop()->setMessage( "waiting for write concern" );

            WriteConcernResult res;
            status = waitForWriteConcern( writeConcern, _client->getLastOp(), &res );

            if ( !status.isOK() ) {
                wcError.reset( toWriteConcernError( status, res ) );
            }
        }

        //
        // Refresh metadata if needed
        //

        bool staleBatch = !writeErrors.empty()
                          && writeErrors.back()->getErrCode() == ErrorCodes::StaleShardVersion;

        if ( staleBatch ) {

            const BatchedRequestMetadata* requestMetadata = request.getMetadata();
            dassert( requestMetadata );

            // Make sure our shard name is set or is the same as what was set previously
            if ( shardingState.setShardName( requestMetadata->getShardName() ) ) {

                //
                // First, we refresh metadata if we need to based on the requested version.
                //

                ChunkVersion latestShardVersion;
                shardingState.refreshMetadataIfNeeded( request.getTargetingNS(),
                                                       requestMetadata->getShardVersion(),
                                                       &latestShardVersion );

                // Report if we're still changing our metadata
                // TODO: Better reporting per-collection
                if ( shardingState.inCriticalMigrateSection() ) {
                    noteInCriticalSection( writeErrors.back() );
                }

                if ( queueForMigrationCommit ) {

                    //
                    // Queue up for migration to end - this allows us to be sure that clients will
                    // not repeatedly try to refresh metadata that is not yet written to the config
                    // server.  Not necessary for correctness.
                    // Exposed as optional parameter to allow testing of queuing behavior with
                    // different network timings.
                    //

                    const ChunkVersion& requestShardVersion = requestMetadata->getShardVersion();

                    //
                    // Only wait if we're an older version (in the current collection epoch) and
                    // we're not write compatible, implying that the current migration is affecting
                    // writes.
                    //

                    if ( requestShardVersion.isOlderThan( latestShardVersion ) &&
                         !requestShardVersion.isWriteCompatibleWith( latestShardVersion ) ) {

                        while ( shardingState.inCriticalMigrateSection() ) {

                            log() << "write request to old shard version "
                                  << requestMetadata->getShardVersion().toString()
                                  << " waiting for migration commit" << endl;

                            shardingState.waitTillNotInCriticalSection( 10 /* secs */);
                        }
                    }
                }
            }
            else {
                // If our shard name is stale, our version must have been stale as well
                dassert( writeErrors.size() == request.sizeWriteOps() );
            }
        }

        //
        // Construct response
        //

        response->setOk( true );

        if ( !silentWC ) {

            if ( upserted.size() ) {
                response->setUpsertDetails( upserted );
                upserted.clear();
            }

            if ( writeErrors.size() ) {
                response->setErrDetails( writeErrors );
                writeErrors.clear();
            }

            if ( wcError.get() ) {
                response->setWriteConcernError( wcError.release() );
            }

            if ( anyReplEnabled() ) {
                response->setLastOp( _client->getLastOp() );
                if (theReplSet) {
                    response->setElectionId( theReplSet->getElectionId() );
                }
            }

            // Set the stats for the response
            response->setN( _stats->numInserted + _stats->numUpserted + _stats->numMatched
                            + _stats->numDeleted );
            if ( request.getBatchType() == BatchedCommandRequest::BatchType_Update )
                response->setNModified( _stats->numModified );
        }

        dassert( response->isValid( NULL ) );
    }
Example #14
0
    static StatusWith<double> computeGeoNearDistance(const GeoNearParams& nearParams,
                                                     WorkingSetMember* member) {

        //
        // Generic GeoNear distance computation
        // Distances are computed by projecting the stored geometry into the query CRS, and
        // computing distance in that CRS.
        //

        // Must have an object in order to get geometry out of it.
        invariant(member->hasObj());

        CRS queryCRS = nearParams.nearQuery.centroid.crs;

        // Extract all the geometries out of this document for the near query
        OwnedPointerVector<StoredGeometry> geometriesOwned;
        vector<StoredGeometry*>& geometries = geometriesOwned.mutableVector();
        extractGeometries(member->obj, nearParams.nearQuery.field, &geometries);

        // Compute the minimum distance of all the geometries in the document
        double minDistance = -1;
        BSONObj minDistanceObj;
        for (vector<StoredGeometry*>::iterator it = geometries.begin(); it != geometries.end();
            ++it) {

            StoredGeometry& stored = **it;

            // NOTE: A stored document with STRICT_SPHERE CRS is treated as a malformed document
            // and ignored. Since GeoNear requires an index, there's no stored STRICT_SPHERE shape.
            // So we don't check it here.

            // NOTE: For now, we're sure that if we get this far in the query we'll have an
            // appropriate index which validates the type of geometry we're pulling back here.
            // TODO: It may make sense to change our semantics and, by default, only return
            // shapes in the same CRS from $geoNear.
            if (!stored.geometry.supportsProject(queryCRS))
                continue;
            stored.geometry.projectInto(queryCRS);

            double nextDistance = stored.geometry.minDistance(nearParams.nearQuery.centroid);

            if (minDistance < 0 || nextDistance < minDistance) {
                minDistance = nextDistance;
                minDistanceObj = stored.element.Obj();
            }
        }

        if (minDistance < 0) {
            // No distance to report
            return StatusWith<double>(-1);
        }

        if (nearParams.addDistMeta) {
            if (nearParams.nearQuery.unitsAreRadians) {
                // Hack for nearSphere
                // TODO: Remove nearSphere?
                invariant(SPHERE == queryCRS);
                member->addComputed(new GeoDistanceComputedData(minDistance
                                                                / kRadiusOfEarthInMeters));
            }
            else {
                member->addComputed(new GeoDistanceComputedData(minDistance));
            }
        }

        if (nearParams.addPointMeta) {
            member->addComputed(new GeoNearPointComputedData(minDistanceObj));
        }

        return StatusWith<double>(minDistance);
    }
Example #15
0
    /**
     * The core config write functionality.
     *
     * Config writes run in two passes - the first is a quick check to ensure the config servers
     * are all reachable, the second runs the actual write.
     *
     * TODO: Upgrade and move this logic to the config servers, a state machine implementation
     * is probably the next step.
     */
    void ConfigCoordinator::executeBatch(const BatchedCommandRequest& clientRequest,
                                         BatchedCommandResponse* clientResponse) {

        const NamespaceString nss(clientRequest.getNS());

        // Should never use it for anything other than DBs residing on the config server
        dassert(nss.db() == "config" || nss.db() == "admin");
        dassert(clientRequest.sizeWriteOps() == 1u);

        // This is an opportunistic check that all config servers look healthy by calling
        // getLastError on each one of them. If there was some form of write/journaling error, get
        // last error would fail.
        {
            for (vector<ConnectionString>::iterator it = _configHosts.begin();
                 it != _configHosts.end();
                 ++it) {

                _dispatcher->addCommand(*it,
                                        "admin",
                                        RawBSONSerializable(BSON("getLastError" << true <<
                                                                 "fsync" << true)));
            }

            _dispatcher->sendAll();

            bool error = false;
            while (_dispatcher->numPending()) {
                ConnectionString host;
                RawBSONSerializable response;

                Status status = _dispatcher->recvAny(&host, &response);
                if (status.isOK()) {
                    BSONObj obj = response.toBSON();

                    LOG(3) << "Response " << obj.toString();

                    // If the ok field is anything other than 1, count it as error
                    if (!obj["ok"].trueValue()) {
                        error = true;
                        log() << "Config server check for host " << host
                              << " returned error: " << response;
                    }
                }
                else {
                    error = true;
                    log() << "Config server check for host " << host
                          << " failed with status: " << status;
                }
            }

            // All responses should have been gathered by this point
            if (error) {
                clientResponse->setOk(false);
                clientResponse->setErrCode(ErrorCodes::RemoteValidationError);
                clientResponse->setErrMessage("Could not verify that config servers were active"
                                              " and reachable before write");
                return;
            }
        }

        if (!_checkConfigString(clientResponse)) {
            return;
        }

        //
        // Do the actual writes
        //

        BatchedCommandRequest configRequest( clientRequest.getBatchType() );
        clientRequest.cloneTo( &configRequest );
        configRequest.setNS( nss.coll() );

        OwnedPointerVector<ConfigResponse> responsesOwned;
        vector<ConfigResponse*>& responses = responsesOwned.mutableVector();

        //
        // Send the actual config writes
        //

        // Get as many batches as we can at once
        for (vector<ConnectionString>::const_iterator it = _configHosts.begin();
             it != _configHosts.end();
             ++it) {

            const ConnectionString& configHost = *it;
            _dispatcher->addCommand(configHost, nss.db(), configRequest);
        }

        // Send them all out
        _dispatcher->sendAll();

        //
        // Recv side
        //

        while (_dispatcher->numPending() > 0) {
            // Get the response
            responses.push_back(new ConfigResponse());

            ConfigResponse& configResponse = *responses.back();
            Status dispatchStatus = _dispatcher->recvAny(&configResponse.configHost,
                                                         &configResponse.response);

            if (!dispatchStatus.isOK()) {
                buildErrorFrom(dispatchStatus, &configResponse.response);
            }
        }

        combineResponses(responses, clientResponse);
    }
Example #16
0
    void WriteBatchExecutor::executeBatch( const BatchedCommandRequest& request,
                                           BatchedCommandResponse* response ) {

        // Validate namespace
        const NamespaceString nss = NamespaceString( request.getNS() );
        if ( !nss.isValid() ) {
            toBatchError( Status( ErrorCodes::InvalidNamespace,
                                  nss.ns() + " is not a valid namespace" ),
                          response );
            return;
        }

        // Make sure we can write to the namespace
        Status allowedStatus = userAllowedWriteNS( nss );
        if ( !allowedStatus.isOK() ) {
            toBatchError( allowedStatus, response );
            return;
        }

        // Validate insert index requests
        // TODO: Push insert index requests through createIndex once all upgrade paths support it
        string errMsg;
        if ( request.isInsertIndexRequest() && !request.isValidIndexRequest( &errMsg ) ) {
            toBatchError( Status( ErrorCodes::InvalidOptions, errMsg ), response );
            return;
        }

        // Validate write concern
        // TODO: Lift write concern parsing out of this entirely
        WriteConcernOptions writeConcern;

        BSONObj wcDoc;
        if ( request.isWriteConcernSet() ) {
            wcDoc = request.getWriteConcern();
        }

        Status wcStatus = Status::OK();
        if ( wcDoc.isEmpty() ) {

            // The default write concern if empty is w : 1
            // Specifying w : 0 is/was allowed, but is interpreted identically to w : 1

            wcStatus = writeConcern.parse(
                _defaultWriteConcern.isEmpty() ?
                    WriteConcernOptions::Acknowledged : _defaultWriteConcern );

            if ( writeConcern.wNumNodes == 0 && writeConcern.wMode.empty() ) {
                writeConcern.wNumNodes = 1;
            }
        }
        else {
            wcStatus = writeConcern.parse( wcDoc );
        }

        if ( wcStatus.isOK() ) {
            wcStatus = validateWriteConcern( writeConcern );
        }

        if ( !wcStatus.isOK() ) {
            toBatchError( wcStatus, response );
            return;
        }

        if ( request.sizeWriteOps() == 0u ) {
            toBatchError( Status( ErrorCodes::InvalidLength,
                                  "no write ops were included in the batch" ),
                          response );
            return;
        }

        // Validate batch size
        if ( request.sizeWriteOps() > BatchedCommandRequest::kMaxWriteBatchSize ) {
            toBatchError( Status( ErrorCodes::InvalidLength,
                                  stream() << "exceeded maximum write batch size of "
                                           << BatchedCommandRequest::kMaxWriteBatchSize ),
                          response );
            return;
        }

        //
        // End validation
        //

        bool silentWC = writeConcern.wMode.empty() && writeConcern.wNumNodes == 0
                        && writeConcern.syncMode == WriteConcernOptions::NONE;

        Timer commandTimer;

        OwnedPointerVector<WriteErrorDetail> writeErrorsOwned;
        vector<WriteErrorDetail*>& writeErrors = writeErrorsOwned.mutableVector();

        OwnedPointerVector<BatchedUpsertDetail> upsertedOwned;
        vector<BatchedUpsertDetail*>& upserted = upsertedOwned.mutableVector();

        //
        // Apply each batch item, possibly bulking some items together in the write lock.
        // Stops on error if batch is ordered.
        //

        bulkExecute( request, &upserted, &writeErrors );

        //
        // Try to enforce the write concern if everything succeeded (unordered or ordered)
        // OR if something succeeded and we're unordered.
        //

        auto_ptr<WCErrorDetail> wcError;
        bool needToEnforceWC = writeErrors.empty()
                               || ( !request.getOrdered()
                                    && writeErrors.size() < request.sizeWriteOps() );

        if ( needToEnforceWC ) {

            _client->curop()->setMessage( "waiting for write concern" );

            WriteConcernResult res;
            Status status = waitForWriteConcern( _txn, writeConcern, _client->getLastOp(), &res );

            if ( !status.isOK() ) {
                wcError.reset( toWriteConcernError( status, res ) );
            }
        }

        //
        // Refresh metadata if needed
        //

        bool staleBatch = !writeErrors.empty()
                          && writeErrors.back()->getErrCode() == ErrorCodes::StaleShardVersion;

        if ( staleBatch ) {

            const BatchedRequestMetadata* requestMetadata = request.getMetadata();
            dassert( requestMetadata );

            // Make sure our shard name is set or is the same as what was set previously
            if ( shardingState.setShardName( requestMetadata->getShardName() ) ) {

                //
                // First, we refresh metadata if we need to based on the requested version.
                //

                ChunkVersion latestShardVersion;
                shardingState.refreshMetadataIfNeeded( request.getTargetingNS(),
                                                       requestMetadata->getShardVersion(),
                                                       &latestShardVersion );

                // Report if we're still changing our metadata
                // TODO: Better reporting per-collection
                if ( shardingState.inCriticalMigrateSection() ) {
                    noteInCriticalSection( writeErrors.back() );
                }

                if ( queueForMigrationCommit ) {

                    //
                    // Queue up for migration to end - this allows us to be sure that clients will
                    // not repeatedly try to refresh metadata that is not yet written to the config
                    // server.  Not necessary for correctness.
                    // Exposed as optional parameter to allow testing of queuing behavior with
                    // different network timings.
                    //

                    const ChunkVersion& requestShardVersion = requestMetadata->getShardVersion();

                    //
                    // Only wait if we're an older version (in the current collection epoch) and
                    // we're not write compatible, implying that the current migration is affecting
                    // writes.
                    //

                    if ( requestShardVersion.isOlderThan( latestShardVersion ) &&
                         !requestShardVersion.isWriteCompatibleWith( latestShardVersion ) ) {

                        while ( shardingState.inCriticalMigrateSection() ) {

                            log() << "write request to old shard version "
                                  << requestMetadata->getShardVersion().toString()
                                  << " waiting for migration commit" << endl;

                            shardingState.waitTillNotInCriticalSection( 10 /* secs */);
                        }
                    }
                }
            }
            else {
                // If our shard name is stale, our version must have been stale as well
                dassert( writeErrors.size() == request.sizeWriteOps() );
            }
        }

        //
        // Construct response
        //

        response->setOk( true );

        if ( !silentWC ) {

            if ( upserted.size() ) {
                response->setUpsertDetails( upserted );
            }

            if ( writeErrors.size() ) {
                response->setErrDetails( writeErrors );
            }

            if ( wcError.get() ) {
                response->setWriteConcernError( wcError.release() );
            }

            const repl::ReplicationCoordinator::Mode replMode =
                    repl::getGlobalReplicationCoordinator()->getReplicationMode();
            if (replMode != repl::ReplicationCoordinator::modeNone) {
                response->setLastOp( _client->getLastOp() );
                if (replMode == repl::ReplicationCoordinator::modeReplSet) {
                    response->setElectionId(repl::theReplSet->getElectionId());
                }
            }

            // Set the stats for the response
            response->setN( _stats->numInserted + _stats->numUpserted + _stats->numMatched
                            + _stats->numDeleted );
            if ( request.getBatchType() == BatchedCommandRequest::BatchType_Update )
                response->setNModified( _stats->numModified );
        }

        dassert( response->isValid( NULL ) );
    }
Example #17
0
    StatusWith<CompactStats> Collection::compact( const CompactOptions* compactOptions ) {

        if ( isCapped() )
            return StatusWith<CompactStats>( ErrorCodes::BadValue,
                                             "cannot compact capped collection" );

        if ( _indexCatalog.numIndexesInProgress() )
            return StatusWith<CompactStats>( ErrorCodes::BadValue,
                                             "cannot compact when indexes in progress" );

        NamespaceDetails* d = details();

        // this is a big job, so might as well make things tidy before we start just to be nice.
        getDur().commitIfNeeded();

        list<DiskLoc> extents;
        for( DiskLoc L = d->firstExtent(); !L.isNull(); L = L.ext()->xnext )
            extents.push_back(L);
        log() << "compact " << extents.size() << " extents" << endl;

        // same data, but might perform a little different after compact?
        _infoCache.reset();

        vector<BSONObj> indexSpecs;
        {
            IndexCatalog::IndexIterator ii( _indexCatalog.getIndexIterator( false ) );
            while ( ii.more() ) {
                IndexDescriptor* descriptor = ii.next();
                indexSpecs.push_back( _compactAdjustIndexSpec( descriptor->infoObj() ) );
            }
        }

        log() << "compact orphan deleted lists" << endl;
        d->orphanDeletedList();

        // Start over from scratch with our extent sizing and growth
        d->setLastExtentSize( 0 );

        // before dropping indexes, at least make sure we can allocate one extent!
        if ( allocateSpaceForANewRecord( _ns.ns().c_str(),
                                         d,
                                         Record::HeaderSize+1,
                                         false).isNull() ) {
            return StatusWith<CompactStats>( ErrorCodes::InternalError,
                                             "compact error no space available to allocate" );
        }

        // note that the drop indexes call also invalidates all clientcursors for the namespace,
        // which is important and wanted here
        log() << "compact dropping indexes" << endl;
        Status status = _indexCatalog.dropAllIndexes( true );
        if ( !status.isOK() ) {
            return StatusWith<CompactStats>( status );
        }

        getDur().commitIfNeeded();

        CompactStats stats;

        OwnedPointerVector<IndexCatalog::IndexBuildBlock> indexBuildBlocks;
        vector<IndexAccessMethod*> indexesToInsertTo;
        vector< std::pair<IndexAccessMethod*,IndexAccessMethod*> > bulkToCommit;
        for ( size_t i = 0; i < indexSpecs.size(); i++ ) {
            killCurrentOp.checkForInterrupt(false);
            BSONObj info = indexSpecs[i];
            info = _compactAdjustIndexSpec( info );
            info = _indexCatalog.fixIndexSpec( info );
            auto_ptr<IndexCatalog::IndexBuildBlock> block( new IndexCatalog::IndexBuildBlock( this,info ) );
            Status status = block->init();
            if ( !status.isOK() )
                return StatusWith<CompactStats>(status);

            IndexAccessMethod* accessMethod = block->getEntry()->accessMethod();
            status = accessMethod->initializeAsEmpty();
            if ( !status.isOK() )
                return StatusWith<CompactStats>(status);

            IndexAccessMethod* bulk = accessMethod->initiateBulk();
            if ( bulk ) {
                indexesToInsertTo.push_back( bulk );
                bulkToCommit.push_back( std::pair<IndexAccessMethod*,IndexAccessMethod*>( accessMethod, bulk ) );
            }
            else {
                indexesToInsertTo.push_back( accessMethod );
            }

            indexBuildBlocks.mutableVector().push_back( block.release() );
        }

        // reset data size and record counts to 0 for this namespace
        // as we're about to tally them up again for each new extent
        d->setStats( 0, 0 );

        ProgressMeterHolder pm(cc().curop()->setMessage("compact extent",
                                                        "Extent Compacting Progress",
                                                        extents.size()));

        int extentNumber = 0;
        for( list<DiskLoc>::iterator i = extents.begin(); i != extents.end(); i++ ) {
            _compactExtent(*i, extentNumber++, indexesToInsertTo, compactOptions, &stats );
            pm.hit();
        }

        verify( d->firstExtent().ext()->xprev.isNull() );

        // indexes will do their own progress meter?
        pm.finished();

        log() << "starting index commits";

        for ( size_t i = 0; i < bulkToCommit.size(); i++ ) {
            bulkToCommit[i].first->commitBulk( bulkToCommit[i].second, false, NULL );
        }

        for ( size_t i = 0; i < indexBuildBlocks.size(); i++ ) {
            IndexCatalog::IndexBuildBlock* block = indexBuildBlocks.mutableVector()[i];
            block->success();
        }

        return StatusWith<CompactStats>( stats );
    }
Example #18
0
    /**
     * The core config write functionality.
     *
     * Config writes run in two passes - the first is a quick check to ensure the config servers
     * are all reachable, the second runs the actual write.
     *
     * TODO: Upgrade and move this logic to the config servers, a state machine implementation
     * is probably the next step.
     */
    void ConfigCoordinator::executeBatch( const BatchedCommandRequest& clientRequest,
                                          BatchedCommandResponse* clientResponse,
                                          bool fsyncCheck ) {

        NamespaceString nss( clientRequest.getNS() );
        dassert( nss.db() == "config" || nss.db() == "admin" );
        dassert( clientRequest.sizeWriteOps() == 1u );

        if ( fsyncCheck ) {

            //
            // Sanity check that all configs are still reachable using fsync, preserving legacy
            // behavior
            //

            OwnedPointerVector<ConfigFsyncResponse> fsyncResponsesOwned;
            vector<ConfigFsyncResponse*>& fsyncResponses = fsyncResponsesOwned.mutableVector();

            //
            // Send side
            //

            for ( vector<ConnectionString>::iterator it = _configHosts.begin();
                it != _configHosts.end(); ++it ) {
                ConnectionString& configHost = *it;
                FsyncRequest fsyncRequest;
                _dispatcher->addCommand( configHost, "admin", fsyncRequest );
            }

            _dispatcher->sendAll();

            //
            // Recv side
            //

            bool fsyncError = false;
            while ( _dispatcher->numPending() > 0 ) {

                fsyncResponses.push_back( new ConfigFsyncResponse() );
                ConfigFsyncResponse& fsyncResponse = *fsyncResponses.back();
                Status dispatchStatus = _dispatcher->recvAny( &fsyncResponse.configHost,
                                                              &fsyncResponse.response );

                // We've got to recv everything, no matter what
                if ( !dispatchStatus.isOK() ) {
                    fsyncError = true;
                    buildFsyncErrorFrom( dispatchStatus, &fsyncResponse.response );
                }
                else if ( !fsyncResponse.response.getOk() ) {
                    fsyncError = true;
                }
            }

            if ( fsyncError ) {
                combineFsyncErrors( fsyncResponses, clientResponse );
                return;
            }
            else {
                fsyncResponsesOwned.clear();
            }
        }

        //
        // Do the actual writes
        //

        BatchedCommandRequest configRequest( clientRequest.getBatchType() );
        clientRequest.cloneTo( &configRequest );
        configRequest.setNS( nss.coll() );

        OwnedPointerVector<ConfigResponse> responsesOwned;
        vector<ConfigResponse*>& responses = responsesOwned.mutableVector();

        //
        // Send the actual config writes
        //

        // Get as many batches as we can at once
        for ( vector<ConnectionString>::iterator it = _configHosts.begin();
            it != _configHosts.end(); ++it ) {
            ConnectionString& configHost = *it;
            _dispatcher->addCommand( configHost, nss.db(), configRequest );
        }

        // Send them all out
        _dispatcher->sendAll();

        //
        // Recv side
        //

        while ( _dispatcher->numPending() > 0 ) {

            // Get the response
            responses.push_back( new ConfigResponse() );
            ConfigResponse& configResponse = *responses.back();
            Status dispatchStatus = _dispatcher->recvAny( &configResponse.configHost,
                                                          &configResponse.response );

            if ( !dispatchStatus.isOK() ) {
                buildErrorFrom( dispatchStatus, &configResponse.response );
            }
        }

        combineResponses( responses, clientResponse );
    }
Example #19
0
    Status Strategy::commandOpWrite(const std::string& dbName,
                                    const BSONObj& command,
                                    BatchItemRef targetingBatchItem,
                                    std::vector<CommandResult>* results) {

        // Note that this implementation will not handle targeting retries and does not completely
        // emulate write behavior

        ChunkManagerTargeter targeter(NamespaceString(
                                        targetingBatchItem.getRequest()->getTargetingNS()));
        Status status = targeter.init();
        if (!status.isOK())
            return status;

        OwnedPointerVector<ShardEndpoint> endpointsOwned;
        vector<ShardEndpoint*>& endpoints = endpointsOwned.mutableVector();

        if (targetingBatchItem.getOpType() == BatchedCommandRequest::BatchType_Insert) {
            ShardEndpoint* endpoint;
            Status status = targeter.targetInsert(targetingBatchItem.getDocument(), &endpoint);
            if (!status.isOK())
                return status;
            endpoints.push_back(endpoint);
        }
        else if (targetingBatchItem.getOpType() == BatchedCommandRequest::BatchType_Update) {
            Status status = targeter.targetUpdate(*targetingBatchItem.getUpdate(), &endpoints);
            if (!status.isOK())
                return status;
        }
        else {
            invariant(targetingBatchItem.getOpType() == BatchedCommandRequest::BatchType_Delete);
            Status status = targeter.targetDelete(*targetingBatchItem.getDelete(), &endpoints);
            if (!status.isOK())
                return status;
        }

        DBClientShardResolver resolver;
        DBClientMultiCommand dispatcher;

        // Assemble requests
        for (vector<ShardEndpoint*>::const_iterator it = endpoints.begin(); it != endpoints.end();
            ++it) {

            const ShardEndpoint* endpoint = *it;

            ConnectionString host;
            Status status = resolver.chooseWriteHost(endpoint->shardName, &host);
            if (!status.isOK())
                return status;

            RawBSONSerializable request(command);
            dispatcher.addCommand(host, dbName, request);
        }

        // Errors reported when recv'ing responses
        dispatcher.sendAll();
        Status dispatchStatus = Status::OK();

        // Recv responses
        while (dispatcher.numPending() > 0) {

            ConnectionString host;
            RawBSONSerializable response;

            Status status = dispatcher.recvAny(&host, &response);
            if (!status.isOK()) {
                // We always need to recv() all the sent operations
                dispatchStatus = status;
                continue;
            }

            CommandResult result;
            result.target = host;
            result.shardTarget = Shard::make(host.toString());
            result.result = response.toBSON();

            results->push_back(result);
        }

        return dispatchStatus;
    }
    /**
     * Upgrade v3 to v4 described here.
     *
     * This upgrade takes a config server without collection epochs (potentially) and adds
     * epochs to all mongo processes.
     *
     */
    bool doUpgradeV3ToV4(const ConnectionString& configLoc,
                         const VersionType& lastVersionInfo,
                         string* errMsg)
    {
        string dummy;
        if (!errMsg) errMsg = &dummy;

        verify(lastVersionInfo.getCurrentVersion() == UpgradeHistory_NoEpochVersion);

        if (lastVersionInfo.isUpgradeIdSet() && lastVersionInfo.getUpgradeId().isSet()) {

            //
            // Another upgrade failed, so cleanup may be necessary
            //

            BSONObj lastUpgradeState = lastVersionInfo.getUpgradeState();

            bool inCriticalSection;
            if (!FieldParser::extract(lastUpgradeState,
                                      inCriticalSectionField,
                                      &inCriticalSection,
                                      errMsg))
            {

                *errMsg = stream() << "problem reading previous upgrade state" << causedBy(errMsg);

                return false;
            }

            if (inCriticalSection) {

                // Manual intervention is needed here.  Somehow our upgrade didn't get applied
                // consistently across config servers.

                *errMsg = cannotCleanupMessage;

                return false;
            }

            if (!_cleanupUpgradeState(configLoc, lastVersionInfo.getUpgradeId(), errMsg)) {
                
                // If we can't cleanup the old upgrade state, the user might have done it for us,
                // not a fatal problem (we'll just end up with extra collections).
                
                warning() << "could not cleanup previous upgrade state" << causedBy(errMsg) << endl;
                *errMsg = "";
            }
        }

        //
        // Check the versions of other mongo processes in the cluster before upgrade.
        // We can't upgrade if there are active pre-v2.2 processes in the cluster
        //

        Status mongoVersionStatus = checkClusterMongoVersions(configLoc,
                                                              string(minMongoProcessVersion));

        if (!mongoVersionStatus.isOK()) {

            *errMsg = stream() << "cannot upgrade with pre-v" << minMongoProcessVersion
                               << " mongo processes active in the cluster"
                               << causedBy(mongoVersionStatus);

            return false;
        }

        VersionType newVersionInfo;
        lastVersionInfo.cloneTo(&newVersionInfo);

        // Set our upgrade id and state
        OID upgradeId = OID::gen();
        newVersionInfo.setUpgradeId(upgradeId);
        newVersionInfo.setUpgradeState(BSONObj());

        // Write our upgrade id and state
        {
            scoped_ptr<ScopedDbConnection> connPtr;

            try {
                connPtr.reset(ScopedDbConnection::getInternalScopedDbConnection(configLoc, 30));
                ScopedDbConnection& conn = *connPtr;

                verify(newVersionInfo.isValid(NULL));

                conn->update(VersionType::ConfigNS,
                             BSON("_id" << 1 << VersionType::version_DEPRECATED(3)),
                             newVersionInfo.toBSON());
                _checkGLE(conn);
            }
            catch (const DBException& e) {

                *errMsg = stream() << "could not initialize version info for upgrade"
                                   << causedBy(e);

                return false;
            }

            connPtr->done();
        }

        //
        // First lock all collection namespaces that exist
        //

        OwnedPointerMap<string, CollectionType> ownedCollections;
        const map<string, CollectionType*>& collections = ownedCollections.map();

        Status findCollectionsStatus = findAllCollectionsV3(configLoc, &ownedCollections);

        if (!findCollectionsStatus.isOK()) {

            *errMsg = stream() << "could not read collections from config server"
                               << causedBy(findCollectionsStatus);

            return false;
        }

        //
        // Acquire locks for all sharded collections
        // Something that didn't involve getting thousands of locks would be better.
        //

        OwnedPointerVector<ScopedDistributedLock> collectionLocks;

        log() << "acquiring locks for " << collections.size() << " sharded collections..." << endl;

        for (map<string, CollectionType*>::const_iterator it = collections.begin();
                it != collections.end(); ++it)
        {
            const CollectionType& collection = *(it->second);

            ScopedDistributedLock* namespaceLock = new ScopedDistributedLock(configLoc,
                                                                             collection.getNS());

            namespaceLock->setLockMessage(str::stream() << "upgrading " << collection.getNS()
                                                        << " with new epochs for upgrade "
                                                        << upgradeId);

            if (!namespaceLock->acquire(15 * 60 * 1000, errMsg)) {

                *errMsg = stream() << "could not acquire all namespace locks for upgrade"
                                   << causedBy(errMsg);

                return false;
            }

            collectionLocks.mutableVector().push_back(namespaceLock);

            // Progress update
            if (collectionLocks.vector().size() % 10 == 0) {
                log() << "acquired " << collectionLocks.vector().size() << " locks out of "
                      << collections.size() << " for config upgrade" << endl;
            }
        }

        // We are now preventing all splits and migrates for all sharded collections

        // Get working and backup suffixes
        string workingSuffix = genWorkingSuffix(upgradeId);
        string backupSuffix = genBackupSuffix(upgradeId);

        log() << "copying collection and chunk metadata to working and backup collections..."
              << endl;

        // Get a backup and working copy of the config.collections and config.chunks collections

        Status copyStatus = copyFrozenCollection(configLoc,
                                                 CollectionType::ConfigNS,
                                                 CollectionType::ConfigNS + workingSuffix);

        if (!copyStatus.isOK()) {

            *errMsg = stream() << "could not copy " << CollectionType::ConfigNS << " to "
                               << (CollectionType::ConfigNS + workingSuffix)
                               << causedBy(copyStatus);

            return false;
        }

        copyStatus = copyFrozenCollection(configLoc,
                                          CollectionType::ConfigNS,
                                          CollectionType::ConfigNS + backupSuffix);

        if (!copyStatus.isOK()) {

            *errMsg = stream() << "could not copy " << CollectionType::ConfigNS << " to "
                               << (CollectionType::ConfigNS + backupSuffix) << causedBy(copyStatus);

            return false;
        }

        copyStatus = copyFrozenCollection(configLoc,
                                          ChunkType::ConfigNS,
                                          ChunkType::ConfigNS + workingSuffix);

        if (!copyStatus.isOK()) {

            *errMsg = stream() << "could not copy " << ChunkType::ConfigNS << " to "
                               << (ChunkType::ConfigNS + workingSuffix) << causedBy(copyStatus);

            return false;
        }

        copyStatus = copyFrozenCollection(configLoc,
                                          ChunkType::ConfigNS,
                                          ChunkType::ConfigNS + backupSuffix);

        if (!copyStatus.isOK()) {

            *errMsg = stream() << "could not copy " << ChunkType::ConfigNS << " to "
                               << (ChunkType::ConfigNS + backupSuffix) << causedBy(copyStatus);

            return false;
        }

        //
        // Go through sharded collections one-by-one and add epochs where missing
        //

        for (map<string, CollectionType*>::const_iterator it = collections.begin();
                it != collections.end(); ++it)
        {
            // Create a copy so that we can change the epoch later
            CollectionType collection;
            it->second->cloneTo(&collection);

            log() << "checking epochs for " << collection.getNS() << " collection..." << endl;

            OID epoch = collection.getEpoch();

            //
            // Go through chunks to find epoch if we haven't found it or to verify epoch is the same
            //

            OwnedPointerVector<ChunkType> ownedChunks;
            const vector<ChunkType*>& chunks = ownedChunks.vector();

            Status findChunksStatus = findAllChunks(configLoc, collection.getNS(), &ownedChunks);

            if (!findChunksStatus.isOK()) {

                *errMsg = stream() << "could not read chunks from config server"
                                   << causedBy(findChunksStatus);

                return false;
            }

            for (vector<ChunkType*>::const_iterator chunkIt = chunks.begin();
                    chunkIt != chunks.end(); ++chunkIt)
            {
                const ChunkType& chunk = *(*chunkIt);

                // If our chunk epoch is set and doesn't match
                if (epoch.isSet() && chunk.getVersion().epoch().isSet()
                    && chunk.getVersion().epoch() != epoch)
                {

                    *errMsg = stream() << "chunk epoch for " << chunk.toString() << " in "
                                       << collection.getNS() << " does not match found epoch "
                                       << epoch;

                    return false;
                }
                else if (!epoch.isSet() && chunk.getVersion().epoch().isSet()) {
                    epoch = chunk.getVersion().epoch();
                }
            }

            //
            // Write collection epoch if needed
            //

            if (!collection.getEpoch().isSet()) {

                OID newEpoch = OID::gen();

                log() << "writing new epoch " << newEpoch << " for " << collection.getNS()
                      << " collection..." << endl;

                scoped_ptr<ScopedDbConnection> connPtr;

                try {
                    connPtr.reset(ScopedDbConnection::getInternalScopedDbConnection(configLoc, 30));
                    ScopedDbConnection& conn = *connPtr;

                    conn->update(CollectionType::ConfigNS + workingSuffix,
                                 BSON(CollectionType::ns(collection.getNS())),
                                 BSON("$set" << BSON(CollectionType::DEPRECATED_lastmodEpoch(newEpoch))));
                    _checkGLE(conn);
                }
                catch (const DBException& e) {

                    *errMsg = stream() << "could not write a new epoch for " << collection.getNS()
                                       << causedBy(e);

                    return false;
                }

                connPtr->done();
                collection.setEpoch(newEpoch);
            }

            epoch = collection.getEpoch();
            verify(epoch.isSet());

            //
            // Now write verified epoch to all chunks
            //

            log() << "writing epoch " << epoch << " for " << chunks.size() << " chunks in "
                  << collection.getNS() << " collection..." << endl;

            {
                scoped_ptr<ScopedDbConnection> connPtr;

                try {
                    connPtr.reset(ScopedDbConnection::getInternalScopedDbConnection(configLoc, 30));
                    ScopedDbConnection& conn = *connPtr;

                    // Multi-update of all chunks
                    conn->update(ChunkType::ConfigNS + workingSuffix,
                                 BSON(ChunkType::ns(collection.getNS())),
                                 BSON("$set" << BSON(ChunkType::DEPRECATED_epoch(epoch))),
                                 false,
                                 true); // multi
                    _checkGLE(conn);
                }
                catch (const DBException& e) {

                    *errMsg = stream() << "could not write a new epoch " << epoch.toString()
                                       << " for chunks in " << collection.getNS() << causedBy(e);

                    return false;
                }

                connPtr->done();
            }
        }

        //
        // Paranoid verify the collection writes
        //

        {
            scoped_ptr<ScopedDbConnection> connPtr;

            try {
                connPtr.reset(ScopedDbConnection::getInternalScopedDbConnection(configLoc, 30));
                ScopedDbConnection& conn = *connPtr;

                // Find collections with no epochs
                BSONObj emptyDoc =
                        conn->findOne(CollectionType::ConfigNS + workingSuffix,
                                      BSON("$unset" << BSON(CollectionType::DEPRECATED_lastmodEpoch() << 1)));

                if (!emptyDoc.isEmpty()) {

                    *errMsg = stream() << "collection " << emptyDoc
                                       << " is still missing epoch after config upgrade";

                    connPtr->done();
                    return false;
                }

                // Find collections with empty epochs
                emptyDoc = conn->findOne(CollectionType::ConfigNS + workingSuffix,
                                         BSON(CollectionType::DEPRECATED_lastmodEpoch(OID())));

                if (!emptyDoc.isEmpty()) {

                    *errMsg = stream() << "collection " << emptyDoc
                                       << " still has empty epoch after config upgrade";

                    connPtr->done();
                    return false;
                }

                // Find chunks with no epochs
                emptyDoc =
                        conn->findOne(ChunkType::ConfigNS + workingSuffix,
                                      BSON("$unset" << BSON(ChunkType::DEPRECATED_epoch() << 1)));

                if (!emptyDoc.isEmpty()) {

                    *errMsg = stream() << "chunk " << emptyDoc
                                       << " is still missing epoch after config upgrade";

                    connPtr->done();
                    return false;
                }

                // Find chunks with empty epochs
                emptyDoc = conn->findOne(ChunkType::ConfigNS + workingSuffix,
                                         BSON(ChunkType::DEPRECATED_epoch(OID())));

                if (!emptyDoc.isEmpty()) {

                    *errMsg = stream() << "chunk " << emptyDoc
                                       << " still has empty epoch after config upgrade";

                    connPtr->done();
                    return false;
                }
            }
            catch (const DBException& e) {

                *errMsg = stream() << "could not verify epoch writes" << causedBy(e);

                return false;
            }

            connPtr->done();
        }

        //
        // Double check that our collections haven't changed
        //

        Status idCheckStatus = checkIdsTheSame(configLoc,
                                               CollectionType::ConfigNS,
                                               CollectionType::ConfigNS + workingSuffix);

        if (!idCheckStatus.isOK()) {

            *errMsg = stream() << CollectionType::ConfigNS
                               << " was modified while working on upgrade"
                               << causedBy(idCheckStatus);

            return false;
        }

        idCheckStatus = checkIdsTheSame(configLoc,
                                        ChunkType::ConfigNS,
                                        ChunkType::ConfigNS + workingSuffix);

        if (!idCheckStatus.isOK()) {

            *errMsg = stream() << ChunkType::ConfigNS << " was modified while working on upgrade"
                               << causedBy(idCheckStatus);

            return false;
        }

        //
        // ENTER CRITICAL SECTION
        //

        newVersionInfo.setUpgradeState(BSON(inCriticalSectionField(true)));

        {
            scoped_ptr<ScopedDbConnection> connPtr;

            try {
                connPtr.reset(ScopedDbConnection::getInternalScopedDbConnection(configLoc, 30));
                ScopedDbConnection& conn = *connPtr;

                verify(newVersionInfo.isValid(NULL));

                conn->update(VersionType::ConfigNS,
                             BSON("_id" << 1 << VersionType::version_DEPRECATED(3)),
                             newVersionInfo.toBSON());
                _checkGLE(conn);
            }
            catch (const DBException& e) {

                // No cleanup message here since we're not sure if we wrote or not, and
                // not dangerous either way except to prevent further updates (at which point
                // the message is printed)
                *errMsg = stream()
                        << "could not update version info to enter critical update section"
                        << causedBy(e);

                return false;
            }

            // AT THIS POINT ANY FAILURE REQUIRES MANUAL INTERVENTION!
            connPtr->done();
        }

        Status overwriteStatus = overwriteCollection(configLoc,
                                                     CollectionType::ConfigNS + workingSuffix,
                                                     CollectionType::ConfigNS);

        if (!overwriteStatus.isOK()) {

            error() << cleanupMessage << endl;
            *errMsg = stream() << "could not overwrite collection " << CollectionType::ConfigNS
                               << " with working collection "
                               << (CollectionType::ConfigNS + workingSuffix)
                               << causedBy(overwriteStatus);

            return false;
        }

        overwriteStatus = overwriteCollection(configLoc,
                                              ChunkType::ConfigNS + workingSuffix,
                                              ChunkType::ConfigNS);

        if (!overwriteStatus.isOK()) {

            error() << cleanupMessage << endl;
            *errMsg = stream() << "could not overwrite collection " << ChunkType::ConfigNS
                               << " with working collection "
                               << (ChunkType::ConfigNS + workingSuffix)
                               << causedBy(overwriteStatus);

            return false;
        }

        //
        // Finally update the version to latest and add clusterId to version
        //

        OID newClusterId = OID::gen();

        // Note: hardcoded versions, since this is a very particular upgrade
        // Note: DO NOT CLEAR the config version unless bumping the minCompatibleVersion,
        // we want to save the excludes that were set.

        newVersionInfo.setMinCompatibleVersion(UpgradeHistory_NoEpochVersion);
        newVersionInfo.setCurrentVersion(UpgradeHistory_MandatoryEpochVersion);
        newVersionInfo.setClusterId(newClusterId);

        // Leave critical section
        newVersionInfo.unsetUpgradeId();
        newVersionInfo.unsetUpgradeState();

        log() << "writing new version info and clusterId " << newClusterId << "..." << endl;

        {
            scoped_ptr<ScopedDbConnection> connPtr;

            try {
                connPtr.reset(ScopedDbConnection::getInternalScopedDbConnection(configLoc, 30));
                ScopedDbConnection& conn = *connPtr;

                verify(newVersionInfo.isValid(NULL));

                conn->update(VersionType::ConfigNS,
                             BSON("_id" << 1 << VersionType::version_DEPRECATED(UpgradeHistory_NoEpochVersion)),
                             newVersionInfo.toBSON());
                _checkGLE(conn);
            }
            catch (const DBException& e) {

                error() << cleanupMessage << endl;
                *errMsg = stream() << "could not write new version info "
                                   << "and exit critical upgrade section" << causedBy(e);

                return false;
            }

            connPtr->done();
        }

        //
        // END CRITICAL SECTION
        //

        return true;
    }
Example #21
0
    void Listener::initAndListen() {
        if (!_setupSocketsSuccessful) {
            return;
        }

        for (unsigned i = 0; i < _socks.size(); i++) {
            if (::listen(_socks[i], 128) != 0) {
                error() << "listen(): listen() failed " << errnoWithDescription() << endl;
                return;
            }

            ListeningSockets::get()->add(_socks[i]);
        }

#ifdef MONGO_CONFIG_SSL
        _logListen(_port, _ssl);
#else
        _logListen(_port, false);
#endif

        {
            // Wake up any threads blocked in waitUntilListening()
            stdx::lock_guard<stdx::mutex> lock(_readyMutex);
            _ready = true;
            _readyCondition.notify_all();
        }

        OwnedPointerVector<EventHolder> eventHolders;
        std::unique_ptr<WSAEVENT[]> events(new WSAEVENT[_socks.size()]);
        
        
        // Populate events array with an event for each socket we are watching
        for (size_t count = 0; count < _socks.size(); ++count) {
            EventHolder* ev(new EventHolder);
            eventHolders.mutableVector().push_back(ev);
            events[count] = ev->get();            
        }
            
        while ( ! inShutdown() ) {
            // Turn on listening for accept-ready sockets
            for (size_t count = 0; count < _socks.size(); ++count) {
                int status = WSAEventSelect(_socks[count], events[count], FD_ACCEPT | FD_CLOSE);
                if (status == SOCKET_ERROR) {
                    const int mongo_errno = WSAGetLastError();

                    // During shutdown, we may fail to listen on the socket if it has already
                    // been closed
                    if (inShutdown()) {
                        return;
                    }

                    error() << "Windows WSAEventSelect returned "
                        << errnoWithDescription(mongo_errno) << endl;
                    fassertFailed(16727);
                }
            }
        
            // Wait till one of them goes active, or we time out
            DWORD result = WSAWaitForMultipleEvents(_socks.size(),
                                                    events.get(), 
                                                    FALSE, // don't wait for all the events
                                                    10, // timeout, in ms 
                                                    FALSE); // do not allow I/O interruptions
            if (result == WSA_WAIT_FAILED) {
                const int mongo_errno = WSAGetLastError();
                error() << "Windows WSAWaitForMultipleEvents returned " 
                    << errnoWithDescription(mongo_errno) << endl;
                fassertFailed(16723);
            }
        
            if (result == WSA_WAIT_TIMEOUT) {
                _elapsedTime += 10;
                continue;
            }
            _elapsedTime += 1; // assume 1ms to grab connection. very rough
            
            // Determine which socket is ready
            DWORD eventIndex = result - WSA_WAIT_EVENT_0;
            WSANETWORKEVENTS networkEvents;            
            // Extract event details, and clear event for next pass
            int status = WSAEnumNetworkEvents(_socks[eventIndex],
                                              events[eventIndex], 
                                              &networkEvents);
            if (status == SOCKET_ERROR) {
                const int mongo_errno = WSAGetLastError();
                error() << "Windows WSAEnumNetworkEvents returned " 
                    << errnoWithDescription(mongo_errno) << endl;
                continue;
            }
            
            if (networkEvents.lNetworkEvents & FD_CLOSE) {              
                log() << "listen socket closed" << endl;
                break;
            }
            
            if (!(networkEvents.lNetworkEvents & FD_ACCEPT)) {
                error() << "Unexpected network event: " << networkEvents.lNetworkEvents << endl;
                continue;
            }
            
            int iec = networkEvents.iErrorCode[FD_ACCEPT_BIT];
            if (iec != 0) {                 
                error() << "Windows socket accept did not work:" << errnoWithDescription(iec) 
                        << endl;
                continue;
            }
            
            status = WSAEventSelect(_socks[eventIndex], NULL, 0);
            if (status == SOCKET_ERROR) {
                const int mongo_errno = WSAGetLastError();
                error() << "Windows WSAEventSelect returned " 
                    << errnoWithDescription(mongo_errno) << endl;
                continue;
            }
            
            disableNonblockingMode(_socks[eventIndex]);
            
            SockAddr from;
            int s = accept(_socks[eventIndex], from.raw(), &from.addressSize);
            if ( s < 0 ) {
                int x = errno; // so no global issues
                if (x == EBADF) {
                    log() << "Port " << _port << " is no longer valid" << endl;
                    continue;
                }
                else if (x == ECONNABORTED) {
                    log() << "Listener on port " << _port << " aborted" << endl;
                    continue;
                }
                if ( x == 0 && inShutdown() ) {
                    return;   // socket closed
                }
                if( !inShutdown() ) {
                    log() << "Listener: accept() returns " << s << " " 
                        << errnoWithDescription(x) << endl;
                    if (x == EMFILE || x == ENFILE) {
                        // Connection still in listen queue but we can't accept it yet
                        error() << "Out of file descriptors. Waiting one second before"
                            " trying to accept more connections." << warnings;
                        sleepsecs(1);
                    }
                }
                continue;
            }
            if (from.getType() != AF_UNIX)
                disableNagle(s);

            long long myConnectionNumber = globalConnectionNumber.addAndFetch(1);

            if (_logConnect && !serverGlobalParams.quiet) {
                int conns = globalTicketHolder.used()+1;
                const char* word = (conns == 1 ? " connection" : " connections");
                log() << "connection accepted from " << from.toString() << " #" << myConnectionNumber << " (" << conns << word << " now open)" << endl;
            }
            
            std::shared_ptr<Socket> pnewSock( new Socket(s, from) );
#ifdef MONGO_CONFIG_SSL
            if (_ssl) {
                pnewSock->secureAccepted(_ssl);
            }
#endif
            accepted( pnewSock , myConnectionNumber );
        }
    }
Example #22
0
    Status BatchWriteOp::targetBatch( const NSTargeter& targeter,
                                      bool recordTargetErrors,
                                      vector<TargetedWriteBatch*>* targetedBatches ) {

        TargetedBatchMap batchMap;

        size_t numWriteOps = _clientRequest->sizeWriteOps();
        for ( size_t i = 0; i < numWriteOps; ++i ) {

            // Only do one-at-a-time ops if COE is false
            if ( !_clientRequest->getContinueOnError() && !batchMap.empty() ) break;

            WriteOp& writeOp = _writeOps[i];

            // Only target _Ready ops
            if ( writeOp.getWriteState() != WriteOpState_Ready ) continue;

            //
            // Get TargetedWrites from the targeter for the write operation
            //

            // TargetedWrites need to be owned once returned
            OwnedPointerVector<TargetedWrite> writesOwned;
            vector<TargetedWrite*>& writes = writesOwned.mutableVector();

            Status targetStatus = writeOp.targetWrites( targeter, &writes );

            if ( !targetStatus.isOK() ) {

                //
                // We're not sure how to target here, so either record the error or cancel the
                // current batches.
                //

                BatchedErrorDetail targetError;
                buildTargetError( targetStatus, &targetError );

                if ( recordTargetErrors ) {
                    writeOp.setOpError( targetError );
                    continue;
                }
                else {
                    // Cancel current batch state with an error
                    cancelBatches( targetError, _writeOps, &batchMap );
                    dassert( batchMap.empty() );
                    return targetStatus;
                }
            }

            //
            // Targeting went ok, add to appropriate TargetedBatch
            //

            for ( vector<TargetedWrite*>::iterator it = writes.begin(); it != writes.end(); ++it ) {

                TargetedWrite* write = *it;

                TargetedBatchMap::iterator seenIt = batchMap.find( &write->endpoint );
                if ( seenIt == batchMap.end() ) {
                    TargetedWriteBatch* newBatch = new TargetedWriteBatch( write->endpoint );
                    seenIt = batchMap.insert( make_pair( &newBatch->getEndpoint(), //
                                                         newBatch ) ).first;
                }

                TargetedWriteBatch* batch = seenIt->second;
                batch->addWrite( write );
            }

            // Relinquish ownership of TargetedWrites, now the TargetedBatches own them
            writesOwned.mutableVector().clear();
        }

        //
        // Send back our targeted batches
        //

        for ( TargetedBatchMap::iterator it = batchMap.begin(); it != batchMap.end(); ++it ) {

            TargetedWriteBatch* batch = it->second;

            // Remember targeted batch for reporting
            _targeted.insert( batch );
            // Send the handle back to caller
            targetedBatches->push_back( batch );
        }

        return Status::OK();
    }
Example #23
0
Status WriteOp::targetWrites( const NSTargeter& targeter,
                              std::vector<TargetedWrite*>* targetedWrites ) {

    bool isUpdate = _itemRef.getOpType() == BatchedCommandRequest::BatchType_Update;
    bool isDelete = _itemRef.getOpType() == BatchedCommandRequest::BatchType_Delete;
    bool isIndexInsert = _itemRef.getRequest()->isInsertIndexRequest();

    Status targetStatus = Status::OK();
    OwnedPointerVector<ShardEndpoint> endpointsOwned;
    vector<ShardEndpoint*>& endpoints = endpointsOwned.mutableVector();

    if ( isUpdate ) {
        targetStatus = targeter.targetUpdate( *_itemRef.getUpdate(), &endpoints );
    }
    else if ( isDelete ) {
        targetStatus = targeter.targetDelete( *_itemRef.getDelete(), &endpoints );
    }
    else {
        dassert( _itemRef.getOpType() == BatchedCommandRequest::BatchType_Insert );

        ShardEndpoint* endpoint = NULL;
        // TODO: Remove the index targeting stuff once there is a command for it
        if ( !isIndexInsert ) {
            targetStatus = targeter.targetInsert( _itemRef.getDocument(), &endpoint );
        }
        else {
            // TODO: Retry index writes with stale version?
            targetStatus = targeter.targetAll( &endpoints );
        }

        if ( !targetStatus.isOK() ) {
            dassert( NULL == endpoint );
            return targetStatus;
        }

        // Store single endpoint result if we targeted a single endpoint
        if ( endpoint ) endpoints.push_back( endpoint );
    }

    // If we had an error, stop here
    if ( !targetStatus.isOK() ) return targetStatus;

    for ( vector<ShardEndpoint*>::iterator it = endpoints.begin(); it != endpoints.end();
            ++it ) {

        ShardEndpoint* endpoint = *it;

        _childOps.push_back( new ChildWriteOp( this ) );

        WriteOpRef ref( _itemRef.getItemIndex(), _childOps.size() - 1 );

        // For now, multiple endpoints imply no versioning
        if ( endpoints.size() == 1u ) {
            targetedWrites->push_back( new TargetedWrite( *endpoint, ref ) );
        }
        else {
            ShardEndpoint broadcastEndpoint( endpoint->shardName,
                                             ChunkVersion::IGNORED() );
            targetedWrites->push_back( new TargetedWrite( broadcastEndpoint, ref ) );
        }

        _childOps.back()->pendingWrite = targetedWrites->back();
        _childOps.back()->state = WriteOpState_Pending;
    }

    _state = WriteOpState_Pending;
    return Status::OK();
}
// static
Status ClearFilters::clear(OperationContext* txn,
                           QuerySettings* querySettings,
                           PlanCache* planCache,
                           const std::string& ns,
                           const BSONObj& cmdObj) {
    invariant(querySettings);

    // According to the specification, the planCacheClearFilters command runs in two modes:
    // - clear all hints; or
    // - clear hints for single query shape when a query shape is described in the
    //   command arguments.
    if (cmdObj.hasField("query")) {
        auto statusWithCQ = PlanCacheCommand::canonicalize(txn, ns, cmdObj);
        if (!statusWithCQ.isOK()) {
            return statusWithCQ.getStatus();
        }

        unique_ptr<CanonicalQuery> cq = std::move(statusWithCQ.getValue());
        querySettings->removeAllowedIndices(planCache->computeKey(*cq));

        // Remove entry from plan cache
        planCache->remove(*cq);

        LOG(0) << "Removed index filter on " << ns << " " << cq->toStringShort();

        return Status::OK();
    }

    // If query is not provided, make sure sort and projection are not in arguments.
    // We do not want to clear the entire cache inadvertently when the user
    // forgot to provide a value for "query".
    if (cmdObj.hasField("sort") || cmdObj.hasField("projection")) {
        return Status(ErrorCodes::BadValue, "sort or projection provided without query");
    }

    // Get entries from query settings. We need to remove corresponding entries from the plan
    // cache shortly.
    OwnedPointerVector<AllowedIndexEntry> entries;
    entries.mutableVector() = querySettings->getAllAllowedIndices();

    // OK to proceed with clearing entire cache.
    querySettings->clearAllowedIndices();

    const NamespaceString nss(ns);
    const ExtensionsCallbackReal extensionsCallback(txn, &nss);

    // Remove corresponding entries from plan cache.
    // Admin hints affect the planning process directly. If there were
    // plans generated as a result of applying index filter, these need to be
    // invalidated. This allows the planner to re-populate the plan cache with
    // non-filtered indexed solutions next time the query is run.
    // Resolve plan cache key from (query, sort, projection) in query settings entry.
    // Concurrency note: There's no harm in removing plan cache entries one at at time.
    // Only way that PlanCache::remove() can fail is when the query shape has been removed from
    // the cache by some other means (re-index, collection info reset, ...). This is OK since
    // that's the intended effect of calling the remove() function with the key from the hint entry.
    for (vector<AllowedIndexEntry*>::const_iterator i = entries.begin(); i != entries.end(); ++i) {
        AllowedIndexEntry* entry = *i;
        invariant(entry);

        // Create canonical query.
        auto qr = stdx::make_unique<QueryRequest>(nss);
        qr->setFilter(entry->query);
        qr->setSort(entry->sort);
        qr->setProj(entry->projection);
        auto statusWithCQ = CanonicalQuery::canonicalize(txn, std::move(qr), extensionsCallback);
        invariantOK(statusWithCQ.getStatus());
        std::unique_ptr<CanonicalQuery> cq = std::move(statusWithCQ.getValue());

        // Remove plan cache entry.
        planCache->remove(*cq);
    }

    LOG(0) << "Removed all index filters for collection: " << ns;

    return Status::OK();
}
Example #25
0
Status BatchWriteOp::targetBatch(OperationContext* txn,
                                 const NSTargeter& targeter,
                                 bool recordTargetErrors,
                                 vector<TargetedWriteBatch*>* targetedBatches) {
    //
    // Targeting of unordered batches is fairly simple - each remaining write op is targeted,
    // and each of those targeted writes are grouped into a batch for a particular shard
    // endpoint.
    //
    // Targeting of ordered batches is a bit more complex - to respect the ordering of the
    // batch, we can only send:
    // A) a single targeted batch to one shard endpoint
    // B) multiple targeted batches, but only containing targeted writes for a single write op
    //
    // This means that any multi-shard write operation must be targeted and sent one-by-one.
    // Subsequent single-shard write operations can be batched together if they go to the same
    // place.
    //
    // Ex: ShardA : { skey : a->k }, ShardB : { skey : k->z }
    //
    // Ordered insert batch of: [{ skey : a }, { skey : b }, { skey : x }]
    // broken into:
    //  [{ skey : a }, { skey : b }],
    //  [{ skey : x }]
    //
    // Ordered update Batch of :
    //  [{ skey : a }{ $push },
    //   { skey : b }{ $push },
    //   { skey : [c, x] }{ $push },
    //   { skey : y }{ $push },
    //   { skey : z }{ $push }]
    // broken into:
    //  [{ skey : a }, { skey : b }],
    //  [{ skey : [c,x] }],
    //  [{ skey : y }, { skey : z }]
    //

    const bool ordered = _clientRequest->getOrdered();

    TargetedBatchMap batchMap;
    TargetedBatchSizeMap batchSizes;

    int numTargetErrors = 0;

    size_t numWriteOps = _clientRequest->sizeWriteOps();
    for (size_t i = 0; i < numWriteOps; ++i) {
        WriteOp& writeOp = _writeOps[i];

        // Only target _Ready ops
        if (writeOp.getWriteState() != WriteOpState_Ready)
            continue;

        //
        // Get TargetedWrites from the targeter for the write operation
        //

        // TargetedWrites need to be owned once returned
        OwnedPointerVector<TargetedWrite> writesOwned;
        vector<TargetedWrite*>& writes = writesOwned.mutableVector();

        Status targetStatus = writeOp.targetWrites(txn, targeter, &writes);

        if (!targetStatus.isOK()) {
            WriteErrorDetail targetError;
            buildTargetError(targetStatus, &targetError);

            if (!recordTargetErrors) {
                // Cancel current batch state with an error

                cancelBatches(targetError, _writeOps, &batchMap);
                dassert(batchMap.empty());
                return targetStatus;
            } else if (!ordered || batchMap.empty()) {
                // Record an error for this batch

                writeOp.setOpError(targetError);
                ++numTargetErrors;

                if (ordered)
                    return Status::OK();

                continue;
            } else {
                dassert(ordered && !batchMap.empty());

                // Send out what we have, but don't record an error yet, since there may be an
                // error in the writes before this point.

                writeOp.cancelWrites(&targetError);
                break;
            }
        }

        //
        // If ordered and we have a previous endpoint, make sure we don't need to send these
        // targeted writes to any other endpoints.
        //

        if (ordered && !batchMap.empty()) {
            dassert(batchMap.size() == 1u);
            if (isNewBatchRequired(writes, batchMap)) {
                writeOp.cancelWrites(NULL);
                break;
            }
        }

        //
        // If this write will push us over some sort of size limit, stop targeting
        //

        int writeSizeBytes = getWriteSizeBytes(writeOp);
        if (wouldMakeBatchesTooBig(writes, writeSizeBytes, batchSizes)) {
            invariant(!batchMap.empty());
            writeOp.cancelWrites(NULL);
            break;
        }

        //
        // Targeting went ok, add to appropriate TargetedBatch
        //

        for (vector<TargetedWrite*>::iterator it = writes.begin(); it != writes.end(); ++it) {
            TargetedWrite* write = *it;

            TargetedBatchMap::iterator batchIt = batchMap.find(&write->endpoint);
            TargetedBatchSizeMap::iterator batchSizeIt = batchSizes.find(&write->endpoint);

            if (batchIt == batchMap.end()) {
                TargetedWriteBatch* newBatch = new TargetedWriteBatch(write->endpoint);
                batchIt = batchMap.insert(make_pair(&newBatch->getEndpoint(), newBatch)).first;
                batchSizeIt =
                    batchSizes.insert(make_pair(&newBatch->getEndpoint(), BatchSize())).first;
            }

            TargetedWriteBatch* batch = batchIt->second;
            BatchSize& batchSize = batchSizeIt->second;

            ++batchSize.numOps;
            batchSize.sizeBytes += writeSizeBytes;
            batch->addWrite(write);
        }

        // Relinquish ownership of TargetedWrites, now the TargetedBatches own them
        writesOwned.mutableVector().clear();

        //
        // Break if we're ordered and we have more than one endpoint - later writes cannot be
        // enforced as ordered across multiple shard endpoints.
        //

        if (ordered && batchMap.size() > 1u)
            break;
    }

    //
    // Send back our targeted batches
    //

    for (TargetedBatchMap::iterator it = batchMap.begin(); it != batchMap.end(); ++it) {
        TargetedWriteBatch* batch = it->second;

        if (batch->getWrites().empty())
            continue;

        // Remember targeted batch for reporting
        _targeted.insert(batch);
        // Send the handle back to caller
        targetedBatches->push_back(batch);
    }

    return Status::OK();
}
Example #26
0
// static
Status ClearFilters::clear(QuerySettings* querySettings, PlanCache* planCache,
                           const std::string& ns, const BSONObj& cmdObj) {
    invariant(querySettings);

    // According to the specification, the planCacheClearFilters command runs in two modes:
    // - clear all hints; or
    // - clear hints for single query shape when a query shape is described in the
    //   command arguments.
    if (cmdObj.hasField("query")) {
        CanonicalQuery* cqRaw;
        Status status = PlanCacheCommand::canonicalize(ns, cmdObj, &cqRaw);
        if (!status.isOK()) {
            return status;
        }

        scoped_ptr<CanonicalQuery> cq(cqRaw);
        querySettings->removeAllowedIndices(*cq);

        // Remove entry from plan cache
        planCache->remove(*cq);
        return Status::OK();
    }

    // If query is not provided, make sure sort and projection are not in arguments.
    // We do not want to clear the entire cache inadvertently when the user
    // forgot to provide a value for "query".
    if (cmdObj.hasField("sort") || cmdObj.hasField("projection")) {
        return Status(ErrorCodes::BadValue, "sort or projection provided without query");
    }

    // Get entries from query settings. We need to remove corresponding entries from the plan
    // cache shortly.
    OwnedPointerVector<AllowedIndexEntry> entries;
    entries.mutableVector() = querySettings->getAllAllowedIndices();

    // OK to proceed with clearing entire cache.
    querySettings->clearAllowedIndices();

    const NamespaceString nss(ns);
    const WhereCallbackReal whereCallback(nss.db());

    // Remove corresponding entries from plan cache.
    // Admin hints affect the planning process directly. If there were
    // plans generated as a result of applying index filter, these need to be
    // invalidated. This allows the planner to re-populate the plan cache with
    // non-filtered indexed solutions next time the query is run.
    // Resolve plan cache key from (query, sort, projection) in query settings entry.
    // Concurrency note: There's no harm in removing plan cache entries one at at time.
    // Only way that PlanCache::remove() can fail is when the query shape has been removed from
    // the cache by some other means (re-index, collection info reset, ...). This is OK since
    // that's the intended effect of calling the remove() function with the key from the hint entry.
    for (vector<AllowedIndexEntry*>::const_iterator i = entries.begin();
            i != entries.end(); ++i) {
        AllowedIndexEntry* entry = *i;
        invariant(entry);

        // Create canonical query.
        CanonicalQuery* cqRaw;
        Status result = CanonicalQuery::canonicalize(
                            ns, entry->query, entry->sort, entry->projection, &cqRaw, whereCallback);
        invariant(result.isOK());
        scoped_ptr<CanonicalQuery> cq(cqRaw);

        // Remove plan cache entry.
        planCache->remove(*cq);
    }

    return Status::OK();
}
Example #27
0
    void ReplSetImpl::loadConfig(OperationContext* txn) {
        startupStatus = LOADINGCONFIG;
        startupStatusMsg.set("loading " + rsConfigNs + " config (LOADINGCONFIG)");
        LOG(1) << "loadConfig() " << rsConfigNs << endl;

        while (1) {
            try {
                OwnedPointerVector<ReplSetConfig> configs;
                try {
                    configs.mutableVector().push_back(ReplSetConfig::makeDirect(txn));
                }
                catch (DBException& e) {
                    log() << "replSet exception loading our local replset configuration object : "
                          << e.toString() << rsLog;
                }
                for (vector<HostAndPort>::const_iterator i = _seeds->begin();
                        i != _seeds->end();
                        i++) {
                    try {
                        configs.mutableVector().push_back(ReplSetConfig::make(txn, *i));
                    }
                    catch (DBException& e) {
                        log() << "replSet exception trying to load config from " << *i
                              << " : " << e.toString() << rsLog;
                    }
                }
                ReplSettings& replSettings = getGlobalReplicationCoordinator()->getSettings();
                {
                    scoped_lock lck(replSettings.discoveredSeeds_mx);
                    if (replSettings.discoveredSeeds.size() > 0) {
                        for (set<string>::iterator i = replSettings.discoveredSeeds.begin(); 
                             i != replSettings.discoveredSeeds.end(); 
                             i++) {
                            try {
                                configs.mutableVector().push_back(
                                                            ReplSetConfig::make(txn, HostAndPort(*i)));
                            }
                            catch (DBException&) {
                                LOG(1) << "replSet exception trying to load config from discovered "
                                          "seed " << *i << rsLog;
                                replSettings.discoveredSeeds.erase(*i);
                            }
                        }
                    }
                }

                if (!replSettings.reconfig.isEmpty()) {
                    try {
                        configs.mutableVector().push_back(ReplSetConfig::make(txn, replSettings.reconfig,
                                                                       true));
                    }
                    catch (DBException& re) {
                        log() << "replSet couldn't load reconfig: " << re.what() << rsLog;
                        replSettings.reconfig = BSONObj();
                    }
                }

                int nok = 0;
                int nempty = 0;
                for (vector<ReplSetConfig*>::iterator i = configs.mutableVector().begin();
                     i != configs.mutableVector().end(); i++) {
                    if ((*i)->ok())
                        nok++;
                    if ((*i)->empty())
                        nempty++;
                }
                if (nok == 0) {

                    if (nempty == (int) configs.mutableVector().size()) {
                        startupStatus = EMPTYCONFIG;
                        startupStatusMsg.set("can't get " + rsConfigNs +
                                             " config from self or any seed (EMPTYCONFIG)");
                        log() << "replSet can't get " << rsConfigNs
                              << " config from self or any seed (EMPTYCONFIG)" << rsLog;
                        static unsigned once;
                        if (++once == 1) {
                            log() << "replSet info you may need to run replSetInitiate -- rs.initia"
                                     "te() in the shell -- if that is not already done" << rsLog;
                        }
                        if (_seeds->size() == 0) {
                            LOG(1) << "replSet info no seed hosts were specified on the --replSet "
                                      "command line" << rsLog;
                        }
                    }
                    else {
                        startupStatus = EMPTYUNREACHABLE;
                        startupStatusMsg.set("can't currently get " + rsConfigNs +
                                             " config from self or any seed (EMPTYUNREACHABLE)");
                        log() << "replSet can't get " << rsConfigNs
                              << " config from self or any seed (yet)" << rsLog;
                    }

                    sleepsecs(1);
                    continue;
                }

                if (!_loadConfigFinish(txn, configs.mutableVector())) {
                    log() << "replSet info Couldn't load config yet. Sleeping 3 sec and will try "
                             "again." << rsLog;
                    sleepsecs(3);
                    continue;
                }
            }
            catch (DBException& e) {
                startupStatus = BADCONFIG;
                startupStatusMsg.set("replSet error loading set config (BADCONFIG)");
                log() << "replSet error loading configurations " << e.toString() << rsLog;
                log() << "replSet error replication will not start" << rsLog;
                sethbmsg("error loading set config");
                fassertFailedNoTrace(18754);
                throw;
            }
            break;
        }
        startupStatusMsg.set("? started");
        startupStatus = STARTED;
    }
Example #28
0
    StatusWith<DiskLoc> Collection::updateDocument( const DiskLoc& oldLocation,
                                                    const BSONObj& objNew,
                                                    bool enforceQuota,
                                                    OpDebug* debug ) {

        Record* oldRecord = getExtentManager()->recordFor( oldLocation );
        BSONObj objOld = BSONObj::make( oldRecord );

        if ( objOld.hasElement( "_id" ) ) {
            BSONElement oldId = objOld["_id"];
            BSONElement newId = objNew["_id"];
            if ( oldId != newId )
                return StatusWith<DiskLoc>( ErrorCodes::InternalError,
                                            "in Collection::updateDocument _id mismatch",
                                            13596 );
        }

        if ( ns().coll() == "system.users" ) {
            // XXX - andy and spencer think this should go away now
            V2UserDocumentParser parser;
            Status s = parser.checkValidUserDocument(objNew);
            if ( !s.isOK() )
                return StatusWith<DiskLoc>( s );
        }

        /* duplicate key check. we descend the btree twice - once for this check, and once for the actual inserts, further
           below.  that is suboptimal, but it's pretty complicated to do it the other way without rollbacks...
        */
        OwnedPointerVector<UpdateTicket> updateTickets;
        updateTickets.mutableVector().resize(_indexCatalog.numIndexesTotal());
        for (int i = 0; i < _indexCatalog.numIndexesTotal(); ++i) {
            IndexDescriptor* descriptor = _indexCatalog.getDescriptor( i );
            IndexAccessMethod* iam = _indexCatalog.getIndex( descriptor );

            InsertDeleteOptions options;
            options.logIfError = false;
            options.dupsAllowed =
                !(KeyPattern::isIdKeyPattern(descriptor->keyPattern()) || descriptor->unique())
                || ignoreUniqueIndex(descriptor);
            updateTickets.mutableVector()[i] = new UpdateTicket();
            Status ret = iam->validateUpdate(objOld, objNew, oldLocation, options,
                                             updateTickets.mutableVector()[i]);
            if ( !ret.isOK() ) {
                return StatusWith<DiskLoc>( ret );
            }
        }

        if ( oldRecord->netLength() < objNew.objsize() ) {
            // doesn't fit, have to move to new location

            if ( _details->isCapped() )
                return StatusWith<DiskLoc>( ErrorCodes::InternalError,
                                            "failing update: objects in a capped ns cannot grow",
                                            10003 );

            moveCounter.increment();
            _details->paddingTooSmall();

            // unindex old record, don't delete
            // this way, if inserting new doc fails, we can re-index this one
            ClientCursor::aboutToDelete(_ns.ns(), _details, oldLocation);
            _indexCatalog.unindexRecord( objOld, oldLocation, true );

            if ( debug ) {
                if (debug->nmoved == -1) // default of -1 rather than 0
                    debug->nmoved = 1;
                else
                    debug->nmoved += 1;
            }

            StatusWith<DiskLoc> loc = insertDocument( objNew, enforceQuota );

            if ( loc.isOK() ) {
                // insert successful, now lets deallocate the old location
                // remember its already unindexed
                _recordStore.deallocRecord( oldLocation, oldRecord );
            }
            else {
                // new doc insert failed, so lets re-index the old document and location
                _indexCatalog.indexRecord( objOld, oldLocation );
            }

            return loc;
        }

        _infoCache.notifyOfWriteOp();
        _details->paddingFits();

        if ( debug )
            debug->keyUpdates = 0;

        for (int i = 0; i < _indexCatalog.numIndexesTotal(); ++i) {
            IndexDescriptor* descriptor = _indexCatalog.getDescriptor( i );
            IndexAccessMethod* iam = _indexCatalog.getIndex( descriptor );

            int64_t updatedKeys;
            Status ret = iam->update(*updateTickets.vector()[i], &updatedKeys);
            if ( !ret.isOK() )
                return StatusWith<DiskLoc>( ret );
            if ( debug )
                debug->keyUpdates += updatedKeys;
        }

        //  update in place
        int sz = objNew.objsize();
        memcpy(getDur().writingPtr(oldRecord->data(), sz), objNew.objdata(), sz);
        return StatusWith<DiskLoc>( oldLocation );
    }
Example #29
0
    static Status parseGeoJSONPolygonCoordinates(const BSONElement& elem, S2Polygon *out) {
        if (Array != elem.type()) { return BAD_VALUE("Polygon coordinates must be an array"); }

        OwnedPointerVector<S2Loop> loops;
        Status status = Status::OK();
        string err;

        BSONObjIterator it(elem.Obj());
        // Iterate all loops of the polygon.
        while (it.more()) {
            // Parse the array of vertices of a loop.
            BSONElement coordinateElt = it.next();
            vector<S2Point> points;
            status = parseArrayOfCoodinates(coordinateElt, &points);
            if (!status.isOK()) return status;

            // Check if the loop is closed.
            status = isLoopClosed(points, coordinateElt);
            if (!status.isOK()) return status;

            eraseDuplicatePoints(&points);
            // Drop the duplicated last point.
            points.resize(points.size() - 1);

            S2Loop* loop = new S2Loop(points);
            loops.push_back(loop);

            // Check whether this loop is valid.
            // 1. At least 3 vertices.
            // 2. All vertices must be unit length. Guaranteed by parsePoints().
            // 3. Loops are not allowed to have any duplicate vertices.
            // 4. Non-adjacent edges are not allowed to intersect.
            if (!loop->IsValid(&err)) {
                return BAD_VALUE("Loop is not valid: " << coordinateElt.toString(false) << " "
                                 << err);
            }
            // If the loop is more than one hemisphere, invert it.
            loop->Normalize();

            // Check the first loop must be the exterior ring and any others must be
            // interior rings or holes.
            if (loops.size() > 1 && !loops[0]->Contains(loop)) {
                return BAD_VALUE("Secondary loops not contained by first exterior loop - "
                    "secondary loops must be holes: " << coordinateElt.toString(false)
                    << " first loop: " << elem.Obj().firstElement().toString(false));
            }
        }

        // Check if the given loops form a valid polygon.
        // 1. If a loop contains an edge AB, then no other loop may contain AB or BA.
        // 2. No loop covers more than half of the sphere.
        // 3. No two loops cross.
        if (!S2Polygon::IsValid(loops.vector(), &err))
            return BAD_VALUE("Polygon isn't valid: " << err << " " << elem.toString(false));

        // Given all loops are valid / normalized and S2Polygon::IsValid() above returns true.
        // The polygon must be valid. See S2Polygon member function IsValid().

        // Transfer ownership of the loops and clears loop vector.
        out->Init(&loops.mutableVector());

        // Check if every loop of this polygon shares at most one vertex with
        // its parent loop.
        if (!out->IsNormalized(&err))
            // "err" looks like "Loop 1 shares more than one vertex with its parent loop 0"
            return BAD_VALUE(err << ": " << elem.toString(false));

        // S2Polygon contains more than one ring, which is allowed by S2, but not by GeoJSON.
        //
        // Loops are indexed according to a preorder traversal of the nesting hierarchy.
        // GetLastDescendant() returns the index of the last loop that is contained within
        // a given loop. We guarantee that the first loop is the exterior ring.
        if (out->GetLastDescendant(0) < out->num_loops() - 1) {
            return BAD_VALUE("Only one exterior polygon loop is allowed: " << elem.toString(false));
        }

        // In GeoJSON, only one nesting is allowed.
        // The depth of a loop is set by polygon according to the nesting hierarchy of polygon,
        // so the exterior ring's depth is 0, a hole in it is 1, etc.
        for (int i = 0; i < out->num_loops(); i++) {
            if (out->loop(i)->depth() > 1) {
                return BAD_VALUE("Polygon interior loops cannot be nested: "<< elem.toString(false));
            }
        }
        return Status::OK();
    }
Example #30
0
    PlanStage::StageState TextStage::fillOutResults() {
        Database* db = cc().database();
        Collection* collection = db->getCollection( _params.ns );
        if (NULL == collection) {
            warning() << "TextStage params namespace error";
            return PlanStage::FAILURE;
        }
        vector<IndexDescriptor*> idxMatches;
        collection->getIndexCatalog()->findIndexByType("text", idxMatches);
        if (1 != idxMatches.size()) {
            warning() << "Expected exactly one text index";
            return PlanStage::FAILURE;
        }

        // Get all the index scans for each term in our query.
        OwnedPointerVector<PlanStage> scanners;
        for (size_t i = 0; i < _params.query.getTerms().size(); i++) {
            const string& term = _params.query.getTerms()[i];
            IndexScanParams params;
            params.bounds.startKey = FTSIndexFormat::getIndexKey(MAX_WEIGHT, term,
                                                                 _params.indexPrefix);
            params.bounds.endKey = FTSIndexFormat::getIndexKey(0, term, _params.indexPrefix);
            params.bounds.endKeyInclusive = true;
            params.bounds.isSimpleRange = true;
            params.descriptor = idxMatches[0];
            params.direction = -1;
            IndexScan* ixscan = new IndexScan(params, _ws, NULL);
            scanners.mutableVector().push_back(ixscan);
        }

        // Map: diskloc -> aggregate score for doc.
        typedef unordered_map<DiskLoc, double, DiskLoc::Hasher> ScoreMap;
        ScoreMap scores;

        // For each index scan, read all results and store scores.
        size_t currentIndexScanner = 0;
        while (currentIndexScanner < scanners.size()) {
            BSONObj keyObj;
            DiskLoc loc;

            WorkingSetID id;
            PlanStage::StageState state = scanners.vector()[currentIndexScanner]->work(&id);

            if (PlanStage::ADVANCED == state) {
                WorkingSetMember* wsm = _ws->get(id);
                IndexKeyDatum& keyDatum = wsm->keyData.back();
                filterAndScore(keyDatum.keyData, wsm->loc, &scores[wsm->loc]);
                _ws->free(id);
            }
            else if (PlanStage::IS_EOF == state) {
                // Done with this scan.
                ++currentIndexScanner;
            }
            else if (PlanStage::NEED_FETCH == state) {
                // We're calling work() on ixscans and they have no way to return a fetch.
                verify(false);
            }
            else if (PlanStage::NEED_TIME == state) {
                // We are a blocking stage, so ignore scanner's request for more time.
            }
            else {
                verify(PlanStage::FAILURE == state);
                warning() << "error from index scan during text stage: invalid FAILURE state";
                return PlanStage::FAILURE;
            }
        }

        // Filter for phrases and negative terms, score and truncate.
        for (ScoreMap::iterator i = scores.begin(); i != scores.end(); ++i) {
            DiskLoc loc = i->first;
            double score = i->second;

            // Ignore non-matched documents.
            if (score < 0) {
                continue;
            }

            // Filter for phrases and negated terms
            if (_params.query.hasNonTermPieces()) {
                if (!_ftsMatcher.matchesNonTerm(loc.obj())) {
                    continue;
                }
            }

            // Add results to working set as LOC_AND_UNOWNED_OBJ initially.
            // On invalidation, we copy the object and change the state to
            // OWNED_OBJ.
            // Fill out a WSM.
            WorkingSetID id = _ws->allocate();
            WorkingSetMember* member = _ws->get(id);
            member->loc = loc;
            member->obj = member->loc.obj();
            member->state = WorkingSetMember::LOC_AND_UNOWNED_OBJ;
            member->addComputed(new TextScoreComputedData(score));

            _results.push_back(id);
            _wsidByDiskLoc[member->loc] = id;
        }

        _filledOutResults = true;

        if (_results.size() == 0) {
            return PlanStage::IS_EOF;
        }
        return PlanStage::NEED_TIME;
    }