Example #1
0
bool BigSimplePolygon::Contains(const S2Polyline& line) const {
    //
    // A line is contained within a loop if the result of subtracting the loop from the line is
    // nothing.
    //
    // Also, a line is contained within a loop if the result of clipping the line to the
    // complement of the loop is nothing.
    //
    // If we can't subtract the loop itself using S2, we clip (intersect) to the inverse.  Every
    // point in S2 is contained in exactly one of these loops.
    //
    // TODO: Polygon borders are actually kind of weird, and this is somewhat inconsistent with
    // Intersects().  A point might Intersect() a boundary exactly, but not be Contain()ed
    // within the Polygon.  Think the right thing to do here is custom intersection functions.
    //
    const S2Polygon& polyBorder = GetPolygonBorder();

    OwnedPointerVector<S2Polyline> clippedOwned;
    vector<S2Polyline*>& clipped = clippedOwned.mutableVector();

    if (_isNormalized) {
        // Polygon border is the same as the loop
        polyBorder.SubtractFromPolyline(&line, &clipped);
        return clipped.size() == 0;
    } else {
        // Polygon border is the complement of the loop
        polyBorder.IntersectWithPolyline(&line, &clipped);
        return clipped.size() == 0;
    }
}
Example #2
0
    PlanCacheEntry* PlanCacheEntry::clone() const {
        OwnedPointerVector<QuerySolution> solutions;
        for (size_t i = 0; i < plannerData.size(); ++i) {
            QuerySolution* qs = new QuerySolution();
            qs->cacheData.reset(plannerData[i]->clone());
            solutions.mutableVector().push_back(qs);
        }
        PlanCacheEntry* entry = new PlanCacheEntry(solutions.vector(), decision->clone());

        entry->backupSoln = backupSoln;

        // Copy query shape.
        entry->query = query.getOwned();
        entry->sort = sort.getOwned();
        entry->projection = projection.getOwned();

        // Copy performance stats.
        for (size_t i = 0; i < feedback.size(); ++i) {
            PlanCacheEntryFeedback* fb = new PlanCacheEntryFeedback();
            fb->stats.reset(feedback[i]->stats->clone());
            fb->score = feedback[i]->score;
            entry->feedback.push_back(fb);
        }
        entry->averageScore = averageScore;
        entry->stddevScore = stddevScore;
        return entry;
    }
Example #3
0
/**
 * Currently the allowable shard keys are either
 * i) a hashed single field, e.g. { a : "hashed" }, or
 * ii) a compound list of ascending, potentially-nested field paths, e.g. { a : 1 , b.c : 1 }
 */
static vector<FieldRef*> parseShardKeyPattern(const BSONObj& keyPattern) {
    OwnedPointerVector<FieldRef> parsedPaths;
    static const vector<FieldRef*> empty;

    BSONObjIterator patternIt(keyPattern);
    while (patternIt.more()) {
        BSONElement patternEl = patternIt.next();
        parsedPaths.push_back(new FieldRef(patternEl.fieldNameStringData()));
        const FieldRef& patternPath = *parsedPaths.back();

        // Empty path
        if (patternPath.numParts() == 0)
            return empty;

        // Extra "." in path?
        if (patternPath.dottedField() != patternEl.fieldNameStringData())
            return empty;

        // Empty parts of the path, ".."?
        for (size_t i = 0; i < patternPath.numParts(); ++i) {
            if (patternPath.getPart(i).size() == 0)
                return empty;
        }

        // Numeric and ascending (1.0), or "hashed" and single field
        if (!patternEl.isNumber()) {
            if (keyPattern.nFields() != 1 || !isHashedPatternEl(patternEl))
                return empty;
        } else if (patternEl.numberInt() != 1) {
            return empty;
        }
    }

    return parsedPaths.release();
}
    vector<RecordIterator*> SimpleRecordStoreV1::getManyIterators( OperationContext* txn ) const {
        OwnedPointerVector<RecordIterator> iterators;
        const Extent* ext;
        for (DiskLoc extLoc = details()->firstExtent(txn); !extLoc.isNull(); extLoc = ext->xnext) {
            ext = _getExtent(txn, extLoc);
            if (ext->firstRecord.isNull())
                continue;
            iterators.push_back(
                new RecordStoreV1Base::IntraExtentIterator(txn, ext->firstRecord, this));
        }

        return iterators.release();
    }
Example #5
0
    vector<PlanStageStats*> MultiPlanStage::generateCandidateStats() {
        OwnedPointerVector<PlanStageStats> candidateStats;

        for (size_t ix = 0; ix < _candidates.size(); ix++) {
            if (ix == (size_t)_bestPlanIdx) { continue; }
            if (ix == (size_t)_backupPlanIdx) { continue; }

            PlanStageStats* stats = _candidates[ix].root->getStats();
            candidateStats.push_back(stats);
        }

        return candidateStats.release();
    }
Example #6
0
const S2Polygon& BigSimplePolygon::GetPolygonBorder() const {
    if (_borderPoly)
        return *_borderPoly;

    unique_ptr<S2Loop> cloned(_loop->Clone());

    // Any loop in polygon should be than a hemisphere (2*Pi).
    cloned->Normalize();

    OwnedPointerVector<S2Loop> loops;
    loops.mutableVector().push_back(cloned.release());
    _borderPoly.reset(new S2Polygon(&loops.mutableVector()));
    return *_borderPoly;
}
Example #7
0
IndexBounds ChunkManager::getIndexBoundsForQuery(const BSONObj& key,
                                                 const CanonicalQuery& canonicalQuery) {
    // $text is not allowed in planning since we don't have text index on mongos.
    //
    // TODO: Treat $text query as a no-op in planning. So with shard key {a: 1},
    //       the query { a: 2, $text: { ... } } will only target to {a: 2}.
    if (QueryPlannerCommon::hasNode(canonicalQuery.root(), MatchExpression::TEXT)) {
        IndexBounds bounds;
        IndexBoundsBuilder::allValuesBounds(key, &bounds);  // [minKey, maxKey]
        return bounds;
    }

    // Consider shard key as an index
    string accessMethod = IndexNames::findPluginName(key);
    dassert(accessMethod == IndexNames::BTREE || accessMethod == IndexNames::HASHED);

    // Use query framework to generate index bounds
    QueryPlannerParams plannerParams;
    // Must use "shard key" index
    plannerParams.options = QueryPlannerParams::NO_TABLE_SCAN;
    IndexEntry indexEntry(key,
                          accessMethod,
                          false /* multiKey */,
                          false /* sparse */,
                          false /* unique */,
                          "shardkey",
                          NULL /* filterExpr */,
                          BSONObj());
    plannerParams.indices.push_back(indexEntry);

    OwnedPointerVector<QuerySolution> solutions;
    Status status = QueryPlanner::plan(canonicalQuery, plannerParams, &solutions.mutableVector());
    uassert(status.code(), status.reason(), status.isOK());

    IndexBounds bounds;

    for (vector<QuerySolution*>::const_iterator it = solutions.begin();
         bounds.size() == 0 && it != solutions.end();
         it++) {
        // Try next solution if we failed to generate index bounds, i.e. bounds.size() == 0
        bounds = collapseQuerySolution((*it)->root.get());
    }

    if (bounds.size() == 0) {
        // We cannot plan the query without collection scan, so target to all shards.
        IndexBoundsBuilder::allValuesBounds(key, &bounds);  // [minKey, maxKey]
    }
    return bounds;
}
Example #8
0
void Strategy::writeOp(OperationContext* txn, int op, Request& request) {
    // make sure we have a last error
    dassert(&LastError::get(cc()));

    OwnedPointerVector<BatchedCommandRequest> commandRequestsOwned;
    vector<BatchedCommandRequest*>& commandRequests = commandRequestsOwned.mutableVector();

    msgToBatchRequests(request.m(), &commandRequests);

    for (vector<BatchedCommandRequest*>::iterator it = commandRequests.begin();
         it != commandRequests.end();
         ++it) {
        // Multiple commands registered to last error as multiple requests
        if (it != commandRequests.begin())
            LastError::get(cc()).startRequest();

        BatchedCommandRequest* commandRequest = *it;

        // Adjust namespaces for command
        NamespaceString fullNS(commandRequest->getNS());
        string cmdNS = fullNS.getCommandNS();
        // We only pass in collection name to command
        commandRequest->setNS(fullNS);

        BSONObjBuilder builder;
        BSONObj requestBSON = commandRequest->toBSON();

        {
            // Disable the last error object for the duration of the write cmd
            LastError::Disabled disableLastError(&LastError::get(cc()));
            Command::runAgainstRegistered(txn, cmdNS.c_str(), requestBSON, builder, 0);
        }

        BatchedCommandResponse commandResponse;
        bool parsed = commandResponse.parseBSON(builder.done(), NULL);
        (void)parsed;  // for compile
        dassert(parsed && commandResponse.isValid(NULL));

        // Populate the lastError object based on the write response
        LastError::get(cc()).reset();
        bool hadError =
            batchErrorToLastError(*commandRequest, commandResponse, &LastError::get(cc()));

        // Check if this is an ordered batch and we had an error which should stop processing
        if (commandRequest->getOrdered() && hadError)
            break;
    }
}
Example #9
0
    BSONObj buildMergeLogEntry( const OwnedPointerVector<ChunkType>& chunksToMerge,
                                const ChunkVersion& currShardVersion,
                                const ChunkVersion& newMergedVersion ) {

        BSONObjBuilder logDetailB;

        BSONArrayBuilder mergedB( logDetailB.subarrayStart( "merged" ) );

        for ( OwnedPointerVector<ChunkType>::const_iterator it = chunksToMerge.begin();
                it != chunksToMerge.end(); ++it ) {
            ChunkType* chunkToMerge = *it;
            mergedB.append( chunkToMerge->toBSON() );
        }

        mergedB.done();

        currShardVersion.addToBSON( logDetailB, "prevShardVersion" );
        newMergedVersion.addToBSON( logDetailB, "mergedVersion" );

        return logDetailB.obj();
    }
    BSONObj generateSection(OperationContext* txn, const BSONElement& configElement) const {
        RangeDeleter* deleter = getDeleter();
        if (!deleter) {
            return BSONObj();
        }

        BSONObjBuilder result;

        OwnedPointerVector<DeleteJobStats> statsList;
        deleter->getStatsHistory(&statsList.mutableVector());
        BSONArrayBuilder oldStatsBuilder;
        for (OwnedPointerVector<DeleteJobStats>::const_iterator it = statsList.begin();
             it != statsList.end();
             ++it) {
            BSONObjBuilder entryBuilder;
            entryBuilder.append("deletedDocs", (*it)->deletedDocCount);

            if ((*it)->queueEndTS > Date_t()) {
                entryBuilder.append("queueStart", (*it)->queueStartTS);
                entryBuilder.append("queueEnd", (*it)->queueEndTS);
            }

            if ((*it)->deleteEndTS > Date_t()) {
                entryBuilder.append("deleteStart", (*it)->deleteStartTS);
                entryBuilder.append("deleteEnd", (*it)->deleteEndTS);

                if ((*it)->waitForReplEndTS > Date_t()) {
                    entryBuilder.append("waitForReplStart", (*it)->waitForReplStartTS);
                    entryBuilder.append("waitForReplEnd", (*it)->waitForReplEndTS);
                }
            }

            oldStatsBuilder.append(entryBuilder.obj());
        }
        result.append("lastDeleteStats", oldStatsBuilder.arr());

        return result.obj();
    }
Example #11
0
// static
Status ListFilters::list(const QuerySettings& querySettings, BSONObjBuilder* bob) {
    invariant(bob);

    // Format of BSON result:
    //
    // {
    //     hints: [
    //         {
    //             query: <query>,
    //             sort: <sort>,
    //             projection: <projection>,
    //             indexes: [<index1>, <index2>, <index3>, ...]
    //         }
    //  }
    BSONArrayBuilder hintsBuilder(bob->subarrayStart("filters"));
    OwnedPointerVector<AllowedIndexEntry> entries;
    entries.mutableVector() = querySettings.getAllAllowedIndices();
    for (vector<AllowedIndexEntry*>::const_iterator i = entries.begin();
            i != entries.end(); ++i) {
        AllowedIndexEntry* entry = *i;
        invariant(entry);

        BSONObjBuilder hintBob(hintsBuilder.subobjStart());
        hintBob.append("query", entry->query);
        hintBob.append("sort", entry->sort);
        hintBob.append("projection", entry->projection);
        BSONArrayBuilder indexesBuilder(hintBob.subarrayStart("indexes"));
        for (vector<BSONObj>::const_iterator j = entry->indexKeyPatterns.begin();
                j != entry->indexKeyPatterns.end(); ++j) {
            const BSONObj& index = *j;
            indexesBuilder.append(index);
        }
        indexesBuilder.doneFast();
    }
    hintsBuilder.doneFast();
    return Status::OK();
}
Example #12
0
/*static*/ int MongoFile::_flushAll(bool sync) {
    if (!sync) {
        int num = 0;
        LockMongoFilesShared lk;
        for (set<MongoFile*>::iterator i = mmfiles.begin(); i != mmfiles.end(); i++) {
            num++;
            MongoFile* mmf = *i;
            if (!mmf)
                continue;

            mmf->flush(sync);
        }
        return num;
    }

    // want to do it sync

    // get a thread-safe Flushable object for each file first in a single lock
    // so that we can iterate and flush without doing any locking here
    OwnedPointerVector<Flushable> thingsToFlushWrapper;
    vector<Flushable*>& thingsToFlush = thingsToFlushWrapper.mutableVector();
    {
        LockMongoFilesShared lk;
        for (set<MongoFile*>::iterator i = mmfiles.begin(); i != mmfiles.end(); i++) {
            MongoFile* mmf = *i;
            if (!mmf)
                continue;
            thingsToFlush.push_back(mmf->prepareFlush());
        }
    }

    for (size_t i = 0; i < thingsToFlush.size(); i++) {
        thingsToFlush[i]->flush();
    }

    return thingsToFlush.size();
}
Example #13
0
    BSONObj buildApplyOpsCmd( const OwnedPointerVector<ChunkType>& chunksToMerge,
                              const ChunkVersion& currShardVersion,
                              const ChunkVersion& newMergedVersion ) {

        BSONObjBuilder applyOpsCmdB;
        BSONArrayBuilder updatesB( applyOpsCmdB.subarrayStart( "applyOps" ) );

        // The chunk we'll be "expanding" is the first chunk
        const ChunkType* chunkToMerge = *chunksToMerge.begin();

        // Fill in details not tracked by metadata
        ChunkType mergedChunk;
        chunkToMerge->cloneTo( &mergedChunk );
        mergedChunk.setName( Chunk::genID( chunkToMerge->getNS(), chunkToMerge->getMin() ) );
        mergedChunk.setMax( ( *chunksToMerge.vector().rbegin() )->getMax() );
        mergedChunk.setVersion( newMergedVersion );

        updatesB.append( buildOpMergeChunk( mergedChunk ) );

        // Don't remove chunk we're expanding
        OwnedPointerVector<ChunkType>::const_iterator it = chunksToMerge.begin();
        for ( ++it; it != chunksToMerge.end(); ++it ) {
            ChunkType* chunkToMerge = *it;
            chunkToMerge->setName( Chunk::genID( chunkToMerge->getNS(), chunkToMerge->getMin() ) );
            updatesB.append( buildOpRemoveChunk( *chunkToMerge ) );
        }

        updatesB.done();

        applyOpsCmdB.append( "preCondition",
                             buildOpPrecond( chunkToMerge->getNS(),
                                             chunkToMerge->getShard(),
                                             currShardVersion ) );

        return applyOpsCmdB.obj();
    }
Example #14
0
    StatusWith<CompactStats> Collection::compact( const CompactOptions* compactOptions ) {

        if ( isCapped() )
            return StatusWith<CompactStats>( ErrorCodes::BadValue,
                                             "cannot compact capped collection" );

        if ( _indexCatalog.numIndexesInProgress() )
            return StatusWith<CompactStats>( ErrorCodes::BadValue,
                                             "cannot compact when indexes in progress" );

        NamespaceDetails* d = details();

        // this is a big job, so might as well make things tidy before we start just to be nice.
        getDur().commitIfNeeded();

        list<DiskLoc> extents;
        for( DiskLoc L = d->firstExtent(); !L.isNull(); L = L.ext()->xnext )
            extents.push_back(L);
        log() << "compact " << extents.size() << " extents" << endl;

        // same data, but might perform a little different after compact?
        _infoCache.reset();

        vector<BSONObj> indexSpecs;
        {
            IndexCatalog::IndexIterator ii( _indexCatalog.getIndexIterator( false ) );
            while ( ii.more() ) {
                IndexDescriptor* descriptor = ii.next();
                indexSpecs.push_back( _compactAdjustIndexSpec( descriptor->infoObj() ) );
            }
        }

        log() << "compact orphan deleted lists" << endl;
        d->orphanDeletedList();

        // Start over from scratch with our extent sizing and growth
        d->setLastExtentSize( 0 );

        // before dropping indexes, at least make sure we can allocate one extent!
        if ( allocateSpaceForANewRecord( _ns.ns().c_str(),
                                         d,
                                         Record::HeaderSize+1,
                                         false).isNull() ) {
            return StatusWith<CompactStats>( ErrorCodes::InternalError,
                                             "compact error no space available to allocate" );
        }

        // note that the drop indexes call also invalidates all clientcursors for the namespace,
        // which is important and wanted here
        log() << "compact dropping indexes" << endl;
        Status status = _indexCatalog.dropAllIndexes( true );
        if ( !status.isOK() ) {
            return StatusWith<CompactStats>( status );
        }

        getDur().commitIfNeeded();

        CompactStats stats;

        OwnedPointerVector<IndexCatalog::IndexBuildBlock> indexBuildBlocks;
        vector<IndexAccessMethod*> indexesToInsertTo;
        vector< std::pair<IndexAccessMethod*,IndexAccessMethod*> > bulkToCommit;
        for ( size_t i = 0; i < indexSpecs.size(); i++ ) {
            killCurrentOp.checkForInterrupt(false);
            BSONObj info = indexSpecs[i];
            info = _compactAdjustIndexSpec( info );
            info = _indexCatalog.fixIndexSpec( info );
            auto_ptr<IndexCatalog::IndexBuildBlock> block( new IndexCatalog::IndexBuildBlock( this,info ) );
            Status status = block->init();
            if ( !status.isOK() )
                return StatusWith<CompactStats>(status);

            IndexAccessMethod* accessMethod = block->getEntry()->accessMethod();
            status = accessMethod->initializeAsEmpty();
            if ( !status.isOK() )
                return StatusWith<CompactStats>(status);

            IndexAccessMethod* bulk = accessMethod->initiateBulk();
            if ( bulk ) {
                indexesToInsertTo.push_back( bulk );
                bulkToCommit.push_back( std::pair<IndexAccessMethod*,IndexAccessMethod*>( accessMethod, bulk ) );
            }
            else {
                indexesToInsertTo.push_back( accessMethod );
            }

            indexBuildBlocks.mutableVector().push_back( block.release() );
        }

        // reset data size and record counts to 0 for this namespace
        // as we're about to tally them up again for each new extent
        d->setStats( 0, 0 );

        ProgressMeterHolder pm(cc().curop()->setMessage("compact extent",
                                                        "Extent Compacting Progress",
                                                        extents.size()));

        int extentNumber = 0;
        for( list<DiskLoc>::iterator i = extents.begin(); i != extents.end(); i++ ) {
            _compactExtent(*i, extentNumber++, indexesToInsertTo, compactOptions, &stats );
            pm.hit();
        }

        verify( d->firstExtent().ext()->xprev.isNull() );

        // indexes will do their own progress meter?
        pm.finished();

        log() << "starting index commits";

        for ( size_t i = 0; i < bulkToCommit.size(); i++ ) {
            bulkToCommit[i].first->commitBulk( bulkToCommit[i].second, false, NULL );
        }

        for ( size_t i = 0; i < indexBuildBlocks.size(); i++ ) {
            IndexCatalog::IndexBuildBlock* block = indexBuildBlocks.mutableVector()[i];
            block->success();
        }

        return StatusWith<CompactStats>( stats );
    }
Example #15
0
    static StatusWith<double> computeGeoNearDistance(const GeoNearParams& nearParams,
                                                     WorkingSetMember* member) {

        //
        // Generic GeoNear distance computation
        // Distances are computed by projecting the stored geometry into the query CRS, and
        // computing distance in that CRS.
        //

        // Must have an object in order to get geometry out of it.
        invariant(member->hasObj());

        CRS queryCRS = nearParams.nearQuery.centroid.crs;

        // Extract all the geometries out of this document for the near query
        OwnedPointerVector<StoredGeometry> geometriesOwned;
        vector<StoredGeometry*>& geometries = geometriesOwned.mutableVector();
        extractGeometries(member->obj, nearParams.nearQuery.field, &geometries);

        // Compute the minimum distance of all the geometries in the document
        double minDistance = -1;
        BSONObj minDistanceObj;
        for (vector<StoredGeometry*>::iterator it = geometries.begin(); it != geometries.end();
            ++it) {

            StoredGeometry& stored = **it;

            // NOTE: A stored document with STRICT_SPHERE CRS is treated as a malformed document
            // and ignored. Since GeoNear requires an index, there's no stored STRICT_SPHERE shape.
            // So we don't check it here.

            // NOTE: For now, we're sure that if we get this far in the query we'll have an
            // appropriate index which validates the type of geometry we're pulling back here.
            // TODO: It may make sense to change our semantics and, by default, only return
            // shapes in the same CRS from $geoNear.
            if (!stored.geometry.supportsProject(queryCRS))
                continue;
            stored.geometry.projectInto(queryCRS);

            double nextDistance = stored.geometry.minDistance(nearParams.nearQuery.centroid);

            if (minDistance < 0 || nextDistance < minDistance) {
                minDistance = nextDistance;
                minDistanceObj = stored.element.Obj();
            }
        }

        if (minDistance < 0) {
            // No distance to report
            return StatusWith<double>(-1);
        }

        if (nearParams.addDistMeta) {
            if (nearParams.nearQuery.unitsAreRadians) {
                // Hack for nearSphere
                // TODO: Remove nearSphere?
                invariant(SPHERE == queryCRS);
                member->addComputed(new GeoDistanceComputedData(minDistance
                                                                / kRadiusOfEarthInMeters));
            }
            else {
                member->addComputed(new GeoDistanceComputedData(minDistance));
            }
        }

        if (nearParams.addPointMeta) {
            member->addComputed(new GeoNearPointComputedData(minDistanceObj));
        }

        return StatusWith<double>(minDistance);
    }
Example #16
0
Status BatchWriteOp::targetBatch(OperationContext* txn,
                                 const NSTargeter& targeter,
                                 bool recordTargetErrors,
                                 vector<TargetedWriteBatch*>* targetedBatches) {
    //
    // Targeting of unordered batches is fairly simple - each remaining write op is targeted,
    // and each of those targeted writes are grouped into a batch for a particular shard
    // endpoint.
    //
    // Targeting of ordered batches is a bit more complex - to respect the ordering of the
    // batch, we can only send:
    // A) a single targeted batch to one shard endpoint
    // B) multiple targeted batches, but only containing targeted writes for a single write op
    //
    // This means that any multi-shard write operation must be targeted and sent one-by-one.
    // Subsequent single-shard write operations can be batched together if they go to the same
    // place.
    //
    // Ex: ShardA : { skey : a->k }, ShardB : { skey : k->z }
    //
    // Ordered insert batch of: [{ skey : a }, { skey : b }, { skey : x }]
    // broken into:
    //  [{ skey : a }, { skey : b }],
    //  [{ skey : x }]
    //
    // Ordered update Batch of :
    //  [{ skey : a }{ $push },
    //   { skey : b }{ $push },
    //   { skey : [c, x] }{ $push },
    //   { skey : y }{ $push },
    //   { skey : z }{ $push }]
    // broken into:
    //  [{ skey : a }, { skey : b }],
    //  [{ skey : [c,x] }],
    //  [{ skey : y }, { skey : z }]
    //

    const bool ordered = _clientRequest->getOrdered();

    TargetedBatchMap batchMap;
    TargetedBatchSizeMap batchSizes;

    int numTargetErrors = 0;

    size_t numWriteOps = _clientRequest->sizeWriteOps();
    for (size_t i = 0; i < numWriteOps; ++i) {
        WriteOp& writeOp = _writeOps[i];

        // Only target _Ready ops
        if (writeOp.getWriteState() != WriteOpState_Ready)
            continue;

        //
        // Get TargetedWrites from the targeter for the write operation
        //

        // TargetedWrites need to be owned once returned
        OwnedPointerVector<TargetedWrite> writesOwned;
        vector<TargetedWrite*>& writes = writesOwned.mutableVector();

        Status targetStatus = writeOp.targetWrites(txn, targeter, &writes);

        if (!targetStatus.isOK()) {
            WriteErrorDetail targetError;
            buildTargetError(targetStatus, &targetError);

            if (!recordTargetErrors) {
                // Cancel current batch state with an error

                cancelBatches(targetError, _writeOps, &batchMap);
                dassert(batchMap.empty());
                return targetStatus;
            } else if (!ordered || batchMap.empty()) {
                // Record an error for this batch

                writeOp.setOpError(targetError);
                ++numTargetErrors;

                if (ordered)
                    return Status::OK();

                continue;
            } else {
                dassert(ordered && !batchMap.empty());

                // Send out what we have, but don't record an error yet, since there may be an
                // error in the writes before this point.

                writeOp.cancelWrites(&targetError);
                break;
            }
        }

        //
        // If ordered and we have a previous endpoint, make sure we don't need to send these
        // targeted writes to any other endpoints.
        //

        if (ordered && !batchMap.empty()) {
            dassert(batchMap.size() == 1u);
            if (isNewBatchRequired(writes, batchMap)) {
                writeOp.cancelWrites(NULL);
                break;
            }
        }

        //
        // If this write will push us over some sort of size limit, stop targeting
        //

        int writeSizeBytes = getWriteSizeBytes(writeOp);
        if (wouldMakeBatchesTooBig(writes, writeSizeBytes, batchSizes)) {
            invariant(!batchMap.empty());
            writeOp.cancelWrites(NULL);
            break;
        }

        //
        // Targeting went ok, add to appropriate TargetedBatch
        //

        for (vector<TargetedWrite*>::iterator it = writes.begin(); it != writes.end(); ++it) {
            TargetedWrite* write = *it;

            TargetedBatchMap::iterator batchIt = batchMap.find(&write->endpoint);
            TargetedBatchSizeMap::iterator batchSizeIt = batchSizes.find(&write->endpoint);

            if (batchIt == batchMap.end()) {
                TargetedWriteBatch* newBatch = new TargetedWriteBatch(write->endpoint);
                batchIt = batchMap.insert(make_pair(&newBatch->getEndpoint(), newBatch)).first;
                batchSizeIt =
                    batchSizes.insert(make_pair(&newBatch->getEndpoint(), BatchSize())).first;
            }

            TargetedWriteBatch* batch = batchIt->second;
            BatchSize& batchSize = batchSizeIt->second;

            ++batchSize.numOps;
            batchSize.sizeBytes += writeSizeBytes;
            batch->addWrite(write);
        }

        // Relinquish ownership of TargetedWrites, now the TargetedBatches own them
        writesOwned.mutableVector().clear();

        //
        // Break if we're ordered and we have more than one endpoint - later writes cannot be
        // enforced as ordered across multiple shard endpoints.
        //

        if (ordered && batchMap.size() > 1u)
            break;
    }

    //
    // Send back our targeted batches
    //

    for (TargetedBatchMap::iterator it = batchMap.begin(); it != batchMap.end(); ++it) {
        TargetedWriteBatch* batch = it->second;

        if (batch->getWrites().empty())
            continue;

        // Remember targeted batch for reporting
        _targeted.insert(batch);
        // Send the handle back to caller
        targetedBatches->push_back(batch);
    }

    return Status::OK();
}
        virtual bool run(OperationContext* txn, const string& dbname, BSONObj& cmdObj, int options,
                          string& errmsg, BSONObjBuilder& result,
                          bool fromRepl = false ) {

            NamespaceString ns( dbname, cmdObj[name].String() );

            AutoGetCollectionForRead ctx(txn, ns.ns());

            Collection* collection = ctx.getCollection();
            if ( !collection )
                return appendCommandStatus( result,
                                            Status( ErrorCodes::NamespaceNotFound,
                                                    str::stream() <<
                                                    "ns does not exist: " << ns.ns() ) );

            size_t numCursors = static_cast<size_t>( cmdObj["numCursors"].numberInt() );

            if ( numCursors == 0 || numCursors > 10000 )
                return appendCommandStatus( result,
                                            Status( ErrorCodes::BadValue,
                                                    str::stream() <<
                                                    "numCursors has to be between 1 and 10000" <<
                                                    " was: " << numCursors ) );

            OwnedPointerVector<RecordIterator> iterators(collection->getManyIterators(txn));

            if (iterators.size() < numCursors) {
                numCursors = iterators.size();
            }

            OwnedPointerVector<PlanExecutor> execs;
            for ( size_t i = 0; i < numCursors; i++ ) {
                WorkingSet* ws = new WorkingSet();
                MultiIteratorStage* mis = new MultiIteratorStage(txn, ws, collection);

                PlanExecutor* rawExec;
                // Takes ownership of 'ws' and 'mis'.
                Status execStatus = PlanExecutor::make(txn, ws, mis, collection,
                                                       PlanExecutor::YIELD_AUTO, &rawExec);
                invariant(execStatus.isOK());
                auto_ptr<PlanExecutor> curExec(rawExec);

                // The PlanExecutor was registered on construction due to the YIELD_AUTO policy.
                // We have to deregister it, as it will be registered with ClientCursor.
                curExec->deregisterExec();

                // Need to save state while yielding locks between now and getMore().
                curExec->saveState();

                execs.push_back(curExec.release());
            }

            // transfer iterators to executors using a round-robin distribution.
            // TODO consider using a common work queue once invalidation issues go away.
            for (size_t i = 0; i < iterators.size(); i++) {
                PlanExecutor* theExec = execs[i % execs.size()];
                MultiIteratorStage* mis = static_cast<MultiIteratorStage*>(theExec->getRootStage());

                // This wasn't called above as they weren't assigned yet
                iterators[i]->saveState();

                mis->addIterator(iterators.releaseAt(i));
            }

            {
                BSONArrayBuilder bucketsBuilder;
                for (size_t i = 0; i < execs.size(); i++) {
                    // transfer ownership of an executor to the ClientCursor (which manages its own
                    // lifetime).
                    ClientCursor* cc = new ClientCursor( collection->getCursorManager(),
                                                         execs.releaseAt(i),
                                                         ns.ns() );

                    BSONObjBuilder threadResult;
                    appendCursorResponseObject( cc->cursorid(),
                                                ns.ns(),
                                                BSONArray(),
                                                &threadResult );
                    threadResult.appendBool( "ok", 1 );

                    bucketsBuilder.append( threadResult.obj() );
                }
                result.appendArray( "cursors", bucketsBuilder.obj() );
            }

            return true;

        }
        virtual bool run(OperationContext* txn, const string& dbname, BSONObj& cmdObj, int options,
                          string& errmsg, BSONObjBuilder& result,
                          bool fromRepl = false ) {

            NamespaceString ns( dbname, cmdObj[name].String() );

            AutoGetCollectionForRead ctx(txn, ns.ns());

            Collection* collection = ctx.getCollection();
            if ( !collection )
                return appendCommandStatus( result,
                                            Status( ErrorCodes::NamespaceNotFound,
                                                    str::stream() <<
                                                    "ns does not exist: " << ns.ns() ) );

            size_t numCursors = static_cast<size_t>( cmdObj["numCursors"].numberInt() );

            if ( numCursors == 0 || numCursors > 10000 )
                return appendCommandStatus( result,
                                            Status( ErrorCodes::BadValue,
                                                    str::stream() <<
                                                    "numCursors has to be between 1 and 10000" <<
                                                    " was: " << numCursors ) );

            OwnedPointerVector<RecordIterator> iterators(collection->getManyIterators(txn));

            if (iterators.size() < numCursors) {
                numCursors = iterators.size();
            }

            OwnedPointerVector<PlanExecutor> execs;
            for ( size_t i = 0; i < numCursors; i++ ) {
                WorkingSet* ws = new WorkingSet();
                MultiIteratorStage* mis = new MultiIteratorStage(txn, ws, collection);
                // Takes ownership of 'ws' and 'mis'.
                auto_ptr<PlanExecutor> curExec(new PlanExecutor(txn, ws, mis, collection));

                // Each of the plan executors should yield automatically. We pass "false" to
                // indicate that 'curExec' should not register itself, as it will get registered
                // by ClientCursor instead.
                curExec->setYieldPolicy(PlanExecutor::YIELD_AUTO, false);

                // Need to save state while yielding locks between now and newGetMore.
                curExec->saveState();

                execs.push_back(curExec.release());
            }

            // transfer iterators to executors using a round-robin distribution.
            // TODO consider using a common work queue once invalidation issues go away.
            for (size_t i = 0; i < iterators.size(); i++) {
                PlanExecutor* theExec = execs[i % execs.size()];
                MultiIteratorStage* mis = static_cast<MultiIteratorStage*>(theExec->getRootStage());
                mis->addIterator(iterators.releaseAt(i));
            }

            {
                BSONArrayBuilder bucketsBuilder;
                for (size_t i = 0; i < execs.size(); i++) {
                    // transfer ownership of an executor to the ClientCursor (which manages its own
                    // lifetime).
                    ClientCursor* cc = new ClientCursor( collection, execs.releaseAt(i) );

                    // we are mimicking the aggregation cursor output here
                    // that is why there are ns, ok and empty firstBatch
                    BSONObjBuilder threadResult;
                    {
                        BSONObjBuilder cursor;
                        cursor.appendArray( "firstBatch", BSONObj() );
                        cursor.append( "ns", ns );
                        cursor.append( "id", cc->cursorid() );
                        threadResult.append( "cursor", cursor.obj() );
                    }
                    threadResult.appendBool( "ok", 1 );

                    bucketsBuilder.append( threadResult.obj() );
                }
                result.appendArray( "cursors", bucketsBuilder.obj() );
            }

            return true;

        }
Example #19
0
    // static
    void Explain::explainStages(PlanExecutor* exec,
                                ExplainCommon::Verbosity verbosity,
                                BSONObjBuilder* out) {
        //
        // Step 1: run the stages as required by the verbosity level.
        //

        // Inspect the tree to see if there is a MultiPlanStage.
        MultiPlanStage* mps = getMultiPlanStage(exec->getRootStage());

        // Get stats of the winning plan from the trial period, if the verbosity level
        // is high enough and there was a runoff between multiple plans.
        auto_ptr<PlanStageStats> winningStatsTrial;
        if (verbosity >= ExplainCommon::EXEC_ALL_PLANS && NULL != mps) {
            winningStatsTrial.reset(exec->getStats());
            invariant(winningStatsTrial.get());
        }

        // If we need execution stats, then run the plan in order to gather the stats.
        Status executePlanStatus = Status::OK();
        if (verbosity >= ExplainCommon::EXEC_STATS) {
            executePlanStatus = exec->executePlan();
        }

        //
        // Step 2: collect plan stats (which also give the structure of the plan tree).
        //

        // Get stats for the winning plan.
        scoped_ptr<PlanStageStats> winningStats(exec->getStats());

        // Get stats for the rejected plans, if more than one plan was considered.
        OwnedPointerVector<PlanStageStats> allPlansStats;
        if (NULL != mps) {
            allPlansStats = mps->generateCandidateStats();
        }

        //
        // Step 3: use the stats trees to produce explain BSON.
        //

        CanonicalQuery* query = exec->getCanonicalQuery();
        if (verbosity >= ExplainCommon::QUERY_PLANNER) {
            generatePlannerInfo(query, winningStats.get(), allPlansStats.vector(), out);
        }

        if (verbosity >= ExplainCommon::EXEC_STATS) {
            BSONObjBuilder execBob(out->subobjStart("executionStats"));

            // If there is an execution error while running the query, the error is reported under
            // the "executionStats" section and the explain as a whole succeeds.
            execBob.append("executionSuccess", executePlanStatus.isOK());
            if (!executePlanStatus.isOK()) {
                execBob.append("errorMessage", executePlanStatus.reason());
                execBob.append("errorCode", executePlanStatus.code());
            }

            // Generate exec stats BSON for the winning plan.
            OperationContext* opCtx = exec->getOpCtx();
            long long totalTimeMillis = opCtx->getCurOp()->elapsedMillis();
            generateExecStats(winningStats.get(), verbosity, &execBob, totalTimeMillis);

            // Also generate exec stats for all plans, if the verbosity level is high enough.
            // These stats reflect what happened during the trial period that ranked the plans.
            if (verbosity >= ExplainCommon::EXEC_ALL_PLANS) {
                // If we ranked multiple plans against each other, then add stats collected
                // from the trial period of the winning plan. The "allPlansExecution" section
                // will contain an apples-to-apples comparison of the winning plan's stats against
                // all rejected plans' stats collected during the trial period.
                if (NULL != mps) {
                    invariant(winningStatsTrial.get());
                    allPlansStats.push_back(winningStatsTrial.release());
                }

                BSONArrayBuilder allPlansBob(execBob.subarrayStart("allPlansExecution"));
                for (size_t i = 0; i < allPlansStats.size(); ++i) {
                    BSONObjBuilder planBob(allPlansBob.subobjStart());
                    generateExecStats(allPlansStats[i], verbosity, &planBob);
                    planBob.doneFast();
                }
                allPlansBob.doneFast();
            }

            execBob.doneFast();
        }

        generateServerInfo(out);
    }
Example #20
0
    Status Strategy::commandOpWrite(const std::string& dbName,
                                    const BSONObj& command,
                                    BatchItemRef targetingBatchItem,
                                    std::vector<CommandResult>* results) {

        // Note that this implementation will not handle targeting retries and does not completely
        // emulate write behavior

        ChunkManagerTargeter targeter(NamespaceString(
                                        targetingBatchItem.getRequest()->getTargetingNS()));
        Status status = targeter.init();
        if (!status.isOK())
            return status;

        OwnedPointerVector<ShardEndpoint> endpointsOwned;
        vector<ShardEndpoint*>& endpoints = endpointsOwned.mutableVector();

        if (targetingBatchItem.getOpType() == BatchedCommandRequest::BatchType_Insert) {
            ShardEndpoint* endpoint;
            Status status = targeter.targetInsert(targetingBatchItem.getDocument(), &endpoint);
            if (!status.isOK())
                return status;
            endpoints.push_back(endpoint);
        }
        else if (targetingBatchItem.getOpType() == BatchedCommandRequest::BatchType_Update) {
            Status status = targeter.targetUpdate(*targetingBatchItem.getUpdate(), &endpoints);
            if (!status.isOK())
                return status;
        }
        else {
            invariant(targetingBatchItem.getOpType() == BatchedCommandRequest::BatchType_Delete);
            Status status = targeter.targetDelete(*targetingBatchItem.getDelete(), &endpoints);
            if (!status.isOK())
                return status;
        }

        DBClientShardResolver resolver;
        DBClientMultiCommand dispatcher;

        // Assemble requests
        for (vector<ShardEndpoint*>::const_iterator it = endpoints.begin(); it != endpoints.end();
            ++it) {

            const ShardEndpoint* endpoint = *it;

            ConnectionString host;
            Status status = resolver.chooseWriteHost(endpoint->shardName, &host);
            if (!status.isOK())
                return status;

            RawBSONSerializable request(command);
            dispatcher.addCommand(host, dbName, request);
        }

        // Errors reported when recv'ing responses
        dispatcher.sendAll();
        Status dispatchStatus = Status::OK();

        // Recv responses
        while (dispatcher.numPending() > 0) {

            ConnectionString host;
            RawBSONSerializable response;

            Status status = dispatcher.recvAny(&host, &response);
            if (!status.isOK()) {
                // We always need to recv() all the sent operations
                dispatchStatus = status;
                continue;
            }

            CommandResult result;
            result.target = host;
            result.shardTarget = Shard::make(host.toString());
            result.result = response.toBSON();

            results->push_back(result);
        }

        return dispatchStatus;
    }
Example #21
0
    /**
     * The core config write functionality.
     *
     * Config writes run in two passes - the first is a quick check to ensure the config servers
     * are all reachable, the second runs the actual write.
     *
     * TODO: Upgrade and move this logic to the config servers, a state machine implementation
     * is probably the next step.
     */
    void ConfigCoordinator::executeBatch( const BatchedCommandRequest& clientRequest,
                                          BatchedCommandResponse* clientResponse,
                                          bool fsyncCheck ) {

        NamespaceString nss( clientRequest.getNS() );
        dassert( nss.db() == "config" || nss.db() == "admin" );
        dassert( clientRequest.sizeWriteOps() == 1u );

        if ( fsyncCheck ) {

            //
            // Sanity check that all configs are still reachable using fsync, preserving legacy
            // behavior
            //

            OwnedPointerVector<ConfigFsyncResponse> fsyncResponsesOwned;
            vector<ConfigFsyncResponse*>& fsyncResponses = fsyncResponsesOwned.mutableVector();

            //
            // Send side
            //

            for ( vector<ConnectionString>::iterator it = _configHosts.begin();
                it != _configHosts.end(); ++it ) {
                ConnectionString& configHost = *it;
                FsyncRequest fsyncRequest;
                _dispatcher->addCommand( configHost, "admin", fsyncRequest );
            }

            _dispatcher->sendAll();

            //
            // Recv side
            //

            bool fsyncError = false;
            while ( _dispatcher->numPending() > 0 ) {

                fsyncResponses.push_back( new ConfigFsyncResponse() );
                ConfigFsyncResponse& fsyncResponse = *fsyncResponses.back();
                Status dispatchStatus = _dispatcher->recvAny( &fsyncResponse.configHost,
                                                              &fsyncResponse.response );

                // We've got to recv everything, no matter what
                if ( !dispatchStatus.isOK() ) {
                    fsyncError = true;
                    buildFsyncErrorFrom( dispatchStatus, &fsyncResponse.response );
                }
                else if ( !fsyncResponse.response.getOk() ) {
                    fsyncError = true;
                }
            }

            if ( fsyncError ) {
                combineFsyncErrors( fsyncResponses, clientResponse );
                return;
            }
            else {
                fsyncResponsesOwned.clear();
            }
        }

        //
        // Do the actual writes
        //

        BatchedCommandRequest configRequest( clientRequest.getBatchType() );
        clientRequest.cloneTo( &configRequest );
        configRequest.setNS( nss.coll() );

        OwnedPointerVector<ConfigResponse> responsesOwned;
        vector<ConfigResponse*>& responses = responsesOwned.mutableVector();

        //
        // Send the actual config writes
        //

        // Get as many batches as we can at once
        for ( vector<ConnectionString>::iterator it = _configHosts.begin();
            it != _configHosts.end(); ++it ) {
            ConnectionString& configHost = *it;
            _dispatcher->addCommand( configHost, nss.db(), configRequest );
        }

        // Send them all out
        _dispatcher->sendAll();

        //
        // Recv side
        //

        while ( _dispatcher->numPending() > 0 ) {

            // Get the response
            responses.push_back( new ConfigResponse() );
            ConfigResponse& configResponse = *responses.back();
            Status dispatchStatus = _dispatcher->recvAny( &configResponse.configHost,
                                                          &configResponse.response );

            if ( !dispatchStatus.isOK() ) {
                buildErrorFrom( dispatchStatus, &configResponse.response );
            }
        }

        combineResponses( responses, clientResponse );
    }
Example #22
0
    static Status parseGeoJSONPolygonCoordinates(const BSONElement& elem, S2Polygon *out) {
        if (Array != elem.type()) { return BAD_VALUE("Polygon coordinates must be an array"); }

        OwnedPointerVector<S2Loop> loops;
        Status status = Status::OK();
        string err;

        BSONObjIterator it(elem.Obj());
        // Iterate all loops of the polygon.
        while (it.more()) {
            // Parse the array of vertices of a loop.
            BSONElement coordinateElt = it.next();
            vector<S2Point> points;
            status = parseArrayOfCoodinates(coordinateElt, &points);
            if (!status.isOK()) return status;

            // Check if the loop is closed.
            status = isLoopClosed(points, coordinateElt);
            if (!status.isOK()) return status;

            eraseDuplicatePoints(&points);
            // Drop the duplicated last point.
            points.resize(points.size() - 1);

            S2Loop* loop = new S2Loop(points);
            loops.push_back(loop);

            // Check whether this loop is valid.
            // 1. At least 3 vertices.
            // 2. All vertices must be unit length. Guaranteed by parsePoints().
            // 3. Loops are not allowed to have any duplicate vertices.
            // 4. Non-adjacent edges are not allowed to intersect.
            if (!loop->IsValid(&err)) {
                return BAD_VALUE("Loop is not valid: " << coordinateElt.toString(false) << " "
                                 << err);
            }
            // If the loop is more than one hemisphere, invert it.
            loop->Normalize();

            // Check the first loop must be the exterior ring and any others must be
            // interior rings or holes.
            if (loops.size() > 1 && !loops[0]->Contains(loop)) {
                return BAD_VALUE("Secondary loops not contained by first exterior loop - "
                    "secondary loops must be holes: " << coordinateElt.toString(false)
                    << " first loop: " << elem.Obj().firstElement().toString(false));
            }
        }

        // Check if the given loops form a valid polygon.
        // 1. If a loop contains an edge AB, then no other loop may contain AB or BA.
        // 2. No loop covers more than half of the sphere.
        // 3. No two loops cross.
        if (!S2Polygon::IsValid(loops.vector(), &err))
            return BAD_VALUE("Polygon isn't valid: " << err << " " << elem.toString(false));

        // Given all loops are valid / normalized and S2Polygon::IsValid() above returns true.
        // The polygon must be valid. See S2Polygon member function IsValid().

        // Transfer ownership of the loops and clears loop vector.
        out->Init(&loops.mutableVector());

        // Check if every loop of this polygon shares at most one vertex with
        // its parent loop.
        if (!out->IsNormalized(&err))
            // "err" looks like "Loop 1 shares more than one vertex with its parent loop 0"
            return BAD_VALUE(err << ": " << elem.toString(false));

        // S2Polygon contains more than one ring, which is allowed by S2, but not by GeoJSON.
        //
        // Loops are indexed according to a preorder traversal of the nesting hierarchy.
        // GetLastDescendant() returns the index of the last loop that is contained within
        // a given loop. We guarantee that the first loop is the exterior ring.
        if (out->GetLastDescendant(0) < out->num_loops() - 1) {
            return BAD_VALUE("Only one exterior polygon loop is allowed: " << elem.toString(false));
        }

        // In GeoJSON, only one nesting is allowed.
        // The depth of a loop is set by polygon according to the nesting hierarchy of polygon,
        // so the exterior ring's depth is 0, a hole in it is 1, etc.
        for (int i = 0; i < out->num_loops(); i++) {
            if (out->loop(i)->depth() > 1) {
                return BAD_VALUE("Polygon interior loops cannot be nested: "<< elem.toString(false));
            }
        }
        return Status::OK();
    }
Example #23
0
    void WriteBatchExecutor::executeBatch( const BatchedCommandRequest& request,
                                           BatchedCommandResponse* response ) {

        // Validate namespace
        const NamespaceString nss = NamespaceString( request.getNS() );
        if ( !nss.isValid() ) {
            toBatchError( Status( ErrorCodes::InvalidNamespace,
                                  nss.ns() + " is not a valid namespace" ),
                          response );
            return;
        }

        // Make sure we can write to the namespace
        Status allowedStatus = userAllowedWriteNS( nss );
        if ( !allowedStatus.isOK() ) {
            toBatchError( allowedStatus, response );
            return;
        }

        // Validate insert index requests
        // TODO: Push insert index requests through createIndex once all upgrade paths support it
        string errMsg;
        if ( request.isInsertIndexRequest() && !request.isValidIndexRequest( &errMsg ) ) {
            toBatchError( Status( ErrorCodes::InvalidOptions, errMsg ), response );
            return;
        }

        // Validate write concern
        // TODO: Lift write concern parsing out of this entirely
        WriteConcernOptions writeConcern;

        BSONObj wcDoc;
        if ( request.isWriteConcernSet() ) {
            wcDoc = request.getWriteConcern();
        }

        Status wcStatus = Status::OK();
        if ( wcDoc.isEmpty() ) {

            // The default write concern if empty is w : 1
            // Specifying w : 0 is/was allowed, but is interpreted identically to w : 1

            wcStatus = writeConcern.parse(
                _defaultWriteConcern.isEmpty() ?
                    WriteConcernOptions::Acknowledged : _defaultWriteConcern );

            if ( writeConcern.wNumNodes == 0 && writeConcern.wMode.empty() ) {
                writeConcern.wNumNodes = 1;
            }
        }
        else {
            wcStatus = writeConcern.parse( wcDoc );
        }

        if ( wcStatus.isOK() ) {
            wcStatus = validateWriteConcern( writeConcern );
        }

        if ( !wcStatus.isOK() ) {
            toBatchError( wcStatus, response );
            return;
        }

        if ( request.sizeWriteOps() == 0u ) {
            toBatchError( Status( ErrorCodes::InvalidLength,
                                  "no write ops were included in the batch" ),
                          response );
            return;
        }

        // Validate batch size
        if ( request.sizeWriteOps() > BatchedCommandRequest::kMaxWriteBatchSize ) {
            toBatchError( Status( ErrorCodes::InvalidLength,
                                  stream() << "exceeded maximum write batch size of "
                                           << BatchedCommandRequest::kMaxWriteBatchSize ),
                          response );
            return;
        }

        //
        // End validation
        //

        bool silentWC = writeConcern.wMode.empty() && writeConcern.wNumNodes == 0
                        && writeConcern.syncMode == WriteConcernOptions::NONE;

        Timer commandTimer;

        OwnedPointerVector<WriteErrorDetail> writeErrorsOwned;
        vector<WriteErrorDetail*>& writeErrors = writeErrorsOwned.mutableVector();

        OwnedPointerVector<BatchedUpsertDetail> upsertedOwned;
        vector<BatchedUpsertDetail*>& upserted = upsertedOwned.mutableVector();

        //
        // Apply each batch item, possibly bulking some items together in the write lock.
        // Stops on error if batch is ordered.
        //

        bulkExecute( request, &upserted, &writeErrors );

        //
        // Try to enforce the write concern if everything succeeded (unordered or ordered)
        // OR if something succeeded and we're unordered.
        //

        auto_ptr<WCErrorDetail> wcError;
        bool needToEnforceWC = writeErrors.empty()
                               || ( !request.getOrdered()
                                    && writeErrors.size() < request.sizeWriteOps() );

        if ( needToEnforceWC ) {

            _client->curop()->setMessage( "waiting for write concern" );

            WriteConcernResult res;
            Status status = waitForWriteConcern( _txn, writeConcern, _client->getLastOp(), &res );

            if ( !status.isOK() ) {
                wcError.reset( toWriteConcernError( status, res ) );
            }
        }

        //
        // Refresh metadata if needed
        //

        bool staleBatch = !writeErrors.empty()
                          && writeErrors.back()->getErrCode() == ErrorCodes::StaleShardVersion;

        if ( staleBatch ) {

            const BatchedRequestMetadata* requestMetadata = request.getMetadata();
            dassert( requestMetadata );

            // Make sure our shard name is set or is the same as what was set previously
            if ( shardingState.setShardName( requestMetadata->getShardName() ) ) {

                //
                // First, we refresh metadata if we need to based on the requested version.
                //

                ChunkVersion latestShardVersion;
                shardingState.refreshMetadataIfNeeded( request.getTargetingNS(),
                                                       requestMetadata->getShardVersion(),
                                                       &latestShardVersion );

                // Report if we're still changing our metadata
                // TODO: Better reporting per-collection
                if ( shardingState.inCriticalMigrateSection() ) {
                    noteInCriticalSection( writeErrors.back() );
                }

                if ( queueForMigrationCommit ) {

                    //
                    // Queue up for migration to end - this allows us to be sure that clients will
                    // not repeatedly try to refresh metadata that is not yet written to the config
                    // server.  Not necessary for correctness.
                    // Exposed as optional parameter to allow testing of queuing behavior with
                    // different network timings.
                    //

                    const ChunkVersion& requestShardVersion = requestMetadata->getShardVersion();

                    //
                    // Only wait if we're an older version (in the current collection epoch) and
                    // we're not write compatible, implying that the current migration is affecting
                    // writes.
                    //

                    if ( requestShardVersion.isOlderThan( latestShardVersion ) &&
                         !requestShardVersion.isWriteCompatibleWith( latestShardVersion ) ) {

                        while ( shardingState.inCriticalMigrateSection() ) {

                            log() << "write request to old shard version "
                                  << requestMetadata->getShardVersion().toString()
                                  << " waiting for migration commit" << endl;

                            shardingState.waitTillNotInCriticalSection( 10 /* secs */);
                        }
                    }
                }
            }
            else {
                // If our shard name is stale, our version must have been stale as well
                dassert( writeErrors.size() == request.sizeWriteOps() );
            }
        }

        //
        // Construct response
        //

        response->setOk( true );

        if ( !silentWC ) {

            if ( upserted.size() ) {
                response->setUpsertDetails( upserted );
            }

            if ( writeErrors.size() ) {
                response->setErrDetails( writeErrors );
            }

            if ( wcError.get() ) {
                response->setWriteConcernError( wcError.release() );
            }

            const repl::ReplicationCoordinator::Mode replMode =
                    repl::getGlobalReplicationCoordinator()->getReplicationMode();
            if (replMode != repl::ReplicationCoordinator::modeNone) {
                response->setLastOp( _client->getLastOp() );
                if (replMode == repl::ReplicationCoordinator::modeReplSet) {
                    response->setElectionId(repl::theReplSet->getElectionId());
                }
            }

            // Set the stats for the response
            response->setN( _stats->numInserted + _stats->numUpserted + _stats->numMatched
                            + _stats->numDeleted );
            if ( request.getBatchType() == BatchedCommandRequest::BatchType_Update )
                response->setNModified( _stats->numModified );
        }

        dassert( response->isValid( NULL ) );
    }
        virtual bool run( const string& dbname, BSONObj& cmdObj, int options,
                          string& errmsg, BSONObjBuilder& result,
                          bool fromRepl = false ) {

            NamespaceString ns( dbname, cmdObj[name].String() );

            Client::ReadContext ctx(ns.ns());

            Database* db = ctx.ctx().db();
            Collection* collection = db->getCollection( ns );

            if ( !collection )
                return appendCommandStatus( result,
                                            Status( ErrorCodes::NamespaceNotFound,
                                                    str::stream() <<
                                                    "ns does not exist: " << ns.ns() ) );

            size_t numCursors = static_cast<size_t>( cmdObj["numCursors"].numberInt() );

            if ( numCursors == 0 || numCursors > 10000 )
                return appendCommandStatus( result,
                                            Status( ErrorCodes::BadValue,
                                                    str::stream() <<
                                                    "numCursors has to be between 1 and 10000" <<
                                                    " was: " << numCursors ) );

            OwnedPointerVector<RecordIterator> iterators(collection->getManyIterators());

            if (iterators.size() < numCursors) {
                numCursors = iterators.size();
            }

            OwnedPointerVector<MultiIteratorRunner> runners;
            for ( size_t i = 0; i < numCursors; i++ ) {
                runners.push_back(new MultiIteratorRunner(ns.ns(), collection));
            }

            // transfer iterators to runners using a round-robin distribution.
            // TODO consider using a common work queue once invalidation issues go away.
            for (size_t i = 0; i < iterators.size(); i++) {
                runners[i % runners.size()]->addIterator(iterators.releaseAt(i));
            }

            {
                BSONArrayBuilder bucketsBuilder;
                for (size_t i = 0; i < runners.size(); i++) {
                    // transfer ownership of a runner to the ClientCursor (which manages its own
                    // lifetime).
                    ClientCursor* cc = new ClientCursor( collection, runners.releaseAt(i) );

                    // we are mimicking the aggregation cursor output here
                    // that is why there are ns, ok and empty firstBatch
                    BSONObjBuilder threadResult;
                    {
                        BSONObjBuilder cursor;
                        cursor.appendArray( "firstBatch", BSONObj() );
                        cursor.append( "ns", ns );
                        cursor.append( "id", cc->cursorid() );
                        threadResult.append( "cursor", cursor.obj() );
                    }
                    threadResult.appendBool( "ok", 1 );

                    bucketsBuilder.append( threadResult.obj() );
                }
                result.appendArray( "cursors", bucketsBuilder.obj() );
            }

            return true;

        }
    /**
     * Upgrade v3 to v4 described here.
     *
     * This upgrade takes a config server without collection epochs (potentially) and adds
     * epochs to all mongo processes.
     *
     */
    bool doUpgradeV3ToV4(const ConnectionString& configLoc,
                         const VersionType& lastVersionInfo,
                         string* errMsg)
    {
        string dummy;
        if (!errMsg) errMsg = &dummy;

        verify(lastVersionInfo.getCurrentVersion() == UpgradeHistory_NoEpochVersion);

        if (lastVersionInfo.isUpgradeIdSet() && lastVersionInfo.getUpgradeId().isSet()) {

            //
            // Another upgrade failed, so cleanup may be necessary
            //

            BSONObj lastUpgradeState = lastVersionInfo.getUpgradeState();

            bool inCriticalSection;
            if (!FieldParser::extract(lastUpgradeState,
                                      inCriticalSectionField,
                                      &inCriticalSection,
                                      errMsg))
            {

                *errMsg = stream() << "problem reading previous upgrade state" << causedBy(errMsg);

                return false;
            }

            if (inCriticalSection) {

                // Manual intervention is needed here.  Somehow our upgrade didn't get applied
                // consistently across config servers.

                *errMsg = cannotCleanupMessage;

                return false;
            }

            if (!_cleanupUpgradeState(configLoc, lastVersionInfo.getUpgradeId(), errMsg)) {
                
                // If we can't cleanup the old upgrade state, the user might have done it for us,
                // not a fatal problem (we'll just end up with extra collections).
                
                warning() << "could not cleanup previous upgrade state" << causedBy(errMsg) << endl;
                *errMsg = "";
            }
        }

        //
        // Check the versions of other mongo processes in the cluster before upgrade.
        // We can't upgrade if there are active pre-v2.2 processes in the cluster
        //

        Status mongoVersionStatus = checkClusterMongoVersions(configLoc,
                                                              string(minMongoProcessVersion));

        if (!mongoVersionStatus.isOK()) {

            *errMsg = stream() << "cannot upgrade with pre-v" << minMongoProcessVersion
                               << " mongo processes active in the cluster"
                               << causedBy(mongoVersionStatus);

            return false;
        }

        VersionType newVersionInfo;
        lastVersionInfo.cloneTo(&newVersionInfo);

        // Set our upgrade id and state
        OID upgradeId = OID::gen();
        newVersionInfo.setUpgradeId(upgradeId);
        newVersionInfo.setUpgradeState(BSONObj());

        // Write our upgrade id and state
        {
            scoped_ptr<ScopedDbConnection> connPtr;

            try {
                connPtr.reset(ScopedDbConnection::getInternalScopedDbConnection(configLoc, 30));
                ScopedDbConnection& conn = *connPtr;

                verify(newVersionInfo.isValid(NULL));

                conn->update(VersionType::ConfigNS,
                             BSON("_id" << 1 << VersionType::version_DEPRECATED(3)),
                             newVersionInfo.toBSON());
                _checkGLE(conn);
            }
            catch (const DBException& e) {

                *errMsg = stream() << "could not initialize version info for upgrade"
                                   << causedBy(e);

                return false;
            }

            connPtr->done();
        }

        //
        // First lock all collection namespaces that exist
        //

        OwnedPointerMap<string, CollectionType> ownedCollections;
        const map<string, CollectionType*>& collections = ownedCollections.map();

        Status findCollectionsStatus = findAllCollectionsV3(configLoc, &ownedCollections);

        if (!findCollectionsStatus.isOK()) {

            *errMsg = stream() << "could not read collections from config server"
                               << causedBy(findCollectionsStatus);

            return false;
        }

        //
        // Acquire locks for all sharded collections
        // Something that didn't involve getting thousands of locks would be better.
        //

        OwnedPointerVector<ScopedDistributedLock> collectionLocks;

        log() << "acquiring locks for " << collections.size() << " sharded collections..." << endl;
        
        // WARNING - this string is used programmatically when forcing locks, be careful when
        // changing!
        // TODO: Add programmatic "why" field to lock collection
        string lockMessage = str::stream() << "ensuring epochs for config upgrade"
                                           << " (" << upgradeId.toString() << ")";
        
        if (!_acquireAllCollectionLocks(configLoc,
                                        collections,
                                        lockMessage,
                                        20 * 60 * 1000,
                                        &collectionLocks,
                                        errMsg))
        {

            *errMsg = stream() << "could not acquire all namespace locks for upgrade" 
                               << " (" << upgradeId.toString() << ")"
                               << causedBy(errMsg);

            return false;
        }

        // We are now preventing all splits and migrates for all sharded collections

        // Get working and backup suffixes
        string workingSuffix = genWorkingSuffix(upgradeId);
        string backupSuffix = genBackupSuffix(upgradeId);

        log() << "copying collection and chunk metadata to working and backup collections..."
              << endl;

        // Get a backup and working copy of the config.collections and config.chunks collections

        Status copyStatus = copyFrozenCollection(configLoc,
                                                 CollectionType::ConfigNS,
                                                 CollectionType::ConfigNS + workingSuffix);

        if (!copyStatus.isOK()) {

            *errMsg = stream() << "could not copy " << CollectionType::ConfigNS << " to "
                               << (CollectionType::ConfigNS + workingSuffix)
                               << causedBy(copyStatus);

            return false;
        }

        copyStatus = copyFrozenCollection(configLoc,
                                          CollectionType::ConfigNS,
                                          CollectionType::ConfigNS + backupSuffix);

        if (!copyStatus.isOK()) {

            *errMsg = stream() << "could not copy " << CollectionType::ConfigNS << " to "
                               << (CollectionType::ConfigNS + backupSuffix) << causedBy(copyStatus);

            return false;
        }

        copyStatus = copyFrozenCollection(configLoc,
                                          ChunkType::ConfigNS,
                                          ChunkType::ConfigNS + workingSuffix);

        if (!copyStatus.isOK()) {

            *errMsg = stream() << "could not copy " << ChunkType::ConfigNS << " to "
                               << (ChunkType::ConfigNS + workingSuffix) << causedBy(copyStatus);

            return false;
        }

        copyStatus = copyFrozenCollection(configLoc,
                                          ChunkType::ConfigNS,
                                          ChunkType::ConfigNS + backupSuffix);

        if (!copyStatus.isOK()) {

            *errMsg = stream() << "could not copy " << ChunkType::ConfigNS << " to "
                               << (ChunkType::ConfigNS + backupSuffix) << causedBy(copyStatus);

            return false;
        }

        //
        // Go through sharded collections one-by-one and add epochs where missing
        //

        for (map<string, CollectionType*>::const_iterator it = collections.begin();
                it != collections.end(); ++it)
        {
            // Create a copy so that we can change the epoch later
            CollectionType collection;
            it->second->cloneTo(&collection);

            log() << "checking epochs for " << collection.getNS() << " collection..." << endl;

            OID epoch = collection.getEpoch();

            //
            // Go through chunks to find epoch if we haven't found it or to verify epoch is the same
            //

            OwnedPointerVector<ChunkType> ownedChunks;
            const vector<ChunkType*>& chunks = ownedChunks.vector();

            Status findChunksStatus = findAllChunks(configLoc, collection.getNS(), &ownedChunks);

            if (!findChunksStatus.isOK()) {

                *errMsg = stream() << "could not read chunks from config server"
                                   << causedBy(findChunksStatus);

                return false;
            }

            for (vector<ChunkType*>::const_iterator chunkIt = chunks.begin();
                    chunkIt != chunks.end(); ++chunkIt)
            {
                const ChunkType& chunk = *(*chunkIt);

                // If our chunk epoch is set and doesn't match
                if (epoch.isSet() && chunk.getVersion().epoch().isSet()
                    && chunk.getVersion().epoch() != epoch)
                {

                    *errMsg = stream() << "chunk epoch for " << chunk.toString() << " in "
                                       << collection.getNS() << " does not match found epoch "
                                       << epoch;

                    return false;
                }
                else if (!epoch.isSet() && chunk.getVersion().epoch().isSet()) {
                    epoch = chunk.getVersion().epoch();
                }
            }

            //
            // Write collection epoch if needed
            //

            if (!collection.getEpoch().isSet()) {

                OID newEpoch = OID::gen();

                log() << "writing new epoch " << newEpoch << " for " << collection.getNS()
                      << " collection..." << endl;

                scoped_ptr<ScopedDbConnection> connPtr;

                try {
                    connPtr.reset(ScopedDbConnection::getInternalScopedDbConnection(configLoc, 30));
                    ScopedDbConnection& conn = *connPtr;

                    conn->update(CollectionType::ConfigNS + workingSuffix,
                                 BSON(CollectionType::ns(collection.getNS())),
                                 BSON("$set" << BSON(CollectionType::DEPRECATED_lastmodEpoch(newEpoch))));
                    _checkGLE(conn);
                }
                catch (const DBException& e) {

                    *errMsg = stream() << "could not write a new epoch for " << collection.getNS()
                                       << causedBy(e);

                    return false;
                }

                connPtr->done();
                collection.setEpoch(newEpoch);
            }

            epoch = collection.getEpoch();
            verify(epoch.isSet());

            //
            // Now write verified epoch to all chunks
            //

            log() << "writing epoch " << epoch << " for " << chunks.size() << " chunks in "
                  << collection.getNS() << " collection..." << endl;

            {
                scoped_ptr<ScopedDbConnection> connPtr;

                try {
                    connPtr.reset(ScopedDbConnection::getInternalScopedDbConnection(configLoc, 30));
                    ScopedDbConnection& conn = *connPtr;

                    // Multi-update of all chunks
                    conn->update(ChunkType::ConfigNS + workingSuffix,
                                 BSON(ChunkType::ns(collection.getNS())),
                                 BSON("$set" << BSON(ChunkType::DEPRECATED_epoch(epoch))),
                                 false,
                                 true); // multi
                    _checkGLE(conn);
                }
                catch (const DBException& e) {

                    *errMsg = stream() << "could not write a new epoch " << epoch.toString()
                                       << " for chunks in " << collection.getNS() << causedBy(e);

                    return false;
                }

                connPtr->done();
            }
        }

        //
        // Paranoid verify the collection writes
        //

        {
            scoped_ptr<ScopedDbConnection> connPtr;

            try {
                connPtr.reset(ScopedDbConnection::getInternalScopedDbConnection(configLoc, 30));
                ScopedDbConnection& conn = *connPtr;

                // Find collections with no epochs
                BSONObj emptyDoc =
                        conn->findOne(CollectionType::ConfigNS + workingSuffix,
                                      BSON("$unset" << BSON(CollectionType::DEPRECATED_lastmodEpoch() << 1)));

                if (!emptyDoc.isEmpty()) {

                    *errMsg = stream() << "collection " << emptyDoc
                                       << " is still missing epoch after config upgrade";

                    connPtr->done();
                    return false;
                }

                // Find collections with empty epochs
                emptyDoc = conn->findOne(CollectionType::ConfigNS + workingSuffix,
                                         BSON(CollectionType::DEPRECATED_lastmodEpoch(OID())));

                if (!emptyDoc.isEmpty()) {

                    *errMsg = stream() << "collection " << emptyDoc
                                       << " still has empty epoch after config upgrade";

                    connPtr->done();
                    return false;
                }

                // Find chunks with no epochs
                emptyDoc =
                        conn->findOne(ChunkType::ConfigNS + workingSuffix,
                                      BSON("$unset" << BSON(ChunkType::DEPRECATED_epoch() << 1)));

                if (!emptyDoc.isEmpty()) {

                    *errMsg = stream() << "chunk " << emptyDoc
                                       << " is still missing epoch after config upgrade";

                    connPtr->done();
                    return false;
                }

                // Find chunks with empty epochs
                emptyDoc = conn->findOne(ChunkType::ConfigNS + workingSuffix,
                                         BSON(ChunkType::DEPRECATED_epoch(OID())));

                if (!emptyDoc.isEmpty()) {

                    *errMsg = stream() << "chunk " << emptyDoc
                                       << " still has empty epoch after config upgrade";

                    connPtr->done();
                    return false;
                }
            }
            catch (const DBException& e) {

                *errMsg = stream() << "could not verify epoch writes" << causedBy(e);

                return false;
            }

            connPtr->done();
        }

        //
        // Double check that our collections haven't changed
        //

        Status idCheckStatus = checkIdsTheSame(configLoc,
                                               CollectionType::ConfigNS,
                                               CollectionType::ConfigNS + workingSuffix);

        if (!idCheckStatus.isOK()) {

            *errMsg = stream() << CollectionType::ConfigNS
                               << " was modified while working on upgrade"
                               << causedBy(idCheckStatus);

            return false;
        }

        idCheckStatus = checkIdsTheSame(configLoc,
                                        ChunkType::ConfigNS,
                                        ChunkType::ConfigNS + workingSuffix);

        if (!idCheckStatus.isOK()) {

            *errMsg = stream() << ChunkType::ConfigNS << " was modified while working on upgrade"
                               << causedBy(idCheckStatus);

            return false;
        }

        //
        // ENTER CRITICAL SECTION
        //

        newVersionInfo.setUpgradeState(BSON(inCriticalSectionField(true)));

        {
            scoped_ptr<ScopedDbConnection> connPtr;

            try {
                connPtr.reset(ScopedDbConnection::getInternalScopedDbConnection(configLoc, 30));
                ScopedDbConnection& conn = *connPtr;

                verify(newVersionInfo.isValid(NULL));

                conn->update(VersionType::ConfigNS,
                             BSON("_id" << 1 << VersionType::version_DEPRECATED(3)),
                             newVersionInfo.toBSON());
                _checkGLE(conn);
            }
            catch (const DBException& e) {

                // No cleanup message here since we're not sure if we wrote or not, and
                // not dangerous either way except to prevent further updates (at which point
                // the message is printed)
                *errMsg = stream()
                        << "could not update version info to enter critical update section"
                        << causedBy(e);

                return false;
            }

            // AT THIS POINT ANY FAILURE REQUIRES MANUAL INTERVENTION!
            connPtr->done();
        }

        log() << "entered critical section for config upgrade" << endl;

        Status overwriteStatus = overwriteCollection(configLoc,
                                                     CollectionType::ConfigNS + workingSuffix,
                                                     CollectionType::ConfigNS);

        if (!overwriteStatus.isOK()) {

            error() << cleanupMessage << endl;
            *errMsg = stream() << "could not overwrite collection " << CollectionType::ConfigNS
                               << " with working collection "
                               << (CollectionType::ConfigNS + workingSuffix)
                               << causedBy(overwriteStatus);

            return false;
        }

        overwriteStatus = overwriteCollection(configLoc,
                                              ChunkType::ConfigNS + workingSuffix,
                                              ChunkType::ConfigNS);

        if (!overwriteStatus.isOK()) {

            error() << cleanupMessage << endl;
            *errMsg = stream() << "could not overwrite collection " << ChunkType::ConfigNS
                               << " with working collection "
                               << (ChunkType::ConfigNS + workingSuffix)
                               << causedBy(overwriteStatus);

            return false;
        }

        //
        // Finally update the version to latest and add clusterId to version
        //

        OID newClusterId = OID::gen();

        // Note: hardcoded versions, since this is a very particular upgrade
        // Note: DO NOT CLEAR the config version unless bumping the minCompatibleVersion,
        // we want to save the excludes that were set.

        newVersionInfo.setMinCompatibleVersion(UpgradeHistory_NoEpochVersion);
        newVersionInfo.setCurrentVersion(UpgradeHistory_MandatoryEpochVersion);
        newVersionInfo.setClusterId(newClusterId);

        // Leave critical section
        newVersionInfo.unsetUpgradeId();
        newVersionInfo.unsetUpgradeState();

        log() << "writing new version info and clusterId " << newClusterId << "..." << endl;

        {
            scoped_ptr<ScopedDbConnection> connPtr;

            try {
                connPtr.reset(ScopedDbConnection::getInternalScopedDbConnection(configLoc, 30));
                ScopedDbConnection& conn = *connPtr;

                verify(newVersionInfo.isValid(NULL));

                conn->update(VersionType::ConfigNS,
                             BSON("_id" << 1 << VersionType::version_DEPRECATED(UpgradeHistory_NoEpochVersion)),
                             newVersionInfo.toBSON());
                _checkGLE(conn);
            }
            catch (const DBException& e) {

                error() << cleanupMessage << endl;
                *errMsg = stream() << "could not write new version info "
                                   << "and exit critical upgrade section" << causedBy(e);

                return false;
            }

            connPtr->done();
        }

        //
        // END CRITICAL SECTION
        //

        return true;
    }
Example #26
0
    void Listener::initAndListen() {
        if (!_setupSocketsSuccessful) {
            return;
        }

        for (unsigned i = 0; i < _socks.size(); i++) {
            if (::listen(_socks[i], 128) != 0) {
                error() << "listen(): listen() failed " << errnoWithDescription() << endl;
                return;
            }

            ListeningSockets::get()->add(_socks[i]);
        }

#ifdef MONGO_CONFIG_SSL
        _logListen(_port, _ssl);
#else
        _logListen(_port, false);
#endif

        {
            // Wake up any threads blocked in waitUntilListening()
            stdx::lock_guard<stdx::mutex> lock(_readyMutex);
            _ready = true;
            _readyCondition.notify_all();
        }

        OwnedPointerVector<EventHolder> eventHolders;
        std::unique_ptr<WSAEVENT[]> events(new WSAEVENT[_socks.size()]);
        
        
        // Populate events array with an event for each socket we are watching
        for (size_t count = 0; count < _socks.size(); ++count) {
            EventHolder* ev(new EventHolder);
            eventHolders.mutableVector().push_back(ev);
            events[count] = ev->get();            
        }
            
        while ( ! inShutdown() ) {
            // Turn on listening for accept-ready sockets
            for (size_t count = 0; count < _socks.size(); ++count) {
                int status = WSAEventSelect(_socks[count], events[count], FD_ACCEPT | FD_CLOSE);
                if (status == SOCKET_ERROR) {
                    const int mongo_errno = WSAGetLastError();

                    // During shutdown, we may fail to listen on the socket if it has already
                    // been closed
                    if (inShutdown()) {
                        return;
                    }

                    error() << "Windows WSAEventSelect returned "
                        << errnoWithDescription(mongo_errno) << endl;
                    fassertFailed(16727);
                }
            }
        
            // Wait till one of them goes active, or we time out
            DWORD result = WSAWaitForMultipleEvents(_socks.size(),
                                                    events.get(), 
                                                    FALSE, // don't wait for all the events
                                                    10, // timeout, in ms 
                                                    FALSE); // do not allow I/O interruptions
            if (result == WSA_WAIT_FAILED) {
                const int mongo_errno = WSAGetLastError();
                error() << "Windows WSAWaitForMultipleEvents returned " 
                    << errnoWithDescription(mongo_errno) << endl;
                fassertFailed(16723);
            }
        
            if (result == WSA_WAIT_TIMEOUT) {
                _elapsedTime += 10;
                continue;
            }
            _elapsedTime += 1; // assume 1ms to grab connection. very rough
            
            // Determine which socket is ready
            DWORD eventIndex = result - WSA_WAIT_EVENT_0;
            WSANETWORKEVENTS networkEvents;            
            // Extract event details, and clear event for next pass
            int status = WSAEnumNetworkEvents(_socks[eventIndex],
                                              events[eventIndex], 
                                              &networkEvents);
            if (status == SOCKET_ERROR) {
                const int mongo_errno = WSAGetLastError();
                error() << "Windows WSAEnumNetworkEvents returned " 
                    << errnoWithDescription(mongo_errno) << endl;
                continue;
            }
            
            if (networkEvents.lNetworkEvents & FD_CLOSE) {              
                log() << "listen socket closed" << endl;
                break;
            }
            
            if (!(networkEvents.lNetworkEvents & FD_ACCEPT)) {
                error() << "Unexpected network event: " << networkEvents.lNetworkEvents << endl;
                continue;
            }
            
            int iec = networkEvents.iErrorCode[FD_ACCEPT_BIT];
            if (iec != 0) {                 
                error() << "Windows socket accept did not work:" << errnoWithDescription(iec) 
                        << endl;
                continue;
            }
            
            status = WSAEventSelect(_socks[eventIndex], NULL, 0);
            if (status == SOCKET_ERROR) {
                const int mongo_errno = WSAGetLastError();
                error() << "Windows WSAEventSelect returned " 
                    << errnoWithDescription(mongo_errno) << endl;
                continue;
            }
            
            disableNonblockingMode(_socks[eventIndex]);
            
            SockAddr from;
            int s = accept(_socks[eventIndex], from.raw(), &from.addressSize);
            if ( s < 0 ) {
                int x = errno; // so no global issues
                if (x == EBADF) {
                    log() << "Port " << _port << " is no longer valid" << endl;
                    continue;
                }
                else if (x == ECONNABORTED) {
                    log() << "Listener on port " << _port << " aborted" << endl;
                    continue;
                }
                if ( x == 0 && inShutdown() ) {
                    return;   // socket closed
                }
                if( !inShutdown() ) {
                    log() << "Listener: accept() returns " << s << " " 
                        << errnoWithDescription(x) << endl;
                    if (x == EMFILE || x == ENFILE) {
                        // Connection still in listen queue but we can't accept it yet
                        error() << "Out of file descriptors. Waiting one second before"
                            " trying to accept more connections." << warnings;
                        sleepsecs(1);
                    }
                }
                continue;
            }
            if (from.getType() != AF_UNIX)
                disableNagle(s);

            long long myConnectionNumber = globalConnectionNumber.addAndFetch(1);

            if (_logConnect && !serverGlobalParams.quiet) {
                int conns = globalTicketHolder.used()+1;
                const char* word = (conns == 1 ? " connection" : " connections");
                log() << "connection accepted from " << from.toString() << " #" << myConnectionNumber << " (" << conns << word << " now open)" << endl;
            }
            
            std::shared_ptr<Socket> pnewSock( new Socket(s, from) );
#ifdef MONGO_CONFIG_SSL
            if (_ssl) {
                pnewSock->secureAccepted(_ssl);
            }
#endif
            accepted( pnewSock , myConnectionNumber );
        }
    }
Example #27
0
// static
Status ClearFilters::clear(QuerySettings* querySettings, PlanCache* planCache,
                           const std::string& ns, const BSONObj& cmdObj) {
    invariant(querySettings);

    // According to the specification, the planCacheClearFilters command runs in two modes:
    // - clear all hints; or
    // - clear hints for single query shape when a query shape is described in the
    //   command arguments.
    if (cmdObj.hasField("query")) {
        CanonicalQuery* cqRaw;
        Status status = PlanCacheCommand::canonicalize(ns, cmdObj, &cqRaw);
        if (!status.isOK()) {
            return status;
        }

        scoped_ptr<CanonicalQuery> cq(cqRaw);
        querySettings->removeAllowedIndices(*cq);

        // Remove entry from plan cache
        planCache->remove(*cq);
        return Status::OK();
    }

    // If query is not provided, make sure sort and projection are not in arguments.
    // We do not want to clear the entire cache inadvertently when the user
    // forgot to provide a value for "query".
    if (cmdObj.hasField("sort") || cmdObj.hasField("projection")) {
        return Status(ErrorCodes::BadValue, "sort or projection provided without query");
    }

    // Get entries from query settings. We need to remove corresponding entries from the plan
    // cache shortly.
    OwnedPointerVector<AllowedIndexEntry> entries;
    entries.mutableVector() = querySettings->getAllAllowedIndices();

    // OK to proceed with clearing entire cache.
    querySettings->clearAllowedIndices();

    const NamespaceString nss(ns);
    const WhereCallbackReal whereCallback(nss.db());

    // Remove corresponding entries from plan cache.
    // Admin hints affect the planning process directly. If there were
    // plans generated as a result of applying index filter, these need to be
    // invalidated. This allows the planner to re-populate the plan cache with
    // non-filtered indexed solutions next time the query is run.
    // Resolve plan cache key from (query, sort, projection) in query settings entry.
    // Concurrency note: There's no harm in removing plan cache entries one at at time.
    // Only way that PlanCache::remove() can fail is when the query shape has been removed from
    // the cache by some other means (re-index, collection info reset, ...). This is OK since
    // that's the intended effect of calling the remove() function with the key from the hint entry.
    for (vector<AllowedIndexEntry*>::const_iterator i = entries.begin();
            i != entries.end(); ++i) {
        AllowedIndexEntry* entry = *i;
        invariant(entry);

        // Create canonical query.
        CanonicalQuery* cqRaw;
        Status result = CanonicalQuery::canonicalize(
                            ns, entry->query, entry->sort, entry->projection, &cqRaw, whereCallback);
        invariant(result.isOK());
        scoped_ptr<CanonicalQuery> cq(cqRaw);

        // Remove plan cache entry.
        planCache->remove(*cq);
    }

    return Status::OK();
}
Example #28
0
    /** Note: if the object shrinks a lot, we don't free up space, we leave extra at end of the record.
     */
    const DiskLoc DataFileMgr::updateRecord(
        const char *ns,
        Collection* collection,
        Record *toupdate, const DiskLoc& dl,
        const char *_buf, int _len, OpDebug& debug,  bool god) {

        dassert( toupdate == dl.rec() );

        BSONObj objOld = BSONObj::make(toupdate);
        BSONObj objNew(_buf);
        DEV verify( objNew.objsize() == _len );
        DEV verify( objNew.objdata() == _buf );

        if( !objNew.hasElement("_id") && objOld.hasElement("_id") ) {
            /* add back the old _id value if the update removes it.  Note this implementation is slow
               (copies entire object multiple times), but this shouldn't happen often, so going for simple
               code, not speed.
            */
            BSONObjBuilder b;
            BSONElement e;
            verify( objOld.getObjectID(e) );
            b.append(e); // put _id first, for best performance
            b.appendElements(objNew);
            objNew = b.obj();
        }

        NamespaceString nsstring(ns);
        if (nsstring.coll() == "system.users") {
            V2UserDocumentParser parser;
            uassertStatusOK(parser.checkValidUserDocument(objNew));
        }

        uassert( 13596 , str::stream() << "cannot change _id of a document old:" << objOld << " new:" << objNew,
                objNew["_id"] == objOld["_id"]);

        /* duplicate key check. we descend the btree twice - once for this check, and once for the actual inserts, further
           below.  that is suboptimal, but it's pretty complicated to do it the other way without rollbacks...
        */
        OwnedPointerVector<UpdateTicket> updateTickets;
        updateTickets.mutableVector().resize(collection->details()->getTotalIndexCount());
        for (int i = 0; i < collection->details()->getTotalIndexCount(); ++i) {
            auto_ptr<IndexDescriptor> descriptor(CatalogHack::getDescriptor(collection->details(), i));
            auto_ptr<IndexAccessMethod> iam(CatalogHack::getIndex(descriptor.get()));
            InsertDeleteOptions options;
            options.logIfError = false;
            options.dupsAllowed = !(KeyPattern::isIdKeyPattern(descriptor->keyPattern())
                                    || descriptor->unique())
                                  || ignoreUniqueIndex(descriptor->getOnDisk());
            updateTickets.mutableVector()[i] = new UpdateTicket();
            Status ret = iam->validateUpdate(objOld, objNew, dl, options,
                                             updateTickets.mutableVector()[i]);

            if (Status::OK() != ret) {
                uasserted(ASSERT_ID_DUPKEY, "Update validation failed: " + ret.toString());
            }
        }

        if ( toupdate->netLength() < objNew.objsize() ) {
            // doesn't fit.  reallocate -----------------------------------------------------
            moveCounter.increment();
            uassert( 10003,
                     "failing update: objects in a capped ns cannot grow",
                     !(collection && collection->details()->isCapped()));
            collection->details()->paddingTooSmall();
            deleteRecord(ns, toupdate, dl);
            DiskLoc res = insert(ns, objNew.objdata(), objNew.objsize(), false, god);

            if (debug.nmoved == -1) // default of -1 rather than 0
                debug.nmoved = 1;
            else
                debug.nmoved += 1;

            return res;
        }

        collection->infoCache()->notifyOfWriteOp();
        collection->details()->paddingFits();

        debug.keyUpdates = 0;

        for (int i = 0; i < collection->details()->getTotalIndexCount(); ++i) {
            auto_ptr<IndexDescriptor> descriptor(CatalogHack::getDescriptor(collection->details(), i));
            auto_ptr<IndexAccessMethod> iam(CatalogHack::getIndex(descriptor.get()));
            int64_t updatedKeys;
            Status ret = iam->update(*updateTickets.vector()[i], &updatedKeys);
            if (Status::OK() != ret) {
                // This shouldn't happen unless something disastrous occurred.
                massert(16799, "update failed: " + ret.toString(), false);
            }
            debug.keyUpdates += updatedKeys;
        }

        //  update in place
        int sz = objNew.objsize();
        memcpy(getDur().writingPtr(toupdate->data(), sz), objNew.objdata(), sz);
        return dl;
    }
Example #29
0
    bool mergeChunks( OperationContext* txn,
                      const NamespaceString& nss,
                      const BSONObj& minKey,
                      const BSONObj& maxKey,
                      const OID& epoch,
                      string* errMsg ) {

        //
        // Get sharding state up-to-date
        //

        ConnectionString configLoc = ConnectionString::parse( shardingState.getConfigServer(),
                                                              *errMsg );
        if ( !configLoc.isValid() ){
            warning() << *errMsg << endl;
            return false;
        }

        //
        // Get the distributed lock
        //

        ScopedDistributedLock collLock( configLoc, nss.ns() );
        collLock.setLockMessage( stream() << "merging chunks in " << nss.ns() << " from "
                                          << minKey << " to " << maxKey );

        Status acquisitionStatus = collLock.tryAcquire();
        if (!acquisitionStatus.isOK()) {
            *errMsg = stream() << "could not acquire collection lock for " << nss.ns()
                               << " to merge chunks in [" << minKey << "," << maxKey << ")"
                               << causedBy(acquisitionStatus);

            warning() << *errMsg << endl;
            return false;
        }

        //
        // We now have the collection lock, refresh metadata to latest version and sanity check
        //

        ChunkVersion shardVersion;
        Status status = shardingState.refreshMetadataNow(txn, nss.ns(), &shardVersion);

        if ( !status.isOK() ) {

            *errMsg = str::stream() << "could not merge chunks, failed to refresh metadata for "
                                    << nss.ns() << causedBy( status.reason() );

            warning() << *errMsg << endl;
            return false;
        }

        if ( epoch.isSet() && shardVersion.epoch() != epoch ) {

            *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                               << " has changed" << " since merge was sent" << "(sent epoch : "
                               << epoch.toString()
                               << ", current epoch : " << shardVersion.epoch().toString() << ")";

            warning() << *errMsg << endl;
            return false;
        }

        CollectionMetadataPtr metadata = shardingState.getCollectionMetadata( nss.ns() );

        if ( !metadata || metadata->getKeyPattern().isEmpty() ) {

            *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                               << " is not sharded";

            warning() << *errMsg << endl;
            return false;
        }

        dassert( metadata->getShardVersion().equals( shardVersion ) );

        if ( !metadata->isValidKey( minKey ) || !metadata->isValidKey( maxKey ) ) {

            *errMsg = stream() << "could not merge chunks, the range "
                               << rangeToString( minKey, maxKey ) << " is not valid"
                               << " for collection " << nss.ns() << " with key pattern "
                               << metadata->getKeyPattern();

            warning() << *errMsg << endl;
            return false;
        }

        //
        // Get merged chunk information
        //

        ChunkVersion mergeVersion = metadata->getCollVersion();
        mergeVersion.incMinor();

        OwnedPointerVector<ChunkType> chunksToMerge;

        ChunkType itChunk;
        itChunk.setMin( minKey );
        itChunk.setMax( minKey );
        itChunk.setNS( nss.ns() );
        itChunk.setShard( shardingState.getShardName() );

        while ( itChunk.getMax().woCompare( maxKey ) < 0 &&
                metadata->getNextChunk( itChunk.getMax(), &itChunk ) ) {
            auto_ptr<ChunkType> saved( new ChunkType );
            itChunk.cloneTo( saved.get() );
            chunksToMerge.mutableVector().push_back( saved.release() );
        }

        if ( chunksToMerge.empty() ) {

            *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                               << " range starting at " << minKey
                               << " and ending at " << maxKey
                               << " does not belong to shard " << shardingState.getShardName();

            warning() << *errMsg << endl;
            return false;
        }

        //
        // Validate the range starts and ends at chunks and has no holes, error if not valid
        //

        BSONObj firstDocMin = ( *chunksToMerge.begin() )->getMin();
        BSONObj firstDocMax = ( *chunksToMerge.begin() )->getMax();
        // minKey is inclusive
        bool minKeyInRange = rangeContains( firstDocMin, firstDocMax, minKey );

        if ( !minKeyInRange ) {

            *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                               << " range starting at " << minKey
                               << " does not belong to shard " << shardingState.getShardName();

            warning() << *errMsg << endl;
            return false;
        }

        BSONObj lastDocMin = ( *chunksToMerge.rbegin() )->getMin();
        BSONObj lastDocMax = ( *chunksToMerge.rbegin() )->getMax();
        // maxKey is exclusive
        bool maxKeyInRange = lastDocMin.woCompare( maxKey ) < 0 &&
                lastDocMax.woCompare( maxKey ) >= 0;

        if ( !maxKeyInRange ) {
            *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                               << " range ending at " << maxKey
                               << " does not belong to shard " << shardingState.getShardName();

            warning() << *errMsg << endl;
            return false;
        }

        bool validRangeStartKey = firstDocMin.woCompare( minKey ) == 0;
        bool validRangeEndKey = lastDocMax.woCompare( maxKey ) == 0;

        if ( !validRangeStartKey || !validRangeEndKey ) {

            *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                               << " does not contain a chunk "
                               << ( !validRangeStartKey ? "starting at " + minKey.toString() : "" )
                               << ( !validRangeStartKey && !validRangeEndKey ? " or " : "" )
                               << ( !validRangeEndKey ? "ending at " + maxKey.toString() : "" );

            warning() << *errMsg << endl;
            return false;
        }

        if ( chunksToMerge.size() == 1 ) {

            *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                               << " already contains chunk for " << rangeToString( minKey, maxKey );

            warning() << *errMsg << endl;
            return false;
        }

        bool holeInRange = false;

        // Look for hole in range
        ChunkType* prevChunk = *chunksToMerge.begin();
        ChunkType* nextChunk = NULL;
        for ( OwnedPointerVector<ChunkType>::const_iterator it = chunksToMerge.begin();
                it != chunksToMerge.end(); ++it ) {
            if ( it == chunksToMerge.begin() ) continue;

            nextChunk = *it;
            if ( prevChunk->getMax().woCompare( nextChunk->getMin() ) != 0 ) {
                holeInRange = true;
                break;
            }
            prevChunk = nextChunk;
        }

        if ( holeInRange ) {

            dassert( NULL != nextChunk );
            *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                               << " has a hole in the range " << rangeToString( minKey, maxKey )
                               << " at " << rangeToString( prevChunk->getMax(),
                                                           nextChunk->getMin() );

            warning() << *errMsg << endl;
            return false;
        }

        //
        // Run apply ops command
        //

        BSONObj applyOpsCmd = buildApplyOpsCmd( chunksToMerge,
                                                shardVersion,
                                                mergeVersion );

        bool ok;
        BSONObj result;
        try {
            ScopedDbConnection conn( configLoc, 30.0 );
            ok = conn->runCommand( "config", applyOpsCmd, result );
            if ( !ok ) *errMsg = result.toString();
            conn.done();
        }
        catch( const DBException& ex ) {
            ok = false;
            *errMsg = ex.toString();
        }

        if ( !ok ) {
            *errMsg = stream() << "could not merge chunks for " << nss.ns()
                               << ", writing to config failed" << causedBy( errMsg );

            warning() << *errMsg << endl;
            return false;
        }

        //
        // Install merged chunk metadata
        //

        {
            Lock::DBLock writeLk(txn->lockState(), nss.db(), newlm::MODE_X);
            shardingState.mergeChunks(txn, nss.ns(), minKey, maxKey, mergeVersion);
        }

        //
        // Log change
        //

        BSONObj mergeLogEntry = buildMergeLogEntry( chunksToMerge,
                                                    shardVersion,
                                                    mergeVersion );

        configServer.logChange( "merge", nss.ns(), mergeLogEntry );

        return true;
    }
Example #30
0
    void ReplSetImpl::loadConfig(OperationContext* txn) {
        startupStatus = LOADINGCONFIG;
        startupStatusMsg.set("loading " + rsConfigNs + " config (LOADINGCONFIG)");
        LOG(1) << "loadConfig() " << rsConfigNs << endl;

        while (1) {
            try {
                OwnedPointerVector<ReplSetConfig> configs;
                try {
                    configs.mutableVector().push_back(ReplSetConfig::makeDirect(txn));
                }
                catch (DBException& e) {
                    log() << "replSet exception loading our local replset configuration object : "
                          << e.toString() << rsLog;
                }
                for (vector<HostAndPort>::const_iterator i = _seeds->begin();
                        i != _seeds->end();
                        i++) {
                    try {
                        configs.mutableVector().push_back(ReplSetConfig::make(txn, *i));
                    }
                    catch (DBException& e) {
                        log() << "replSet exception trying to load config from " << *i
                              << " : " << e.toString() << rsLog;
                    }
                }
                ReplSettings& replSettings = getGlobalReplicationCoordinator()->getSettings();
                {
                    scoped_lock lck(replSettings.discoveredSeeds_mx);
                    if (replSettings.discoveredSeeds.size() > 0) {
                        for (set<string>::iterator i = replSettings.discoveredSeeds.begin(); 
                             i != replSettings.discoveredSeeds.end(); 
                             i++) {
                            try {
                                configs.mutableVector().push_back(
                                                            ReplSetConfig::make(txn, HostAndPort(*i)));
                            }
                            catch (DBException&) {
                                LOG(1) << "replSet exception trying to load config from discovered "
                                          "seed " << *i << rsLog;
                                replSettings.discoveredSeeds.erase(*i);
                            }
                        }
                    }
                }

                if (!replSettings.reconfig.isEmpty()) {
                    try {
                        configs.mutableVector().push_back(ReplSetConfig::make(txn, replSettings.reconfig,
                                                                       true));
                    }
                    catch (DBException& re) {
                        log() << "replSet couldn't load reconfig: " << re.what() << rsLog;
                        replSettings.reconfig = BSONObj();
                    }
                }

                int nok = 0;
                int nempty = 0;
                for (vector<ReplSetConfig*>::iterator i = configs.mutableVector().begin();
                     i != configs.mutableVector().end(); i++) {
                    if ((*i)->ok())
                        nok++;
                    if ((*i)->empty())
                        nempty++;
                }
                if (nok == 0) {

                    if (nempty == (int) configs.mutableVector().size()) {
                        startupStatus = EMPTYCONFIG;
                        startupStatusMsg.set("can't get " + rsConfigNs +
                                             " config from self or any seed (EMPTYCONFIG)");
                        log() << "replSet can't get " << rsConfigNs
                              << " config from self or any seed (EMPTYCONFIG)" << rsLog;
                        static unsigned once;
                        if (++once == 1) {
                            log() << "replSet info you may need to run replSetInitiate -- rs.initia"
                                     "te() in the shell -- if that is not already done" << rsLog;
                        }
                        if (_seeds->size() == 0) {
                            LOG(1) << "replSet info no seed hosts were specified on the --replSet "
                                      "command line" << rsLog;
                        }
                    }
                    else {
                        startupStatus = EMPTYUNREACHABLE;
                        startupStatusMsg.set("can't currently get " + rsConfigNs +
                                             " config from self or any seed (EMPTYUNREACHABLE)");
                        log() << "replSet can't get " << rsConfigNs
                              << " config from self or any seed (yet)" << rsLog;
                    }

                    sleepsecs(1);
                    continue;
                }

                if (!_loadConfigFinish(txn, configs.mutableVector())) {
                    log() << "replSet info Couldn't load config yet. Sleeping 3 sec and will try "
                             "again." << rsLog;
                    sleepsecs(3);
                    continue;
                }
            }
            catch (DBException& e) {
                startupStatus = BADCONFIG;
                startupStatusMsg.set("replSet error loading set config (BADCONFIG)");
                log() << "replSet error loading configurations " << e.toString() << rsLog;
                log() << "replSet error replication will not start" << rsLog;
                sethbmsg("error loading set config");
                fassertFailedNoTrace(18754);
                throw;
            }
            break;
        }
        startupStatusMsg.set("? started");
        startupStatus = STARTED;
    }