bool BigSimplePolygon::Contains(const S2Polyline& line) const {
    //
    // A line is contained within a loop if the result of subtracting the loop from the line is
    // nothing.
    //
    // Also, a line is contained within a loop if the result of clipping the line to the
    // complement of the loop is nothing.
    //
    // If we can't subtract the loop itself using S2, we clip (intersect) to the inverse. Every
    // point in S2 is contained in exactly one of these loops.
    //
    // TODO: Polygon borders are actually kind of weird, and this is somewhat inconsistent with
    // Intersects(). A point might Intersect() a boundary exactly, but not be Contain()ed
    // within the Polygon. Think the right thing to do here is custom intersection functions.
    //
    const S2Polygon& polyBorder = GetPolygonBorder();

    OwnedPointerVector<S2Polyline> clippedOwned;
    vector<S2Polyline*>& clipped = clippedOwned.mutableVector();

    if (_isNormalized) {
        // Polygon border is the same as the loop
        polyBorder.SubtractFromPolyline(&line, &clipped);
        return clipped.size() == 0;
    } else {
        // Polygon border is the complement of the loop
        polyBorder.IntersectWithPolyline(&line, &clipped);
        return clipped.size() == 0;
    }
}
PlanCacheEntry* PlanCacheEntry::clone() const {
    OwnedPointerVector<QuerySolution> solutions;
    for (size_t i = 0; i < plannerData.size(); ++i) {
        QuerySolution* qs = new QuerySolution();
        qs->cacheData.reset(plannerData[i]->clone());
        solutions.mutableVector().push_back(qs);
    }
    PlanCacheEntry* entry = new PlanCacheEntry(solutions.vector(), decision->clone());
    entry->backupSoln = backupSoln;

    // Copy query shape.
    entry->query = query.getOwned();
    entry->sort = sort.getOwned();
    entry->projection = projection.getOwned();

    // Copy performance stats.
    for (size_t i = 0; i < feedback.size(); ++i) {
        PlanCacheEntryFeedback* fb = new PlanCacheEntryFeedback();
        fb->stats.reset(feedback[i]->stats->clone());
        fb->score = feedback[i]->score;
        entry->feedback.push_back(fb);
    }
    entry->averageScore = averageScore;
    entry->stddevScore = stddevScore;
    return entry;
}
/**
 * Currently the allowable shard keys are either
 * i) a hashed single field, e.g. { a : "hashed" }, or
 * ii) a compound list of ascending, potentially-nested field paths, e.g. { a : 1 , b.c : 1 }
 */
static vector<FieldRef*> parseShardKeyPattern(const BSONObj& keyPattern) {
    OwnedPointerVector<FieldRef> parsedPaths;
    static const vector<FieldRef*> empty;

    BSONObjIterator patternIt(keyPattern);
    while (patternIt.more()) {
        BSONElement patternEl = patternIt.next();
        parsedPaths.push_back(new FieldRef(patternEl.fieldNameStringData()));
        const FieldRef& patternPath = *parsedPaths.back();

        // Empty path
        if (patternPath.numParts() == 0)
            return empty;

        // Extra "." in path?
        if (patternPath.dottedField() != patternEl.fieldNameStringData())
            return empty;

        // Empty parts of the path, ".."?
        for (size_t i = 0; i < patternPath.numParts(); ++i) {
            if (patternPath.getPart(i).size() == 0)
                return empty;
        }

        // Numeric and ascending (1.0), or "hashed" and single field
        if (!patternEl.isNumber()) {
            if (keyPattern.nFields() != 1 || !isHashedPatternEl(patternEl))
                return empty;
        } else if (patternEl.numberInt() != 1) {
            return empty;
        }
    }

    return parsedPaths.release();
}
vector<RecordIterator*> SimpleRecordStoreV1::getManyIterators( OperationContext* txn ) const {
    OwnedPointerVector<RecordIterator> iterators;
    const Extent* ext;
    for (DiskLoc extLoc = details()->firstExtent(txn); !extLoc.isNull(); extLoc = ext->xnext) {
        ext = _getExtent(txn, extLoc);
        if (ext->firstRecord.isNull())
            continue;
        iterators.push_back(
            new RecordStoreV1Base::IntraExtentIterator(txn, ext->firstRecord, this));
    }

    return iterators.release();
}
vector<PlanStageStats*> MultiPlanStage::generateCandidateStats() {
    OwnedPointerVector<PlanStageStats> candidateStats;

    for (size_t ix = 0; ix < _candidates.size(); ix++) {
        if (ix == (size_t)_bestPlanIdx) {
            continue;
        }
        if (ix == (size_t)_backupPlanIdx) {
            continue;
        }

        PlanStageStats* stats = _candidates[ix].root->getStats();
        candidateStats.push_back(stats);
    }

    return candidateStats.release();
}
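// Illustrative sketch (not from the original source): generateCandidateStats() above fills an
// OwnedPointerVector and then release()s it, handing ownership of every raw pointer to the
// caller. The hypothetical caller below re-adopts the result immediately so the stats objects
// are deleted even on an early return; the function name and log message are assumptions made
// only for illustration.
void logCandidatePlanCount(MultiPlanStage* mps) {
    // The converting constructor takes ownership of the returned raw pointers.
    OwnedPointerVector<PlanStageStats> candidateStats(mps->generateCandidateStats());
    log() << "rejected candidate plans: " << candidateStats.size();
}  // candidateStats deletes every PlanStageStats here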
const S2Polygon& BigSimplePolygon::GetPolygonBorder() const {
    if (_borderPoly)
        return *_borderPoly;

    unique_ptr<S2Loop> cloned(_loop->Clone());

    // Any loop in the polygon should cover at most a hemisphere (area 2*Pi); Normalize()
    // inverts the loop if necessary.
    cloned->Normalize();

    OwnedPointerVector<S2Loop> loops;
    loops.mutableVector().push_back(cloned.release());
    _borderPoly.reset(new S2Polygon(&loops.mutableVector()));
    return *_borderPoly;
}
IndexBounds ChunkManager::getIndexBoundsForQuery(const BSONObj& key,
                                                 const CanonicalQuery& canonicalQuery) {
    // $text is not allowed in planning since we don't have text index on mongos.
    //
    // TODO: Treat $text query as a no-op in planning. So with shard key {a: 1},
    // the query { a: 2, $text: { ... } } will only target to {a: 2}.
    if (QueryPlannerCommon::hasNode(canonicalQuery.root(), MatchExpression::TEXT)) {
        IndexBounds bounds;
        IndexBoundsBuilder::allValuesBounds(key, &bounds);  // [minKey, maxKey]
        return bounds;
    }

    // Consider shard key as an index
    string accessMethod = IndexNames::findPluginName(key);
    dassert(accessMethod == IndexNames::BTREE || accessMethod == IndexNames::HASHED);

    // Use query framework to generate index bounds
    QueryPlannerParams plannerParams;
    // Must use "shard key" index
    plannerParams.options = QueryPlannerParams::NO_TABLE_SCAN;
    IndexEntry indexEntry(key,
                          accessMethod,
                          false /* multiKey */,
                          false /* sparse */,
                          false /* unique */,
                          "shardkey",
                          NULL /* filterExpr */,
                          BSONObj());
    plannerParams.indices.push_back(indexEntry);

    OwnedPointerVector<QuerySolution> solutions;
    Status status = QueryPlanner::plan(canonicalQuery, plannerParams, &solutions.mutableVector());
    uassert(status.code(), status.reason(), status.isOK());

    IndexBounds bounds;

    for (vector<QuerySolution*>::const_iterator it = solutions.begin();
         bounds.size() == 0 && it != solutions.end();
         it++) {
        // Try next solution if we failed to generate index bounds, i.e. bounds.size() == 0
        bounds = collapseQuerySolution((*it)->root.get());
    }

    if (bounds.size() == 0) {
        // We cannot plan the query without collection scan, so target to all shards.
        IndexBoundsBuilder::allValuesBounds(key, &bounds);  // [minKey, maxKey]
    }
    return bounds;
}
void Strategy::writeOp(OperationContext* txn, int op, Request& request) {
    // make sure we have a last error
    dassert(&LastError::get(cc()));

    OwnedPointerVector<BatchedCommandRequest> commandRequestsOwned;
    vector<BatchedCommandRequest*>& commandRequests = commandRequestsOwned.mutableVector();

    msgToBatchRequests(request.m(), &commandRequests);

    for (vector<BatchedCommandRequest*>::iterator it = commandRequests.begin();
         it != commandRequests.end();
         ++it) {
        // Multiple commands registered to last error as multiple requests
        if (it != commandRequests.begin())
            LastError::get(cc()).startRequest();

        BatchedCommandRequest* commandRequest = *it;

        // Adjust namespaces for command
        NamespaceString fullNS(commandRequest->getNS());
        string cmdNS = fullNS.getCommandNS();
        // We only pass in collection name to command
        commandRequest->setNS(fullNS);

        BSONObjBuilder builder;
        BSONObj requestBSON = commandRequest->toBSON();

        {
            // Disable the last error object for the duration of the write cmd
            LastError::Disabled disableLastError(&LastError::get(cc()));
            Command::runAgainstRegistered(txn, cmdNS.c_str(), requestBSON, builder, 0);
        }

        BatchedCommandResponse commandResponse;
        bool parsed = commandResponse.parseBSON(builder.done(), NULL);
        (void)parsed;  // for compile
        dassert(parsed && commandResponse.isValid(NULL));

        // Populate the lastError object based on the write response
        LastError::get(cc()).reset();
        bool hadError =
            batchErrorToLastError(*commandRequest, commandResponse, &LastError::get(cc()));

        // Check if this is an ordered batch and we had an error which should stop processing
        if (commandRequest->getOrdered() && hadError)
            break;
    }
}
BSONObj buildMergeLogEntry( const OwnedPointerVector<ChunkType>& chunksToMerge,
                            const ChunkVersion& currShardVersion,
                            const ChunkVersion& newMergedVersion ) {

    BSONObjBuilder logDetailB;

    BSONArrayBuilder mergedB( logDetailB.subarrayStart( "merged" ) );

    for ( OwnedPointerVector<ChunkType>::const_iterator it = chunksToMerge.begin();
          it != chunksToMerge.end(); ++it ) {
        ChunkType* chunkToMerge = *it;
        mergedB.append( chunkToMerge->toBSON() );
    }

    mergedB.done();

    currShardVersion.addToBSON( logDetailB, "prevShardVersion" );
    newMergedVersion.addToBSON( logDetailB, "mergedVersion" );

    return logDetailB.obj();
}
BSONObj generateSection(OperationContext* txn, const BSONElement& configElement) const {
    RangeDeleter* deleter = getDeleter();
    if (!deleter) {
        return BSONObj();
    }

    BSONObjBuilder result;

    OwnedPointerVector<DeleteJobStats> statsList;
    deleter->getStatsHistory(&statsList.mutableVector());
    BSONArrayBuilder oldStatsBuilder;
    for (OwnedPointerVector<DeleteJobStats>::const_iterator it = statsList.begin();
         it != statsList.end();
         ++it) {
        BSONObjBuilder entryBuilder;
        entryBuilder.append("deletedDocs", (*it)->deletedDocCount);

        if ((*it)->queueEndTS > Date_t()) {
            entryBuilder.append("queueStart", (*it)->queueStartTS);
            entryBuilder.append("queueEnd", (*it)->queueEndTS);
        }

        if ((*it)->deleteEndTS > Date_t()) {
            entryBuilder.append("deleteStart", (*it)->deleteStartTS);
            entryBuilder.append("deleteEnd", (*it)->deleteEndTS);

            if ((*it)->waitForReplEndTS > Date_t()) {
                entryBuilder.append("waitForReplStart", (*it)->waitForReplStartTS);
                entryBuilder.append("waitForReplEnd", (*it)->waitForReplEndTS);
            }
        }

        oldStatsBuilder.append(entryBuilder.obj());
    }
    result.append("lastDeleteStats", oldStatsBuilder.arr());

    return result.obj();
}
// static
Status ListFilters::list(const QuerySettings& querySettings, BSONObjBuilder* bob) {
    invariant(bob);

    // Format of BSON result:
    //
    // {
    //     filters: [
    //         {
    //             query: <query>,
    //             sort: <sort>,
    //             projection: <projection>,
    //             indexes: [<index1>, <index2>, <index3>, ...]
    //         },
    //         ...
    //     ]
    // }
    BSONArrayBuilder hintsBuilder(bob->subarrayStart("filters"));
    OwnedPointerVector<AllowedIndexEntry> entries;
    entries.mutableVector() = querySettings.getAllAllowedIndices();
    for (vector<AllowedIndexEntry*>::const_iterator i = entries.begin(); i != entries.end(); ++i) {
        AllowedIndexEntry* entry = *i;
        invariant(entry);

        BSONObjBuilder hintBob(hintsBuilder.subobjStart());
        hintBob.append("query", entry->query);
        hintBob.append("sort", entry->sort);
        hintBob.append("projection", entry->projection);
        BSONArrayBuilder indexesBuilder(hintBob.subarrayStart("indexes"));
        for (vector<BSONObj>::const_iterator j = entry->indexKeyPatterns.begin();
             j != entry->indexKeyPatterns.end();
             ++j) {
            const BSONObj& index = *j;
            indexesBuilder.append(index);
        }
        indexesBuilder.doneFast();
    }
    hintsBuilder.doneFast();
    return Status::OK();
}
/*static*/ int MongoFile::_flushAll(bool sync) {
    if (!sync) {
        int num = 0;
        LockMongoFilesShared lk;
        for (set<MongoFile*>::iterator i = mmfiles.begin(); i != mmfiles.end(); i++) {
            num++;
            MongoFile* mmf = *i;
            if (!mmf)
                continue;

            mmf->flush(sync);
        }
        return num;
    }

    // want to do it sync

    // get a thread-safe Flushable object for each file first in a single lock
    // so that we can iterate and flush without doing any locking here
    OwnedPointerVector<Flushable> thingsToFlushWrapper;
    vector<Flushable*>& thingsToFlush = thingsToFlushWrapper.mutableVector();
    {
        LockMongoFilesShared lk;
        for (set<MongoFile*>::iterator i = mmfiles.begin(); i != mmfiles.end(); i++) {
            MongoFile* mmf = *i;
            if (!mmf)
                continue;
            thingsToFlush.push_back(mmf->prepareFlush());
        }
    }

    for (size_t i = 0; i < thingsToFlush.size(); i++) {
        thingsToFlush[i]->flush();
    }

    return thingsToFlush.size();
}
BSONObj buildApplyOpsCmd( const OwnedPointerVector<ChunkType>& chunksToMerge,
                          const ChunkVersion& currShardVersion,
                          const ChunkVersion& newMergedVersion ) {

    BSONObjBuilder applyOpsCmdB;
    BSONArrayBuilder updatesB( applyOpsCmdB.subarrayStart( "applyOps" ) );

    // The chunk we'll be "expanding" is the first chunk
    const ChunkType* chunkToMerge = *chunksToMerge.begin();

    // Fill in details not tracked by metadata
    ChunkType mergedChunk;
    chunkToMerge->cloneTo( &mergedChunk );
    mergedChunk.setName( Chunk::genID( chunkToMerge->getNS(), chunkToMerge->getMin() ) );
    mergedChunk.setMax( ( *chunksToMerge.vector().rbegin() )->getMax() );
    mergedChunk.setVersion( newMergedVersion );

    updatesB.append( buildOpMergeChunk( mergedChunk ) );

    // Don't remove chunk we're expanding
    OwnedPointerVector<ChunkType>::const_iterator it = chunksToMerge.begin();
    for ( ++it; it != chunksToMerge.end(); ++it ) {
        ChunkType* chunkToMerge = *it;
        chunkToMerge->setName( Chunk::genID( chunkToMerge->getNS(), chunkToMerge->getMin() ) );
        updatesB.append( buildOpRemoveChunk( *chunkToMerge ) );
    }

    updatesB.done();

    applyOpsCmdB.append( "preCondition",
                         buildOpPrecond( chunkToMerge->getNS(),
                                         chunkToMerge->getShard(),
                                         currShardVersion ) );

    return applyOpsCmdB.obj();
}
StatusWith<CompactStats> Collection::compact( const CompactOptions* compactOptions ) {

    if ( isCapped() )
        return StatusWith<CompactStats>( ErrorCodes::BadValue,
                                         "cannot compact capped collection" );

    if ( _indexCatalog.numIndexesInProgress() )
        return StatusWith<CompactStats>( ErrorCodes::BadValue,
                                         "cannot compact when indexes in progress" );

    NamespaceDetails* d = details();

    // this is a big job, so might as well make things tidy before we start just to be nice.
    getDur().commitIfNeeded();

    list<DiskLoc> extents;
    for( DiskLoc L = d->firstExtent(); !L.isNull(); L = L.ext()->xnext )
        extents.push_back(L);
    log() << "compact " << extents.size() << " extents" << endl;

    // same data, but might perform a little different after compact?
    _infoCache.reset();

    vector<BSONObj> indexSpecs;
    {
        IndexCatalog::IndexIterator ii( _indexCatalog.getIndexIterator( false ) );
        while ( ii.more() ) {
            IndexDescriptor* descriptor = ii.next();
            indexSpecs.push_back( _compactAdjustIndexSpec( descriptor->infoObj() ) );
        }
    }

    log() << "compact orphan deleted lists" << endl;
    d->orphanDeletedList();

    // Start over from scratch with our extent sizing and growth
    d->setLastExtentSize( 0 );

    // before dropping indexes, at least make sure we can allocate one extent!
    if ( allocateSpaceForANewRecord( _ns.ns().c_str(), d, Record::HeaderSize+1, false ).isNull() ) {
        return StatusWith<CompactStats>( ErrorCodes::InternalError,
                                         "compact error no space available to allocate" );
    }

    // note that the drop indexes call also invalidates all clientcursors for the namespace,
    // which is important and wanted here
    log() << "compact dropping indexes" << endl;
    Status status = _indexCatalog.dropAllIndexes( true );
    if ( !status.isOK() ) {
        return StatusWith<CompactStats>( status );
    }

    getDur().commitIfNeeded();

    CompactStats stats;

    OwnedPointerVector<IndexCatalog::IndexBuildBlock> indexBuildBlocks;
    vector<IndexAccessMethod*> indexesToInsertTo;
    vector< std::pair<IndexAccessMethod*,IndexAccessMethod*> > bulkToCommit;
    for ( size_t i = 0; i < indexSpecs.size(); i++ ) {
        killCurrentOp.checkForInterrupt(false);
        BSONObj info = indexSpecs[i];
        info = _compactAdjustIndexSpec( info );
        info = _indexCatalog.fixIndexSpec( info );
        auto_ptr<IndexCatalog::IndexBuildBlock> block( new IndexCatalog::IndexBuildBlock( this, info ) );
        Status status = block->init();
        if ( !status.isOK() )
            return StatusWith<CompactStats>(status);

        IndexAccessMethod* accessMethod = block->getEntry()->accessMethod();
        status = accessMethod->initializeAsEmpty();
        if ( !status.isOK() )
            return StatusWith<CompactStats>(status);

        IndexAccessMethod* bulk = accessMethod->initiateBulk();
        if ( bulk ) {
            indexesToInsertTo.push_back( bulk );
            bulkToCommit.push_back( std::pair<IndexAccessMethod*,IndexAccessMethod*>( accessMethod, bulk ) );
        }
        else {
            indexesToInsertTo.push_back( accessMethod );
        }

        indexBuildBlocks.mutableVector().push_back( block.release() );
    }

    // reset data size and record counts to 0 for this namespace
    // as we're about to tally them up again for each new extent
    d->setStats( 0, 0 );

    ProgressMeterHolder pm(cc().curop()->setMessage("compact extent",
                                                    "Extent Compacting Progress",
                                                    extents.size()));

    int extentNumber = 0;
    for( list<DiskLoc>::iterator i = extents.begin(); i != extents.end(); i++ ) {
        _compactExtent(*i, extentNumber++, indexesToInsertTo, compactOptions, &stats );
        pm.hit();
    }

    verify( d->firstExtent().ext()->xprev.isNull() );

    // indexes will do their own progress meter?
    pm.finished();

    log() << "starting index commits";

    for ( size_t i = 0; i < bulkToCommit.size(); i++ ) {
        bulkToCommit[i].first->commitBulk( bulkToCommit[i].second, false, NULL );
    }

    for ( size_t i = 0; i < indexBuildBlocks.size(); i++ ) {
        IndexCatalog::IndexBuildBlock* block = indexBuildBlocks.mutableVector()[i];
        block->success();
    }

    return StatusWith<CompactStats>( stats );
}
static StatusWith<double> computeGeoNearDistance(const GeoNearParams& nearParams,
                                                 WorkingSetMember* member) {
    //
    // Generic GeoNear distance computation
    // Distances are computed by projecting the stored geometry into the query CRS, and
    // computing distance in that CRS.
    //

    // Must have an object in order to get geometry out of it.
    invariant(member->hasObj());

    CRS queryCRS = nearParams.nearQuery.centroid.crs;

    // Extract all the geometries out of this document for the near query
    OwnedPointerVector<StoredGeometry> geometriesOwned;
    vector<StoredGeometry*>& geometries = geometriesOwned.mutableVector();
    extractGeometries(member->obj, nearParams.nearQuery.field, &geometries);

    // Compute the minimum distance of all the geometries in the document
    double minDistance = -1;
    BSONObj minDistanceObj;
    for (vector<StoredGeometry*>::iterator it = geometries.begin(); it != geometries.end();
         ++it) {
        StoredGeometry& stored = **it;

        // NOTE: A stored document with STRICT_SPHERE CRS is treated as a malformed document
        // and ignored. Since GeoNear requires an index, there's no stored STRICT_SPHERE shape.
        // So we don't check it here.

        // NOTE: For now, we're sure that if we get this far in the query we'll have an
        // appropriate index which validates the type of geometry we're pulling back here.
        // TODO: It may make sense to change our semantics and, by default, only return
        // shapes in the same CRS from $geoNear.
        if (!stored.geometry.supportsProject(queryCRS))
            continue;
        stored.geometry.projectInto(queryCRS);

        double nextDistance = stored.geometry.minDistance(nearParams.nearQuery.centroid);

        if (minDistance < 0 || nextDistance < minDistance) {
            minDistance = nextDistance;
            minDistanceObj = stored.element.Obj();
        }
    }

    if (minDistance < 0) {
        // No distance to report
        return StatusWith<double>(-1);
    }

    if (nearParams.addDistMeta) {
        if (nearParams.nearQuery.unitsAreRadians) {
            // Hack for nearSphere
            // TODO: Remove nearSphere?
            invariant(SPHERE == queryCRS);
            member->addComputed(new GeoDistanceComputedData(minDistance / kRadiusOfEarthInMeters));
        } else {
            member->addComputed(new GeoDistanceComputedData(minDistance));
        }
    }

    if (nearParams.addPointMeta) {
        member->addComputed(new GeoNearPointComputedData(minDistanceObj));
    }

    return StatusWith<double>(minDistance);
}
Status BatchWriteOp::targetBatch(OperationContext* txn, const NSTargeter& targeter, bool recordTargetErrors, vector<TargetedWriteBatch*>* targetedBatches) { // // Targeting of unordered batches is fairly simple - each remaining write op is targeted, // and each of those targeted writes are grouped into a batch for a particular shard // endpoint. // // Targeting of ordered batches is a bit more complex - to respect the ordering of the // batch, we can only send: // A) a single targeted batch to one shard endpoint // B) multiple targeted batches, but only containing targeted writes for a single write op // // This means that any multi-shard write operation must be targeted and sent one-by-one. // Subsequent single-shard write operations can be batched together if they go to the same // place. // // Ex: ShardA : { skey : a->k }, ShardB : { skey : k->z } // // Ordered insert batch of: [{ skey : a }, { skey : b }, { skey : x }] // broken into: // [{ skey : a }, { skey : b }], // [{ skey : x }] // // Ordered update Batch of : // [{ skey : a }{ $push }, // { skey : b }{ $push }, // { skey : [c, x] }{ $push }, // { skey : y }{ $push }, // { skey : z }{ $push }] // broken into: // [{ skey : a }, { skey : b }], // [{ skey : [c,x] }], // [{ skey : y }, { skey : z }] // const bool ordered = _clientRequest->getOrdered(); TargetedBatchMap batchMap; TargetedBatchSizeMap batchSizes; int numTargetErrors = 0; size_t numWriteOps = _clientRequest->sizeWriteOps(); for (size_t i = 0; i < numWriteOps; ++i) { WriteOp& writeOp = _writeOps[i]; // Only target _Ready ops if (writeOp.getWriteState() != WriteOpState_Ready) continue; // // Get TargetedWrites from the targeter for the write operation // // TargetedWrites need to be owned once returned OwnedPointerVector<TargetedWrite> writesOwned; vector<TargetedWrite*>& writes = writesOwned.mutableVector(); Status targetStatus = writeOp.targetWrites(txn, targeter, &writes); if (!targetStatus.isOK()) { WriteErrorDetail targetError; buildTargetError(targetStatus, &targetError); if (!recordTargetErrors) { // Cancel current batch state with an error cancelBatches(targetError, _writeOps, &batchMap); dassert(batchMap.empty()); return targetStatus; } else if (!ordered || batchMap.empty()) { // Record an error for this batch writeOp.setOpError(targetError); ++numTargetErrors; if (ordered) return Status::OK(); continue; } else { dassert(ordered && !batchMap.empty()); // Send out what we have, but don't record an error yet, since there may be an // error in the writes before this point. writeOp.cancelWrites(&targetError); break; } } // // If ordered and we have a previous endpoint, make sure we don't need to send these // targeted writes to any other endpoints. 
// if (ordered && !batchMap.empty()) { dassert(batchMap.size() == 1u); if (isNewBatchRequired(writes, batchMap)) { writeOp.cancelWrites(NULL); break; } } // // If this write will push us over some sort of size limit, stop targeting // int writeSizeBytes = getWriteSizeBytes(writeOp); if (wouldMakeBatchesTooBig(writes, writeSizeBytes, batchSizes)) { invariant(!batchMap.empty()); writeOp.cancelWrites(NULL); break; } // // Targeting went ok, add to appropriate TargetedBatch // for (vector<TargetedWrite*>::iterator it = writes.begin(); it != writes.end(); ++it) { TargetedWrite* write = *it; TargetedBatchMap::iterator batchIt = batchMap.find(&write->endpoint); TargetedBatchSizeMap::iterator batchSizeIt = batchSizes.find(&write->endpoint); if (batchIt == batchMap.end()) { TargetedWriteBatch* newBatch = new TargetedWriteBatch(write->endpoint); batchIt = batchMap.insert(make_pair(&newBatch->getEndpoint(), newBatch)).first; batchSizeIt = batchSizes.insert(make_pair(&newBatch->getEndpoint(), BatchSize())).first; } TargetedWriteBatch* batch = batchIt->second; BatchSize& batchSize = batchSizeIt->second; ++batchSize.numOps; batchSize.sizeBytes += writeSizeBytes; batch->addWrite(write); } // Relinquish ownership of TargetedWrites, now the TargetedBatches own them writesOwned.mutableVector().clear(); // // Break if we're ordered and we have more than one endpoint - later writes cannot be // enforced as ordered across multiple shard endpoints. // if (ordered && batchMap.size() > 1u) break; } // // Send back our targeted batches // for (TargetedBatchMap::iterator it = batchMap.begin(); it != batchMap.end(); ++it) { TargetedWriteBatch* batch = it->second; if (batch->getWrites().empty()) continue; // Remember targeted batch for reporting _targeted.insert(batch); // Send the handle back to caller targetedBatches->push_back(batch); } return Status::OK(); }
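// Illustrative sketch (not from the original source): targetBatch() above uses a common hand-off
// idiom - fill the wrapper through mutableVector(), let a longer-lived container adopt the
// pointers, then call mutableVector().clear() so the wrapper does not delete them a second time.
// The Widget and WidgetRegistry types below are hypothetical stand-ins for TargetedWrite and
// TargetedWriteBatch, used only to show the pattern in isolation.
#include <vector>

struct Widget {
    int id;
};

struct WidgetRegistry {
    std::vector<Widget*> adopted;  // whoever owns the registry must delete these
    void adopt(Widget* w) {
        adopted.push_back(w);
    }
};

void fillAndHandOff(WidgetRegistry* registry) {
    OwnedPointerVector<Widget> pending;
    for (int i = 0; i < 3; ++i) {
        pending.push_back(new Widget());  // 'pending' owns each Widget for now
    }
    // If anything failed before this point, 'pending' would delete the Widgets on destruction.
    for (size_t i = 0; i < pending.size(); ++i) {
        registry->adopt(pending.mutableVector()[i]);  // the registry takes over this Widget
    }
    pending.mutableVector().clear();  // relinquish ownership; the destructor now deletes nothing
}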
virtual bool run(OperationContext* txn,
                 const string& dbname,
                 BSONObj& cmdObj,
                 int options,
                 string& errmsg,
                 BSONObjBuilder& result,
                 bool fromRepl = false ) {

    NamespaceString ns( dbname, cmdObj[name].String() );

    AutoGetCollectionForRead ctx(txn, ns.ns());

    Collection* collection = ctx.getCollection();
    if ( !collection )
        return appendCommandStatus( result,
                                    Status( ErrorCodes::NamespaceNotFound,
                                            str::stream()
                                                << "ns does not exist: " << ns.ns() ) );

    size_t numCursors = static_cast<size_t>( cmdObj["numCursors"].numberInt() );

    if ( numCursors == 0 || numCursors > 10000 )
        return appendCommandStatus( result,
                                    Status( ErrorCodes::BadValue,
                                            str::stream()
                                                << "numCursors has to be between 1 and 10000"
                                                << " was: " << numCursors ) );

    OwnedPointerVector<RecordIterator> iterators(collection->getManyIterators(txn));

    if (iterators.size() < numCursors) {
        numCursors = iterators.size();
    }

    OwnedPointerVector<PlanExecutor> execs;

    for ( size_t i = 0; i < numCursors; i++ ) {
        WorkingSet* ws = new WorkingSet();
        MultiIteratorStage* mis = new MultiIteratorStage(txn, ws, collection);

        PlanExecutor* rawExec;
        // Takes ownership of 'ws' and 'mis'.
        Status execStatus = PlanExecutor::make(txn, ws, mis, collection,
                                               PlanExecutor::YIELD_AUTO, &rawExec);
        invariant(execStatus.isOK());
        auto_ptr<PlanExecutor> curExec(rawExec);

        // The PlanExecutor was registered on construction due to the YIELD_AUTO policy.
        // We have to deregister it, as it will be registered with ClientCursor.
        curExec->deregisterExec();

        // Need to save state while yielding locks between now and getMore().
        curExec->saveState();

        execs.push_back(curExec.release());
    }

    // transfer iterators to executors using a round-robin distribution.
    // TODO consider using a common work queue once invalidation issues go away.
    for (size_t i = 0; i < iterators.size(); i++) {
        PlanExecutor* theExec = execs[i % execs.size()];
        MultiIteratorStage* mis = static_cast<MultiIteratorStage*>(theExec->getRootStage());

        // This wasn't called above as they weren't assigned yet
        iterators[i]->saveState();

        mis->addIterator(iterators.releaseAt(i));
    }

    {
        BSONArrayBuilder bucketsBuilder;
        for (size_t i = 0; i < execs.size(); i++) {
            // transfer ownership of an executor to the ClientCursor (which manages its own
            // lifetime).
            ClientCursor* cc = new ClientCursor( collection->getCursorManager(),
                                                 execs.releaseAt(i),
                                                 ns.ns() );

            BSONObjBuilder threadResult;
            appendCursorResponseObject( cc->cursorid(), ns.ns(), BSONArray(), &threadResult );
            threadResult.appendBool( "ok", 1 );

            bucketsBuilder.append( threadResult.obj() );
        }
        result.appendArray( "cursors", bucketsBuilder.obj() );
    }

    return true;
}
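// Illustrative sketch (not from the original source): releaseAt(i), used above to hand each
// RecordIterator to an executor, returns element i and nulls out that slot, so the wrapper keeps
// ownership only of elements that were never handed off. The Item and Sink types and the
// distribute() function are hypothetical names introduced purely for illustration.
#include <vector>

struct Item {
    int value;
};

struct Sink {
    std::vector<Item*> taken;  // the sink's owner becomes responsible for deleting these
    void take(Item* item) {
        taken.push_back(item);
    }
};

void distribute(Sink* sinks, size_t numSinks) {
    OwnedPointerVector<Item> items;
    for (int i = 0; i < 6; ++i) {
        items.push_back(new Item());  // 'items' owns each element
    }
    // Round-robin hand-off, mirroring how iterators are assigned to executors above.
    for (size_t i = 0; i < items.size(); ++i) {
        sinks[i % numSinks].take(items.releaseAt(i));  // slot i becomes NULL in 'items'
    }
}  // 'items' deletes only elements that were never released; deleting NULL is a no-op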
virtual bool run(OperationContext* txn, const string& dbname, BSONObj& cmdObj, int options, string& errmsg, BSONObjBuilder& result, bool fromRepl = false ) { NamespaceString ns( dbname, cmdObj[name].String() ); AutoGetCollectionForRead ctx(txn, ns.ns()); Collection* collection = ctx.getCollection(); if ( !collection ) return appendCommandStatus( result, Status( ErrorCodes::NamespaceNotFound, str::stream() << "ns does not exist: " << ns.ns() ) ); size_t numCursors = static_cast<size_t>( cmdObj["numCursors"].numberInt() ); if ( numCursors == 0 || numCursors > 10000 ) return appendCommandStatus( result, Status( ErrorCodes::BadValue, str::stream() << "numCursors has to be between 1 and 10000" << " was: " << numCursors ) ); OwnedPointerVector<RecordIterator> iterators(collection->getManyIterators(txn)); if (iterators.size() < numCursors) { numCursors = iterators.size(); } OwnedPointerVector<PlanExecutor> execs; for ( size_t i = 0; i < numCursors; i++ ) { WorkingSet* ws = new WorkingSet(); MultiIteratorStage* mis = new MultiIteratorStage(txn, ws, collection); // Takes ownership of 'ws' and 'mis'. auto_ptr<PlanExecutor> curExec(new PlanExecutor(txn, ws, mis, collection)); // Each of the plan executors should yield automatically. We pass "false" to // indicate that 'curExec' should not register itself, as it will get registered // by ClientCursor instead. curExec->setYieldPolicy(PlanExecutor::YIELD_AUTO, false); // Need to save state while yielding locks between now and newGetMore. curExec->saveState(); execs.push_back(curExec.release()); } // transfer iterators to executors using a round-robin distribution. // TODO consider using a common work queue once invalidation issues go away. for (size_t i = 0; i < iterators.size(); i++) { PlanExecutor* theExec = execs[i % execs.size()]; MultiIteratorStage* mis = static_cast<MultiIteratorStage*>(theExec->getRootStage()); mis->addIterator(iterators.releaseAt(i)); } { BSONArrayBuilder bucketsBuilder; for (size_t i = 0; i < execs.size(); i++) { // transfer ownership of an executor to the ClientCursor (which manages its own // lifetime). ClientCursor* cc = new ClientCursor( collection, execs.releaseAt(i) ); // we are mimicking the aggregation cursor output here // that is why there are ns, ok and empty firstBatch BSONObjBuilder threadResult; { BSONObjBuilder cursor; cursor.appendArray( "firstBatch", BSONObj() ); cursor.append( "ns", ns ); cursor.append( "id", cc->cursorid() ); threadResult.append( "cursor", cursor.obj() ); } threadResult.appendBool( "ok", 1 ); bucketsBuilder.append( threadResult.obj() ); } result.appendArray( "cursors", bucketsBuilder.obj() ); } return true; }
// static
void Explain::explainStages(PlanExecutor* exec,
                            ExplainCommon::Verbosity verbosity,
                            BSONObjBuilder* out) {
    //
    // Step 1: run the stages as required by the verbosity level.
    //

    // Inspect the tree to see if there is a MultiPlanStage.
    MultiPlanStage* mps = getMultiPlanStage(exec->getRootStage());

    // Get stats of the winning plan from the trial period, if the verbosity level
    // is high enough and there was a runoff between multiple plans.
    auto_ptr<PlanStageStats> winningStatsTrial;
    if (verbosity >= ExplainCommon::EXEC_ALL_PLANS && NULL != mps) {
        winningStatsTrial.reset(exec->getStats());
        invariant(winningStatsTrial.get());
    }

    // If we need execution stats, then run the plan in order to gather the stats.
    Status executePlanStatus = Status::OK();
    if (verbosity >= ExplainCommon::EXEC_STATS) {
        executePlanStatus = exec->executePlan();
    }

    //
    // Step 2: collect plan stats (which also give the structure of the plan tree).
    //

    // Get stats for the winning plan.
    scoped_ptr<PlanStageStats> winningStats(exec->getStats());

    // Get stats for the rejected plans, if more than one plan was considered.
    OwnedPointerVector<PlanStageStats> allPlansStats;
    if (NULL != mps) {
        allPlansStats = mps->generateCandidateStats();
    }

    //
    // Step 3: use the stats trees to produce explain BSON.
    //

    CanonicalQuery* query = exec->getCanonicalQuery();
    if (verbosity >= ExplainCommon::QUERY_PLANNER) {
        generatePlannerInfo(query, winningStats.get(), allPlansStats.vector(), out);
    }

    if (verbosity >= ExplainCommon::EXEC_STATS) {
        BSONObjBuilder execBob(out->subobjStart("executionStats"));

        // If there is an execution error while running the query, the error is reported under
        // the "executionStats" section and the explain as a whole succeeds.
        execBob.append("executionSuccess", executePlanStatus.isOK());
        if (!executePlanStatus.isOK()) {
            execBob.append("errorMessage", executePlanStatus.reason());
            execBob.append("errorCode", executePlanStatus.code());
        }

        // Generate exec stats BSON for the winning plan.
        OperationContext* opCtx = exec->getOpCtx();
        long long totalTimeMillis = opCtx->getCurOp()->elapsedMillis();
        generateExecStats(winningStats.get(), verbosity, &execBob, totalTimeMillis);

        // Also generate exec stats for all plans, if the verbosity level is high enough.
        // These stats reflect what happened during the trial period that ranked the plans.
        if (verbosity >= ExplainCommon::EXEC_ALL_PLANS) {
            // If we ranked multiple plans against each other, then add stats collected
            // from the trial period of the winning plan. The "allPlansExecution" section
            // will contain an apples-to-apples comparison of the winning plan's stats against
            // all rejected plans' stats collected during the trial period.
            if (NULL != mps) {
                invariant(winningStatsTrial.get());
                allPlansStats.push_back(winningStatsTrial.release());
            }

            BSONArrayBuilder allPlansBob(execBob.subarrayStart("allPlansExecution"));
            for (size_t i = 0; i < allPlansStats.size(); ++i) {
                BSONObjBuilder planBob(allPlansBob.subobjStart());
                generateExecStats(allPlansStats[i], verbosity, &planBob);
                planBob.doneFast();
            }
            allPlansBob.doneFast();
        }

        execBob.doneFast();
    }

    generateServerInfo(out);
}
Status Strategy::commandOpWrite(const std::string& dbName,
                                const BSONObj& command,
                                BatchItemRef targetingBatchItem,
                                std::vector<CommandResult>* results) {
    // Note that this implementation will not handle targeting retries and does not completely
    // emulate write behavior
    ChunkManagerTargeter targeter(NamespaceString(
        targetingBatchItem.getRequest()->getTargetingNS()));
    Status status = targeter.init();
    if (!status.isOK())
        return status;

    OwnedPointerVector<ShardEndpoint> endpointsOwned;
    vector<ShardEndpoint*>& endpoints = endpointsOwned.mutableVector();

    if (targetingBatchItem.getOpType() == BatchedCommandRequest::BatchType_Insert) {
        ShardEndpoint* endpoint;
        Status status = targeter.targetInsert(targetingBatchItem.getDocument(), &endpoint);
        if (!status.isOK())
            return status;
        endpoints.push_back(endpoint);
    } else if (targetingBatchItem.getOpType() == BatchedCommandRequest::BatchType_Update) {
        Status status = targeter.targetUpdate(*targetingBatchItem.getUpdate(), &endpoints);
        if (!status.isOK())
            return status;
    } else {
        invariant(targetingBatchItem.getOpType() == BatchedCommandRequest::BatchType_Delete);
        Status status = targeter.targetDelete(*targetingBatchItem.getDelete(), &endpoints);
        if (!status.isOK())
            return status;
    }

    DBClientShardResolver resolver;
    DBClientMultiCommand dispatcher;

    // Assemble requests
    for (vector<ShardEndpoint*>::const_iterator it = endpoints.begin(); it != endpoints.end();
         ++it) {
        const ShardEndpoint* endpoint = *it;

        ConnectionString host;
        Status status = resolver.chooseWriteHost(endpoint->shardName, &host);
        if (!status.isOK())
            return status;

        RawBSONSerializable request(command);
        dispatcher.addCommand(host, dbName, request);
    }

    // Errors reported when recv'ing responses
    dispatcher.sendAll();
    Status dispatchStatus = Status::OK();

    // Recv responses
    while (dispatcher.numPending() > 0) {
        ConnectionString host;
        RawBSONSerializable response;

        Status status = dispatcher.recvAny(&host, &response);
        if (!status.isOK()) {
            // We always need to recv() all the sent operations
            dispatchStatus = status;
            continue;
        }

        CommandResult result;
        result.target = host;
        result.shardTarget = Shard::make(host.toString());
        result.result = response.toBSON();

        results->push_back(result);
    }

    return dispatchStatus;
}
/** * The core config write functionality. * * Config writes run in two passes - the first is a quick check to ensure the config servers * are all reachable, the second runs the actual write. * * TODO: Upgrade and move this logic to the config servers, a state machine implementation * is probably the next step. */ void ConfigCoordinator::executeBatch( const BatchedCommandRequest& clientRequest, BatchedCommandResponse* clientResponse, bool fsyncCheck ) { NamespaceString nss( clientRequest.getNS() ); dassert( nss.db() == "config" || nss.db() == "admin" ); dassert( clientRequest.sizeWriteOps() == 1u ); if ( fsyncCheck ) { // // Sanity check that all configs are still reachable using fsync, preserving legacy // behavior // OwnedPointerVector<ConfigFsyncResponse> fsyncResponsesOwned; vector<ConfigFsyncResponse*>& fsyncResponses = fsyncResponsesOwned.mutableVector(); // // Send side // for ( vector<ConnectionString>::iterator it = _configHosts.begin(); it != _configHosts.end(); ++it ) { ConnectionString& configHost = *it; FsyncRequest fsyncRequest; _dispatcher->addCommand( configHost, "admin", fsyncRequest ); } _dispatcher->sendAll(); // // Recv side // bool fsyncError = false; while ( _dispatcher->numPending() > 0 ) { fsyncResponses.push_back( new ConfigFsyncResponse() ); ConfigFsyncResponse& fsyncResponse = *fsyncResponses.back(); Status dispatchStatus = _dispatcher->recvAny( &fsyncResponse.configHost, &fsyncResponse.response ); // We've got to recv everything, no matter what if ( !dispatchStatus.isOK() ) { fsyncError = true; buildFsyncErrorFrom( dispatchStatus, &fsyncResponse.response ); } else if ( !fsyncResponse.response.getOk() ) { fsyncError = true; } } if ( fsyncError ) { combineFsyncErrors( fsyncResponses, clientResponse ); return; } else { fsyncResponsesOwned.clear(); } } // // Do the actual writes // BatchedCommandRequest configRequest( clientRequest.getBatchType() ); clientRequest.cloneTo( &configRequest ); configRequest.setNS( nss.coll() ); OwnedPointerVector<ConfigResponse> responsesOwned; vector<ConfigResponse*>& responses = responsesOwned.mutableVector(); // // Send the actual config writes // // Get as many batches as we can at once for ( vector<ConnectionString>::iterator it = _configHosts.begin(); it != _configHosts.end(); ++it ) { ConnectionString& configHost = *it; _dispatcher->addCommand( configHost, nss.db(), configRequest ); } // Send them all out _dispatcher->sendAll(); // // Recv side // while ( _dispatcher->numPending() > 0 ) { // Get the response responses.push_back( new ConfigResponse() ); ConfigResponse& configResponse = *responses.back(); Status dispatchStatus = _dispatcher->recvAny( &configResponse.configHost, &configResponse.response ); if ( !dispatchStatus.isOK() ) { buildErrorFrom( dispatchStatus, &configResponse.response ); } } combineResponses( responses, clientResponse ); }
static Status parseGeoJSONPolygonCoordinates(const BSONElement& elem, S2Polygon* out) {
    if (Array != elem.type()) {
        return BAD_VALUE("Polygon coordinates must be an array");
    }

    OwnedPointerVector<S2Loop> loops;
    Status status = Status::OK();
    string err;

    BSONObjIterator it(elem.Obj());
    // Iterate all loops of the polygon.
    while (it.more()) {
        // Parse the array of vertices of a loop.
        BSONElement coordinateElt = it.next();
        vector<S2Point> points;
        status = parseArrayOfCoodinates(coordinateElt, &points);
        if (!status.isOK())
            return status;

        // Check if the loop is closed.
        status = isLoopClosed(points, coordinateElt);
        if (!status.isOK())
            return status;

        eraseDuplicatePoints(&points);
        // Drop the duplicated last point.
        points.resize(points.size() - 1);

        S2Loop* loop = new S2Loop(points);
        loops.push_back(loop);

        // Check whether this loop is valid.
        // 1. At least 3 vertices.
        // 2. All vertices must be unit length. Guaranteed by parsePoints().
        // 3. Loops are not allowed to have any duplicate vertices.
        // 4. Non-adjacent edges are not allowed to intersect.
        if (!loop->IsValid(&err)) {
            return BAD_VALUE("Loop is not valid: " << coordinateElt.toString(false)
                                                   << " " << err);
        }
        // If the loop is more than one hemisphere, invert it.
        loop->Normalize();

        // Check that the first loop is the exterior ring and that any others are
        // interior rings (holes).
        if (loops.size() > 1 && !loops[0]->Contains(loop)) {
            return BAD_VALUE("Secondary loops not contained by first exterior loop - "
                             "secondary loops must be holes: " << coordinateElt.toString(false)
                             << " first loop: " << elem.Obj().firstElement().toString(false));
        }
    }

    // Check if the given loops form a valid polygon.
    // 1. If a loop contains an edge AB, then no other loop may contain AB or BA.
    // 2. No loop covers more than half of the sphere.
    // 3. No two loops cross.
    if (!S2Polygon::IsValid(loops.vector(), &err))
        return BAD_VALUE("Polygon isn't valid: " << err << " " << elem.toString(false));

    // Given that all loops are valid / normalized and S2Polygon::IsValid() above returned true,
    // the polygon must be valid. See the S2Polygon member function IsValid().

    // Transfer ownership of the loops and clear the loop vector.
    out->Init(&loops.mutableVector());

    // Check if every loop of this polygon shares at most one vertex with
    // its parent loop.
    if (!out->IsNormalized(&err))
        // "err" looks like "Loop 1 shares more than one vertex with its parent loop 0"
        return BAD_VALUE(err << ": " << elem.toString(false));

    // The S2Polygon may contain more than one exterior ring, which is allowed by S2,
    // but not by GeoJSON.
    //
    // Loops are indexed according to a preorder traversal of the nesting hierarchy.
    // GetLastDescendant() returns the index of the last loop that is contained within
    // a given loop. We guarantee that the first loop is the exterior ring.
    if (out->GetLastDescendant(0) < out->num_loops() - 1) {
        return BAD_VALUE("Only one exterior polygon loop is allowed: " << elem.toString(false));
    }

    // In GeoJSON, only one level of nesting is allowed.
    // The depth of a loop is set by the polygon according to its nesting hierarchy,
    // so the exterior ring's depth is 0, a hole in it is 1, etc.
    for (int i = 0; i < out->num_loops(); i++) {
        if (out->loop(i)->depth() > 1) {
            return BAD_VALUE("Polygon interior loops cannot be nested: " << elem.toString(false));
        }
    }
    return Status::OK();
}
void WriteBatchExecutor::executeBatch( const BatchedCommandRequest& request, BatchedCommandResponse* response ) { // Validate namespace const NamespaceString nss = NamespaceString( request.getNS() ); if ( !nss.isValid() ) { toBatchError( Status( ErrorCodes::InvalidNamespace, nss.ns() + " is not a valid namespace" ), response ); return; } // Make sure we can write to the namespace Status allowedStatus = userAllowedWriteNS( nss ); if ( !allowedStatus.isOK() ) { toBatchError( allowedStatus, response ); return; } // Validate insert index requests // TODO: Push insert index requests through createIndex once all upgrade paths support it string errMsg; if ( request.isInsertIndexRequest() && !request.isValidIndexRequest( &errMsg ) ) { toBatchError( Status( ErrorCodes::InvalidOptions, errMsg ), response ); return; } // Validate write concern // TODO: Lift write concern parsing out of this entirely WriteConcernOptions writeConcern; BSONObj wcDoc; if ( request.isWriteConcernSet() ) { wcDoc = request.getWriteConcern(); } Status wcStatus = Status::OK(); if ( wcDoc.isEmpty() ) { // The default write concern if empty is w : 1 // Specifying w : 0 is/was allowed, but is interpreted identically to w : 1 wcStatus = writeConcern.parse( _defaultWriteConcern.isEmpty() ? WriteConcernOptions::Acknowledged : _defaultWriteConcern ); if ( writeConcern.wNumNodes == 0 && writeConcern.wMode.empty() ) { writeConcern.wNumNodes = 1; } } else { wcStatus = writeConcern.parse( wcDoc ); } if ( wcStatus.isOK() ) { wcStatus = validateWriteConcern( writeConcern ); } if ( !wcStatus.isOK() ) { toBatchError( wcStatus, response ); return; } if ( request.sizeWriteOps() == 0u ) { toBatchError( Status( ErrorCodes::InvalidLength, "no write ops were included in the batch" ), response ); return; } // Validate batch size if ( request.sizeWriteOps() > BatchedCommandRequest::kMaxWriteBatchSize ) { toBatchError( Status( ErrorCodes::InvalidLength, stream() << "exceeded maximum write batch size of " << BatchedCommandRequest::kMaxWriteBatchSize ), response ); return; } // // End validation // bool silentWC = writeConcern.wMode.empty() && writeConcern.wNumNodes == 0 && writeConcern.syncMode == WriteConcernOptions::NONE; Timer commandTimer; OwnedPointerVector<WriteErrorDetail> writeErrorsOwned; vector<WriteErrorDetail*>& writeErrors = writeErrorsOwned.mutableVector(); OwnedPointerVector<BatchedUpsertDetail> upsertedOwned; vector<BatchedUpsertDetail*>& upserted = upsertedOwned.mutableVector(); // // Apply each batch item, possibly bulking some items together in the write lock. // Stops on error if batch is ordered. // bulkExecute( request, &upserted, &writeErrors ); // // Try to enforce the write concern if everything succeeded (unordered or ordered) // OR if something succeeded and we're unordered. 
// auto_ptr<WCErrorDetail> wcError; bool needToEnforceWC = writeErrors.empty() || ( !request.getOrdered() && writeErrors.size() < request.sizeWriteOps() ); if ( needToEnforceWC ) { _client->curop()->setMessage( "waiting for write concern" ); WriteConcernResult res; Status status = waitForWriteConcern( _txn, writeConcern, _client->getLastOp(), &res ); if ( !status.isOK() ) { wcError.reset( toWriteConcernError( status, res ) ); } } // // Refresh metadata if needed // bool staleBatch = !writeErrors.empty() && writeErrors.back()->getErrCode() == ErrorCodes::StaleShardVersion; if ( staleBatch ) { const BatchedRequestMetadata* requestMetadata = request.getMetadata(); dassert( requestMetadata ); // Make sure our shard name is set or is the same as what was set previously if ( shardingState.setShardName( requestMetadata->getShardName() ) ) { // // First, we refresh metadata if we need to based on the requested version. // ChunkVersion latestShardVersion; shardingState.refreshMetadataIfNeeded( request.getTargetingNS(), requestMetadata->getShardVersion(), &latestShardVersion ); // Report if we're still changing our metadata // TODO: Better reporting per-collection if ( shardingState.inCriticalMigrateSection() ) { noteInCriticalSection( writeErrors.back() ); } if ( queueForMigrationCommit ) { // // Queue up for migration to end - this allows us to be sure that clients will // not repeatedly try to refresh metadata that is not yet written to the config // server. Not necessary for correctness. // Exposed as optional parameter to allow testing of queuing behavior with // different network timings. // const ChunkVersion& requestShardVersion = requestMetadata->getShardVersion(); // // Only wait if we're an older version (in the current collection epoch) and // we're not write compatible, implying that the current migration is affecting // writes. // if ( requestShardVersion.isOlderThan( latestShardVersion ) && !requestShardVersion.isWriteCompatibleWith( latestShardVersion ) ) { while ( shardingState.inCriticalMigrateSection() ) { log() << "write request to old shard version " << requestMetadata->getShardVersion().toString() << " waiting for migration commit" << endl; shardingState.waitTillNotInCriticalSection( 10 /* secs */); } } } } else { // If our shard name is stale, our version must have been stale as well dassert( writeErrors.size() == request.sizeWriteOps() ); } } // // Construct response // response->setOk( true ); if ( !silentWC ) { if ( upserted.size() ) { response->setUpsertDetails( upserted ); } if ( writeErrors.size() ) { response->setErrDetails( writeErrors ); } if ( wcError.get() ) { response->setWriteConcernError( wcError.release() ); } const repl::ReplicationCoordinator::Mode replMode = repl::getGlobalReplicationCoordinator()->getReplicationMode(); if (replMode != repl::ReplicationCoordinator::modeNone) { response->setLastOp( _client->getLastOp() ); if (replMode == repl::ReplicationCoordinator::modeReplSet) { response->setElectionId(repl::theReplSet->getElectionId()); } } // Set the stats for the response response->setN( _stats->numInserted + _stats->numUpserted + _stats->numMatched + _stats->numDeleted ); if ( request.getBatchType() == BatchedCommandRequest::BatchType_Update ) response->setNModified( _stats->numModified ); } dassert( response->isValid( NULL ) ); }
virtual bool run( const string& dbname, BSONObj& cmdObj, int options, string& errmsg, BSONObjBuilder& result, bool fromRepl = false ) { NamespaceString ns( dbname, cmdObj[name].String() ); Client::ReadContext ctx(ns.ns()); Database* db = ctx.ctx().db(); Collection* collection = db->getCollection( ns ); if ( !collection ) return appendCommandStatus( result, Status( ErrorCodes::NamespaceNotFound, str::stream() << "ns does not exist: " << ns.ns() ) ); size_t numCursors = static_cast<size_t>( cmdObj["numCursors"].numberInt() ); if ( numCursors == 0 || numCursors > 10000 ) return appendCommandStatus( result, Status( ErrorCodes::BadValue, str::stream() << "numCursors has to be between 1 and 10000" << " was: " << numCursors ) ); OwnedPointerVector<RecordIterator> iterators(collection->getManyIterators()); if (iterators.size() < numCursors) { numCursors = iterators.size(); } OwnedPointerVector<MultiIteratorRunner> runners; for ( size_t i = 0; i < numCursors; i++ ) { runners.push_back(new MultiIteratorRunner(ns.ns(), collection)); } // transfer iterators to runners using a round-robin distribution. // TODO consider using a common work queue once invalidation issues go away. for (size_t i = 0; i < iterators.size(); i++) { runners[i % runners.size()]->addIterator(iterators.releaseAt(i)); } { BSONArrayBuilder bucketsBuilder; for (size_t i = 0; i < runners.size(); i++) { // transfer ownership of a runner to the ClientCursor (which manages its own // lifetime). ClientCursor* cc = new ClientCursor( collection, runners.releaseAt(i) ); // we are mimicking the aggregation cursor output here // that is why there are ns, ok and empty firstBatch BSONObjBuilder threadResult; { BSONObjBuilder cursor; cursor.appendArray( "firstBatch", BSONObj() ); cursor.append( "ns", ns ); cursor.append( "id", cc->cursorid() ); threadResult.append( "cursor", cursor.obj() ); } threadResult.appendBool( "ok", 1 ); bucketsBuilder.append( threadResult.obj() ); } result.appendArray( "cursors", bucketsBuilder.obj() ); } return true; }
/** * Upgrade v3 to v4 described here. * * This upgrade takes a config server without collection epochs (potentially) and adds * epochs to all mongo processes. * */ bool doUpgradeV3ToV4(const ConnectionString& configLoc, const VersionType& lastVersionInfo, string* errMsg) { string dummy; if (!errMsg) errMsg = &dummy; verify(lastVersionInfo.getCurrentVersion() == UpgradeHistory_NoEpochVersion); if (lastVersionInfo.isUpgradeIdSet() && lastVersionInfo.getUpgradeId().isSet()) { // // Another upgrade failed, so cleanup may be necessary // BSONObj lastUpgradeState = lastVersionInfo.getUpgradeState(); bool inCriticalSection; if (!FieldParser::extract(lastUpgradeState, inCriticalSectionField, &inCriticalSection, errMsg)) { *errMsg = stream() << "problem reading previous upgrade state" << causedBy(errMsg); return false; } if (inCriticalSection) { // Manual intervention is needed here. Somehow our upgrade didn't get applied // consistently across config servers. *errMsg = cannotCleanupMessage; return false; } if (!_cleanupUpgradeState(configLoc, lastVersionInfo.getUpgradeId(), errMsg)) { // If we can't cleanup the old upgrade state, the user might have done it for us, // not a fatal problem (we'll just end up with extra collections). warning() << "could not cleanup previous upgrade state" << causedBy(errMsg) << endl; *errMsg = ""; } } // // Check the versions of other mongo processes in the cluster before upgrade. // We can't upgrade if there are active pre-v2.2 processes in the cluster // Status mongoVersionStatus = checkClusterMongoVersions(configLoc, string(minMongoProcessVersion)); if (!mongoVersionStatus.isOK()) { *errMsg = stream() << "cannot upgrade with pre-v" << minMongoProcessVersion << " mongo processes active in the cluster" << causedBy(mongoVersionStatus); return false; } VersionType newVersionInfo; lastVersionInfo.cloneTo(&newVersionInfo); // Set our upgrade id and state OID upgradeId = OID::gen(); newVersionInfo.setUpgradeId(upgradeId); newVersionInfo.setUpgradeState(BSONObj()); // Write our upgrade id and state { scoped_ptr<ScopedDbConnection> connPtr; try { connPtr.reset(ScopedDbConnection::getInternalScopedDbConnection(configLoc, 30)); ScopedDbConnection& conn = *connPtr; verify(newVersionInfo.isValid(NULL)); conn->update(VersionType::ConfigNS, BSON("_id" << 1 << VersionType::version_DEPRECATED(3)), newVersionInfo.toBSON()); _checkGLE(conn); } catch (const DBException& e) { *errMsg = stream() << "could not initialize version info for upgrade" << causedBy(e); return false; } connPtr->done(); } // // First lock all collection namespaces that exist // OwnedPointerMap<string, CollectionType> ownedCollections; const map<string, CollectionType*>& collections = ownedCollections.map(); Status findCollectionsStatus = findAllCollectionsV3(configLoc, &ownedCollections); if (!findCollectionsStatus.isOK()) { *errMsg = stream() << "could not read collections from config server" << causedBy(findCollectionsStatus); return false; } // // Acquire locks for all sharded collections // Something that didn't involve getting thousands of locks would be better. // OwnedPointerVector<ScopedDistributedLock> collectionLocks; log() << "acquiring locks for " << collections.size() << " sharded collections..." << endl; // WARNING - this string is used programmatically when forcing locks, be careful when // changing! 
// TODO: Add programmatic "why" field to lock collection string lockMessage = str::stream() << "ensuring epochs for config upgrade" << " (" << upgradeId.toString() << ")"; if (!_acquireAllCollectionLocks(configLoc, collections, lockMessage, 20 * 60 * 1000, &collectionLocks, errMsg)) { *errMsg = stream() << "could not acquire all namespace locks for upgrade" << " (" << upgradeId.toString() << ")" << causedBy(errMsg); return false; } // We are now preventing all splits and migrates for all sharded collections // Get working and backup suffixes string workingSuffix = genWorkingSuffix(upgradeId); string backupSuffix = genBackupSuffix(upgradeId); log() << "copying collection and chunk metadata to working and backup collections..." << endl; // Get a backup and working copy of the config.collections and config.chunks collections Status copyStatus = copyFrozenCollection(configLoc, CollectionType::ConfigNS, CollectionType::ConfigNS + workingSuffix); if (!copyStatus.isOK()) { *errMsg = stream() << "could not copy " << CollectionType::ConfigNS << " to " << (CollectionType::ConfigNS + workingSuffix) << causedBy(copyStatus); return false; } copyStatus = copyFrozenCollection(configLoc, CollectionType::ConfigNS, CollectionType::ConfigNS + backupSuffix); if (!copyStatus.isOK()) { *errMsg = stream() << "could not copy " << CollectionType::ConfigNS << " to " << (CollectionType::ConfigNS + backupSuffix) << causedBy(copyStatus); return false; } copyStatus = copyFrozenCollection(configLoc, ChunkType::ConfigNS, ChunkType::ConfigNS + workingSuffix); if (!copyStatus.isOK()) { *errMsg = stream() << "could not copy " << ChunkType::ConfigNS << " to " << (ChunkType::ConfigNS + workingSuffix) << causedBy(copyStatus); return false; } copyStatus = copyFrozenCollection(configLoc, ChunkType::ConfigNS, ChunkType::ConfigNS + backupSuffix); if (!copyStatus.isOK()) { *errMsg = stream() << "could not copy " << ChunkType::ConfigNS << " to " << (ChunkType::ConfigNS + backupSuffix) << causedBy(copyStatus); return false; } // // Go through sharded collections one-by-one and add epochs where missing // for (map<string, CollectionType*>::const_iterator it = collections.begin(); it != collections.end(); ++it) { // Create a copy so that we can change the epoch later CollectionType collection; it->second->cloneTo(&collection); log() << "checking epochs for " << collection.getNS() << " collection..." 
<< endl; OID epoch = collection.getEpoch(); // // Go through chunks to find epoch if we haven't found it or to verify epoch is the same // OwnedPointerVector<ChunkType> ownedChunks; const vector<ChunkType*>& chunks = ownedChunks.vector(); Status findChunksStatus = findAllChunks(configLoc, collection.getNS(), &ownedChunks); if (!findChunksStatus.isOK()) { *errMsg = stream() << "could not read chunks from config server" << causedBy(findChunksStatus); return false; } for (vector<ChunkType*>::const_iterator chunkIt = chunks.begin(); chunkIt != chunks.end(); ++chunkIt) { const ChunkType& chunk = *(*chunkIt); // If our chunk epoch is set and doesn't match if (epoch.isSet() && chunk.getVersion().epoch().isSet() && chunk.getVersion().epoch() != epoch) { *errMsg = stream() << "chunk epoch for " << chunk.toString() << " in " << collection.getNS() << " does not match found epoch " << epoch; return false; } else if (!epoch.isSet() && chunk.getVersion().epoch().isSet()) { epoch = chunk.getVersion().epoch(); } } // // Write collection epoch if needed // if (!collection.getEpoch().isSet()) { OID newEpoch = OID::gen(); log() << "writing new epoch " << newEpoch << " for " << collection.getNS() << " collection..." << endl; scoped_ptr<ScopedDbConnection> connPtr; try { connPtr.reset(ScopedDbConnection::getInternalScopedDbConnection(configLoc, 30)); ScopedDbConnection& conn = *connPtr; conn->update(CollectionType::ConfigNS + workingSuffix, BSON(CollectionType::ns(collection.getNS())), BSON("$set" << BSON(CollectionType::DEPRECATED_lastmodEpoch(newEpoch)))); _checkGLE(conn); } catch (const DBException& e) { *errMsg = stream() << "could not write a new epoch for " << collection.getNS() << causedBy(e); return false; } connPtr->done(); collection.setEpoch(newEpoch); } epoch = collection.getEpoch(); verify(epoch.isSet()); // // Now write verified epoch to all chunks // log() << "writing epoch " << epoch << " for " << chunks.size() << " chunks in " << collection.getNS() << " collection..." 
<< endl; { scoped_ptr<ScopedDbConnection> connPtr; try { connPtr.reset(ScopedDbConnection::getInternalScopedDbConnection(configLoc, 30)); ScopedDbConnection& conn = *connPtr; // Multi-update of all chunks conn->update(ChunkType::ConfigNS + workingSuffix, BSON(ChunkType::ns(collection.getNS())), BSON("$set" << BSON(ChunkType::DEPRECATED_epoch(epoch))), false, true); // multi _checkGLE(conn); } catch (const DBException& e) { *errMsg = stream() << "could not write a new epoch " << epoch.toString() << " for chunks in " << collection.getNS() << causedBy(e); return false; } connPtr->done(); } } // // Paranoid verify the collection writes // { scoped_ptr<ScopedDbConnection> connPtr; try { connPtr.reset(ScopedDbConnection::getInternalScopedDbConnection(configLoc, 30)); ScopedDbConnection& conn = *connPtr; // Find collections with no epochs BSONObj emptyDoc = conn->findOne(CollectionType::ConfigNS + workingSuffix, BSON("$unset" << BSON(CollectionType::DEPRECATED_lastmodEpoch() << 1))); if (!emptyDoc.isEmpty()) { *errMsg = stream() << "collection " << emptyDoc << " is still missing epoch after config upgrade"; connPtr->done(); return false; } // Find collections with empty epochs emptyDoc = conn->findOne(CollectionType::ConfigNS + workingSuffix, BSON(CollectionType::DEPRECATED_lastmodEpoch(OID()))); if (!emptyDoc.isEmpty()) { *errMsg = stream() << "collection " << emptyDoc << " still has empty epoch after config upgrade"; connPtr->done(); return false; } // Find chunks with no epochs emptyDoc = conn->findOne(ChunkType::ConfigNS + workingSuffix, BSON("$unset" << BSON(ChunkType::DEPRECATED_epoch() << 1))); if (!emptyDoc.isEmpty()) { *errMsg = stream() << "chunk " << emptyDoc << " is still missing epoch after config upgrade"; connPtr->done(); return false; } // Find chunks with empty epochs emptyDoc = conn->findOne(ChunkType::ConfigNS + workingSuffix, BSON(ChunkType::DEPRECATED_epoch(OID()))); if (!emptyDoc.isEmpty()) { *errMsg = stream() << "chunk " << emptyDoc << " still has empty epoch after config upgrade"; connPtr->done(); return false; } } catch (const DBException& e) { *errMsg = stream() << "could not verify epoch writes" << causedBy(e); return false; } connPtr->done(); } // // Double check that our collections haven't changed // Status idCheckStatus = checkIdsTheSame(configLoc, CollectionType::ConfigNS, CollectionType::ConfigNS + workingSuffix); if (!idCheckStatus.isOK()) { *errMsg = stream() << CollectionType::ConfigNS << " was modified while working on upgrade" << causedBy(idCheckStatus); return false; } idCheckStatus = checkIdsTheSame(configLoc, ChunkType::ConfigNS, ChunkType::ConfigNS + workingSuffix); if (!idCheckStatus.isOK()) { *errMsg = stream() << ChunkType::ConfigNS << " was modified while working on upgrade" << causedBy(idCheckStatus); return false; } // // ENTER CRITICAL SECTION // newVersionInfo.setUpgradeState(BSON(inCriticalSectionField(true))); { scoped_ptr<ScopedDbConnection> connPtr; try { connPtr.reset(ScopedDbConnection::getInternalScopedDbConnection(configLoc, 30)); ScopedDbConnection& conn = *connPtr; verify(newVersionInfo.isValid(NULL)); conn->update(VersionType::ConfigNS, BSON("_id" << 1 << VersionType::version_DEPRECATED(3)), newVersionInfo.toBSON()); _checkGLE(conn); } catch (const DBException& e) { // No cleanup message here since we're not sure if we wrote or not, and // not dangerous either way except to prevent further updates (at which point // the message is printed) *errMsg = stream() << "could not update version info to enter critical update section" << 
causedBy(e); return false; } // AT THIS POINT ANY FAILURE REQUIRES MANUAL INTERVENTION! connPtr->done(); } log() << "entered critical section for config upgrade" << endl; Status overwriteStatus = overwriteCollection(configLoc, CollectionType::ConfigNS + workingSuffix, CollectionType::ConfigNS); if (!overwriteStatus.isOK()) { error() << cleanupMessage << endl; *errMsg = stream() << "could not overwrite collection " << CollectionType::ConfigNS << " with working collection " << (CollectionType::ConfigNS + workingSuffix) << causedBy(overwriteStatus); return false; } overwriteStatus = overwriteCollection(configLoc, ChunkType::ConfigNS + workingSuffix, ChunkType::ConfigNS); if (!overwriteStatus.isOK()) { error() << cleanupMessage << endl; *errMsg = stream() << "could not overwrite collection " << ChunkType::ConfigNS << " with working collection " << (ChunkType::ConfigNS + workingSuffix) << causedBy(overwriteStatus); return false; } // // Finally update the version to latest and add clusterId to version // OID newClusterId = OID::gen(); // Note: hardcoded versions, since this is a very particular upgrade // Note: DO NOT CLEAR the config version unless bumping the minCompatibleVersion, // we want to save the excludes that were set. newVersionInfo.setMinCompatibleVersion(UpgradeHistory_NoEpochVersion); newVersionInfo.setCurrentVersion(UpgradeHistory_MandatoryEpochVersion); newVersionInfo.setClusterId(newClusterId); // Leave critical section newVersionInfo.unsetUpgradeId(); newVersionInfo.unsetUpgradeState(); log() << "writing new version info and clusterId " << newClusterId << "..." << endl; { scoped_ptr<ScopedDbConnection> connPtr; try { connPtr.reset(ScopedDbConnection::getInternalScopedDbConnection(configLoc, 30)); ScopedDbConnection& conn = *connPtr; verify(newVersionInfo.isValid(NULL)); conn->update(VersionType::ConfigNS, BSON("_id" << 1 << VersionType::version_DEPRECATED(UpgradeHistory_NoEpochVersion)), newVersionInfo.toBSON()); _checkGLE(conn); } catch (const DBException& e) { error() << cleanupMessage << endl; *errMsg = stream() << "could not write new version info " << "and exit critical upgrade section" << causedBy(e); return false; } connPtr->done(); } // // END CRITICAL SECTION // return true; }
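The chunk-epoch pass above fills an OwnedPointerVector through an out-parameter and then walks it through a const reference that was bound before the fill. Below is a minimal sketch of that shape, assuming a simplified owning container (OwningPtrVector) and a hypothetical findAllChunks(); it is not the MongoDB class itself, only an illustration that the vector() alias stays valid because it refers to the container's underlying storage.

// Minimal sketch (not the MongoDB class): an owning pointer-vector whose
// vector() accessor returns a reference, so an alias bound before the fill
// still observes elements added later, as in the chunk-epoch pass above.
#include <iostream>
#include <string>
#include <vector>

template <class T>
class OwningPtrVector {
public:
    ~OwningPtrVector() {
        for (T* p : _v) delete p;
    }
    const std::vector<T*>& vector() const { return _v; }
    std::vector<T*>& mutableVector() { return _v; }
private:
    std::vector<T*> _v;
};

struct Chunk { std::string ns; int version; };

// Hypothetical stand-in for findAllChunks(): fills the owned vector in place.
static void findAllChunks(const std::string& ns, OwningPtrVector<Chunk>* out) {
    out->mutableVector().push_back(new Chunk{ns, 1});
    out->mutableVector().push_back(new Chunk{ns, 2});
}

int main() {
    OwningPtrVector<Chunk> ownedChunks;
    const std::vector<Chunk*>& chunks = ownedChunks.vector();  // alias, not a copy

    findAllChunks("test.foo", &ownedChunks);

    for (const Chunk* c : chunks)  // sees the chunks added after the alias was bound
        std::cout << c->ns << " v" << c->version << "\n";
    return 0;  // ~OwningPtrVector deletes every Chunk
}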
void Listener::initAndListen() { if (!_setupSocketsSuccessful) { return; } for (unsigned i = 0; i < _socks.size(); i++) { if (::listen(_socks[i], 128) != 0) { error() << "listen(): listen() failed " << errnoWithDescription() << endl; return; } ListeningSockets::get()->add(_socks[i]); } #ifdef MONGO_CONFIG_SSL _logListen(_port, _ssl); #else _logListen(_port, false); #endif { // Wake up any threads blocked in waitUntilListening() stdx::lock_guard<stdx::mutex> lock(_readyMutex); _ready = true; _readyCondition.notify_all(); } OwnedPointerVector<EventHolder> eventHolders; std::unique_ptr<WSAEVENT[]> events(new WSAEVENT[_socks.size()]); // Populate events array with an event for each socket we are watching for (size_t count = 0; count < _socks.size(); ++count) { EventHolder* ev(new EventHolder); eventHolders.mutableVector().push_back(ev); events[count] = ev->get(); } while ( ! inShutdown() ) { // Turn on listening for accept-ready sockets for (size_t count = 0; count < _socks.size(); ++count) { int status = WSAEventSelect(_socks[count], events[count], FD_ACCEPT | FD_CLOSE); if (status == SOCKET_ERROR) { const int mongo_errno = WSAGetLastError(); // During shutdown, we may fail to listen on the socket if it has already // been closed if (inShutdown()) { return; } error() << "Windows WSAEventSelect returned " << errnoWithDescription(mongo_errno) << endl; fassertFailed(16727); } } // Wait till one of them goes active, or we time out DWORD result = WSAWaitForMultipleEvents(_socks.size(), events.get(), FALSE, // don't wait for all the events 10, // timeout, in ms FALSE); // do not allow I/O interruptions if (result == WSA_WAIT_FAILED) { const int mongo_errno = WSAGetLastError(); error() << "Windows WSAWaitForMultipleEvents returned " << errnoWithDescription(mongo_errno) << endl; fassertFailed(16723); } if (result == WSA_WAIT_TIMEOUT) { _elapsedTime += 10; continue; } _elapsedTime += 1; // assume 1ms to grab connection. 
very rough // Determine which socket is ready DWORD eventIndex = result - WSA_WAIT_EVENT_0; WSANETWORKEVENTS networkEvents; // Extract event details, and clear event for next pass int status = WSAEnumNetworkEvents(_socks[eventIndex], events[eventIndex], &networkEvents); if (status == SOCKET_ERROR) { const int mongo_errno = WSAGetLastError(); error() << "Windows WSAEnumNetworkEvents returned " << errnoWithDescription(mongo_errno) << endl; continue; } if (networkEvents.lNetworkEvents & FD_CLOSE) { log() << "listen socket closed" << endl; break; } if (!(networkEvents.lNetworkEvents & FD_ACCEPT)) { error() << "Unexpected network event: " << networkEvents.lNetworkEvents << endl; continue; } int iec = networkEvents.iErrorCode[FD_ACCEPT_BIT]; if (iec != 0) { error() << "Windows socket accept did not work:" << errnoWithDescription(iec) << endl; continue; } status = WSAEventSelect(_socks[eventIndex], NULL, 0); if (status == SOCKET_ERROR) { const int mongo_errno = WSAGetLastError(); error() << "Windows WSAEventSelect returned " << errnoWithDescription(mongo_errno) << endl; continue; } disableNonblockingMode(_socks[eventIndex]); SockAddr from; int s = accept(_socks[eventIndex], from.raw(), &from.addressSize); if ( s < 0 ) { int x = errno; // so no global issues if (x == EBADF) { log() << "Port " << _port << " is no longer valid" << endl; continue; } else if (x == ECONNABORTED) { log() << "Listener on port " << _port << " aborted" << endl; continue; } if ( x == 0 && inShutdown() ) { return; // socket closed } if( !inShutdown() ) { log() << "Listener: accept() returns " << s << " " << errnoWithDescription(x) << endl; if (x == EMFILE || x == ENFILE) { // Connection still in listen queue but we can't accept it yet error() << "Out of file descriptors. Waiting one second before" " trying to accept more connections." << warnings; sleepsecs(1); } } continue; } if (from.getType() != AF_UNIX) disableNagle(s); long long myConnectionNumber = globalConnectionNumber.addAndFetch(1); if (_logConnect && !serverGlobalParams.quiet) { int conns = globalTicketHolder.used()+1; const char* word = (conns == 1 ? " connection" : " connections"); log() << "connection accepted from " << from.toString() << " #" << myConnectionNumber << " (" << conns << word << " now open)" << endl; } std::shared_ptr<Socket> pnewSock( new Socket(s, from) ); #ifdef MONGO_CONFIG_SSL if (_ssl) { pnewSock->secureAccepted(_ssl); } #endif accepted( pnewSock , myConnectionNumber ); } }
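The listener loop above keeps the EventHolder wrappers alive in an OwnedPointerVector while handing a parallel array of raw handles to the Windows wait API. A minimal modern-C++ analogue of that split between ownership and the raw view is sketched below; EventHolder's layout and waitForAny() are invented stand-ins, not Winsock calls.

// Minimal modern-C++ analogue (not the listener code itself): the wrappers are
// owned in one container while a parallel array of raw handles is passed to the
// polling API, mirroring the eventHolders / events[] pair above.
#include <iostream>
#include <memory>
#include <vector>

struct EventHolder {               // hypothetical wrapper around an OS event handle
    explicit EventHolder(int h) : handle(h) {}
    int get() const { return handle; }
    int handle;
};

// Hypothetical poll function standing in for the real wait API.
static int waitForAny(const std::vector<int>& handles) {
    return handles.empty() ? -1 : 0;   // pretend the first handle fired
}

int main() {
    std::vector<std::unique_ptr<EventHolder>> eventHolders;  // owns the wrappers
    std::vector<int> events;                                 // raw handles for the API

    for (int sock = 0; sock < 3; ++sock) {
        eventHolders.push_back(std::make_unique<EventHolder>(100 + sock));
        events.push_back(eventHolders.back()->get());
    }

    int ready = waitForAny(events);
    std::cout << "ready index: " << ready << "\n";
    return 0;  // unique_ptr frees every EventHolder
}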
// static
Status ClearFilters::clear(QuerySettings* querySettings,
                           PlanCache* planCache,
                           const std::string& ns,
                           const BSONObj& cmdObj) {
    invariant(querySettings);

    // According to the specification, the planCacheClearFilters command runs in two modes:
    // - clear all hints; or
    // - clear hints for single query shape when a query shape is described in the
    //   command arguments.
    if (cmdObj.hasField("query")) {
        CanonicalQuery* cqRaw;
        Status status = PlanCacheCommand::canonicalize(ns, cmdObj, &cqRaw);
        if (!status.isOK()) {
            return status;
        }

        scoped_ptr<CanonicalQuery> cq(cqRaw);
        querySettings->removeAllowedIndices(*cq);

        // Remove entry from plan cache
        planCache->remove(*cq);
        return Status::OK();
    }

    // If query is not provided, make sure sort and projection are not in arguments.
    // We do not want to clear the entire cache inadvertently when the user
    // forgot to provide a value for "query".
    if (cmdObj.hasField("sort") || cmdObj.hasField("projection")) {
        return Status(ErrorCodes::BadValue, "sort or projection provided without query");
    }

    // Get entries from query settings. We need to remove corresponding entries from the plan
    // cache shortly.
    OwnedPointerVector<AllowedIndexEntry> entries;
    entries.mutableVector() = querySettings->getAllAllowedIndices();

    // OK to proceed with clearing entire cache.
    querySettings->clearAllowedIndices();

    const NamespaceString nss(ns);
    const WhereCallbackReal whereCallback(nss.db());

    // Remove corresponding entries from plan cache.
    // Admin hints affect the planning process directly. If there were
    // plans generated as a result of applying index filter, these need to be
    // invalidated. This allows the planner to re-populate the plan cache with
    // non-filtered indexed solutions next time the query is run.
    // Resolve plan cache key from (query, sort, projection) in query settings entry.
    // Concurrency note: There's no harm in removing plan cache entries one at a time.
    // Only way that PlanCache::remove() can fail is when the query shape has been removed from
    // the cache by some other means (re-index, collection info reset, ...). This is OK since
    // that's the intended effect of calling the remove() function with the key from the hint entry.
    for (vector<AllowedIndexEntry*>::const_iterator i = entries.begin(); i != entries.end(); ++i) {
        AllowedIndexEntry* entry = *i;
        invariant(entry);

        // Create canonical query.
        CanonicalQuery* cqRaw;
        Status result = CanonicalQuery::canonicalize(
            ns, entry->query, entry->sort, entry->projection, &cqRaw, whereCallback);
        invariant(result.isOK());
        scoped_ptr<CanonicalQuery> cq(cqRaw);

        // Remove plan cache entry.
        planCache->remove(*cq);
    }

    return Status::OK();
}
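The clearing path above adopts the raw-pointer vector returned by getAllAllowedIndices() by assigning it into mutableVector(), so the entries are freed exactly once when the OwnedPointerVector goes out of scope. A rough equivalent of that adoption step using std::unique_ptr is sketched below; the shape of AllowedIndexEntry and the free function getAllAllowedIndices() are assumptions for illustration only.

// Minimal sketch (assumed names, modern-C++ analogue): adopting a vector of raw
// pointers returned by value so they are freed exactly once, which is what the
// mutableVector() assignment above accomplishes.
#include <iostream>
#include <memory>
#include <string>
#include <vector>

struct AllowedIndexEntry { std::string query; };

// Hypothetical stand-in for QuerySettings::getAllAllowedIndices():
// returns raw pointers that the caller now owns.
static std::vector<AllowedIndexEntry*> getAllAllowedIndices() {
    return {new AllowedIndexEntry{"{a:1}"}, new AllowedIndexEntry{"{b:1}"}};
}

int main() {
    std::vector<std::unique_ptr<AllowedIndexEntry>> entries;
    for (AllowedIndexEntry* raw : getAllAllowedIndices())
        entries.emplace_back(raw);               // take ownership immediately

    for (const auto& e : entries)
        std::cout << "clearing filter for " << e->query << "\n";
    return 0;                                    // every entry freed exactly once
}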
/** Note: if the object shrinks a lot, we don't free up space, we leave extra at end of the record. */ const DiskLoc DataFileMgr::updateRecord( const char *ns, Collection* collection, Record *toupdate, const DiskLoc& dl, const char *_buf, int _len, OpDebug& debug, bool god) { dassert( toupdate == dl.rec() ); BSONObj objOld = BSONObj::make(toupdate); BSONObj objNew(_buf); DEV verify( objNew.objsize() == _len ); DEV verify( objNew.objdata() == _buf ); if( !objNew.hasElement("_id") && objOld.hasElement("_id") ) { /* add back the old _id value if the update removes it. Note this implementation is slow (copies entire object multiple times), but this shouldn't happen often, so going for simple code, not speed. */ BSONObjBuilder b; BSONElement e; verify( objOld.getObjectID(e) ); b.append(e); // put _id first, for best performance b.appendElements(objNew); objNew = b.obj(); } NamespaceString nsstring(ns); if (nsstring.coll() == "system.users") { V2UserDocumentParser parser; uassertStatusOK(parser.checkValidUserDocument(objNew)); } uassert( 13596 , str::stream() << "cannot change _id of a document old:" << objOld << " new:" << objNew, objNew["_id"] == objOld["_id"]); /* duplicate key check. we descend the btree twice - once for this check, and once for the actual inserts, further below. that is suboptimal, but it's pretty complicated to do it the other way without rollbacks... */ OwnedPointerVector<UpdateTicket> updateTickets; updateTickets.mutableVector().resize(collection->details()->getTotalIndexCount()); for (int i = 0; i < collection->details()->getTotalIndexCount(); ++i) { auto_ptr<IndexDescriptor> descriptor(CatalogHack::getDescriptor(collection->details(), i)); auto_ptr<IndexAccessMethod> iam(CatalogHack::getIndex(descriptor.get())); InsertDeleteOptions options; options.logIfError = false; options.dupsAllowed = !(KeyPattern::isIdKeyPattern(descriptor->keyPattern()) || descriptor->unique()) || ignoreUniqueIndex(descriptor->getOnDisk()); updateTickets.mutableVector()[i] = new UpdateTicket(); Status ret = iam->validateUpdate(objOld, objNew, dl, options, updateTickets.mutableVector()[i]); if (Status::OK() != ret) { uasserted(ASSERT_ID_DUPKEY, "Update validation failed: " + ret.toString()); } } if ( toupdate->netLength() < objNew.objsize() ) { // doesn't fit. reallocate ----------------------------------------------------- moveCounter.increment(); uassert( 10003, "failing update: objects in a capped ns cannot grow", !(collection && collection->details()->isCapped())); collection->details()->paddingTooSmall(); deleteRecord(ns, toupdate, dl); DiskLoc res = insert(ns, objNew.objdata(), objNew.objsize(), false, god); if (debug.nmoved == -1) // default of -1 rather than 0 debug.nmoved = 1; else debug.nmoved += 1; return res; } collection->infoCache()->notifyOfWriteOp(); collection->details()->paddingFits(); debug.keyUpdates = 0; for (int i = 0; i < collection->details()->getTotalIndexCount(); ++i) { auto_ptr<IndexDescriptor> descriptor(CatalogHack::getDescriptor(collection->details(), i)); auto_ptr<IndexAccessMethod> iam(CatalogHack::getIndex(descriptor.get())); int64_t updatedKeys; Status ret = iam->update(*updateTickets.vector()[i], &updatedKeys); if (Status::OK() != ret) { // This shouldn't happen unless something disastrous occurred. massert(16799, "update failed: " + ret.toString(), false); } debug.keyUpdates += updatedKeys; } // update in place int sz = objNew.objsize(); memcpy(getDur().writingPtr(toupdate->data(), sz), objNew.objdata(), sz); return dl; }
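updateRecord() sizes the ticket vector to the index count up front, fills one UpdateTicket per index during validation, and consumes the same tickets in the apply loop. The sketch below mirrors that two-pass shape with std::unique_ptr; the UpdateTicket contents are hypothetical, chosen only to make the example run.

// Minimal sketch (hypothetical UpdateTicket, modern-C++ analogue): pre-size the
// owning vector to one slot per index, fill each slot during validation, then
// consume the same tickets in the apply pass, mirroring the two loops above.
#include <iostream>
#include <memory>
#include <vector>

struct UpdateTicket { int keysToApply = 0; };

int main() {
    const int indexCount = 3;
    std::vector<std::unique_ptr<UpdateTicket>> tickets(indexCount);  // all slots start empty

    // Validation pass: one ticket per index.
    for (int i = 0; i < indexCount; ++i) {
        tickets[i] = std::make_unique<UpdateTicket>();
        tickets[i]->keysToApply = i + 1;
    }

    // Apply pass: consume the tickets produced above.
    int applied = 0;
    for (int i = 0; i < indexCount; ++i)
        applied += tickets[i]->keysToApply;

    std::cout << "applied " << applied << " key updates\n";
    return 0;
}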
bool mergeChunks( OperationContext* txn, const NamespaceString& nss, const BSONObj& minKey, const BSONObj& maxKey, const OID& epoch, string* errMsg ) { // // Get sharding state up-to-date // ConnectionString configLoc = ConnectionString::parse( shardingState.getConfigServer(), *errMsg ); if ( !configLoc.isValid() ){ warning() << *errMsg << endl; return false; } // // Get the distributed lock // ScopedDistributedLock collLock( configLoc, nss.ns() ); collLock.setLockMessage( stream() << "merging chunks in " << nss.ns() << " from " << minKey << " to " << maxKey ); Status acquisitionStatus = collLock.tryAcquire(); if (!acquisitionStatus.isOK()) { *errMsg = stream() << "could not acquire collection lock for " << nss.ns() << " to merge chunks in [" << minKey << "," << maxKey << ")" << causedBy(acquisitionStatus); warning() << *errMsg << endl; return false; } // // We now have the collection lock, refresh metadata to latest version and sanity check // ChunkVersion shardVersion; Status status = shardingState.refreshMetadataNow(txn, nss.ns(), &shardVersion); if ( !status.isOK() ) { *errMsg = str::stream() << "could not merge chunks, failed to refresh metadata for " << nss.ns() << causedBy( status.reason() ); warning() << *errMsg << endl; return false; } if ( epoch.isSet() && shardVersion.epoch() != epoch ) { *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " has changed" << " since merge was sent" << "(sent epoch : " << epoch.toString() << ", current epoch : " << shardVersion.epoch().toString() << ")"; warning() << *errMsg << endl; return false; } CollectionMetadataPtr metadata = shardingState.getCollectionMetadata( nss.ns() ); if ( !metadata || metadata->getKeyPattern().isEmpty() ) { *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " is not sharded"; warning() << *errMsg << endl; return false; } dassert( metadata->getShardVersion().equals( shardVersion ) ); if ( !metadata->isValidKey( minKey ) || !metadata->isValidKey( maxKey ) ) { *errMsg = stream() << "could not merge chunks, the range " << rangeToString( minKey, maxKey ) << " is not valid" << " for collection " << nss.ns() << " with key pattern " << metadata->getKeyPattern(); warning() << *errMsg << endl; return false; } // // Get merged chunk information // ChunkVersion mergeVersion = metadata->getCollVersion(); mergeVersion.incMinor(); OwnedPointerVector<ChunkType> chunksToMerge; ChunkType itChunk; itChunk.setMin( minKey ); itChunk.setMax( minKey ); itChunk.setNS( nss.ns() ); itChunk.setShard( shardingState.getShardName() ); while ( itChunk.getMax().woCompare( maxKey ) < 0 && metadata->getNextChunk( itChunk.getMax(), &itChunk ) ) { auto_ptr<ChunkType> saved( new ChunkType ); itChunk.cloneTo( saved.get() ); chunksToMerge.mutableVector().push_back( saved.release() ); } if ( chunksToMerge.empty() ) { *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " range starting at " << minKey << " and ending at " << maxKey << " does not belong to shard " << shardingState.getShardName(); warning() << *errMsg << endl; return false; } // // Validate the range starts and ends at chunks and has no holes, error if not valid // BSONObj firstDocMin = ( *chunksToMerge.begin() )->getMin(); BSONObj firstDocMax = ( *chunksToMerge.begin() )->getMax(); // minKey is inclusive bool minKeyInRange = rangeContains( firstDocMin, firstDocMax, minKey ); if ( !minKeyInRange ) { *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " range starting at " << minKey << " does not belong 
to shard " << shardingState.getShardName(); warning() << *errMsg << endl; return false; } BSONObj lastDocMin = ( *chunksToMerge.rbegin() )->getMin(); BSONObj lastDocMax = ( *chunksToMerge.rbegin() )->getMax(); // maxKey is exclusive bool maxKeyInRange = lastDocMin.woCompare( maxKey ) < 0 && lastDocMax.woCompare( maxKey ) >= 0; if ( !maxKeyInRange ) { *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " range ending at " << maxKey << " does not belong to shard " << shardingState.getShardName(); warning() << *errMsg << endl; return false; } bool validRangeStartKey = firstDocMin.woCompare( minKey ) == 0; bool validRangeEndKey = lastDocMax.woCompare( maxKey ) == 0; if ( !validRangeStartKey || !validRangeEndKey ) { *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " does not contain a chunk " << ( !validRangeStartKey ? "starting at " + minKey.toString() : "" ) << ( !validRangeStartKey && !validRangeEndKey ? " or " : "" ) << ( !validRangeEndKey ? "ending at " + maxKey.toString() : "" ); warning() << *errMsg << endl; return false; } if ( chunksToMerge.size() == 1 ) { *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " already contains chunk for " << rangeToString( minKey, maxKey ); warning() << *errMsg << endl; return false; } bool holeInRange = false; // Look for hole in range ChunkType* prevChunk = *chunksToMerge.begin(); ChunkType* nextChunk = NULL; for ( OwnedPointerVector<ChunkType>::const_iterator it = chunksToMerge.begin(); it != chunksToMerge.end(); ++it ) { if ( it == chunksToMerge.begin() ) continue; nextChunk = *it; if ( prevChunk->getMax().woCompare( nextChunk->getMin() ) != 0 ) { holeInRange = true; break; } prevChunk = nextChunk; } if ( holeInRange ) { dassert( NULL != nextChunk ); *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " has a hole in the range " << rangeToString( minKey, maxKey ) << " at " << rangeToString( prevChunk->getMax(), nextChunk->getMin() ); warning() << *errMsg << endl; return false; } // // Run apply ops command // BSONObj applyOpsCmd = buildApplyOpsCmd( chunksToMerge, shardVersion, mergeVersion ); bool ok; BSONObj result; try { ScopedDbConnection conn( configLoc, 30.0 ); ok = conn->runCommand( "config", applyOpsCmd, result ); if ( !ok ) *errMsg = result.toString(); conn.done(); } catch( const DBException& ex ) { ok = false; *errMsg = ex.toString(); } if ( !ok ) { *errMsg = stream() << "could not merge chunks for " << nss.ns() << ", writing to config failed" << causedBy( errMsg ); warning() << *errMsg << endl; return false; } // // Install merged chunk metadata // { Lock::DBLock writeLk(txn->lockState(), nss.db(), newlm::MODE_X); shardingState.mergeChunks(txn, nss.ns(), minKey, maxKey, mergeVersion); } // // Log change // BSONObj mergeLogEntry = buildMergeLogEntry( chunksToMerge, shardVersion, mergeVersion ); configServer.logChange( "merge", nss.ns(), mergeLogEntry ); return true; }
void ReplSetImpl::loadConfig(OperationContext* txn) { startupStatus = LOADINGCONFIG; startupStatusMsg.set("loading " + rsConfigNs + " config (LOADINGCONFIG)"); LOG(1) << "loadConfig() " << rsConfigNs << endl; while (1) { try { OwnedPointerVector<ReplSetConfig> configs; try { configs.mutableVector().push_back(ReplSetConfig::makeDirect(txn)); } catch (DBException& e) { log() << "replSet exception loading our local replset configuration object : " << e.toString() << rsLog; } for (vector<HostAndPort>::const_iterator i = _seeds->begin(); i != _seeds->end(); i++) { try { configs.mutableVector().push_back(ReplSetConfig::make(txn, *i)); } catch (DBException& e) { log() << "replSet exception trying to load config from " << *i << " : " << e.toString() << rsLog; } } ReplSettings& replSettings = getGlobalReplicationCoordinator()->getSettings(); { scoped_lock lck(replSettings.discoveredSeeds_mx); if (replSettings.discoveredSeeds.size() > 0) { for (set<string>::iterator i = replSettings.discoveredSeeds.begin(); i != replSettings.discoveredSeeds.end(); i++) { try { configs.mutableVector().push_back( ReplSetConfig::make(txn, HostAndPort(*i))); } catch (DBException&) { LOG(1) << "replSet exception trying to load config from discovered " "seed " << *i << rsLog; replSettings.discoveredSeeds.erase(*i); } } } } if (!replSettings.reconfig.isEmpty()) { try { configs.mutableVector().push_back(ReplSetConfig::make(txn, replSettings.reconfig, true)); } catch (DBException& re) { log() << "replSet couldn't load reconfig: " << re.what() << rsLog; replSettings.reconfig = BSONObj(); } } int nok = 0; int nempty = 0; for (vector<ReplSetConfig*>::iterator i = configs.mutableVector().begin(); i != configs.mutableVector().end(); i++) { if ((*i)->ok()) nok++; if ((*i)->empty()) nempty++; } if (nok == 0) { if (nempty == (int) configs.mutableVector().size()) { startupStatus = EMPTYCONFIG; startupStatusMsg.set("can't get " + rsConfigNs + " config from self or any seed (EMPTYCONFIG)"); log() << "replSet can't get " << rsConfigNs << " config from self or any seed (EMPTYCONFIG)" << rsLog; static unsigned once; if (++once == 1) { log() << "replSet info you may need to run replSetInitiate -- rs.initia" "te() in the shell -- if that is not already done" << rsLog; } if (_seeds->size() == 0) { LOG(1) << "replSet info no seed hosts were specified on the --replSet " "command line" << rsLog; } } else { startupStatus = EMPTYUNREACHABLE; startupStatusMsg.set("can't currently get " + rsConfigNs + " config from self or any seed (EMPTYUNREACHABLE)"); log() << "replSet can't get " << rsConfigNs << " config from self or any seed (yet)" << rsLog; } sleepsecs(1); continue; } if (!_loadConfigFinish(txn, configs.mutableVector())) { log() << "replSet info Couldn't load config yet. Sleeping 3 sec and will try " "again." << rsLog; sleepsecs(3); continue; } } catch (DBException& e) { startupStatus = BADCONFIG; startupStatusMsg.set("replSet error loading set config (BADCONFIG)"); log() << "replSet error loading configurations " << e.toString() << rsLog; log() << "replSet error replication will not start" << rsLog; sethbmsg("error loading set config"); fassertFailedNoTrace(18754); throw; } break; } startupStatusMsg.set("? started"); startupStatus = STARTED; }
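loadConfig() pushes each candidate config into the owned vector only when its construction succeeds, so seeds that throw simply contribute nothing, and it then counts how many of the gathered configs are usable. The sketch below follows that gather-then-count shape; ReplConfig and makeConfig() are invented stand-ins, not the replica-set config loader.

// Minimal sketch (hypothetical makeConfig source): collect configs from several
// seeds, skipping any seed whose load throws, then count how many are usable,
// following the shape of the config-gathering loop above.
#include <iostream>
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>

struct ReplConfig { bool ok; std::string source; };

// Hypothetical loader: throws for unreachable seeds.
static std::unique_ptr<ReplConfig> makeConfig(const std::string& seed) {
    if (seed == "unreachable:27017")
        throw std::runtime_error("cannot reach " + seed);
    return std::make_unique<ReplConfig>(ReplConfig{true, seed});
}

int main() {
    const std::vector<std::string> seeds = {"self:27017", "unreachable:27017", "seedB:27017"};
    std::vector<std::unique_ptr<ReplConfig>> configs;

    for (const std::string& seed : seeds) {
        try {
            configs.push_back(makeConfig(seed));   // ownership taken only on success
        } catch (const std::exception& e) {
            std::cout << "skipping seed: " << e.what() << "\n";
        }
    }

    int nok = 0;
    for (const auto& c : configs)
        if (c->ok) ++nok;
    std::cout << nok << " usable config(s)\n";
    return 0;
}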