/* get a table scan cursor, but can be forward or reverse direction.
   order.$natural - if set, > 0 means forward (asc), < 0 backward (desc).
*/
shared_ptr<Cursor> findTableScan(const char *ns, const BSONObj& order, const DiskLoc &startLoc) {
    BSONElement el = order.getField("$natural"); // e.g., { $natural : -1 }

    if ( el.number() >= 0 )
        return DataFileMgr::findAll(ns, startLoc);

    // "reverse natural order"
    NamespaceDetails *d = nsdetails(ns);

    if ( !d )
        return shared_ptr<Cursor>(new BasicCursor(DiskLoc()));

    if ( !d->isCapped() ) {
        if ( !startLoc.isNull() )
            return shared_ptr<Cursor>(new ReverseCursor( startLoc ));
        Extent *e = d->lastExtent().ext();
        while ( e->lastRecord.isNull() && !e->xprev.isNull() ) {
            OCCASIONALLY out() << "  findTableScan: extent empty, skipping ahead" << endl;
            e = e->getPrevExtent();
        }
        return shared_ptr<Cursor>(new ReverseCursor( e->lastRecord ));
    }
    else {
        return shared_ptr<Cursor>( new ReverseCappedCursor( d, startLoc ) );
    }
}
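// --------------------------------------------------------------------------
// Illustrative sketch (not from the original source): how a caller might use
// findTableScan() above to walk a collection in reverse natural order. The
// namespace "test.events" is hypothetical, and the caller is assumed to
// already hold a read lock on the database -- findTableScan() does not lock.
// --------------------------------------------------------------------------
void walkReverse() {
    shared_ptr<Cursor> c = findTableScan("test.events",
                                         BSON("$natural" << -1), // < 0 => reverse scan
                                         DiskLoc());             // null start: begin at the end
    for ( ; c->ok(); c->advance() ) {
        // Cursor::current() assumed available, as elsewhere in this code base
        log() << c->current() << endl;
    }
}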
bool compact(const string& ns, string &errmsg, bool validate, BSONObjBuilder& result, double pf, int pb) {
    massert( 14028, "bad ns", NamespaceString::normal(ns.c_str()) );
    // items in system.indexes cannot be moved; there are pointers to those disklocs in NamespaceDetails
    massert( 14027, "can't compact a system namespace", !str::contains(ns, ".system.") );

    bool ok;
    {
        Lock::DBWrite lk(ns);
        BackgroundOperation::assertNoBgOpInProgForNs(ns.c_str());
        Client::Context ctx(ns);
        NamespaceDetails *d = nsdetails(ns.c_str());
        massert( 13660, str::stream() << "namespace " << ns << " does not exist", d );
        massert( 13661, "cannot compact capped collection", !d->isCapped() );
        log() << "compact " << ns << " begin" << endl;
        if( pf != 0 || pb != 0 ) {
            log() << "paddingFactor:" << pf << " paddingBytes:" << pb << endl;
        }
        try {
            ok = _compact(ns.c_str(), d, errmsg, validate, result, pf, pb);
        }
        catch(...) {
            log() << "compact " << ns << " end (with error)" << endl;
            throw;
        }
        log() << "compact " << ns << " end" << endl;
    }
    return ok;
}
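// --------------------------------------------------------------------------
// Illustrative sketch (not from the original source): a minimal caller for
// compact() above. The namespace is hypothetical; compact() acquires its own
// DB write lock, so no lock is taken here.
// --------------------------------------------------------------------------
void compactExample() {
    string errmsg;
    BSONObjBuilder result;
    // paddingFactor 1.0 / paddingBytes 0 mirror the defaults used by the command.
    bool ok = compact("test.people", errmsg, /*validate=*/true, result, 1.0, 0);
    if ( !ok )
        log() << "compact failed: " << errmsg << endl;
}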
BSONObj Sync::getMissingDoc(const BSONObj& o) {
    OplogReader missingObjReader;
    const char *ns = o.getStringField("ns");

    // capped collections
    NamespaceDetails *nsd = nsdetails(ns);
    if ( nsd && nsd->isCapped() ) {
        log() << "replication missing doc, but this is okay for a capped collection (" << ns << ")" << endl;
        return BSONObj();
    }

    uassert(15916,
            str::stream() << "Can no longer connect to initial sync source: " << hn,
            missingObjReader.connect(hn));

    // might be more than just _id in the update criteria
    BSONObj query = BSONObjBuilder().append(o.getObjectField("o2")["_id"]).obj();
    BSONObj missingObj;
    try {
        missingObj = missingObjReader.findOne(ns, query);
    }
    catch(DBException& e) {
        log() << "replication assertion fetching missing object: " << e.what() << endl;
        throw;
    }
    return missingObj;
}
/** @param fromRepl false if from ApplyOpsCmd
    @return true if the operation was an update that should have happened but the document DNE.
            see replset initial sync code.
*/
bool applyOperation_inlock(const BSONObj& op, bool fromRepl, bool convertUpdateToUpsert) {
    LOG(6) << "applying op: " << op << endl;
    bool failedUpdate = false;

    OpCounters * opCounters = fromRepl ? &replOpCounters : &globalOpCounters;

    const char *names[] = { "o", "ns", "op", "b" };
    BSONElement fields[4];
    op.getFields(4, names, fields);

    BSONObj o;
    if( fields[0].isABSONObj() )
        o = fields[0].embeddedObject();

    const char *ns = fields[1].valuestrsafe();
    Lock::assertWriteLocked(ns);

    NamespaceDetails *nsd = nsdetails(ns);

    // operation type -- see logOp() comments for types
    const char *opType = fields[2].valuestrsafe();

    if ( *opType == 'i' ) {
        opCounters->gotInsert();

        const char *p = strchr(ns, '.');
        if ( p && strcmp(p, ".system.indexes") == 0 ) {
            // updates aren't allowed for indexes -- so we will do a regular insert. if index already
            // exists, that is ok.
            theDataFileMgr.insert(ns, (void*) o.objdata(), o.objsize());
        }
        else {
            // do upserts for inserts as we might get replayed more than once
            OpDebug debug;
            BSONElement _id;
            if( !o.getObjectID(_id) ) {
                /* No _id.  This will be very slow. */
                Timer t;
                updateObjectsForReplication(ns, o, o, true, false, false, debug, false,
                                            QueryPlanSelectionPolicy::idElseNatural() );
                if( t.millis() >= 2 ) {
                    RARELY OCCASIONALLY log() << "warning, repl doing slow updates (no _id field) for " << ns << endl;
                }
            }
            else {
                // probably don't need this since all replicated colls have _id indexes now
                // but keep it just in case
                RARELY if ( nsd && !nsd->isCapped() ) { ensureHaveIdIndex(ns); }

                /* todo : it may be better to do an insert here, and then catch the dup key exception
                          and do update then.  very few upserts will not be inserts...
                */
                BSONObjBuilder b;
                b.append(_id);
                updateObjectsForReplication(ns, o, b.done(), true, false, false, debug, false,
                                            QueryPlanSelectionPolicy::idElseNatural() );
            }
        }
    }
void operator()(DBClientCursorBatchIterator &i) {
    const string to_dbname = nsToDatabase(to_collection);
    while (i.moreInCurrentBatch()) {
        if (n % 128 == 127) {
            time_t now = time(0);
            if (now - lastLog >= 60) {
                // report progress
                if (lastLog) {
                    log() << "clone " << to_collection << ' ' << n << endl;
                }
                lastLog = now;
            }
            mayInterrupt(_mayBeInterrupted);
        }

        BSONObj js = i.nextSafe();
        ++n;

        if (isindex) {
            verify(strstr(from_collection, "system.indexes"));
            storedForLater->push_back(fixindex(js, to_dbname).getOwned());
        }
        else {
            try {
                Client::ReadContext ctx(to_collection);
                if (_isCapped) {
                    NamespaceDetails *d = nsdetails(to_collection);
                    verify(d->isCapped());
                    BSONObj pk = js["$_"].Obj();
                    BSONObjBuilder rowBuilder;
                    BSONObjIterator it(js);
                    while (it.moreWithEOO()) {
                        BSONElement e = it.next();
                        if (e.eoo()) {
                            break;
                        }
                        if (!mongoutils::str::equals(e.fieldName(), "$_")) {
                            rowBuilder.append(e);
                        }
                    }
                    BSONObj row = rowBuilder.obj();
                    d->insertObjectIntoCappedWithPK(pk, row, NamespaceDetails::NO_LOCKTREE);
                }
                else {
                    insertObject(to_collection, js, 0, logForRepl);
                }
            }
            catch (UserException& e) {
                error() << "error: exception cloning object in " << from_collection << ' '
                        << e.what() << " obj:" << js.toString() << '\n';
                throw;
            }
            RARELY if ( time( 0 ) - saveLast > 60 ) {
                log() << n << " objects cloned so far from collection " << from_collection << endl;
                saveLast = time( 0 );
            }
        }
    }
}
NamespaceDetails* NamespaceIndex::details(const Namespace& ns) {
    if ( !_ht )
        return 0;
    NamespaceDetails *d = _ht->get(ns);
    if ( d && d->isCapped() )
        d->cappedCheckMigrate();
    return d;
}
BSONObj Sync::getMissingDoc(const BSONObj& o) {
    OplogReader missingObjReader; // why are we using OplogReader to run a non-oplog query?
    const char *ns = o.getStringField("ns");

    // capped collections
    NamespaceDetails *nsd = nsdetails(ns);
    if ( nsd && nsd->isCapped() ) {
        log() << "replication missing doc, but this is okay for a capped collection (" << ns << ")" << endl;
        return BSONObj();
    }

    const int retryMax = 3;
    for (int retryCount = 1; retryCount <= retryMax; ++retryCount) {
        if (retryCount != 1) {
            // if we are retrying, sleep a bit to let the network possibly recover
            sleepsecs(retryCount * retryCount);
        }
        try {
            bool ok = missingObjReader.connect(hn);
            if (!ok) {
                warning() << "network problem detected while connecting to the "
                          << "sync source, attempt " << retryCount << " of "
                          << retryMax << endl;
                continue; // try again
            }
        }
        catch (const SocketException&) {
            warning() << "network problem detected while connecting to the "
                      << "sync source, attempt " << retryCount << " of "
                      << retryMax << endl;
            continue; // try again
        }

        // might be more than just _id in the update criteria
        BSONObj query = BSONObjBuilder().append(o.getObjectField("o2")["_id"]).obj();
        BSONObj missingObj;
        try {
            missingObj = missingObjReader.findOne(ns, query);
        }
        catch (const SocketException&) {
            warning() << "network problem detected while fetching a missing document from the "
                      << "sync source, attempt " << retryCount << " of "
                      << retryMax << endl;
            continue; // try again
        }
        catch (DBException& e) {
            log() << "replication assertion fetching missing object: " << e.what() << endl;
            throw;
        }

        // success!
        return missingObj;
    }
    // retry count exceeded
    msgasserted(15916,
                str::stream() << "Can no longer connect to initial sync source: " << hn);
}
// prefetch for an oplog operation
void prefetchPagesForReplicatedOp(const BSONObj& op) {
    const char *opField;
    const char *opType = op.getStringField("op");
    switch (*opType) {
    case 'i': // insert
    case 'd': // delete
        opField = "o";
        break;
    case 'u': // update
        opField = "o2";
        break;
    default:
        // prefetch ignores other ops
        return;
    }

    BSONObj obj = op.getObjectField(opField);
    const char *ns = op.getStringField("ns");
    NamespaceDetails *nsd = nsdetails(ns);
    if (!nsd) return; // maybe not opened yet

    LOG(4) << "index prefetch for op " << *opType << endl;

    DEV Lock::assertAtLeastReadLocked(ns);

    // should we prefetch index pages on updates? if the update is in-place and doesn't change
    // indexed values, it is actually slower - a lot slower if there are a dozen indexes or
    // lots of multikeys.  possible variations (not all mutually exclusive):
    //  1) current behavior: full prefetch
    //  2) don't do it for updates
    //  3) don't do multikey indexes for updates
    //  4) don't prefetchIndexPages on some heuristic; e.g., if it's an $inc.
    //  5) if not prefetching index pages (#2), we should do it if we are upserting and it
    //     will be an insert.  to do that we could do the prefetchRecordPage first and if DNE
    //     then we do #1.
    //
    // note that on deletes 'obj' does not have all the keys we would want to prefetch on.
    // a way to achieve that would be to prefetch the record first, and then afterwards do
    // this part.
    //
    prefetchIndexPages(nsd, obj);

    // do not prefetch the data for inserts; it doesn't exist yet
    //
    // we should consider doing the record prefetch for the delete op case as we hit the record
    // when we delete.  note if done we only want to touch the first page.
    //
    // update: do record prefetch.
    if ((*opType == 'u') &&
        // do not prefetch the data for capped collections because
        // they typically do not have an _id index for findById() to use.
        !nsd->isCapped()) {
        prefetchRecordPages(ns, obj);
    }
}
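// --------------------------------------------------------------------------
// Illustrative sketch (not from the original source): the oplog entry shapes
// that prefetchPagesForReplicatedOp() inspects, per the switch above -- "o"
// for inserts/deletes and "o2" for updates. Namespace and values are
// hypothetical; a read lock on the ns is assumed to be held (see the DEV
// assertion above).
// --------------------------------------------------------------------------
void prefetchExample() {
    BSONObj insertOp = BSON( "op" << "i" << "ns" << "test.users"
                                  << "o" << BSON( "_id" << 1 << "name" << "alice" ) );
    BSONObj updateOp = BSON( "op" << "u" << "ns" << "test.users"
                                  << "o" << BSON( "$set" << BSON( "name" << "bob" ) )
                                  << "o2" << BSON( "_id" << 1 ) );
    prefetchPagesForReplicatedOp(insertOp); // index pages only: the record doesn't exist yet
    prefetchPagesForReplicatedOp(updateOp); // index pages, then record pages (non-capped only)
}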
virtual bool run(const string& db, BSONObj& cmdObj, int, string& errmsg,
                 BSONObjBuilder& result, bool fromRepl) {
    string coll = cmdObj.firstElement().valuestr();
    if( coll.empty() || db.empty() ) {
        errmsg = "no collection name specified";
        return false;
    }

    if( isCurrentlyAReplSetPrimary() && !cmdObj["force"].trueValue() ) {
        errmsg = "will not run compact on an active replica set primary as this is a slow blocking operation. use force:true to force";
        return false;
    }

    string ns = db + '.' + coll;
    if ( ! NamespaceString::normal(ns.c_str()) ) {
        errmsg = "bad namespace name";
        return false;
    }

    // parameter validation to avoid triggering assertions in compact()
    if ( str::contains(ns, ".system.") ) {
        errmsg = "can't compact a system namespace";
        return false;
    }

    {
        Lock::DBWrite lk(ns);
        Client::Context ctx(ns);
        NamespaceDetails *d = nsdetails(ns.c_str());
        if( ! d ) {
            errmsg = "namespace does not exist";
            return false;
        }

        if ( d->isCapped() ) {
            errmsg = "cannot compact a capped collection";
            return false;
        }
    }

    double pf = 1.0;
    int pb = 0;

    if( cmdObj.hasElement("paddingFactor") ) {
        pf = cmdObj["paddingFactor"].Number();
        verify( pf >= 1.0 && pf <= 4.0 );
    }
    if( cmdObj.hasElement("paddingBytes") ) {
        pb = (int) cmdObj["paddingBytes"].Number();
        verify( pb >= 0 && pb <= 1024 * 1024 );
    }

    bool validate = !cmdObj.hasElement("validate") || cmdObj["validate"].trueValue(); // default is true at the moment

    bool ok = compact(ns, errmsg, validate, result, pf, pb);
    return ok;
}
void insertObjects(const char *ns, const vector<BSONObj> &objs, bool keepGoing, uint64_t flags, bool logop ) {
    if (mongoutils::str::contains(ns, "system.")) {
        massert(16748, "need transaction to run insertObjects", cc().txnStackSize() > 0);
        uassert(10095, "attempt to insert in reserved database name 'system'",
                !mongoutils::str::startsWith(ns, "system."));
        massert(16750, "attempted to insert multiple objects into a system namespace at once",
                objs.size() == 1);
        if (handle_system_collection_insert(ns, objs[0], logop) != 0) {
            return;
        }
    }

    NamespaceDetails *details = getAndMaybeCreateNS(ns, logop);
    NamespaceDetailsTransient *nsdt = &NamespaceDetailsTransient::get(ns);
    for (size_t i = 0; i < objs.size(); i++) {
        const BSONObj &obj = objs[i];
        try {
            uassert( 10059 , "object to insert too large", obj.objsize() <= BSONObjMaxUserSize);
            BSONObjIterator it( obj );
            while ( it.more() ) {
                BSONElement e = it.next();
                uassert( 13511 , "document to insert can't have $ fields" , e.fieldName()[0] != '$' );
            }
            uassert( 16440 , "_id cannot be an array", obj["_id"].type() != Array );

            BSONObj objModified = obj;
            BSONElementManipulator::lookForTimestamps(objModified);
            if (details->isCapped() && logop) {
                // unfortunate hack we need for capped collections
                // we do this because the logic for generating the pk
                // and what subsequent rows to delete are buried in the
                // namespace details object. There is probably a nicer way
                // to do this, but this works.
                details->insertObjectIntoCappedAndLogOps(objModified, flags);
                if (nsdt != NULL) {
                    nsdt->notifyOfWriteOp();
                }
            }
            else {
                insertOneObject(details, nsdt, objModified, flags); // may add _id field
                if (logop) {
                    OpLogHelpers::logInsert(ns, objModified, &cc().txn());
                }
            }
        }
        catch (const UserException &) {
            if (!keepGoing || i == objs.size() - 1) {
                throw;
            }
        }
    }
}
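// --------------------------------------------------------------------------
// Illustrative sketch (not from the original source): calling insertObjects()
// above for an ordinary (non-system) namespace. The ns and documents are
// hypothetical; a write-locked context and, for system namespaces, an open
// transaction (see the txnStackSize() check above) are assumed.
// --------------------------------------------------------------------------
void insertExample() {
    vector<BSONObj> docs;
    docs.push_back(BSON( "_id" << 1 << "x" << 10 ));
    docs.push_back(BSON( "_id" << 2 << "x" << 20 ));
    // keepGoing=false: stop at the first failed insert; logop=true: write oplog entries.
    insertObjects("test.points", docs, false, 0, true);
}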
shared_ptr<Cursor> DataFileMgr::findAll(const StringData& ns, const DiskLoc &startLoc) {
    NamespaceDetails * d = nsdetails( ns );
    if ( ! d )
        return shared_ptr<Cursor>(new BasicCursor(DiskLoc()));

    DiskLoc loc = d->firstExtent();
    if ( loc.isNull() )
        return shared_ptr<Cursor>(new BasicCursor(DiskLoc()));
    Extent *e = getExtent(loc);

    DEBUGGING {
        out() << "listing extents for " << ns << endl;
        DiskLoc tmp = loc;
        set<DiskLoc> extents;

        while ( 1 ) {
            Extent *f = getExtent(tmp);
            out() << "extent: " << tmp.toString() << endl;
            extents.insert(tmp);
            tmp = f->xnext;
            if ( tmp.isNull() )
                break;
            f = f->getNextExtent();
        }

        out() << endl;
        d->dumpDeleted(&extents);
    }

    if ( d->isCapped() )
        return shared_ptr<Cursor>( ForwardCappedCursor::make( d , startLoc ) );

    if ( !startLoc.isNull() )
        return shared_ptr<Cursor>(new BasicCursor( startLoc ));

    while ( e->firstRecord.isNull() && !e->xnext.isNull() ) {
        /* todo: if extent is empty, free it for reuse elsewhere.
           that is a bit complicated have to clean up the freelists.
        */
        RARELY out() << "info DFM::findAll(): extent " << loc.toString()
                     << " was empty, skipping ahead. ns:" << ns << endl;
        // find a nonempty extent
        // it might be nice to free the whole extent here!  but have to clean up free recs then.
        e = e->getNextExtent();
    }
    return shared_ptr<Cursor>(new BasicCursor( e->firstRecord ));
}
// Does not check magic system collection inserts.
void _insertObjects(const char *ns, const vector<BSONObj> &objs, bool keepGoing, uint64_t flags, bool logop ) {
    NamespaceDetails *details = getAndMaybeCreateNS(ns, logop);
    for (size_t i = 0; i < objs.size(); i++) {
        const BSONObj &obj = objs[i];
        try {
            uassert( 10059 , "object to insert too large", obj.objsize() <= BSONObjMaxUserSize);
            BSONObjIterator it( obj );
            while ( it.more() ) {
                BSONElement e = it.next();
                // check no $ modifiers.  note we only check top level.
                // (scanning deep would be quite expensive)
                uassert( 13511 , "document to insert can't have $ fields" , e.fieldName()[0] != '$' );
                // check no regexp for _id (SERVER-9502)
                if (str::equals(e.fieldName(), "_id")) {
                    uassert(17033, "can't use a regex for _id", e.type() != RegEx);
                }
            }
            uassert( 16440 , "_id cannot be an array", obj["_id"].type() != Array );

            BSONObj objModified = obj;
            BSONElementManipulator::lookForTimestamps(objModified);
            if (details->isCapped() && logop) {
                // unfortunate hack we need for capped collections
                // we do this because the logic for generating the pk
                // and what subsequent rows to delete are buried in the
                // namespace details object. There is probably a nicer way
                // to do this, but this works.
                details->insertObjectIntoCappedAndLogOps(objModified, flags);
                details->notifyOfWriteOp();
            }
            else {
                insertOneObject(details, objModified, flags); // may add _id field
                if (logop) {
                    OpLogHelpers::logInsert(ns, objModified);
                }
            }
        }
        catch (const UserException &) {
            if (!keepGoing || i == objs.size() - 1) {
                throw;
            }
        }
    }
}
NamespaceDetails* getOrCreateProfileCollection(Database *db, bool force, string* errmsg ) {
    fassert(16372, db);
    const char* profileName = db->profileName.c_str();
    NamespaceDetails* details = db->namespaceIndex.details(profileName);
    if (!details && (cmdLine.defaultProfile || force)) {
        // system.profile namespace doesn't exist; create it
        log() << "creating profile collection: " << profileName << endl;
        string myerrmsg;
        if (!userCreateNS(db->profileName.c_str(),
                          BSON("capped" << true << "size" << 1024 * 1024), myerrmsg , false)) {
            myerrmsg = str::stream() << "could not create ns " << db->profileName << ": " << myerrmsg;
            log() << myerrmsg << endl;
            if ( errmsg )
                *errmsg = myerrmsg;
            return NULL;
        }
        details = db->namespaceIndex.details(profileName);
    }
    else if ( details && !details->isCapped() ) {
        string myerrmsg = str::stream() << profileName << " exists but isn't capped";
        log() << myerrmsg << endl;
        if ( errmsg )
            *errmsg = myerrmsg;
        return NULL;
    }

    if (!details) {
        // failed to get or create profile collection
        static time_t last = time(0) - 10;  // warn the first time
        if( time(0) > last+10 ) {
            log() << "profile: warning ns " << profileName << " does not exist" << endl;
            last = time(0);
        }
    }
    return details;
}
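// --------------------------------------------------------------------------
// Illustrative sketch (not from the original source): forcing creation of the
// capped system.profile collection via getOrCreateProfileCollection() above.
// Assumes a Client with an open, write-locked database context.
// --------------------------------------------------------------------------
void ensureProfileCollection() {
    string errmsg;
    NamespaceDetails* profile = getOrCreateProfileCollection(cc().database(), /*force=*/true, &errmsg);
    if ( !profile )
        log() << "profiling unavailable: " << errmsg << endl;
}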
/* ns:      namespace, e.g. <database>.<collection>
   pattern: the "where" clause / criteria
   justOne: stop after 1 match
   god:     allow access to system namespaces, and don't yield
*/
long long deleteObjects(const char *ns, BSONObj pattern, bool justOne, bool logop, bool god, RemoveSaver * rs ) {
    if( !god ) {
        if ( strstr(ns, ".system.") ) {
            /* note a delete from system.indexes would corrupt the db if done here, as there are
               pointers into those objects in NamespaceDetails.
            */
            uassert(12050, "cannot delete from system namespace", legalClientSystemNS( ns , true ) );
        }
        if ( strchr( ns , '$' ) ) {
            log() << "cannot delete from collection with reserved $ in name: " << ns << endl;
            uassert( 10100 , "cannot delete from collection with reserved $ in name", strchr(ns, '$') == 0 );
        }
    }

    {
        NamespaceDetails *d = nsdetails( ns );
        if ( ! d )
            return 0;
        uassert( 10101 , "can't remove from a capped collection" , ! d->isCapped() );
    }

    long long nDeleted = 0;

    shared_ptr< Cursor > creal = NamespaceDetailsTransient::getCursor( ns, pattern );

    if( !creal->ok() )
        return nDeleted;

    shared_ptr< Cursor > cPtr = creal;
    auto_ptr<ClientCursor> cc( new ClientCursor( QueryOption_NoCursorTimeout, cPtr, ns) );
    cc->setDoingDeletes( true );

    CursorId id = cc->cursorid();

    bool canYield = !god && !(creal->matcher() && creal->matcher()->docMatcher().atomic());

    do {
        // TODO: we can generalize this I believe
        //
        bool willNeedRecord = (creal->matcher() && creal->matcher()->needRecord()) ||
                              pattern.isEmpty() ||
                              isSimpleIdQuery( pattern );
        if ( ! willNeedRecord ) {
            // TODO: this is a total hack right now
            // check if the index full encompasses query

            if ( pattern.nFields() == 1 &&
                 str::equals( pattern.firstElement().fieldName() ,
                              creal->indexKeyPattern().firstElement().fieldName() ) )
                willNeedRecord = true;
        }

        if ( canYield && ! cc->yieldSometimes( willNeedRecord ? ClientCursor::WillNeed :
                                                                ClientCursor::MaybeCovered ) ) {
            cc.release(); // has already been deleted elsewhere
            // TODO should we assert or something?
            break;
        }
        if ( !cc->ok() ) {
            break; // if we yielded, could have hit the end
        }

        // this way we can avoid calling prepareToYield() every time (expensive)
        // as well as some other nuances handled
        cc->setDoingDeletes( true );

        DiskLoc rloc = cc->currLoc();
        BSONObj key = cc->currKey();

        bool match = creal->currentMatches();

        cc->advance();

        if ( ! match )
            continue;

        // SERVER-5198 Advance past the document to be modified, but see SERVER-5725.
        while( cc->ok() && rloc == cc->currLoc() ) {
            cc->advance();
        }

        bool foundAllResults = ( justOne || !cc->ok() );

        if ( !foundAllResults ) {
            // NOTE: Saving and restoring a btree cursor's position was historically described
            // as slow here.
            cc->c()->prepareToTouchEarlierIterate();
        }

        if ( logop ) {
            BSONElement e;
            if( BSONObj::make( rloc.rec() ).getObjectID( e ) ) {
                BSONObjBuilder b;
                b.append( e );
                bool replJustOne = true;
                logOp( "d", ns, b.done(), 0, &replJustOne );
            }
            else {
                problem() << "deleted object without id, not logging" << endl;
            }
        }

        theDataFileMgr.deleteRecord(ns, rloc.rec(), rloc);
        nDeleted++;
        if ( foundAllResults ) {
            break;
        }
        cc->c()->recoverFromTouchingEarlierIterate();

        if( !god )
            getDur().commitIfNeeded();

        if( debug && god && nDeleted == 100 )
            log() << "warning high number of deletes with god=true which could use significant memory" << endl;
    }
    while ( cc->ok() );

    if ( cc.get() && ClientCursor::find( id , false ) == 0 ) {
        // TODO: remove this and the id declaration above if this doesn't trigger
        // if it does, then i'm very confused (ERH 06/2011)
        error() << "this should be impossible" << endl;
        printStackTrace();
        cc.release();
    }

    return nDeleted;
}
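// --------------------------------------------------------------------------
// Illustrative sketch (not from the original source): a caller of
// deleteObjects() above. The namespace and criteria are hypothetical; the
// caller is assumed to hold the DB write lock, and logop=true replicates the
// deletes via logOp("d", ...) as shown above.
// --------------------------------------------------------------------------
void purgeExpiredSessions() {
    long long nDeleted = deleteObjects("test.sessions",
                                       BSON( "expired" << true ),
                                       /*justOne=*/false,
                                       /*logop=*/true,
                                       /*god=*/false,
                                       /*rs=*/NULL);
    log() << "removed " << nDeleted << " expired sessions" << endl;
}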
/** @param fromRepl false if from ApplyOpsCmd
    @return true if the operation was an update that should have happened but the document DNE.
            see replset initial sync code.
*/
bool applyOperation_inlock(const BSONObj& op, bool fromRepl, bool convertUpdateToUpsert) {
    LOG(3) << "applying op: " << op << endl;
    bool failedUpdate = false;

    OpCounters * opCounters = fromRepl ? &replOpCounters : &globalOpCounters;

    const char *names[] = { "o", "ns", "op", "b" };
    BSONElement fields[4];
    op.getFields(4, names, fields);

    BSONObj o;
    if( fields[0].isABSONObj() )
        o = fields[0].embeddedObject();

    const char *ns = fields[1].valuestrsafe();
    Lock::assertWriteLocked(ns);

    NamespaceDetails *nsd = nsdetails(ns);

    // operation type -- see logOp() comments for types
    const char *opType = fields[2].valuestrsafe();

    if ( *opType == 'i' ) {
        opCounters->gotInsert();

        const char *p = strchr(ns, '.');
        if ( p && strcmp(p, ".system.indexes") == 0 ) {
            if (o["background"].trueValue()) {
                IndexBuilder* builder = new IndexBuilder(ns, o);
                // This spawns a new thread and returns immediately.
                builder->go();
            }
            else {
                IndexBuilder builder(ns, o);
                // Finish the foreground build before returning
                builder.build();
            }
        }
        else {
            // do upserts for inserts as we might get replayed more than once
            OpDebug debug;
            BSONElement _id;
            if( !o.getObjectID(_id) ) {
                /* No _id.  This will be very slow. */
                Timer t;

                const NamespaceString requestNs(ns);
                UpdateRequest request(requestNs, debug,
                                      QueryPlanSelectionPolicy::idElseNatural());

                request.setQuery(o);
                request.setUpdates(o);
                request.setUpsert();
                request.setFromReplication();

                update(request);

                if( t.millis() >= 2 ) {
                    RARELY OCCASIONALLY log() << "warning, repl doing slow updates (no _id field) for " << ns << endl;
                }
            }
            else {
                // probably don't need this since all replicated colls have _id indexes now
                // but keep it just in case
                RARELY if ( nsd && !nsd->isCapped() ) { ensureHaveIdIndex(ns, false); }

                /* todo : it may be better to do an insert here, and then catch the dup key exception
                          and do update then.  very few upserts will not be inserts...
                */
                BSONObjBuilder b;
                b.append(_id);

                const NamespaceString requestNs(ns);
                UpdateRequest request(requestNs, debug,
                                      QueryPlanSelectionPolicy::idElseNatural());

                request.setQuery(b.done());
                request.setUpdates(o);
                request.setUpsert();
                request.setFromReplication();

                update(request);
            }
        }
    }
DiskLoc DataFileMgr::insert(const char* ns,
                            const void* obuf,
                            int32_t len,
                            bool mayInterrupt,
                            bool god,
                            bool mayAddIndex,
                            bool* addedID) {
    Database* database = cc().database();

    bool wouldAddIndex = false;
    massert( 10093 , "cannot insert into reserved $ collection", god || NamespaceString::normal( ns ) );
    uassert( 10094 , str::stream() << "invalid ns: " << ns , isValidNS( ns ) );
    {
        const char *sys = strstr(ns, "system.");
        if ( sys && !insert_checkSys(sys, ns, wouldAddIndex, obuf, god) )
            return DiskLoc();
    }
    bool addIndex = wouldAddIndex && mayAddIndex;

    Collection* collection = database->getCollection( ns );
    if ( collection == NULL ) {
        collection = database->createCollection( ns, false, NULL );

        int ies = Extent::initialSize(len);
        if( str::contains(ns, '$') &&
            len + Record::HeaderSize >= BtreeData_V1::BucketSize - 256 &&
            len + Record::HeaderSize <= BtreeData_V1::BucketSize + 256 ) {
            // probably an index.  so we pick a value here for the first extent instead of using
            // initialExtentSize() which is more for user collections.
            // TODO: we could look at the # of records in the parent collection to be smarter here.
            ies = (32+4) * 1024;
        }
        collection->increaseStorageSize( ies, false);
        if ( !god )
            ensureIdIndexForNewNs(ns);
    }

    NamespaceDetails* d = collection->details();

    string tabletoidxns;
    Collection* collectionToIndex = 0;
    NamespaceDetails* tableToIndex = 0;

    BSONObj fixedIndexObject;
    if ( addIndex ) {
        verify( obuf );
        BSONObj io((const char *) obuf);

        tabletoidxns = io.getStringField( "ns" );
        uassert(10096, "invalid ns to index", tabletoidxns.find( '.' ) != string::npos);
        massert(10097,
                str::stream() << "trying to create index on wrong db "
                              << " db: " << database->name() << " collection: " << tabletoidxns,
                database->ownsNS( tabletoidxns ) );

        collectionToIndex = database->getCollection( tabletoidxns );
        if ( !collectionToIndex ) {
            collectionToIndex = database->createCollection( tabletoidxns, false, NULL );
            verify( collectionToIndex );
            if ( !god )
                ensureIdIndexForNewNs( tabletoidxns.c_str() );
        }

        tableToIndex = collectionToIndex->details();

        Status status = collectionToIndex->getIndexCatalog()->okToAddIndex( io );
        if ( status.code() == ErrorCodes::IndexAlreadyExists ) {
            // dup index, we ignore
            return DiskLoc();
        }

        uassert( 17199,
                 str::stream() << "cannot build index on " << tabletoidxns
                               << " because of " << status.toString(),
                 status.isOK() );

        if( !prepareToBuildIndex(io, mayInterrupt, god, tabletoidxns ) ) {
            // prepare creates _id itself, or this indicates to fail the build silently (such
            // as if index already exists)
            return DiskLoc();
        }

        fixedIndexObject = IndexCatalog::fixIndexSpec( io );

        obuf = fixedIndexObject.objdata();
        len = fixedIndexObject.objsize();
    }

    IDToInsert idToInsert; // only initialized if needed

    if( !god ) {
        /* Check if we have an _id field. If we don't, we'll add it.
           Note that btree buckets which we insert aren't BSONObj's, but in that case god==true.
        */
        BSONObj io((const char *) obuf);
        BSONElement idField = io.getField( "_id" );
        uassert( 10099 , "_id cannot be an array", idField.type() != Array );
        // we don't add _id for capped collections in local as they don't have an _id index
        if( idField.eoo() &&
            !wouldAddIndex &&
            nsToDatabase( ns ) != "local" &&
            d->haveIdIndex() ) {

            if( addedID )
                *addedID = true;

            idToInsert.init();
            len += idToInsert.size();
        }

        BSONElementManipulator::lookForTimestamps( io );
    }

    int lenWHdr = d->getRecordAllocationSize( len + Record::HeaderSize );
    fassert( 16440, lenWHdr >= ( len + Record::HeaderSize ) );

    // If the collection is capped, check if the new object will violate a unique index
    // constraint before allocating space.
    if ( d->isCapped() && !god) {
        BSONObj temp = BSONObj( reinterpret_cast<const char *>( obuf ) );
        Status ret = collection->getIndexCatalog()->checkNoIndexConflicts( temp );
        uassert(12582, "duplicate key insert for unique index of capped collection", ret.isOK() );
    }

    DiskLoc loc = allocateSpaceForANewRecord(ns, d, lenWHdr, god);

    if ( loc.isNull() ) {
        log() << "insert: couldn't alloc space for object ns:" << ns
              << " capped:" << d->isCapped() << endl;
        verify(d->isCapped());
        return DiskLoc();
    }

    Record *r = loc.rec();
    {
        verify( r->lengthWithHeaders() >= lenWHdr );
        r = (Record*) getDur().writingPtr(r, lenWHdr);
        if( idToInsert.needed() ) {
            /* a little effort was made here to avoid a double copy when we add an ID */
            int originalSize = *((int*) obuf);
            ((int&)*r->data()) = originalSize + idToInsert.size();
            memcpy(r->data()+4, idToInsert.rawdata(), idToInsert.size());
            memcpy(r->data()+4+idToInsert.size(), ((char*)obuf)+4, originalSize-4);
        }
        else {
            if( obuf ) // obuf can be null from internal callers
                memcpy(r->data(), obuf, len);
        }
    }

    addRecordToRecListInExtent(r, loc);

    d->incrementStats( r->netLength(), 1 );

    // we don't bother resetting query optimizer stats for the god tables - also god is true when adding a btree bucket
    if ( !god )
        collection->infoCache()->notifyOfWriteOp();

    if ( tableToIndex ) {
        insert_makeIndex(collectionToIndex, loc, mayInterrupt);
    }

    /* add this record to our indexes */
    if ( d->getTotalIndexCount() > 0 ) {
        try {
            BSONObj obj(r->data());
            collection->getIndexCatalog()->indexRecord(obj, loc);
        }
        catch( AssertionException& e ) {
            // should be a dup key error on _id index
            if( tableToIndex || d->isCapped() ) {
                massert( 12583, "unexpected index insertion failure on capped collection", !d->isCapped() );
                string s = e.toString();
                s += " : on addIndex/capped - collection and its index will not match";
                setLastError(0, s.c_str());
                error() << s << endl;
            }
            else {
                // normal case -- we can roll back
                _deleteRecord(d, ns, r, loc);
                throw;
            }
        }
    }

    d->paddingFits();

    return loc;
}
/**
 * Run a query -- includes checking for and running a Command.
 * @return points to ns if exhaust mode. 0=normal mode
 * @locks the db mutex for reading (and potentially for writing temporarily to create a new db).
 * @yields the db mutex periodically after acquiring it.
 * @asserts on scan and order memory exhaustion and other cases.
 */
const char *runQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result) {
    shared_ptr<ParsedQuery> pq_shared( new ParsedQuery(q) );
    ParsedQuery& pq( *pq_shared );
    BSONObj jsobj = q.query;
    int queryOptions = q.queryOptions;
    const char *ns = q.ns;

    if( logLevel >= 2 )
        log() << "runQuery called " << ns << " " << jsobj << endl;

    curop.debug().ns = ns;
    curop.debug().ntoreturn = pq.getNumToReturn();
    curop.debug().query = jsobj;
    curop.setQuery(jsobj);

    // Run a command.
    if ( pq.couldBeCommand() ) {
        BufBuilder bb;
        bb.skip(sizeof(QueryResult));
        BSONObjBuilder cmdResBuf;
        if ( runCommands(ns, jsobj, curop, bb, cmdResBuf, false, queryOptions) ) {
            curop.debug().iscommand = true;
            curop.debug().query = jsobj;
            curop.markCommand();

            auto_ptr< QueryResult > qr;
            qr.reset( (QueryResult *) bb.buf() );
            bb.decouple();
            qr->setResultFlagsToOk();
            qr->len = bb.len();
            curop.debug().responseLength = bb.len();
            qr->setOperation(opReply);
            qr->cursorId = 0;
            qr->startingFrom = 0;
            qr->nReturned = 1;
            result.setData( qr.release(), true );
        }
        else {
            uasserted(13530, "bad or malformed command request?");
        }
        return 0;
    }

    bool explain = pq.isExplain();
    BSONObj order = pq.getOrder();
    BSONObj query = pq.getFilter();

    /* The ElemIter will not be happy if this isn't really an object. So throw exception
       here when that is true.
       (Which may indicate bad data from client.)
    */
    if ( query.objsize() == 0 ) {
        out() << "Bad query object?\n  jsobj:";
        out() << jsobj.toString() << "\n  query:";
        out() << query.toString() << endl;
        uassert( 10110 , "bad query object", false);
    }

    Client::ReadContext ctx( ns , dbpath ); // read locks
    const ConfigVersion shardingVersionAtStart = shardingState.getVersion( ns );

    replVerifyReadsOk(&pq);

    if ( pq.hasOption( QueryOption_CursorTailable ) ) {
        NamespaceDetails *d = nsdetails( ns );
        uassert( 13051, "tailable cursor requested on non capped collection", d && d->isCapped() );
        const BSONObj nat1 = BSON( "$natural" << 1 );
        if ( order.isEmpty() ) {
            order = nat1;
        }
        else {
            uassert( 13052, "only {$natural:1} order allowed for tailable cursor", order == nat1 );
        }
    }

    // Run a simple id query.
    if ( ! (explain || pq.showDiskLoc()) &&
         isSimpleIdQuery( query ) &&
         !pq.hasOption( QueryOption_CursorTailable ) ) {

        int n = 0;
        bool nsFound = false;
        bool indexFound = false;

        BSONObj resObject;
        Client& c = cc();
        bool found = Helpers::findById( c, ns , query , resObject , &nsFound , &indexFound );

        if ( nsFound == false || indexFound == true ) {

            if ( shardingState.needShardChunkManager( ns ) ) {
                ShardChunkManagerPtr m = shardingState.getShardChunkManager( ns );
                if ( m && ! m->belongsToMe( resObject ) ) {
                    // I have something for this _id
                    // but it doesn't belong to me
                    // so return nothing
                    resObject = BSONObj();
                    found = false;
                }
            }

            BufBuilder bb(sizeof(QueryResult)+resObject.objsize()+32);
            bb.skip(sizeof(QueryResult));

            curop.debug().idhack = true;
            if ( found ) {
                n = 1;
                fillQueryResultFromObj( bb , pq.getFields() , resObject );
            }

            auto_ptr< QueryResult > qr;
            qr.reset( (QueryResult *) bb.buf() );
            bb.decouple();
            qr->setResultFlagsToOk();
            qr->len = bb.len();

            curop.debug().responseLength = bb.len();
            qr->setOperation(opReply);
            qr->cursorId = 0;
            qr->startingFrom = 0;
            qr->nReturned = n;

            result.setData( qr.release(), true );
            return NULL;
        }
    }

    // Run a regular query.

    BSONObj oldPlan;
    if ( explain && ! pq.hasIndexSpecifier() ) {
        MultiPlanScanner mps( ns, query, order );
        if ( mps.usingCachedPlan() ) {
            oldPlan =
                mps.oldExplain().firstElement().embeddedObject()
                .firstElement().embeddedObject().getOwned();
        }
    }

    // In some cases the query may be retried if there is an in memory sort size assertion.
    for( int retry = 0; retry < 2; ++retry ) {
        try {
            return queryWithQueryOptimizer( m, queryOptions, ns, jsobj, curop, query, order,
                                            pq_shared, oldPlan, shardingVersionAtStart, result );
        }
        catch ( const QueryRetryException & ) {
            verify( retry == 0 );
        }
    }
    verify( false );
    return 0;
}
/**
 * For a given query, get a runner.  The runner could be a SingleSolutionRunner, a
 * CachedQueryRunner, or a MultiPlanRunner, depending on the cache/query solver/etc.
 */
Status getRunner(CanonicalQuery* rawCanonicalQuery, Runner** out) {
    verify(rawCanonicalQuery);
    auto_ptr<CanonicalQuery> canonicalQuery(rawCanonicalQuery);

    // Try to look up a cached solution for the query.
    // TODO: Can the cache have negative data about a solution?
    PlanCache* localCache = PlanCache::get(canonicalQuery->ns());
    if (NULL != localCache) {
        CachedSolution* cs = localCache->get(*canonicalQuery);
        if (NULL != cs) {
            // We have a cached solution.  Hand the canonical query and cached solution off to
            // the cached plan runner, which takes ownership of both.
            WorkingSet* ws;
            PlanStage* root;
            verify(StageBuilder::build(*cs->solution, &root, &ws));
            *out = new CachedPlanRunner(canonicalQuery.release(), cs, root, ws);
            return Status::OK();
        }
    }

    // No entry in cache for the query.  We have to solve the query ourself.

    // Get the indices that we could possibly use.
    NamespaceDetails* nsd = nsdetails(canonicalQuery->ns().c_str());

    // If this is NULL, there is no data but the query is valid.  You're allowed to query for
    // data on an empty collection and it's not an error.  There just isn't any data...
    if (NULL == nsd) {
        const std::string& ns = canonicalQuery->ns();
        *out = new EOFRunner(canonicalQuery.release(), ns);
        return Status::OK();
    }

    // Tailable: If the query requests tailable the collection must be capped.
    if (canonicalQuery->getParsed().hasOption(QueryOption_CursorTailable)) {
        if (!nsd->isCapped()) {
            return Status(ErrorCodes::BadValue,
                          "tailable cursor requested on non capped collection");
        }

        // If a sort is specified it must be equal to expectedSort.
        const BSONObj expectedSort = BSON("$natural" << 1);
        const BSONObj& actualSort = canonicalQuery->getParsed().getSort();
        if (!actualSort.isEmpty() && !(actualSort == expectedSort)) {
            return Status(ErrorCodes::BadValue,
                          "invalid sort specified for tailable cursor: "
                          + actualSort.toString());
        }
    }

    // If it's not NULL, we may have indices.
    vector<IndexEntry> indices;
    for (int i = 0; i < nsd->getCompletedIndexCount(); ++i) {
        auto_ptr<IndexDescriptor> desc(CatalogHack::getDescriptor(nsd, i));
        indices.push_back(IndexEntry(desc->keyPattern(),
                                     desc->isMultikey(),
                                     desc->isSparse(),
                                     desc->indexName()));
    }

    vector<QuerySolution*> solutions;
    size_t options = QueryPlanner::DEFAULT;
    if (storageGlobalParams.noTableScan) {
        const string& ns = canonicalQuery->ns();
        // There are certain cases where we ignore this restriction:
        bool ignore = canonicalQuery->getQueryObj().isEmpty()
                      || (string::npos != ns.find(".system."))
                      || (0 == ns.find("local."));
        if (!ignore) {
            options |= QueryPlanner::NO_TABLE_SCAN;
        }
    }
    else {
        options |= QueryPlanner::INCLUDE_COLLSCAN;
    }
    QueryPlanner::plan(*canonicalQuery, indices, options, &solutions);

    /*
    for (size_t i = 0; i < solutions.size(); ++i) {
        QLOG() << "solution " << i << " is " << solutions[i]->toString() << endl;
    }
    */

    // We cannot figure out how to answer the query.  Should this ever happen?
    if (0 == solutions.size()) {
        return Status(ErrorCodes::BadValue,
                      "Can't create a plan for the canonical query " + canonicalQuery->toString());
    }

    if (1 == solutions.size()) {
        // Only one possible plan.  Run it.  Build the stages from the solution.
        WorkingSet* ws;
        PlanStage* root;
        verify(StageBuilder::build(*solutions[0], &root, &ws));

        // And, run the plan.
        *out = new SingleSolutionRunner(canonicalQuery.release(), solutions[0], root, ws);
        return Status::OK();
    }
    else {
        // Many solutions.  Let the MultiPlanRunner pick the best, update the cache, and so on.
        auto_ptr<MultiPlanRunner> mpr(new MultiPlanRunner(canonicalQuery.release()));
        for (size_t i = 0; i < solutions.size(); ++i) {
            WorkingSet* ws;
            PlanStage* root;
            verify(StageBuilder::build(*solutions[i], &root, &ws));
            // Takes ownership of all arguments.
            mpr->addPlan(solutions[i], root, ws);
        }
        *out = mpr.release();
        return Status::OK();
    }
}
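// --------------------------------------------------------------------------
// Illustrative sketch (not from the original source): driving getRunner()
// above. CanonicalQuery::canonicalize() and uassertStatusOK() are assumed to
// be available in this vintage of the code base; the ns and query are
// hypothetical. getRunner() takes ownership of the CanonicalQuery, and we
// take ownership of the returned Runner.
// --------------------------------------------------------------------------
void runnerExample() {
    CanonicalQuery* cq;
    uassertStatusOK(CanonicalQuery::canonicalize("test.users", BSON( "age" << 30 ), &cq));

    Runner* rawRunner;
    uassertStatusOK(getRunner(cq, &rawRunner)); // owns cq now
    auto_ptr<Runner> runner(rawRunner);

    BSONObj obj;
    while (Runner::RUNNER_ADVANCED == runner->getNext(&obj, NULL)) {
        log() << obj << endl;
    }
}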
void validateNS(const string& ns,
                Collection* collection,
                const BSONObj& cmdObj,
                BSONObjBuilder& result) {
    const bool full = cmdObj["full"].trueValue();
    const bool scanData = full || cmdObj["scandata"].trueValue();

    NamespaceDetails* nsd = collection->details();

    bool valid = true;
    BSONArrayBuilder errors; // explanation(s) for why valid = false

    if ( collection->isCapped() ){
        result.append("capped", nsd->isCapped());
        result.appendNumber("max", nsd->maxCappedDocs());
    }

    if ( nsd->firstExtent().isNull() )
        result.append( "firstExtent", "null" );
    else
        result.append( "firstExtent", str::stream() << nsd->firstExtent().toString()
                       << " ns:" << nsd->firstExtent().ext()->nsDiagnostic.toString());
    if ( nsd->lastExtent().isNull() )
        result.append( "lastExtent", "null" );
    else
        result.append( "lastExtent", str::stream() << nsd->lastExtent().toString()
                       << " ns:" << nsd->lastExtent().ext()->nsDiagnostic.toString());

    BSONArrayBuilder extentData;
    int extentCount = 0;
    try {
        if ( !nsd->firstExtent().isNull() ) {
            nsd->firstExtent().ext()->assertOk();
            nsd->lastExtent().ext()->assertOk();
        }

        DiskLoc extentDiskLoc = nsd->firstExtent();
        while (!extentDiskLoc.isNull()) {
            Extent* thisExtent = extentDiskLoc.ext();
            if (full) {
                extentData << thisExtent->dump();
            }
            if (!thisExtent->validates(extentDiskLoc, &errors)) {
                valid = false;
            }
            DiskLoc nextDiskLoc = thisExtent->xnext;
            if (extentCount > 0 && !nextDiskLoc.isNull()
                && nextDiskLoc.ext()->xprev != extentDiskLoc) {
                StringBuilder sb;
                sb << "'xprev' pointer " << nextDiskLoc.ext()->xprev.toString()
                   << " in extent " << nextDiskLoc.toString()
                   << " does not point to extent " << extentDiskLoc.toString();
                errors << sb.str();
                valid = false;
            }
            if (nextDiskLoc.isNull() && extentDiskLoc != nsd->lastExtent()) {
                StringBuilder sb;
                sb << "'lastExtent' pointer " << nsd->lastExtent().toString()
                   << " does not point to last extent in list " << extentDiskLoc.toString();
                errors << sb.str();
                valid = false;
            }
            extentDiskLoc = nextDiskLoc;
            extentCount++;
            killCurrentOp.checkForInterrupt();
        }
    }
    catch (const DBException& e) {
        StringBuilder sb;
        sb << "exception validating extent " << extentCount
           << ": " << e.what();
        errors << sb.str();
        valid = false;
    }
    result.append("extentCount", extentCount);

    if ( full )
        result.appendArray( "extents" , extentData.arr() );

    result.appendNumber("datasize", nsd->dataSize());
    result.appendNumber("nrecords", nsd->numRecords());
    result.appendNumber("lastExtentSize", nsd->lastExtentSize());
    result.appendNumber("padding", nsd->paddingFactor());

    try {

        bool testingLastExtent = false;
        try {
            if (nsd->firstExtent().isNull()) {
                // this is ok
            }
            else {
                result.append("firstExtentDetails", nsd->firstExtent().ext()->dump());
                if (!nsd->firstExtent().ext()->xprev.isNull()) {
                    StringBuilder sb;
                    sb << "'xprev' pointer in 'firstExtent' " << nsd->firstExtent().toString()
                       << " is " << nsd->firstExtent().ext()->xprev.toString()
                       << ", should be null";
                    errors << sb.str();
                    valid=false;
                }
            }
            testingLastExtent = true;
            if (nsd->lastExtent().isNull()) {
                // this is ok
            }
            else {
                if (nsd->firstExtent() != nsd->lastExtent()) {
                    result.append("lastExtentDetails", nsd->lastExtent().ext()->dump());
                    if (!nsd->lastExtent().ext()->xnext.isNull()) {
                        StringBuilder sb;
                        sb << "'xnext' pointer in 'lastExtent' " << nsd->lastExtent().toString()
                           << " is " << nsd->lastExtent().ext()->xnext.toString()
                           << ", should be null";
                        errors << sb.str();
                        valid = false;
                    }
                }
            }
        }
        catch (const DBException& e) {
            StringBuilder sb;
            sb << "exception processing '"
               << (testingLastExtent ? "lastExtent" : "firstExtent")
               << "': " << e.what();
            errors << sb.str();
            valid = false;
        }

        set<DiskLoc> recs;
        if( scanData ) {
            int n = 0;
            int nInvalid = 0;
            long long nQuantizedSize = 0;
            long long nPowerOf2QuantizedSize = 0;
            long long len = 0;
            long long nlen = 0;
            long long bsonLen = 0;
            int outOfOrder = 0;
            DiskLoc cl_last;

            DiskLoc cl;
            Runner::RunnerState state;
            auto_ptr<Runner> runner(InternalPlanner::collectionScan(ns));
            while (Runner::RUNNER_ADVANCED == (state = runner->getNext(NULL, &cl))) {
                n++;

                if ( n < 1000000 )
                    recs.insert(cl);
                if ( nsd->isCapped() ) {
                    if ( cl < cl_last )
                        outOfOrder++;
                    cl_last = cl;
                }

                Record *r = cl.rec();
                len += r->lengthWithHeaders();
                nlen += r->netLength();

                if ( r->lengthWithHeaders() ==
                        NamespaceDetails::quantizeAllocationSpace( r->lengthWithHeaders() ) ) {
                    // Count the number of records having a size consistent with
                    // the quantizeAllocationSpace quantization implementation.
                    ++nQuantizedSize;
                }

                if ( r->lengthWithHeaders() ==
                        NamespaceDetails::quantizePowerOf2AllocationSpace( r->lengthWithHeaders() - 1 ) ) {
                    // Count the number of records having a size consistent with the
                    // quantizePowerOf2AllocationSpace quantization implementation.
                    // Because of SERVER-8311, power of 2 quantization is not idempotent and
                    // r->lengthWithHeaders() - 1 must be checked instead of the record
                    // length itself.
                    ++nPowerOf2QuantizedSize;
                }

                if (full){
                    BSONObj obj = BSONObj::make(r);
                    if (!obj.isValid() || !obj.valid()){ // both fast and deep checks
                        valid = false;
                        if (nInvalid == 0) // only log once;
                            errors << "invalid bson object detected (see logs for more info)";

                        nInvalid++;
                        if (strcmp("_id", obj.firstElementFieldName()) == 0){
                            try {
                                obj.firstElement().validate(); // throws on error
                                log() << "Invalid bson detected in " << ns
                                      << " with _id: " << obj.firstElement().toString(false) << endl;
                            }
                            catch(...){
                                log() << "Invalid bson detected in " << ns
                                      << " with corrupt _id" << endl;
                            }
                        }
                        else {
                            log() << "Invalid bson detected in " << ns
                                  << " and couldn't find _id" << endl;
                        }
                    }
                    else {
                        bsonLen += obj.objsize();
                    }
                }
            }
            if (Runner::RUNNER_EOF != state) {
                // TODO: more descriptive logging.
                warning() << "Internal error while reading collection " << ns << endl;
            }

            if ( nsd->isCapped() && !nsd->capLooped() ) {
                result.append("cappedOutOfOrder", outOfOrder);
                if ( outOfOrder > 1 ) {
                    valid = false;
                    errors << "too many out of order records";
                }
            }
            result.append("objectsFound", n);

            if (full) {
                result.append("invalidObjects", nInvalid);
            }

            result.appendNumber("nQuantizedSize", nQuantizedSize);
            result.appendNumber("nPowerOf2QuantizedSize", nPowerOf2QuantizedSize);
            result.appendNumber("bytesWithHeaders", len);
            result.appendNumber("bytesWithoutHeaders", nlen);

            if (full) {
                result.appendNumber("bytesBson", bsonLen);
            }
        }

        BSONArrayBuilder deletedListArray;
        for ( int i = 0; i < Buckets; i++ ) {
            deletedListArray << nsd->deletedListEntry(i).isNull();
        }

        int ndel = 0;
        long long delSize = 0;
        BSONArrayBuilder delBucketSizes;
        int incorrect = 0;
        for ( int i = 0; i < Buckets; i++ ) {
            DiskLoc loc = nsd->deletedListEntry(i);
            try {
                int k = 0;
                while ( !loc.isNull() ) {
                    if ( recs.count(loc) )
                        incorrect++;
                    ndel++;

                    if ( loc.questionable() ) {
                        if( nsd->isCapped() && !loc.isValid() && i == 1 ) {
                            /* the constructor for NamespaceDetails intentionally sets deletedList[1] to invalid
                               see comments in namespace.h
                            */
                            break;
                        }

                        string err( str::stream() << "bad pointer in deleted record list: "
                                    << loc.toString()
                                    << " bucket: " << i
                                    << " k: " << k );
                        errors << err;
                        valid = false;
                        break;
                    }

                    DeletedRecord *d = loc.drec();
                    delSize += d->lengthWithHeaders();
                    loc = d->nextDeleted();
                    k++;
                    killCurrentOp.checkForInterrupt();
                }
                delBucketSizes << k;
            }
            catch (...) {
                errors << ("exception in deleted chain for bucket " + BSONObjBuilder::numStr(i));
                valid = false;
            }
        }
        result.appendNumber("deletedCount", ndel);
        result.appendNumber("deletedSize", delSize);
        if ( full ) {
            result << "delBucketSizes" << delBucketSizes.arr();
        }

        if ( incorrect ) {
            errors << (BSONObjBuilder::numStr(incorrect) + " records from datafile are in deleted list");
            valid = false;
        }

        int idxn = 0;
        try {
            IndexCatalog* indexCatalog = collection->getIndexCatalog();

            result.append("nIndexes", nsd->getCompletedIndexCount());
            BSONObjBuilder indexes; // not using subObjStart to be exception safe
            NamespaceDetails::IndexIterator i = nsd->ii();
            while( i.more() ) {
                IndexDetails& id = i.next();
                log() << "validating index " << idxn << ": " << id.indexNamespace() << endl;

                IndexDescriptor* descriptor = indexCatalog->getDescriptor( idxn );
                verify( descriptor );
                IndexAccessMethod* iam = indexCatalog->getIndex( descriptor );
                verify( iam );

                int64_t keys;
                iam->validate(&keys);
                indexes.appendNumber(id.indexNamespace(), static_cast<long long>(keys));
                idxn++;
            }
            result.append("keysPerIndex", indexes.done());
        }
        catch (...) {
            errors << ("exception during index validate idxn " + BSONObjBuilder::numStr(idxn));
            valid=false;
        }

    }
    catch (AssertionException) {
        errors << "exception during validate";
        valid = false;
    }

    result.appendBool("valid", valid);
    result.append("errors", errors.arr());

    if ( !full ){
        result.append("warning",
                      "Some checks omitted for speed. use {full:true} option to do more thorough scan.");
    }

    if ( !valid ) {
        result.append("advice", "ns corrupt, requires repair");
    }
}
virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg,
                 BSONObjBuilder& result, bool fromRepl) {
    string source = cmdObj.getStringField( name.c_str() );
    string target = cmdObj.getStringField( "to" );
    uassert(15967,
            "invalid collection name: " + target,
            NamespaceString::validCollectionComponent(target.c_str()));
    if ( source.empty() || target.empty() ) {
        errmsg = "invalid command syntax";
        return false;
    }

    string sourceDB = nsToDatabase(source);
    string targetDB = nsToDatabase(target);
    string databaseName = sourceDB;
    databaseName += ".system.indexes";

    int longestIndexNameLength = 0;
    vector<BSONObj> oldIndSpec = Helpers::findAll(databaseName, BSON("ns" << source));
    for (size_t i = 0; i < oldIndSpec.size(); ++i) {
        int thisLength = oldIndSpec[i].getField("name").valuesize();
        if (thisLength > longestIndexNameLength) {
            longestIndexNameLength = thisLength;
        }
    }
    unsigned int longestAllowed = maxNamespaceLen - longestIndexNameLength - 1;
    if (target.size() > longestAllowed) {
        StringBuilder sb;
        sb << "collection name length of " << target.size()
           << " exceeds maximum length of " << longestAllowed
           << ", allowing for index names";
        uasserted(16451, sb.str());
    }

    bool capped = false;
    long long size = 0;
    std::vector<BSONObj> indexesInProg;
    {
        Client::Context ctx( source );
        NamespaceDetails *nsd = nsdetails( source );
        uassert( 10026 , "source namespace does not exist", nsd );
        indexesInProg = stopIndexBuilds(dbname, cmdObj);
        capped = nsd->isCapped();
        if ( capped )
            for( DiskLoc i = nsd->firstExtent(); !i.isNull(); i = i.ext()->xnext )
                size += i.ext()->length;
    }

    Client::Context ctx( target );

    if ( nsdetails( target ) ) {
        uassert( 10027 , "target namespace exists", cmdObj["dropTarget"].trueValue() );
        Status s = cc().database()->dropCollection( target );
        if ( !s.isOK() ) {
            errmsg = s.toString();
            return false;
        }
    }

    // if we are renaming in the same database, just
    // rename the namespace and we're done.
    {
        if ( sourceDB == targetDB ) {
            Status s = ctx.db()->renameCollection( source, target,
                                                   cmdObj["stayTemp"].trueValue() );
            if ( !s.isOK() ) {
                errmsg = s.toString();
                return false;
            }
            return true;
        }
    }

    // renaming across databases, so we must copy all
    // the data and then remove the source collection.
    BSONObjBuilder spec;
    if ( capped ) {
        spec.appendBool( "capped", true );
        spec.append( "size", double( size ) );
    }
    if ( !userCreateNS( target.c_str(), spec.done(), errmsg, false ) )
        return false;

    auto_ptr< DBClientCursor > c;
    DBDirectClient bridge;

    {
        c = bridge.query( source, BSONObj(), 0, 0, 0, fromRepl ? QueryOption_SlaveOk : 0 );
    }
    while( 1 ) {
        {
            if ( !c->more() )
                break;
        }
        BSONObj o = c->next();
        theDataFileMgr.insertWithObjMod( target.c_str(), o );
    }

    string sourceIndexes = nsToDatabase( source ) + ".system.indexes";
    string targetIndexes = nsToDatabase( target ) + ".system.indexes";
    {
        c = bridge.query( sourceIndexes, QUERY( "ns" << source ), 0, 0, 0,
                          fromRepl ? QueryOption_SlaveOk : 0 );
    }
    while( 1 ) {
        {
            if ( !c->more() )
                break;
        }
        BSONObj o = c->next();
        BSONObjBuilder b;
        BSONObjIterator i( o );
        while( i.moreWithEOO() ) {
            BSONElement e = i.next();
            if ( e.eoo() )
                break;
            if ( strcmp( e.fieldName(), "ns" ) == 0 ) {
                b.append( "ns", target );
            }
            else {
                b.append( e );
            }
        }

        BSONObj n = b.done();
        theDataFileMgr.insertWithObjMod( targetIndexes.c_str(), n );
    }

    {
        Client::Context ctx( source );
        Status s = ctx.db()->dropCollection( source );
        if ( !s.isOK() ) {
            errmsg = s.toString();
            return false;
        }
        IndexBuilder::restoreIndexes(targetIndexes, indexesInProg);
    }
    return true;
}
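// --------------------------------------------------------------------------
// Illustrative sketch (not from the original source): the command document
// shape this run() method parses. Namespaces are hypothetical; "to",
// "dropTarget", and "stayTemp" correspond to the cmdObj lookups above.
// --------------------------------------------------------------------------
BSONObj exampleRenameCmd() {
    return BSON( "renameCollection" << "db1.src"
                 << "to" << "db2.dst"
                 << "dropTarget" << true );
}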
virtual bool run(const string& db, BSONObj& cmdObj, int, string& errmsg,
                 BSONObjBuilder& result, bool fromRepl) {
    string coll = cmdObj.firstElement().valuestr();
    if( coll.empty() || db.empty() ) {
        errmsg = "no collection name specified";
        return false;
    }

    if( isCurrentlyAReplSetPrimary() && !cmdObj["force"].trueValue() ) {
        errmsg = "will not run compact on an active replica set primary as this is a slow blocking operation. use force:true to force";
        return false;
    }

    string ns = db + '.' + coll;
    if ( ! NamespaceString::normal(ns.c_str()) ) {
        errmsg = "bad namespace name";
        return false;
    }

    // parameter validation to avoid triggering assertions in compact()
    if ( str::contains(ns, ".system.") ) {
        errmsg = "can't compact a system namespace";
        return false;
    }

    {
        Lock::DBWrite lk(ns);
        Client::Context ctx(ns);
        NamespaceDetails *d = nsdetails(ns);
        if( ! d ) {
            errmsg = "namespace does not exist";
            return false;
        }

        if ( d->isCapped() ) {
            errmsg = "cannot compact a capped collection";
            return false;
        }
    }

    double pf = 1.0;
    int pb = 0;
    // preservePadding trumps all other compact methods
    bool preservePadding = false;
    // useDefaultPadding is used to track whether or not a padding requirement was passed in.
    // if it wasn't, then UsePowerOf2Sizes will be maintained when compacting.
    bool useDefaultPadding = true;
    if (cmdObj.hasElement("preservePadding")) {
        preservePadding = cmdObj["preservePadding"].trueValue();
        useDefaultPadding = false;
    }

    if( cmdObj.hasElement("paddingFactor") ) {
        if (preservePadding == true) {
            errmsg = "preservePadding is incompatible with paddingFactor";
            return false;
        }
        useDefaultPadding = false;
        pf = cmdObj["paddingFactor"].Number();
        verify( pf >= 1.0 && pf <= 4.0 );
    }
    if( cmdObj.hasElement("paddingBytes") ) {
        if (preservePadding == true) {
            errmsg = "preservePadding is incompatible with paddingBytes";
            return false;
        }
        useDefaultPadding = false;
        pb = (int) cmdObj["paddingBytes"].Number();
        verify( pb >= 0 && pb <= 1024 * 1024 );
    }

    bool validate = !cmdObj.hasElement("validate") || cmdObj["validate"].trueValue(); // default is true at the moment

    massert( 14028, "bad ns", NamespaceString::normal(ns.c_str()) );
    // items in system.indexes cannot be moved; there are pointers to those disklocs in NamespaceDetails
    massert( 14027, "can't compact a system namespace", !str::contains(ns, ".system.") );

    bool ok;
    {
        Lock::DBWrite lk(ns);
        BackgroundOperation::assertNoBgOpInProgForNs(ns.c_str());
        Client::Context ctx(ns);
        NamespaceDetails *d = nsdetails(ns);
        massert( 13660, str::stream() << "namespace " << ns << " does not exist", d );
        massert( 13661, "cannot compact capped collection", !d->isCapped() );
        log() << "compact " << ns << " begin" << endl;

        std::vector<BSONObj> indexesInProg = stopIndexBuilds(db, cmdObj);

        if( pf != 0 || pb != 0 ) {
            log() << "paddingFactor:" << pf << " paddingBytes:" << pb << endl;
        }
        try {
            ok = _compact(ns.c_str(), d, errmsg, validate, result, pf, pb,
                          useDefaultPadding, preservePadding);
        }
        catch(...) {
            log() << "compact " << ns << " end (with error)" << endl;
            throw;
        }
        log() << "compact " << ns << " end" << endl;

        IndexBuilder::restoreIndexes(indexesInProg);
    }

    return ok;
}
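// --------------------------------------------------------------------------
// Illustrative sketch (not from the original source): a compact command
// document as this run() method parses it. Values are hypothetical and fall
// within the verify() ranges enforced above; "force" permits running on a
// replica set primary.
// --------------------------------------------------------------------------
BSONObj exampleCompactCmd() {
    return BSON( "compact" << "people"        // collection within the command's db
                 << "paddingFactor" << 1.5    // must be in [1.0, 4.0]
                 << "paddingBytes" << 4096    // must be in [0, 1MB]
                 << "validate" << true
                 << "force" << true );
}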