bool _userCreateNS(const char *ns, const BSONObj& options, string& err, bool *deferIdIndex) { LOG(1) << "create collection " << ns << ' ' << options << endl; if ( nsdetails(ns) ) { err = "collection already exists"; return false; } long long size = Extent::initialSize(128); { BSONElement e = options.getField("size"); if ( e.isNumber() ) { size = e.numberLong(); uassert( 10083 , "create collection invalid size spec", size >= 0 ); size += 0xff; size &= 0xffffffffffffff00LL; if ( size < Extent::minSize() ) size = Extent::minSize(); } } bool newCapped = false; long long mx = 0; if( options["capped"].trueValue() ) { newCapped = true; BSONElement e = options.getField("max"); if ( e.isNumber() ) { mx = e.numberLong(); uassert( 16495, "max in a capped collection has to be < 2^31 or not set", NamespaceDetails::validMaxCappedDocs(&mx) ); } } cc().database()->createCollection( ns, options["capped"].trueValue(), &options ); Collection* collection = cc().database()->getCollection( ns ); verify( collection ); // $nExtents just for debug/testing. BSONElement e = options.getField( "$nExtents" ); if ( e.type() == Array ) { // We create one extent per array entry, with size specified // by the array value. BSONObjIterator i( e.embeddedObject() ); while( i.more() ) { BSONElement e = i.next(); int size = int( e.number() ); verify( size <= 0x7fffffff ); // $nExtents is just for testing - always allocate new extents // rather than reuse existing extents so we have some predictibility // in the extent size used by our tests collection->increaseStorageSize( (int)size, false ); } } else if ( int( e.number() ) > 0 ) { // We create '$nExtents' extents, each of size 'size'. 
int nExtents = int( e.number() ); verify( size <= 0x7fffffff ); for ( int i = 0; i < nExtents; ++i ) { verify( size <= 0x7fffffff ); // $nExtents is just for testing - always allocate new extents // rather than reuse existing extents so we have some predictibility // in the extent size used by our tests collection->increaseStorageSize( (int)size, false ); } } else { // This is the non test case, where we don't have a $nExtents spec. while ( size > 0 ) { const int max = Extent::maxSize(); const int min = Extent::minSize(); int desiredExtentSize = static_cast<int> (size > max ? max : size); desiredExtentSize = static_cast<int> (desiredExtentSize < min ? min : desiredExtentSize); desiredExtentSize &= 0xffffff00; Extent* e = collection->increaseStorageSize( (int)desiredExtentSize, true ); size -= e->length; } } NamespaceDetails *d = nsdetails(ns); verify(d); bool ensure = true; // respect autoIndexId if set. otherwise, create an _id index for all colls, except for // capped ones in local w/o autoIndexID (reason for the exception is for the oplog and // non-replicated capped colls) if( options.hasField( "autoIndexId" ) || (newCapped && nsToDatabase( ns ) == "local" ) ) { ensure = options.getField( "autoIndexId" ).trueValue(); } if( ensure ) { if( deferIdIndex ) *deferIdIndex = true; else ensureIdIndexForNewNs( ns ); } if ( mx > 0 ) d->setMaxCappedDocs( mx ); if ( options["flags"].numberInt() ) { d->replaceUserFlags( options["flags"].numberInt() ); } return true; }
/**
 * Insert the `len`-byte buffer `obuf` as a new record in namespace `ns`.
 *
 * The namespace is created on demand. When the insert describes a new index
 * (as reported by insert_checkSys) and `mayAddIndex` is true, the target
 * collection is prepared and the index is built after the record is stored.
 *
 * @param ns            namespace to insert into.
 * @param obuf          record bytes; may be NULL from internal callers.
 * @param len           number of bytes in `obuf`.
 * @param mayInterrupt  forwarded to prepareToBuildIndex / insert_makeIndex.
 * @param god           skips the _id handling, the capped unique-key pre-check
 *                      and the write notification; true when inserting btree
 *                      buckets, which are not BSON objects.
 * @param mayAddIndex   allow this insert to trigger an index build.
 * @param addedID       if non-NULL, set to true when an _id was generated.
 * @return location of the new record, or a default-constructed DiskLoc when
 *         the insert was rejected or skipped, or when space could not be
 *         allocated.
 */
DiskLoc DataFileMgr::insert(const char* ns, const void* obuf, int32_t len, bool mayInterrupt, bool god, bool mayAddIndex, bool* addedID) {
    Database* db = cc().database();

    bool wouldAddIndex = false;
    massert( 10093 , "cannot insert into reserved $ collection", god || NamespaceString::normal( ns ) );
    uassert( 10094 , str::stream() << "invalid ns: " << ns , isValidNS( ns ) );

    // Namespaces containing "system." get an extra check, which may veto the insert.
    const char* sysPos = strstr( ns, "system." );
    if ( sysPos ) {
        if ( !insert_checkSys( sysPos, ns, wouldAddIndex, obuf, god ) )
            return DiskLoc();
    }
    const bool addIndex = wouldAddIndex && mayAddIndex;

    Collection* collection = db->getCollection( ns );
    if ( !collection ) {
        collection = db->createCollection( ns, false, NULL );

        int firstExtentSize = Extent::initialSize( len );
        if ( str::contains( ns, '$' ) &&
             len + Record::HeaderSize >= BtreeData_V1::BucketSize - 256 &&
             len + Record::HeaderSize <= BtreeData_V1::BucketSize + 256 ) {
            // Probably an index, so pick a fixed first-extent size instead of
            // the initial size used for user collections.
            // TODO: we could look at the # of records in the parent collection to be smarter here.
            firstExtentSize = (32+4) * 1024;
        }
        collection->increaseStorageSize( firstExtentSize, false );

        if ( !god )
            ensureIdIndexForNewNs( ns );
    }
    NamespaceDetails* d = collection->details();

    // State used only when this insert creates an index.
    string indexedNs;
    Collection* collectionToIndex = 0;
    NamespaceDetails* tableToIndex = 0;
    BSONObj fixedIndexObject;

    if ( addIndex ) {
        verify( obuf );
        BSONObj spec( (const char *) obuf );

        indexedNs = spec.getStringField( "ns" );
        uassert( 10096, "invalid ns to index", indexedNs.find( '.' ) != string::npos );
        massert( 10097,
                 str::stream() << "trying to create index on wrong db "
                               << " db: " << db->name()
                               << " collection: " << indexedNs,
                 db->ownsNS( indexedNs ) );

        collectionToIndex = db->getCollection( indexedNs );
        if ( !collectionToIndex ) {
            collectionToIndex = db->createCollection( indexedNs, false, NULL );
            verify( collectionToIndex );
            if ( !god )
                ensureIdIndexForNewNs( indexedNs.c_str() );
        }
        tableToIndex = collectionToIndex->details();

        Status status = collectionToIndex->getIndexCatalog()->okToAddIndex( spec );
        if ( status.code() == ErrorCodes::IndexAlreadyExists ) {
            // duplicate index: silently ignored
            return DiskLoc();
        }
        uassert( 17199,
                 str::stream() << "cannot build index on " << indexedNs
                               << " because of " << status.toString(),
                 status.isOK() );

        if ( !prepareToBuildIndex( spec, mayInterrupt, god, indexedNs ) ) {
            // prepare creates _id itself, or this indicates to fail the build
            // silently (such as if index already exists)
            return DiskLoc();
        }

        // From here on, the record stored is the fixed-up index spec.
        fixedIndexObject = IndexCatalog::fixIndexSpec( spec );
        obuf = fixedIndexObject.objdata();
        len = fixedIndexObject.objsize();
    }

    IDToInsert idToInsert; // only initialized if needed
    if ( !god ) {
        /* Check whether the document has an _id field and add one if not.
           Btree buckets are not BSONObj's, but for those god==true. */
        BSONObj doc( (const char *) obuf );
        BSONElement idField = doc.getField( "_id" );
        uassert( 10099 , "_id cannot be an array", idField.type() != Array );

        // we don't add _id for capped collections in local as they don't have an _id index
        const bool generateId = idField.eoo() &&
                                !wouldAddIndex &&
                                nsToDatabase( ns ) != "local" &&
                                d->haveIdIndex();
        if ( generateId ) {
            if ( addedID )
                *addedID = true;
            idToInsert.init();
            len += idToInsert.size();
        }

        BSONElementManipulator::lookForTimestamps( doc );
    }

    int lenWHdr = d->getRecordAllocationSize( len + Record::HeaderSize );
    fassert( 16440, lenWHdr >= ( len + Record::HeaderSize ) );

    // For a capped collection, check for a unique-index conflict before
    // allocating any space.
    if ( d->isCapped() && !god ) {
        BSONObj candidate = BSONObj( reinterpret_cast<const char *>( obuf ) );
        Status conflict = collection->getIndexCatalog()->checkNoIndexConflicts( candidate );
        uassert( 12582, "duplicate key insert for unique index of capped collection", conflict.isOK() );
    }

    DiskLoc loc = allocateSpaceForANewRecord( ns, d, lenWHdr, god );
    if ( loc.isNull() ) {
        log() << "insert: couldn't alloc space for object ns:" << ns << " capped:" << d->isCapped() << endl;
        verify( d->isCapped() );
        return DiskLoc();
    }

    Record* r = loc.rec();
    verify( r->lengthWithHeaders() >= lenWHdr );
    r = (Record*) getDur().writingPtr( r, lenWHdr );

    if ( idToInsert.needed() ) {
        /* Write the new total size, then the generated _id, then the original
           body (minus its own 4-byte size prefix) - this avoids a double copy. */
        int originalSize = *((int*) obuf);
        ((int&)*r->data()) = originalSize + idToInsert.size();
        memcpy( r->data()+4, idToInsert.rawdata(), idToInsert.size() );
        memcpy( r->data()+4+idToInsert.size(), ((char*)obuf)+4, originalSize-4 );
    }
    else if ( obuf ) {
        // obuf can be null from internal callers
        memcpy( r->data(), obuf, len );
    }

    addRecordToRecListInExtent( r, loc );
    d->incrementStats( r->netLength(), 1 );

    // we don't bother resetting query optimizer stats for the god tables -
    // also god is true when adding a btree bucket
    if ( !god )
        collection->infoCache()->notifyOfWriteOp();

    if ( tableToIndex ) {
        insert_makeIndex( collectionToIndex, loc, mayInterrupt );
    }

    /* add this record to our indexes */
    if ( d->getTotalIndexCount() > 0 ) {
        try {
            BSONObj stored( r->data() );
            collection->getIndexCatalog()->indexRecord( stored, loc );
        }
        catch( AssertionException& e ) {
            // should be a dup key error on _id index
            if ( !( tableToIndex || d->isCapped() ) ) {
                // normal case -- we can roll back
                _deleteRecord( d, ns, r, loc );
                throw;
            }

            massert( 12583, "unexpected index insertion failure on capped collection", !d->isCapped() );
            string s = e.toString();
            s += " : on addIndex/capped - collection and its index will not match";
            setLastError( 0, s.c_str() );
            error() << s << endl;
        }
    }

    d->paddingFits();

    return loc;
}
bool Cloner::go(const char *masterHost, string& errmsg, const string& fromdb, bool logForRepl, bool slaveOk, bool useReplAuth, bool snapshot) { massert( 10289 , "useReplAuth is not written to replication log", !useReplAuth || !logForRepl ); string todb = cc().database()->name; stringstream a,b; a << "localhost:" << cmdLine.port; b << "127.0.0.1:" << cmdLine.port; bool masterSameProcess = ( a.str() == masterHost || b.str() == masterHost ); if ( masterSameProcess ) { if ( fromdb == todb && cc().database()->path == dbpath ) { // guard against an "infinite" loop /* if you are replicating, the local.sources config may be wrong if you get this */ errmsg = "can't clone from self (localhost)."; return false; } } /* todo: we can put these releases inside dbclient or a dbclient specialization. or just wait until we get rid of global lock anyway. */ string ns = fromdb + ".system.namespaces"; list<BSONObj> toClone; { dbtemprelease r; auto_ptr<DBClientCursor> c; { if ( conn.get() ) { // nothing to do } else if ( !masterSameProcess ) { auto_ptr< DBClientConnection > c( new DBClientConnection() ); if ( !c->connect( masterHost, errmsg ) ) return false; if( !replAuthenticate(c.get()) ) return false; conn = c; } else { conn.reset( new DBDirectClient() ); } c = conn->query( ns.c_str(), BSONObj(), 0, 0, 0, slaveOk ? QueryOption_SlaveOk : 0 ); } if ( c.get() == 0 ) { errmsg = "query failed " + ns; return false; } while ( c->more() ){ BSONObj collection = c->next(); log(2) << "\t cloner got " << collection << endl; BSONElement e = collection.getField("name"); if ( e.eoo() ) { string s = "bad system.namespaces object " + collection.toString(); massert( 10290 , s.c_str(), false); } assert( !e.eoo() ); assert( e.type() == String ); const char *from_name = e.valuestr(); if( strstr(from_name, ".system.") ) { /* system.users is cloned -- but nothing else from system. 
*/ if( legalClientSystemNS( from_name , true ) == 0 ){ log(2) << "\t\t not cloning because system collection" << endl; continue; } } if( ! nsDollarCheck( from_name ) ){ log(2) << "\t\t not cloning because has $ " << endl; continue; } toClone.push_back( collection.getOwned() ); } } for ( list<BSONObj>::iterator i=toClone.begin(); i != toClone.end(); i++ ){ { dbtemprelease r; } BSONObj collection = *i; log(2) << " really will clone: " << collection << endl; const char * from_name = collection["name"].valuestr(); BSONObj options = collection.getObjectField("options"); /* change name "<fromdb>.collection" -> <todb>.collection */ const char *p = strchr(from_name, '.'); assert(p); string to_name = todb + p; bool wantIdIndex = false; { string err; const char *toname = to_name.c_str(); /* we defer building id index for performance - building it in batch is much faster */ userCreateNS(toname, options, err, logForRepl, &wantIdIndex); } log(1) << "\t\t cloning " << from_name << " -> " << to_name << endl; Query q; if( snapshot ) q.snapshot(); copy(from_name, to_name.c_str(), false, logForRepl, masterSameProcess, slaveOk, q); if( wantIdIndex ) { /* we need dropDups to be true as we didn't do a true snapshot and this is before applying oplog operations that occur during the initial sync. inDBRepair makes dropDups be true. */ bool old = inDBRepair; try { inDBRepair = true; ensureIdIndexForNewNs(to_name.c_str()); inDBRepair = old; } catch(...) { inDBRepair = old; throw; } } } // now build the indexes string system_indexes_from = fromdb + ".system.indexes"; string system_indexes_to = todb + ".system.indexes"; /* [dm]: is the ID index sometimes not called "_id_"? There is other code in the system that looks for a "_id" prefix rather than this exact value. we should standardize. OR, remove names - which is in the bugdb. Anyway, this is dubious here at the moment. 
*/ copy(system_indexes_from.c_str(), system_indexes_to.c_str(), true, logForRepl, masterSameProcess, slaveOk, BSON( "name" << NE << "_id_" ) ); return true; }