/** * actually applies a reduce, to a list of tuples (key, value). * After the call, tuples will hold a single tuple {"0": key, "1": value} */ void JSReducer::_reduce( const BSONList& tuples , BSONObj& key , int& endSizeEstimate ) { int sizeEstimate = ( tuples.size() * tuples.begin()->getField( "value" ).size() ) + 128; // need to build the reduce args: ( key, [values] ) BSONObjBuilder reduceArgs( sizeEstimate ); boost::scoped_ptr<BSONArrayBuilder> valueBuilder; int sizeSoFar = 0; unsigned n = 0; for ( ; n<tuples.size(); n++ ) { BSONObjIterator j(tuples[n]); BSONElement keyE = j.next(); if ( n == 0 ) { reduceArgs.append( keyE ); key = keyE.wrap(); sizeSoFar = 5 + keyE.size(); valueBuilder.reset(new BSONArrayBuilder( reduceArgs.subarrayStart( "tuples" ) )); } BSONElement ee = j.next(); uassert( 14837 , "value too large to reduce" , ee.size() < ( BSONObjMaxUserSize / 2 ) ); if ( sizeSoFar + ee.size() > BSONObjMaxUserSize ) { assert( n > 1 ); // if not, inf. loop break; } valueBuilder->append( ee ); sizeSoFar += ee.size(); } assert(valueBuilder); valueBuilder->done(); BSONObj args = reduceArgs.obj(); Scope * s = _func.scope(); s->invokeSafe( _func.func() , &args, 0, 0, false, true, true ); ++numReduces; if ( s->type( "return" ) == Array ) { uasserted( 14838 , "reduce -> multiple not supported yet"); return; } endSizeEstimate = key.objsize() + ( args.objsize() / tuples.size() ); if ( n == tuples.size() ) return; // the input list was too large, add the rest of elmts to new tuples and reduce again // note: would be better to use loop instead of recursion to avoid stack overflow BSONList x; for ( ; n < tuples.size(); n++ ) { x.push_back( tuples[n] ); } BSONObjBuilder temp( endSizeEstimate ); temp.append( key.firstElement() ); s->append( temp , "1" , "return" ); x.push_back( temp.obj() ); _reduce( x , key , endSizeEstimate ); }
bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool) { string dbname = cc().database()->name; // this has to come before dbtemprelease dbtemprelease temprelease; // we don't touch the db directly string shardedOutputCollection = cmdObj["shardedOutputCollection"].valuestrsafe(); MRSetup mr( dbname , cmdObj.firstElement().embeddedObjectUserCheck() , false ); set<ServerAndQuery> servers; BSONObjBuilder shardCounts; map<string,long long> counts; BSONObj shards = cmdObj["shards"].embeddedObjectUserCheck(); vector< auto_ptr<DBClientCursor> > shardCursors; BSONObjIterator i( shards ); while ( i.more() ) { BSONElement e = i.next(); string shard = e.fieldName(); BSONObj res = e.embeddedObjectUserCheck(); uassert( 10078 , "something bad happened" , shardedOutputCollection == res["result"].valuestrsafe() ); servers.insert( shard ); shardCounts.appendAs( res["counts"] , shard.c_str() ); BSONObjIterator j( res["counts"].embeddedObjectUserCheck() ); while ( j.more() ) { BSONElement temp = j.next(); counts[temp.fieldName()] += temp.numberLong(); } } BSONObj sortKey = BSON( "_id" << 1 ); ParallelSortClusteredCursor cursor( servers , dbname + "." + shardedOutputCollection , Query().sort( sortKey ) ); auto_ptr<Scope> s = globalScriptEngine->getPooledScope( ns ); ScriptingFunction reduceFunction = s->createFunction( mr.reduceCode.c_str() ); ScriptingFunction finalizeFunction = 0; if ( mr.finalizeCode.size() ) finalizeFunction = s->createFunction( mr.finalizeCode.c_str() ); BSONList values; result.append( "result" , mr.finalShort ); DBDirectClient db; while ( cursor.more() ) { BSONObj t = cursor.next().getOwned(); if ( values.size() == 0 ) { values.push_back( t ); continue; } if ( t.woSortOrder( *(values.begin()) , sortKey ) == 0 ) { values.push_back( t ); continue; } db.insert( mr.tempLong , reduceValues( values , s.get() , reduceFunction , 1 , finalizeFunction ) ); values.clear(); values.push_back( t ); } if ( values.size() ) db.insert( mr.tempLong , reduceValues( values , s.get() , reduceFunction , 1 , finalizeFunction ) ); long long finalCount = mr.renameIfNeeded( db ); log(0) << " mapreducefinishcommand " << mr.finalLong << " " << finalCount << endl; for ( set<ServerAndQuery>::iterator i=servers.begin(); i!=servers.end(); i++ ) { ScopedDbConnection conn( i->_server ); conn->dropCollection( dbname + "." + shardedOutputCollection ); } result.append( "shardCounts" , shardCounts.obj() ); { BSONObjBuilder c; for ( map<string,long long>::iterator i=counts.begin(); i!=counts.end(); i++ ) { c.append( i->first , i->second ); } result.append( "counts" , c.obj() ); } return 1; }
void JSReducer::_reduce( const BSONList& tuples , BSONObj& key , int& endSizeEstimate ) { uassert( 10074 , "need values" , tuples.size() ); int sizeEstimate = ( tuples.size() * tuples.begin()->getField( "value" ).size() ) + 128; BSONObjBuilder reduceArgs( sizeEstimate ); boost::scoped_ptr<BSONArrayBuilder> valueBuilder; int sizeSoFar = 0; unsigned n = 0; for ( ; n<tuples.size(); n++ ) { BSONObjIterator j(tuples[n]); BSONElement keyE = j.next(); if ( n == 0 ) { reduceArgs.append( keyE ); key = keyE.wrap(); sizeSoFar = 5 + keyE.size(); valueBuilder.reset(new BSONArrayBuilder( reduceArgs.subarrayStart( "tuples" ) )); } BSONElement ee = j.next(); uassert( 13070 , "value to large to reduce" , ee.size() < ( BSONObjMaxUserSize / 2 ) ); if ( sizeSoFar + ee.size() > BSONObjMaxUserSize ) { assert( n > 1 ); // if not, inf. loop break; } valueBuilder->append( ee ); sizeSoFar += ee.size(); } assert(valueBuilder); valueBuilder->done(); BSONObj args = reduceArgs.obj(); Scope * s = _func.scope(); s->invokeSafe( _func.func() , args ); if ( s->type( "return" ) == Array ) { uasserted( 10075 , "reduce -> multiple not supported yet"); return; } endSizeEstimate = key.objsize() + ( args.objsize() / tuples.size() ); if ( n == tuples.size() ) return; // the input list was too large BSONList x; for ( ; n < tuples.size(); n++ ) { x.push_back( tuples[n] ); } BSONObjBuilder temp( endSizeEstimate ); temp.append( key.firstElement() ); s->append( temp , "1" , "return" ); x.push_back( temp.obj() ); _reduce( x , key , endSizeEstimate ); }
bool run(const char *dbname, BSONObj& cmd, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { Timer t; Client::GodScope cg; Client& client = cc(); CurOp * op = client.curop(); MRSetup mr( client.database()->name , cmd ); log(1) << "mr ns: " << mr.ns << endl; if ( ! db.exists( mr.ns ) ) { errmsg = "ns doesn't exist"; return false; } bool shouldHaveData = false; long long num = 0; long long inReduce = 0; BSONObjBuilder countsBuilder; BSONObjBuilder timingBuilder; try { MRState state( mr ); state.scope->injectNative( "emit" , fast_emit ); MRTL * mrtl = new MRTL( state ); _tlmr.reset( mrtl ); ProgressMeter & pm = op->setMessage( "m/r: (1/3) emit phase" , db.count( mr.ns , mr.filter ) ); auto_ptr<DBClientCursor> cursor = db.query( mr.ns , mr.q ); long long mapTime = 0; Timer mt; while ( cursor->more() ) { BSONObj o = cursor->next(); if ( mr.verbose ) mt.reset(); state.scope->setThis( &o ); if ( state.scope->invoke( state.map , state.setup.mapparams , 0 , true ) ) throw UserException( 9014, (string)"map invoke failed: " + state.scope->getError() ); if ( mr.verbose ) mapTime += mt.micros(); num++; if ( num % 100 == 0 ) { Timer t; mrtl->checkSize(); inReduce += t.micros(); killCurrentOp.checkForInterrupt(); dbtemprelease temprlease; } pm.hit(); if ( mr.limit && num >= mr.limit ) break; } pm.finished(); countsBuilder.appendNumber( "input" , num ); countsBuilder.appendNumber( "emit" , mrtl->numEmits ); if ( mrtl->numEmits ) shouldHaveData = true; timingBuilder.append( "mapTime" , mapTime / 1000 ); timingBuilder.append( "emitLoop" , t.millis() ); // final reduce op->setMessage( "m/r: (2/3) final reduce in memory" ); mrtl->reduceInMemory(); mrtl->dump(); BSONObj sortKey = BSON( "0" << 1 ); db.ensureIndex( mr.incLong , sortKey ); BSONObj prev; BSONList all; assert( userCreateNS( mr.tempLong.c_str() , BSONObj() , errmsg , mr.replicate ) ); pm = op->setMessage( "m/r: (3/3) final reduce to collection" , db.count( mr.incLong ) ); cursor = db.query( mr.incLong, Query().sort( sortKey ) ); while ( cursor->more() ) { BSONObj o = cursor->next().getOwned(); pm.hit(); if ( o.woSortOrder( prev , sortKey ) == 0 ) { all.push_back( o ); if ( pm.hits() % 1000 == 0 ) { dbtemprelease tl; } continue; } state.finalReduce( all ); all.clear(); prev = o; all.push_back( o ); killCurrentOp.checkForInterrupt(); dbtemprelease tl; } state.finalReduce( all ); pm.finished(); _tlmr.reset( 0 ); } catch ( ... ) { log() << "mr failed, removing collection" << endl; db.dropCollection( mr.tempLong ); db.dropCollection( mr.incLong ); throw; } db.dropCollection( mr.incLong ); long long finalCount = mr.renameIfNeeded( db ); timingBuilder.append( "total" , t.millis() ); result.append( "result" , mr.finalShort ); result.append( "timeMillis" , t.millis() ); countsBuilder.appendNumber( "output" , finalCount ); if ( mr.verbose ) result.append( "timing" , timingBuilder.obj() ); result.append( "counts" , countsBuilder.obj() ); if ( finalCount == 0 && shouldHaveData ) { result.append( "cmd" , cmd ); errmsg = "there were emits but no data!"; return false; } return true; }