void Statsdb::addDocsIndexed ( ) { if ( ! isClockInSync() ) return; // only once per five seconds long now = getTimeLocal(); static long s_lastTime = 0; if ( now - s_lastTime < 5 ) return; s_lastTime = now; long long total = 0LL; static long long s_lastTotal = 0LL; // every 5 seconds update docs indexed count for ( long i = 0 ; i < g_hostdb.m_numHosts ; i++ ) { Host *h = &g_hostdb.m_hosts[i]; // must have something if ( h->m_docsIndexed <= 0 ) return; // add it up total += h->m_docsIndexed; } // divide by # of groups total /= g_hostdb.getNumGroups(); // skip if no change if ( total == s_lastTotal ) return; s_lastTotal = total; // add it if changed though long long nowms = gettimeofdayInMillisecondsGlobal(); addStat ( MAX_NICENESS,"docs_indexed", nowms, nowms, (float)total ); }
void Statsdb::addDocsIndexed ( ) { if ( ! isClockInSync() ) return; if ( g_hostdb.hasDeadHost() ) return; // only host #0 needs this if ( g_hostdb.m_hostId != 0 ) return; // only once per five seconds int32_t now = getTimeLocal(); static int32_t s_lastTime = 0; if ( now - s_lastTime < 5 ) return; int32_t interval = now - s_lastTime; s_lastTime = now; int64_t total = 0LL; static int64_t s_lastTotal = 0LL; // every 5 seconds update docs indexed count for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) { Host *h = &g_hostdb.m_hosts[i]; // must have something if ( h->m_pingInfo.m_totalDocsIndexed <= 0 ) continue; // add it up total += h->m_pingInfo.m_totalDocsIndexed; } // divide by # of groups total /= g_hostdb.getNumHostsPerShard(); // skip if no change if ( total == s_lastTotal ) return; int32_t docsIndexedInInterval = total - s_lastTotal; float docsPerSecond = docsIndexedInInterval / (float)interval; log("build: total docs indexed: %f. docs per second %f %i %i", (float)total, docsPerSecond, docsIndexedInInterval, interval); // add it if changed though int64_t nowms = gettimeofdayInMillisecondsGlobal(); addStat ( MAX_NICENESS,"docs_indexed", nowms, nowms, (float)total ); // Prevent a datapoint which adds all of the docs indexed to date. if( s_lastTotal != 0 ) { addStat ( MAX_NICENESS,"docs_per_second", nowms, nowms, docsPerSecond ); } s_lastTotal = total; }
void flushStatsWrapper ( int fd , void *state ) { g_statsdb.addDocsIndexed(); // force a statsdb tree dump if running out of room Rdb *rdb = &g_statsdb.m_rdb; RdbTree *tree = &rdb->m_tree; // if we got 20% room left and 50k available mem, do not dump if ( (float)tree->getNumUsedNodes() * 1.2 < (float)tree->getNumAvailNodes () && //tree->getNumAvailNodes () > 1000 && rdb-> m_mem.getAvailMem() > 50000 ) return; if ( ! isClockInSync() ) return; // force a dump rdb->dumpTree ( 1 ); }
int64_t gettimeofdayInMillisecondsSynced() { // sanity check if ( ! isClockInSync() ) { static int s_printed = 0; if ( (s_printed % 100) == 0 ) { log("xml: clock not in sync with host #0 yet!!!!!!"); } s_printed++; } int64_t now; struct timeval tv; gettimeofday ( &tv , NULL ); now = (int64_t)(tv.tv_usec/1000)+((int64_t)tv.tv_sec)*1000; // adjust from Msg0x11 time adjustments now += s_adjustment; return now; }
// Handles a msg12 request. The request type is told apart purely by size:
// . sizeof(ConfirmRequest): the sender confirms a lock. We mark the lock
//   confirmed, add a negative (delete) record to doledb, drop the ip from
//   the doledb table and try to re-add the ip to the waiting tree.
// . sizeof(LockRequest): the sender wants to add or remove a url lock in
//   g_spiderLoop.m_lockTable.
// . any other size gets an EBADREQUEST error reply
// Every path sends exactly one reply on "udpSlot": an error reply, or a
// one-byte reply taken from udpSlot->m_tmpBuf (1 = ok/granted, 0 = refused).
void handleRequest12 ( UdpSlot *udpSlot , int32_t niceness ) {
	// get request
	char *request = udpSlot->m_readBuf;
	int32_t reqSize = udpSlot->m_readBufSize;
	// shortcut
	UdpServer *us = &g_udpServer;
	// breathe
	QUICKPOLL ( niceness );

	// shortcut
	char *reply = udpSlot->m_tmpBuf;

	//
	// . is it confirming that he got all the locks?
	// . if so, remove the doledb record and dock the doleiptable count
	//   before adding a waiting tree entry to re-pop the doledb record
	//
	if ( reqSize == sizeof(ConfirmRequest) ) {
		char *msg = NULL;
		ConfirmRequest *cq = (ConfirmRequest *)request;

		// confirm the lock
		HashTableX *ht = &g_spiderLoop.m_lockTable;
		int32_t slot = ht->getSlot ( &cq->m_lockKeyUh48 );
		// no such lock: reply with an error instead of coring
		if ( slot < 0 ) {
			log("spider: got a confirm request for a key not "
			    "in the table! coll must have been deleted "
			    " or reset "
			    "while lock request was outstanding.");
			g_errno = EBADENGINEER;

			log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
			us->sendErrorReply ( udpSlot , g_errno );
			return;
			//char *xx=NULL;*xx=0; }
		}
		UrlLock *lock = (UrlLock *)ht->getValueFromSlot ( slot );
		lock->m_confirmed = true;

		// note that
		if ( g_conf.m_logDebugSpider ) // Wait )
			log("spider: got confirm lock request for ip=%s",
			    iptoa(lock->m_firstIp));

		// get it (may be NULL, see the "sc" checks below)
		SpiderColl *sc = g_spiderCache.getSpiderColl(cq->m_collnum);
		// make it negative by clearing the low bit of the key
		cq->m_doledbKey.n0 &= 0xfffffffffffffffeLL;
		// and add the negative rec to doledb (deletion operation)
		Rdb *rdb = &g_doledb.m_rdb;
		if ( ! rdb->addRecord ( cq->m_collnum,
					(char *)&cq->m_doledbKey,
					NULL , // data
					0    , //dataSize
					1 )){ // niceness
			// tree is dumping or something, probably ETRYAGAIN
			// (ETRYAGAIN is not logged, but still replied below)
			if ( g_errno != ETRYAGAIN ) {msg = "error adding neg rec to doledb";	log("spider: %s %s",msg,mstrerror(g_errno));
			}
			//char *xx=NULL;*xx=0;

			log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
			us->sendErrorReply ( udpSlot , g_errno );
			return;
		}
		// now remove from doleiptable since we removed from doledb
		if ( sc ) sc->removeFromDoledbTable ( cq->m_firstIp );

		// how many spiders outstanding for this coll and IP?
		//int32_t out=g_spiderLoop.getNumSpidersOutPerIp ( cq->m_firstIp);

		// DO NOT add back to waiting tree if max spiders
		// out per ip was 1 OR there was a crawldelay. but better
		// yet, take care of that in the winReq code above.

		// . now add to waiting tree so we add another spiderdb
		//   record for this firstip to doledb
		// . true = callForScan
		// . do not add to waiting tree if we have enough outstanding
		//   spiders for this ip. we will add to waiting tree when
		//   we receive a SpiderReply in addSpiderReply()
		// . a false return only counts as a failure when g_errno is set
		if ( sc && //out < cq->m_maxSpidersOutPerIp &&
		     // this will just return true if we are not the
		     // responsible host for this firstip
		     // DO NOT populate from this!!! say "false" here...
		     ! sc->addToWaitingTree ( 0 , cq->m_firstIp, false ) &&
		     // must be an error...
		     g_errno ) {
			msg = "FAILED TO ADD TO WAITING TREE";
			log("spider: %s %s",msg,mstrerror(g_errno));

			log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
			us->sendErrorReply ( udpSlot , g_errno );
			return;
		}
		// success!!
		reply[0] = 1;
		us->sendReply_ass ( reply , 1 , reply , 1 , udpSlot );
		return;
	}

	// sanity check: anything that is not a confirm must be a LockRequest
	if ( reqSize != sizeof(LockRequest) ) {
		log("spider: bad msg12 request size of %" PRId32,reqSize);

		log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
		us->sendErrorReply ( udpSlot , EBADREQUEST );
		return;
	}
	// deny it if we are not synced yet! otherwise we core in
	// getTimeGlobal() below
	if ( ! isClockInSync() ) {
		// log it so we can debug it
		//log("spider: clock not in sync with host #0. so "
		//    "returning etryagain for lock reply");
		// let admin know why we are not spidering

		log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
		us->sendErrorReply ( udpSlot , ETRYAGAIN );
		return;
	}

	LockRequest *lr = (LockRequest *)request;
	//uint64_t lockKey = *(int64_t *)request;
	//int32_t lockSequence = *(int32_t *)(request+8);
	// is this a remove operation? assume not
	//bool remove = false;
	// get top bit
	//if ( lockKey & 0x8000000000000000LL ) remove = true;

	// mask it out
	//lockKey &= 0x7fffffffffffffffLL;
	// sanity check, just 6 bytes! (48 bits)
	// (deliberately cores if any of the upper 16 bits are set)
	if ( lr->m_lockKeyUh48 &0xffff000000000000LL ) { char *xx=NULL;*xx=0; }
	// note it
	if ( g_conf.m_logDebugSpider )
		log("spider: got msg12 request uh48=%" PRId64" remove=%" PRId32,
		    lr->m_lockKeyUh48, (int32_t)lr->m_removeLock);
	// get time
	int32_t nowGlobal = getTimeGlobal();
	// shortcut
	HashTableX *ht = &g_spiderLoop.m_lockTable;

	// identify the sender by the ip/port the request arrived from
	int32_t hostId = g_hostdb.getHostId ( udpSlot->m_ip , udpSlot->m_port );
	// this must be legit - sanity check
	// (deliberately cores if the sender is not a known host)
	if ( hostId < 0 ) { char *xx=NULL;*xx=0; }

	// remove expired locks from locktable
	removeExpiredLocks ( hostId );

	int64_t lockKey = lr->m_lockKeyUh48;

	// check tree
	int32_t slot = ht->getSlot ( &lockKey ); // lr->m_lockKeyUh48 );
	// put it here
	UrlLock *lock = NULL;
	// if there say no no
	if ( slot >= 0 ) lock = (UrlLock *)ht->getValueFromSlot ( slot );

	// if doing a remove operation and that was our hostid then unlock it
	// (the lock sequence must match the request's as well)
	if ( lr->m_removeLock &&
	     lock &&
	     lock->m_hostId == hostId &&
	     lock->m_lockSequence == lr->m_lockSequence ) {
		// note it for now
		if ( g_conf.m_logDebugSpider )
			log("spider: removing lock for lockkey=%" PRIu64" hid=%" PRId32,
			    lr->m_lockKeyUh48,hostId);
		// unlock it
		ht->removeSlot ( slot );
		// it is gone
		lock = NULL;
	}
	// ok, at this point all remove ops return
	// (we reply 1 whether or not a lock was actually removed)
	if ( lr->m_removeLock ) {
		reply[0] = 1;
		us->sendReply_ass ( reply , 1 , reply , 1 , udpSlot );
		return;
	}

	/////////
	//
	// add new lock
	//
	/////////

	// if lock > 1 hour old then remove it automatically!!
	// NOTE(review): the actual limit is MAX_LOCK_AGE, defined elsewhere;
	// confirm it is still one hour
	if ( lock && nowGlobal - lock->m_timestamp > MAX_LOCK_AGE ) {
		// note it for now
		log("spider: removing lock after %" PRId32" seconds "
		    "for lockKey=%" PRIu64" hid=%" PRId32,
		    (nowGlobal - lock->m_timestamp),
		    lr->m_lockKeyUh48,hostId);
		// unlock it
		ht->removeSlot ( slot );
		// it is gone
		lock = NULL;
	}
	// if lock still there, do not grant another lock
	if ( lock ) {
		// note it for now
		if ( g_conf.m_logDebugSpider )
			log("spider: refusing lock for lockkey=%" PRIu64" hid=%" PRId32,
			    lr->m_lockKeyUh48,hostId);
		reply[0] = 0;
		us->sendReply_ass ( reply , 1 , reply , 1 , udpSlot );
		return;
	}
	// make the new lock
	UrlLock tmp;
	tmp.m_hostId = hostId;
	tmp.m_lockSequence = lr->m_lockSequence;
	tmp.m_timestamp = nowGlobal;
	tmp.m_expires = 0;
	tmp.m_firstIp = lr->m_firstIp;
	tmp.m_collnum = lr->m_collnum;

	// when the spider returns we remove its lock on reception of the
	// spiderReply, however, we actually just set the m_expires time
	// to 5 seconds into the future in case there is a current request
	// to get a lock for that url in progress. but, we do need to
	// indicate that the spider has indeed completed by setting
	// m_spiderOutstanding to true. this way, addToWaitingTree() will
	// not count it towards a "max spiders per IP" quota when deciding
	// on if it should add a new entry for this IP.
	// NOTE(review): the paragraph above talks about marking completion,
	// yet a brand new lock starts out with m_spiderOutstanding = true;
	// the wording looks inverted - confirm against where it is cleared
	tmp.m_spiderOutstanding = true;
	// this is set when all hosts in the group (shard) have granted the
	// lock and the host sends out a confirmLockAcquisition() request.
	// until then we do not know if the lock will be granted by all hosts
	// in the group (shard)
	tmp.m_confirmed = false;

	// put it into the table
	if ( ! ht->addKey ( &lockKey , &tmp ) ) {
		// return error if that failed!

		log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
		us->sendErrorReply ( udpSlot , g_errno );
		return;
	}
	// note it for now
	if ( g_conf.m_logDebugSpider )
		log("spider: granting lock for lockKey=%" PRIu64" hid=%" PRId32,
		    lr->m_lockKeyUh48,hostId);
	// grant the lock
	reply[0] = 1;
	us->sendReply_ass ( reply , 1 , reply , 1 , udpSlot );
	return;
}
// . m_key bitmap in statsdb: // tttttttt tttttttt tttttttt tttttttt t = time in milliseconds, t1 // tttttttt tttttttt tttttttt tttttttt // hhhhhhhh hhhhhhhh hhhhhhhh hhhhhhhh h = hash32 of m_title // . returns false if could not add stat, true otherwise // . do not set g_errno if we return false just to keep things simple // . we only add the stat to our local statsdb rdb, but because // we might be dumping statsdb to disk or something it is possible // we get an ETRYAGAIN error, so we try to accumulate stats in a // local buffer in that case // . "label" is something like "queryLatency" or whatever // . [t1,t2] are the time endpoints for the operation being measured // . "value" is usually "numBytes", or a quantity indicator of whatever // was processed. // . oldVal, newVal are reflect a state change, like maybe changing the // value of a parm. typically for such things t1 equals t2 bool Statsdb::addStat ( long niceness , char *label , long long t1Arg , long long t2Arg , float value , // y-value really, "numBytes" long parmHash , float oldVal , float newVal , long userId32 ) { if ( ! g_conf.m_useStatsdb ) return true; // so Process.cpp can turn it off when dumping core if ( m_disabled ) return true; // not thread safe! //if ( g_threads.amThread() ) { // log("statsdb: called from thread"); // char *xx=NULL;*xx=0; //} // . for now we can only add stats if we are synced with host #0 clock // . this is kinda a hack and it would be nice to not miss stats! if ( ! isClockInSync() ) return true; RdbTree *tree = &m_rdb.m_tree; // do not add stats to our tree if it is loading if ( tree->m_isLoading ) return true; // convert into host #0 synced time t1Arg = localToGlobalTimeMilliseconds ( t1Arg ); t2Arg = localToGlobalTimeMilliseconds ( t2Arg ); // sanity check if ( ! label ) { char *xx=NULL;*xx=0; } long labelHash; if ( parmHash ) labelHash = parmHash; else labelHash = hash32n ( label ); // fix it for parm changes, and docs_indexed stat, etc. 
if ( t1Arg == t2Arg ) t2Arg++; // how many SECONDS did the op take? (convert from ms to secs) float dtms = (t2Arg - t1Arg); float dtSecs = dtms / 1000.0; // we have already flushed stats 30+ seconds old, so if this op took // 30 seconds, discard it! if ( dtSecs >= 30 ) { //log("statsdb: stat is %li secs > 30 secs old, discarding.", // (long)dtSecs); return true; } long long nextup; // loop over all "second" buckets for ( long long tx = t1Arg ; tx < t2Arg ; tx = nextup ) { // get next second-aligned point in milliseconds nextup = ((tx +1000)/ 1000) * 1000; // truncate if we need to if ( nextup > t2Arg ) nextup = t2Arg; // . how much of the stat is in this time interval? // . like if operation took 3 seconds, we might cover // 50% of the first 1-second interval. so we use this // as a weight for the stats we keep for that particular // second. then we can plot a point for each second // in time which is an average of all the queries that // were in progress at that second. float fractionTime = ((float)(nextup - tx)) / dtms; // . get the time point bucket in which this stat belongs // . every "second" in time has a bucket unsigned long t1 = tx / 1000; StatKey sk; sk.m_zero = 0x01; // make it a positive key sk.m_time1 = t1; sk.m_labelHash = labelHash; // so we can show just the stats for a particular user... if ( userId32 ) { sk.m_zero = userId32; // make it positive sk.m_zero |= 0x01; } // if we already have added a bucket for this "second" then // get it from the tree so we can add to its accumulated stats. long node1 = tree->getNode ( 0 , (char *)&sk ); long node2; StatData *sd; // get that stat, see if we are accumulating it already if ( node1 >= 0 ) sd = (StatData *)tree->getData ( node1 ); // make a new one if not there else { StatData tmp; // init it tmp.m_totalOps = 0.0; tmp.m_totalQuantity = 0.0; tmp.m_totalTime = 0.0; // save this long saved = g_errno; // need to add using rdb so it can memcpy the data if ( ! 
m_rdb.addRecord ( (collnum_t)0 , (char *)&sk, (char *)&tmp, sizeof(StatData), niceness ) ) { if ( g_errno != ETRYAGAIN ) log("statsdb: add rec failed: %s", mstrerror(g_errno)); // caller does not care about g_errno g_errno = saved; return false; } // caller does not care about g_errno g_errno = saved; // get the node in the tree //sd = (StatData *)tree->getData ( node1 ); // must be there! node2 = tree->getNode ( 0 , (char *)&sk ); // must be there! if ( node2 < 0 ) { char *xx=NULL;*xx=0; } // point to it sd = (StatData *)tree->getData ( node2 ); } // use the milliseconds elapsed as the value if none given //if ( value == 0 && ! parmHash ) // value = t2Arg - t1Arg; // if we got it for this time, accumulate it // convert x into pixel position sd->m_totalOps += 1 * fractionTime; sd->m_totalQuantity += value * fractionTime; sd->m_totalTime += dtSecs * fractionTime; if ( ! parmHash ) continue; sd->m_totalOps = 0; sd->m_totalQuantity = oldVal; sd->m_newVal = newVal; // no fractions for this! break; } //logf(LOG_DEBUG,"statsdb: sp=0x%lx",(long)sp); return true; }
void DailyMerge::dailyMergeLoop ( ) { // disable for now! //return; // if in repair mode, do not do daily merge if ( g_repairMode ) return; // or if in read only mode if ( g_conf.m_readOnlyMode ) return; // skip if proxy, a proxy can be hostid 0! if ( g_proxy.isProxy() ) return; // wait for clock to be synced with host #0 if ( ! isClockInSync() ) return; // get local time int64_t nowLocalMS = gettimeofdayInMillisecondsLocal(); // get our hostid int32_t hid = g_hostdb.m_myHost->m_hostId; // if process only recently started (1 min ago or less) // then do not immediately do this... if (hid==0 && nowLocalMS - g_process.m_processStartTime < 1*60*1000) return; // wait until the right time (this is in UTC) time_t nowSynced = getTimeSynced(); // get time since midnight struct tm *tt ; // how many MINUTES into the day are we? (in UTC) tt = gmtime ( &nowSynced ); int32_t elapsedMins = tt->tm_hour * 60 + tt->tm_min ; // what collnum to merge? collnum_t i ; // . if we are not 0, just use host #0's collnum // . an error here will screw up the whole daily merge process if ( hid != 0 && m_mergeMode == 0 ) { // get host #0 Host *h = &g_hostdb.m_hosts[0]; // must have got a ping reply from him if ( ! h->m_gotPingReply ) return; // hostid #0 must NOT be in mode 0 if ( h->m_pingInfo.m_flags & PFLAG_MERGEMODE0 ) return; // get the collnum that host #0 is currently daily merging i = g_hostdb.m_hosts[0].m_pingInfo.m_dailyMergeCollnum; // this means host #0 is not daily merging a collnum now if ( i < 0 ) return; // if it is valid, the CollectionRec MUST be there CollectionRec *cr = g_collectiondb.getRec ( i ); if ( ! cr ) { log("daily: host #0 bad collnum %"INT32"",(int32_t)i);return;} // if valid, use it m_cr = cr; // we set m_cr, go to next mode m_mergeMode = 1; // set the start time here, but don't commit to m_cr just yet m_savedStartTime = nowSynced; } // . only host #0 should do this loop!!! // . 
loop through each collection to check the time for (i=0; hid==0&&m_mergeMode==0 && i<g_collectiondb.m_numRecs; i++) { // get collection rec for collnum #i CollectionRec *cr = g_collectiondb.getRec ( i ); // skip if empty, it was deleted at some point if ( ! cr ) continue; // skip if daily merge trigger is < 0 (do not do dailies) if ( cr->m_dailyMergeTrigger < 0 ) continue; // . skip if not time yet // . !!!!!THIS IS IN MINUTES!!!!!!!! if ( (int32_t)elapsedMins < (int32_t)cr->m_dailyMergeTrigger ) continue; // do not start more than 15 mins after the trigger time, // if we miss that cuz we are down, then too bad if ( (int32_t)elapsedMins > (int32_t)cr->m_dailyMergeTrigger + 15 ) continue; // . how long has it been (in seconds) // . !!!!!THIS IS IN SECONDS!!!!!!!! int32_t diff = nowSynced - cr->m_dailyMergeStarted; // crazy? if ( diff < 0 ) continue; // if less than 24 hours ago, we already did it if ( diff < 24*3600 ) continue; // . we must now match the day of week // . use <= 0 to do it every day // . 0 = sunday ... 6 = saturday // . comma separated list is ok ("0,1, 6") // . leave blank or at least no numbers to do every day char *s = cr->m_dailyMergeDOWList; char dowCounts[8]; memset(dowCounts,0,8); for ( ; *s ; s++ ) { if ( ! is_digit(*s) ) continue; int32_t num = atoi(s); if ( num < 0 ) continue; if ( num > 6 ) continue; dowCounts[num]++; } // get our dow int32_t todayDOW = tt->tm_wday + 1; // make sure 1 to 7 if ( todayDOW < 0 || todayDOW > 6 ) { log("merge: bad today dow of %i for coll %s", (int)todayDOW,cr->m_coll); return; } //if ( todayDOW > 6 ) { char *xx=NULL;*xx=0; } // skip if not a dayofweek to merge on if ( dowCounts [ todayDOW ] == 0 ) continue; // set the start time here, but don't commit to m_cr just yet m_savedStartTime = nowSynced; // . wait for everyone to be in mode #0 in case they just // finished another daily merge. only host #0 does this loop. // . 
PROBLEM: if host #0 crashes before everyone can get into // mode 1+ and then host #0 is brought back up, then // obviously, we will not be able to meet this condition, // therefore only check to see if this condition is // satisfied our "second time around" (so we must complete // one daily merge before checking this again). that is why // i added "m_didDaily". -- MDW for ( int32_t i = 0 ; m_didDaily && i<g_hostdb.m_numHosts ; i++){ // skip ourselves, obviously we are in merge mode 2 if ( &g_hostdb.m_hosts[i] == g_hostdb.m_myHost ) continue; // that's good if he is in mode 0 if ( g_hostdb.m_hosts[i].m_pingInfo.m_flags & PFLAG_MERGEMODE0 ) continue; // oops, someone is not mode 0 return; } // got one, save it m_cr = cr; // if we were hostid 0, go into merge mode 1 now m_mergeMode = 1; // bust out of loop break; } // can we advance to merge mode 1? if ( m_mergeMode == 1 ) { // no candidates, go back to mode 0 now, we are done if ( ! m_cr ) { log("daily: Could not get coll rec."); m_mergeMode = 0; return; } // ok, we got a collection that needs it so turn off spiders m_mergeMode = 2; // turn spiders off to keep query latency down m_spideringEnabled = g_conf.m_spideringEnabled; //m_injectionEnabled = g_conf.m_injectionEnabled; g_conf.m_spideringEnabled = false; //g_conf.m_injectionEnabled = false; // log it log("daily: Starting daily merge for %s.",m_cr->m_coll); log("daily: Waiting for other hosts to enter merge mode."); } // wait for everyone to make it to mode 1+ before going on if ( m_mergeMode == 2 ) { // check the ping packet flags for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) { // get the host Host *h = &g_hostdb.m_hosts[i]; // skip ourselves, obviously we are in merge mode 2 if ( h == g_hostdb.m_myHost ) continue; // skip dead hosts if ( g_hostdb.isDead(h) ) continue; // return if a host still in merge mode 0. wait for it. 
if ( h->m_pingInfo.m_flags & PFLAG_MERGEMODE0 ) return; } // ok, everyone is out of mode 0 now m_mergeMode = 3; // log it log("daily: Waiting for all hosts to have 0 " "spiders out."); } // wait for ALL spiders in network to clear if ( m_mergeMode == 3 ) { // return if we got spiders out! if ( g_spiderLoop.m_numSpidersOut > 0 ) return; // check the ping packet flags for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) { // skip ourselves, obviously we are in merge mode 2 if ( &g_hostdb.m_hosts[i] == g_hostdb.m_myHost ) continue; // if host still has spiders out, we can't go to mode 4 if ( g_hostdb.m_hosts[i].m_pingInfo.m_flags & PFLAG_HASSPIDERS ) return; } // ok, nobody has spiders now m_mergeMode = 4; // log it log("daily: Dumping trees."); } // start the dumps if ( m_mergeMode == 4 ) { // . set when we did it last, save that to disk to avoid thrash // . TODO: BUT do not allow it to be set in the spider // controls! // . THIS IS IN SECONDS!!!!!!! // . use the time we started, otherwise the merge time keeps // getting pushed back. 
m_cr->m_dailyMergeStarted = m_savedStartTime; // nowSynced; // tell it to save, otherwise this might not get saved m_cr->m_needsSave = true; // initiate dumps g_indexdb.getRdb ()->dumpTree(1); // niceness = 1 //g_datedb.getRdb ()->dumpTree(1); // niceness = 1 g_spiderdb.getRdb ()->dumpTree(1); // niceness = 1 g_linkdb.getRdb ()->dumpTree(1); // niceness = 1 // if neither has recs in tree, go to next mode if(g_indexdb .getRdb()->getNumUsedNodes()>0) return; //if(g_datedb .getRdb()->getNumUsedNodes()>0) return; if(g_spiderdb.getRdb()->getNumUsedNodes()>0) return; if(g_linkdb .getRdb()->getNumUsedNodes()>0) return; // ok, all trees are clear and dumped m_mergeMode = 5; // log it log("daily: Merging indexdb and datedb files."); } // start the merge if ( m_mergeMode == 5 ) { // kick off the merges if not already going //g_indexdb.getRdb()->attemptMerge(1,true,false); //g_datedb .getRdb()->attemptMerge(1,true,false); // if has more than one file, bail on it RdbBase *base; base = g_indexdb .getRdb()->getBase(m_cr->m_collnum); // . niceness,forced?,doLog?,minFilesToMerge // . only does a merge if there are 2 or more "big" indexdb // files present. Merges so that there are LESS THAN 2 files. // just another way of describing a tight merge. base->attemptMerge (1,true,false,2); if ( base->getNumFiles() >= 2 ) return; //base = g_datedb .getRdb()->getBase(m_cr->m_collnum); //base->attemptMerge (1,true,false,2); //if ( base->getNumFiles() >= 2 ) return; base = g_spiderdb.getRdb()->getBase(m_cr->m_collnum); base->attemptMerge (1,true,false,2); if ( base->getNumFiles() >= 2 ) return; base = g_linkdb .getRdb()->getBase(m_cr->m_collnum); base->attemptMerge (1,true,false,2); if ( base->getNumFiles() >= 2 ) return; // . minimize titledb merging at spider time, too // . will perform a merge IFF there are 200 or more titledb // files present, otherwise, it will not. will do the merge // such that LESS THAN 200 titledb files will be present // AFTER the merge is completed. // . 
do NOT force merge ALL files on this one, we just want // to make sure there are not 200+ titledb files base = g_titledb .getRdb()->getBase(m_cr->m_collnum); // we seem to dump about 70 per day at a decent spider rate // so merge enough so that we don't have to merge while // spidering base->attemptMerge (1,false,false,230-70); if ( base->getNumFiles() >= 230-70 ) return; // set m_cr to NULL up here, so that the last guy to // complete the daily merge, does not "cycle back" and // try to re-daily merge the same collection! m_cr = NULL; // ok, merges are done m_mergeMode = 6; // log it log("daily: Waiting for all hosts to finish merging."); } // wait for all to finish before re-enabling spiders if ( m_mergeMode == 6 ) { // check the ping packet flags for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) { // skip ourselves, obviously we are ok if ( &g_hostdb.m_hosts[i] == g_hostdb.m_myHost ) continue; // if host in mode 6 or 0, that's good if ( g_hostdb.m_hosts[i].m_pingInfo.m_flags & PFLAG_MERGEMODE0OR6) continue; // otherwise, wait for it to be in 6 or 0 return; } // ok, nobody has spiders now, everyone is 6 or 0 m_mergeMode = 0; // no coll rec now m_cr = NULL; // spiders back on g_conf.m_spideringEnabled = m_spideringEnabled; //g_conf.m_injectionEnabled = m_injectionEnabled; // log it log("daily: Daily merge completed."); // now the next time we do a daily we must make sure all hosts // are in merge mode #0 before we start m_didDaily = true; } }
// a cacheTime of -1 means browser should not cache at all void HttpMime::makeMime ( long totalContentLen , long cacheTime , time_t lastModified , long offset , long bytesToSend , char *ext , bool POSTReply , char *contentType , char *charset , long httpStatus , char *cookie ) { // assume UTF-8 //if ( ! charset ) charset = "utf-8"; // . make the content type line // . uses a static buffer if ( ! contentType ) contentType = (char *)getContentTypeFromExtension ( ext ); // do not cache plug ins if ( contentType && strcmp(contentType,"application/x-xpinstall")==0) cacheTime = -2; // assume UTF-8, but only if content type is text // . No No No!!! // . This prevents charset specification in html files // . -partap //if ( ! charset && contentType && strncmp(contentType,"text",4)==0) // charset = "utf-8"; // this is used for bz2 and gz files (mp3?) const char *contentEncoding = getContentEncodingFromExtension ( ext ); // the string char enc[128]; if ( contentEncoding ) sprintf ( enc , "Content-Encoding: %s\r\n", contentEncoding ); else enc[0] = '\0'; // get the time now //time_t now = getTimeGlobal(); time_t now; if ( isClockInSync() ) now = getTimeGlobal(); else now = getTimeLocal(); // get the greenwhich mean time (GMT) char ns[128]; struct tm *timeStruct = gmtime ( &now ); // Wed, 20 Mar 2002 16:47:30 GMT strftime ( ns , 126 , "%a, %d %b %Y %T GMT" , timeStruct ); // if lastModified is 0 use now if ( lastModified == 0 ) lastModified = now; // convert lastModified greenwhich mean time (GMT) char lms[128]; timeStruct = gmtime ( &lastModified ); // Wed, 20 Mar 2002 16:47:30 GMT strftime ( lms , 126 , "%a, %d %b %Y %T GMT" , timeStruct ); // . the pragma no cache string (used just for proxy servers?) // . also use cache-control: for the browser itself (HTTP1.1, though) // . pns = "Pragma: no-cache\nCache-Control: no-cache\nExpires: -1\n"; char tmp[128]; char *pns ; // with cache-control on, when you hit the back button, it reloads // the page, this is bad for most things... 
so we only avoid the // cache for index.html and PageAddUrl.cpp (the main and addurl page) if ( cacheTime == -2 ) pns = "Cache-Control: no-cache\r\n" "Pragma: no-cache\r\n" "Expires: -1\r\n"; // so when we click on a control link, it responds correctly. // like turning spiders on. else if ( cacheTime == -1 ) pns = "Pragma: no-cache\r\n" "Expires: -1\r\n"; // don't specify cache times if it's 0 (let browser regulate it) else if ( cacheTime == 0 ) pns = ""; // otherwise, expire tag: "Expires: Wed, 23 Dec 2001 10:23:01 GMT" else { time_t expDate = now + cacheTime; timeStruct = gmtime ( &expDate ); strftime ( tmp , 100 , "Expires: %a, %d %b %Y %T GMT\r\n", timeStruct ); pns = tmp; } // . set httpStatus // . a reply to a POST (not a GET or HEAD) should be 201 char *p = m_buf; char *smsg = ""; if ( POSTReply ) { if ( httpStatus == -1 ) httpStatus = 200; if ( httpStatus == 200 ) smsg = " OK"; if ( ! charset ) charset = "utf-8"; //sprintf ( m_buf , p += sprintf ( p, "HTTP/1.0 %li%s\r\n" "Date: %s\r\n" //"P3P: CP=\"CAO PSA OUR\"\r\n" "Server: Gigablast/1.0\r\n" "Content-Length: %li\r\n" //"Expires: Wed, 23 Dec 2003 10:23:01 GMT\r\n" //"Expires: -1\r\n" "Connection: Close\r\n" "%s" "Content-Type: %s\r\n\r\n", //"Connection: Keep-Alive\r\n" //"%s" //"Location: f**k\r\n" //"Location: http://192.168.0.4:8000/cgi/3.cgi\r\n" //"Last-Modified: %s\r\n\r\n" , httpStatus , smsg , ns , totalContentLen , enc , contentType ); //pns , //ns ); //lms ); } // . is it partial content? // . if bytesToSend is < 0 it means "totalContentLen" else if ( offset > 0 || bytesToSend != -1 ) { if ( httpStatus == -1 ) httpStatus = 206; if ( ! 
charset ) charset = "utf-8"; //sprintf ( m_buf , p += sprintf( p, "HTTP/1.0 %li Partial content\r\n" "%s" "Content-Length: %li\r\n" "Content-Range: %li-%li(%li)\r\n"// added "bytes" "Connection: Close\r\n" //"P3P: CP=\"CAO PSA OUR\"\r\n" "Server: Gigablast/1.0\r\n" "%s" "Date: %s\r\n" "Last-Modified: %s\r\n" "Content-Type: %s\r\n", httpStatus , enc ,bytesToSend , offset , offset + bytesToSend , totalContentLen , pns , ns , lms , contentType ); // otherwise, do a normal mime } else { char encoding[256]; if (charset) sprintf(encoding, "; charset=%s", charset); else encoding[0] = '\0'; if ( httpStatus == -1 ) httpStatus = 200; if ( httpStatus == 200 ) smsg = " OK"; //sprintf ( m_buf , p += sprintf( p, "HTTP/1.0 %li%s\r\n" // make it at least 4 spaces so we can change // the length of the content should we insert // a login bar in Proxy::storeLoginBar() "Content-Length: %04li\r\n" "%s" "Content-Type: %s", httpStatus , smsg , totalContentLen , enc , contentType ); if ( charset ) p += sprintf ( p , "; charset=%s", charset ); p += sprintf ( p , "\r\n"); p += sprintf ( p , //"Connection: Keep-Alive\r\n" "Connection: Close\r\n" //"P3P: CP=\"CAO PSA OUR\"\r\n" "Server: Gigablast/1.0\r\n" "%s" "Date: %s\r\n" "Last-Modified: %s\r\n" , pns , ns , lms ); } // write the cookie if we have one if (cookie) { // now it is a list of Set-Cookie: x=y\r\n lines //p += sprintf ( p, "Set-Cookie: %s\r\n", cookie); if ( strncmp(cookie,"Set-Cookie",10 ) ) p += sprintf(p,"Set-Cookie: "); p += sprintf ( p, "%s", cookie); if ( p[-1] != '\n' && p[-2] != '\r' ) { *p++ = '\r'; *p++ = '\n'; } } // write another line to end the mime p += sprintf(p, "\r\n"); // set the mime's length //m_bufLen = gbstrlen ( m_buf ); m_bufLen = p - m_buf; }