static bool loadSpiderProxyStats ( ) {

	initProxyTables();

	// take this out for now since i was seeing dups in s_iptab for
	// some reason. was causing an infinite loop bug calling goto redo:
	// all the time above.

	// save hashtable
	//s_proxyBannedTable.load(g_hostdb.m_dir,"proxybantable.dat");
	//s_banCountTable.load(g_hostdb.m_dir,"proxybancounttable.dat");

	// save hash table. this also returns false if it does not exist.
	//if ( ! s_iptab.load(g_hostdb.m_dir,"spiderproxystats.dat") )
	//	return false;

	// unset some flags
	for ( int32_t i = 0 ; i < s_iptab.getNumSlots() ; i++ ) {
		// skip empty slots
		if ( ! s_iptab.m_flags[i] ) continue;
		SpiderProxy *sp = (SpiderProxy *)s_iptab.getValueFromSlot(i);
		sp->m_isWaiting = false;
	}

	return true;
}
SpiderProxy *getSpiderProxyByIpPort ( int32_t ip , uint16_t port ) {
	for ( int32_t i = 0 ; i < s_iptab.getNumSlots() ; i++ ) {
		// skip empty slots
		if ( ! s_iptab.m_flags[i] ) continue;
		SpiderProxy *sp = (SpiderProxy *)s_iptab.getValueFromSlot(i);
		if ( sp->m_ip   != ip   ) continue;
		if ( sp->m_port != port ) continue;
		return sp;
	}
	return NULL;
}
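// An alternative sketch: s_iptab appears to be keyed by the composite
// (ip<<16)|port key built in handleRequest54() below, so the linear scan
// above could likely be a direct hash lookup instead. Untested sketch
// under that assumption:
static SpiderProxy *getSpiderProxyByIpPortFast ( int32_t ip ,
						 uint16_t port ) {
	uint64_t key = (uint32_t)ip;
	key <<= 16;
	key |= (uint16_t)port;
	// getValue() returns NULL if the key is not present
	return (SpiderProxy *)s_iptab.getValue ( &key );
}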
void gbiconv_reset ( ) {
	for ( int32_t i = 0 ; i < s_convTable.getNumSlots() ; i++ ) {
		//int32_t key = *(int32_t *)s_convTable.getKey(i);
		//if ( ! key ) continue;
		// skip empty slots
		if ( ! s_convTable.m_flags[i] ) continue;
		iconv_t *pconv = (iconv_t *)s_convTable.getValueFromSlot(i);
		if ( ! pconv ) continue;
		iconv_t iconv = *pconv;
		//logf(LOG_DEBUG,"iconv: freeing iconv: 0x%x",(int)iconv);
		g_mem.rmMem ( (void *)iconv , 52 , "iconv" );
		libiconv_close ( iconv );
	}
	s_convTable.reset();
}
// . use msg 0x55 to say you are done using the proxy
// . we now use the top part of the Msg13Request as the proxy request
void returnProxy ( Msg13Request *preq , UdpSlot *udpSlot ) {

	//char *p = request;
	//int32_t proxyIp = *(int32_t *)p; p += 4;
	//int16_t proxyPort = *(int16_t *)p; p += 2;
	//int32_t lbId = *(int32_t *)p; p += 4;

	int32_t urlIp = preq->m_urlIp;

	//
	// update the load bucket
	//

	// scan over all buckets that match this urlIp to find the one
	// with our load bucket id (lbId)
	int32_t hslot = s_loadTable.getSlot ( &urlIp );
	// scan all proxies that have this urlIp outstanding
	int32_t i;
	for ( i = hslot ; i >= 0 ; i = s_loadTable.getNextSlot(i,&urlIp) ) {
		// get the bucket
		LoadBucket *lb =
			(LoadBucket *)s_loadTable.getValueFromSlot(i);
		// is it the right one?
		if ( lb->m_id        != preq->m_lbId      ) continue;
		if ( lb->m_proxyIp   != preq->m_proxyIp   ) continue;
		if ( lb->m_proxyPort != preq->m_proxyPort ) continue;
		// that's it. set the download end time.
		int64_t nowms = gettimeofdayInMillisecondsLocal();
		lb->m_downloadEndTimeMS = nowms;
		break;
	}

	if ( i < 0 )
		log("sproxy: could not find load bucket id #%" PRId32,
		    preq->m_lbId);

	// if no slot was provided, return to the caller without sending a
	// reply. they are banning a proxy and need to also return it before
	// we send them back another proxy to try.
	if ( ! udpSlot ) return;

	// gotta send a reply back
	g_udpServer.sendReply ( 0, 0, 0, 0, udpSlot );
}
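// The load table stores multiple LoadBuckets under the same urlIp key;
// getSlot()/getNextSlot() walk all buckets for one key, as above. An
// illustrative sketch (hypothetical helper, not part of the build) of
// that duplicate-key idiom: count the downloads still outstanding for
// one urlIp.
static int32_t countOutstandingForUrlIp ( int32_t urlIp ) {
	int32_t n = 0;
	for ( int32_t i = s_loadTable.getSlot ( &urlIp ) ;
	      i >= 0 ;
	      i = s_loadTable.getNextSlot ( i , &urlIp ) ) {
		LoadBucket *lb =
			(LoadBucket *)s_loadTable.getValueFromSlot(i);
		// an end time of 0 means the download has not finished
		if ( lb->m_downloadEndTimeMS == 0LL ) n++;
	}
	return n;
}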
// . sets *current to how many downloads are CURRENTLY outstanding for
//   this proxy
// . returns total # of load points, i.e. downloads, in the last
//   LOADPOINT_EXPIRE_MS milliseconds
static int32_t getNumLoadPoints ( SpiderProxy *sp , int32_t *current ) {
	// currently outstanding downloads on proxy
	*current = 0;
	int32_t count = 0;
	// scan all load buckets in the load table
	for ( int32_t i = 0 ; i < s_loadTable.m_numSlots ; i++ ) {
		// skip if empty
		if ( ! s_loadTable.m_flags[i] ) continue;
		// get the bucket
		LoadBucket *lb =
			(LoadBucket *)s_loadTable.getValueFromSlot(i);
		// skip if this load point was for a different spider proxy
		if ( lb->m_proxyIp   != sp->m_ip   ) continue;
		if ( lb->m_proxyPort != sp->m_port ) continue;
		// an end time of 0 means it is still outstanding
		if ( lb->m_downloadEndTimeMS == 0LL )
			*current = *current + 1;
		count++;
	}
	return count;
}
// get the id from a 2 character country code
uint8_t getCountryId ( char *cc ) {
	static bool s_init = false;
	static char buf[2000];
	static HashTableX ht;
	char tmp[4];
	if ( ! s_init ) {
		s_init = true;
		// hash them up
		ht.set(4,1,-1,buf,2000,false,MAX_NICENESS,"ctryids");
		// now add in all the country codes
		long n = (long)sizeof(s_countryCode) / sizeof(char *);
		for ( long i = 0 ; i < n ; i++ ) {
			char *s = (char *)s_countryCode[i];
			//long slen = gbstrlen ( s );
			// sanity check: must be exactly 2 chars
			if ( ! s[0] || ! s[1] || s[2] ) {
				char *xx=NULL;*xx=0; }
			// map it to a 4 byte key
			tmp[0] = s[0];
			tmp[1] = s[1];
			tmp[2] = 0;
			tmp[3] = 0;
			// a val of 0 does not mean empty in HashTableX,
			// that is an artifact of HashTableT
			uint8_t val = i; // +1; // add 1 cuz 0 means lang unknown
			if ( ! ht.addKey ( tmp , &val ) ) {
				char *xx=NULL;*xx=0; }
		}
	}
	// lookup
	tmp[0] = to_lower_a(cc[0]);
	tmp[1] = to_lower_a(cc[1]);
	tmp[2] = 0;
	tmp[3] = 0;
	long slot = ht.getSlot ( tmp );
	if ( slot < 0 ) return 0;
	void *val = ht.getValueFromSlot ( slot );
	return *(uint8_t *)val;
}
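// Usage sketch (hypothetical call sites): lookups are case-insensitive
// since the input is lowercased before hashing, and 0 comes back for an
// unknown code. Note the one-time lazy init above is not guarded by a
// lock, so the first call is assumed to happen before any concurrent use.
//
//	uint8_t a = getCountryId((char *)"us");
//	uint8_t b = getCountryId((char *)"US"); // same id as "us"
//	uint8_t c = getCountryId((char *)"zz"); // 0 if "zz" is not listed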
// a host is asking us (host #0) what proxy to use?
static void handleRequest54 ( UdpSlot *udpSlot , int32_t niceness ) {

	char *request       = udpSlot->m_readBuf;
	int32_t requestSize = udpSlot->m_readBufSize;

	// we now use the top part of the Msg13Request as the ProxyRequest
	Msg13Request *preq = (Msg13Request *)request;

	// sanity check
	if ( requestSize != preq->getProxyRequestSize() ) {
		log("db: Got bad request 0x54 size of %" PRId32" bytes. bad",
		    requestSize );
		g_udpServer.sendErrorReply ( udpSlot , EBADREQUESTSIZE );
		return;
	}

	// is the request telling us it is done downloading through a proxy?
	if ( preq->m_opCode == OP_RETPROXY ) {
		returnProxy ( preq , udpSlot );
		return;
	}

	// if the sender is asking for a new proxy and wants us to ban
	// the previous proxy we sent for this urlIp...
	if ( preq->m_banProxyIp ) {
		// don't core if this misses the sanity check. it seems we
		// don't always NULLify these or something.
		// these must match
		if ( preq->m_banProxyIp   != preq->m_proxyIp ||
		     preq->m_banProxyPort != preq->m_proxyPort ) {
			log("db: proxy: banproxyip != proxyip. mismatch!");
			g_udpServer.sendErrorReply ( udpSlot , EBADENGINEER );
			return;
		}
		// this will "return" the banned proxy
		returnProxy ( preq , NULL );
		// now add it to the banned table
		int64_t uip = preq->m_urlIp;
		int64_t pip = preq->m_banProxyIp;
		int64_t h64 = hash64h ( uip , pip );
		if ( ! s_proxyBannedTable.isInTable ( &h64 ) ) {
			s_proxyBannedTable.addKey ( &h64 );
			// for stats counting. each proxy ip maps to the #
			// of unique website IPs that have banned it.
			s_banCountTable.addTerm32 ( (uint32_t)pip );
		}
	}

	// shortcut
	int32_t urlIp = preq->m_urlIp;

	// send to a proxy that is up and has the least amount of
	// LoadBuckets with this urlIp. if tied, go to the least loaded.

	// clear the counts for this urlIp for scoring the best proxy to use
	for ( int32_t i = 0 ; i < s_iptab.getNumSlots() ; i++ ) {
		// skip empty slots
		if ( ! s_iptab.m_flags[i] ) continue;
		SpiderProxy *sp = (SpiderProxy *)s_iptab.getValueFromSlot(i);
		sp->m_countForThisIp = 0;
		sp->m_lastTimeUsedForThisIp = 0LL;
	}

	// this table maps a url's current IP to possibly MULTIPLE slots
	// which tell us what proxy is downloading a page from that IP.
	// so we can try to find a proxy that is not downloading a url from
	// this IP currently, or hasn't been for the longest time...
	int32_t hslot = s_loadTable.getSlot ( &urlIp );
	// scan all proxies that have this urlIp outstanding
	for ( int32_t i = hslot ; i >= 0 ;
	      i = s_loadTable.getNextSlot(i,&urlIp) ) {
		// get the bucket
		LoadBucket *lb;
		lb = (LoadBucket *)s_loadTable.getValueFromSlot(i);
		// get the spider proxy this load point was for
		uint64_t key = (uint32_t)lb->m_proxyIp;
		key <<= 16;
		key |= (uint16_t)lb->m_proxyPort;
		SpiderProxy *sp = (SpiderProxy *)s_iptab.getValue(&key);
		// it must be there unless the user removed it from the list
		if ( ! sp ) continue;
		// count it up
		if ( lb->m_downloadEndTimeMS == 0LL )
			sp->m_countForThisIp++;
		// set the last time used to the most recent time this
		// proxy finished a download from this IP
		if ( lb->m_downloadEndTimeMS &&
		     lb->m_downloadEndTimeMS > sp->m_lastTimeUsedForThisIp )
			sp->m_lastTimeUsedForThisIp = lb->m_downloadEndTimeMS;
	}

	// first try to get a spider proxy that is not "dead"
	bool skipDead = true;

	int32_t numBannedProxies = 0;
	int32_t aliveProxyCandidates = 0;

 redo:

	// get the min of the counts
	int32_t minCount = 999999;
	for ( int32_t i = 0 ; i < s_iptab.getNumSlots() ; i++ ) {
		// skip empty slots
		if ( ! s_iptab.m_flags[i] ) continue;
		// get the spider proxy
		SpiderProxy *sp = (SpiderProxy *)s_iptab.getValueFromSlot(i);
		// if this proxy was banned by the url's ip... skip it. it
		// is not a candidate...
		if ( skipDead ) {
			int64_t uip = preq->m_urlIp;
			int64_t pip = sp->m_ip;
			int64_t h64 = hash64h ( uip , pip );
			if ( s_proxyBannedTable.isInTable ( &h64 ) ) {
				numBannedProxies++;
				continue;
			}
		}
		// if it failed the last test, skip it
		if ( skipDead && sp->m_lastDownloadError ) continue;
		if ( skipDead ) aliveProxyCandidates++;
		if ( sp->m_countForThisIp >= minCount ) continue;
		minCount = sp->m_countForThisIp;
	}

	// all dead? then get the best dead one
	if ( minCount == 999999 ) {
		skipDead = false;
		goto redo;
	}

	// . we only use one proxy if none are banned by this IP
	// . when that gets banned, we will use the next 2 proxies with
	//   a higher backoff/crawlDelay, etc.
	int32_t threshHold;
	if      ( numBannedProxies <= 0  ) threshHold = 1;
	// if the first proxy gets banned, try the next 2 proxies until
	// both get banned
	else if ( numBannedProxies == 1  ) threshHold = 2;
	else if ( numBannedProxies < 1+2 ) threshHold = 3 - numBannedProxies;
	// if the next two proxies got banned, try the next 4 until banned
	else if ( numBannedProxies == 3  ) threshHold = 4;
	else if ( numBannedProxies < 3+4 ) threshHold = 7 - numBannedProxies;
	// if the next 4 proxies got banned, try the next 8 until they
	// get banned
	else if ( numBannedProxies == 7  ) threshHold = 8;
	else if ( numBannedProxies < 7+8 ) threshHold = 15 - numBannedProxies;
	else if ( numBannedProxies == 15 ) threshHold = 16;
	else if ( numBannedProxies < 15+16 ) threshHold = 31 - numBannedProxies;
	else if ( numBannedProxies == 31 ) threshHold = 32;
	else if ( numBannedProxies < 31+32 ) threshHold = 63 - numBannedProxies;
	else if ( numBannedProxies == 63 ) threshHold = 64;
	else if ( numBannedProxies < 63+64 ) threshHold = 127 - numBannedProxies;
	else if ( numBannedProxies == 127 ) threshHold = 128;
	else if ( numBannedProxies < 127+128 ) threshHold = 255 - numBannedProxies;
	else if ( numBannedProxies == 255 ) threshHold = 256;
	else if ( numBannedProxies < 255+256 ) threshHold = 512 - numBannedProxies;
	else if ( numBannedProxies == 511 ) threshHold = 512;
	else if ( numBannedProxies < 511+512 ) threshHold = 1024 - numBannedProxies;
	else threshHold = 1024;

	if ( threshHold <= 0 ) {
		log("proxy: spiderproxy error in threshold of %" PRId32" "
		    "for banned=%" PRId32,threshHold,numBannedProxies);
		threshHold = 1;
	}

	// reset minCount so we can take the min over those we check here
	minCount = -1;
	int64_t oldest = 0x7fffffffffffffffLL;
	SpiderProxy *winnersp = NULL;
	int32_t count = 0;
	// start at a random slot based on the url's IP so we don't
	// overload the first proxy
	int32_t start = ((uint32_t)urlIp) % s_iptab.getNumSlots();
	int32_t slotCount = s_iptab.getNumSlots();
	// . now find the best proxy with the minCount
	for ( int32_t i = start ; ; i++ ) {
		// scan all slots in the hash table, then stop
		if ( slotCount-- <= 0 ) break;
		// wrap around to zero if we hit the end
		if ( i == s_iptab.getNumSlots() ) i = 0;
		// skip empty slots
		if ( ! s_iptab.m_flags[i] ) continue;
		// get the spider proxy
		SpiderProxy *sp = (SpiderProxy *)s_iptab.getValueFromSlot(i);
		// if it failed the last test, skip it... not here...
		if ( skipDead && sp->m_lastDownloadError ) continue;
		// if this proxy was banned by the url's ip... skip it. it
		// is not a candidate...
		if ( skipDead ) {
			int64_t uip = preq->m_urlIp;
			int64_t pip = sp->m_ip;
			int64_t h64 = hash64h ( uip , pip );
			if ( s_proxyBannedTable.isInTable ( &h64 ) )
				continue;
		}
		// if some proxies are "alive" then only pick from the
		// first threshHold proxies that are alive (i.e. still
		// work). that way, when one of those goes dead we will inc
		// the backoff (crawldelay) and a new proxy that we haven't
		// used for this url's IP will take its place. and such
		// new proxies will only have the new backoff count used
		// through them. that way, we don't get ALL of our proxies
		// banned at about the same time since we do somewhat
		// uniform load balancing over them.
		if ( skipDead && count >= threshHold ) //aliveProxyCandidates/2
			continue;
		// count the alive/non-banned candidates
		count++;
		// if all hosts were "dead" because they all had
		// m_lastDownloadError set then minCount will be 999999
		// and nobody should continue from this statement:
		if ( sp->m_countForThisIp > minCount && minCount >= 0 )
			continue;
		// then go by the last download time for this ip
		if ( sp->m_countForThisIp == minCount && minCount >= 0 &&
		     sp->m_lastTimeUsedForThisIp >= oldest )
			continue;
		// pick the spider proxy used longest ago
		oldest   = sp->m_lastTimeUsedForThisIp;
		minCount = sp->m_countForThisIp;
		// got a new winner
		winnersp = sp;
	}

	// we must have a winner
	if ( ! winnersp ) { g_process.shutdownAbort(true); }

	int64_t nowms = gettimeofdayInMillisecondsLocal();

	// add a new load bucket then!
	LoadBucket bb;
	bb.m_urlIp = urlIp;
	// the time it started
	bb.m_downloadStartTimeMS = nowms;
	// the download has not ended yet
	bb.m_downloadEndTimeMS = 0LL;
	// the host using the proxy
	bb.m_hostId = udpSlot->getHostId();
	// key is this for m_prTable
	bb.m_proxyIp   = winnersp->m_ip;
	bb.m_proxyPort = winnersp->m_port;
	// a new id. we use this to update the downloadEndTime when done.
	static int32_t s_lbid = 0;
	// add it now
	bb.m_id = s_lbid++;
	s_loadTable.addKey ( &urlIp , &bb );

	// winner count update
	winnersp->m_timesUsed++;

	// sanity
	if ( (int32_t)sizeof(ProxyReply) > TMPBUFSIZE ) {
		g_process.shutdownAbort(true); }

	// and give the proxy ip/port back to the requester so they can
	// use that to download their url
	ProxyReply *prep = (ProxyReply *)udpSlot->m_tmpBuf;
	prep->m_proxyIp   = winnersp->m_ip;
	prep->m_proxyPort = winnersp->m_port;

	// this is just '\0' if none
	strcpy(prep->m_usernamePwd,winnersp->m_usernamePwd);

	// do not count the proxy we are returning as "more"
	prep->m_hasMoreProxiesToTry = ( aliveProxyCandidates > 1 );

	// and the loadbucket id, so the requester can tell us it is done
	// downloading through the proxy and we can update the LoadBucket
	// for this transaction (m_lbId)
	prep->m_lbId = bb.m_id;

	// the requester wants to know how many proxies have been banned
	// by the urlIp so it can increase a self-imposed crawl-delay to
	// be more sensitive to the spider policy.
	prep->m_numBannedProxies = numBannedProxies;

	//char *p = udpSlot->m_tmpBuf;
	//*(int32_t *)p = winnersp->m_ip  ; p += 4;
	//*(int16_t *)p = winnersp->m_port; p += 2;
	// and the loadbucket id
	//*(int32_t *)p = bb.m_id; p += 4;

	// with dup keys we end up with long chains of crap and this
	// takes forever. so just flush the whole thing once 2+ minutes
	// have elapsed and the table has grown past 10,000 slots.
	static time_t s_lastTime = 0;
	time_t now = nowms / 1000;
	if ( s_lastTime == 0 ) s_lastTime = now;
	time_t elapsed = now - s_lastTime;
	if ( elapsed > 120 && s_loadTable.getNumSlots() > 10000 ) {
		log("sproxy: flushing %i entries from proxy loadtable that "
		    "have accumulated over the last %i seconds",
		    (int)s_loadTable.m_numSlotsUsed,(int)elapsed);
		s_loadTable.clear();
		// remember when we last flushed
		s_lastTime = now;
	}

	int32_t sanityCount = 0; //s_loadTable.getNumSlots();
	// top:
	// now remove old entries from the load table, i.e. entries that
	// have completed and have a download end time more than
	// LOADPOINT_EXPIRE_MS ago.
	for ( int32_t i = s_loadTable.getNumSlots() - 1 ; i >= 0 ; i-- ) {
		// skip if empty
		if ( ! s_loadTable.m_flags[i] ) continue;
		// get the bucket
		LoadBucket *pp =
			(LoadBucket *)s_loadTable.getValueFromSlot(i);
		// skip if still active
		if ( pp->m_downloadEndTimeMS == 0LL ) continue;
		// delta t
		int64_t took = nowms - pp->m_downloadEndTimeMS;
		// not expired yet? (was < 10 mins, now it's < 15 seconds
		// to prevent clogging)
		if ( took < LOADPOINT_EXPIRE_MS ) continue;
		// 100 at a time so we don't slam the cpu
		if ( sanityCount++ > 100 ) break;
		// ok, it's too old, nuke it to save memory
		s_loadTable.removeSlot(i);
		// removing a slot can re-chain keys below us, so we might
		// miss analyzing a few keys this pass if we just keep
		// looping here. TODO: figure it out. if we miss a few it's
		// not a big deal.
		//i--;
		//goto top;
	}

	// send the proxy ip/port/LBid back to the requester
	g_udpServer.sendReply(udpSlot->m_tmpBuf, sizeof(ProxyReply),
			      udpSlot->m_tmpBuf, sizeof(ProxyReply),
			      udpSlot);
}
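// The threshold ladder above roughly doubles the pool of proxies in play
// each time the current pool is fully banned by a site: 1 proxy, then 2,
// then 4, 8, etc., using however many of the current "ring" remain
// unbanned. A minimal closed-form sketch of that intent (hypothetical
// helper, not the shipped logic; note the shipped ladder deviates
// slightly on the 256+ rungs, where it uses 512-n and 1024-n rather
// than 511-n and 1023-n):
static int32_t computeBackoffThreshold ( int32_t numBanned ) {
	int32_t ring = 1;
	// find the power-of-two ring that numBanned falls into; the
	// cumulative size of all rings up through "ring" is 2*ring - 1
	while ( numBanned >= 2 * ring - 1 ) ring *= 2;
	// use however many proxies of the current ring are still unbanned
	int32_t t = ( 2 * ring - 1 ) - numBanned;
	return ( t > 0 ) ? t : 1;
}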
// . we call this from Parms.cpp which prints out the proxy related
//   controls and this table below them...
// . allows the user to see the stats of each spider proxy
bool printSpiderProxyTable ( SafeBuf *sb ) {

	// only host #0 will have the stats... so print a link to it
	if ( g_hostdb.m_myHost->m_hostId != 0 ) {
		Host *h = g_hostdb.getHost(0);
		sb->safePrintf("<br>"
			       "<b>See table on <a href=http://%s:%" PRId32"/"
			       "admin/proxies>"
			       "host #0</a></b>"
			       "<br>"
			       , iptoa(h->m_ip)
			       , (int32_t)(h->getInternalHttpPort())
			       );
		//return true;
	}

	// print the host table
	sb->safePrintf (
		"<table %s>"
		"<tr><td colspan=10><center>"
		"<b>Spider Proxies "
		"</b>"
		"</center></td></tr>"

		"<tr bgcolor=#%s>"
		"<td>"
		"<b>proxy IP</b></td>"
		"<td><b>proxy port</b></td>"
		"<td><b>times used</b></td>"
		"<td><b># website IPs banning</b></td>"
		"<td><b>load points</b></td>"
		"<td><b>currently out</b></td>"
		// time of last successful download. print "none"
		// if never successfully used.
		"<td><b>test url last successful download</b></td>"
		// we fetch a test url every minute or so through
		// each proxy to ensure it is up. typically this should
		// be your website so you do not make someone angry.
		"<td><b>test url last download attempt</b></td>"
		// print "FAILED" in red if it failed to download
		"<td><b>test url download took</b></td>"
		"<td><b>last bytes downloaded</b></td>"
		"<td><b>last test url error</b></td>"
		"</tr>"
		, TABLE_STYLE
		, DARK_BLUE
		);

	int32_t now = getTimeLocal();

	// print it
	for ( int32_t i = 0 ; i < s_iptab.getNumSlots() ; i++ ) {
		// skip empty slots
		if ( ! s_iptab.m_flags[i] ) continue;

		SpiderProxy *sp = (SpiderProxy *)s_iptab.getValueFromSlot(i);

		const char *bg = LIGHT_BLUE;
		// mark with a light red bg if the last test url attempt
		// failed
		if ( sp->m_lastDownloadTookMS == -1 &&
		     sp->m_lastDownloadTestAttemptMS > 0 )
			bg = "ffa6a6";
		// or a perm denied error (as opposed to a timeout above)
		if ( sp->m_lastDownloadError )
			bg = "ffa6a6";

		// print it
		sb->safePrintf (
			"<tr bgcolor=#%s>"
			"<td>%s</td>" // proxy ip
			"<td>%" PRIu32"</td>" // port
			, bg
			, iptoa(sp->m_ip)
			, (uint32_t)(uint16_t)sp->m_port
			);

		sb->safePrintf("<td>%" PRId64"</td>",sp->m_timesUsed);

		int32_t banCount = s_banCountTable.getScore32(sp->m_ip);
		if ( banCount < 0 ) banCount = 0;
		sb->safePrintf("<td>%" PRId32"</td>",banCount);

		int32_t currentLoad;
		// get # of times it appears in the load table
		int32_t np = getNumLoadPoints ( sp , &currentLoad );
		sb->safePrintf("<td>%" PRId32"</td>",np);

		// currently outstanding downloads on this proxy
		sb->safePrintf("<td>%" PRId32"</td>",currentLoad);

		// last SUCCESSFUL download time ago. when it completed.
		int32_t ago = now - sp->m_lastSuccessfulTestMS/1000;
		sb->safePrintf("<td>");
		// like 1 minute ago etc.
		if ( sp->m_lastSuccessfulTestMS <= 0 )
			sb->safePrintf("none");
		else
			printTimeAgo(sb, ago, now, true);
		sb->safePrintf("</td>");

		// last download attempt time ago
		ago = now - sp->m_lastDownloadTestAttemptMS/1000;
		sb->safePrintf("<td>");
		// like 1 minute ago etc.
		if ( sp->m_lastDownloadTestAttemptMS <= 0 )
			sb->safePrintf("none");
		else
			printTimeAgo(sb, ago, now, true);
		sb->safePrintf("</td>");

		// how long did it take to download the test url?
		if ( sp->m_lastDownloadTookMS != -1 )
			sb->safePrintf("<td>%" PRId32"ms</td>",
				       (int32_t)sp->m_lastDownloadTookMS);
		else if ( sp->m_lastDownloadTestAttemptMS <= 0 )
			sb->safePrintf("<td>unknown</td>");
		else
			sb->safePrintf("<td>"
				       "<font color=red>FAILED</font>"
				       "</td>");

		sb->safePrintf("<td>%" PRId32"</td>",
			       sp->m_lastBytesDownloaded);

		if ( sp->m_lastDownloadError )
			sb->safePrintf("<td><font color=red>%s</font></td>",
				       mstrerror(sp->m_lastDownloadError));
		else
			sb->safePrintf("<td>none</td>");

		sb->safePrintf("</tr>\n");
	}

	sb->safePrintf("</table><br>");
	return true;
}
// hostId is the remote hostid sending us the lock request
void removeExpiredLocks ( int32_t hostId ) {
	// when we last cleaned them out
	static time_t s_lastTime = 0;

	int32_t nowGlobal = getTimeGlobalNoCore();

	// only do this once per second at the most
	if ( nowGlobal <= s_lastTime ) return;

	// shortcut
	HashTableX *ht = &g_spiderLoop.m_lockTable;

 restart:

	// scan the slots
	int32_t ns = ht->m_numSlots;
	// . clean out expired locks...
	// . if the lock was there and m_expires is up, then nuke it!
	// . when Rdb.cpp receives the "fake" title rec it removes the
	//   lock, only it just sets m_expires to a few seconds in the
	//   future to give the negative doledb key time to be absorbed.
	//   that way we don't repeat the same url we just got done
	//   spidering.
	// . this happens when we launch our lock request on a url that we
	//   or a twin is spidering or has just finished spidering, and
	//   we get the lock, but we avoided the negative doledb key.
	for ( int32_t i = 0 ; i < ns ; i++ ) {
		// breathe
		QUICKPOLL(MAX_NICENESS);
		// skip if empty
		if ( ! ht->m_flags[i] ) continue;
		// cast lock
		UrlLock *lock = (UrlLock *)ht->getValueFromSlot(i);
		int64_t lockKey = *(int64_t *)ht->getKeyFromSlot(i);
		// if the collnum got deleted or reset
		collnum_t collnum = lock->m_collnum;
		if ( collnum >= g_collectiondb.m_numRecs ||
		     ! g_collectiondb.m_recs[collnum] ) {
			log("spider: removing lock from missing collnum "
			    "%" PRId32,(int32_t)collnum);
			goto nuke;
		}
		// skip if not yet expired
		if ( lock->m_expires == 0         ) continue;
		if ( lock->m_expires >= nowGlobal ) continue;
		// note it for now
		if ( g_conf.m_logDebugSpider )
			log("spider: removing lock after waiting. "
			    "elapsed=%" PRId32". lockKey=%" PRIu64
			    " hid=%" PRId32" expires=%" PRIu32
			    " nowGlobal=%" PRIu32,
			    (nowGlobal - lock->m_timestamp),
			    lockKey,hostId,
			    (uint32_t)lock->m_expires,
			    (uint32_t)nowGlobal);
	nuke:
		// nuke the slot and possibly re-chain
		ht->removeSlot ( i );
		// gotta restart from the top since the table may have
		// shrunk
		goto restart;
	}

	// store it
	s_lastTime = nowGlobal;
}
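// Illustrative sketch (hypothetical helper, not part of the build) of the
// remove-and-restart idiom used above: removeSlot() can re-chain later
// entries into earlier buckets, so continuing a scan at the same index
// could skip a shifted entry. Restarting is worst-case quadratic, but
// the lock table stays small.
static void removeMatchingLocks ( HashTableX *ht ,
				  bool (*shouldNuke)(UrlLock *) ) {
 restart:
	for ( int32_t i = 0 ; i < ht->m_numSlots ; i++ ) {
		if ( ! ht->m_flags[i] ) continue;
		UrlLock *lock = (UrlLock *)ht->getValueFromSlot(i);
		if ( ! shouldNuke ( lock ) ) continue;
		// nuke it; this may shift subsequent keys down...
		ht->removeSlot ( i );
		// ...so rescan from the top rather than trusting i
		goto restart;
	}
}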
void handleRequest12 ( UdpSlot *udpSlot , int32_t niceness ) {
	// get the request
	char *request = udpSlot->m_readBuf;
	int32_t reqSize = udpSlot->m_readBufSize;
	// shortcut
	UdpServer *us = &g_udpServer;
	// breathe
	QUICKPOLL ( niceness );

	// shortcut
	char *reply = udpSlot->m_tmpBuf;

	//
	// . is it confirming that he got all the locks?
	// . if so, remove the doledb record and dock the doleiptable count
	//   before adding a waiting tree entry to re-pop the doledb record
	//
	if ( reqSize == sizeof(ConfirmRequest) ) {
		char *msg = NULL;
		ConfirmRequest *cq = (ConfirmRequest *)request;

		// confirm the lock
		HashTableX *ht = &g_spiderLoop.m_lockTable;
		int32_t slot = ht->getSlot ( &cq->m_lockKeyUh48 );
		if ( slot < 0 ) {
			log("spider: got a confirm request for a key not "
			    "in the table! coll must have been deleted "
			    "or reset while lock request was outstanding.");
			g_errno = EBADENGINEER;
			log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.",
			    __FILE__, __func__, __LINE__);
			us->sendErrorReply ( udpSlot , g_errno );
			return;
			//char *xx=NULL;*xx=0;
		}
		UrlLock *lock = (UrlLock *)ht->getValueFromSlot ( slot );
		lock->m_confirmed = true;

		// note that
		if ( g_conf.m_logDebugSpider ) // Wait )
			log("spider: got confirm lock request for ip=%s",
			    iptoa(lock->m_firstIp));

		// get it
		SpiderColl *sc = g_spiderCache.getSpiderColl(cq->m_collnum);
		// make it negative
		cq->m_doledbKey.n0 &= 0xfffffffffffffffeLL;
		// and add the negative rec to doledb (deletion operation)
		Rdb *rdb = &g_doledb.m_rdb;
		if ( ! rdb->addRecord ( cq->m_collnum,
					(char *)&cq->m_doledbKey,
					NULL , // data
					0 , // dataSize
					1 ) ) { // niceness
			// tree is dumping or something, probably ETRYAGAIN
			if ( g_errno != ETRYAGAIN ) {
				msg = "error adding neg rec to doledb";
				log("spider: %s %s",msg,mstrerror(g_errno));
			}
			//char *xx=NULL;*xx=0;
			log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.",
			    __FILE__, __func__, __LINE__);
			us->sendErrorReply ( udpSlot , g_errno );
			return;
		}
		// now remove from doleiptable since we removed from doledb
		if ( sc ) sc->removeFromDoledbTable ( cq->m_firstIp );

		// how many spiders outstanding for this coll and IP?
		//int32_t out=g_spiderLoop.getNumSpidersOutPerIp(cq->m_firstIp);

		// DO NOT add back to the waiting tree if max spiders
		// out per ip was 1 OR there was a crawldelay. but better
		// yet, take care of that in the winReq code above.

		// . now add to the waiting tree so we add another spiderdb
		//   record for this firstip to doledb
		// . true = callForScan
		// . do not add to the waiting tree if we have enough
		//   outstanding spiders for this ip. we will add to the
		//   waiting tree when we receive a SpiderReply in
		//   addSpiderReply()
		if ( sc &&
		     //out < cq->m_maxSpidersOutPerIp &&
		     // this will just return true if we are not the
		     // responsible host for this firstip
		     // DO NOT populate from this!!! say "false" here...
		     ! sc->addToWaitingTree ( 0 , cq->m_firstIp, false ) &&
		     // must be an error...
		     g_errno ) {
			msg = "FAILED TO ADD TO WAITING TREE";
			log("spider: %s %s",msg,mstrerror(g_errno));
			log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.",
			    __FILE__, __func__, __LINE__);
			us->sendErrorReply ( udpSlot , g_errno );
			return;
		}
		// success!!
		reply[0] = 1;
		us->sendReply_ass ( reply , 1 , reply , 1 , udpSlot );
		return;
	}

	// sanity check
	if ( reqSize != sizeof(LockRequest) ) {
		log("spider: bad msg12 request size of %" PRId32,reqSize);
		log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.",
		    __FILE__, __func__, __LINE__);
		us->sendErrorReply ( udpSlot , EBADREQUEST );
		return;
	}

	// deny it if we are not synced yet! otherwise we core in
	// getTimeGlobal() below
	if ( ! isClockInSync() ) {
		// log it so we can debug it
		//log("spider: clock not in sync with host #0. so "
		//    "returning etryagain for lock reply");
		// let the admin know why we are not spidering
		log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.",
		    __FILE__, __func__, __LINE__);
		us->sendErrorReply ( udpSlot , ETRYAGAIN );
		return;
	}

	LockRequest *lr = (LockRequest *)request;
	//uint64_t lockKey = *(int64_t *)request;
	//int32_t lockSequence = *(int32_t *)(request+8);

	// is this a remove operation? assume not
	//bool remove = false;
	// get top bit
	//if ( lockKey & 0x8000000000000000LL ) remove = true;
	// mask it out
	//lockKey &= 0x7fffffffffffffffLL;

	// sanity check, just 6 bytes! (48 bits)
	if ( lr->m_lockKeyUh48 & 0xffff000000000000LL ) {
		char *xx=NULL;*xx=0; }

	// note it
	if ( g_conf.m_logDebugSpider )
		log("spider: got msg12 request uh48=%" PRId64
		    " remove=%" PRId32,
		    lr->m_lockKeyUh48, (int32_t)lr->m_removeLock);

	// get the time
	int32_t nowGlobal = getTimeGlobal();

	// shortcut
	HashTableX *ht = &g_spiderLoop.m_lockTable;

	int32_t hostId = g_hostdb.getHostId ( udpSlot->m_ip ,
					      udpSlot->m_port );
	// this must be legit - sanity check
	if ( hostId < 0 ) { char *xx=NULL;*xx=0; }

	// remove expired locks from the lock table
	removeExpiredLocks ( hostId );

	int64_t lockKey = lr->m_lockKeyUh48;

	// check the tree
	int32_t slot = ht->getSlot ( &lockKey ); // lr->m_lockKeyUh48 );
	// put it here
	UrlLock *lock = NULL;
	// if it is there, say no no
	if ( slot >= 0 ) lock = (UrlLock *)ht->getValueFromSlot ( slot );

	// if doing a remove operation and that was our hostid then unlock
	if ( lr->m_removeLock &&
	     lock &&
	     lock->m_hostId == hostId &&
	     lock->m_lockSequence == lr->m_lockSequence ) {
		// note it for now
		if ( g_conf.m_logDebugSpider )
			log("spider: removing lock for lockkey=%" PRIu64
			    " hid=%" PRId32, lr->m_lockKeyUh48,hostId);
		// unlock it
		ht->removeSlot ( slot );
		// it is gone
		lock = NULL;
	}

	// ok, at this point all remove ops return
	if ( lr->m_removeLock ) {
		reply[0] = 1;
		us->sendReply_ass ( reply , 1 , reply , 1 , udpSlot );
		return;
	}

	/////////
	//
	// add new lock
	//
	/////////

	// if the lock is > 1 hour old then remove it automatically!!
	if ( lock && nowGlobal - lock->m_timestamp > MAX_LOCK_AGE ) {
		// note it for now
		log("spider: removing lock after %" PRId32" seconds "
		    "for lockKey=%" PRIu64" hid=%" PRId32,
		    (nowGlobal - lock->m_timestamp),
		    lr->m_lockKeyUh48,hostId);
		// unlock it
		ht->removeSlot ( slot );
		// it is gone
		lock = NULL;
	}

	// if the lock is still there, do not grant another lock
	if ( lock ) {
		// note it for now
		if ( g_conf.m_logDebugSpider )
			log("spider: refusing lock for lockkey=%" PRIu64
			    " hid=%" PRId32, lr->m_lockKeyUh48,hostId);
		reply[0] = 0;
		us->sendReply_ass ( reply , 1 , reply , 1 , udpSlot );
		return;
	}

	// make the new lock
	UrlLock tmp;
	tmp.m_hostId       = hostId;
	tmp.m_lockSequence = lr->m_lockSequence;
	tmp.m_timestamp    = nowGlobal;
	tmp.m_expires      = 0;
	tmp.m_firstIp      = lr->m_firstIp;
	tmp.m_collnum      = lr->m_collnum;

	// when the spider returns we remove its lock on reception of the
	// spiderReply, however, we actually just set the m_expires time
	// to 5 seconds into the future in case there is a current request
	// to get a lock for that url in progress. but, we do need to
	// indicate that the spider has indeed completed by setting
	// m_spiderOutstanding to true. this way, addToWaitingTree() will
	// not count it towards a "max spiders per IP" quota when deciding
	// whether it should add a new entry for this IP.
	tmp.m_spiderOutstanding = true;
	// this is set when all hosts in the group (shard) have granted the
	// lock and the host sends out a confirmLockAcquisition() request.
	// until then we do not know if the lock will be granted by all
	// hosts in the group (shard)
	tmp.m_confirmed = false;

	// put it into the table
	if ( ! ht->addKey ( &lockKey , &tmp ) ) {
		// return an error if that failed!
		log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.",
		    __FILE__, __func__, __LINE__);
		us->sendErrorReply ( udpSlot , g_errno );
		return;
	}

	// note it for now
	if ( g_conf.m_logDebugSpider )
		log("spider: granting lock for lockKey=%" PRIu64
		    " hid=%" PRId32, lr->m_lockKeyUh48,hostId);

	// grant the lock
	reply[0] = 1;
	us->sendReply_ass ( reply , 1 , reply , 1 , udpSlot );
	return;
}
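// Worked example of the uh48 sanity check in handleRequest12() above
// (illustrative only): a lock key is just the low 48 bits of the url
// hash, so the top 16 bits of the 64-bit field must be zero.
//
//	int64_t good = 0x0000123456789abcLL; // top 16 bits clear: ok
//	int64_t bad  = 0x0001123456789abcLL; // stray high bits: would core
//	// good & 0xffff000000000000LL == 0
//	// bad  & 0xffff000000000000LL != 0, tripping the check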