// . when the Conf::m_proxyIps parm is updated we call this to rebuild // s_iptab, our table of SpiderProxy instances, which has the proxies and // their performance statistics. // . we try to maintain stats of ip/ports that did NOT change when rebuilding. bool buildProxyTable ( ) { // scan the NEW list of proxy ip/port pairs in g_conf char *p = g_conf.m_proxyIps.getBufStart(); HashTableX tmptab; tmptab.set(8,0,16,NULL,0,false,"tmptab"); // scan the user inputted space-separated list of ip:ports // (optional username:password@ip:port) for ( ; *p ; ) { // skip white space if ( is_wspace_a(*p) ) { p++; continue; } // skip http:// if ( strncasecmp(p,"http://",7) == 0 ) { p += 7; continue; } // scan in an ip:port char *s = p; char *portStr = NULL; int32_t dc = 0, pc = 0, gc = 0, bc = 0; const char *msg; char *usernamePwd = NULL; int32_t usernamePwdLen = 0; char *ipStart = p; // scan all characters until we hit \0 or another whitespace for ( ; *s && !is_wspace_a(*s); s++) { if ( *s == '@' ) { // must be username:pwd if ( pc != 1 ) { msg = "bad username:password"; goto hadError; } usernamePwd = p; usernamePwdLen = s - p; if ( usernamePwdLen >= MAXUSERNAMEPWD-2 ) { msg = "username:password too long"; goto hadError; } dc = 0; gc = 0; bc = 0; pc = 0; portStr = NULL; ipStart = s+1; continue; } if ( *s == '.' ) { dc++; continue; } if ( *s == ':' ) { portStr=s; pc++; continue; } if ( is_digit(*s) ) { gc++; continue; } bc++; continue; } // ensure it is a legit ip:port combo msg = NULL; if ( gc < 4 ) msg = "not enough digits for an ip"; if ( pc > 1 ) msg = "too many colons"; if ( dc != 3 ) msg = "need 3 dots for an ip address"; if ( bc ) msg = "got illegal char in ip:port listing"; if ( msg ) { hadError: char c = *s; *s = '\0'; log("buf: %s for %s",msg,p); *s = c; return false; } // convert it int32_t iplen = s - ipStart; if ( portStr ) iplen = portStr - ipStart; int32_t ip = atoip(ipStart,iplen); // another sanity check if ( ip == 0 || ip == -1 ) { log("spider: got bad proxy ip for %s",p); return false; } // and the port default is 80 int32_t port = 80; if ( portStr ) port = atol2(portStr+1,s-portStr-1); if ( port < 0 || port > 65535 ) { log("spider: got bad proxy port for %s",p); return false; } // . we got a legit ip:port // . see if already in our table uint64_t ipKey = (uint32_t)ip; ipKey <<= 16; ipKey |= (uint16_t)(port & 0xffff); // also store into tmptable to see what we need to remove tmptab.addKey(&ipKey); // see if in table int32_t islot = s_iptab.getSlot( &ipKey); // advance p p = s; // if in there, keep it as is if ( islot >= 0 ) continue; // otherwise add new entry SpiderProxy newThing; memset ( &newThing , 0 , sizeof(SpiderProxy)); newThing.m_ip = ip; newThing.m_port = port; newThing.m_lastDownloadTookMS = -1; newThing.m_lastSuccessfulTestMS = -1; gbmemcpy(newThing.m_usernamePwd,usernamePwd,usernamePwdLen); // ensure it is NULL terminated newThing.m_usernamePwd[usernamePwdLen] = '\0'; if ( ! s_iptab.addKey ( &ipKey, &newThing ) ) return false; } redo: int32_t removed = 0; // scan all SpiderProxies in tmptab for ( int32_t i = 0 ; i < s_iptab.getNumSlots() ; i++ ) { // skip empty buckets in hashtable s_iptab if ( ! s_iptab.m_flags[i] ) continue; // get the key int64_t key = *(int64_t *)s_iptab.getKeyFromSlot(i); // must also exist in tmptab, otherwise it got removed by user if ( tmptab.isInTable ( &key ) ) continue; // skip if not in table if ( s_iptab.getSlot ( &key ) < 0 ) { log("sproxy: iptable hashing messed up"); continue; } // shoot, it got removed. not in the new list of ip:ports s_iptab.removeKey ( &key ); removed++; // hashtable is messed up now, start over //goto redo; } if ( removed ) goto redo; return true; }
// hostId is the remote hostid sending us the lock request void removeExpiredLocks ( int32_t hostId ) { // when we last cleaned them out static time_t s_lastTime = 0; int32_t nowGlobal = getTimeGlobalNoCore(); // only do this once per second at the most if ( nowGlobal <= s_lastTime ) return; // shortcut HashTableX *ht = &g_spiderLoop.m_lockTable; restart: // scan the slots int32_t ns = ht->m_numSlots; // . clean out expired locks... // . if lock was there and m_expired is up, then nuke it! // . when Rdb.cpp receives the "fake" title rec it removes the // lock, only it just sets the m_expired to a few seconds in the // future to give the negative doledb key time to be absorbed. // that way we don't repeat the same url we just got done spidering. // . this happens when we launch our lock request on a url that we // or a twin is spidering or has just finished spidering, and // we get the lock, but we avoided the negative doledb key. for ( int32_t i = 0 ; i < ns ; i++ ) { // breathe QUICKPOLL(MAX_NICENESS); // skip if empty if ( ! ht->m_flags[i] ) continue; // cast lock UrlLock *lock = (UrlLock *)ht->getValueFromSlot(i); int64_t lockKey = *(int64_t *)ht->getKeyFromSlot(i); // if collnum got deleted or reset collnum_t collnum = lock->m_collnum; if ( collnum >= g_collectiondb.m_numRecs || ! g_collectiondb.m_recs[collnum] ) { log("spider: removing lock from missing collnum " "%" PRId32,(int32_t)collnum); goto nuke; } // skip if not yet expired if ( lock->m_expires == 0 ) continue; if ( lock->m_expires >= nowGlobal ) continue; // note it for now if ( g_conf.m_logDebugSpider ) log("spider: removing lock after waiting. elapsed=%" PRId32"." " lockKey=%" PRIu64" hid=%" PRId32" expires=%" PRIu32" " "nowGlobal=%" PRIu32, (nowGlobal - lock->m_timestamp), lockKey,hostId, (uint32_t)lock->m_expires, (uint32_t)nowGlobal); nuke: // nuke the slot and possibly re-chain ht->removeSlot ( i ); // gotta restart from the top since table may have shrunk goto restart; } // store it s_lastTime = nowGlobal; }