// Slightly modified from getTextEntity
short get_iana_charset(char *cs, int len) {
    if (!s_isInitialized){
        // set up the hash table
        if ( ! s_table.set ( 8,4,4096,NULL,0,false,0,"ianatbl") )
            return log("build: Could not init table of "
                       "IANA Charsets.");
        // now add in all the charset entries
        long n = (long)sizeof(s_charsets) / (long)sizeof(IANACharset);
        // turn off quickpolling
        char saved = g_conf.m_useQuickpoll;
        g_conf.m_useQuickpoll = false;
        for ( long i = 0 ; i < n ; i++ ) {
            long long h = hash64Lower_a ( s_charsets[i].name,
                                          strlen(s_charsets[i].name) );
            // store the charset index in the hash table as score
            if ( ! s_table.addTerm(&h, i+1) )
                return log("build: add term failed");
        }
        g_conf.m_useQuickpoll = saved;
        s_isInitialized = true;
    }
    long long h = hash64Lower_a ( cs , len );
    // get the entity index from table (stored in the score field)
    long i = (long) s_table.getScore ( &h );
    // return 0 if no match
    if ( i == 0 ) return csUnknown;
    // return the iso character
    return (short)s_charsets[i-1].mib_enum;
}
// Slightly modified from getTextEntity
int16_t get_iana_charset(const char *cs, int len) {
    if (!s_isInitialized){
        // set up the hash table
        if ( ! s_table.set ( 8,4,4096,NULL,0,false,"ianatbl") ) {
            log(LOG_WARN, "build: Could not init table of IANA Charsets.");
            return csUnknown;
        }
        // now add in all the charset entries
        int32_t n = (int32_t)sizeof(s_charsets) / (int32_t)sizeof(IANACharset);
        // turn off quickpolling
        char saved = g_conf.m_useQuickpoll;
        g_conf.m_useQuickpoll = false;
        for ( int32_t i = 0 ; i < n ; i++ ) {
            int64_t h = hash64Lower_a ( s_charsets[i].name,
                                        strlen(s_charsets[i].name) );
            // store the charset index in the hash table as score
            if ( ! s_table.addTerm(h, i+1) ) {
                log(LOG_WARN, "build: add term failed");
                return csUnknown;
            }
        }
        g_conf.m_useQuickpoll = saved;
        s_isInitialized = true;
    }
    int64_t h = hash64Lower_a ( cs , len );
    // get the entity index from table (stored in the score field)
    int32_t i = (int32_t) s_table.getScore(h);
    // return 0 if no match
    if ( i == 0 ) return csUnknown;
    // return the iso character
    return (int16_t)s_charsets[i-1].mib_enum;
}
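// A minimal usage sketch (added for illustration, not part of the original
// source). It assumes "utf-8" is one of the names in s_charsets and that
// csUnknown is the sentinel returned for unrecognized charset names; the
// helper name is hypothetical.
static int16_t exampleLookupUtf8 ( ) {
    const char *name = "utf-8";
    // returns the MIB enum for the charset, or csUnknown if not found
    int16_t cs = get_iana_charset ( name , strlen(name) );
    if ( cs == csUnknown )
        log(LOG_WARN, "build: charset '%s' not in IANA table", name);
    return cs;
}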
static bool loadSpiderProxyStats() {
    initProxyTables();

    // take this out for now since i was seeing dups in s_iptab for
    // some reason. was causing an infinite loop bug calling goto redo:
    // all the time above.

    // save hashtable
    //s_proxyBannedTable.load(g_hostdb.m_dir,"proxybantable.dat");
    //s_banCountTable.load(g_hostdb.m_dir,"proxybancounttable.dat");

    // save hash table. this also returns false if does not exist.
    //if ( ! s_iptab.load(g_hostdb.m_dir,"spiderproxystats.dat") )
    //    return false;

    // unset some flags
    for ( int32_t i = 0 ; i < s_iptab.getNumSlots() ; i++ ) {
        // skip empty slots
        if ( ! s_iptab.m_flags[i] ) continue;
        SpiderProxy *sp = (SpiderProxy *)s_iptab.getValueFromSlot(i);
        sp->m_isWaiting = false;
    }

    return true;
}
// . init s_mimeTable in this call
// . called from HttpServer::init
// . returns false and sets g_errno on error
bool HttpMime::init ( ) {
    // only need to call once
    if ( s_init ) return true;
    // make sure only called once
    s_init = true;
    //s_mimeTable.set ( 256 );
    //s_mimeTable.setLabel("mimetbl");
    if ( ! s_mimeTable.set(4,sizeof(char *),256,NULL,0,false,1,"mimetbl"))
        return false;
    // set table from internal list
    for ( uint32_t i = 0 ; i < sizeof(s_ext)/sizeof(char *) ; i+=2 ) {
        int32_t key = hash32n ( s_ext[i] );
        if ( ! s_mimeTable.addKey ( &key , &s_ext[i+1] ) )
            return log("HttpMime::init: failed to set table.");
    }
    // quick test
    const char *tt = getContentTypeFromExtension ( "zip" );
    if ( strcmp(tt,"application/zip") != 0 ) {
        g_errno = EBADENGINEER;
        return log("http: Failed to init mime table correctly.");
    }
    // a more thorough test
    for ( uint32_t i = 0 ; i < sizeof(s_ext)/sizeof(char *) ; i+=2) {
        tt = getContentTypeFromExtension ( s_ext[i] );
        if ( strcmp(tt,s_ext[i+1]) == 0 ) continue;
        g_errno = EBADENGINEER;
        return log("http: Failed to do mime table correctly. i=%" PRId32,i);
    }
    // TODO: set it from a user supplied file here
    return true;
}
bool AdultBit::isDirty ( char *s , int32_t len ) {
    static bool s_isInitialized = false;
    static char *s_dirty[] = {
        "anal",
        "analsex",
        "blowjob",
        "blowjobs",
        "boob",
        "boobs",
        "clitoris",
        "cock",
        "cocks",
        "cum",
        "dick",
        "dicks",
        "gangbang",
        "gangbangs",
        "gangbanging",
        "movie",
        "movies",
        "oral",
        "oralsex",
        "porn",
        "porno",
        "pussy",
        "pussies",
        "sex",
        "sexy",
        "tit",
        "tits",
        "video",
        "videos",
        "xxx",
        "xxxx",
        "xxxx"
    };

    if ( ! s_isInitialized ) {
        // set up the hash table
        if ( ! s_dtable.set ( 8,4,sizeof(s_dirty )*2,NULL,0,false,0,
                              "adulttab"))
            return log("build: Error initializing "
                       "dirty word hash table." );
        // now add in all the dirty words
        int32_t n = (int32_t)sizeof(s_dirty)/ sizeof(char *);
        for ( int32_t i = 0 ; i < n ; i++ ) {
            int64_t h = hash64b ( s_dirty [i] );
            if ( ! s_dtable.addTerm (&h, i+1) ) return false;
        }
        s_isInitialized = true;
    }
    // compute the hash of the word "s"
    int64_t h = hash64Lower_a ( s , len );
    // get from table
    return s_dtable.getScore ( &h );
}
static bool initProxyTables() {
    // initialize proxy/urlip ban table?
    if ( ! s_init ) return true;
    s_init = false;
    s_proxyBannedTable.set(8,0,0,NULL,0,false,"proxban");
    s_banCountTable.set(4,4,0,NULL,0,false,"bancnt");
    return true;
}
// save the stats
bool saveSpiderProxyStats ( ) {
    // save hashtable
    s_proxyBannedTable.save(g_hostdb.m_dir,"proxybantable.dat");
    s_banCountTable.save(g_hostdb.m_dir,"proxybancounttable.dat");
    // save hash table
    return s_iptab.save(g_hostdb.m_dir,"spiderproxystats.dat");
}
//
// . new code for saving hashtablex in a thread
// . so Process.cpp's call to g_spiderCache.save() can save the doleiptable
//   without blocking...
//
static void *saveWrapper ( void *state , class ThreadEntry *t ) {
    // get this class
    HashTableX *THIS = (HashTableX *)state;
    // this returns false and sets g_errno on error
    THIS->save( THIS->m_dir , THIS->m_filename , THIS->m_tbuf , THIS->m_tsize );
    // now exit the thread, bogus return
    return NULL;
}
bool resetProxyStats ( ) {
    // s_proxyBannedTable.reset();
    // s_banCountTable.reset();
    // s_iptab.reset();

    s_iptab.set(8,sizeof(SpiderProxy),0,NULL,0,false,"siptab",true);
    // skip port part of key magic, and get LSB of the IP as key magic
    s_iptab.m_maskKeyOffset = 5;

    s_proxyBannedTable.set(8,0,0,NULL,0,false,"proxban");
    s_banCountTable.set(4,4,0,NULL,0,false,"bancnt");

    return buildProxyTable();
}
SpiderProxy *getSpiderProxyByIpPort ( int32_t ip , uint16_t port ) {
    for ( int32_t i = 0 ; i < s_iptab.getNumSlots() ; i++ ) {
        // skip empty slots
        if ( ! s_iptab.m_flags[i] ) continue;
        SpiderProxy *sp = (SpiderProxy *)s_iptab.getValueFromSlot(i);
        if ( sp->m_ip   != ip   ) continue;
        if ( sp->m_port != port ) continue;
        return sp;
    }
    return NULL;
}
void gbiconv_reset(){
    for (int32_t i=0;i<s_convTable.getNumSlots();i++){
        //int32_t key = *(int32_t *)s_convTable.getKey(i);
        //if (!key) continue;
        if ( ! s_convTable.m_flags[i] ) continue;
        iconv_t *pconv = (iconv_t *)s_convTable.getValueFromSlot(i);
        if (! pconv) continue;
        iconv_t iconv = *pconv;
        //logf(LOG_DEBUG, "iconv: freeing iconv: 0x%x", (int)iconv);
        g_mem.rmMem((void*)iconv, 52, "iconv");
        libiconv_close(iconv);
    }
    s_convTable.reset();
}
static bool initEntityTable(){
    if ( ! s_isInitialized ) {
        // set up the hash table
        if ( ! s_table.set ( 8,4,255,NULL,0,false,0,"enttbl" ) )
            return log("build: Could not init table of "
                       "HTML entities.");
        // now add in all the stop words
        int32_t n = (int32_t)sizeof(s_entities) / (int32_t)sizeof(Entity);
        for ( int32_t i = 0 ; i < n ; i++ ) {
            int64_t h = hash64b ( s_entities[i].entity );
            // grab the unicode code point
            UChar32 up = s_entities[i].unicode;
            // now we are 100% up
            if ( ! up ) { char *xx=NULL;*xx=0; }
            // point to it
            char *buf = (char *)s_entities[i].utf8;
            // if uchar32 not 0 then set the utf8 with it
            int32_t len = utf8Encode(up,buf);
            //
            // make my own mods to make parsing easier
            //
            if ( up == 160 ) { // nbsp
                buf[0] = ' ';
                len = 1;
            }
            //
            // end custom mods
            //
            // set length
            s_entities[i].utf8Len = len;
            // check it
            if ( len == 0 ) { char *xx=NULL;*xx=0; }
            // must not exist!
            if ( s_table.isInTable(&h) ) { char*xx=NULL;*xx=0;}
            // store the entity index in the hash table as score
            if ( ! s_table.addTerm ( &h, i+1 ) ) return false;
        }
        s_isInitialized = true;
    }
    return true;
}
iconv_t gbiconv_open( char *tocode, char *fromcode) {
    // get hash for to/from
    uint32_t hash1 = hash32Lower_a(tocode, gbstrlen(tocode), 0);
    uint32_t hash2 = hash32Lower_a(fromcode, gbstrlen(fromcode),0);
    uint32_t hash = hash32h(hash1, hash2);

    g_errno = 0;
    iconv_t *convp = (iconv_t *)s_convTable.getValue(&hash);
    iconv_t conv = NULL;
    if ( convp ) conv = *convp;
    //log(LOG_DEBUG, "uni: convertor %s -> %s from hash 0x%"XINT32": 0x%"XINT32"",
    //    fromcode, tocode,
    //    hash, conv);
    if (!conv){
        //log(LOG_DEBUG, "uni: Allocating new convertor for "
        //    "%s to %s (hash: 0x%"XINT32")",
        //    fromcode, tocode,hash);
        conv = iconv_open(tocode, fromcode);
        if (conv == (iconv_t) -1) {
            log(LOG_WARN, "uni: failed to open converter for "
                "%s to %s: %s (%d)", fromcode, tocode,
                strerror(errno), errno);
            // need to stop if necessary converters don't open
            //char *xx=NULL; *xx = 0;
            g_errno = errno;
            if (errno == EINVAL)
                g_errno = EBADCHARSET;
            return conv;
        }
        // add mem to table to keep track
        g_mem.addMem((void*)conv, 52, "iconv", 1);
        // cache convertor
        s_convTable.addKey(&hash, &conv);
        //log(LOG_DEBUG, "uni: Saved convertor 0x%"INT32" under hash 0x%"XINT32"",
        //    conv, hash);
    }
    else{
        // reset convertor
        char *dummy = NULL;
        size_t dummy2 = 0;
        // JAB: warning abatement
        //size_t res = iconv(conv,NULL,NULL,&dummy,&dummy2);
        iconv(conv,NULL,NULL,&dummy,&dummy2);
    }
    return conv;
}
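// A minimal usage sketch (added for illustration, not part of the original
// source): convert a small ISO-8859-1 buffer to UTF-8 through the cached
// converter. The charset names and the helper name are examples, and the
// usual POSIX iconv() signature is assumed; errors are only checked coarsely.
static bool exampleConvertLatin1ToUtf8 ( char *in , size_t inLen ,
                                         char *out , size_t outSize ) {
    iconv_t cd = gbiconv_open ( (char *)"UTF-8" , (char *)"ISO-8859-1" );
    if ( cd == (iconv_t)-1 ) return false;
    char  *inPtr   = in;
    char  *outPtr  = out;
    size_t inLeft  = inLen;
    size_t outLeft = outSize;
    // iconv() advances the pointers and decrements the byte counts
    size_t res = iconv ( cd , &inPtr , &inLeft , &outPtr , &outLeft );
    // the converter stays cached in s_convTable; gbiconv_reset() frees it
    return ( res != (size_t)-1 );
}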
bool saveHashTable ( ) {
    if ( s_ht.m_numSlotsUsed <= 0 ) return true;
    SafeBuf fn;
    fn.safePrintf("%s/qa/",g_hostdb.m_dir);
    log("qa: saving crctable.dat");
    s_ht.save ( fn.getBufStart() , "crctable.dat" );
    return true;
}
bool AdultBit::isObscene ( char *s , int32_t len ) {
    static bool s_isInitialized = false;
    static char *s_obscene[] = {
        "clit",
        "clits",
        // "cum", magna cum laude
        "cums",
        "cumshot",
        "cunt",
        "cunts",
        "milf",
        "rimjob",
        "felch",
        "fuck",
        "fucked",
        "fucker",
        "fucking",
        "fucks",
        "whore",
        "whores"
    };

    if ( ! s_isInitialized ) {
        // set up the hash table
        if ( ! s_otable.set ( 8,4,sizeof(s_obscene)*2,NULL,0,false,0,
                              "obscenetab") )
            return log("build: Error initializing "
                       "obscene word hash table." );
        // now add in all the stop words
        int32_t n = sizeof(s_obscene) / sizeof(char *);
        for ( int32_t i = 0 ; i < n ; i++ ) {
            int64_t h = hash64b ( s_obscene[i] );
            if ( ! s_otable.addTerm ( &h, i+1 ) ) return false;
        }
        s_isInitialized = true;
    }
    // compute the hash of the word "s"
    int64_t h = hash64Lower_a ( s , len );
    // get from table
    return s_otable.getScore ( &h );
}
nodeid_t getTagId ( char *s , NodeType **retp ) {
    // init table?
    static bool s_init = false;
    static HashTableX s_ht;
    static char s_buf[10000];
    if ( ! s_init ) {
        s_init = true;
        s_ht.set ( 4 ,4,1024,s_buf,10000,false,0,"tagids");//niceness=0
        // how many NodeTypes do we have in g_nodes?
        static int32_t nn = sizeof(g_nodes) / sizeof(NodeType);
        // set the hash table
        for ( int32_t i = 0 ; i < nn ; i++ ) {
            char *name = g_nodes[i].m_nodeName;
            int32_t nlen = gbstrlen(name);
            int64_t h = hash64Upper_a ( name,nlen,0LL );
            NodeType *nt = &g_nodes[i];
            if ( ! s_ht.addKey(&h,&nt) ) { char *xx=NULL;*xx=0; }
        }
        // sanity
        if ( s_ht.m_numSlots != 1024 ) { char *xx=NULL;*xx=0; }
        // sanity test
        nodeid_t tt = getTagId ( "br" );
        if ( tt != TAG_BR ) { char *xx=NULL;*xx=0; }
    }
    // find end of tag name. hyphens are ok to be in name.
    // facebook uses underscores like <start_time>
    char *e = s;
    for ( ; *e && (is_alnum_a(*e) || *e=='-'|| *e=='_'); e++);
    // hash it for lookup
    int64_t h = hash64Upper_a ( s , e - s , 0 );
    // look it up
    NodeType **ntp = (NodeType **)s_ht.getValue(&h);
    // assume none
    if ( retp ) *retp = NULL;
    // none?
    if ( ! ntp ) return 0;
    // got one
    if ( retp ) *retp = *ntp;
    // get id otherwise
    return (*ntp)->m_nodeId;
}
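// A minimal usage sketch (added for illustration, not part of the original
// source): look up the tag id and NodeType for the text of an opening tag.
// It assumes TAG_DIV is defined alongside TAG_BR in the node id list; the
// helper name is hypothetical.
static bool exampleIsDivTag ( char *tagText ) {
    NodeType *nt = NULL;
    // id is 0 and nt stays NULL when the tag name is not in g_nodes
    nodeid_t id = getTagId ( tagText , &nt );
    return ( id == TAG_DIV );
}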
const char *extensionToContentTypeStr2 ( const char *ext , int32_t elen ) {
    // return NULL if no extension provided
    if ( ! ext || ! ext[0] ) return NULL;
    if ( elen <= 0 ) return NULL;
    // get hash for table look up
    int32_t key = hash32 ( ext , elen );
    char **pp = (char **)s_mimeTable.getValue ( &key );
    if ( ! pp ) return NULL;
    return *pp;
}
// . list of types is on: http://www.duke.edu/websrv/file-extensions.html
// . i copied it to the bottom of this file though
const char *HttpMime::getContentTypeFromExtension ( const char *ext ) {
    // assume text/html if no extension provided
    if ( ! ext || ! ext[0] ) return "text/html";
    // get hash for table look up
    int32_t key = hash32n ( ext );
    char **pp = (char **)s_mimeTable.getValue ( &key );
    // if not found in table, assume text/html
    if ( ! pp ) return "text/html";
    return *pp;
}
// call this at startup to register the handlers
bool initSpiderProxyStuff() {
    // do this for all hosts in case host #0 goes dead, then everyone
    // will, according to Msg13.cpp, send to host #1, the next in line
    // if she is alive
    //if ( g_hostdb.m_myHostId != 0 ) return true;

    // only host #0 has handlers
    if ( ! g_udpServer.registerHandler ( msg_type_54, handleRequest54 ))
        return false;

    // key is ip/port
    s_iptab.set(8,sizeof(SpiderProxy),0,NULL,0,false,"siptab",true);
    // skip port part of key magic, and get LSB of the IP as key magic
    s_iptab.m_maskKeyOffset = 5;

    loadSpiderProxyStats();

    // build the s_iptab hashtable for the first time
    buildProxyTable ();

    // reset spider proxy stats every hour to alleviate false positives
    // (moved from Process.cpp)
    if (!g_loop.registerSleepCallback(3600000, NULL, resetProxyStatWrapper, 0)) {
        gbshutdownResourceError();
    }

    // make the loadtable hashtable
    static bool s_flag = 0;
    if ( s_flag ) return true;
    s_flag = true;
    return s_loadTable.set(4, sizeof(LoadBucket), 128, NULL, 0,
                           // this slows us down
                           true, // allow dups?
                           "lbtab",
                           true); // use key magic to mix things up
}
// . use msg 0x55 to say you are done using the proxy
// . we now use the top part of the Msg13Request as the proxy request
void returnProxy ( Msg13Request *preq , UdpSlot *udpSlot ) {
    //char *p = request;
    //int32_t proxyIp = *(int32_t *)p; p += 4;
    //int16_t proxyPort = *(int16_t *)p; p += 2;
    //int32_t lbId = *(int32_t *)p; p += 4;
    int32_t urlIp = preq->m_urlIp;

    //
    // update the load bucket
    //

    // scan over all that match to find lbid
    int32_t hslot = s_loadTable.getSlot ( &urlIp );
    // scan all proxies that have this urlip outstanding
    int32_t i;
    for ( i = hslot ; i >= 0 ; i = s_loadTable.getNextSlot(i,&urlIp) ) {
        // get the bucket
        LoadBucket *lb= (LoadBucket *)s_loadTable.getValueFromSlot(i);
        // is it the right id?
        if ( lb->m_id        != preq->m_lbId      ) continue;
        if ( lb->m_proxyIp   != preq->m_proxyIp   ) continue;
        if ( lb->m_proxyPort != preq->m_proxyPort ) continue;
        // that's it. set the download end time
        int64_t nowms = gettimeofdayInMillisecondsLocal();
        lb->m_downloadEndTimeMS = nowms;
        break;
    }

    if ( i < 0 )
        log("sproxy: could not find load bucket id #%" PRId32,preq->m_lbId);

    // if no slot provided, return to caller without sending back a reply;
    // they are banning a proxy and need to also return it before
    // we send them back another proxy to try.
    if ( ! udpSlot ) return;

    // gotta send reply back
    g_udpServer.sendReply(0, 0, 0, 0, udpSlot);
}
// get the id from a 2 character country code
uint8_t getCountryId ( char *cc ) {
    static bool s_init = false;
    static char buf[2000];
    static HashTableX ht;
    char tmp[4];
    if ( ! s_init ) {
        s_init = true;
        // hash them up
        ht.set ( 4 , 1 , -1,buf,2000,false,MAX_NICENESS,"ctryids");
        // now add in all the country codes
        long n = (long) sizeof(s_countryCode) / sizeof(char *);
        for ( long i = 0 ; i < n ; i++ ) {
            char *s = (char *)s_countryCode[i];
            //long slen = gbstrlen ( s );
            // sanity check
            if ( !s[0] || !s[1] || s[2]) { char *xx=NULL;*xx=0; }
            // map it to a 4 byte key
            tmp[0]=s[0];
            tmp[1]=s[1];
            tmp[2]=0;
            tmp[3]=0;
            // a val of 0 does not mean empty in HashTableX,
            // that is an artifact of HashTableT
            uint8_t val = i; // +1; // add 1 cuz 0 means lang unknown
            if ( ! ht.addKey ( tmp , &val ) ) { char *xx=NULL;*xx=0; }
        }
    }
    // lookup
    tmp[0]=to_lower_a(cc[0]);
    tmp[1]=to_lower_a(cc[1]);
    tmp[2]=0;
    tmp[3]=0;
    long slot = ht.getSlot ( tmp );
    if ( slot < 0 ) return 0;
    void *val = ht.getValueFromSlot ( slot );
    return *(uint8_t *)val;
}
// . how many keys are dups
// . returns -1 on error
long HashTableX::getNumDups() {
    if ( ! m_allowDups ) return 0;
    HashTableX tmp;
    if ( ! tmp.set ( m_ks, 0, m_numSlots, NULL, 0, false, m_niceness,
                     "htxtmp") )
        return -1;
    // put into that table
    for ( long i = 0 ; i < m_numSlots ; i++ ) {
        // skip empty bucket
        if ( ! m_flags[i] ) continue;
        // get the key
        char *kp = (char *)getKeyFromSlot(i);
        // add to new table
        if ( ! tmp.addKey ( kp ) ) return -1;
    }
    // the uniques
    long uniques = tmp.m_numSlotsUsed;
    // the dups
    long dups = m_numSlotsUsed - uniques;
    // that's it
    return dups;
}
// we come here after thread exits
static void threadDoneWrapper ( void *state , class ThreadEntry *t ) {
    // get this class
    HashTableX *THIS = (HashTableX *)state;
    // store save error into g_errno
    //g_errno = THIS->m_saveErrno;
    // log it
    log("db: done saving %s/%s",THIS->m_dir,THIS->m_filename);
    // . resume adding to the hashtable
    // . this will also allow other threads to be queued
    // . if we did this at the end of the thread we could end up with
    //   an overflow of queued SAVETHREADs
    THIS->m_isSaving = false;
    // we do not need to be saved now?
    THIS->m_needsSave = false;
    // g_errno should be preserved from the thread so if threadSave()
    // had an error it will be set
    if ( g_errno )
        log("db: Had error saving hashtable to disk for %s: %s.",
            THIS->m_allocName,mstrerror(g_errno));
    // . call callback
    if ( THIS->m_callback ) THIS->m_callback ( THIS->m_state );
}
static bool initEntityTable(){
    if ( ! s_isInitialized ) {
        // set up the hash table
        if ( ! s_table.set ( 8,4,4096,NULL,0,false,"enttbl" ) ) {
            log("build: Could not init table of HTML entities.");
            return false;
        }
        // now add in all the html entities
        const int32_t n = (int32_t)sizeof(s_entities) / (int32_t)sizeof(Entity);
        for ( int32_t i = 0 ; i < n ; i++ ) {
            int64_t h = hash64b ( s_entities[i].entity );
            // convert the unicode codepoints to an utf8 string
            char *buf = (char *)s_entities[i].utf8;
            for ( int j = 0 ; j < s_entities[i].codepoints ; j++ ) {
                UChar32 codepoint = s_entities[i].codepoint[j];
                int32_t len = utf8Encode(codepoint,buf);
                if ( len == 0 ) { g_process.shutdownAbort(true); }
                // make modification to make parsing easier
                if ( codepoint == 160 ) { // nbsp
                    buf[0] = ' ';
                    len = 1;
                }
                buf += len;
            }
            s_entities[i].utf8Len = (size_t)(buf-s_entities[i].utf8);
            // must not exist!
            if ( s_table.isInTable(&h) ) { g_process.shutdownAbort(true); }
            // store the entity index in the hash table as score
            if ( ! s_table.addTerm(h, i+1) ) return false;
        }
        s_isInitialized = true;
    }
    return true;
}
// . is "s" an HTML entity? (ascii representative of an iso char)
// . return the Entity it represents
// . returns NULL if none
// . JAB: const-ness for optimizer...
static const Entity *getTextEntity ( const char *s , int32_t len ) {
    if ( ! initEntityTable() ) return NULL;
    // take the ; off, if any
    if ( s[len-1] == ';' ) len--;
    // compute the hash of the entity including &, but not ;
    int64_t h = hash64 ( s , len );
    // get the entity index from table (stored in the score field)
    int32_t i = (int32_t) s_table.getScore(h);
    // return NULL if no match
    if ( i == 0 ) return NULL;
    // point to the entity
    return s_entities+i-1;
}
// . is "s" an HTML entity? (ascii representative of an iso char)
// . return the 32-bit unicode char it represents
// . returns 0 if none
// . JAB: const-ness for optimizer...
uint32_t getTextEntity ( const char *s , int32_t len ) {
    if ( ! initEntityTable() ) return 0;
    // take the ; off, if any
    if ( s[len-1] == ';' ) len--;
    // compute the hash of the entity including &, but not ;
    int64_t h = hash64 ( s , len );
    // get the entity index from table (stored in the score field)
    int32_t i = (int32_t) s_table.getScore ( &h );
    // return 0 if no match
    if ( i == 0 ) return 0;
    // point to the utf8 char. this is 1 or 2 bytes it seems
    char *p = (char *)s_entities[i-1].utf8;
    // decode into unicode
    uint32_t c = utf8Decode ( p );
    // return that
    return c;
}
// . sets *current to how many downloads are CURRENTly outstanding for
//   this proxy
// . returns total # of load points, i.e. downloads, in the last
//   LOADPOINT_EXPIRE_MS milliseconds (currently 10 minutes)
static int32_t getNumLoadPoints(SpiderProxy *sp, int32_t *current) {
    // currently outstanding downloads on proxy
    *current = 0;
    int32_t count = 0;
    // scan all load points in the load table for this proxy
    for ( int32_t i = 0 ; i < s_loadTable.m_numSlots ; i++ ) {
        // skip if empty
        if ( ! s_loadTable.m_flags[i] ) continue;
        // get the bucket
        LoadBucket *lb= (LoadBucket *)s_loadTable.getValueFromSlot(i);
        // get the spider proxy this load point was for
        if ( lb->m_proxyIp   != sp->m_ip   ) continue;
        if ( lb->m_proxyPort != sp->m_port ) continue;
        // currently outstanding downloads on proxy
        if ( lb->m_downloadEndTimeMS == 0LL ) *current = *current + 1;
        count++;
    }
    return count;
}
// . is "s" an HTML entity? (ascii representative of an iso char)
// . return the 32-bit unicode char it represents
// . returns 0 if none
// . JAB: const-ness for optimizer...
uint32_t getTextEntity ( char *s , int32_t len ) {
    if ( ! initEntityTable() ) return 0;
    // take the ; off, if any
    if ( s[len-1] == ';' ) len--;
    // compute the hash of the entity including &, but not ;
    int64_t h = hash64 ( s , len );
    // get the entity index from table (stored in the score field)
    int32_t i = (int32_t) s_table.getScore ( &h );
    // return 0 if no match
    if ( i == 0 ) return 0;
    // point to the utf8 char. this is 1 or 2 bytes it seems
    char *p = (char *)s_entities[i-1].utf8;
    // decode into unicode
    uint32_t c = utf8Decode ( p );
    // return that
    return c;
    // return the iso character
    //printf("Converted text entity \"");
    //for(int si=0;si<len;si++)putchar(s[si]);
    //printf("\" to 0x%x(%d)\"%c\"\n",s_entities[i-1].c,s_entities[i-1].c,
    //       s_entities[i-1].c);
    //return (uint32_t)s_entities[i-1].c;
}
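// A minimal usage sketch (added for illustration, not part of the original
// source). It assumes the s_entities table stores names with the leading '&',
// e.g. "&amp;", as the "hash including &" comment above implies; the helper
// name is hypothetical.
static bool exampleDecodeAmp ( ) {
    char ent[] = "&amp;";
    // the trailing ';' is stripped inside getTextEntity before hashing
    uint32_t c = getTextEntity ( ent , gbstrlen(ent) );
    return ( c == (uint32_t)'&' );
}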
// . when the Conf::m_proxyIps parm is updated we call this to rebuild
//   s_iptab, our table of SpiderProxy instances, which has the proxies and
//   their performance statistics.
// . we try to maintain stats of ip/ports that did NOT change when rebuilding.
bool buildProxyTable ( ) {

    // scan the NEW list of proxy ip/port pairs in g_conf
    char *p = g_conf.m_proxyIps.getBufStart();

    HashTableX tmptab;
    tmptab.set(8,0,16,NULL,0,false,"tmptab");

    // scan the user inputted space-separated list of ip:ports
    // (optional username:password@ip:port)
    for ( ; *p ; ) {
        // skip white space
        if ( is_wspace_a(*p) ) { p++; continue; }

        // skip http://
        if ( strncasecmp(p,"http://",7) == 0 ) { p += 7; continue; }

        // scan in an ip:port
        char *s = p;
        char *portStr = NULL;
        int32_t dc = 0, pc = 0, gc = 0, bc = 0;
        const char *msg;
        char *usernamePwd = NULL;
        int32_t usernamePwdLen = 0;
        char *ipStart = p;

        // scan all characters until we hit \0 or another whitespace
        for ( ; *s && !is_wspace_a(*s); s++) {

            if ( *s == '@' ) {
                // must be username:pwd
                if ( pc != 1 ) {
                    msg = "bad username:password";
                    goto hadError;
                }
                usernamePwd = p;
                usernamePwdLen = s - p;
                if ( usernamePwdLen >= MAXUSERNAMEPWD-2 ) {
                    msg = "username:password too long";
                    goto hadError;
                }
                dc = 0;
                gc = 0;
                bc = 0;
                pc = 0;
                portStr = NULL;
                ipStart = s+1;
                continue;
            }

            if ( *s == '.' ) { dc++; continue; }
            if ( *s == ':' ) { portStr=s; pc++; continue; }
            if ( is_digit(*s) ) { gc++; continue; }
            bc++;
            continue;
        }

        // ensure it is a legit ip:port combo
        msg = NULL;
        if ( gc < 4   ) msg = "not enough digits for an ip";
        if ( pc > 1   ) msg = "too many colons";
        if ( dc != 3  ) msg = "need 3 dots for an ip address";
        if ( bc       ) msg = "got illegal char in ip:port listing";
        if ( msg ) {
        hadError:
            char c = *s;
            *s = '\0';
            log("buf: %s for %s",msg,p);
            *s = c;
            return false;
        }

        // convert it
        int32_t iplen = s - ipStart;
        if ( portStr ) iplen = portStr - ipStart;
        int32_t ip = atoip(ipStart,iplen);

        // another sanity check
        if ( ip == 0 || ip == -1 ) {
            log("spider: got bad proxy ip for %s",p);
            return false;
        }

        // and the port default is 80
        int32_t port = 80;
        if ( portStr ) port = atol2(portStr+1,s-portStr-1);
        if ( port < 0 || port > 65535 ) {
            log("spider: got bad proxy port for %s",p);
            return false;
        }

        // . we got a legit ip:port
        // . see if already in our table
        uint64_t ipKey = (uint32_t)ip;
        ipKey <<= 16;
        ipKey |= (uint16_t)(port & 0xffff);

        // also store into tmptab to see what we need to remove
        tmptab.addKey(&ipKey);

        // see if in table
        int32_t islot = s_iptab.getSlot( &ipKey);

        // advance p
        p = s;

        // if in there, keep it as is
        if ( islot >= 0 ) continue;

        // otherwise add new entry
        SpiderProxy newThing;
        memset ( &newThing , 0 , sizeof(SpiderProxy));
        newThing.m_ip = ip;
        newThing.m_port = port;
        newThing.m_lastDownloadTookMS = -1;
        newThing.m_lastSuccessfulTestMS = -1;
        gbmemcpy(newThing.m_usernamePwd,usernamePwd,usernamePwdLen);
        // ensure it is NULL terminated
        newThing.m_usernamePwd[usernamePwdLen] = '\0';

        if ( ! s_iptab.addKey ( &ipKey, &newThing ) )
            return false;
    }

 redo:
    int32_t removed = 0;
    // scan all SpiderProxies in s_iptab
    for ( int32_t i = 0 ; i < s_iptab.getNumSlots() ; i++ ) {
        // skip empty buckets in hashtable s_iptab
        if ( ! s_iptab.m_flags[i] ) continue;
        // get the key
        int64_t key = *(int64_t *)s_iptab.getKeyFromSlot(i);
        // must also exist in tmptab, otherwise it got removed by user
        if ( tmptab.isInTable ( &key ) ) continue;
        // skip if not in table
        if ( s_iptab.getSlot ( &key ) < 0 ) {
            log("sproxy: iptable hashing messed up");
            continue;
        }
        // shoot, it got removed. not in the new list of ip:ports
        s_iptab.removeKey ( &key );
        removed++;
        // hashtable is messed up now, start over
        //goto redo;
    }
    if ( removed ) goto redo;

    return true;
}
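// A small helper sketch (added for illustration, not part of the original
// source): buildProxyTable() above and handleRequest54() below pack the
// s_iptab key the same way, so the packing could be factored out like this.
// The function name is hypothetical.
static uint64_t makeProxyIpPortKey ( int32_t ip , uint16_t port ) {
    // the 32-bit proxy ip goes in the upper bits and the port in the low
    // 16 bits; s_iptab sets m_maskKeyOffset = 5 so the key "magic" skips
    // the port part, per the comment where the table is set up
    uint64_t ipKey = (uint32_t)ip;
    ipKey <<= 16;
    ipKey |= (uint16_t)(port & 0xffff);
    return ipKey;
}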
// a host is asking us (host #0) what proxy to use?
static void handleRequest54(UdpSlot *udpSlot, int32_t niceness) {

    char *request       = udpSlot->m_readBuf;
    int32_t requestSize  = udpSlot->m_readBufSize;

    // we now use the top part of the Msg13Request as the ProxyRequest
    Msg13Request *preq = (Msg13Request *)request;

    // sanity check
    if ( requestSize != preq->getProxyRequestSize() ) {
        log("db: Got bad request 0x54 size of %" PRId32" bytes. bad",
            requestSize );
        g_udpServer.sendErrorReply ( udpSlot , EBADREQUESTSIZE );
        return;
    }

    // is the request telling us it is done downloading through a proxy?
    if ( preq->m_opCode == OP_RETPROXY ) {
        returnProxy ( preq , udpSlot );
        return;
    }

    // if sender is asking for a new proxy and wants us to ban
    // the previous proxy we sent for this urlIp...
    if ( preq->m_banProxyIp ) {
        // don't core if misses sanity. it seems we don't always
        // NULLify these or something.
        // these must match
        if ( preq->m_banProxyIp   != preq->m_proxyIp ||
             preq->m_banProxyPort != preq->m_proxyPort ) {
            log("db: proxy: banproxyip != proxyip. mismatch!");
            g_udpServer.sendErrorReply ( udpSlot , EBADENGINEER );
            return;
        }
        // this will "return" the banned proxy
        returnProxy ( preq , NULL );
        // now add it to the banned table
        int64_t uip = preq->m_urlIp;
        int64_t pip = preq->m_banProxyIp;
        int64_t h64 = hash64h ( uip , pip );
        if ( ! s_proxyBannedTable.isInTable ( &h64 ) ) {
            s_proxyBannedTable.addKey ( &h64 );
            // for stats counting. each proxy ip maps to #
            // of unique website IPs that have banned it.
            s_banCountTable.addTerm32((uint32_t)pip);
        }
    }

    // shortcut
    int32_t urlIp = preq->m_urlIp;

    // send to a proxy that is up and has the least amount
    // of LoadBuckets with this urlIp, if tied, go to least loaded.

    // clear counts for this url ip for scoring the best proxy to use
    for ( int32_t i = 0 ; i < s_iptab.getNumSlots() ; i++ ) {
        // skip empty slots
        if ( ! s_iptab.m_flags[i] ) continue;
        SpiderProxy *sp = (SpiderProxy *)s_iptab.getValueFromSlot(i);
        sp->m_countForThisIp = 0;
        sp->m_lastTimeUsedForThisIp = 0LL;
    }

    // this table maps a url's current IP to possibly MULTIPLE slots
    // which tell us what proxy is downloading a page from that IP.
    // so we can try to find a proxy that is not downloading a url from
    // this IP currently, or hasn't been for the longest time...
    int32_t hslot = s_loadTable.getSlot ( &urlIp );
    // scan all proxies that have this urlip outstanding
    for ( int32_t i = hslot ; i >= 0 ; i = s_loadTable.getNextSlot(i,&urlIp)){
        // get the bucket
        LoadBucket *lb;
        lb = (LoadBucket *)s_loadTable.getValueFromSlot(i);
        // get the spider proxy this load point was for
        uint64_t key = (uint32_t)lb->m_proxyIp;
        key <<= 16;
        key |= (uint16_t)lb->m_proxyPort;
        SpiderProxy *sp = (SpiderProxy *)s_iptab.getValue(&key);
        // must be there unless user removed it from the list
        if ( ! sp ) continue;
        // count it up
        if ( lb->m_downloadEndTimeMS == 0LL ) sp->m_countForThisIp++;
        // set the last time used to the most recently downloaded time
        // that this proxy has downloaded from this ip
        if ( lb->m_downloadEndTimeMS &&
             lb->m_downloadEndTimeMS > sp->m_lastTimeUsedForThisIp )
            sp->m_lastTimeUsedForThisIp = lb->m_downloadEndTimeMS;
    }

    // first try to get a spider proxy that is not "dead"
    bool skipDead = true;

    int32_t numBannedProxies = 0;
    int32_t aliveProxyCandidates = 0;

 redo:
    // get the min of the counts
    int32_t minCount = 999999;
    for ( int32_t i = 0 ; i < s_iptab.getNumSlots() ; i++ ) {
        // skip empty slots
        if ( ! s_iptab.m_flags[i] ) continue;
        // get the spider proxy
        SpiderProxy *sp = (SpiderProxy *)s_iptab.getValueFromSlot(i);
        // if this proxy was banned by the url's ip... skip it. it is
        // not a candidate...
        if ( skipDead ) {
            int64_t uip = preq->m_urlIp;
            int64_t pip = sp->m_ip;
            int64_t h64 = hash64h ( uip , pip );
            if ( s_proxyBannedTable.isInTable ( &h64 ) ) {
                numBannedProxies++;
                continue;
            }
        }
        // if it failed the last test, skip it
        if ( skipDead && sp->m_lastDownloadError ) continue;

        if ( skipDead ) aliveProxyCandidates++;

        if ( sp->m_countForThisIp >= minCount ) continue;
        minCount = sp->m_countForThisIp;
    }

    // all dead? then get the best dead one
    if ( minCount == 999999 ) {
        skipDead = false;
        goto redo;
    }

    // . we only use one proxy if none are banned by this IP
    // . when that gets banned, we will use the next 2 proxies with
    //   a higher backoff/crawlDelay, etc.
    int32_t threshHold;
    if      ( numBannedProxies <= 0 ) threshHold = 1;
    // if first proxy gets banned, try next 2 proxies until both get ban'd
    else if ( numBannedProxies == 1 ) threshHold = 2;
    else if ( numBannedProxies <  1+2 ) threshHold = 3 - numBannedProxies;
    // if next two proxies got banned, try next 4 proxies until banned
    else if ( numBannedProxies == 3 ) threshHold = 4;
    else if ( numBannedProxies <  3+4 ) threshHold = 7 - numBannedProxies;
    // if next 4 proxies got banned, try next 8 proxies until they get banned
    else if ( numBannedProxies == 7 ) threshHold = 8;
    else if ( numBannedProxies <  7+8 ) threshHold = 15 - numBannedProxies;
    else if ( numBannedProxies == 15 ) threshHold = 16;
    else if ( numBannedProxies <  15+16 ) threshHold = 31 - numBannedProxies;
    else if ( numBannedProxies == 31 ) threshHold = 32;
    else if ( numBannedProxies <  31+32 ) threshHold = 63 - numBannedProxies;
    else if ( numBannedProxies == 63 ) threshHold = 64;
    else if ( numBannedProxies <  63+64 ) threshHold = 127 - numBannedProxies;
    else if ( numBannedProxies == 127 ) threshHold = 128;
    else if ( numBannedProxies <  127+128 ) threshHold = 255 - numBannedProxies;
    else if ( numBannedProxies == 255 ) threshHold = 256;
    else if ( numBannedProxies <  255+256 ) threshHold = 512 - numBannedProxies;
    else if ( numBannedProxies == 511 ) threshHold = 512;
    else if ( numBannedProxies <  511+512 ) threshHold = 1024 - numBannedProxies;
    else threshHold = 1024;

    if ( threshHold <= 0 ) {
        log("proxy: spiderproxy error in threshold of %" PRId32" "
            "for banned=%" PRId32,threshHold,numBannedProxies);
        threshHold = 1;
    }

    // reset minCount so we can take the min over those we check here
    minCount = -1;
    int64_t oldest = 0x7fffffffffffffffLL;
    SpiderProxy *winnersp = NULL;
    int32_t count = 0;
    // start at a random slot based on url's IP so we don't
    // overload the first proxy
    int32_t start = ((uint32_t)urlIp) % s_iptab.getNumSlots();
    int32_t slotCount = s_iptab.getNumSlots();
    // . now find the best proxy with the minCount
    for ( int32_t i = start ; ; i++ ) {
        // scan all slots in hash table, then stop
        if ( slotCount-- <= 0 ) break;
        // wrap around to zero if we hit the end
        if ( i == s_iptab.getNumSlots() ) i = 0;
        // skip empty slots
        if ( ! s_iptab.m_flags[i] ) continue;
        // get the spider proxy
        SpiderProxy *sp = (SpiderProxy *)s_iptab.getValueFromSlot(i);
        // if it failed the last test, skip it... not here...
        if ( skipDead && sp->m_lastDownloadError ) continue;
        // if this proxy was banned by the url's ip... skip it. it is
        // not a candidate...
        if ( skipDead ) {
            int64_t uip = preq->m_urlIp;
            int64_t pip = sp->m_ip;
            int64_t h64 = hash64h ( uip , pip );
            if ( s_proxyBannedTable.isInTable ( &h64 ) ) continue;
        }
        // if some proxies are "alive" then only pick from
        // the first half of the proxies that are alive (i.e. still
        // work). that way, when one of those goes dead we will inc
        // the backoff (crawldelay) and a new proxy that we haven't
        // used for this url's IP will take its place. and such
        // new proxies will only have the new backoff count used
        // through them. that way, we don't get ALL of our proxies
        // banned at about the same time since we do somewhat uniform
        // load balancing over them.
        if ( skipDead && count >= threshHold )//aliveProxyCandidates/2 )
            continue;
        // count the alive/non-banned candidates
        count++;
        // if all hosts were "dead" because they all had
        // m_lastDownloadError set then minCount will be 999999
        // and nobody should continue from this statement:
        if ( sp->m_countForThisIp > minCount && minCount >= 0 )
            continue;
        // then go by last download time for this ip
        if ( sp->m_countForThisIp == minCount && minCount >= 0 &&
             sp->m_lastTimeUsedForThisIp >= oldest )
            continue;
        // pick the spider proxy used longest ago
        oldest   = sp->m_lastTimeUsedForThisIp;
        minCount = sp->m_countForThisIp;
        // got a new winner
        winnersp = sp;
    }

    // we must have a winner
    if ( ! winnersp ) { g_process.shutdownAbort(true); }

    int64_t nowms = gettimeofdayInMillisecondsLocal();

    // add a new load bucket then!
    LoadBucket bb;
    bb.m_urlIp = urlIp;
    // the time it started
    bb.m_downloadStartTimeMS = nowms;
    // download has not ended yet
    bb.m_downloadEndTimeMS = 0LL;
    // the host using the proxy
    bb.m_hostId = udpSlot->getHostId();
    // key is this for m_prTable
    bb.m_proxyIp = winnersp->m_ip;
    bb.m_proxyPort = winnersp->m_port;
    // a new id. we use this to update the downloadEndTime when done
    static int32_t s_lbid = 0;
    // add it now
    bb.m_id = s_lbid++;
    s_loadTable.addKey ( &urlIp , &bb );

    // winner count update
    winnersp->m_timesUsed++;

    // sanity
    if ( (int32_t)sizeof(ProxyReply) > TMPBUFSIZE ) { g_process.shutdownAbort(true); }

    // and give proxy ip/port back to the requester so they can
    // use that to download their url
    ProxyReply *prep = (ProxyReply *)udpSlot->m_tmpBuf;
    prep->m_proxyIp = winnersp->m_ip;
    prep->m_proxyPort = winnersp->m_port;

    // this is just '\0' if none
    strcpy(prep->m_usernamePwd,winnersp->m_usernamePwd);

    // do not count the proxy we are returning as "more"
    prep->m_hasMoreProxiesToTry = ( aliveProxyCandidates > 1 );

    // and the loadbucket id, so requester can tell us it is done
    // downloading through the proxy and we can update the LoadBucket
    // for this transaction (m_lbId)
    prep->m_lbId = bb.m_id;

    // requester wants to know how many proxies have been banned by the
    // urlIp so it can increase a self-imposed crawl-delay to be more
    // sensitive to the spider policy.
    prep->m_numBannedProxies = numBannedProxies;

    //char *p = udpSlot->m_tmpBuf;
    //*(int32_t *)p = winnersp->m_ip ; p += 4;
    //*(int16_t *)p = winnersp->m_port; p += 2;
    // and the loadbucket id
    //*(int32_t *)p = bb.m_id; p += 4;

    // with dup keys we end up with long chains of crap and this
    // takes forever. so just flush the whole thing every 2 minutes AND
    // when 20000+ entries are in there
    static time_t s_lastTime = 0;
    time_t now = nowms / 1000;
    if ( s_lastTime == 0 ) s_lastTime = now;
    time_t elapsed = now - s_lastTime;
    if ( elapsed > 120 && s_loadTable.getNumSlots() > 10000 ) {
        log("sproxy: flushing %i entries from proxy loadtable that "
            "have accumulated since %i seconds ago",
            (int)s_loadTable.m_numSlotsUsed,(int)elapsed);
        s_loadTable.clear();
        // only do this once per interval
        s_lastTime = now;
    }

    int32_t sanityCount = 0; //s_loadTable.getNumSlots();
    // top:
    // now remove old entries from the load table. entries that
    // have completed and have a download end time more than 10 mins ago.
    for ( int32_t i = s_loadTable.getNumSlots() - 1 ; i >= 0 ; i-- ) {
        // skip if empty
        if ( ! s_loadTable.m_flags[i] ) continue;
        // get the bucket
        LoadBucket *pp = (LoadBucket *)s_loadTable.getValueFromSlot(i);
        // skip if still active
        if ( pp->m_downloadEndTimeMS == 0LL ) continue;
        // delta t
        int64_t took = nowms - pp->m_downloadEndTimeMS;
        // < 10 mins? now it's < 15 seconds to prevent clogging.
        if ( took < LOADPOINT_EXPIRE_MS ) continue;
        // 100 at a time so we don't slam cpu
        if ( sanityCount++ > 100 ) break;
        // ok, it's too old, nuke it to save memory
        s_loadTable.removeSlot(i);
        // the keys might have buried us but we really should not
        // miss out on analyzing any keys if we just keep looping here
        // should we? TODO: figure it out. if we miss a few it's not
        // a big deal.
        //i--;
        //goto top;
    }

    // send the proxy ip/port/LBid back to user
    g_udpServer.sendReply(udpSlot->m_tmpBuf, sizeof(ProxyReply),
                          udpSlot->m_tmpBuf, sizeof(ProxyReply), udpSlot);
}
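// A compact restatement sketch (added for illustration, not part of the
// original source) of the backoff ladder in handleRequest54(): allow 1 proxy
// while none are banned, unlock 2, 4, 8, ... once each full group is banned,
// and shrink the allowance as bans accumulate inside a group. Note the
// hand-written chain above uses 512-n and 1024-n in its two largest brackets,
// so this helper only approximates it there; it is a sketch of the intent,
// not a drop-in replacement, and the function name is hypothetical.
static int32_t computeProxyThreshold ( int32_t numBannedProxies ) {
    if ( numBannedProxies <= 0    ) return 1;
    if ( numBannedProxies >= 1023 ) return 1024;
    // smallest power of two strictly greater than numBannedProxies
    int32_t m = 1;
    while ( m <= numBannedProxies ) m <<= 1;
    // a fully banned group of m-1 proxies unlocks the next group of m
    if ( numBannedProxies == m - 1 ) return m;
    // otherwise shrink the allowance as more of the current group is banned
    return m - 1 - numBannedProxies;
}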