// Slightly modified from getTextEntity
short get_iana_charset(char *cs, int len)
{
    if (!s_isInitialized){
	// set up the hash table
	if ( ! s_table.set ( 8,4,4096,NULL,0,false,0,"ianatbl") )
	    return log("build: Could not init table of "
		       "IANA Charsets.");
	// now add in all the charset entries
	long n = (long)sizeof(s_charsets) / (long)sizeof(IANACharset);
	// turn off quickpolling
	char saved = g_conf.m_useQuickpoll;
	g_conf.m_useQuickpoll = false;
	for ( long i = 0 ; i < n ; i++ ) {
	    long long h = hash64Lower_a ( s_charsets[i].name, strlen(s_charsets[i].name) );
	    // store the charset index in the hash table as score
		if ( ! s_table.addTerm(&h, i+1) ) 
		return log("build: add term failed");
	}
	g_conf.m_useQuickpoll = saved;
	s_isInitialized = true;
    }
    long long h = hash64Lower_a ( cs , len );
    // get the entity index from table (stored in the score field)
    long i = (long) s_table.getScore ( &h );
    // return 0 if no match
    if ( i == 0 ) return csUnknown;
    // return the iso character
    return (short)s_charsets[i-1].mib_enum;
}
// Slightly modified from getTextEntity
int16_t get_iana_charset(const char *cs, int len)
{
    if (!s_isInitialized){
	// set up the hash table
	if ( ! s_table.set ( 8,4,4096,NULL,0,false,"ianatbl") ) {
		log(LOG_WARN, "build: Could not init table of IANA Charsets.");
		return csUnknown;
	}
	// now add in all the charset entries
	int32_t n = (int32_t)sizeof(s_charsets) / (int32_t)sizeof(IANACharset);
	// turn off quickpolling
	char saved = g_conf.m_useQuickpoll;
	g_conf.m_useQuickpoll = false;
	for ( int32_t i = 0 ; i < n ; i++ ) {
	    int64_t h = hash64Lower_a ( s_charsets[i].name, strlen(s_charsets[i].name) );
	    // store the charset index in the hash table as score
		if ( ! s_table.addTerm(h, i+1) ) {
			log(LOG_WARN, "build: add term failed");
			return csUnknown;
		}
	}
	g_conf.m_useQuickpoll = saved;
	s_isInitialized = true;
    }
    int64_t h = hash64Lower_a ( cs , len );
    // get the entity index from table (stored in the score field)
    int32_t i = (int32_t) s_table.getScore(h);
    // return 0 if no match
    if ( i == 0 ) return csUnknown;
    // return the iso character
    return (int16_t)s_charsets[i-1].mib_enum;
}
static bool loadSpiderProxyStats() {

	initProxyTables();

	// take this out for now since i was seeing dups in s_iptab for
	// some reason. was causing an infinite loop bug calling goto redo:
	// all the time above.

	// save hashtable
	//s_proxyBannedTable.load(g_hostdb.m_dir,"proxybantable.dat");

	//s_banCountTable.load(g_hostdb.m_dir,"proxybancounttable.dat");

	// save hash table. this also returns false if does not exist.
	//if ( ! s_iptab.load(g_hostdb.m_dir,"spiderproxystats.dat") ) 
	//	return false;

	// unset some flags
	for ( int32_t i = 0 ; i < s_iptab.getNumSlots() ; i++ ) {
		// skip empty slots
		if ( ! s_iptab.m_flags[i] ) continue;
		SpiderProxy *sp = (SpiderProxy *)s_iptab.getValueFromSlot(i);
		sp->m_isWaiting = false;
	}

	return true;
}
Ejemplo n.º 4
0
// . init s_mimeTable in this call
// . called from HttpServer::init
// . returns false and sets g_errno on error
bool HttpMime::init ( ) {
	// only need to call once
	if ( s_init ) return true;
	// make sure only called once
	s_init = true;
	//s_mimeTable.set ( 256 );
	//s_mimeTable.setLabel("mimetbl");
	if ( ! s_mimeTable.set(4,sizeof(char *),256,NULL,0,false,1,"mimetbl"))
		return false;
	// set table from internal list
	for ( uint32_t i = 0 ; i < sizeof(s_ext)/sizeof(char *) ; i+=2 ) {
		int32_t key = hash32n ( s_ext[i] );
		if ( ! s_mimeTable.addKey ( &key , &s_ext[i+1] ) ) 
			return log("HttpMime::init: failed to set table.");
	}
	// quick text
	const char *tt = getContentTypeFromExtension ( "zip" );
	if ( strcmp(tt,"application/zip") != 0 ) {
		g_errno = EBADENGINEER;
		return log("http: Failed to init mime table correctly.");
	}
	// a more thorough test
	for ( uint32_t i = 0 ; i < sizeof(s_ext)/sizeof(char *) ; i+=2) {
		tt = getContentTypeFromExtension ( s_ext[i] );
		if ( strcmp(tt,s_ext[i+1]) == 0 ) continue;
		g_errno = EBADENGINEER;
		return log("http: Failed to do mime table correctly. i=%" PRId32,i);
	}

	// TODO: set it from a user supplied file here
	return true;
}
bool AdultBit::isDirty ( char *s , int32_t len ) {

	static bool       s_isInitialized = false;
	static char      *s_dirty[] = {
		"anal",
		"analsex",
		"b*****b",
		"blowjobs",
		"boob",
		"boobs",
		"clitoris",
		"c**k",
		"cocks",
		"cum",
		"dick",
		"dicks",
		"g******g",
		"gangbangs",
		"gangbanging",
		"movie",
		"movies",
		"oral",
		"oralsex",
		"p**n",
		"porno",
		"pussy",
		"pussies",
		"sex",
		"sexy",
		"tit",
		"t**s",
		"video",
		"videos",
		"xxx",
		"xxxx",
		"xxxx"
	};

	if ( ! s_isInitialized ) {
		// set up the hash table
		if ( ! s_dtable.set ( 8,4,sizeof(s_dirty  )*2,NULL,0,false,0,
				      "adulttab")) 
			return log("build: Error initializing "
				    "dirty word hash table." );
		// now add in all the dirty words
		int32_t n = (int32_t)sizeof(s_dirty)/ sizeof(char *); 
		for ( int32_t i = 0 ; i < n ; i++ ) {
			int64_t h = hash64b ( s_dirty  [i] );
			if ( ! s_dtable.addTerm (&h, i+1) ) return false;
		}
		s_isInitialized = true;
	} 

	// compute the hash of the word "s"
	int64_t h = hash64Lower_a ( s , len );

	// get from table
	return s_dtable.getScore ( &h );
}		
static bool initProxyTables() {
	// initialize proxy/urlip ban table?
	if ( ! s_init ) return true;
	s_init = false;
	s_proxyBannedTable.set(8,0,0,NULL,0,false,"proxban");
	s_banCountTable.set(4,4,0,NULL,0,false,"bancnt");
	return true;
}
// save the stats
bool saveSpiderProxyStats ( ) {

	// save hashtable
	s_proxyBannedTable.save(g_hostdb.m_dir,"proxybantable.dat");

	s_banCountTable.save(g_hostdb.m_dir,"proxybancounttable.dat");

	// save hash table
	return s_iptab.save(g_hostdb.m_dir,"spiderproxystats.dat");
}
//
// . new code for saving hashtablex in a thread
// . so Process.cpp's call to g_spiderCache.save() can save the doleiptable
//   without blocking...
//
static void *saveWrapper ( void *state , class ThreadEntry *t ) {
	// get this class
	HashTableX *THIS = (HashTableX *)state;
	// this returns false and sets g_errno on error
	THIS->save( THIS->m_dir , 
		    THIS->m_filename , 
		    THIS->m_tbuf ,
		    THIS->m_tsize );
	// now exit the thread, bogus return
	return NULL;
}
bool resetProxyStats ( ) {
	// s_proxyBannedTable.reset();
	// s_banCountTable.reset();
	// s_iptab.reset();
	s_iptab.set(8,sizeof(SpiderProxy),0,NULL,0,false,"siptab",true);
	// skip port part of key magic, and get LSB of the IP as key magic
	s_iptab.m_maskKeyOffset = 5;
	s_proxyBannedTable.set(8,0,0,NULL,0,false,"proxban");
	s_banCountTable.set(4,4,0,NULL,0,false,"bancnt");
	return buildProxyTable();
}
SpiderProxy *getSpiderProxyByIpPort ( int32_t ip , uint16_t port ) {
	for ( int32_t i = 0 ; i < s_iptab.getNumSlots() ; i++ ) {
		// skip empty slots
		if ( ! s_iptab.m_flags[i] ) continue;
		SpiderProxy *sp = (SpiderProxy *)s_iptab.getValueFromSlot(i);
		if ( sp->m_ip != ip ) continue;
		if ( sp->m_port != port ) continue;
		return sp;
	}
	return NULL;
}
void gbiconv_reset(){
	for (int32_t i=0;i<s_convTable.getNumSlots();i++){
		//int32_t key = *(int32_t *)s_convTable.getKey(i);
		//if (!key) continue;
		if ( ! s_convTable.m_flags[i] ) continue;
		iconv_t *pconv = (iconv_t *)s_convTable.getValueFromSlot(i);
		if (! pconv) continue;
		iconv_t iconv = *pconv;
		//logf(LOG_DEBUG, "iconv: freeing iconv: 0x%x", (int)iconv);
		g_mem.rmMem((void*)iconv, 52, "iconv");
		libiconv_close(iconv);
	}
	s_convTable.reset();
}
Ejemplo n.º 12
0
static bool initEntityTable(){
	if ( ! s_isInitialized ) {
		// set up the hash table
		if ( ! s_table.set ( 8,4,255,NULL,0,false,0,"enttbl" ) )
			return log("build: Could not init table of "
					   "HTML entities.");
		// now add in all the stop words
		int32_t n = (int32_t)sizeof(s_entities) / (int32_t)sizeof(Entity);
		for ( int32_t i = 0 ; i < n ; i++ ) {
			int64_t h = hash64b ( s_entities[i].entity );

			// grab the unicode code point
			UChar32 up = s_entities[i].unicode;

			// now we are 100% up
			if ( ! up ) { char *xx=NULL;*xx=0; }

			// point to it
			char *buf = (char *)s_entities[i].utf8;

			// if uchar32 not 0 then set the utf8 with it
			int32_t len = utf8Encode(up,buf);

			//
			// make my own mods to make parsing easier
			//

			if ( up == 160 ) {  // nbsp
				buf[0] = ' ';
				len = 1;
			}

			//
			// end custom mods
			//

			// set length
			s_entities[i].utf8Len = len;
			// check it
			if ( len == 0 ) { char *xx=NULL;*xx=0; }
			// must not exist!
			if ( s_table.isInTable(&h) ) { char*xx=NULL;*xx=0;}
			// store the entity index in the hash table as score
			if ( ! s_table.addTerm ( &h, i+1 ) ) return false;
		}
		s_isInitialized = true;
	} 
	return true;
}
iconv_t gbiconv_open( char *tocode, char *fromcode) {
	// get hash for to/from
	uint32_t hash1 = hash32Lower_a(tocode, gbstrlen(tocode), 0);
	uint32_t hash2 = hash32Lower_a(fromcode, gbstrlen(fromcode),0);
	uint32_t hash = hash32h(hash1, hash2);

	g_errno = 0;
	iconv_t *convp = (iconv_t *)s_convTable.getValue(&hash);
	iconv_t conv = NULL;
	if ( convp ) conv = *convp;
	//log(LOG_DEBUG, "uni: convertor %s -> %s from hash 0x%"XINT32": 0x%"XINT32"",
	//    fromcode, tocode,
	//    hash, conv);
	if (!conv){
		//log(LOG_DEBUG, "uni: Allocating new convertor for "
		//    "%s to %s (hash: 0x%"XINT32")",
		//    fromcode, tocode,hash);
		conv = iconv_open(tocode, fromcode);
		if (conv == (iconv_t) -1) {
			log(LOG_WARN, "uni: failed to open converter for "
			    "%s to %s: %s (%d)", fromcode, tocode, 
			    strerror(errno), errno);
			// need to stop if necessary converters don't open
			//char *xx=NULL; *xx = 0;
			g_errno = errno;
			if (errno == EINVAL)
				g_errno = EBADCHARSET;
			
			return conv;
		}
		// add mem to table to keep track
		g_mem.addMem((void*)conv, 52, "iconv", 1);
		// cache convertor
		s_convTable.addKey(&hash, &conv);
		//log(LOG_DEBUG, "uni: Saved convertor 0x%"INT32" under hash 0x%"XINT32"",
		//    conv, hash);
	}
	else{
		// reset convertor
		char *dummy = NULL;
		size_t dummy2 = 0;
		// JAB: warning abatement
		//size_t res = iconv(conv,NULL,NULL,&dummy,&dummy2);
		iconv(conv,NULL,NULL,&dummy,&dummy2);
	}

	return conv;
}
bool saveHashTable ( ) {
	if ( s_ht.m_numSlotsUsed <= 0 ) return true;
	SafeBuf fn;
	fn.safePrintf("%s/qa/",g_hostdb.m_dir);
	log("qa: saving crctable.dat");
	s_ht.save ( fn.getBufStart() , "crctable.dat" );
	return true;
}
bool AdultBit::isObscene ( char *s , int32_t len ) {

	static bool       s_isInitialized = false;
	static char      *s_obscene[] = {
		"c**t",
		"clits",
//		"cum",    magna cum laude
		"cums",
		"cumshot",
		"c**t",
		"cunts",
		"milf",
		"rimjob",
		"felch",
		"f**k",
		"f****d",
		"f****r",
		"f*****g",
		"f***s",
		"w***e",
		"w****s"
	};

	if ( ! s_isInitialized ) {
		// set up the hash table
		if ( ! s_otable.set ( 8,4,sizeof(s_obscene)*2,NULL,0,false,0,
				      "obscenetab") ) 
			return log("build: Error initializing "
				    "obscene word hash table." );
		// now add in all the stop words
		int32_t n = sizeof(s_obscene) / sizeof(char *);
		for ( int32_t i = 0 ; i < n ; i++ ) {
			int64_t h = hash64b ( s_obscene[i] );
			if ( ! s_otable.addTerm ( &h, i+1 ) ) return false;
		}
		s_isInitialized = true;
	} 

	// compute the hash of the word "s"
	int64_t h = hash64Lower_a ( s , len );

	// get from table
	return s_otable.getScore ( &h );
}		
nodeid_t getTagId ( char *s , NodeType **retp ) {

	// init table?
	static bool s_init = false;
	static HashTableX  s_ht;
	static char s_buf[10000];
	if ( ! s_init ) {
		s_init = true;
		s_ht.set ( 4 ,4,1024,s_buf,10000,false,0,"tagids");//niceness=0
		// how many NodeTypes do we have in g_nodes?
		static int32_t nn = sizeof(g_nodes) / sizeof(NodeType);
		// set the hash table
		for ( int32_t i = 0 ; i < nn ; i++ ) {
			char *name = g_nodes[i].m_nodeName;
			int32_t  nlen = gbstrlen(name);
			int64_t h = hash64Upper_a ( name,nlen,0LL );
			NodeType *nt = &g_nodes[i];
			if ( ! s_ht.addKey(&h,&nt) ) { 
				char *xx=NULL;*xx=0; }
		}
		// sanity
		if ( s_ht.m_numSlots != 1024 ) { char *xx=NULL;*xx=0; }
		// sanity test
		nodeid_t tt = getTagId ( "br" );
		if ( tt != TAG_BR ) { char *xx=NULL;*xx=0; }
	}


	// find end of tag name. hyphens are ok to be in name.
	// facebook uses underscores like <start_time>
	char *e = s; for ( ; *e && (is_alnum_a(*e) || *e=='-'|| *e=='_'); e++);
	// hash it for lookup
	int64_t h = hash64Upper_a ( s , e - s , 0 );
	// look it up
	NodeType **ntp = (NodeType **)s_ht.getValue(&h);
	// assume none
	if ( retp ) *retp = NULL;
	// none?
	if ( ! ntp ) return 0;
	// got one
	if ( retp ) *retp = *ntp;
	// get id otherwise
	return (*ntp)->m_nodeId;
}
Ejemplo n.º 17
0
const char *extensionToContentTypeStr2 ( const char *ext , int32_t elen ) {
	// assume text/html if no extension provided
	if ( ! ext || ! ext[0] ) return NULL;
	if ( elen <= 0 ) return NULL;
	// get hash for table look up
	int32_t key = hash32 ( ext , elen );
	char **pp = (char **)s_mimeTable.getValue ( &key );
	if ( ! pp ) return NULL;
	return *pp;
}
Ejemplo n.º 18
0
// . list of types is on: http://www.duke.edu/websrv/file-extensions.html
// . i copied it to the bottom of this file though
const char *HttpMime::getContentTypeFromExtension ( const char *ext ) {
	// assume text/html if no extension provided
	if ( ! ext || ! ext[0] ) return "text/html";
	// get hash for table look up
	int32_t key = hash32n ( ext );
	char **pp = (char **)s_mimeTable.getValue ( &key );
	// if not found in table, assume text/html
	if ( ! pp ) return "text/html";
	return *pp;
}
// call this at startup to register the handlers
bool initSpiderProxyStuff() {
	
	// do this for all hosts in case host #0 goes dead, then everyone
	// will, according to Msg13.cpp, send to host #1, the next in line
	// if she is alive
	//if ( g_hostdb.m_myHostId != 0 ) return true;

	// only host #0 has handlers
	if ( ! g_udpServer.registerHandler ( msg_type_54, handleRequest54 ))
		return false;

	// key is ip/port
	s_iptab.set(8,sizeof(SpiderProxy),0,NULL,0,false,"siptab",true);
	// skip port part of key magic, and get LSB of the IP as key magic
	s_iptab.m_maskKeyOffset = 5;

	loadSpiderProxyStats();

	// build the s_iptab hashtable for the first time
	buildProxyTable ();

	// reset spider proxy stats every hour to alleviate false positives (moved from Process.cpp)
	if (!g_loop.registerSleepCallback(3600000, NULL, resetProxyStatWrapper, 0)) {
		gbshutdownResourceError();
	}

	// make the loadtable hashtable
	static bool s_flag = 0;
	if ( s_flag ) return true;
	s_flag = true;
	return s_loadTable.set(4,
			       sizeof(LoadBucket),
			       128,
			       NULL,
			       0,
			       // this slows us down
			       true, // allow dups?
			       "lbtab",
			       true); // use key magic to mix things up

}
// . use msg 0x55 to say you are done using the proxy
// . we now use the top part of the Msg13Request as the proxy request
void returnProxy ( Msg13Request *preq , UdpSlot *udpSlot ) {

	//char *p = request;
	//int32_t  proxyIp   = *(int32_t  *)p; p += 4;
	//int16_t proxyPort = *(int16_t *)p; p += 2;
	//int32_t  lbId      = *(int32_t  *)p; p += 4;

	int32_t  urlIp     = preq->m_urlIp;

	//
	// update the load bucket
	//

	// scan over all that match to find lbid
	int32_t hslot = s_loadTable.getSlot ( &urlIp );
	// scan all proxies that have this urlip outstanding
	int32_t i;for (i=hslot ; i >= 0 ; i = s_loadTable.getNextSlot(i,&urlIp)){
		// get the bucket
		LoadBucket *lb= (LoadBucket *)s_loadTable.getValueFromSlot(i);
		// is it the right id?
		if ( lb->m_id != preq->m_lbId ) continue;
		if ( lb->m_proxyIp != preq->m_proxyIp ) continue;
		if ( lb->m_proxyPort != preq->m_proxyPort ) continue;
		// that's it. set the download end time
		int64_t nowms = gettimeofdayInMillisecondsLocal();
		lb->m_downloadEndTimeMS = nowms;
		break;
	}

	if ( i < 0 ) 
		log("sproxy: could not find load bucket id #%" PRId32,preq->m_lbId);

	// if no slot provided, return to called without sending back reply,
	// they are banning a proxy and need to also return it before
	// we send them back another proxy to try.
	if ( ! udpSlot ) return;

	// gotta send reply back
	g_udpServer.sendReply(0, 0, 0, 0, udpSlot);
}
// get the id from a 2 character country code
uint8_t getCountryId ( char *cc ) {
	static bool s_init = false;
	static char buf[2000];
	static HashTableX ht;
	char tmp[4];
	if ( ! s_init ) {
		s_init = true;
		// hash them up
		ht.set ( 4 , 1 , -1,buf,2000,false,MAX_NICENESS,"ctryids");
		// now add in all the country codes
		long n = (long) sizeof(s_countryCode) / sizeof(char *); 
		for ( long i = 0 ; i < n ; i++ ) {
			char *s    = (char *)s_countryCode[i];
			//long  slen = gbstrlen ( s );
			// sanity check
			if ( !s[0] || !s[1] || s[2]) { char *xx=NULL;*xx=0; }
			// map it to a 4 byte key
			tmp[0]=s[0];
			tmp[1]=s[1];
			tmp[2]=0;
			tmp[3]=0;
			// a val of 0 does not mean empty in HashTableX,
			// that is an artifact of HashTableT
			uint8_t val = i; // +1;
			// add 1 cuz 0 means lang unknown
			if ( ! ht.addKey ( tmp , &val ) ) {
				char *xx=NULL;*xx=0; }
		}
	}
	// lookup
	tmp[0]=to_lower_a(cc[0]);
	tmp[1]=to_lower_a(cc[1]);
	tmp[2]=0;
	tmp[3]=0;
	long slot = ht.getSlot ( tmp );
	if ( slot < 0 ) return 0;
	void *val = ht.getValueFromSlot ( slot );
	return *(uint8_t *)val ;
}
// . how many keys are dups
// . returns -1 on error
long HashTableX::getNumDups() {
	if ( ! m_allowDups ) return 0;
	HashTableX tmp;
	if ( ! tmp.set ( m_ks, 0, m_numSlots, NULL , 0 , false , m_niceness,
			 "htxtmp") )
		return -1;
	// put into that table
	for ( long i = 0 ; i < m_numSlots ; i++ ) {
		// skip empty bucket
		if ( ! m_flags[i] ) continue;
		// get the key
		char *kp = (char *)getKeyFromSlot(i);
		// add to new table
		if ( ! tmp.addKey ( kp ) ) return -1;
	}
	// the unqieus
	long uniques = tmp.m_numSlotsUsed;
	// the dups
	long dups = m_numSlotsUsed - uniques;
	// that's it
	return dups;
}
// we come here after thread exits
static void threadDoneWrapper ( void *state , class ThreadEntry *t ) {
	// get this class
	HashTableX *THIS = (HashTableX *)state;
	// store save error into g_errno
	//g_errno = THIS->m_saveErrno;
	// log it
	log("db: done saving %s/%s",THIS->m_dir,THIS->m_filename);
	// . resume adding to the hashtable
	// . this will also allow other threads to be queued
	// . if we did this at the end of the thread we could end up with
	//   an overflow of queued SAVETHREADs
	THIS->m_isSaving = false;
	// we do not need to be saved now?
	THIS->m_needsSave = false;
	// g_errno should be preserved from the thread so if threadSave()
	// had an error it will be set
	if ( g_errno )
		log("db: Had error saving hashtable to disk for %s: %s.",
		    THIS->m_allocName,mstrerror(g_errno));
	// . call callback
	if ( THIS->m_callback ) THIS->m_callback ( THIS->m_state );
}
static bool initEntityTable(){
	if ( ! s_isInitialized ) {
		// set up the hash table
		if ( ! s_table.set ( 8,4,4096,NULL,0,false,"enttbl" ) ) {
			log("build: Could not init table of HTML entities.");
			return false;
		}

		// now add in all the html entities
		const int32_t n = (int32_t)sizeof(s_entities) / (int32_t)sizeof(Entity);
		for ( int32_t i = 0 ; i < n ; i++ ) {
			int64_t h = hash64b ( s_entities[i].entity );

			// convert the unicode codepoints to an utf8 string
			char *buf = (char *)s_entities[i].utf8;
			for(int j=0; j<s_entities[i].codepoints; j++) {
				UChar32 codepoint = s_entities[i].codepoint[j];
				int32_t len = utf8Encode(codepoint,buf);
				if ( len == 0 ) { g_process.shutdownAbort(true); }
				
				// make modification to make parsing easier
				if ( codepoint == 160 ) {  // nbsp
					buf[0] = ' ';
					len = 1;
				}
				buf += len;
				
			}
			s_entities[i].utf8Len = (size_t)(buf-s_entities[i].utf8);
			// must not exist!
			if ( s_table.isInTable(&h) ) { g_process.shutdownAbort(true);}
			// store the entity index in the hash table as score
			if ( ! s_table.addTerm(h, i+1) ) return false;
		}
		s_isInitialized = true;
	} 
	return true;
}
// . is "s" an HTML entity? (ascii representative of an iso char)
// . return the 32-bit unicode char it represents
// . returns 0 if none
// . JAB: const-ness for optimizer...
static const Entity *getTextEntity ( const char *s , int32_t len ) {
	if ( !initEntityTable()) return 0;
	// take the ; off, if any
	if ( s[len-1] == ';' ) len--;
	// compute the hash of the entity including &, but not ;
	int64_t h = hash64 ( s , len );
	// get the entity index from table (stored in the score field)
	int32_t i = (int32_t) s_table.getScore(h);
	// return 0 if no match
	if ( i == 0 )
		return NULL;
	// point to the utf8 char. these is 1 or 2 bytes it seems
	return s_entities+i-1;
}
Ejemplo n.º 26
0
// . is "s" an HTML entity? (ascii representative of an iso char)
// . return the 32-bit unicode char it represents
// . returns 0 if none
// . JAB: const-ness for optimizer...
uint32_t getTextEntity ( const char *s , int32_t len ) {
	if ( !initEntityTable()) return 0;
	// take the ; off, if any
	if ( s[len-1] == ';' ) len--;
	// compute the hash of the entity including &, but not ;
	int64_t h = hash64 ( s , len );
	// get the entity index from table (stored in the score field)
	int32_t i = (int32_t) s_table.getScore ( &h );
	// return 0 if no match
	if ( i == 0 ) return 0;
	// point to the utf8 char. these is 1 or 2 bytes it seems
	char *p = (char *)s_entities[i-1].utf8;
	// encode into unicode
	uint32_t c = utf8Decode ( p );
	// return that
	return c;
}
// . sets *current to how many downloads are CURRENTly outstanding for 
//   this proxy
// . returns total # of load points, i.e. downloads, in the last 
//   LOADPOINT_EXPIRE_MS milliseconds (currently 10 minutes)
static int32_t getNumLoadPoints(SpiderProxy *sp, int32_t *current) {
	// currently outstanding downloads on proxy
	*current = 0;
	int32_t count = 0;
	// scan all proxies that have this urlip outstanding
	for ( int32_t i = 0 ; i < s_loadTable.m_numSlots ; i++ ) {
		// skip if empty
		if ( ! s_loadTable.m_flags[i] ) continue;
		// get the bucket
		LoadBucket *lb= (LoadBucket *)s_loadTable.getValueFromSlot(i);
		// get the spider proxy this load point was for
		if ( lb->m_proxyIp != sp->m_ip ) continue;
		if ( lb->m_proxyPort != sp->m_port ) continue;

		// currently outstanding downloads on proxy
		if (  lb->m_downloadEndTimeMS == 0LL ) 
			*current = *current + 1;

		count++;
	}
	return count;
}
// . is "s" an HTML entity? (ascii representative of an iso char)
// . return the 32-bit unicode char it represents
// . returns 0 if none
// . JAB: const-ness for optimizer...
uint32_t getTextEntity ( char *s , int32_t len ) {
	if ( !initEntityTable()) return 0;
	// take the ; off, if any
	if ( s[len-1] == ';' ) len--;
	// compute the hash of the entity including &, but not ;
	int64_t h = hash64 ( s , len );
	// get the entity index from table (stored in the score field)
	int32_t i = (int32_t) s_table.getScore ( &h );
	// return 0 if no match
	if ( i == 0 ) return 0;
	// point to the utf8 char. these is 1 or 2 bytes it seems
	char *p = (char *)s_entities[i-1].utf8;
	// encode into unicode
	uint32_t c = utf8Decode ( p );
	// return that
	return c;
	// return the iso character
	//printf("Converted text entity \"");
	//for(int si=0;si<len;si++)putchar(s[si]);
	//printf("\" to 0x%x(%d)\"%c\"\n",s_entities[i-1].c,s_entities[i-1].c, 
	//	   s_entities[i-1].c);
	//return (uint32_t)s_entities[i-1].c;
}
// . when the Conf::m_proxyIps parm is updated we call this to rebuild
//   s_iptab, our table of SpiderProxy instances, which has the proxies and 
//   their performance statistics.
// . we try to maintain stats of ip/ports that did NOT change when rebuilding.
bool buildProxyTable ( ) {

	// scan the NEW list of proxy ip/port pairs in g_conf
	char *p = g_conf.m_proxyIps.getBufStart();

	HashTableX tmptab;
	tmptab.set(8,0,16,NULL,0,false,"tmptab");

	// scan the user inputted space-separated list of ip:ports
	// (optional username:password@ip:port)
	for ( ; *p ; ) {
		// skip white space
		if ( is_wspace_a(*p) ) { p++; continue; }

		// skip http://
		if ( strncasecmp(p,"http://",7) == 0 ) { p += 7; continue; }

		// scan in an ip:port
		char *s = p; char *portStr = NULL;
		int32_t dc = 0, pc = 0, gc = 0, bc = 0;
		const char *msg;

		char *usernamePwd = NULL;
		int32_t usernamePwdLen = 0;
		char *ipStart = p;

		// scan all characters until we hit \0 or another whitespace
		for ( ; *s && !is_wspace_a(*s); s++) {

			if ( *s == '@' ) {
				// must be username:pwd
				if ( pc != 1 ) {
					msg = "bad username:password";
					goto hadError;
				}
				usernamePwd = p;
				usernamePwdLen = s - p;
				if ( usernamePwdLen >= MAXUSERNAMEPWD-2 ) {
					msg = "username:password too long";
					goto hadError;
				}
				dc = 0;
				gc = 0;
				bc = 0;
				pc = 0;
				portStr = NULL;
				ipStart = s+1;
				continue;
			}

			if ( *s == '.' ) { dc++; continue; }
			if ( *s == ':' ) { portStr=s; pc++; continue; }
			if ( is_digit(*s) ) { gc++; continue; }
			bc++;
			continue;
		}
		// ensure it is a legit ip:port combo
		msg = NULL;
		if ( gc < 4 ) 
			msg = "not enough digits for an ip";
		if ( pc > 1 )
			msg = "too many colons";
		if ( dc != 3 )
			msg = "need 3 dots for an ip address";
		if ( bc )
			msg = "got illegal char in ip:port listing";
		if ( msg ) {
		hadError:
			char c = *s;
			*s = '\0';
			log("buf: %s for %s",msg,p);
			*s = c;
			return false;
		}

		// convert it
		int32_t iplen = s - ipStart;
		if ( portStr ) iplen = portStr - ipStart;
		int32_t ip = atoip(ipStart,iplen);
		// another sanity check
		if ( ip == 0 || ip == -1 ) {
			log("spider: got bad proxy ip for %s",p);
			return false;
		}

		// and the port default is 80
		int32_t port = 80;
		if ( portStr ) port = atol2(portStr+1,s-portStr-1);
		if ( port < 0 || port > 65535 ) {
			log("spider: got bad proxy port for %s",p);
			return false;
		}


		// . we got a legit ip:port
		// . see if already in our table
		uint64_t ipKey = (uint32_t)ip;
		ipKey <<= 16;
		ipKey |= (uint16_t)(port & 0xffff);

		// also store into tmptable to see what we need to remove
		tmptab.addKey(&ipKey);

		// see if in table
		int32_t islot = s_iptab.getSlot( &ipKey);

		// advance p
		p = s;

		// if in there, keep it as is
		if ( islot >= 0 ) continue;

		// otherwise add new entry
		SpiderProxy newThing;
		memset ( &newThing , 0 , sizeof(SpiderProxy));
		newThing.m_ip = ip;
		newThing.m_port = port;
		newThing.m_lastDownloadTookMS = -1;
		newThing.m_lastSuccessfulTestMS = -1;

		gbmemcpy(newThing.m_usernamePwd,usernamePwd,usernamePwdLen);
		// ensure it is NULL terminated
		newThing.m_usernamePwd[usernamePwdLen] = '\0';

		if ( ! s_iptab.addKey ( &ipKey, &newThing ) )
			return false;
	}		

 redo:
	int32_t removed = 0;
	// scan all SpiderProxies in tmptab
	for ( int32_t i = 0 ; i < s_iptab.getNumSlots() ; i++ ) {
		// skip empty buckets in hashtable s_iptab
		if ( ! s_iptab.m_flags[i] ) continue;
		// get the key
		int64_t key = *(int64_t *)s_iptab.getKeyFromSlot(i);
		// must also exist in tmptab, otherwise it got removed by user
		if ( tmptab.isInTable ( &key ) ) continue;
		// skip if not in table
		if ( s_iptab.getSlot ( &key ) < 0 ) {
			log("sproxy: iptable hashing messed up");
			continue;
		}
		// shoot, it got removed. not in the new list of ip:ports
		s_iptab.removeKey ( &key );
		removed++;
		// hashtable is messed up now, start over
		//goto redo;
	}
	if ( removed ) goto redo;
	return true;
}
// a host is asking us (host #0) what proxy to use?
static void handleRequest54(UdpSlot *udpSlot, int32_t niceness) {

	char *request     = udpSlot->m_readBuf;
	int32_t  requestSize = udpSlot->m_readBufSize;

	// we now use the top part of the Msg13Request as the ProxyRequest
	Msg13Request *preq = (Msg13Request *)request;

	// sanity check
	if ( requestSize != preq->getProxyRequestSize() ) {
		log("db: Got bad request 0x54 size of %" PRId32" bytes. bad",
		    requestSize );
		g_udpServer.sendErrorReply ( udpSlot , EBADREQUESTSIZE );
		return;
	}

	// is the request telling us it is done downloading through a proxy?
	if ( preq->m_opCode == OP_RETPROXY ) {
		returnProxy ( preq , udpSlot );
		return;
	}

	// if sender is asking for a new proxy and wants us to ban
	// the previous proxy we sent for this urlIp...
	if ( preq->m_banProxyIp ) {
		// don't core if misses sanity. it seems we don't always
		// NULLify these or something.
		// these must match
		if(preq->m_banProxyIp   != preq->m_proxyIp  ||
		   preq->m_banProxyPort != preq->m_proxyPort){
			log("db: proxy: banproxyip != proxyip. mismatch!");
			g_udpServer.sendErrorReply ( udpSlot , EBADENGINEER);
			return;
		}
		// this will "return" the banned proxy
		returnProxy ( preq , NULL );
		// now add it to the banned table
		int64_t uip = preq->m_urlIp;
		int64_t pip = preq->m_banProxyIp;
		int64_t h64 = hash64h ( uip , pip );
		if ( ! s_proxyBannedTable.isInTable ( &h64 ) ) {
			s_proxyBannedTable.addKey ( &h64 );
			// for stats counting. each proxy ip maps to #
			// of unique website IPs that have banned it.
			s_banCountTable.addTerm32((uint32_t)pip);
		}
	}
	

	// shortcut
	int32_t urlIp = preq->m_urlIp;

	// send to a proxy that is up and has the least amount
	// of LoadBuckets with this urlIp, if tied, go to least loaded.

	// clear counts for this url ip for scoring the best proxy to use
	for ( int32_t i = 0 ; i < s_iptab.getNumSlots() ; i++ ) {
		// skip empty slots
		if ( ! s_iptab.m_flags[i] ) continue;
		SpiderProxy *sp = (SpiderProxy *)s_iptab.getValueFromSlot(i);
		sp->m_countForThisIp = 0;
		sp->m_lastTimeUsedForThisIp = 0LL;
	}

	// this table maps a url's current IP to a possibly MULTIPLE slots
	// which tell us what proxy is downloading a page from that IP.
	// so we can try to find a proxy that is not download a url from
	// this IP currently, or hasn't been for the longest time...
	int32_t hslot = s_loadTable.getSlot ( &urlIp );
	// scan all proxies that have this urlip outstanding
	for ( int32_t i = hslot ; i >= 0 ; i = s_loadTable.getNextSlot(i,&urlIp)){
		// get the bucket
		LoadBucket *lb;
		lb = (LoadBucket *)s_loadTable.getValueFromSlot(i);
		// get the spider proxy this load point was for
		uint64_t key = (uint32_t)lb->m_proxyIp;
		key <<= 16;
		key |= (uint16_t)lb->m_proxyPort;
		SpiderProxy *sp = (SpiderProxy *)s_iptab.getValue(&key);
		// must be there unless user remove it from the list
		if ( ! sp ) continue;
		// count it up
		if (  lb->m_downloadEndTimeMS == 0LL ) 
			sp->m_countForThisIp++;
		// set the last time used to the most recently downloaded time
		// that this proxy has downloaded from this ip
		if ( lb->m_downloadEndTimeMS &&
		     lb->m_downloadEndTimeMS > sp->m_lastTimeUsedForThisIp )
			sp->m_lastTimeUsedForThisIp = lb->m_downloadEndTimeMS;
	}

	// first try to get a spider proxy that is not "dead"
	bool skipDead = true;

	int32_t numBannedProxies = 0;
	int32_t aliveProxyCandidates = 0;

 redo:
	// get the min of the counts
	int32_t minCount = 999999;
	for ( int32_t i = 0 ; i < s_iptab.getNumSlots() ; i++ ) {
		// skip empty slots
		if ( ! s_iptab.m_flags[i] ) continue;
		// get the spider proxy
		SpiderProxy *sp = (SpiderProxy *)s_iptab.getValueFromSlot(i);

		// if this proxy was banned by the url's ip... skip it. it is
		// not a candidate...
		if ( skipDead ) {
			int64_t uip = preq->m_urlIp;
			int64_t pip = sp->m_ip;
			int64_t h64 = hash64h ( uip , pip );
			if ( s_proxyBannedTable.isInTable ( &h64 ) ) {
				numBannedProxies++;
				continue;
			}
		}

		// if it failed the last test, skip it
		if ( skipDead && sp->m_lastDownloadError ) continue;

		if ( skipDead ) aliveProxyCandidates++;

		if ( sp->m_countForThisIp >= minCount ) continue;
		minCount = sp->m_countForThisIp;
	}

	// all dead? then get the best dead one
	if ( minCount == 999999 ) {
		skipDead = false;
		goto redo;
	}

	// . we only use one proxy if none are banned by this IP
	// . when that gets banned, we will use the next 2 proxies with
	//   a higher backoff/crawlDelay, etc.
	int32_t threshHold;
	if      ( numBannedProxies <= 0  ) threshHold = 1;

	// if first proxy gets banned, try next 2 proxies until both get ban'd
	else if ( numBannedProxies == 1  ) threshHold = 2;
	else if ( numBannedProxies <  1+2) threshHold = 3 - numBannedProxies;

	// if next two proxies got banned, try next 4 proxies until banned
	else if ( numBannedProxies == 3  ) threshHold = 4;
	else if ( numBannedProxies <  3+4) threshHold = 7 - numBannedProxies;

	// if next 4 proxies got banned, try next 8 proxies until they get band
	else if ( numBannedProxies == 7  ) threshHold = 8;
	else if ( numBannedProxies <  7+8) threshHold = 15 - numBannedProxies;

	else if ( numBannedProxies == 15) threshHold = 16;
	else if ( numBannedProxies <  15+16 ) threshHold = 31-numBannedProxies;

	else if ( numBannedProxies == 31 ) threshHold = 32;
	else if ( numBannedProxies <  31+32)threshHold=63-numBannedProxies;

	else if ( numBannedProxies == 63 ) threshHold = 64;
	else if ( numBannedProxies <  63+64)threshHold=127-numBannedProxies;

	else if ( numBannedProxies == 127 ) threshHold = 128;
	else if ( numBannedProxies <  127+128)threshHold=255-numBannedProxies;

	else if ( numBannedProxies == 255 ) threshHold = 256;
	else if ( numBannedProxies <  255+256)threshHold=512-numBannedProxies;

	else if ( numBannedProxies == 511 ) threshHold = 512;
	else if ( numBannedProxies <  511+512)threshHold=1024-numBannedProxies;

	else threshHold = 1024;
	
	
	if ( threshHold <= 0 ) {
		log("proxy: spiderproxy error in threshold of %" PRId32" "
		    "for banned=%" PRId32,threshHold,numBannedProxies);
		threshHold = 1;
	}

	// reset minCount so we can take the min over those we check here
	minCount = -1;
	int64_t oldest = 0x7fffffffffffffffLL;
	SpiderProxy *winnersp = NULL;
	int32_t count = 0;
	// start at a random slot based on url's IP so we don't
	// overload the first proxy
	int32_t start = ((uint32_t)urlIp) % s_iptab.getNumSlots();
	int32_t slotCount = s_iptab.getNumSlots();
	// . now find the best proxy wih the minCount
	for ( int32_t i = start ; ; i++ ) {
		// scan all slots in hash table, then stop
		if ( slotCount-- <= 0 ) break;
		// wrap around to zero if we hit the end
		if ( i == s_iptab.getNumSlots() ) i = 0;
		// skip empty slots
		if ( ! s_iptab.m_flags[i] ) continue;
		// get the spider proxy
		SpiderProxy *sp = (SpiderProxy *)s_iptab.getValueFromSlot(i);
		// if it failed the last test, skip it... not here...
		if ( skipDead && sp->m_lastDownloadError ) continue;

		// if this proxy was banned by the url's ip... skip it. it is
		// not a candidate...
		if ( skipDead ) {
			int64_t uip = preq->m_urlIp;
			int64_t pip = sp->m_ip;
			int64_t h64 = hash64h ( uip , pip );
			if ( s_proxyBannedTable.isInTable ( &h64 ) ) continue;
		}

		// if some proxies are "alive" then only pick from
		// the first half of the proxies that are alive (i.e. still
		// work). that way, when one of those goes dead we will inc
		// the backoff (crawldelay) and a new proxy that we haven't
		// used for this url's IP will take it's place. and such
		// new proxies will only have the new backoff count used
		// through them. that way, we don't get ALL of our proxies
		// banned at about the same time since we do somewhat uniform
		// load balancing over them.
		if ( skipDead && count >= threshHold)//aliveProxyCandidates/2 )
			continue;

		// count the alive/non-banned candidates
		count++;

		// if all hosts were "dead" because they all had 
		// m_lastDownloadError set then minCount will be 999999
		// and nobody should continue from this statement:
		if ( sp->m_countForThisIp > minCount && minCount>=0 ) continue;
		// then go by last download time for this ip
		if ( sp->m_countForThisIp == minCount && minCount>=0 &&
		     sp->m_lastTimeUsedForThisIp >= oldest ) 
			continue;

		// pick the spider proxy used longest ago
		oldest   = sp->m_lastTimeUsedForThisIp;
		minCount = sp->m_countForThisIp;
		// got a new winner
		winnersp = sp;
	}

	// we must have a winner
	if ( ! winnersp ) { g_process.shutdownAbort(true); }

	int64_t nowms = gettimeofdayInMillisecondsLocal();

	// add a new load bucket then!
	LoadBucket bb;
	bb.m_urlIp = urlIp;
	// the time it started
	bb.m_downloadStartTimeMS = nowms;
	// download has not ended yet
	bb.m_downloadEndTimeMS = 0LL;
	// the host using the proxy
	bb.m_hostId = udpSlot->getHostId();
	// key is this for m_prTable
	bb.m_proxyIp   = winnersp->m_ip;
	bb.m_proxyPort = winnersp->m_port;
	// a new id. we use this to update the downloadEndTime when done
	static int32_t s_lbid = 0;
	// add it now
	bb.m_id = s_lbid++;
	s_loadTable.addKey ( &urlIp , &bb );
	// winner count update
	winnersp->m_timesUsed++;

	// sanity
	if ( (int32_t)sizeof(ProxyReply) > TMPBUFSIZE ){g_process.shutdownAbort(true);}

	// and give proxy ip/port back to the requester so they can
	// use that to download their url
	ProxyReply *prep = (ProxyReply *)udpSlot->m_tmpBuf;
	prep->m_proxyIp = winnersp->m_ip;
	prep->m_proxyPort = winnersp->m_port;

	// this is just '\0' if none
	strcpy(prep->m_usernamePwd,winnersp->m_usernamePwd);

	// do not count the proxy we are returning as "more"
	prep->m_hasMoreProxiesToTry = ( aliveProxyCandidates > 1 );
	// and the loadbucket id, so requester can tell us it is done
	// downloading through the proxy and we can update the LoadBucket
	// for this transaction (m_lbId)
	prep->m_lbId = bb.m_id;
	// requester wants to know how many proxies have been banned by the
	// urlIp so it can increase a self-imposed crawl-delay to be more
	// sensitive to the spider policy.
	prep->m_numBannedProxies = numBannedProxies;

	//char *p = udpSlot->m_tmpBuf;
	//*(int32_t  *)p = winnersp->m_ip  ; p += 4;
	//*(int16_t *)p = winnersp->m_port; p += 2;
	// and the loadbucket id
	//*(int32_t *)p = bb.m_id; p += 4;

	// with dup keys we end up with long chains of crap and this
	// takes forever. so just flush the whole thing every 2 minutes AND
	// when 20000+ entries are in there
	static time_t s_lastTime = 0;
	time_t now = nowms / 1000;
	if ( s_lastTime == 0 ) s_lastTime = now;
	time_t elapsed = now - s_lastTime;
	if ( elapsed > 120 && s_loadTable.getNumSlots() > 10000 ) {
		log("sproxy: flushing %i entries from proxy loadtable that "
		    "have accumulated since %i seconds ago",
		    (int)s_loadTable.m_numSlotsUsed,(int)elapsed);
		s_loadTable.clear();
		// only do this one per minute
		s_lastTime = now;
	}


	int32_t sanityCount = 0;//s_loadTable.getNumSlots();
	// top:
	// now remove old entries from the load table. entries that
	// have completed and have a download end time more than 10 mins ago.
	for ( int32_t i = s_loadTable.getNumSlots() - 1 ; i >= 0 ; i-- ) {
		// skip if empty
		if ( ! s_loadTable.m_flags[i] ) continue;
		// get the bucket
		LoadBucket *pp =(LoadBucket *)s_loadTable.getValueFromSlot(i);
		// skip if still active
		if ( pp->m_downloadEndTimeMS == 0LL ) continue;
		// delta t
		int64_t took = nowms - pp->m_downloadEndTimeMS;
		// < 10 mins? now it's < 15 seconds to prevent clogging.
		if ( took < LOADPOINT_EXPIRE_MS ) continue;

		// 100 at a time so we don't slam cpu
		if ( sanityCount++ > 100 ) break;

		// ok, its too old, nuke it to save memory
		s_loadTable.removeSlot(i);
		// the keys might have buried us but we really should not
		// mis out on analyzing any keys if we just keep looping here
		// should we? TODO: figure it out. if we miss a few it's not
		// a big deal.
		//i--;
		//goto top;
	}

	// send the proxy ip/port/LBid back to user
	g_udpServer.sendReply(udpSlot->m_tmpBuf, sizeof(ProxyReply), udpSlot->m_tmpBuf, sizeof(ProxyReply), udpSlot);
}