Example #1
0
static bool initEntityTable(){
	if ( ! s_isInitialized ) {
		// set up the hash table
		if ( ! s_table.set ( 8,4,255,NULL,0,false,0,"enttbl" ) )
			return log("build: Could not init table of "
					   "HTML entities.");
		// now add in all the HTML entities
		int32_t n = (int32_t)sizeof(s_entities) / (int32_t)sizeof(Entity);
		for ( int32_t i = 0 ; i < n ; i++ ) {
			int64_t h = hash64b ( s_entities[i].entity );

			// grab the unicode code point
			UChar32 up = s_entities[i].unicode;

			// every entity must have a non-zero code point
			if ( ! up ) { char *xx=NULL;*xx=0; }

			// point to it
			char *buf = (char *)s_entities[i].utf8;

			// encode the code point into the utf8 buffer
			int32_t len = utf8Encode(up,buf);

			//
			// make my own mods to make parsing easier
			//

			if ( up == 160 ) {  // nbsp
				buf[0] = ' ';
				len = 1;
			}

			//
			// end custom mods
			//

			// set length
			s_entities[i].utf8Len = len;
			// check it
			if ( len == 0 ) { char *xx=NULL;*xx=0; }
			// must not exist!
			if ( s_table.isInTable(&h) ) { char*xx=NULL;*xx=0;}
			// store the entity index in the hash table as score
			if ( ! s_table.addTerm ( &h, i+1 ) ) return false;
		}
		s_isInitialized = true;
	} 
	return true;
}
static bool initEntityTable(){
	if ( ! s_isInitialized ) {
		// set up the hash table
		if ( ! s_table.set ( 8,4,4096,NULL,0,false,"enttbl" ) ) {
			log("build: Could not init table of HTML entities.");
			return false;
		}

		// now add in all the html entities
		const int32_t n = (int32_t)sizeof(s_entities) / (int32_t)sizeof(Entity);
		for ( int32_t i = 0 ; i < n ; i++ ) {
			int64_t h = hash64b ( s_entities[i].entity );

			// convert the unicode codepoints to a utf8 string
			char *buf = (char *)s_entities[i].utf8;
			for(int j=0; j<s_entities[i].codepoints; j++) {
				UChar32 codepoint = s_entities[i].codepoint[j];
				int32_t len = utf8Encode(codepoint,buf);
				if ( len == 0 ) { g_process.shutdownAbort(true); }
				
				// make modification to make parsing easier
				if ( codepoint == 160 ) {  // nbsp
					buf[0] = ' ';
					len = 1;
				}
				buf += len;
				
			}
			s_entities[i].utf8Len = (size_t)(buf-s_entities[i].utf8);
			// must not exist!
			if ( s_table.isInTable(&h) ) { g_process.shutdownAbort(true);}
			// store the entity index in the hash table as score
			if ( ! s_table.addTerm(h, i+1) ) return false;
		}
		s_isInitialized = true;
	} 
	return true;
}
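For illustration, here is a self-contained sketch of the same build-once entity-table pattern using the standard library instead of HashTableX (the names EntityDef, kEntities and entityTable are hypothetical, not the codebase's API). It maps entity names to their UTF-8 expansions and applies the same nbsp-to-space parsing tweak:

#include <string>
#include <unordered_map>

// hypothetical stand-in for the s_entities[] rows above
struct EntityDef { const char *entity; const char *utf8; };

static const EntityDef kEntities[] = {
	{ "&amp;",  "&"        },
	{ "&nbsp;", "\xc2\xa0" }, // U+00A0; remapped to ' ' below
};

// build-once lookup table, mirroring initEntityTable()
static const std::unordered_map<std::string, std::string> &entityTable() {
	static const std::unordered_map<std::string, std::string> table = [] {
		std::unordered_map<std::string, std::string> t;
		for ( const EntityDef &e : kEntities ) {
			std::string utf8 = e.utf8;
			// the same parsing tweak: treat nbsp as a plain space
			if ( utf8 == "\xc2\xa0" ) utf8 = " ";
			t.emplace ( e.entity , utf8 ); // must not already exist
		}
		return t;
	}();
	return table;
}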
// . when the Conf::m_proxyIps parm is updated we call this to rebuild
//   s_iptab, our table of SpiderProxy instances, which has the proxies and 
//   their performance statistics.
// . we try to maintain stats of ip/ports that did NOT change when rebuilding.
bool buildProxyTable ( ) {

	// scan the NEW list of proxy ip/port pairs in g_conf
	char *p = g_conf.m_proxyIps.getBufStart();

	HashTableX tmptab;
	tmptab.set(8,0,16,NULL,0,false,"tmptab");

	// scan the user-supplied space-separated list of ip:ports
	// (optional username:password@ip:port)
	for ( ; *p ; ) {
		// skip white space
		if ( is_wspace_a(*p) ) { p++; continue; }

		// skip http://
		if ( strncasecmp(p,"http://",7) == 0 ) { p += 7; continue; }

		// scan in an ip:port
		char *s = p; char *portStr = NULL;
		int32_t dc = 0, pc = 0, gc = 0, bc = 0;
		const char *msg;

		char *usernamePwd = NULL;
		int32_t usernamePwdLen = 0;
		char *ipStart = p;

		// scan all characters until we hit \0 or another whitespace
		for ( ; *s && !is_wspace_a(*s); s++) {

			if ( *s == '@' ) {
				// must be username:pwd
				if ( pc != 1 ) {
					msg = "bad username:password";
					goto hadError;
				}
				usernamePwd = p;
				usernamePwdLen = s - p;
				if ( usernamePwdLen >= MAXUSERNAMEPWD-2 ) {
					msg = "username:password too long";
					goto hadError;
				}
				dc = 0;
				gc = 0;
				bc = 0;
				pc = 0;
				portStr = NULL;
				ipStart = s+1;
				continue;
			}

			if ( *s == '.' ) { dc++; continue; }
			if ( *s == ':' ) { portStr=s; pc++; continue; }
			if ( is_digit(*s) ) { gc++; continue; }
			bc++;
			continue;
		}
		// ensure it is a legit ip:port combo
		msg = NULL;
		if ( gc < 4 ) 
			msg = "not enough digits for an ip";
		if ( pc > 1 )
			msg = "too many colons";
		if ( dc != 3 )
			msg = "need 3 dots for an ip address";
		if ( bc )
			msg = "got illegal char in ip:port listing";
		if ( msg ) {
		hadError:
			char c = *s;
			*s = '\0';
			log("buf: %s for %s",msg,p);
			*s = c;
			return false;
		}

		// convert it
		int32_t iplen = s - ipStart;
		if ( portStr ) iplen = portStr - ipStart;
		int32_t ip = atoip(ipStart,iplen);
		// another sanity check
		if ( ip == 0 || ip == -1 ) {
			log("spider: got bad proxy ip for %s",p);
			return false;
		}

		// and the port default is 80
		int32_t port = 80;
		if ( portStr ) port = atol2(portStr+1,s-portStr-1);
		if ( port < 0 || port > 65535 ) {
			log("spider: got bad proxy port for %s",p);
			return false;
		}


		// . we got a legit ip:port
		// . see if already in our table
		uint64_t ipKey = (uint32_t)ip;
		ipKey <<= 16;
		ipKey |= (uint16_t)(port & 0xffff);

		// also store into tmptable to see what we need to remove
		tmptab.addKey(&ipKey);

		// see if in table
		int32_t islot = s_iptab.getSlot( &ipKey);

		// advance p
		p = s;

		// if in there, keep it as is
		if ( islot >= 0 ) continue;

		// otherwise add new entry
		SpiderProxy newThing;
		memset ( &newThing , 0 , sizeof(SpiderProxy));
		newThing.m_ip = ip;
		newThing.m_port = port;
		newThing.m_lastDownloadTookMS = -1;
		newThing.m_lastSuccessfulTestMS = -1;

		gbmemcpy(newThing.m_usernamePwd,usernamePwd,usernamePwdLen);
		// ensure it is NULL terminated
		newThing.m_usernamePwd[usernamePwdLen] = '\0';

		if ( ! s_iptab.addKey ( &ipKey, &newThing ) )
			return false;
	}		

 redo:
	int32_t removed = 0;
	// scan all SpiderProxies in tmptab
	for ( int32_t i = 0 ; i < s_iptab.getNumSlots() ; i++ ) {
		// skip empty buckets in hashtable s_iptab
		if ( ! s_iptab.m_flags[i] ) continue;
		// get the key
		int64_t key = *(int64_t *)s_iptab.getKeyFromSlot(i);
		// must also exist in tmptab, otherwise it got removed by user
		if ( tmptab.isInTable ( &key ) ) continue;
		// skip if not in table
		if ( s_iptab.getSlot ( &key ) < 0 ) {
			log("sproxy: iptable hashing messed up");
			continue;
		}
		// shoot, it got removed. not in the new list of ip:ports
		s_iptab.removeKey ( &key );
		removed++;
		// hashtable is messed up now, start over
		//goto redo;
	}
	if ( removed ) goto redo;
	return true;
}
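The s_iptab key above packs the 32-bit proxy IP and the 16-bit port into a single 48-bit value. A minimal standalone sketch of that packing (hypothetical helper names):

#include <cstdint>

// pack ip:port into the 48-bit key used for s_iptab lookups
static uint64_t makeProxyKey ( int32_t ip , int32_t port ) {
	uint64_t key = (uint32_t)ip;
	key <<= 16;
	key |= (uint16_t)(port & 0xffff);
	return key;
}

// recover the ip and port from a key
static void splitProxyKey ( uint64_t key , int32_t *ip , int32_t *port ) {
	*port = (int32_t)(key & 0xffff);
	*ip   = (int32_t)(uint32_t)(key >> 16);
}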
// a host is asking us (host #0) what proxy to use?
static void handleRequest54(UdpSlot *udpSlot, int32_t niceness) {

	char *request     = udpSlot->m_readBuf;
	int32_t  requestSize = udpSlot->m_readBufSize;

	// we now use the top part of the Msg13Request as the ProxyRequest
	Msg13Request *preq = (Msg13Request *)request;

	// sanity check
	if ( requestSize != preq->getProxyRequestSize() ) {
		log("db: Got bad request 0x54 size of %" PRId32" bytes. bad",
		    requestSize );
		g_udpServer.sendErrorReply ( udpSlot , EBADREQUESTSIZE );
		return;
	}

	// is the request telling us it is done downloading through a proxy?
	if ( preq->m_opCode == OP_RETPROXY ) {
		returnProxy ( preq , udpSlot );
		return;
	}

	// if sender is asking for a new proxy and wants us to ban
	// the previous proxy we sent for this urlIp...
	if ( preq->m_banProxyIp ) {
		// don't core if it fails the sanity check. it seems we
		// don't always NULLify these or something.
		// these must match
		if(preq->m_banProxyIp   != preq->m_proxyIp  ||
		   preq->m_banProxyPort != preq->m_proxyPort){
			log("db: proxy: banproxyip != proxyip. mismatch!");
			g_udpServer.sendErrorReply ( udpSlot , EBADENGINEER);
			return;
		}
		// this will "return" the banned proxy
		returnProxy ( preq , NULL );
		// now add it to the banned table
		int64_t uip = preq->m_urlIp;
		int64_t pip = preq->m_banProxyIp;
		int64_t h64 = hash64h ( uip , pip );
		if ( ! s_proxyBannedTable.isInTable ( &h64 ) ) {
			s_proxyBannedTable.addKey ( &h64 );
			// for stats counting. each proxy ip maps to #
			// of unique website IPs that have banned it.
			s_banCountTable.addTerm32((uint32_t)pip);
		}
	}
	

	// shortcut
	int32_t urlIp = preq->m_urlIp;

	// send to a proxy that is up and has the least amount
	// of LoadBuckets with this urlIp, if tied, go to least loaded.

	// clear counts for this url ip for scoring the best proxy to use
	for ( int32_t i = 0 ; i < s_iptab.getNumSlots() ; i++ ) {
		// skip empty slots
		if ( ! s_iptab.m_flags[i] ) continue;
		SpiderProxy *sp = (SpiderProxy *)s_iptab.getValueFromSlot(i);
		sp->m_countForThisIp = 0;
		sp->m_lastTimeUsedForThisIp = 0LL;
	}

	// this table maps a url's current IP to possibly MULTIPLE slots,
	// each of which tells us what proxy is downloading a page from
	// that IP, so we can try to find a proxy that is not downloading
	// a url from this IP currently, or hasn't been for the longest time...
	int32_t hslot = s_loadTable.getSlot ( &urlIp );
	// scan all proxies that have this urlip outstanding
	for ( int32_t i = hslot ; i >= 0 ; i = s_loadTable.getNextSlot(i,&urlIp)){
		// get the bucket
		LoadBucket *lb;
		lb = (LoadBucket *)s_loadTable.getValueFromSlot(i);
		// get the spider proxy this load point was for
		uint64_t key = (uint32_t)lb->m_proxyIp;
		key <<= 16;
		key |= (uint16_t)lb->m_proxyPort;
		SpiderProxy *sp = (SpiderProxy *)s_iptab.getValue(&key);
		// must be there unless the user removed it from the list
		if ( ! sp ) continue;
		// count it up
		if (  lb->m_downloadEndTimeMS == 0LL ) 
			sp->m_countForThisIp++;
		// set the last time used to the most recently downloaded time
		// that this proxy has downloaded from this ip
		if ( lb->m_downloadEndTimeMS &&
		     lb->m_downloadEndTimeMS > sp->m_lastTimeUsedForThisIp )
			sp->m_lastTimeUsedForThisIp = lb->m_downloadEndTimeMS;
	}

	// first try to get a spider proxy that is not "dead"
	bool skipDead = true;

	int32_t numBannedProxies = 0;
	int32_t aliveProxyCandidates = 0;

 redo:
	// get the min of the counts
	int32_t minCount = 999999;
	for ( int32_t i = 0 ; i < s_iptab.getNumSlots() ; i++ ) {
		// skip empty slots
		if ( ! s_iptab.m_flags[i] ) continue;
		// get the spider proxy
		SpiderProxy *sp = (SpiderProxy *)s_iptab.getValueFromSlot(i);

		// if this proxy was banned by the url's ip... skip it. it is
		// not a candidate...
		if ( skipDead ) {
			int64_t uip = preq->m_urlIp;
			int64_t pip = sp->m_ip;
			int64_t h64 = hash64h ( uip , pip );
			if ( s_proxyBannedTable.isInTable ( &h64 ) ) {
				numBannedProxies++;
				continue;
			}
		}

		// if it failed the last test, skip it
		if ( skipDead && sp->m_lastDownloadError ) continue;

		if ( skipDead ) aliveProxyCandidates++;

		if ( sp->m_countForThisIp >= minCount ) continue;
		minCount = sp->m_countForThisIp;
	}

	// all dead? then get the best dead one
	if ( minCount == 999999 ) {
		skipDead = false;
		goto redo;
	}

	// . we only use one proxy if none are banned by this IP
	// . when that gets banned, we will use the next 2 proxies with
	//   a higher backoff/crawlDelay, etc.
	int32_t threshHold;
	if      ( numBannedProxies <= 0  ) threshHold = 1;

	// if first proxy gets banned, try next 2 proxies until both get banned
	else if ( numBannedProxies == 1  ) threshHold = 2;
	else if ( numBannedProxies <  1+2) threshHold = 3 - numBannedProxies;

	// if next two proxies got banned, try next 4 proxies until banned
	else if ( numBannedProxies == 3  ) threshHold = 4;
	else if ( numBannedProxies <  3+4) threshHold = 7 - numBannedProxies;

	// if next 4 proxies got banned, try next 8 proxies until they get banned
	else if ( numBannedProxies == 7  ) threshHold = 8;
	else if ( numBannedProxies <  7+8) threshHold = 15 - numBannedProxies;

	else if ( numBannedProxies == 15) threshHold = 16;
	else if ( numBannedProxies <  15+16 ) threshHold = 31-numBannedProxies;

	else if ( numBannedProxies == 31 ) threshHold = 32;
	else if ( numBannedProxies <  31+32)threshHold=63-numBannedProxies;

	else if ( numBannedProxies == 63 ) threshHold = 64;
	else if ( numBannedProxies <  63+64)threshHold=127-numBannedProxies;

	else if ( numBannedProxies == 127 ) threshHold = 128;
	else if ( numBannedProxies <  127+128)threshHold=255-numBannedProxies;

	else if ( numBannedProxies == 255 ) threshHold = 256;
	else if ( numBannedProxies <  255+256)threshHold=512-numBannedProxies;

	else if ( numBannedProxies == 511 ) threshHold = 512;
	else if ( numBannedProxies <  511+512)threshHold=1024-numBannedProxies;

	else threshHold = 1024;
	
	
	if ( threshHold <= 0 ) {
		log("proxy: spiderproxy error in threshold of %" PRId32" "
		    "for banned=%" PRId32,threshHold,numBannedProxies);
		threshHold = 1;
	}

	// reset minCount so we can take the min over those we check here
	minCount = -1;
	int64_t oldest = 0x7fffffffffffffffLL;
	SpiderProxy *winnersp = NULL;
	int32_t count = 0;
	// start at a random slot based on url's IP so we don't
	// overload the first proxy
	int32_t start = ((uint32_t)urlIp) % s_iptab.getNumSlots();
	int32_t slotCount = s_iptab.getNumSlots();
	// . now find the best proxy with the minCount
	for ( int32_t i = start ; ; i++ ) {
		// scan all slots in hash table, then stop
		if ( slotCount-- <= 0 ) break;
		// wrap around to zero if we hit the end
		if ( i == s_iptab.getNumSlots() ) i = 0;
		// skip empty slots
		if ( ! s_iptab.m_flags[i] ) continue;
		// get the spider proxy
		SpiderProxy *sp = (SpiderProxy *)s_iptab.getValueFromSlot(i);
		// if it failed the last test, skip it... not here...
		if ( skipDead && sp->m_lastDownloadError ) continue;

		// if this proxy was banned by the url's ip... skip it. it is
		// not a candidate...
		if ( skipDead ) {
			int64_t uip = preq->m_urlIp;
			int64_t pip = sp->m_ip;
			int64_t h64 = hash64h ( uip , pip );
			if ( s_proxyBannedTable.isInTable ( &h64 ) ) continue;
		}

		// if some proxies are "alive" then only pick from
		// the first half of the proxies that are alive (i.e. still
		// work). that way, when one of those goes dead we will inc
		// the backoff (crawldelay) and a new proxy that we haven't
		// used for this url's IP will take its place. and such
		// new proxies will only have the new backoff count used
		// through them. that way, we don't get ALL of our proxies
		// banned at about the same time since we do somewhat uniform
		// load balancing over them.
		if ( skipDead && count >= threshHold)//aliveProxyCandidates/2 )
			continue;

		// count the alive/non-banned candidates
		count++;

		// if all hosts were "dead" because they all had 
		// m_lastDownloadError set then minCount will be 999999
		// and nobody should continue from this statement:
		if ( sp->m_countForThisIp > minCount && minCount>=0 ) continue;
		// then go by last download time for this ip
		if ( sp->m_countForThisIp == minCount && minCount>=0 &&
		     sp->m_lastTimeUsedForThisIp >= oldest ) 
			continue;

		// pick the spider proxy used longest ago
		oldest   = sp->m_lastTimeUsedForThisIp;
		minCount = sp->m_countForThisIp;
		// got a new winner
		winnersp = sp;
	}

	// we must have a winner
	if ( ! winnersp ) { g_process.shutdownAbort(true); }

	int64_t nowms = gettimeofdayInMillisecondsLocal();

	// add a new load bucket then!
	LoadBucket bb;
	bb.m_urlIp = urlIp;
	// the time it started
	bb.m_downloadStartTimeMS = nowms;
	// download has not ended yet
	bb.m_downloadEndTimeMS = 0LL;
	// the host using the proxy
	bb.m_hostId = udpSlot->getHostId();
	// key is this for m_prTable
	bb.m_proxyIp   = winnersp->m_ip;
	bb.m_proxyPort = winnersp->m_port;
	// a new id. we use this to update the downloadEndTime when done
	static int32_t s_lbid = 0;
	// add it now
	bb.m_id = s_lbid++;
	s_loadTable.addKey ( &urlIp , &bb );
	// winner count update
	winnersp->m_timesUsed++;

	// sanity
	if ( (int32_t)sizeof(ProxyReply) > TMPBUFSIZE ){g_process.shutdownAbort(true);}

	// and give proxy ip/port back to the requester so they can
	// use that to download their url
	ProxyReply *prep = (ProxyReply *)udpSlot->m_tmpBuf;
	prep->m_proxyIp = winnersp->m_ip;
	prep->m_proxyPort = winnersp->m_port;

	// this is just '\0' if none
	strcpy(prep->m_usernamePwd,winnersp->m_usernamePwd);

	// do not count the proxy we are returning as "more"
	prep->m_hasMoreProxiesToTry = ( aliveProxyCandidates > 1 );
	// and the loadbucket id, so requester can tell us it is done
	// downloading through the proxy and we can update the LoadBucket
	// for this transaction (m_lbId)
	prep->m_lbId = bb.m_id;
	// requester wants to know how many proxies have been banned by the
	// urlIp so it can increase a self-imposed crawl-delay to be more
	// sensitive to the spider policy.
	prep->m_numBannedProxies = numBannedProxies;

	//char *p = udpSlot->m_tmpBuf;
	//*(int32_t  *)p = winnersp->m_ip  ; p += 4;
	//*(int16_t *)p = winnersp->m_port; p += 2;
	// and the loadbucket id
	//*(int32_t *)p = bb.m_id; p += 4;

	// with dup keys we end up with long chains of crap and this
	// takes forever. so just flush the whole thing every 2 minutes
	// when 10000+ slots are in there
	static time_t s_lastTime = 0;
	time_t now = nowms / 1000;
	if ( s_lastTime == 0 ) s_lastTime = now;
	time_t elapsed = now - s_lastTime;
	if ( elapsed > 120 && s_loadTable.getNumSlots() > 10000 ) {
		log("sproxy: flushing %i entries from proxy loadtable that "
		    "have accumulated since %i seconds ago",
		    (int)s_loadTable.m_numSlotsUsed,(int)elapsed);
		s_loadTable.clear();
		// only do this once per two minutes
		s_lastTime = now;
	}


	int32_t sanityCount = 0;//s_loadTable.getNumSlots();
	// top:
	// now remove old entries from the load table. entries that
	// have completed and have a download end time more than 10 mins ago.
	for ( int32_t i = s_loadTable.getNumSlots() - 1 ; i >= 0 ; i-- ) {
		// skip if empty
		if ( ! s_loadTable.m_flags[i] ) continue;
		// get the bucket
		LoadBucket *pp =(LoadBucket *)s_loadTable.getValueFromSlot(i);
		// skip if still active
		if ( pp->m_downloadEndTimeMS == 0LL ) continue;
		// delta t
		int64_t took = nowms - pp->m_downloadEndTimeMS;
		// skip if it ended less than LOADPOINT_EXPIRE_MS ago
		// (was 10 mins, now 15 seconds, to prevent clogging)
		if ( took < LOADPOINT_EXPIRE_MS ) continue;

		// 100 at a time so we don't slam cpu
		if ( sanityCount++ > 100 ) break;

		// ok, its too old, nuke it to save memory
		s_loadTable.removeSlot(i);
		// the keys might have buried us but we really should not
		// miss out on analyzing any keys if we just keep looping here
		// should we? TODO: figure it out. if we miss a few it's not
		// a big deal.
		//i--;
		//goto top;
	}

	// send the proxy ip/port/LBid back to user
	g_udpServer.sendReply(udpSlot->m_tmpBuf, sizeof(ProxyReply), udpSlot->m_tmpBuf, sizeof(ProxyReply), udpSlot);
}
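The threshold ladder above doubles through tiers: with n proxies banned and 2^k - 1 <= n < 2^(k+1) - 1, the threshold is (2^(k+1) - 1) - n, capped at 1024 once 1023 proxies are banned. A compact sketch equivalent to the if-chain (hypothetical helper; unlike the original it can never go non-positive, so the error guard is unneeded here):

#include <cstdint>

// closed-form equivalent of the banned-proxy threshold ladder
static int32_t proxyThreshold ( int32_t numBannedProxies ) {
	if ( numBannedProxies <= 0    ) return 1;
	if ( numBannedProxies >= 1023 ) return 1024; // the final "else"
	// find the tier 2^k with 2^k - 1 <= numBannedProxies < 2^(k+1) - 1
	int32_t tier = 1;
	while ( 2 * tier - 1 <= numBannedProxies ) tier *= 2;
	// threshold counts down from "tier" as more of the band gets banned
	return ( 2 * tier - 1 ) - numBannedProxies;
}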
// . this returns false if blocks, true otherwise
// . sets g_errno on failure
bool Msg1c::gotList ( ) {

	if ( g_errno ) return true;

	int64_t *tmpDocIds = m_msg3a.getDocIds();
	int32_t       numDocIds = m_msg3a.getNumDocIds();

	if ( m_startNum > 0) {
		numDocIds -= m_startNum;
		tmpDocIds = &tmpDocIds[m_startNum];
	}

	m_numDocIds = numDocIds; // save for reporting
	// log it
	log(LOG_INFO,"admin: Got %" PRId32" docIds for query reindex.", numDocIds);
	// bail if no need
	if ( numDocIds <= 0 ) return true;

	// force spiders on across the entire network. they will propagate from 
	// host #0... 
	g_conf.m_spideringEnabled = true;

	int32_t nowGlobal = getTimeGlobal();

	HashTableX dt;
	char dbuf[1024];
	dt.set(8,0,64,dbuf,1024,false,0,"ddocids");

	m_sb.setLabel("reiadd");

	State13 *st = (State13 *)m_state;
	GigablastRequest *gr = &st->m_gr;

	m_numDocIdsAdded = 0;

	// list consists of docIds, loop through each one
 	for(int32_t i = 0; i < numDocIds; i++) {
		int64_t docId = tmpDocIds[i];
		// when searching events we get multiple docids that are same
		if ( dt.isInTable ( &docId ) ) continue;
		// add it
		if ( ! dt.addKey ( &docId ) ) return true;

		SpiderRequest sr;
		sr.reset();

		// url is a docid!
		sprintf ( sr.m_url , "%" PRIu64 , docId );

		// make a fake first ip
		// use only 64k values so we don't stress doledb/waittrees/etc.
		// for large #'s of docids
		int32_t firstIp = (docId & 0x0000ffff);

		// bits 6-13 of the docid are the domain hash so use those
		// when doing a REINDEX (not delete!) to ensure that requests
		// on the same domain go to the same shard, at least when
		// we have up to 256 shards. if we have more than 256 shards
		// at this point some shards will not participate in the
		// query reindex/delete process because of this, so 
		// we'll want to allow more bits in that case perhaps.
		// check out Hostdb::getShardNum(RDB_SPIDERDB) in Hostdb.cpp
		// to see what shard is responsible for storing and indexing 
		// this SpiderRequest based on the firstIp.
		if ( ! m_forceDel ) { 
			// if we are a REINDEX not a delete because 
			// deletes don't need to spider/redownload the doc
			// so the distribution can be more random
			firstIp >>= 6;
			firstIp &= 0xff;
		}

		// 0 is not a legit val. it'll core below.
		if ( firstIp == 0 ) {
			firstIp = 1;
		}

		// use a fake ip
		sr.m_firstIp        =  firstIp;
		// we are not really injecting...
		sr.m_isInjecting    =  false;//true;
		sr.m_hopCount       = -1;
		sr.m_isPageReindex  =  1;
		sr.m_urlIsDocId     =  1;
		sr.m_fakeFirstIp    =  1;

		// now you can recycle content instead of re-downloading it
		// for every docid
		sr.m_recycleContent = gr->m_recycleContent;
		// if this is zero we end up getting deduped in
		// dedupSpiderList() if there was a SpiderReply whose
		// spider time was > 0
		sr.m_addedTime = nowGlobal;
		sr.m_forceDelete = m_forceDel ? 1 : 0;

		// . complete its m_key member
		// . parentDocId is used to make the key, but only allow one
		//   page reindex spider request per url... so use "0"
		// . this will set "uh48" to hash64b(m_url) which is the docid
		sr.setKey( firstIp, 0LL , false );

		// how big to serialize
		int32_t recSize = sr.getRecSize();

		m_numDocIdsAdded++;
	
		// store it
		if ( ! m_sb.safeMemcpy ( (char *)&sr , recSize ) ) {
			// g_errno must be set
			if ( ! g_errno ) { g_process.shutdownAbort(true); }

			log(LOG_LOGIC,
			    "admin: Query reindex size of %" PRId32" "
			    "too big. Aborting. Bad engineer." , 
			    (int32_t)0);//m_list.getListSize() );
			return true;
		}
	}
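A standalone sketch of the fake-firstIp derivation used in the loop above (hypothetical helper; it mirrors the docid-bit logic so that same-domain reindex requests shard together):

#include <cstdint>

// derive the fake firstIp for a query-reindex SpiderRequest
static int32_t fakeFirstIp ( int64_t docId , bool forceDelete ) {
	// use only 64k values so we don't stress doledb/waittrees/etc.
	int32_t firstIp = (int32_t)(docId & 0x0000ffff);
	// for a reindex (not a delete) use docid bits 6-13, the domain
	// hash, so requests on the same domain go to the same shard
	if ( ! forceDelete ) {
		firstIp >>= 6;
		firstIp &= 0xff;
	}
	// 0 is not a legit value
	if ( firstIp == 0 ) firstIp = 1;
	return firstIp;
}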
// langId is language of the query
long long getSynBaseHash64 ( char *qstr , uint8_t langId ) {
	Words ww;
	ww.set3 ( qstr );
	long nw = ww.getNumWords();
	long long *wids = ww.getWordIds();
	//char **wptrs = ww.getWords();
	//long *wlens = ww.getWordLens();
	long long baseHash64 = 0LL;
	Synonyms syn;
	// assume english if unknown to fix 'pandora's tower'
	// vs 'pandoras tower' where both words are in both
	// english and german so langid is unknown
	if ( langId == langUnknown ) langId = langEnglish;
	// . store re-written query into here then hash that string
	// . this way we can get rid of spaces
	//char rebuf[1024];
	//char *p = rebuf;
	//if ( strstr(qstr,"cheatcodes") )
	//	log("hey");
	// for deduping
	HashTableX dups;
	if ( ! dups.set ( 8,0,1024,NULL,0,false,0,"qhddup") ) return false;
	// scan the words
	for ( long i = 0 ; i < nw ; i++ ) {
		// skip if not alnum
		if ( ! wids[i] ) continue;
		// get its synonyms into tmpBuf
		char tmpBuf[TMPSYNBUFSIZE];
		// . assume niceness of 0 for now
		// . make sure to get all synsets!! ('love' has two synsets)
		long naids = syn.getSynonyms (&ww,i,langId,tmpBuf,0);
		// term freq algo
		//long pop = g_speller.getPhrasePopularity(NULL,
		//					 wids[i],
		//					 true,
		//					 langId);
		// is it a queryStopWord like "the" or "and"?
		bool isQueryStop = ::isQueryStopWord(NULL,0,wids[i]);
		// a more restrictive list
		bool isStop = ::isStopWord(NULL,0,wids[i]);
		if ( ::isCommonQueryWordInEnglish(wids[i]) ) isStop = true;
		// find the smallest one
		unsigned long long min = wids[i];
		//char *minWordPtr = wptrs[i];
		//long  minWordLen = wlens[i];
		// declare up here since we have a goto below
		long j;
		// add to table too
		if ( dups.isInTable ( &min ) ) goto gotdup;
		// add to it
		if ( ! dups.addKey ( &min ) ) return false;
		// now scan the synonyms, they do not include "min" in them
		for ( j = 0 ; j < naids ; j++ ) {
			// get it
			unsigned long long aid64;
			aid64 = (unsigned long long)syn.m_aids[j];
			// if any syn already hashed then skip it and count
			// as a repeated term. we have to do it this way
			// rather than just getting the minimum synonym 
			// word id, because 'love' has two synsets and
			// 'like', a synonym of 'love' only has one synset
			// and they end up having different minimum synonym
			// word ids!!!
			if ( dups.isInTable ( &aid64 ) ) break;
			// add it. this could fail!
			if ( ! dups.addKey ( &aid64 ) ) return false;
			// set it?
			if ( aid64 >= min ) continue;
			// got a new min
			min = aid64;
			//minWordPtr = syn.m_termPtrs[j];
			//minWordLen = syn.m_termLens[j];
			// get largest term freq of all synonyms
			//long pop2 = g_speller.getPhrasePopularity(NULL,aid64,
			//					  true,langId);
			//if ( pop2 > pop ) pop = pop2;
		}
		// early break out means a hit in dups table
		if ( j < naids ) {
		gotdup:
			// do not count as repeat if query stop word
			// because they often repeat
			if ( isQueryStop ) continue;
			// count # of repeated word forms
			//nrwf++;
			continue;
		}
		// hash that now
		// do not include stop words in synbasehash so
		// 'search the web' != 'search web'
		if ( ! isStop ) {
			// no! make it order independent so 'search the web'
			// equals 'web the search' and 'engine search'
			// equals 'search engine'
			//baseHash64 <<= 1LL;
			baseHash64 ^= min;
		}
		// count it, but only if not a query stop word like "and"
		// or "the" or "a". # of unique word forms.
		//if ( ! isQueryStop ) nuwf++;
		// get term freq 
		//if ( pop > maxPop ) maxPop = pop;
		// control word?
		//if ( wids[i] == cw1 ) ncwf++;
	}
	return baseHash64;
}
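The key property above is that XOR-combining the per-word minimum hashes makes baseHash64 order-independent, so 'search the web' and 'web the search' collide by design; the dups table matters because XOR-ing the same hash in twice would cancel it back out. A tiny sketch of the property with standard-library stand-ins (std::hash rather than the codebase's hash64b):

#include <cstdint>
#include <functional>
#include <sstream>
#include <string>
#include <unordered_set>

// order-independent hash over the set of unique words,
// mirroring the "baseHash64 ^= min" accumulation above
static uint64_t orderFreeHash ( const std::string &query ) {
	std::istringstream in ( query );
	std::unordered_set<uint64_t> seen; // dedup, like the dups table
	uint64_t h = 0;
	for ( std::string word ; in >> word ; ) {
		uint64_t wh = std::hash<std::string>{} ( word );
		if ( seen.insert ( wh ).second ) h ^= wh;
	}
	return h;
}
// orderFreeHash("search engine") == orderFreeHash("engine search")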
// . so now this adds a list of Synonyms to the m_pools[] and returns a ptr
//   to the first one.
// . then the parent caller can store that ptr in the m_wordToSyn[] array
//   which we pre-alloc upon calling the set() function based on the # of
//   words we got
// . returns # of synonyms stored into "tmpBuf"
long Synonyms::getSynonyms ( Words *words , 
			     long wordNum , 
			     uint8_t langId ,
			     char *tmpBuf ,
			     long niceness ) {

	// punct words have no synonyms
	if ( ! words->m_wordIds[wordNum] ) return 0;

	// store these
	m_words     = words;
	m_docLangId = langId;
	m_niceness = niceness;

	// sanity check
	if ( wordNum > m_words->m_numWords ) { char *xx=NULL;*xx=0; }

	// init the dedup table to dedup wordIds
	HashTableX dt;
	char dbuf[512];
	dt.set(8,0,12,dbuf,512,false,m_niceness,"altwrds");


	long maxSyns = (long)MAX_SYNS;

	char *bufPtr = tmpBuf;

	// point into buffer
	m_aids = (long long *)bufPtr;
	bufPtr += maxSyns * 8;

	// then the word ids
	m_wids0 = (long long *)bufPtr;
	bufPtr += maxSyns * 8;

	// second word ids, for multi alnum word synonyms, i.e. "New Jersey"
	m_wids1 = (long long *)bufPtr;
	bufPtr += maxSyns * 8;

	m_termPtrs = (char **)bufPtr;
	bufPtr += maxSyns * 4;
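	// NOTE: the "* 4" slot sizes here and below assume a 32-bit
	// build where sizeof(char *) == sizeof(long) == 4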

	m_termLens = (long *)bufPtr;
	bufPtr += maxSyns * 4;

	m_numAlnumWords = (long *)bufPtr;
	bufPtr += maxSyns * 4;

	m_numAlnumWordsInBase = (long *)bufPtr;
	bufPtr += maxSyns * 4;


	// source
	m_src = bufPtr;
	bufPtr += maxSyns;

	// cursors
	m_aidsPtr  = m_aids;
	m_wids0Ptr = m_wids0;
	m_wids1Ptr = m_wids1;
	m_srcPtr   = m_src;
	m_termPtrsPtr = m_termPtrs;
	m_termLensPtr = m_termLens;
	m_numAlnumWordsPtr = m_numAlnumWords;
	m_numAlnumWordsInBasePtr = m_numAlnumWordsInBase;

	
	char *w    = m_words->m_words   [wordNum];
	long  wlen = m_words->m_wordLens[wordNum];

	//
	// NOW hit wiktionary
	// Trust this less than our s_exceptions above, but more than
	// our morph computations below
	//

	char sourceId = SOURCE_WIKTIONARY;
	char *ss = NULL;
	long long bwid;
	char wikiLangId = m_docLangId;
	bool hadSpace ;
	long klen ;
	long baseNumAlnumWords;

 tryOtherLang:

	/*
	// if word only exists in one language, assume that language for word
	// even if m_docLangId is langUnknown (0)
	if ( ! ss &&
	     ! m_docLangId &&
	     ! wikiLangId ) {
		// get raw word id
		bwid = m_words->m_wordIds[wordNum];
		// each lang has its own bit
		long long bits = g_speller.getLangBits64 ( &bwid );
		// skip if not unique
		char count = getNumBitsOn64 ( bits ) ;
		// if we only got one lang we could be, assume that
		if ( count == 1 )
			// get it. bit #0 is english, so add 1
			wikiLangId = getBitPosLL((uint8_t *)&bits) + 1;
		// try setting based on script. greek. russian. etc.
		// if the word was not in the wiktionary.
		// this will be langUnknown if not definitive.
		else
			wikiLangId = getCharacterLanguage(w);
	}
	*/

	// try looking up bigram so "new jersey" gets "nj" as synonym
	if ( wikiLangId && 
	     wordNum+2< m_words->m_numWords &&
	     m_words->m_wordIds[wordNum+2]) {
		// get phrase id bigram then
		long conti = 0;
		bwid = hash64Lower_utf8_cont(w,wlen,0,&conti);
		// then the next word
		char *wp2 = m_words->m_words[wordNum+2];
		long  wlen2 = m_words->m_wordLens[wordNum+2];
		bwid = hash64Lower_utf8_cont(wp2,wlen2,bwid,&conti);
		baseNumAlnumWords = 2;
		ss = g_wiktionary.getSynSet( bwid, wikiLangId );
	}

	// need a language for wiktionary to work with
	if ( wikiLangId && ! ss ) {
		// get raw word id
		bwid = m_words->m_wordIds[wordNum];
		baseNumAlnumWords = 1;
		//if ( bwid == 1424622907102375150LL)
		//	log("a");
		ss = g_wiktionary.getSynSet( bwid, wikiLangId );
		// if that failed try removing 's from word if there
		if ( ! ss && 
		     wlen >= 3 &&
		     w[wlen-2]=='\'' && 
		     w[wlen-1]=='s' ) {
			long long cwid = hash64Lower_utf8(w,wlen-2);
			ss = g_wiktionary.getSynSet( cwid, wikiLangId );
		}
	}

	// even though a document may be in german it often has some
	// english words "pdf download" "copyright" etc. so if the word
	// has no synset in german, try it in english
	if ( //numPresets == 0 &&
	     ! ss &&
	     m_docLangId != langEnglish &&
	     wikiLangId  != langEnglish &&
	     m_docLangId &&
	     g_speller.getSynsInEnglish(w,wlen,m_docLangId,langEnglish) ) {
		// try english
		wikiLangId = langEnglish;
		sourceId   = SOURCE_WIKTIONARY_EN;
		goto tryOtherLang;
	}

	// if it was in wiktionary, just use that synset
	if ( ss ) {
		// prepare the dedup table
		HashTableX dedup;
		HashTableX *dd = NULL;
		char dbuf[512];
		long count = 0;
	addSynSet:
		// do we have another set following this
		char *next = g_wiktionary.getNextSynSet(bwid,m_docLangId,ss);
		// if so, init the dedup table then
		if ( next && ! dd ) {
			dd = &dedup;
			dd->set ( 8,0,8,dbuf,512,false,m_niceness,"sddbuf");
		}
		// skip over the pipe i guess
		char *pipe = ss + 2;
		// zh_ch?
		if ( *pipe == '_' ) pipe += 3;
		// sanity
		if ( *pipe != '|' ) { char *xx=NULL;*xx=0; }
		// point to word list
		char *p = pipe + 1;
		// hash up the list of words, they are in utf8 and comma-separated
		char *e = p + 1;
		// save count in case we need to undo
		//long saved = m_numAlts[wordNum];
	hashLoop:


		// skip synonyms that are acronym expansions because it's
		// too ambiguous. there are mappings like
		// "PC" -> "PC,Personal Computer" 
		// "PC" -> "PC,Probable Cause" ... (lots more!)
		//bool isAnagram = true;
		for ( ; *e !='\n' && *e != ',' ; e++ ) ;
		//	if ( ! is_upper_a(*e) ) isAnagram = false;

		// get it
		long long h = hash64Lower_utf8_nospaces ( p , e - p );

		// skip if same as base word
		if ( h == bwid ) goto getNextSyn;

		// should we check for dups?
		if ( dd ) {
			// skip dups
			if ( dd->isInTable(&h) ) goto getNextSyn;
			// dedup. return false with g_errno set on error
			if ( ! dd->addKey(&h) ) return m_aidsPtr - m_aids;
		}
		// store it
		*m_aidsPtr++ = h;

		// store source
		*m_srcPtr++ = sourceId;

		hadSpace = false;
		klen = e - p;
		for ( long k = 0 ; k < klen ; k++ )
			if ( is_wspace_a(p[k]) ) hadSpace = true;

		*m_termPtrsPtr++ = p;
		*m_termLensPtr++ = e-p;

		// only for multi-word synonyms like "New Jersey"...
		*m_wids0Ptr = 0LL;
		*m_wids1Ptr = 0LL;
		*m_numAlnumWordsPtr = 1;

		// and for multi alnum word synonyms
		if ( hadSpace ) {
			Words sw;
			sw.setx ( p , e - p , m_niceness );
			*(long long *)m_wids0Ptr = sw.m_wordIds[0];
			*(long long *)m_wids1Ptr = sw.m_wordIds[2];
			*(long  *)m_numAlnumWordsPtr = sw.getNumAlnumWords();
		}

		m_wids0Ptr++;
		m_wids1Ptr++;
		m_numAlnumWordsPtr++;

		// how many words did we have to hash to find a synset?
		// i.e. "new jersey" would be 2, to get "nj"
		*m_numAlnumWordsInBasePtr++ = baseNumAlnumWords;

		// do not breach
		if ( ++count >= maxSyns ) goto done;
	getNextSyn:
		// loop for more
		if ( *e == ',' ) { e++; p = e; goto hashLoop; }
		// add in the next syn set, deduped
		if ( next ) { ss = next; goto addSynSet; }
		// wrap it up
	done:
		// all done
		return m_aidsPtr - m_aids;
	}


	// strip marks from THIS word, return -1 w/ g_errno set on error
	if ( ! addStripped ( w , wlen,&dt ) ) return m_aidsPtr - m_aids;

	// returns false with g_errno set
	if ( ! addAmpPhrase ( wordNum, &dt ) ) return m_aidsPtr - m_aids;

	// if we end in apostrophe, strip and add
	if ( wlen>= 3 &&
	     w[wlen-1] == 's' && 
	     w[wlen-2]=='\'' &&
	     ! addWithoutApostrophe ( wordNum, &dt ) )
		return m_aidsPtr - m_aids;

	return m_aidsPtr - m_aids;
}
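The set-up above carves one caller-supplied tmpBuf into parallel typed arrays so no per-call allocation is needed. A minimal standalone sketch of the carving pattern (hypothetical struct; the real layout is the one above, and the buffer is assumed suitably aligned):

#include <cstdint>

struct SynArrays {
	int64_t  *aids;  // synonym hash ids
	char    **terms; // term pointers
	int32_t  *lens;  // term lengths
};

// carve one flat buffer into parallel arrays, as getSynonyms() does
static SynArrays carve ( char *buf , int32_t maxSyns ) {
	SynArrays a;
	a.aids  = (int64_t  *)buf; buf += maxSyns * sizeof(int64_t);
	a.terms = (char    **)buf; buf += maxSyns * sizeof(char *);
	a.lens  = (int32_t  *)buf; buf += maxSyns * sizeof(int32_t);
	return a;
}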
// returns false if blocked, true otherwise
bool Statsdb::gifLoop ( ) {
	// shortcut
	Msg5 *m = &m_msg5;

	//#ifndef _USEPLOTTER_
	//return true;
	//#endif

	// loop over all the lists in the time range, [m_t1,m_t2]
	for ( ; ! m_done ; ) {
		if ( ! m->getList ( (char)RDB_STATSDB	,
				    "statsdb"		, // coll
				    &m_list		,
				    (char *)&m_startKey	,
				    (char *)&m_endKey	,
				    32000	, // requested scan size
				    true 	, // include tree?
				    false	, // add to cache?
				    0		, // max cache age
				    0		, // start file number
				    -1		, // number of files
				    NULL	, // state
				    gotListWrapper, // callback
				    m_niceness	, // niceness
				    false	, // do error correction?
				    NULL	, // cache key pointer
				    0		, // # retries
				    -1		, // max # retries
				    true	, // compensate for merge?
				    -1		, // sync point
				    NULL	) ) // msg5b
			return false;
		// . process list
		// . returns false with g_errno set on error
		if ( ! processList() ) return true;
	}

	// time delta across the graphed window [m_t1,m_t2]; used below
	// for the x-axis tick labels
	long dt = m_t2 - m_t1;

	//#ifdef _USEPLOTTER_

	// gif size
	//char tmp[64];
	// dimensions of the gif
	//sprintf ( tmp , "%lix%li", (long)DX+m_bx*2 , (long)DY+m_by*2 );
	//GIFPlotter::parampl ( "BITMAPSIZE" , (void *)tmp );
	// create one
	//GIFPlotter plotter ( NULL , m_fd , NULL );
	// open it
	//plotter.openpl ( );

	// define the space with boundaries 100 unit wide boundaries
	//plotter.space ( 0 , 0 , DX + m_bx * 2 , DY + m_by * 2 );

	// line thickness in user coordinates (pixels for us)
	//plotter.linewidth ( 1 );       
	// set bg color to gray (r/g/b) 
	//plotter.bgcolor ( 0xd600 , 0xce00 , 0xd600 );
	// erase Plotter's graphics display
	//plotter.erase ();                
	// draw axises in black
	//plotter.pencolorname ("black");    

	//
	// main graphing window
	//
	m_gw.safePrintf("<div style=\"position:relative;"
		      "background-color:#c0c0c0;"
		      //"overflow-y:hidden;"
		      "overflow-x:hidden;"
		      "z-index:-10;"
		      // the tick marks we print below are based on it
		      // being a window of the last 20 seconds... and using
		      // DX pixels
		      "min-width:%lipx;"
		      "min-height:%lipx;"
		      //"width:100%%;"
		      //"min-height:600px;"
		      "margin-top:10px;"
		      "margin-bottom:10px;"
		      "margin-right:10px;"
		      "margin-left:10px;\">"
		      ,(long)DX + 2 *m_bx
			,(long)DY + 2*m_by);


	// draw the x-axis
	//plotter.line ( m_bx , m_by , DX + m_bx , m_by  );

	// 20 x-axis tick marks
	for ( int x = DX/20 ; x <= DX ; x += DX/20 ) {
		// tick mark
		//plotter.line ( x , -20 , x , 20 );
		m_gw.safePrintf("<div style=\"position:absolute;"
			      "left:%li;"
			      "bottom:0;"
			      "background-color:#000000;"
			      "z-index:110;"
			      "min-height:20px;"
			      "min-width:3px;\"></div>\n"
			      , m_bx + (long)x-1
			      );
		long xv = (long)(dt * (long long)x/(long long)DX)-(long)dt;
		// LABEL
		m_gw.safePrintf("<div style=\"position:absolute;"
				"left:%li;"
				"bottom:20;"
				//"background-color:#000000;"
				"z-index:110;"
				"min-height:20px;"
				"min-width:3px;\">%lis</div>\n"
				, (long)x-10 + m_bx
				// the label:
				, xv
				);
	}


	HashTableX tmpht;
	tmpht.set(4,0,0,NULL,0,false,m_niceness,"statsparms");

	long col = 0;

	m_sb2->safePrintf("<table border=1 width=100%%>\n");

	// label offset to prevent collisions when superimposing multiple
	// graph calibrations
	long zoff = 0;


	//
	// point to the triplets in m_sb1's buffer (x,y,c)
	//
	char *p    = m_sb1.getBufStart();
	char *pend = p + m_sb1.length();
	for ( ; p < pend ; p += 12 ) {
		// breathe
		QUICKPOLL ( m_niceness );
		// get graph hash of this point
		long  gh = *(long *)(p +8);

		// if we already did this graph, skip it
		if ( tmpht.isInTable ( &gh ) ) continue;

		// . graph this single graph of this color
		// . returns ptr to first point of different color!
		plotGraph ( p , pend , gh , m_gw , zoff );
		// prevent collisions
		zoff += 20;

		// get the label based on graphHash
		Label *bb = getLabel ( gh );

		// add to key
		if ( col == 0 )
			m_sb2->safePrintf("<tr>");

		m_sb2->safePrintf("<td bgcolor=#%06lx>&nbsp; &nbsp;</td>"
				 "<td>%s</td>\n",
				 bb->m_color ,
				 bb->m_keyDesc );

		if ( col == 1 )
			m_sb2->safePrintf("</tr>\n");

		// inc column and wrap
		if ( ++col >= 2 ) col = 0;

		// . do not re-display 
		// . TODO: deal with error
		tmpht.addKey ( &gh );
		
	}



	// clear that up
	m_sb1.reset();

	// now plot the events, horizontal line segments like the performance
	// graph uses
	for ( long i = 0 ; i < m_ht3.m_numSlots ; i++ ) {
		// breathe
		QUICKPOLL ( m_niceness );
		// skip if slot empty
		if ( ! m_ht3.m_flags[i] ) continue;
		// get the offset into m_sb3
		long offset = *(long *)m_ht3.getValueFromSlot(i);
		// get buf start
		char *bufStart = m_sb3.getBufStart();
		// get the ptr
		EventPoint *pp = (EventPoint *)(bufStart + offset);

		// get name of parm
		Parm *m = g_parms.getParmFromParmHash ( pp->m_parmHash );
		// make sure we got it
		if ( ! m ) { 
			log("statsdb: unrecognized parm hash = %li",
			    pp->m_parmHash);
			continue;
			//char *xx=NULL;*xx=0; }
		}

		// set the line width
		//plotter.linewidth ( pp->m_thickness );

		// get parm hash
		long colorHash = pp->m_parmHash;
		// add in old/new values to make it different
		colorHash = hash32h ( (long)pp->m_oldVal , colorHash );
		colorHash = hash32h ( (long)pp->m_newVal , colorHash );
		// . get color
		// . is really the parm hash in disguise
		long c1 = colorHash & 0x00ffffff;
		// use the color specified from addStat_r() for this line/pt
		//plotter.pencolor ( ((c1 >> 16) & 0xff) << 8 ,
		//		   ((c1 >>  8) & 0xff) << 8 ,
		//		   ((c1 >>  0) & 0xff) << 8 );

		long x1 = pp->m_a;
		long x2 = pp->m_b;
		long y1 = *(long *)m_ht3.getKey(i); // i value
		// ensure at least 10 units wide for visibility
		if ( x2 < x1 + 10 ) x2 = x1 + 10;
		// . flip the y so we don't have to scroll the browser down
		// . DY does not include the axis and tick marks
		//long fy1 = DY - y1 + m_by ;
		// plot it
		//plotter.line ( x1 , fy1 , x2 , fy1 );
		drawLine3 ( m_gw , x1 , x2 , y1 , c1 , pp->m_thickness );

		// add to map key? only if we haven't already
		if ( tmpht.isInTable ( &colorHash ) ) continue;

		// add it
		if ( col == 0 )
			m_sb2->safePrintf("<tr>");

		char *title = "unknown parm";
		if ( m ) title = m->m_title;

		m_sb2->safePrintf("<td bgcolor=#%06lx>&nbsp; &nbsp;</td>",c1);

		// print the parm name and old/new values
		m_sb2->safePrintf("<td><b>%s</b>",title);

		if ( pp->m_oldVal != pp->m_newVal )
			m_sb2->safePrintf(" (%.02f -> %.02f)",
					 pp->m_oldVal,pp->m_newVal);

		m_sb2->safePrintf("</td>");

		if ( col == 1 )
			m_sb2->safePrintf("</tr>\n");

		// inc column and wrap
		if ( ++col >= 2 ) col = 0;

		// . do not re-display 
		// . TODO: deal with error
		tmpht.addKey ( &colorHash ) ;
	}
	m_sb2->safePrintf("</table>\n");

	// clear that up
	m_ht3.reset();
	m_sb3.reset();

	// and stat states
	m_ht0.reset();
	m_sb0.reset();

	// all done free some mem
	m_sb1.reset();
	//m_sb2.reset();

	//
	// but not m_sb2 cuz that has the html in it!!
	//

	// all done
	//if ( plotter.closepl () < 0 ) 
	//	log("admin: Could not close performance graph object.");
	// close the file
	//fclose ( m_fd );

	//#endif

	// close main graphing window
	m_gw.safePrintf("</div>\n");

	return true;
}
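The tick-label math above maps a pixel offset x in [0, DX] to a label in [-dt, 0] seconds. As a standalone formula (hypothetical helper, same arithmetic as the xv line above):

// map tick pixel x in [0,dx] to its time label in seconds ago
static long tickLabelSeconds ( long x , long dx , long dt ) {
	return (long)( (long long)dt * x / (long long)dx ) - dt;
}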
static bool initEntityTable(){
	if ( ! s_isInitialized ) {
		// set up the hash table
		if ( ! s_table.set ( 8,4,255,NULL,0,false,0,"enttbl" ) )
			return log("build: Could not init table of "
					   "HTML entities.");
		// now add in all the HTML entities
		int32_t n = (int32_t)sizeof(s_entities) / (int32_t)sizeof(Entity);
		for ( int32_t i = 0 ; i < n ; i++ ) {
			int64_t h = hash64b ( s_entities[i].entity );
			// grab the unicode code point
			UChar32 up = s_entities[i].unicode;
			// every entity must have a non-zero code point
			if ( ! up ) { char *xx=NULL;*xx=0; }
			// point to it
			char *buf = (char *)s_entities[i].utf8;
			// encode the code point into the utf8 buffer
			int32_t len = utf8Encode(up,buf);
			//
			// make my own mods to make parsing easier
			//
			if ( up == 160 ) {  // nbsp
				buf[0] = ' '; len = 1; }
			// make all quotes equal '\"' (34 decimal)
			// double and single curling quotes
			//http://www.dwheeler.com/essays/quotes-test-utf-8.html
			// &#x201c, 201d, 2018, 2019 (unicode values, not utf8)
			// &ldquo, &rdquo, &lsquo, &rsquo
			/*
			if ( up == 171 ||
			     up == 187 ||
			     up == 8216 ||
			     up == 8217 ||
			     up == 8218 ||
			     up == 8220 ||
			     up == 8221 ||
			     up == 8222 ||
			     up == 8249 ||
			     up == 8250 ) {
				buf[0] = '\"'; len = 1; }
			// and normalize all dashes (mdash,ndash)
			if ( up == 8211 || up == 8212 ) {
				buf[0] = '-'; len = 1; }
			*/

			//
			// end custom mods
			//

			// set length
			s_entities[i].utf8Len = len;
			// check it
			if ( len == 0 ) { char *xx=NULL;*xx=0; }
			// must not exist!
			if ( s_table.isInTable(&h) ) { char*xx=NULL;*xx=0;}
			// store the entity index in the hash table as score
			if ( ! s_table.addTerm ( &h, i+1 ) ) return false;
		}
		s_isInitialized = true;
	} 
	return true;
}
Example #10
0
static bool isTLD ( char *tld , int32_t tldLen ) {

	int32_t pcount = 0;
	// now they are random!
	for ( int32_t i = 0 ; i < tldLen ; i++ ) {
		// period count
		if ( tld[i] == '.' ) { pcount++; continue; }
		if ( ! is_alnum_a(tld[i]) && tld[i] != '-' ) return false;
	}

	if ( pcount == 0 ) return true;
	if ( pcount >= 2 ) return false;

	// otherwise, if one period, check table to see if qualified

	// we use this as our hashtable
	static bool       s_isInitialized = false;
	// . i shrunk this list a lot
	// . see backups for the old list
	static const char * const s_tlds[] = {

	  // From: https://data.iana.org/TLD/tlds-alpha-by-domain.txt
	"AAA",
	"AARP",
	"ABB",
	"ABBOTT",
	"ABBVIE",
	"ABOGADO",
	"ABUDHABI",
	"AC",
	"ACADEMY",
	"ACCENTURE",
	"ACCOUNTANT",
	"ACCOUNTANTS",
	"ACO",
	"ACTIVE",
	"ACTOR",
	"AD",
	"ADAC",
	"ADS",
	"ADULT",
	"AE",
	"AEG",
	"AERO",
	"AF",
	"AFL",
	"AG",
	"AGAKHAN",
	"AGENCY",
	"AI",
	"AIG",
	"AIRFORCE",
	"AIRTEL",
	"AKDN",
	"AL",
	"ALIBABA",
	"ALIPAY",
	"ALLFINANZ",
	"ALLY",
	"ALSACE",
	"AM",
	"AMICA",
	"AMSTERDAM",
	"ANALYTICS",
	"ANDROID",
	"ANQUAN",
	"AO",
	"APARTMENTS",
	"APP",
	"APPLE",
	"AQ",
	"AQUARELLE",
	"AR",
	"ARAMCO",
	"ARCHI",
	"ARMY",
	"ARPA",
	"ARTE",
	"AS",
	"ASIA",
	"ASSOCIATES",
	"AT",
	"ATTORNEY",
	"AU",
	"AUCTION",
	"AUDI",
	"AUDIO",
	"AUTHOR",
	"AUTO",
	"AUTOS",
	"AVIANCA",
	"AW",
	"AWS",
	"AX",
	"AXA",
	"AZ",
	"AZURE",
	"BA",
	"BABY",
	"BAIDU",
	"BAND",
	"BANK",
	"BAR",
	"BARCELONA",
	"BARCLAYCARD",
	"BARCLAYS",
	"BAREFOOT",
	"BARGAINS",
	"BAUHAUS",
	"BAYERN",
	"BB",
	"BBC",
	"BBVA",
	"BCG",
	"BCN",
	"BD",
	"BE",
	"BEATS",
	"BEER",
	"BENTLEY",
	"BERLIN",
	"BEST",
	"BET",
	"BF",
	"BG",
	"BH",
	"BHARTI",
	"BI",
	"BIBLE",
	"BID",
	"BIKE",
	"BING",
	"BINGO",
	"BIO",
	"BIZ",
	"BJ",
	"BLACK",
	"BLACKFRIDAY",
	"BLOOMBERG",
	"BLUE",
	"BM",
	"BMS",
	"BMW",
	"BN",
	"BNL",
	"BNPPARIBAS",
	"BO",
	"BOATS",
	"BOEHRINGER",
	"BOM",
	"BOND",
	"BOO",
	"BOOK",
	"BOOTS",
	"BOSCH",
	"BOSTIK",
	"BOT",
	"BOUTIQUE",
	"BR",
	"BRADESCO",
	"BRIDGESTONE",
	"BROADWAY",
	"BROKER",
	"BROTHER",
	"BRUSSELS",
	"BS",
	"BT",
	"BUDAPEST",
	"BUGATTI",
	"BUILD",
	"BUILDERS",
	"BUSINESS",
	"BUY",
	"BUZZ",
	"BV",
	"BW",
	"BY",
	"BZ",
	"BZH",
	"CA",
	"CAB",
	"CAFE",
	"CAL",
	"CALL",
	"CAMERA",
	"CAMP",
	"CANCERRESEARCH",
	"CANON",
	"CAPETOWN",
	"CAPITAL",
	"CAR",
	"CARAVAN",
	"CARDS",
	"CARE",
	"CAREER",
	"CAREERS",
	"CARS",
	"CARTIER",
	"CASA",
	"CASH",
	"CASINO",
	"CAT",
	"CATERING",
	"CBA",
	"CBN",
	"CC",
	"CD",
	"CEB",
	"CENTER",
	"CEO",
	"CERN",
	"CF",
	"CFA",
	"CFD",
	"CG",
	"CH",
	"CHANEL",
	"CHANNEL",
	"CHASE",
	"CHAT",
	"CHEAP",
	"CHLOE",
	"CHRISTMAS",
	"CHROME",
	"CHURCH",
	"CI",
	"CIPRIANI",
	"CIRCLE",
	"CISCO",
	"CITIC",
	"CITY",
	"CITYEATS",
	"CK",
	"CL",
	"CLAIMS",
	"CLEANING",
	"CLICK",
	"CLINIC",
	"CLINIQUE",
	"CLOTHING",
	"CLOUD",
	"CLUB",
	"CLUBMED",
	"CM",
	"CN",
	"CO",
	"COACH",
	"CODES",
	"COFFEE",
	"COLLEGE",
	"COLOGNE",
	"COM",
	"COMMBANK",
	"COMMUNITY",
	"COMPANY",
	"COMPARE",
	"COMPUTER",
	"COMSEC",
	"CONDOS",
	"CONSTRUCTION",
	"CONSULTING",
	"CONTACT",
	"CONTRACTORS",
	"COOKING",
	"COOL",
	"COOP",
	"CORSICA",
	"COUNTRY",
	"COUPON",
	"COUPONS",
	"COURSES",
	"CR",
	"CREDIT",
	"CREDITCARD",
	"CREDITUNION",
	"CRICKET",
	"CROWN",
	"CRS",
	"CRUISES",
	"CSC",
	"CU",
	"CUISINELLA",
	"CV",
	"CW",
	"CX",
	"CY",
	"CYMRU",
	"CYOU",
	"CZ",
	"DABUR",
	"DAD",
	"DANCE",
	"DATE",
	"DATING",
	"DATSUN",
	"DAY",
	"DCLK",
	"DE",
	"DEALER",
	"DEALS",
	"DEGREE",
	"DELIVERY",
	"DELL",
	"DELOITTE",
	"DELTA",
	"DEMOCRAT",
	"DENTAL",
	"DENTIST",
	"DESI",
	"DESIGN",
	"DEV",
	"DIAMONDS",
	"DIET",
	"DIGITAL",
	"DIRECT",
	"DIRECTORY",
	"DISCOUNT",
	"DJ",
	"DK",
	"DM",
	"DNP",
	"DO",
	"DOCS",
	"DOG",
	"DOHA",
	"DOMAINS",
	"DOWNLOAD",
	"DRIVE",
	"DUBAI",
	"DURBAN",
	"DVAG",
	"DZ",
	"EARTH",
	"EAT",
	"EC",
	"EDEKA",
	"EDU",
	"EDUCATION",
	"EE",
	"EG",
	"EMAIL",
	"EMERCK",
	"ENERGY",
	"ENGINEER",
	"ENGINEERING",
	"ENTERPRISES",
	"EPSON",
	"EQUIPMENT",
	"ER",
	"ERNI",
	"ES",
	"ESQ",
	"ESTATE",
	"ET",
	"EU",
	"EUROVISION",
	"EUS",
	"EVENTS",
	"EVERBANK",
	"EXCHANGE",
	"EXPERT",
	"EXPOSED",
	"EXPRESS",
	"EXTRASPACE",
	"FAGE",
	"FAIL",
	"FAIRWINDS",
	"FAITH",
	"FAMILY",
	"FAN",
	"FANS",
	"FARM",
	"FASHION",
	"FAST",
	"FEEDBACK",
	"FERRERO",
	"FI",
	"FILM",
	"FINAL",
	"FINANCE",
	"FINANCIAL",
	"FIRESTONE",
	"FIRMDALE",
	"FISH",
	"FISHING",
	"FIT",
	"FITNESS",
	"FJ",
	"FK",
	"FLICKR",
	"FLIGHTS",
	"FLORIST",
	"FLOWERS",
	"FLSMIDTH",
	"FLY",
	"FM",
	"FO",
	"FOO",
	"FOOTBALL",
	"FORD",
	"FOREX",
	"FORSALE",
	"FORUM",
	"FOUNDATION",
	"FOX",
	"FR",
	"FRESENIUS",
	"FRL",
	"FROGANS",
	"FRONTIER",
	"FTR",
	"FUND",
	"FURNITURE",
	"FUTBOL",
	"FYI",
	"GA",
	"GAL",
	"GALLERY",
	"GALLO",
	"GALLUP",
	"GAME",
	"GARDEN",
	"GB",
	"GBIZ",
	"GD",
	"GDN",
	"GE",
	"GEA",
	"GENT",
	"GENTING",
	"GF",
	"GG",
	"GGEE",
	"GH",
	"GI",
	"GIFT",
	"GIFTS",
	"GIVES",
	"GIVING",
	"GL",
	"GLASS",
	"GLE",
	"GLOBAL",
	"GLOBO",
	"GM",
	"GMAIL",
	"GMBH",
	"GMO",
	"GMX",
	"GN",
	"GOLD",
	"GOLDPOINT",
	"GOLF",
	"GOO",
	"GOOG",
	"GOOGLE",
	"GOP",
	"GOT",
	"GOV",
	"GP",
	"GQ",
	"GR",
	"GRAINGER",
	"GRAPHICS",
	"GRATIS",
	"GREEN",
	"GRIPE",
	"GROUP",
	"GS",
	"GT",
	"GU",
	"GUCCI",
	"GUGE",
	"GUIDE",
	"GUITARS",
	"GURU",
	"GW",
	"GY",
	"HAMBURG",
	"HANGOUT",
	"HAUS",
	"HDFCBANK",
	"HEALTH",
	"HEALTHCARE",
	"HELP",
	"HELSINKI",
	"HERE",
	"HERMES",
	"HIPHOP",
	"HITACHI",
	"HIV",
	"HK",
	"HM",
	"HN",
	"HOCKEY",
	"HOLDINGS",
	"HOLIDAY",
	"HOMEDEPOT",
	"HOMES",
	"HONDA",
	"HORSE",
	"HOST",
	"HOSTING",
	"HOTELES",
	"HOTMAIL",
	"HOUSE",
	"HOW",
	"HR",
	"HSBC",
	"HT",
	"HTC",
	"HU",
	"HYUNDAI",
	"IBM",
	"ICBC",
	"ICE",
	"ICU",
	"ID",
	"IE",
	"IFM",
	"IINET",
	"IL",
	"IM",
	"IMAMAT",
	"IMMO",
	"IMMOBILIEN",
	"IN",
	"INDUSTRIES",
	"INFINITI",
	"INFO",
	"ING",
	"INK",
	"INSTITUTE",
	"INSURANCE",
	"INSURE",
	"INT",
	"INTERNATIONAL",
	"INVESTMENTS",
	"IO",
	"IPIRANGA",
	"IQ",
	"IR",
	"IRISH",
	"IS",
	"ISELECT",
	"ISMAILI",
	"IST",
	"ISTANBUL",
	"IT",
	"ITAU",
	"IWC",
	"JAGUAR",
	"JAVA",
	"JCB",
	"JCP",
	"JE",
	"JETZT",
	"JEWELRY",
	"JLC",
	"JLL",
	"JM",
	"JMP",
	"JNJ",
	"JO",
	"JOBS",
	"JOBURG",
	"JOT",
	"JOY",
	"JP",
	"JPMORGAN",
	"JPRS",
	"JUEGOS",
	"KAUFEN",
	"KDDI",
	"KE",
	"KERRYHOTELS",
	"KERRYLOGISTICS",
	"KERRYPROPERTIES",
	"KFH",
	"KG",
	"KH",
	"KI",
	"KIA",
	"KIM",
	"KINDER",
	"KITCHEN",
	"KIWI",
	"KM",
	"KN",
	"KOELN",
	"KOMATSU",
	"KP",
	"KPMG",
	"KPN",
	"KR",
	"KRD",
	"KRED",
	"KUOKGROUP",
	"KW",
	"KY",
	"KYOTO",
	"KZ",
	"LA",
	"LACAIXA",
	"LAMBORGHINI",
	"LAMER",
	"LANCASTER",
	"LAND",
	"LANDROVER",
	"LANXESS",
	"LASALLE",
	"LAT",
	"LATROBE",
	"LAW",
	"LAWYER",
	"LB",
	"LC",
	"LDS",
	"LEASE",
	"LECLERC",
	"LEGAL",
	"LEXUS",
	"LGBT",
	"LI",
	"LIAISON",
	"LIDL",
	"LIFE",
	"LIFEINSURANCE",
	"LIFESTYLE",
	"LIGHTING",
	"LIKE",
	"LIMITED",
	"LIMO",
	"LINCOLN",
	"LINDE",
	"LINK",
	"LIPSY",
	"LIVE",
	"LIVING",
	"LIXIL",
	"LK",
	"LOAN",
	"LOANS",
	"LOCUS",
	"LOL",
	"LONDON",
	"LOTTE",
	"LOTTO",
	"LOVE",
	"LR",
	"LS",
	"LT",
	"LTD",
	"LTDA",
	"LU",
	"LUPIN",
	"LUXE",
	"LUXURY",
	"LV",
	"LY",
	"MA",
	"MADRID",
	"MAIF",
	"MAISON",
	"MAKEUP",
	"MAN",
	"MANAGEMENT",
	"MANGO",
	"MARKET",
	"MARKETING",
	"MARKETS",
	"MARRIOTT",
	"MBA",
	"MC",
	"MD",
	"ME",
	"MED",
	"MEDIA",
	"MEET",
	"MELBOURNE",
	"MEME",
	"MEMORIAL",
	"MEN",
	"MENU",
	"MEO",
	"MG",
	"MH",
	"MIAMI",
	"MICROSOFT",
	"MIL",
	"MINI",
	"MK",
	"ML",
	"MLS",
	"MM",
	"MMA",
	"MN",
	"MO",
	"MOBI",
	"MOBILY",
	"MODA",
	"MOE",
	"MOI",
	"MOM",
	"MONASH",
	"MONEY",
	"MONTBLANC",
	"MORMON",
	"MORTGAGE",
	"MOSCOW",
	"MOTORCYCLES",
	"MOV",
	"MOVIE",
	"MOVISTAR",
	"MP",
	"MQ",
	"MR",
	"MS",
	"MT",
	"MTN",
	"MTPC",
	"MTR",
	"MU",
	"MUSEUM",
	"MUTUAL",
	"MUTUELLE",
	"MV",
	"MW",
	"MX",
	"MY",
	"MZ",
	"NA",
	"NADEX",
	"NAGOYA",
	"NAME",
	"NATURA",
	"NAVY",
	"NC",
	"NE",
	"NEC",
	"NET",
	"NETBANK",
	"NETWORK",
	"NEUSTAR",
	"NEW",
	"NEWS",
	"NEXT",
	"NEXTDIRECT",
	"NEXUS",
	"NF",
	"NG",
	"NGO",
	"NHK",
	"NI",
	"NICO",
	"NIKON",
	"NINJA",
	"NISSAN",
	"NISSAY",
	"NL",
	"NO",
	"NOKIA",
	"NORTHWESTERNMUTUAL",
	"NORTON",
	"NOWRUZ",
	"NP",
	"NR",
	"NRA",
	"NRW",
	"NTT",
	"NU",
	"NYC",
	"NZ",
	"OBI",
	"OFFICE",
	"OKINAWA",
	"OLAYAN",
	"OM",
	"OMEGA",
	"ONE",
	"ONG",
	"ONL",
	"ONLINE",
	"OOO",
	"ORACLE",
	"ORANGE",
	"ORG",
	"ORGANIC",
	"ORIGINS",
	"OSAKA",
	"OTSUKA",
	"OVH",
	"PA",
	"PAGE",
	"PAMPEREDCHEF",
	"PANERAI",
	"PARIS",
	"PARS",
	"PARTNERS",
	"PARTS",
	"PARTY",
	"PASSAGENS",
	"PE",
	"PET",
	"PF",
	"PG",
	"PH",
	"PHARMACY",
	"PHILIPS",
	"PHOTO",
	"PHOTOGRAPHY",
	"PHOTOS",
	"PHYSIO",
	"PIAGET",
	"PICS",
	"PICTET",
	"PICTURES",
	"PID",
	"PIN",
	"PING",
	"PINK",
	"PIZZA",
	"PK",
	"PL",
	"PLACE",
	"PLAY",
	"PLAYSTATION",
	"PLUMBING",
	"PLUS",
	"PM",
	"PN",
	"POHL",
	"POKER",
	"P**N",
	"POST",
	"PR",
	"PRAXI",
	"PRESS",
	"PRO",
	"PROD",
	"PRODUCTIONS",
	"PROF",
	"PROGRESSIVE",
	"PROMO",
	"PROPERTIES",
	"PROPERTY",
	"PROTECTION",
	"PS",
	"PT",
	"PUB",
	"PW",
	"PWC",
	"PY",
	"QA",
	"QPON",
	"QUEBEC",
	"QUEST",
	"RACING",
	"RE",
	"READ",
	"REALTOR",
	"REALTY",
	"RECIPES",
	"RED",
	"REDSTONE",
	"REDUMBRELLA",
	"REHAB",
	"REISE",
	"REISEN",
	"REIT",
	"REN",
	"RENT",
	"RENTALS",
	"REPAIR",
	"REPORT",
	"REPUBLICAN",
	"REST",
	"RESTAURANT",
	"REVIEW",
	"REVIEWS",
	"REXROTH",
	"RICH",
	"RICOH",
	"RIO",
	"RIP",
	"RO",
	"ROCHER",
	"ROCKS",
	"RODEO",
	"ROOM",
	"RS",
	"RSVP",
	"RU",
	"RUHR",
	"RUN",
	"RW",
	"RWE",
	"RYUKYU",
	"SA",
	"SAARLAND",
	"SAFE",
	"SAFETY",
	"SAKURA",
	"SALE",
	"SALON",
	"SAMSUNG",
	"SANDVIK",
	"SANDVIKCOROMANT",
	"SANOFI",
	"SAP",
	"SAPO",
	"SARL",
	"SAS",
	"SAXO",
	"SB",
	"SBI",
	"SBS",
	"SC",
	"SCA",
	"SCB",
	"SCHAEFFLER",
	"SCHMIDT",
	"SCHOLARSHIPS",
	"SCHOOL",
	"SCHULE",
	"SCHWARZ",
	"SCIENCE",
	"SCOR",
	"SCOT",
	"SD",
	"SE",
	"SEAT",
	"SECURITY",
	"SEEK",
	"SELECT",
	"SENER",
	"SERVICES",
	"SEVEN",
	"SEW",
	"SEX",
	"SEXY",
	"SFR",
	"SG",
	"SH",
	"SHARP",
	"SHAW",
	"SHELL",
	"SHIA",
	"SHIKSHA",
	"SHOES",
	"SHOUJI",
	"SHOW",
	"SHRIRAM",
	"SI",
	"SINA",
	"SINGLES",
	"SITE",
	"SJ",
	"SK",
	"SKI",
	"SKIN",
	"SKY",
	"SKYPE",
	"SL",
	"SM",
	"SMILE",
	"SN",
	"SNCF",
	"SO",
	"SOCCER",
	"SOCIAL",
	"SOFTBANK",
	"SOFTWARE",
	"SOHU",
	"SOLAR",
	"SOLUTIONS",
	"SONG",
	"SONY",
	"SOY",
	"SPACE",
	"SPIEGEL",
	"SPOT",
	"SPREADBETTING",
	"SR",
	"SRL",
	"ST",
	"STADA",
	"STAR",
	"STARHUB",
	"STATEBANK",
	"STATEFARM",
	"STATOIL",
	"STC",
	"STCGROUP",
	"STOCKHOLM",
	"STORAGE",
	"STORE",
	"STREAM",
	"STUDIO",
	"STUDY",
	"STYLE",
	"SU",
	"SUCKS",
	"SUPPLIES",
	"SUPPLY",
	"SUPPORT",
	"SURF",
	"SURGERY",
	"SUZUKI",
	"SV",
	"SWATCH",
	"SWISS",
	"SX",
	"SY",
	"SYDNEY",
	"SYMANTEC",
	"SYSTEMS",
	"SZ",
	"TAB",
	"TAIPEI",
	"TALK",
	"TAOBAO",
	"TATAMOTORS",
	"TATAR",
	"TATTOO",
	"TAX",
	"TAXI",
	"TC",
	"TCI",
	"TD",
	"TEAM",
	"TECH",
	"TECHNOLOGY",
	"TEL",
	"TELECITY",
	"TELEFONICA",
	"TEMASEK",
	"TENNIS",
	"TEVA",
	"TF",
	"TG",
	"TH",
	"THD",
	"THEATER",
	"THEATRE",
	"TICKETS",
	"TIENDA",
	"TIFFANY",
	"TIPS",
	"TIRES",
	"TIROL",
	"TJ",
	"TK",
	"TL",
	"TM",
	"TMALL",
	"TN",
	"TO",
	"TODAY",
	"TOKYO",
	"TOOLS",
	"TOP",
	"TORAY",
	"TOSHIBA",
	"TOTAL",
	"TOURS",
	"TOWN",
	"TOYOTA",
	"TOYS",
	"TR",
	"TRADE",
	"TRADING",
	"TRAINING",
	"TRAVEL",
	"TRAVELERS",
	"TRAVELERSINSURANCE",
	"TRUST",
	"TRV",
	"TT",
	"TUBE",
	"TUI",
	"TUNES",
	"TUSHU",
	"TV",
	"TVS",
	"TW",
	"TZ",
	"UA",
	"UBS",
	"UG",
	"UK",
	"UNICOM",
	"UNIVERSITY",
	"UNO",
	"UOL",
	"US",
	"UY",
	"UZ",
	"VA",
	"VACATIONS",
	"VANA",
	"VC",
	"VE",
	"VEGAS",
	"VENTURES",
	"VERISIGN",
	"VERSICHERUNG",
	"VET",
	"VG",
	"VI",
	"VIAJES",
	"VIDEO",
	"VIG",
	"VIKING",
	"VILLAS",
	"VIN",
	"VIP",
	"VIRGIN",
	"VISION",
	"VISTA",
	"VISTAPRINT",
	"VIVA",
	"VLAANDEREN",
	"VN",
	"VODKA",
	"VOLKSWAGEN",
	"VOTE",
	"VOTING",
	"VOTO",
	"VOYAGE",
	"VU",
	"VUELOS",
	"WALES",
	"WALTER",
	"WANG",
	"WANGGOU",
	"WARMAN",
	"WATCH",
	"WATCHES",
	"WEATHER",
	"WEATHERCHANNEL",
	"WEBCAM",
	"WEBER",
	"WEBSITE",
	"WED",
	"WEDDING",
	"WEIBO",
	"WEIR",
	"WF",
	"WHOSWHO",
	"WIEN",
	"WIKI",
	"WILLIAMHILL",
	"WIN",
	"WINDOWS",
	"WINE",
	"WME",
	"WOLTERSKLUWER",
	"WORK",
	"WORKS",
	"WORLD",
	"WS",
	"WTC",
	"WTF",
	"XBOX",
	"XEROX",
	"XIHUAN",
	"XIN",
	"XN--11B4C3D",
	"XN--1CK2E1B",
	"XN--1QQW23A",
	"XN--30RR7Y",
	"XN--3BST00M",
	"XN--3DS443G",
	"XN--3E0B707E",
	"XN--3PXU8K",
	"XN--42C2D9A",
	"XN--45BRJ9C",
	"XN--45Q11C",
	"XN--4GBRIM",
	"XN--55QW42G",
	"XN--55QX5D",
	"XN--5TZM5G",
	"XN--6FRZ82G",
	"XN--6QQ986B3XL",
	"XN--80ADXHKS",
	"XN--80AO21A",
	"XN--80ASEHDB",
	"XN--80ASWG",
	"XN--8Y0A063A",
	"XN--90A3AC",
	"XN--90AIS",
	"XN--9DBQ2A",
	"XN--9ET52U",
	"XN--9KRT00A",
	"XN--B4W605FERD",
	"XN--BCK1B9A5DRE4C",
	"XN--C1AVG",
	"XN--C2BR7G",
	"XN--CCK2B3B",
	"XN--CG4BKI",
	"XN--CLCHC0EA0B2G2A9GCD",
	"XN--CZR694B",
	"XN--CZRS0T",
	"XN--CZRU2D",
	"XN--D1ACJ3B",
	"XN--D1ALF",
	"XN--E1A4C",
	"XN--ECKVDTC9D",
	"XN--EFVY88H",
	"XN--ESTV75G",
	"XN--FCT429K",
	"XN--FHBEI",
	"XN--FIQ228C5HS",
	"XN--FIQ64B",
	"XN--FIQS8S",
	"XN--FIQZ9S",
	"XN--FJQ720A",
	"XN--FLW351E",
	"XN--FPCRJ9C3D",
	"XN--FZC2C9E2C",
	"XN--G2XX48C",
	"XN--GCKR3F0F",
	"XN--GECRJ9C",
	"XN--H2BRJ9C",
	"XN--HXT814E",
	"XN--I1B6B1A6A2E",
	"XN--IMR513N",
	"XN--IO0A7I",
	"XN--J1AEF",
	"XN--J1AMH",
	"XN--J6W193G",
	"XN--JLQ61U9W7B",
	"XN--JVR189M",
	"XN--KCRX77D1X4A",
	"XN--KPRW13D",
	"XN--KPRY57D",
	"XN--KPU716F",
	"XN--KPUT3I",
	"XN--L1ACC",
	"XN--LGBBAT1AD8J",
	"XN--MGB9AWBF",
	"XN--MGBA3A3EJT",
	"XN--MGBA3A4F16A",
	"XN--MGBA7C0BBN0A",
	"XN--MGBAAM7A8H",
	"XN--MGBAB2BD",
	"XN--MGBAYH7GPA",
	"XN--MGBB9FBPOB",
	"XN--MGBBH1A71E",
	"XN--MGBC0A9AZCG",
	"XN--MGBCA7DZDO",
	"XN--MGBERP4A5D4AR",
	"XN--MGBPL2FH",
	"XN--MGBT3DHD",
	"XN--MGBTX2B",
	"XN--MGBX4CD0AB",
	"XN--MIX891F",
	"XN--MK1BU44C",
	"XN--MXTQ1M",
	"XN--NGBC5AZD",
	"XN--NGBE9E0A",
	"XN--NODE",
	"XN--NQV7F",
	"XN--NQV7FS00EMA",
	"XN--NYQY26A",
	"XN--O3CW4H",
	"XN--OGBPF8FL",
	"XN--P1ACF",
	"XN--P1AI",
	"XN--PBT977C",
	"XN--PGBS0DH",
	"XN--PSSY2U",
	"XN--Q9JYB4C",
	"XN--QCKA1PMC",
	"XN--QXAM",
	"XN--RHQV96G",
	"XN--ROVU88B",
	"XN--S9BRJ9C",
	"XN--SES554G",
	"XN--T60B56A",
	"XN--TCKWE",
	"XN--UNUP4Y",
	"XN--VERMGENSBERATER-CTB",
	"XN--VERMGENSBERATUNG-PWB",
	"XN--VHQUV",
	"XN--VUQ861B",
	"XN--W4R85EL8FHU5DNRA",
	"XN--WGBH1C",
	"XN--WGBL6A",
	"XN--XHQ521B",
	"XN--XKC2AL3HYE2A",
	"XN--XKC2DL3A5EE0H",
	"XN--Y9A3AQ",
	"XN--YFRO4I67O",
	"XN--YGBI2AMMX",
	"XN--ZFR164B",
	"XPERIA",
	"XXX",
	"XYZ",
	"YACHTS",
	"YAHOO",
	"YAMAXUN",
	"YANDEX",
	"YE",
	"YODOBASHI",
	"YOGA",
	"YOKOHAMA",
	"YOU",
	"YOUTUBE",
	"YT",
	"YUN",
	"ZA",
	"ZARA",
	"ZERO",
	"ZIP",
	"ZM",
	"ZONE",
	"ZUERICH",
	"ZW",


	"AB.CA",
	"AC.AE",
	"AC.AT",
	"AC.CN",
	"AC.CR",
	"AC.CY",
	"AC.FJ",
	"AC.GG",
	"AC.ID",
	"AC.IL",
	"AC.IM",
	"AC.IN",
	"AC.JE",
	"AC.JP",
	"AC.KR",
	"AC.NZ",
	"AC.PA",
	"AC.TH",
	"AC.UG",
	"AC.UK",
	"AC.YU",
	"AC.ZA",
	"AD.JP",
	"AH.CN",
	"ALDERNEY.GG",
	"ALT.ZA",
	"ART.BR",
	"ART.DO",
	"ARTS.CO",
	"ARTS.VE",
	"ASN.AU",
	"ASN.LV",
	"BBS.TR",
	"BC.CA",
	"BIB.VE",
	"BJ.CN",
	"CO.AT",
	"CO.AO",
	"CO.CK",
	"CO.CR",
	"CO.GG",
	"CO.HU",
	"CO.ID",
	"CO.IL",
	"CO.IM",
	"CO.IN",
	"CO.JE",
	"CO.JP",
	"CO.KR",
	"COM.AR",
	"COM.AU",
	"COM.AZ",
	"COM.BB",
	"COM.BM",
	"COM.BR",
	"COM.BS",
	"COM.CN",
	"COM.CO",
	"COM.CU",
	"COM.CY",
	"COM.DO",
	"COM.EC",
	"COM.EG",
	"COM.FJ",
	"COM.GE",
	"COM.GU",
	"COM.HK",
	"COM.JO",
	"COM.KH",
	"COM.LA",
	"COM.LB",
	"COM.LC",
	"COM.LV",
	"COM.LY",
	"COM.MM",
	"COM.MO",
	"COM.MT",
	"COM.MX",
	"COM.MY",
	"COM.NA",
	"COM.NC",
	"COM.NI",
	"COM.NP",
	"COM.PA",
	"COM.PE",
	"COM.PH",
	"COM.PL",
	"COM.PY",
	"COM.RU",
	"COM.SG",
	"COM.SH",
	"COM.SY",
	"COM.TN",
	"COM.TR",
	"COM.TW",
	"COM.UA",
	"COM.UY",
	"COM.VE",
	"CONF.AU",
	"CONF.LV",
	"CO.NZ",
	"COOP",
	"CO.AE",
	"CO.SV",
	"CO.TH",
	"CO.UG",
	"CO.UK",
	"CO.VE",
	"CO.VI",
	"CO.YU",
	"CO.ZA",
	"CQ.CN",
	"CSIRO.AU",
	"ED.CR",
	"EDU.BM",
	"EDU.AR",
	"EDU.CN",
	"EDU.CO",
	"EDU.DO",
	"EDU.EC",
	"EDU.EG",
	"EDU.GE",
	"EDU.GU",
	"EDU.JO",
	"EDU.LC",
	"EDU.LV",
	"EDU.MM",
	"EDU.MO",
	"EDU.MY",
	"EDUNET.TN",
	"EDU.PA",
	"EDU.PY",
	"EDU.SG",
	"EDU.SH",
	"EDU.TR",
	"EDU.TW",
	"EDU.UY",
	"EDU.VE",
	"EDU.YU",
	"EDU.ZA",
	"ENS.TN",
	"ERNET.IN",
	"ESP.BR",
	"ETC.BR",
	"EUN.EG",
	"FI.CR",
	"FIN.EC",
	"FIN.TN",
	"FIRM.CO",
	"FIRM.VE",
	"G12.BR",
	"GD.CN",
	"GEN.NZ",
	"GOB.PA",
	"GO.CR",
	"GO.ID",
	"GO.KR",
	"GO.TH",
	"GO.UG",
	"GOV.AE",
	"GOV.AR",
	"GOV.AU",
	"GOV.BM",
	"GOV.BR",
	"GOV.CN",
	"GOV.CO",
	"GOV.CY",
	"GOV.DO",
	"GOV.EC",
	"GOV.EG",
	"GOVE.TW",
	"GOV.FJ",
	"GOV.GE",
	"GOV.GG",
	"GOV.GU",
	"GOV.IL",
	"GOV.IM",
	"GOV.IN",
	"GOV.JE",
	"GOV.JO",
	"GOV.JP",
	"GOV.LB",
	"GOV.LC",
	"GOV.LV",
	"GOV.MM",
	"GOV.MO",
	"GOV.MY",
	"GOV.SG",
	"GOV.SH",
	"GOV.TN",
	"GOVT.NZ",
	"GOV.TR",
	"GOV.UA",
	"GOV.UK",
	"GOV.VE",
	"GOV.ZA",
	"GS.CN",
	"GUERNSEY.GG",
	"GX.CN",
	"GZ.CN",
	"HB.CN",
	"HE.CN",
	"HI.CN",
	"HK.CN",
	"HL.CN",
	"HN.CN",
	"ID.AU",
	"ID.FJ",
	"ID.LV",
	"IND.BR",
	"IND.GG",
	"IND.JE",
	"IND.TN",
	"INF.BR",
	"INFO.AU",
	"INFO.CO",
	"INFO.HU",
	"INFO.TN",
	"INFO.VE",
	"INT.CO",
	"INTL.TN",
	"INT.VE",
	"JERSEY.JE",
	"JL.CN",
	"JS.CN",
	"K12.EC",
	"K12.IL",
	"K12.TR",
	"LKD.CO.IM",
	"LN.CN",
	"LTD.GG",
	"LTD.JE",
	"LTD.UK",
	"MB.CA",
	"MED.EC",
	"MIL.BR",
	"MIL.CO",
	"MIL.DO",
	"MIL.EC",
	"MIL.GE",
	"MIL.GU",
	"MIL.ID",
	"MIL.LB",
	"MIL.LV",
	"MIL.PH",
	"MIL.SH",
	"MIL.TR",
	"MIL.VE",
	"MIL.ZA",
	"MO.CN",
	"MOD.UK",
	"MUNI.IL",
	"MUSEUM",
	"NAME",
	"NAT.TN",
	"NB.CA",
	"NET.AR",
	"NET.AU",
	"NET.AZ",
	"NET.BB",
	"NET.BM",
	"NET.BR",
	"NET.BS",
	"NET.CN",
	"NET.CU",
	"NET.CY",
	"NET.DO",
	"NET.EC",
	"NET.EG",
	"NET.GE",
	"NET.GG",
	"NET.GU",
	"NET.HK",
	"NET.ID",
	"NET.IL",
	"NET.IM",
	"NET.IN",
	"NET.JE",
	"NET.JO",
	"NET.JP",
	"NET.KH",
	"NET.LA",
	"NET.LB",
	"NET.LC",
	"NET.LV",
	"NET.LY",
	"NET.MM",
	"NET.MO",
	"NET.MT",
	"NET.MX",
	"NET.MY",
	"NET.NA",
	"NET.NC",
	"NET.NP",
	"NET.NZ",
	"NET.PA",
	"NET.PE",
	"NET.PH",
	"NET.PL",
	"NET.PY",
	"NET.RU",
	"NET.SG",
	"NET.SH",
	"NET.SY",
	"NET.TH",
	"NET.TN",
	"NET.TR",
	"NET.TW",
	"NET.UA",
	"NET.UK",
	"NET.UY",
	"NET.VE",
	"NET.VI",
	"NET.ZA",
	"NF.CA",
	"NGO.PH",
	"NGO.ZA",
	"NHS.UK",
	"NIC.IM",
	"NIC.IN",
	"NM.CN",
	"NM.KR",
	"NOM.CO",
	"NOM.VE",
	"NOM.ZA",
	"NS.CA",
	"NSK.SU",
	"NT.CA",
	"NUI.HU",
	"NX.CN",
	"ON.CA",
	"OR.CR",
	"ORG.AE",
	"ORG.AR",
	"ORG.AU",
	"ORG.AZ",
	"ORG.BB",
	"ORG.BM",
	"ORG.BR",
	"ORG.BS",
	"ORG.CN",
	"ORG.CO",
	"ORG.CU",
	"ORG.CY",
	"ORG.DO",
	"ORG.EC",
	"ORG.EG",
	"ORG.FJ",
	"ORG.GE",
	"ORG.GG",
	"ORG.GU",
	"ORG.HK",
	"ORG.HU",
	"ORG.IL",
	"ORG.IM",
	"ORG.JE",
	"ORG.JP",
	"ORG.KH",
	"ORG.LA",
	"ORG.LB",
	"ORG.LC",
	"ORG.LV",
	"ORG.LY",
	"ORG.MM",
	"ORG.MO",
	"ORG.MT",
	"ORG.MX",
	"ORG.MY",
	"ORG.NA",
	"ORG.NC",
	"ORG.NZ",
	"ORG.PA",
	"ORG.PE",
	"ORG.PH",
	"ORG.PL",
	"ORG.PY",
	"ORG.RU",
	"ORG.SG",
	"ORG.SH",
	"ORG.SY",
	"ORG.TN",
	"ORG.TR",
	"ORG.TW",
	"ORG.UK",
	"ORG.UY",
	"ORG.VE",
	"ORG.VI",
	"ORG.YU",
	"ORG.ZA",
	"OR.ID",
	"OR.KR",
	"OR.TH",
	"ORT.NP",
	"OR.UG",
	"OZ.AU",
	"PE.CA",
	"PLC.CO.IM",
	"PLC.UK",
	"POLICE.UK",
	"PRIV.HU",
	"PSI.BR",
	"PVT.GE",
	"QC.CA",
	"QH.CN",
	"REC.BR",
	"REC.CO",
	"REC.VE",
	"RE.KR",
	"RES.IN",
	"RNRT.TN",
	"RNS.TN",
	"RNU.TN",
	"SA.CR",
	"SARK.GG",
	"SC.CN",
	"SCH.GG",
	"SCH.JE",
	"SCHOOL.FJ",
	"SCHOOL.ZA",
	"SCH.UK",
	"SCI.EG",
	"SH.CN",
	"SK.CA",
	"SLD.PA",
	"SN.CN",
	"STORE.CO",
	"STORE.VE",
	"SX.CN",
	"TEC.VE",
	"TELEMEMO.AU",
	"TJ.CN",
	"TM.HU",
	"TMP.BR",
	"TM.ZA",
	"TOURISM.TN",
	"TW.CN",
	"WEB.CO",
	"WEB.DO",
	"WEB.VE",
	"WEB.ZA",
	"XJ.CN",
	"XZ.CN",
	"YK.CA",
	"YN.CN",
	"ZJ.CN"
};

	if ( ! s_isInitialized ) {
		// set up the hash table
		if ( ! s_table.set ( 8 , 0, sizeof(s_tlds)*2,NULL,0,false,0,
				     "tldtbl") ) 
			return log("build: Could not init table of TLDs.");
		// now add in all the TLDs
		int32_t n = (int32_t)sizeof(s_tlds)/ sizeof(char *); 
		for ( int32_t i = 0 ; i < n ; i++ ) {
			const char      *d    = s_tlds[i];
			int32_t       dlen = gbstrlen ( d );
			int64_t  dh   = hash64Lower_a ( d , dlen );
			if ( ! s_table.addKey (&dh,NULL) )
				return log("build: dom table failed");
		}
		s_isInitialized = true;
	} 
	int64_t h = hash64Lower_a ( tld , tldLen ); // gbstrlen(tld));
	return s_table.isInTable ( &h );//getScoreFromTermId ( h );
}
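A self-contained sketch of the same two-level TLD check with standard containers (illustrative subset of the table above; hypothetical names): zero dots passes outright, two or more dots fails, and exactly one dot must match a known two-level suffix.

#include <cctype>
#include <string>
#include <unordered_set>

static bool isTldSketch ( const std::string &tld ) {
	static const std::unordered_set<std::string> s_twoLevel = {
		"co.uk", "com.au", "ac.jp" // ... subset of s_tlds[] above
	};
	int pcount = 0;
	for ( char c : tld ) {
		if ( c == '.' ) { pcount++; continue; }
		if ( ! isalnum((unsigned char)c) && c != '-' ) return false;
	}
	if ( pcount == 0 ) return true;
	if ( pcount >= 2 ) return false;
	std::string lower;
	for ( char c : tld ) lower += (char)tolower((unsigned char)c);
	return s_twoLevel.count ( lower ) != 0;
}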