// . clears the file-level s_init flag if it is set
// . always returns true
// NOTE(review): no table is actually set up in the visible body; the
// initialization code and the closing brace appear to be missing from
// this excerpt
static bool initProxyTables() {
	// initialize proxy/urlip ban table?
	// nothing to do if the flag is already clear
	if ( ! s_init ) return true;
	s_init = false;
	return true;
// . sets the key magic offset on s_iptab and rebuilds the proxy table
// . returns whatever buildProxyTable() returns
// . the table reset() calls below are commented out, so nothing is
//   explicitly cleared here
bool resetProxyStats ( ) {
	// s_proxyBannedTable.reset();
	// s_banCountTable.reset();
	// s_iptab.reset();
	// skip port part of key magic, and get LSB of the IP as key magic
	s_iptab.m_maskKeyOffset = 5;
	return buildProxyTable();
// Slightly modified from getTextEntity
int16_t get_iana_charset(const char *cs, int len)
    if (!s_isInitialized){
	// set up the hash table
	if ( ! s_table.set ( 8,4,4096,NULL,0,false,"ianatbl") ) {
		log(LOG_WARN, "build: Could not init table of IANA Charsets.");
		return csUnknown;
	// now add in all the charset entries
	int32_t n = (int32_t)sizeof(s_charsets) / (int32_t)sizeof(IANACharset);
	// turn off quickpolling
	char saved = g_conf.m_useQuickpoll;
	g_conf.m_useQuickpoll = false;
	for ( int32_t i = 0 ; i < n ; i++ ) {
	    int64_t h = hash64Lower_a ( s_charsets[i].name, strlen(s_charsets[i].name) );
	    // store the charset index in the hash table as score
		if ( ! s_table.addTerm(h, i+1) ) {
			log(LOG_WARN, "build: add term failed");
			return csUnknown;
	g_conf.m_useQuickpoll = saved;
	s_isInitialized = true;
    int64_t h = hash64Lower_a ( cs , len );
    // get the entity index from table (stored in the score field)
    int32_t i = (int32_t) s_table.getScore(h);
    // return 0 if no match
    if ( i == 0 ) return csUnknown;
    // return the iso character
    return (int16_t)s_charsets[i-1].mib_enum;
// Slightly modified from getTextEntity
short get_iana_charset(char *cs, int len)
    if (!s_isInitialized){
	// set up the hash table
	if ( ! s_table.set ( 8,4,4096,NULL,0,false,0,"ianatbl") )
	    return log("build: Could not init table of "
		       "IANA Charsets.");
	// now add in all the charset entries
	long n = (long)sizeof(s_charsets) / (long)sizeof(IANACharset);
	// turn off quickpolling
	char saved = g_conf.m_useQuickpoll;
	g_conf.m_useQuickpoll = false;
	for ( long i = 0 ; i < n ; i++ ) {
	    long long h = hash64Lower_a ( s_charsets[i].name, strlen(s_charsets[i].name) );
	    // store the charset index in the hash table as score
		if ( ! s_table.addTerm(&h, i+1) ) 
		return log("build: add term failed");
	g_conf.m_useQuickpoll = saved;
	s_isInitialized = true;
    long long h = hash64Lower_a ( cs , len );
    // get the entity index from table (stored in the score field)
    long i = (long) s_table.getScore ( &h );
    // return 0 if no match
    if ( i == 0 ) return csUnknown;
    // return the iso character
    return (short)s_charsets[i-1].mib_enum;
// Example no. 5
// . init s_mimeTable in this call
// . called from HttpServer::init
// . returns false and sets g_errno on error
// . s_ext[] is read as (extension, content type) string pairs; each
//   extension is hashed with hash32n() and mapped to its content type
// . after filling the table every pair is looked up again as a self test
bool HttpMime::init ( ) {
	// only need to call once
	if ( s_init ) return true;
	// make sure only called once. note that this is set before the table
	// is built, so a failed init is not retried on a later call.
	s_init = true;
	//s_mimeTable.set ( 256 );
	if ( ! s_mimeTable.set(4,sizeof(char *),256,NULL,0,false,1,"mimetbl"))
		return false;
	// number of strings in s_ext[], two per table entry
	const uint32_t numStrings = sizeof(s_ext)/sizeof(char *);
	// set table from internal list. "i + 1 <" keeps us from reading past
	// the end if the list ever ends in an unpaired extension.
	for ( uint32_t i = 0 ; i + 1 < numStrings ; i+=2 ) {
		int32_t key = hash32n ( s_ext[i] );
		if ( ! s_mimeTable.addKey ( &key , &s_ext[i+1] ) )
			return log("HttpMime::init: failed to set table.");
	}
	// quick test
	const char *tt = getContentTypeFromExtension ( "zip" );
	if ( strcmp(tt,"application/zip") != 0 ) {
		g_errno = EBADENGINEER;
		return log("http: Failed to init mime table correctly.");
	}
	// a more thorough test
	for ( uint32_t i = 0 ; i + 1 < numStrings ; i+=2) {
		tt = getContentTypeFromExtension ( s_ext[i] );
		if ( strcmp(tt,s_ext[i+1]) == 0 ) continue;
		g_errno = EBADENGINEER;
		// "i" is unsigned so print it with the unsigned format macro
		return log("http: Failed to do mime table correctly. i=%" PRIu32,i);
	}

	// TODO: set it from a user supplied file here
	return true;
}
// . looks up the word s[0..len) in the dirty word table and returns its
//   score converted to bool
// . the table is filled from the static s_dirty[] list on the first call;
//   returns false if the table can not be set up or filled
// NOTE(review): the s_dirty[] initializer and the end of the
// s_dtable.set() call are not present in this excerpt
// NOTE(review): entries are added with hash64b() but looked up with
// hash64Lower_a() -- confirm these agree for the words in the list
bool AdultBit::isDirty ( char *s , int32_t len ) {

	static bool       s_isInitialized = false;
	static char      *s_dirty[] = {

	if ( ! s_isInitialized ) {
		// set up the hash table
		if ( ! s_dtable.set ( 8,4,sizeof(s_dirty  )*2,NULL,0,false,0,
			return log("build: Error initializing "
				    "dirty word hash table." );
		// now add in all the dirty words
		int32_t n = (int32_t)sizeof(s_dirty)/ sizeof(char *); 
		for ( int32_t i = 0 ; i < n ; i++ ) {
			int64_t h = hash64b ( s_dirty  [i] );
			// the score stored is the list index plus one
			if ( ! s_dtable.addTerm (&h, i+1) ) return false;
		s_isInitialized = true;

	// compute the hash of the word "s"
	int64_t h = hash64Lower_a ( s , len );

	// get from table
	return s_dtable.getScore ( &h );
// call this at startup to register the handlers
// . registers handleRequest54 as the handler for msg_type_54
// . sets the s_iptab key magic offset and builds the proxy table
// . registers resetProxyStatWrapper as a sleep callback (3600000 ms)
// . sets up s_loadTable, but only on the first call
// . returns false if the handler can not be registered, otherwise the
//   result of s_loadTable.set() (or true if it was already set)
// NOTE(review): the body of the registerSleepCallback() failure branch and
// part of the s_loadTable.set() argument list are not present in this
// excerpt
bool initSpiderProxyStuff() {
	// do this for all hosts in case host #0 goes dead, then everyone
	// will, according to Msg13.cpp, send to host #1, the next in line
	// if she is alive
	//if ( g_hostdb.m_myHostId != 0 ) return true;

	// only host #0 has handlers
	if ( ! g_udpServer.registerHandler ( msg_type_54, handleRequest54 ))
		return false;

	// key is ip/port
	// skip port part of key magic, and get LSB of the IP as key magic
	s_iptab.m_maskKeyOffset = 5;


	// build the s_iptab hashtable for the first time
	// (the return value is not checked here)
	buildProxyTable ();

	// reset spider proxy stats every hour to alleviate false positives (moved from Process.cpp)
	if (!g_loop.registerSleepCallback(3600000, NULL, resetProxyStatWrapper, 0)) {

	// make the loadtable hashtable
	// s_flag makes sure s_loadTable is only set once
	static bool s_flag = 0;
	if ( s_flag ) return true;
	s_flag = true;
	return s_loadTable.set(4,
			       // this slows us down
			       true, // allow dups?
			       true); // use key magic to mix things up

// Example no. 8
// . builds s_table from s_entities[] on the first call
// . for every entity this also fills in its utf8 encoding and utf8Len
// . each entity name is hashed with hash64b() and mapped to its index in
//   s_entities[] plus one
// . returns false if the table can not be set up or a term can not be
//   added, true otherwise
// . cores on purpose (NULL write) if an entity has a zero code point,
//   encodes to zero bytes or is listed twice
static bool initEntityTable(){
	if ( ! s_isInitialized ) {
		// set up the hash table
		if ( ! s_table.set ( 8,4,255,NULL,0,false,0,"enttbl" ) )
			return log("build: Could not init table of "
					   "HTML entities.");
		// now add in all the entities
		int32_t n = (int32_t)sizeof(s_entities) / (int32_t)sizeof(Entity);
		for ( int32_t i = 0 ; i < n ; i++ ) {
			int64_t h = hash64b ( s_entities[i].entity );

			// grab the unicode code point
			UChar32 up = s_entities[i].unicode;

			// every entity must have a code point
			if ( ! up ) { char *xx=NULL;*xx=0; }

			// point to it
			char *buf = (char *)s_entities[i].utf8;

			// set the utf8 from the code point
			int32_t len = utf8Encode(up,buf);

			// make my own mods to make parsing easier

			if ( up == 160 ) {  // nbsp
				// treat a non-breaking space as a plain space
				buf[0] = ' ';
				len = 1;
			}

			// end custom mods

			// set length
			s_entities[i].utf8Len = len;
			// check it
			if ( len == 0 ) { char *xx=NULL;*xx=0; }
			// must not exist!
			if ( s_table.isInTable(&h) ) { char*xx=NULL;*xx=0;}
			// store the entity index in the hash table as score
			if ( ! s_table.addTerm ( &h, i+1 ) ) return false;
		}
		s_isInitialized = true;
	}
	return true;
}
// . looks up the word s[0..len) in the obscene word table and returns its
//   score converted to bool
// . the table is filled from the static s_obscene[] list on the first
//   call; returns false if the table can not be set up or filled
// NOTE(review): the s_obscene[] initializer is not present in this excerpt
// NOTE(review): entries are added with hash64b() but looked up with
// hash64Lower_a() -- confirm these agree for the words in the list
bool AdultBit::isObscene ( char *s , int32_t len ) {

	static bool       s_isInitialized = false;
	static char      *s_obscene[] = {
//		"cum",    magna cum laude

	if ( ! s_isInitialized ) {
		// set up the hash table
		if ( ! s_otable.set ( 8,4,sizeof(s_obscene)*2,NULL,0,false,0,
				      "obscenetab") ) 
			return log("build: Error initializing "
				    "obscene word hash table." );
		// now add in all the obscene words
		int32_t n = sizeof(s_obscene) / sizeof(char *);
		for ( int32_t i = 0 ; i < n ; i++ ) {
			int64_t h = hash64b ( s_obscene[i] );
			// the score stored is the list index plus one
			if ( ! s_otable.addTerm ( &h, i+1 ) ) return false;
		s_isInitialized = true;

	// compute the hash of the word "s"
	int64_t h = hash64Lower_a ( s , len );

	// get from table
	return s_otable.getScore ( &h );
// . returns the m_nodeId of the g_nodes[] entry whose name matches the
//   tag name that "s" starts with, or 0 if there is none
// . the tag name is the leading run of alnum, '-' and '_' chars of "s"
// . names are hashed with hash64Upper_a()
// . if "retp" is not NULL, *retp is set to the matching NodeType, or to
//   NULL if there is no match
// . the lookup table is built from g_nodes[] on the first call
nodeid_t getTagId ( char *s , NodeType **retp ) {

	// init table?
	static bool s_init = false;
	static HashTableX  s_ht;
	static char s_buf[10000];
	if ( ! s_init ) {
		// set this first because the sanity test below re-enters us
		s_init = true;
		// 4 byte keys and 4 byte values
		s_ht.set ( 4 ,4,1024,s_buf,10000,false,0,"tagids");//niceness=0
		// how many NodeTypes do we have in g_nodes?
		static int32_t nn = sizeof(g_nodes) / sizeof(NodeType);
		// set the hash table
		for ( int32_t i = 0 ; i < nn ; i++ ) {
			char *name = g_nodes[i].m_nodeName;
			int32_t  nlen = gbstrlen(name);
			int64_t h = hash64Upper_a ( name,nlen,0LL );
			// store the index into g_nodes[] rather than a
			// NodeType ptr. the value slot is only 4 bytes so a
			// ptr would get truncated where ptrs are 8 bytes.
			int32_t idx = i;
			if ( ! s_ht.addKey(&h,&idx) ) {
				char *xx=NULL;*xx=0; }
		}
		// sanity
		if ( s_ht.m_numSlots != 1024 ) { char *xx=NULL;*xx=0; }
		// sanity test
		nodeid_t tt = getTagId ( "br" );
		if ( tt != TAG_BR ) { char *xx=NULL;*xx=0; }
	}

	// assume none
	if ( retp ) *retp = NULL;
	// no string means no tag
	if ( ! s ) return 0;
	// find end of tag name. hyphens are ok to be in name.
	// facebook uses underscores like <start_time>
	char *e = s; for ( ; *e && (is_alnum_a(*e) || *e=='-'|| *e=='_'); e++);
	// hash it for lookup
	int64_t h = hash64Upper_a ( s , e - s , 0 );
	// look it up
	int32_t *ip = (int32_t *)s_ht.getValue(&h);
	// none?
	if ( ! ip ) return 0;
	// got one
	NodeType *nt = &g_nodes[*ip];
	if ( retp ) *retp = nt;
	// get id otherwise
	return nt->m_nodeId;
}
// . how many keys are dups
// . returns -1 on error
// . returns 0 right away if this table does not allow dups
// . works by adding every key in a used slot to a temporary table and
//   then comparing the number of used slots in both tables
long HashTableX::getNumDups() {
	if ( ! m_allowDups ) return 0;
	// same key size as us, but no data and (6th arg false) no dups
	HashTableX tmp;
	if ( ! tmp.set ( m_ks, 0, m_numSlots, NULL , 0 , false , m_niceness,
			 "htxtmp") )
		return -1;
	// put into that table
	for ( long i = 0 ; i < m_numSlots ; i++ ) {
		// skip empty bucket
		if ( ! m_flags[i] ) continue;
		// get the key
		char *kp = (char *)getKeyFromSlot(i);
		// add to new table
		if ( ! tmp.addKey ( kp ) ) return -1;
	}
	// the uniques
	long uniques = tmp.m_numSlotsUsed;
	// the dups
	long dups = m_numSlotsUsed - uniques;
	// that's it
	return dups;
}
// get the id from a 2 character country code
// . returns the index of "cc" in s_countryCode[]
// . returns 0 if "cc" is NULL, shorter than 2 chars or not in the list.
//   note that 0 is also the index of the first entry in the list.
// . only the first two chars of "cc" are used and they are compared
//   byte for byte, no case folding is done here
// . the lookup table is built from s_countryCode[] on the first call
uint8_t getCountryId ( char *cc ) {
	static bool s_init = false;
	static char buf[2000];
	static HashTableX ht;
	// the 4 byte table key: the two chars of the code then two zeroes
	char tmp[4];
	if ( ! s_init ) {
		s_init = true;
		// hash them up
		ht.set ( 4 , 1 , -1,buf,2000,false,MAX_NICENESS,"ctryids");
		// now add in all the country codes
		long n = (long) sizeof(s_countryCode) / sizeof(char *);
		for ( long i = 0 ; i < n ; i++ ) {
			char *s    = (char *)s_countryCode[i];
			//long  slen = gbstrlen ( s );
			// sanity check, must be exactly two chars
			if ( !s[0] || !s[1] || s[2]) { char *xx=NULL;*xx=0; }
			// map it to a 4 byte key
			tmp[0] = s[0];
			tmp[1] = s[1];
			tmp[2] = 0;
			tmp[3] = 0;
			// a val of 0 does not mean empty in HashTableX,
			// that is an artifact of HashTableT
			uint8_t val = i; // +1;
			if ( ! ht.addKey ( tmp , &val ) ) {
				char *xx=NULL;*xx=0; }
		}
	}
	// need two chars to look up
	if ( ! cc || ! cc[0] || ! cc[1] ) return 0;
	// make the key the same way we did when filling the table
	tmp[0] = cc[0];
	tmp[1] = cc[1];
	tmp[2] = 0;
	tmp[3] = 0;
	// lookup
	long slot = ht.getSlot ( tmp );
	if ( slot < 0 ) return 0;
	void *val = ht.getValueFromSlot ( slot );
	return *(uint8_t *)val ;
}
// . builds s_table from s_entities[] on the first call
// . for every entity this also writes the utf8 encoding of its code
//   points into its utf8 buffer and sets utf8Len
// . each entity name is hashed with hash64b() and mapped to its index in
//   s_entities[] plus one
// . returns false if the table can not be set up or a term can not be
//   added, true otherwise
// . calls g_process.shutdownAbort(true) if a code point encodes to zero
//   bytes or an entity is listed twice
static bool initEntityTable(){
	if ( ! s_isInitialized ) {
		// set up the hash table
		if ( ! s_table.set ( 8,4,4096,NULL,0,false,"enttbl" ) ) {
			log("build: Could not init table of HTML entities.");
			return false;
		}

		// now add in all the html entities
		const int32_t n = (int32_t)sizeof(s_entities) / (int32_t)sizeof(Entity);
		for ( int32_t i = 0 ; i < n ; i++ ) {
			int64_t h = hash64b ( s_entities[i].entity );

			// convert the unicode codepoints to an utf8 string
			char *buf = (char *)s_entities[i].utf8;
			for(int j=0; j<s_entities[i].codepoints; j++) {
				UChar32 codepoint = s_entities[i].codepoint[j];
				int32_t len = utf8Encode(codepoint,buf);
				if ( len == 0 ) { g_process.shutdownAbort(true); }
				// make modification to make parsing easier
				if ( codepoint == 160 ) {  // nbsp
					// treat a non-breaking space as a plain space
					buf[0] = ' ';
					len = 1;
				}
				// next code point goes right after this one
				buf += len;
			}
			// total bytes written for this entity
			s_entities[i].utf8Len = (size_t)(buf-s_entities[i].utf8);
			// must not exist!
			if ( s_table.isInTable(&h) ) { g_process.shutdownAbort(true);}
			// store the entity index in the hash table as score
			if ( ! s_table.addTerm(h, i+1) ) return false;
		}
		s_isInitialized = true;
	}
	return true;
}
// . sets LD_RUN_PATH to "<path>/lib", loads the unicode tables
//   (uppermap, lowermap, properties, combiningclass, scripts) from
//   "<path>/ucdata/", calls loadDecompTables(path) and sets up s_convTable
// . "path" defaults to "./" if NULL
// . if "verifyFiles" is true this also calls openIconvDescriptors()
// . returns true on success, otherwise the result of log(LOG_WARN,...)
// NOTE(review): the trailing arguments of the loadUnicodeTable() calls,
// the rest of the loadDecompTables() condition, several closing braces and
// the "failed:" label that the gotos jump to are not present in this
// excerpt
// NOTE(review): "file" is 384 bytes and is filled with strcpy()/strcat()
// with no check on the length of "path"
bool ucInit(char *path, bool verifyFiles){

	char file[384];
	if (path == NULL) path = "./";

	// Might want to move this out of ucInit someday
	// but right now it's the only thing that uses .so files (?)
	char gbLibDir[512];
	snprintf(gbLibDir, 512, "%s/lib",path);
	// i don't think this is used any more because we don't have it!
	//log(LOG_INIT, "ucinit: Setting LD_RUN_PATH to \"%s\"",gbLibDir);
	// a failure here is only logged, it does not abort the init
	if (setenv("LD_RUN_PATH", gbLibDir, 1)){
		log(LOG_INIT, "Failed to set LD_RUN_PATH");
	//char *ldpath = getenv("LD_RUN_PATH");
	// i don't think this is used any more because we don't have it!
	//log(LOG_DEBUG, "ucinit: LD_RUN_PATH: %s\n", ldpath);

	// each table below is loaded from its own file under <path>/ucdata/
	strcpy(file, path);
	strcat(file, "/ucdata/uppermap.dat");
	if (!loadUnicodeTable(&g_ucUpperMap,file, 
		goto failed;
	strcpy(file, path);
	strcat(file, "/ucdata/lowermap.dat");
	if (!loadUnicodeTable(&g_ucLowerMap,file, 
		goto failed;
	strcpy(file, path);
	strcat(file, "/ucdata/properties.dat");
	if (!loadUnicodeTable(&g_ucProps, file, 
		goto failed;
	strcpy(file, path);
	strcat(file, "/ucdata/combiningclass.dat");
	if (!loadUnicodeTable(&g_ucCombiningClass, file, 
		goto failed;
	strcpy(file, path);
	strcat(file, "/ucdata/scripts.dat");
	if (!loadUnicodeTable(&g_ucScripts, file, 
		goto failed;
	// MDW: do we need this for converting from X to utf8? or for
	// the is_alnum(), etc. functions?
	if (!loadDecompTables(path) ||
		goto failed;
	// 4 byte keys, one iconv_t per value
	if ( ! s_convTable.set(4,sizeof(iconv_t),1024,NULL,0,false,0,"cnvtbl"))
		goto failed;
	// dont use these files anymore
	if (verifyFiles){
		if (!openIconvDescriptors())
			return log(LOG_WARN,
				   "uni: unable to open all iconv descriptors");

	return true;
	return log(LOG_WARN, 
		   "uni: unable to load all property tables");
// Example no. 15
// . returns true if tld[0..tldLen) looks like a top level domain
// . returns false if it has a char that is not alnum, '-' or '.'
// . with no period in it, it is accepted as is
// . with two or more periods it is rejected
// . with exactly one period it must be in the s_tlds[] list, which is
//   loaded into s_table on the first such call
// NOTE(review): the s_tlds[] initializer, the declaration of s_table and
// several closing braces are not present in this excerpt
static bool isTLD ( char *tld , int32_t tldLen ) {

	int32_t pcount = 0;
	// now they are random!
	for ( int32_t i = 0 ; i < tldLen ; i++ ) {
		// period count
		if ( tld[i] == '.' ) { pcount++; continue; }
		if ( ! is_alnum_a(tld[i]) && tld[i] != '-' ) return false;

	if ( pcount == 0 ) return true;
	if ( pcount >= 2 ) return false;

	// otherwise, if one period, check table to see if qualified

	// we use this as our hashtable
	static bool       s_isInitialized = false;
	// . i shrunk this list a lot
	// . see backups for the old list
	static const char * const s_tlds[] = {

	  // From:


	if ( ! s_isInitialized ) {
		// set up the hash table
		// 8 byte keys and no data, we only test for membership
		if ( ! s_table.set ( 8 , 0, sizeof(s_tlds)*2,NULL,0,false,0,
				     "tldtbl") ) 
			return log("build: Could not init table of TLDs.");
		// now add in all the tlds
		int32_t n = (int32_t)sizeof(s_tlds)/ sizeof(char *); 
		for ( int32_t i = 0 ; i < n ; i++ ) {
			const char      *d    = s_tlds[i];
			int32_t       dlen = gbstrlen ( d );
			int64_t  dh   = hash64Lower_a ( d , dlen );
			if ( ! s_table.addKey (&dh,NULL) )
				return log("build: dom table failed");
		s_isInitialized = true;
	int64_t h = hash64Lower_a ( tld , tldLen ); // gbstrlen(tld));
	return s_table.isInTable ( &h );//getScoreFromTermId ( h );
// . this returns false if blocks, true otherwise
// . sets g_errno on failure
// . takes the docids from m_msg3a, skipping the first m_startNum of them
// . for each docid not seen before it makes a SpiderRequest whose url is
//   the docid printed in decimal and appends it to m_sb
// . turns on g_conf.m_spideringEnabled if there is at least one docid
// NOTE(review): the setup of the "dt" dedup table (which presumably uses
// "dbuf"), the log call around the "Query reindex size" message, several
// closing braces and the rest of the function after the loop are not
// present in this excerpt
bool Msg1c::gotList ( ) {

	if ( g_errno ) return true;

	int64_t *tmpDocIds = m_msg3a.getDocIds();
	int32_t       numDocIds = m_msg3a.getNumDocIds();

	// skip over the first m_startNum docids
	if ( m_startNum > 0) {
		numDocIds -= m_startNum;
		tmpDocIds = &tmpDocIds[m_startNum];

	m_numDocIds = numDocIds; // save for reporting
	// log it
	log(LOG_INFO,"admin: Got %" PRId32" docIds for query reindex.", numDocIds);
	// bail if no need
	if ( numDocIds <= 0 ) return true;

	// force spiders on on entire network. they will propagate from 
	// host #0... 
	g_conf.m_spideringEnabled = true;

	int32_t nowGlobal = getTimeGlobal();

	HashTableX dt;
	char dbuf[1024];


	State13 *st = (State13 *)m_state;
	GigablastRequest *gr = &st->m_gr;

	m_numDocIdsAdded = 0;

	// list consists of docIds, loop through each one
 	for(int32_t i = 0; i < numDocIds; i++) {
		int64_t docId = tmpDocIds[i];
		// when searching events we get multiple docids that are same
		if ( dt.isInTable ( &docId ) ) continue;
		// add it
		if ( ! dt.addKey ( &docId ) ) return true;

		SpiderRequest sr;

		// url is a docid!
		sprintf ( sr.m_url , "%" PRIu64 , docId );

		// make a fake first ip
		// use only 64k values so we don't stress doledb/waittrees/etc.
		// for large #'s of docids
		int32_t firstIp = (docId & 0x0000ffff);

		// bits 6-13 of the docid are the domain hash so use those
		// when doing a REINDEX (not delete!) to ensure that requests
		// on the same domain go to the same shard, at least when
		// we have up to 256 shards. if we have more than 256 shards
		// at this point some shards will not participate in the
		// query reindex/delete process because of this, so 
		// we'll want to allow more bits in in that case perhaps.
		// check out Hostdb::getShardNum(RDB_SPIDERDB) in Hostdb.cpp
		// to see what shard is responsible for storing and indexing 
		// this SpiderRequest based on the firstIp.
		if ( ! m_forceDel ) { 
			// if we are a REINDEX not a delete because 
			// deletes don't need to spider/redownload the doc
			// so the distribution can be more random
			firstIp >>= 6;
			firstIp &= 0xff;

		// 0 is not a legit val. it'll core below.
		if ( firstIp == 0 ) {
			firstIp = 1;

		// use a fake ip
		sr.m_firstIp        =  firstIp;
		// we are not really injecting...
		sr.m_isInjecting    =  false;//true;
		sr.m_hopCount       = -1;
		sr.m_isPageReindex  =  1;
		sr.m_urlIsDocId     =  1;
		sr.m_fakeFirstIp    =  1;

		// now you can recycle content instead of re-downloading it
		// for every docid
		sr.m_recycleContent = gr->m_recycleContent;
		// if this is zero we end up getting deduped in
		// dedupSpiderList() if there was a SpiderReply whose
		// spider time was > 0
		sr.m_addedTime = nowGlobal;
	    sr.m_forceDelete = m_forceDel ? 1 : 0;

		// . complete its m_key member
		// . parentDocId is used to make the key, but only allow one
		//   page reindex spider request per url... so use "0"
		// . this will set "uh48" to hash64b(m_url) which is the docid
		sr.setKey( firstIp, 0LL , false );

		// how big to serialize
		int32_t recSize = sr.getRecSize();

		// store it
		if ( ! m_sb.safeMemcpy ( (char *)&sr , recSize ) ) {
			// g_errno must be set
			if ( ! g_errno ) { g_process.shutdownAbort(true); }

			    "admin: Query reindex size of %" PRId32" "
			    "too big. Aborting. Bad engineer." , 
			    (int32_t)0);//m_list.getListSize() );
			return true;
// langId is language of the query
// . returns a 64-bit hash of the query "qstr" that does not depend on the
//   order of its words
// . each word is represented by the smallest id among its own word id and
//   the ids of its synonyms, and those ids are xor'ed together
// . words flagged by isStopWord() or isCommonQueryWordInEnglish() are
//   left out of the hash
// . langUnknown is treated as langEnglish
// . returns false (0) if the dedup table can not be set up or added to
// NOTE(review): the "gotdup" label that the goto below jumps to and
// several closing braces are not present in this excerpt
long long getSynBaseHash64 ( char *qstr , uint8_t langId ) {
	Words ww;
	ww.set3 ( qstr );
	long nw = ww.getNumWords();
	long long *wids = ww.getWordIds();
	//char **wptrs = ww.getWords();
	//long *wlens = ww.getWordLens();
	long long baseHash64 = 0LL;
	Synonyms syn;
	// assume english if unknown to fix 'pandora's tower'
	// vs 'pandoras tower' where both words are in both
	// english and german so langid is unknown
	if ( langId == langUnknown ) langId = langEnglish;
	// . store re-written query into here then hash that string
	// . this way we can get rid of spaces
	//char rebuf[1024];
	//char *p = rebuf;
	//if ( strstr(qstr,"cheatcodes") )
	//	log("hey");
	// for deduping
	HashTableX dups;
	if ( ! dups.set ( 8,0,1024,NULL,0,false,0,"qhddup") ) return false;
	// scan the words
	for ( long i = 0 ; i < nw ; i++ ) {
		// skip if not alnum
		if ( ! wids[i] ) continue;
		// get its synonyms into tmpBuf
		char tmpBuf[TMPSYNBUFSIZE];
		// . assume niceness of 0 for now
		// . make sure to get all synsets!! ('love' has two synsets)
		long naids = syn.getSynonyms (&ww,i,langId,tmpBuf,0);
		// term freq algo
		//long pop = g_speller.getPhrasePopularity(NULL,
		//					 wids[i],
		//					 true,
		//					 langId);
		// is it a queryStopWord like "the" or "and"?
		bool isQueryStop = ::isQueryStopWord(NULL,0,wids[i]);
		// a more restrictive list
		bool isStop = ::isStopWord(NULL,0,wids[i]);
		if ( ::isCommonQueryWordInEnglish(wids[i]) ) isStop = true;
		// find the smallest one
		unsigned long long min = wids[i];
		//char *minWordPtr = wptrs[i];
		//long  minWordLen = wlens[i];
		// declare up here since we have a goto below
		long j;
		// add to table too
		if ( dups.isInTable ( &min ) ) goto gotdup;
		// add to it
		if ( ! dups.addKey ( &min ) ) return false;
		// now scan the synonyms, they do not include "min" in them
		for ( j = 0 ; j < naids ; j++ ) {
			// get it
			unsigned long long aid64;
			aid64 = (unsigned long long)syn.m_aids[j];
			// if any syn already hashed then skip it and count
			// as a repeated term. we have to do it this way
			// rather than just getting the minimum synonym 
			// word id, because 'love' has two synsets and
			// 'like', a synonym of 'love' only has one synset
			// and they end up having different minimum synonym
			// word ids!!!
			if ( dups.isInTable ( &aid64 ) ) break;
			// add it. this could fail!
			if ( ! dups.addKey ( &aid64 ) ) return false;
			// set it?
			if ( aid64 >= min ) continue;
			// got a new min
			min = aid64;
			//minWordPtr = syn.m_termPtrs[j];
			//minWordLen = syn.m_termLens[j];
			// get largest term freq of all synonyms
			//long pop2 = g_speller.getPhrasePopularity(NULL,aid64,
			//					  true,langId);
			//if ( pop2 > pop ) pop = pop2;
		// early break out means a hit in dups table
		if ( j < naids ) {
			// do not count as repeat if query stop word
			// because they often repeat
			if ( isQueryStop ) continue;
			// count # of repeated word forms
		// hash that now
		// do not include stop words in synbasehash so
		// 'search the web' != 'search web'
		if ( ! isStop ) {
			// no! make it order independent so 'search the web'
			// equals 'web the search' and 'engine search'
			// equals 'search engine'
			//baseHash64 <<= 1LL;
			baseHash64 ^= min;
		// count it, but only if not a query stop word like "and"
		// or "the" or "a". # of unique word forms.
		//if ( ! isQueryStop ) nuwf++;
		// get term freq 
		//if ( pop > maxPop ) maxPop = pop;
		// control word?
		//if ( wids[i] == cw1 ) ncwf++;
	return baseHash64;
// . so now this adds a list of Synonyms to the m_pools[] and returns a ptr
//   to the first one.
// . then the parent caller can store that ptr in the m_wordToSyn[] array
//   which we pre-alloc upon calling the set() function based on the # of
//   words we got
// . returns # of synonyms stored into "tmpBuf"
// . "tmpBuf" is carved up into parallel arrays of MAX_SYNS entries each
//   (m_aids, m_wids0, m_wids1, m_termPtrs, m_termLens, m_numAlnumWords,
//   m_numAlnumWordsInBase, m_src) so it must be big enough for all of them
// . synonyms come from g_wiktionary synsets first, if one is found the
//   function returns right after adding those
// . otherwise addStripped(), addAmpPhrase() and addWithoutApostrophe()
//   are tried
// NOTE(review): the labels tryOtherLang, addSynSet, hashLoop, getNextSyn
// and done that the gotos below jump to, the setup of the "dt" dedup
// table (which presumably uses "dbuf") and many closing braces are not
// present in this excerpt
// NOTE(review): the m_termPtrs, m_termLens, m_numAlnumWords and
// m_numAlnumWordsInBase arrays are each given maxSyns * 4 bytes, which
// only fits if "char *" and "long" are 4 bytes -- confirm for the target
long Synonyms::getSynonyms ( Words *words , 
			     long wordNum , 
			     uint8_t langId ,
			     char *tmpBuf ,
			     long niceness ) {

	// punct words have no synonyms
	if ( ! words->m_wordIds[wordNum] ) return 0;

	// store these
	m_words     = words;
	m_docLangId = langId;
	m_niceness = niceness;

	// sanity check
	// NOTE(review): wordNum == m_numWords passes this check, and
	// words->m_wordIds[wordNum] was already read above -- confirm
	if ( wordNum > m_words->m_numWords ) { char *xx=NULL;*xx=0; }

	// init the dedup table to dedup wordIds
	HashTableX dt;
	char dbuf[512];

	long maxSyns = (long)MAX_SYNS;

	char *bufPtr = tmpBuf;

	// point into buffer
	m_aids = (long long *)bufPtr;
	bufPtr += maxSyns * 8;

	// then the word ids
	m_wids0 = (long long *)bufPtr;
	bufPtr += maxSyns * 8;

	// second word ids, for multi alnum word synonyms, i.e. "New Jersey"
	m_wids1 = (long long *)bufPtr;
	bufPtr += maxSyns * 8;

	m_termPtrs = (char **)bufPtr;
	bufPtr += maxSyns * 4;

	m_termLens = (long *)bufPtr;
	bufPtr += maxSyns * 4;

	m_numAlnumWords = (long *)bufPtr;
	bufPtr += maxSyns * 4;

	m_numAlnumWordsInBase = (long *)bufPtr;
	bufPtr += maxSyns * 4;

	// source
	m_src = bufPtr;
	bufPtr += maxSyns;

	// cursors
	m_aidsPtr  = m_aids;
	m_wids0Ptr = m_wids0;
	m_wids1Ptr = m_wids1;
	m_srcPtr   = m_src;
	m_termPtrsPtr = m_termPtrs;
	m_termLensPtr = m_termLens;
	m_numAlnumWordsPtr = m_numAlnumWords;
	m_numAlnumWordsInBasePtr = m_numAlnumWordsInBase;

	char *w    = m_words->m_words   [wordNum];
	long  wlen = m_words->m_wordLens[wordNum];

	// NOW hit wiktionary
	// Trust this less than our s_exceptions above, but more than
	// our morph computations below

	char sourceId = SOURCE_WIKTIONARY;
	char *ss = NULL;
	long long bwid;
	char wikiLangId = m_docLangId;
	bool hadSpace ;
	long klen ;
	long baseNumAlnumWords;


	// if word only exists in one language, assume that language for word
	// even if m_docLangId is langUnknown (0)
	if ( ! ss &&
	     ! m_docLangId &&
	     ! wikiLangId ) {
		// get raw word id
		bwid = m_words->m_wordIds[wordNum];
		// each lang has its own bit
		long long bits = g_speller.getLangBits64 ( &bwid );
		// skip if not unique
		char count = getNumBitsOn64 ( bits ) ;
		// if we only got one lang we could be, assume that
		if ( count == 1 )
			// get it. bit #0 is english, so add 1
			wikiLangId = getBitPosLL((uint8_t *)&bits) + 1;
		// try setting based on script. greek. russian. etc.
		// if the word was not in the wiktionary.
		// this will be langUnknown if not definitive.
			wikiLangId = getCharacterLanguage(w);

	// try looking up bigram so "new jersey" gets "nj" as synonym
	if ( wikiLangId && 
	     wordNum+2< m_words->m_numWords &&
	     m_words->m_wordIds[wordNum+2]) {
		// get phrase id bigram then
		long conti = 0;
		bwid = hash64Lower_utf8_cont(w,wlen,0,&conti);
		// then the next word
		char *wp2 = m_words->m_words[wordNum+2];
		long  wlen2 = m_words->m_wordLens[wordNum+2];
		bwid = hash64Lower_utf8_cont(wp2,wlen2,bwid,&conti);
		baseNumAlnumWords = 2;
		ss = g_wiktionary.getSynSet( bwid, wikiLangId );

	// need a language for wiktionary to work with
	if ( wikiLangId && ! ss ) {
		// get raw word id
		bwid = m_words->m_wordIds[wordNum];
		baseNumAlnumWords = 1;
		//if ( bwid == 1424622907102375150LL)
		//	log("a");
		ss = g_wiktionary.getSynSet( bwid, wikiLangId );
		// if that failed try removing 's from word if there
		if ( ! ss && 
		     wlen >= 3 &&
		     w[wlen-2]=='\'' && 
		     w[wlen-1]=='s' ) {
			long long cwid = hash64Lower_utf8(w,wlen-2);
			ss = g_wiktionary.getSynSet( cwid, wikiLangId );

	// even though a document may be in german it often has some
	// english words "pdf download" "copyright" etc. so if the word
	// has no synset in german, try it in english
	if ( //numPresets == 0 &&
	     ! ss &&
	     m_docLangId != langEnglish &&
	     wikiLangId  != langEnglish &&
	     m_docLangId &&
	     g_speller.getSynsInEnglish(w,wlen,m_docLangId,langEnglish) ) {
		// try english
		wikiLangId = langEnglish;
		goto tryOtherLang;

	// if it was in wiktionary, just use that synset
	if ( ss ) {
		// prepare the dedup table
		HashTableX dedup;
		HashTableX *dd = NULL;
		char dbuf[512];
		long count = 0;
		// do we have another set following this
		char *next = g_wiktionary.getNextSynSet(bwid,m_docLangId,ss);
		// if so, init the dedup table then
		if ( next && ! dd ) {
			dd = &dedup;
			dd->set ( 8,0,8,dbuf,512,false,m_niceness,"sddbuf");
		// skip over the pipe i guess
		// the synset starts with a 2 char lang code
		char *pipe = ss + 2;
		// zh_ch?
		if ( *pipe == '_' ) pipe += 3;
		// sanity
		if ( *pipe != '|' ) { char *xx=NULL;*xx=0; }
		// point to word list
		char *p = pipe + 1;
		// hash up the list of words, they are in utf8 and
		// separated by commas, the list ends at a newline
		char *e = p + 1;
		// save count in case we need to undo
		//long saved = m_numAlts[wordNum];

		// skip synonyms that are anagrams because it's too ambiguous
		// there are mappings like
		// "PC" -> "PC,Personal Computer" 
		// "PC" -> "PC,Probable Cause" ... (lots more!)
		//bool isAnagram = true;
		for ( ; *e !='\n' && *e != ',' ; e++ ) ;
		//	if ( ! is_upper_a(*e) ) isAnagram = false;

		// get it
		long long h = hash64Lower_utf8_nospaces ( p , e - p );

		// skip if same as base word
		if ( h == bwid ) goto getNextSyn;

		// should we check for dups?
		if ( dd ) {
			// skip dups
			if ( dd->isInTable(&h) ) goto getNextSyn;
			// dedup. on error just return the count we have so far
			if ( ! dd->addKey(&h) ) return m_aidsPtr - m_aids;
		// store it
		*m_aidsPtr++ = h;

		// store source
		*m_srcPtr++ = sourceId;

		// does the synonym have whitespace in it?
		hadSpace = false;
		klen = e - p;
		for ( long k = 0 ; k < klen ; k++ )
			if ( is_wspace_a(p[k]) ) hadSpace = true;

		*m_termPtrsPtr++ = p;
		*m_termLensPtr++ = e-p;

		// only for multi-word synonyms like "New Jersey"...
		*m_wids0Ptr = 0LL;
		*m_wids1Ptr = 0LL;
		*m_numAlnumWordsPtr = 1;

		// and for multi alnum word synonyms
		if ( hadSpace ) {
			Words sw;
			sw.setx ( p , e - p , m_niceness );
			*(long long *)m_wids0Ptr = sw.m_wordIds[0];
			*(long long *)m_wids1Ptr = sw.m_wordIds[2];
			*(long  *)m_numAlnumWordsPtr = sw.getNumAlnumWords();


		// how many words did we have to hash to find a synset?
		// i.e. "new jersey" would be 2, to get "nj"
		*m_numAlnumWordsInBasePtr++ = baseNumAlnumWords;

		// do not breach
		if ( ++count >= maxSyns ) goto done;
		// loop for more
		if ( *e == ',' ) { e++; p = e; goto hashLoop; }
		// add in the next syn set, deduped
		if ( next ) { ss = next; goto addSynSet; }
		// wrap it up
		// all done
		return m_aidsPtr - m_aids;

	// strip marks from THIS word. on error return the count so far
	if ( ! addStripped ( w , wlen,&dt ) ) return m_aidsPtr - m_aids;

	// on error return the count so far
	if ( ! addAmpPhrase ( wordNum, &dt ) ) return m_aidsPtr - m_aids;

	// if we end in apostrophe s, strip and add
	if ( wlen>= 3 &&
	     w[wlen-1] == 's' && 
	     w[wlen-2]=='\'' &&
	     ! addWithoutApostrophe ( wordNum, &dt ) )
		return m_aidsPtr - m_aids;

	return m_aidsPtr - m_aids;
// . handles one qa test reply of "replyLen" bytes in "reply"
// . gets the content after the http mime, marks out the parts that change
//   from run to run (timestamps, response times, doc counts, ...) and
//   makes a checksum of what is left with qa_hash32()
// . compares that checksum to s_expectedCRC and to the value stored for
//   this url/test in s_ht (loaded from crctable.dat on the first call)
// . prints a "passed test", "first time testing" or "FAILED TEST" entry
//   to g_qaOutput, the failed case sets up a diff of the old and new
//   content files
// NOTE(review): a large number of statements are cut short or missing in
// this excerpt (log and safePrintf argument lists, the file name
// buffers fn3 and dir being filled in, the saving and loading of the
// content files, running the diff, the early returns and most closing
// braces), so the control flow below can not be followed exactly
void processReply ( char *reply , long replyLen ) {

	// store our current reply
	SafeBuf fb2;
	fb2.safeMemcpy(reply,replyLen );

	// log that we got the reply
	log("qa: got reply(len=%li)(errno=%s)=%s",

	char *content = NULL;
	long  contentLen = 0;

	// get mime
	if ( reply ) {
		HttpMime mime;
		mime.set ( reply, replyLen , NULL );
		// only hash content since mime has a timestamp in it
		content = mime.getContent();
		contentLen = mime.getContentLen();
		// the content must be NUL terminated, core if it is not
		if ( content && contentLen>0 && content[contentLen] ) { 
			char *xx=NULL;*xx=0; }

	if ( ! content ) {
		content = "";
		contentLen = 0;

	s_content = content;

	// take out <responseTimeMS>
	markOut ( content , "<currentTimeUTC>");
	markOut ( content , "<responseTimeMS>");

	// until i figure this one out, take it out
	markOut ( content , "<docsInCollection>");

	// until i figure this one out, take it out
	markOut ( content , "<hits>");

	// for those links in the html pages
	markOut ( content, "rand64=");

	// for json
	markOut ( content , "\"currentTimeUTC\":" );
	markOut ( content , "\"responseTimeMS\":");
	markOut ( content , "\"docsInCollection\":");

	// for xml
	markOut ( content , "<currentTimeUTC>" );
	markOut ( content , "<responseTimeMS>");
	markOut ( content , "<docsInCollection>");

	// indexed 1 day ago
	markOut ( content,"indexed:");
	// modified 1 day ago
	markOut ( content,"modified:");

	// s_gigabitCount... it is perpetually incrementing static counter
	// in PageResults.cpp

	// for some reason the term freq seems to change a little in
	// the scoring table

	// make checksum. we ignore back to back spaces so this
	// hash works for <docsInCollection>10 vs <docsInCollection>9
	long contentCRC = 0; 
	if ( content ) contentCRC = qa_hash32 ( content );

	// note it
	// NOTE(review): contentCRC is a signed long printed with %lu
	log("qa: got contentCRC of %lu",contentCRC);

	// if what we expected, save to disk if not there yet, then
	// call s_callback() to resume the qa pipeline
	if ( contentCRC == s_expectedCRC ) {
		// save content if good
		char fn3[1024];
		File ff; ff.set ( fn3 );
		if ( ! ff.doesExist() ) {
			// if not there yet then save it;
		// . continue on with the qa process
		// . which qa function that may be

	// if crc of content does not match what was expected then do a diff
	// so we can see why not

	// this means caller does not care about the response
	if ( ! s_checkCRC ) {

	//const char *emsg = "qa: bad contentCRC of %li should be %li "
	//	"\n";//"phase=%li\n";

	// hash url
	long urlHash32 = hash32n ( s_url.getUrl() );

	// combine test function too since two tests may use the same url
	long nameHash = hash32n ( s_qt->m_testName );

	// combine together
	urlHash32 = hash32h ( nameHash , urlHash32 );

	// one time setup: make the directory and load the crc table
	static bool s_init = false;
	if ( ! s_init ) {
		s_init = true;
		// make symlink
		//char cmd[512];
		//snprintf(cmd,"cd %s/html ;ln -s ../qa ./qa", g_hostdb.m_dir);
		char dir[1024];
		long status = ::mkdir ( dir ,
					S_IROTH | S_IXOTH );
		// it is ok if the directory is already there
	        if ( status == -1 && errno != EEXIST && errno )
			log("qa: Failed to make directory %s: %s.",
		// try to load from disk
		SafeBuf fn;
		log("qa: loading crctable.dat");
		s_ht.load ( fn.getBufStart() , "crctable.dat" );

	// break up into lines
	char fn2[1024];
	sprintf(fn2,"%sqa/content.%lu",g_hostdb.m_dir,contentCRC); ( fn2 );

	// look up in hashtable to see what reply crc should be
	long *val = (long *)s_ht.getValue ( &urlHash32 );

	// just return if the same
	if ( val && contentCRC == *val ) {
		g_qaOutput.safePrintf("<b style=color:green;>"
				      "passed test</b><br>%s : "
				      "<a href=%s>%s</a> (urlhash=%lu "
				      "crc=<a href=/qa/content.%lu>"

	if ( ! val ) {
		// add it so we know
		s_ht.addKey ( &urlHash32 , &contentCRC );
		g_qaOutput.safePrintf("<b style=color:blue;>"
				      "first time testing</b><br>%s : "
				      "<a href=%s>%s</a> "
				      "(urlhash=%lu "
				      "crc=<a href=/qa/content.%lu>%lu"

	log("qa: crc changed for url %s from %li to %li",

	// get response on file
	SafeBuf fb1;
	char fn1[1024];
	sprintf(fn1,"%sqa/content.%lu",g_hostdb.m_dir, *val);

	// do the diff between the two replies so we can see what changed
	char cmd[1024];
	sprintf(cmd,"diff %s %s > /tmp/diffout",fn1,fn2);
	log("qa: %s\n",cmd);

	g_qaOutput.safePrintf("<b style=color:red;>FAILED TEST</b><br>%s : "
			      "<a href=%s>%s</a> (urlhash=%lu)<br>"

			      "<input type=checkbox name=urlhash%lu value=1 "
			      // use ajax to update test crc. if you undo your
			      // check then it should put the old val back.
			      // when you first click the checkbox it should
			      // gray out the diff i guess.
			      "onclick=submitchanges(%lu,%lu);> "
			      "Accept changes"

			      "original on left, new on right. "
			      "oldcrc = <a href=/qa/content.%lu>%lu</a>"

			      " != <a href=/qa/content.%lu>%lu</a> = newcrc"
			      "<br>diff output follows:<br>"
			      "<pre id=%lu style=background-color:0xffffff;>",

			      // input checkbox name field

			      // submitchanges() parms

			      // original/old content.%lu

			      // new content.%lu

			      // for the pre tag id:

	// store in output
	SafeBuf sb;
	g_qaOutput.htmlEncode ( sb.getBufStart() );


	// if this is zero allow it to slide by. it is learning mode i guess.
	// so we can learn what crc we need to use.
	// otherwise, stop right there for debugging
	//if ( s_expectedCRC != 0 ) exit(1);

	// keep on going
// Chooses a docid to dish out for editing and starts loading its title rec.
// Scans the datedb list in st->m_list for a docid that has no entry in the
// turk lock table (entries older than 3600 seconds are treated as expired),
// replies with a "nothing available" page when no docid was picked, otherwise
// fills in a TurkLock for st->m_user and hands off to processLoop().
// NOTE(review): this copy of the function looks truncated -- closing braces
// and several statements (e.g. the call that the HTML header strings below
// belong to) are not present, so these notes cover only what is visible.
void gotDatedbList ( State60 *st ) {

	// must only be run on host #0 since we need just one lock table
	// (deliberate NULL write to crash on any other host)
	if ( g_hostdb.m_myHost->m_hostId != 0 ) { char *xx=NULL;*xx=0; }

	// load turk lock table if we need to
	// NOTE(review): s_init is a plain local initialized to false, so this
	// branch is taken on every call; the s_ prefix suggests it was meant to
	// be "static" -- confirm.
	bool s_init = false;
	if ( ! s_init ) {
		s_init = true;
		// failures here are only logged; execution continues
		if ( ! g_turkLocks.set(8,sizeof(TurkLock),256) )
			log("turk: failed to init turk lock table");
		if ( ! g_turkLocks.load(g_conf.m_dir,"turkdir/docidlocks.dat"))
			log("turk: failed to load turk lock table");

	// reference time used for lock expiry and for the new lock's timestamp
	time_t now = getTimeGlobal();
	// shortcut
	RdbList *list = &st->m_list;
	// the best docid
	int64_t best = 0LL;
	// scan the list to get urls/docids to turk out
	// NOTE(review): no statement that advances the list is visible in this
	// copy, so the "continue" paths below would not make progress as shown.
	for ( ; ! list->isExhausted() ; ) {
		// get rec
		char *k = list->getCurrentKey();
		// skip that
		// skip if negative (low bit of the first key byte is clear)
		if ( (k[0] & 0x01) == 0x00 ) continue;
		// get the docid
		int64_t docid = g_datedb.getDocId ( k );
		// skip if locked
		// NOTE(review): the table is called g_turkLocks above but
		// g_turkLock here -- confirm which name is correct.
		TurkLock *tt = (TurkLock *)g_turkLock.getValue(&docid);
		// if there check time: locks older than 3600 seconds are expired
		if ( tt && now - tt->m_lockTime > 3600 ) {
			// remove it
			// NOTE(review): no removal call is visible here
			// nuke tt
			tt = NULL;
		// if still there, skip it and try next one
		if ( tt ) continue;
		// ok, we got a good docid to dish out
		// NOTE(review): the local above is spelled "docid", not "docId"
		best = docId;

	// buffer for the html reply
	SafeBuf sb;

	// print description so they can click a button to start the turk
	// NOTE(review): the call that these string literals are arguments of is
	// missing from this copy.
		      "<title>Event Editor</title>\n"
		      "<table width=\"100%%\" border=\"0\">\n"
		      "<tr><td style=\"background-color:#0079ba;\">\n"
		      "<center><font color=#00000>"
		      "<h2>Event Editor</h2>\n"

	// if we had no docid, give user an empty msg
	if ( ! best ) {
		sb.safePrintf("<center>Nothing currently available to edit. "
			      "Please try again later.</center>"
		sendReply ( &sb );

	// lock it!
	TurkLock tt;
	strcpy ( tt.m_user , st->m_user );
	tt.m_lockTime = now;
	// NOTE(review): the lock is added to g_lockTable, not the g_turkLocks
	// table loaded above -- confirm this is intended.
	if ( ! g_lockTable.addLock ( &tt ) ) {
		sendErrorReply ( st , g_errno );

	// . fetch the TitleRec
	// . a max cache age of 0 means not to read from the cache
	XmlDoc *xd = &st->m_xd;
	// . when getTitleRec() is called it will load the old one
	//   since XmlDoc::m_setFromTitleRec will be true
	// . niceness is 0
	xd->set3 ( best , st->m_coll , 0 );
	// if it blocks while it loads title rec, it will re-call this routine
	xd->setCallback ( st , processLoopWrapper );
	// good to go!
	return processLoop ( st );
// Lazily builds s_table, which maps the 64-bit hash of each entity name in
// s_entities[] to (index + 1) of that entity, and fills in the utf8 bytes and
// utf8Len of every s_entities[] element along the way.
// . does nothing once s_isInitialized has been set by a successful call
// . returns true on success
// . returns the value of log() if the table could not be allocated, and
//   false if adding an entry failed (s_isInitialized stays false then)
// . deliberately crashes via a NULL write on inconsistent static data: a zero
//   code point, a zero-length encoding or a duplicate entity name hash
static bool initEntityTable(){
	if ( ! s_isInitialized ) {
		// set up the hash table
		if ( ! s_table.set ( 8,4,255,NULL,0,false,0,"enttbl" ) )
			return log("build: Could not init table of "
					   "HTML entities.");
		// now add in all the entities
		int32_t n = (int32_t)sizeof(s_entities) / (int32_t)sizeof(Entity);
		for ( int32_t i = 0 ; i < n ; i++ ) {
			// key is the hash of the entity name
			int64_t h = hash64b ( s_entities[i].entity );
			// grab the unicode code point
			UChar32 up = s_entities[i].unicode;
			// every entity must have a non-zero code point
			if ( ! up ) { char *xx=NULL;*xx=0; }
			// point to the entity's own utf8 buffer
			char *buf = (char *)s_entities[i].utf8;
			// encode the code point as utf8 into that buffer
			int32_t len = utf8Encode(up,buf);

			// make my own mods to make parsing easier

			// nbsp (160) becomes a plain space
			if ( up == 160 ) {
				buf[0] = ' '; len = 1; }
			// make all quotes equal '\"' (34 decimal):
			// angle quotes &laquo; &raquo; (171, 187),
			// single quotes &lsquo; &rsquo; &sbquo; (8216-8218),
			// double quotes &ldquo; &rdquo; &bdquo; (8220-8222),
			// single angle quotes &lsaquo; &rsaquo; (8249, 8250)
			// (unicode values, not utf8)
			if ( up == 171 ||
			     up == 187 ||
			     up == 8216 ||
			     up == 8217 ||
			     up == 8218 ||
			     up == 8220 ||
			     up == 8221 ||
			     up == 8222 ||
			     up == 8249 ||
			     up == 8250 ) {
				buf[0] = '\"'; len = 1; }
			// and normalize all dashes (ndash 8211, mdash 8212)
			if ( up == 8211 || up == 8212 ) {
				buf[0] = '-'; len = 1; }

			// end custom mods

			// set length
			s_entities[i].utf8Len = len;
			// an entity must encode to at least one byte
			if ( len == 0 ) { char *xx=NULL;*xx=0; }
			// must not exist! two names hashing alike is a bug
			if ( s_table.isInTable(&h) ) { char*xx=NULL;*xx=0;}
			// store the entity index in the hash table as score.
			// it is i+1, not i, so a stored value is never 0.
			if ( ! s_table.addTerm ( &h, i+1 ) ) return false;
		}
		// only mark as built once every entity made it in
		s_isInitialized = true;
	}
	return true;
}
Ejemplo n.º 22
// . merge all the replies together
// . put final merged docids into m_docIds[],m_bitScores[],m_scores[],...
// . this calls Msg51 to get cluster levels when done merging
// . Msg51 remembers clusterRecs from previous call to avoid repeating lookups
// . returns false if blocked, true otherwise
// . sets g_errno and returns true on error
// . NOTE(review): this copy is missing closing braces and a number of
//   statements (e.g. the mergeLoop/skip/doneMerge labels that the gotos
//   below target); the notes added here describe only what is visible.
bool Msg3a::mergeLists ( ) {

    // time how long the merge takes
    if ( m_debug ) {
        logf( LOG_DEBUG, "query: msg3a: --- Final DocIds --- " );
        m_startTime = gettimeofdayInMilliseconds();

    // reset our final docids count here in case we are a re-call
    m_numDocIds = 0;
    // a secondary count, how many unique docids we scanned, and not
    // necessarily added to the m_docIds[] array
    //m_totalDocCount = 0; // long docCount = 0;
    m_moreDocIdsAvail = true;

    // shortcut
    //long numSplits = m_numHosts;//indexdbSplit;

    // . point to the various docids, etc. in each split reply
    // . tcPtr = term count. how many required query terms does the doc
    //   have? formerly called topExplicits in IndexTable2.cpp
    // . diPtr/diEnd bound each reply's docid array, rsPtr walks its scores
    //   and ksPtr its cluster recs
    long long     *diPtr [MAX_INDEXDB_SPLIT];
    float         *rsPtr [MAX_INDEXDB_SPLIT];
    key_t         *ksPtr [MAX_INDEXDB_SPLIT];
    long long     *diEnd [MAX_INDEXDB_SPLIT];
    for ( long j = 0; j < m_numHosts ; j++ ) {
        Msg39Reply *mr =m_reply[j];
        // if we have gbdocid:| in query this could be NULL
        // NOTE(review): the end of this branch is missing in this copy; as
        // shown, mr would still be dereferenced below when it is NULL.
        if ( ! mr ) {
            diPtr[j] = NULL;
            diEnd[j] = NULL;
            rsPtr[j] = NULL;
            ksPtr[j] = NULL;
        diPtr [j] = (long long *)mr->ptr_docIds;
        rsPtr [j] = (float     *)mr->ptr_scores;
        ksPtr [j] = (key_t     *)mr->ptr_clusterRecs;
        // docids are 8 bytes each
        diEnd [j] = (long long *)(mr->ptr_docIds +
                                  mr->m_numDocIds * 8);

    // clear if we had it
    if ( m_finalBuf ) {
        mfree ( m_finalBuf, m_finalBufSize, "Msg3aF" );
        m_finalBuf     = NULL;
        m_finalBufSize = 0;

    // HACK: START section stats merge
    // total number of 4-byte site hashes across all replies
    long sneed = 0;
    for ( long j = 0; j < m_numHosts ; j++ ) {
        Msg39Reply *mr = m_reply[j];
        if ( ! mr ) continue;
        sneed += mr->size_siteHashList/4;
    // dedup table for the site hashes
    HashTableX dt;
    //char tmpBuf[5000];
    // NOTE(review): the dt.set() call below is cut off mid-argument-list
    if (sneed&&!dt.set(4,0,sneed,NULL,0,false,
        return true;
    for ( long j = 0; sneed && j < m_numHosts ; j++ ) {
        Msg39Reply *mr =m_reply[j];
        if ( ! mr ) continue;
        // accumulate this reply's counts into our own stats
        SectionStats *src = &mr->m_sectionStats;
        SectionStats *dst = &m_sectionStats;
        dst->m_onSiteDocIds      += src->m_onSiteDocIds;
        dst->m_offSiteDocIds     += src->m_offSiteDocIds;
        // now the list should be the unique site hashes that
        // had the section hash. we need to uniquify them again
        // here.
        long *p = (long *)mr->ptr_siteHashList;
        long np = mr->size_siteHashList / 4;
        // NOTE(review): the body of this loop is missing in this copy
        for ( long k = 0 ; k < np ; k++ )
            // hash it up, no dups!
        // update our count based on that
        dst->m_numUniqueSites = dt.getNumSlotsUsed();
    // section stats requests are done once the stats are merged
    if ( m_r->m_getSectionStats ) return true;
    // HACK: END section stats merge

    // sanity check on the requested number of docids
    if ( m_docsToGet <= 0 ) {
        char *xx=NULL;

    // . how much do we need to store final merged docids, etc.?
    // . per docid: docid=8 score=4 clusterRec=sizeof(key_t)
    //   scoreInfo=sizeof(DocIdScore *) clusterLevel=1
    long need = m_docsToGet * (8+4+sizeof(key_t)+sizeof(DocIdScore *)+1);
    // allocate it
    m_finalBuf     = (char *)mmalloc ( need , "finalBuf" );
    m_finalBufSize = need;
    // g_errno should be set if this fails
    if ( ! m_finalBuf ) return true;
    // hook into it: carve the single buffer into five parallel arrays, in
    // the order docIds, scores, clusterRecs, clusterLevels, scoreInfos
    char *p = m_finalBuf;
    m_docIds        = (long long *)p;
    p += m_docsToGet * 8;
    m_scores        = (float     *)p;
    p += m_docsToGet * sizeof(float);
    m_clusterRecs   = (key_t     *)p;
    p += m_docsToGet * sizeof(key_t);
    m_clusterLevels = (char      *)p;
    p += m_docsToGet * 1;
    m_scoreInfos    = (DocIdScore **)p;
    p+=m_docsToGet*sizeof(DocIdScore *);

    // sanity check: the carved arrays must exactly fill the buffer
    char *pend = m_finalBuf + need;
    if ( p != pend ) {
        char *xx = NULL;
        *xx =0;
    // . now allocate for hash table
    // . get at least twice as many slots as docids
    // . keyed by docid, used below to keep merged docids unique
    HashTableT<long long,char> htable;
    // returns false and sets g_errno on error
    if ( ! htable.set ( m_docsToGet * 2 ) ) return true;
    // hash table for doing site clustering, provided we
    // are fully split and we got the site recs now
    // (maps site hash -> number of results seen from that site)
    HashTableT<long long,long> htable2;
    if ( m_r->m_doSiteClustering && ! htable2.set ( m_docsToGet * 2 ) )
        return true;

    // ***MERGE ALL SPLITS INTO m_docIds[], etc.***
    // . merge all lists in m_replyDocIds[splitNum]
    // . we may be re-called later after m_docsToGet is increased
    //   if too many docids were clustered/filtered out after the call
    //   to Msg51.

    // the winning docid will be diPtr[maxj]
    long maxj = -1;
    //Msg39Reply *mr;
    long hslot;

    // get the next highest-scoring docids from all split lists
    // (ties on score are broken in favor of the lower docid)
    for ( long j = 0; j < m_numHosts; j++ ) {
        // . skip exhausted lists
        // . these both should be NULL if reply was skipped because
        //   we did a gbdocid:| query
        if ( diPtr[j] >= diEnd[j] ) continue;
        // compare the score
        if ( maxj == -1 ) {
            maxj = j;
        if ( *rsPtr[j] < *rsPtr[maxj] ) continue;
        if ( *rsPtr[j] > *rsPtr[maxj] ) {
            maxj = j;
        // prefer lower docids on top
        if ( *diPtr[j] < *diPtr[maxj] ) {
            maxj = j;

    // every list is exhausted, nothing more to merge
    if ( maxj == -1 ) {
        m_moreDocIdsAvail = false;
        goto doneMerge;

    // only do this logic if we have clusterdb recs included
    if ( m_r->m_doSiteClustering     &&
            // if the clusterLevel was set to CR_*errorCode* then this key
            // will be 0, so in that case, it might have been a not found
            // or whatever, so let it through regardless
            ksPtr[maxj]->n0 != 0LL &&
            ksPtr[maxj]->n1 != 0   ) {
        // get the hostname hash, a long long
        long sh = g_clusterdb.getSiteHash26 ((char *)ksPtr[maxj]);
        // do we have enough from this hostname already?
        long slot = htable2.getSlot ( sh );
        // if this hostname already visible, do not over-display it...
        if ( slot >= 0 ) {
            // get the count
            long val = htable2.getValueFromSlot ( slot );
            // . if already 2 or more, give up
            // . if the site hash is 0, that usually means a
            //   "not found" in clusterdb, and the accompanying
            //   cluster level would be set as such, but since we
            //   did not copy the cluster levels over in the merge
            //   algo above, we don't know for sure... cluster recs
            //   are set to 0 in the Msg39.cpp clustering.
            if ( sh && val >= 2 ) goto skip;
            // inc the count
            // NOTE(review): the increment itself is missing in this copy
            // store it
            htable2.setValue ( slot , val );
        // . add it, this should be pre-allocated!
        // . returns false and sets g_errno on error
        else if ( ! htable2.addKey(sh,1) ) return true;

    // has this docid already been merged in?
    hslot = htable.getSlot ( *diPtr[maxj] );

    // . only add it to the final list if the docid is "unique"
    // . BUT since different event ids share the same docid, exception!
    if ( hslot < 0 ) {
        // always inc this
        // only do this if we need more
        if ( m_numDocIds < m_docsToGet ) {
            // get DocIdScore class for this docid
            Msg39Reply *mr = m_reply[maxj];
            // point to the array of DocIdScores
            DocIdScore *ds = (DocIdScore *)mr->ptr_scoreInfo;
            long nds = mr->size_scoreInfo/sizeof(DocIdScore);
            // linear scan for the DocIdScore matching the winning docid
            DocIdScore *dp = NULL;
            for ( long i = 0 ; i < nds ; i++ ) {
                if ( ds[i].m_docId != *diPtr[maxj] )  continue;
                dp = &ds[i];
            // add the max to the final merged lists
            m_docIds    [m_numDocIds] = *diPtr[maxj];

            // wtf?
            if ( ! dp ) {
                // this is empty if no scoring info
                // supplied!
                if ( m_r->m_getDocIdScoringInfo )
                    log("msg3a: CRAP! got empty score "
                        "info for "
                //char *xx=NULL; *xx=0;  261561804684
                // qry =
            // point to the single DocIdScore for this docid
            // (may be NULL, see above)
            m_scoreInfos[m_numDocIds] = dp;

            // reset this just in case
            if ( dp ) {
                dp->m_singleScores = NULL;
                dp->m_pairScores   = NULL;

            // now fix DocIdScore::m_pairScores and m_singleScores
            // ptrs so they reference into the
            // Msg39Reply::ptr_pairScoreBuf and ptr_singleSingleBuf
            // like they should. it seems we do not free the
            // Msg39Replies so we should be ok referencing them.
            // a negative offset leaves the pointer NULL.
            if ( dp && dp->m_singlesOffset >= 0 )
                dp->m_singleScores =
                    (SingleScore *)(mr->ptr_singleScoreBuf+
                                    dp->m_singlesOffset) ;
            if ( dp && dp->m_pairsOffset >= 0 )
                dp->m_pairScores =
                    (PairScore *)(mr->ptr_pairScoreBuf +
                                  dp->m_pairsOffset );

            // turn it into a float, that is what rscore_t is.
            // we do this to make it easier for PostQueryRerank.cpp
            m_scores    [m_numDocIds]=(float)*rsPtr[maxj];
            if ( m_r->m_doSiteClustering )
                m_clusterRecs[m_numDocIds]= *ksPtr[maxj];
            // clear this out
            // set this for use below
            hslot = m_numDocIds;
            // point to next available slot to add to
        // if it has ALL the required query terms, count it
        //if ( *bsPtr[maxj] & 0x60 ) m_numAbove++;
        // . add it, this should be pre-allocated!
        // . returns false and sets g_errno on error
        if ( ! htable.addKey(*diPtr[maxj],1) ) return true;

    // increment the split pointers from which we took the max
    // NOTE(review): the increments are missing in this copy
    // get the next highest docid and add it in
    if ( m_numDocIds < m_docsToGet ) goto mergeLoop;


    if ( m_debug ) {
        // show how long it took
        logf( LOG_DEBUG,"query: msg3a: [%lu] merged %li docs from %li "
              "splits in %llu ms. "
              (unsigned long)this,
              m_numDocIds, (long)m_numHosts,
              gettimeofdayInMilliseconds() - m_startTime
        // show the final merged docids
        for ( long i = 0 ; i < m_numDocIds ; i++ ) {
            long sh = 0;
            if ( m_r->m_doSiteClustering )
                sh=g_clusterdb.getSiteHash26((char *)
            // print out score_t
            logf(LOG_DEBUG,"query: msg3a: [%lu] "
                 "%03li) merged docId=%012llu "
                 "score=%.01f hosthash=0x%lx",
                 (unsigned long)this,
                 m_docIds    [i] ,
                 (float)m_scores    [i] ,
                 sh );

    // if we had a full split, we should have gotten the cluster recs
    // from each split already
    // every merged docid starts out with cluster level CR_OK
    memset ( m_clusterLevels , CR_OK , m_numDocIds );

    return true;
// . when the Conf::m_proxyIps parm is updated we call this to rebuild
//   s_iptab, our table of SpiderProxy instances, which has the proxies and
//   their performance statistics.
// . we try to maintain stats of ip/ports that did NOT change when rebuilding.
// . returns false if an entry in the list is malformed or a new entry could
//   not be added to s_iptab, true otherwise
// . NOTE(review): this copy is missing closing braces and several statements
//   (the hadError: and redo: labels targeted by the gotos, the setup and
//   filling of tmptab, the increment of "removed"); the notes added here
//   describe only what is visible.
bool buildProxyTable ( ) {

	// scan the NEW list of proxy ip/port pairs in g_conf
	char *p = g_conf.m_proxyIps.getBufStart();

	// meant to hold the keys present in the NEW list (see the removal
	// scan at the bottom)
	HashTableX tmptab;

	// scan the user inputted space-separated list of ip:ports
	// (optional username:password@ip:port)
	for ( ; *p ; ) {
		// skip white space
		if ( is_wspace_a(*p) ) { p++; continue; }

		// skip http://
		if ( strncasecmp(p,"http://",7) == 0 ) { p += 7; continue; }

		// scan in an ip:port
		char *s = p; char *portStr = NULL;
		// counts of dots, colons and digits seen in this token; bc is
		// tested below as the illegal-character count
		int32_t dc = 0, pc = 0, gc = 0, bc = 0;
		const char *msg;

		char *usernamePwd = NULL;
		int32_t usernamePwdLen = 0;
		char *ipStart = p;

		// scan all characters until we hit \0 or another whitespace
		for ( ; *s && !is_wspace_a(*s); s++) {

			if ( *s == '@' ) {
				// must be username:pwd, i.e. exactly one
				// colon seen before the '@'
				if ( pc != 1 ) {
					msg = "bad username:password";
					goto hadError;
				usernamePwd = p;
				usernamePwdLen = s - p;
				if ( usernamePwdLen >= MAXUSERNAMEPWD-2 ) {
					msg = "username:password too long";
					goto hadError;
				// restart the counts for the ip:port part
				// that follows the '@'
				dc = 0;
				gc = 0;
				bc = 0;
				pc = 0;
				portStr = NULL;
				ipStart = s+1;

			if ( *s == '.' ) { dc++; continue; }
			if ( *s == ':' ) { portStr=s; pc++; continue; }
			if ( is_digit(*s) ) { gc++; continue; }
		// ensure it is a legit ip:port combo
		// (when several checks fail, the last failing one sets msg)
		msg = NULL;
		if ( gc < 4 ) 
			msg = "not enough digits for an ip";
		if ( pc > 1 )
			msg = "too many colons";
		if ( dc != 3 )
			msg = "need 3 dots for an ip address";
		if ( bc )
			msg = "got illegal char in ip:port listing";
		if ( msg ) {
			// temporarily NUL-terminate the token for logging
			char c = *s;
			*s = '\0';
			log("buf: %s for %s",msg,p);
			*s = c;
			return false;

		// convert it
		// the ip text ends at the port colon if there is one
		int32_t iplen = s - ipStart;
		if ( portStr ) iplen = portStr - ipStart;
		int32_t ip = atoip(ipStart,iplen);
		// another sanity check
		if ( ip == 0 || ip == -1 ) {
			log("spider: got bad proxy ip for %s",p);
			return false;

		// and the port default is 80
		int32_t port = 80;
		if ( portStr ) port = atol2(portStr+1,s-portStr-1);
		if ( port < 0 || port > 65535 ) {
			log("spider: got bad proxy port for %s",p);
			return false;

		// . we got a legit ip:port
		// . see if already in our table
		// . key is the 32-bit ip shifted left 16 bits, or'd with the
		//   16-bit port
		uint64_t ipKey = (uint32_t)ip;
		ipKey <<= 16;
		ipKey |= (uint16_t)(port & 0xffff);

		// also store into tmptable to see what we need to remove
		// NOTE(review): the store itself is missing in this copy

		// see if in table
		int32_t islot = s_iptab.getSlot( &ipKey);

		// advance p
		p = s;

		// if in there, keep it as is
		if ( islot >= 0 ) continue;

		// otherwise add new entry
		SpiderProxy newThing;
		memset ( &newThing , 0 , sizeof(SpiderProxy));
		newThing.m_ip = ip;
		newThing.m_port = port;
		newThing.m_lastDownloadTookMS = -1;
		newThing.m_lastSuccessfulTestMS = -1;

		// ensure it is NULL terminated
		// NOTE(review): no copy of usernamePwd into m_usernamePwd is
		// visible in this copy before the terminator is written
		newThing.m_usernamePwd[usernamePwdLen] = '\0';

		if ( ! s_iptab.addKey ( &ipKey, &newThing ) )
			return false;

	int32_t removed = 0;
	// scan all SpiderProxies in s_iptab and drop those not in tmptab
	for ( int32_t i = 0 ; i < s_iptab.getNumSlots() ; i++ ) {
		// skip empty buckets in hashtable s_iptab
		if ( ! s_iptab.m_flags[i] ) continue;
		// get the key
		int64_t key = *(int64_t *)s_iptab.getKeyFromSlot(i);
		// must also exist in tmptab, otherwise it got removed by user
		if ( tmptab.isInTable ( &key ) ) continue;
		// skip if not in table
		if ( s_iptab.getSlot ( &key ) < 0 ) {
			log("sproxy: iptable hashing messed up");
		// shoot, it got removed. not in the new list of ip:ports
		s_iptab.removeKey ( &key );
		// hashtable is messed up now, start over
		//goto redo;
	// rescan after any removal
	if ( removed ) goto redo;
	return true;
// . returns false if blocked, true otherwise
// . reads statsdb lists through m_msg5 until m_done is set, then emits the
//   graph markup into m_gw and the color key table into m_sb2
// . NOTE(review): this copy is missing closing braces and parts of most of
//   the safePrintf() format strings; the notes added here describe only
//   what is visible.
bool Statsdb::gifLoop ( ) {
	// shortcut
	Msg5 *m = &m_msg5;

	//#ifndef _USEPLOTTER_
	//return true;

	// loop over all the lists in the time range, [m_t1,m_t2]
	for ( ; ! m_done ; ) {
		// a false return from getList() means it blocked
		if ( ! m->getList ( (char)RDB_STATSDB	,
				    "statsdb"		, // coll
				    &m_list		,
				    (char *)&m_startKey	,
				    (char *)&m_endKey	,
				    32000	, // requested scan size
				    true 	, // include tree?
				    false	, // add to cache?
				    0		, // max cache age
				    0		, // start file number
				    -1		, // number of files
				    NULL	, // state
				    gotListWrapper, // callback
				    m_niceness	, // niceness
				    false	, // do error correction?
				    NULL	, // cache key pointer
				    0		, // # retries
				    -1		, // max # retries
				    true	, // compensate for merge?
				    -1		, // sync point
				    NULL	) ) // msg5b
			return false;
		// . process list
		// . returns false with g_errno set on error
		if ( ! processList() ) return true;

	// time delta spanned by the graph; used for the x-axis labels below
	long dt = m_t2 - m_t1;

	//#ifdef _USEPLOTTER_

	// gif size
	//char tmp[64];
	// dimensions of the gif
	//sprintf ( tmp , "%lix%li", (long)DX+m_bx*2 , (long)DY+m_by*2 );
	//GIFPlotter::parampl ( "BITMAPSIZE" , (void *)tmp );
	// create one
	//GIFPlotter plotter ( NULL , m_fd , NULL );
	// open it
	//plotter.openpl ( );

	// define the space with boundaries 100 unit wide boundaries
	// ( 0 , 0 , DX + m_bx * 2 , DY + m_by * 2 );

	// line thickness in user coordinates (pixels for us)
	//plotter.linewidth ( 1 );
	// set bg color to gray (r/g/b)
	//plotter.bgcolor ( 0xd600 , 0xce00 , 0xd600 );
	// erase Plotter's graphics display
	//plotter.erase ();
	// draw axises in black
	//plotter.pencolorname ("black");

	// main graphing window; the two arguments are the graph dimensions
	// plus the m_bx/m_by border on each side
	m_gw.safePrintf("<div style=\"position:relative;"
		      // the tick marks we print below are based on it
		      // being a window of the last 20 seconds... and using
		      // DX pixels
		      ,(long)DX + 2 *m_bx
			,(long)DY + 2*m_by);

	// draw the x-axis
	//plotter.line ( m_bx , m_by , DX + m_bx , m_by  );

	// x-axis tick marks, one every DX/20 pixels
	for ( int x = DX/20 ; x <= DX ; x += DX/20 ) {
		// tick mark
		//plotter.line ( x , -20 , x , 20 );
		m_gw.safePrintf("<div style=\"position:absolute;"
			      , m_bx + (long)x-1
		// label value: x scaled into [0,dt] and shifted by -dt, so
		// it is 0 at the right edge (x == DX) and negative left of it
		long xv = (long)(dt * (long long)x/(long long)DX)-(long)dt;
		// LABEL
		m_gw.safePrintf("<div style=\"position:absolute;"
				, (long)x-10 + m_bx
				// the label:
				, xv

	// remembers which graph hashes / color hashes already have a key row
	// NOTE(review): no set() call on tmpht is visible in this copy
	HashTableX tmpht;

	// current column (0 or 1) of the two-column key table
	long col = 0;

	m_sb2->safePrintf("<table border=1 width=100%%>\n");

	// label offset to prevent collisions of superimposing multiple
	// graph calibrations
	long zoff = 0;

	// point to the triplets in m_sb1's buffer (x,y,c)
	// each triplet is 12 bytes; the graph hash is read at byte offset 8
	char *p    = m_sb1.getBufStart();
	char *pend = p + m_sb1.length();
	for ( ; p < pend ; p += 12 ) {
		// breathe
		QUICKPOLL ( m_niceness );
		// get graph hash of this point
		long  gh = *(long *)(p +8);

		// if we already did this graph, skip it
		if ( tmpht.isInTable ( &gh ) ) continue;

		// . graph this single graph of this color
		// . returns ptr to first point of different color!
		//   (the return value is not used here)
		plotGraph ( p , pend , gh , m_gw , zoff );
		// prevent collisions
		zoff += 20;

		// get the label based on graphHash
		Label *bb = getLabel ( gh );

		// add to key
		// NOTE(review): the statement guarded by this "if" is missing
		if ( col == 0 )

		m_sb2->safePrintf("<td bgcolor=#%06lx>&nbsp; &nbsp;</td>"
				 bb->m_color ,
				 bb->m_keyDesc );

		// NOTE(review): the statement guarded by this "if" is missing
		if ( col == 1 )

		// inc column and wrap
		if ( ++col >= 2 ) col = 0;

		// . do not re-display 
		// . TODO: deal with error
		tmpht.addKey ( &gh );

	// clear that up

	// now plot the events, horizontal line segments like the performance
	// graph uses
	for ( long i = 0 ; i < m_ht3.m_numSlots ; i++ ) {
		// breathe
		QUICKPOLL ( m_niceness );
		// skip if slot empty
		if ( ! m_ht3.m_flags[i] ) continue;
		// get the offset into m_sb3
		long offset = *(long *)m_ht3.getValueFromSlot(i);
		// get buf start
		char *bufStart = m_sb3.getBufStart();
		// get the ptr
		EventPoint *pp = (EventPoint *)(bufStart + offset);

		// get name of parm
		// (this "m" shadows the Msg5 pointer declared at the top)
		Parm *m = g_parms.getParmFromParmHash ( pp->m_parmHash );
		// make sure we got it
		if ( ! m ) { 
			log("statsdb: unrecognized parm hash = %li",
			//char *xx=NULL;*xx=0; }

		// set the line width
		//plotter.linewidth ( pp->m_thickness );

		// get parm hash
		long colorHash = pp->m_parmHash;
		// add in old/new values to make it different
		colorHash = hash32h ( (long)pp->m_oldVal , colorHash );
		colorHash = hash32h ( (long)pp->m_newVal , colorHash );
		// . get color
		// . is really the parm hash in disguise
		// . keep only the low 24 bits
		long c1 = colorHash & 0x00ffffff;
		// use the color specified from addStat_r() for this line/pt
		//plotter.pencolor ( ((c1 >> 16) & 0xff) << 8 ,
		//		   ((c1 >>  8) & 0xff) << 8 ,
		//		   ((c1 >>  0) & 0xff) << 8 );

		long x1 = pp->m_a;
		long x2 = pp->m_b;
		long y1 = *(long *)m_ht3.getKey(i); // i value
		// ensure at least 10 units wide for visibility
		if ( x2 < x1 + 10 ) x2 = x1 + 10;
		// . flip the y so we don't have to scroll the browser down
		// . DY does not include the axis and tick marks
		//long fy1 = DY - y1 + m_by ;
		// plot it
		//plotter.line ( x1 , fy1 , x2 , fy1 );
		drawLine3 ( m_gw , x1 , x2 , y1 , c1 , pp->m_thickness );

		// add to map key? only if we haven't already
		if ( tmpht.isInTable ( &colorHash ) ) continue;

		// add it
		// NOTE(review): the statement guarded by this "if" is missing
		if ( col == 0 )

		// fall back to a placeholder when the parm was not found
		char *title = "unknown parm";
		if ( m ) title = m->m_title;

		m_sb2->safePrintf("<td bgcolor=#%06lx>&nbsp; &nbsp;</td>",c1);

		// print the parm name and old/new values

		// only show the transition when the value actually changed
		if ( pp->m_oldVal != pp->m_newVal )
			m_sb2->safePrintf(" (%.02f -> %.02f)",


		// NOTE(review): the statement guarded by this "if" is missing
		if ( col == 1 )

		// inc column and wrap
		if ( ++col >= 2 ) col = 0;

		// . do not re-display 
		// . TODO: deal with error
		tmpht.addKey ( &colorHash ) ;

	// clear that up

	// and stat states

	// all done free some mem

	// but not m_sb2 cuz that has the html in it!!

	// all done
	//if ( plotter.closepl () < 0 ) 
	//	log("admin: Could not close performance graph object.");
	// close the file
	//fclose ( m_fd );


	// close main graphing window

	return true;
Ejemplo n.º 25
// . scores each language over the words of this object and returns the index
//   with the highest count in langCount[] (as chosen by s_findMaxIndex)
// . if langScore is non-NULL it receives that winning count
// . returns -1 on error, because 0 means langUnknown; g_errno is set to
//   ENOMEM when a word cannot be added to the dedup table
// . NOTE(review): this copy is missing closing braces and several statements
//   (the tail of the ht.set() call, the definition of badFlags, most wordi
//   increments, the call that fills langWorkArea); the notes added here
//   describe only what is visible.
long Words::getLanguage( Sections *sections ,
			 long maxSamples,
			 long niceness,
			 long *langScore) {
	// calculate scores if not given
	//Scores calcdScores;
	//if ( ! scores ) {
	//	if ( ! calcdScores.set( this,m_version,false ) )
	//		return -1;
	//	scores = &calcdScores;

	// . take a random sample of words and look them up in the
	//   language dictionary
	//HashTableT<long long, char> ht;
	// dedup table of word ids already counted
	HashTableX ht;
	// per-language running score
	long long langCount[MAX_LANGUAGES];
	// per-language scratch values for the current word
	long long langWorkArea[MAX_LANGUAGES];
	long numWords = m_numWords;
	//long skip = numWords/maxSamples;
	//if ( skip == 0 ) skip = 1;
	// reset the language count
	memset(langCount, 0, sizeof(long long)*MAX_LANGUAGES);
	// sample the words
	//long wordBase  = 0;
	long wordi     = 0;
	//if ( ! ht.set(maxSamples*1.5) ) return -1;
	// NOTE(review): the ht.set() call below is cut off mid-argument-list
	if ( ! ht.set(8,1,(long)(maxSamples*8.0),NULL,0,false,
		return -1;
	// . avoid words in these bad sections
	// . google seems to index SEC_MARQUEE so i took that out of badFlags
	// shortcuts
	long long *wids  = m_wordIds;
	long      *wlens = m_wordLens;
	char     **wptrs = m_words;

	//long langTotal = 0;
// 	log ( LOG_WARN, "xmldoc: Picking language from %li words with %li skip",
// 			numWords, skip );
	// value stored with every key added to ht
	char numOne = 1;
	// per-word section pointers, or NULL when no sections are available
	Section **sp = NULL;
	if ( sections ) sp = sections->m_sectionPtrs;
	// this means null too
	if ( sections && sections->m_numSections == 0 ) sp = NULL;

	// upper bound on the number of words that get past the dedup check
	long maxCount = 1000;

	while ( wordi < numWords ) {
		// breathe
		QUICKPOLL( niceness );
		// move to the next valid word: skip entries without a word id
		// and words shorter than 2 bytes
		if ( ! wids [wordi]     ) { wordi++; continue; }
		if (   wlens[wordi] < 2 ) { wordi++; continue; }
		// skip if in a bad section
		//long flags = sections->m_sectionPtrs[i]->m_flags;
		// meaning script section ,etc
		if ( sp && ( sp[wordi]->m_flags & badFlags ) ) {
			wordi++; continue; }
		// check the language
		//unsigned char lang = 0;

		// Skip if word is capitalized and not preceded by a tag
		//if(s_isWordCap(getWord(wordi), getWordLen(wordi)) &&
		//   wordi > 0 && !getTagId(wordi - 1)) {
		//	wordi++;
		//	continue;

		// Skip word if bounded by '/' or '?' might be in a URL
		if(isBounded(wordi)) {

		// is it arabic? sometimes they are spammy pages and repeat
		// a few arabic words over and over again, so don't do deduping
		// with "ht" before checking this.
		char cl = getCharacterLanguage ( wptrs[wordi] );
		if ( cl ) {
		        langCount[(unsigned char)cl]++;

		// dedup: words whose id is already in ht take this branch
		//if(ht.getSlot(m_wordIds[wordi]) !=-1) {
		if(!ht.isEmpty(&m_wordIds[wordi]) ) {

		// If we can't add the word, log it, set g_errno to ENOMEM
		// and fail the whole call with -1.
		if(!ht.addKey(&m_wordIds[wordi], &numOne)) {
			log(LOG_WARN, "build: Could not add word to temporary "
			    "table, memory error?\n");
			g_errno = ENOMEM;
			return -1;

		// stop once maxCount words have been counted
		if ( maxCount-- <= 0 ) break;

		// No lang from charset, got a phrase, and 0 language does not have 
		// a score Order is very important!
		// NOTE(review): no statement that changes foundone is visible
		// in this copy
		int foundone = 0;
		if ( // lang == 0 &&
		    // we seem to be missing hungarian and thai
						 langWorkArea) &&
		    // why must it have an "unknown score" of 0?
		    // allow -1... i don't know what that means!!
		    langWorkArea[0] <= 0) {
			// last language index that had a non-zero entry
			int lasty = -1;
			for(int y = 1; y < MAX_LANGUAGES; y++) {
				if(langWorkArea[y] == 0) continue;
				long pop = langWorkArea[y];
				// negative means in an official dictionary
				if ( pop < 0 ) {
					pop *= -1;
					langCount[y] += 1;
				// extra?
				if ( pop > 1000 )
					langCount[y] += 2;
				if ( pop > 10000 )
					langCount[y] += 2;
				lasty = y;
			// . if it can only belong to one language
			// . helps fix that fact that our unifiedDict is crummy
			//   and identifes some words as being in a lot of languages
			//   like "Pronto" as being in english and not giving
			//   the popularities correctly.
			if ( foundone == 1 )
				// give massive boost
				langCount[lasty] += 10;
		// . try to skip unknown words without killing sample size
		// . we lack russian, hungarian and arabic in the unified
		//   dict, so try to do character detection for those langs.
		// . should prevent them from being detected as unknown
		//   langs and coming up for english search 'gigablast'
		if ( ! foundone ) {
			// do not count towards sample size

		// skip to the next word
		//wordBase += skip;
		//if ( wordi < wordBase )
		//	wordi = wordBase;
	// punish unknown count in case a doc has a lot of proper names
	// or something
	//langCount[langUnknown] /= 2;
	// get the lang with the max score then
	int l = s_findMaxIndex(langCount, MAX_LANGUAGES);
	// if(langCount[l] < 15) return(langUnknown);
	if(langScore) *langScore = langCount[l];
	// return if known now
	return l;
Ejemplo n.º 26
// . cluster the docids based on the clusterRecs: for each of the numRecs
//   records, rewrite clusterLevels[i] to one of the CR_* values assigned
//   below (CR_DIRTY, CR_OK or CR_CLUSTERED)
// . returns true right away, touching nothing, if numRecs <= 0
// . returns false on error, i.e. when ctab.set() or ctab.addTerm() fails
//   (presumably those set g_errno -- verify against HashTableX)
// . if maxDocIdsPerHostname is -1 do not do hostname clustering: the
//   (uint32_t) cast in the visibility test below turns -1 into the largest
//   32-bit value, so the per-site count effectively never reaches the limit
// . clusterLevels[] is read as well as written: an incoming value of
//   CR_ERROR_CLUSTERDB selects the docid-derived hash instead of the
//   clusterdb site hash, and an incoming CR_UNINIT is treated as a logic error
// . NOTE(review): doHostnameClustering is not referenced by any line visible
//   in this copy of the function -- confirm against the full source
// . NOTE(review): this copy appears to be missing lines (block closers,
//   an if/else, some call arguments); the gaps are flagged inline below
bool setClusterLevels ( const key96_t   *clusterRecs,
			const int64_t *docIds,
			int32_t       numRecs              ,
			int32_t       maxDocIdsPerHostname ,
			bool       doHostnameClustering ,
			bool       familyFilter         ,
			bool       isDebug              ,
			// output to clusterLevels[] (one char per record,
			// also consulted as input, see above)
			char    *clusterLevels        ) {
	// nothing to cluster
	if ( numRecs <= 0 ) return true;

	// skip if not clustering on anything (disabled)
	//if ( ! doHostnameClustering && ! familyFilter ) {
	//	memset ( clusterLevels, CR_OK, numRecs );
	//	return true;

	// (disabled) negative site hash / black list support:
	// how many negative site hashes do we have?
	// count how many docids we got, they are a cgi value, so represented
	// in ascii separated by +'s. i.e. "12345+435322+3439333333"
	//HashTableT <int64_t,char> sht;
	//if ( ! hashFromString ( &sht , noSiteIds ) ) return false;
	//bool checkNegative = ( sht.getNumSlotsUsed() > 0 );

	// per-call table mapping a site hash to how many results from that
	// site have been let through so far (read with getScore(), bumped
	// with addTerm() below)
	HashTableX ctab;
	// init to 2*numRecs for speed. use 0 for niceness!
	// presumably 8 = key size and 4 = score size in bytes -- TODO confirm
	if ( ! ctab.set ( 8 , 4 , numRecs * 2,NULL,0,false,"clustertab" ) )
		return false;

	// time it
	u_int64_t startTime = gettimeofdayInMilliseconds();

	// init loop counter vars
	// NOTE(review): count is never used in the lines visible here; it
	// presumably numbered the hits in the debug log below
	int32_t           count = 0;
	uint32_t  score = 0;
	char          *crec ;
	int64_t      h  ;
	char          *level ;
	bool           fakeIt ;

	// main pass: decide the cluster level of each record in order
	for(int32_t i=0; i<numRecs; i++) {
		crec = (char *)&clusterRecs[i];
		// . set this cluster level
		// . right now will be CR_ERROR_CLUSTERDB or CR_OK...
		level = &clusterLevels[i];

		// sanity check: caller must have initialized every level
		if ( *level == CR_UNINIT ) gbshutdownLogicError();
		// and the adult bit, for cleaning the results
		if ( familyFilter && g_clusterdb.hasAdultContent ( crec ) ) {
			*level = CR_DIRTY;
		// NOTE(review): the end of the block above (likely a
		// "continue;" and "}") seems to be missing from this copy;
		// as written, CR_DIRTY would be overwritten just below
		// if error looking up in clusterdb, use a 8 bit domainhash from docid
		fakeIt = (*level==CR_ERROR_CLUSTERDB);
		// assume ok, show it, it is visible
		*level = CR_OK;
		// site hash comes next

		// . get the site hash
		// . these are only 32 bits!
		// NOTE(review): the indentation suggests an
		// "if ( fakeIt ) ... else ..." wrapped these two assignments
		// and was lost; as written the second always wins
			h = Titledb::getDomHash8FromDocId(docIds[i]);
			h = g_clusterdb.getSiteHash26 ( crec );

		// inc this count!
		// NOTE(review): the body and closing brace of this "if" are
		// not present in this copy
		if ( fakeIt ) {

		// if it matches a siteid on our black list
		//if ( checkNegative && sht.getSlot((int64_t)h) > 0 ) {
		//	*level = CR_BLACKLISTED_SITE; goto loop; }
		// look it up: how many results from this site so far
		score = ctab.getScore(h) ;
		// if still visible, just continue
		// (maxDocIdsPerHostname of -1 becomes the max uint32_t here)
		if ( score < (uint32_t)maxDocIdsPerHostname ) {
			// count this result against its site; presumably
			// addTerm(h) bumps the score by one -- TODO confirm
			if ( ! ctab.addTerm(h))
				return false;
		// NOTE(review): a "continue;" and "}" appear to be missing
		// here, judging by the "just continue" comment above
		// otherwise, no longer visible
		*level = CR_CLUSTERED;

	// debug: log one line per record, only when isDebug is set
	for ( int32_t i = 0 ; i < numRecs && isDebug ; i++ ) {
		crec = (char *)&clusterRecs[i];
		uint32_t siteHash26=g_clusterdb.getSiteHash26(crec);
		// NOTE(review): the format string has six conversions but
		// only the last argument is visible in this copy
		logf(LOG_DEBUG,"query: msg51: hit #%" PRId32") sitehash26=%" PRIu32" "
		     "rec.n0=%" PRIx64" docid=%" PRId64" cl=%" PRId32" (%s)",
		     g_crStrings[(int32_t)clusterLevels[i]] );

	//log(LOG_DEBUG,"build: numVisible=%" PRId32" numClustered=%" PRId32" numErrors=%" PRId32,
	//    *numVisible,*numClustered,*numErrors);
	// show time, but only if clustering took more than 3 ms
	// NOTE(review): took is unsigned yet printed with PRId64
	uint64_t took = gettimeofdayInMilliseconds() - startTime;
	if ( took > 3 )
		log(LOG_INFO,"build: Took %" PRId64" ms to do clustering.",took);

	// we are all done
	return true;