// Slightly modified from getTextEntity
int16_t get_iana_charset(const char *cs, int len)
{
    if (!s_isInitialized){
	// set up the hash table
	if ( ! s_table.set ( 8,4,4096,NULL,0,false,"ianatbl") ) {
		log(LOG_WARN, "build: Could not init table of IANA Charsets.");
		return csUnknown;
	}
	// now add in all the charset entries
	int32_t n = (int32_t)sizeof(s_charsets) / (int32_t)sizeof(IANACharset);
	// turn off quickpolling
	char saved = g_conf.m_useQuickpoll;
	g_conf.m_useQuickpoll = false;
	for ( int32_t i = 0 ; i < n ; i++ ) {
		int64_t h = hash64Lower_a ( s_charsets[i].name, strlen(s_charsets[i].name) );
		// store the charset index in the hash table as score
		if ( ! s_table.addTerm(h, i+1) ) {
			log(LOG_WARN, "build: add term failed");
			// restore quickpolling before bailing out
			g_conf.m_useQuickpoll = saved;
			return csUnknown;
		}
	}
	g_conf.m_useQuickpoll = saved;
	s_isInitialized = true;
    }
    int64_t h = hash64Lower_a ( cs , len );
    // get the charset index from the table (stored in the score field)
    int32_t i = (int32_t) s_table.getScore(h);
    // return 0 if no match
    if ( i == 0 ) return csUnknown;
    // return the charset's MIB enum value
    return (int16_t)s_charsets[i-1].mib_enum;
}
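
// Example usage (a sketch; the variable names are hypothetical). The charset
// token would typically come from a Content-Type header or a <meta> tag, and
// csUnknown signals that no IANA entry matched:
//
//   const char *cs = "ISO-8859-1";
//   int16_t mib = get_iana_charset ( cs , strlen(cs) );
//   if ( mib == csUnknown ) { /* fall back to a default charset */ }
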
// Slightly modified from getTextEntity
short get_iana_charset(char *cs, int len)
{
    if (!s_isInitialized){
	// set up the hash table
	if ( ! s_table.set ( 8,4,4096,NULL,0,false,0,"ianatbl") )
	    return log("build: Could not init table of "
		       "IANA Charsets.");
	// now add in all the charset entries
	long n = (long)sizeof(s_charsets) / (long)sizeof(IANACharset);
	// turn off quickpolling
	char saved = g_conf.m_useQuickpoll;
	g_conf.m_useQuickpoll = false;
	for ( long i = 0 ; i < n ; i++ ) {
	    long long h = hash64Lower_a ( s_charsets[i].name, strlen(s_charsets[i].name) );
	    // store the charset index in the hash table as score
	    if ( ! s_table.addTerm(&h, i+1) ) {
		    // restore quickpolling before bailing out
		    g_conf.m_useQuickpoll = saved;
		    return log("build: add term failed");
	    }
	}
	g_conf.m_useQuickpoll = saved;
	s_isInitialized = true;
    }
    long long h = hash64Lower_a ( cs , len );
    // get the charset index from the table (stored in the score field)
    long i = (long) s_table.getScore ( &h );
    // return 0 if no match
    if ( i == 0 ) return csUnknown;
    // return the charset's MIB enum value
    return (short)s_charsets[i-1].mib_enum;
}
bool AdultBit::isDirty ( char *s , int32_t len ) {

	static bool       s_isInitialized = false;
	static char      *s_dirty[] = {
		"anal",
		"analsex",
		"b*****b",
		"blowjobs",
		"boob",
		"boobs",
		"clitoris",
		"c**k",
		"cocks",
		"cum",
		"dick",
		"dicks",
		"g******g",
		"gangbangs",
		"gangbanging",
		"movie",
		"movies",
		"oral",
		"oralsex",
		"p**n",
		"porno",
		"pussy",
		"pussies",
		"sex",
		"sexy",
		"tit",
		"t**s",
		"video",
		"videos",
		"xxx",
		"xxxx",
		"xxxx"
	};

	if ( ! s_isInitialized ) {
		// set up the hash table
		if ( ! s_dtable.set ( 8,4,sizeof(s_dirty  )*2,NULL,0,false,0,
				      "adulttab")) 
			return log("build: Error initializing "
				    "dirty word hash table." );
		// now add in all the dirty words
		int32_t n = (int32_t)sizeof(s_dirty)/ sizeof(char *); 
		for ( int32_t i = 0 ; i < n ; i++ ) {
			int64_t h = hash64b ( s_dirty  [i] );
			if ( ! s_dtable.addTerm (&h, i+1) ) return false;
		}
		s_isInitialized = true;
	} 

	// compute the hash of the word "s"
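	// NOTE: the table above was filled with hash64b() on strings that are
	// already lowercase, so hashing the query word with hash64Lower_a()
	// here is assumed to yield matching keys for a case-insensitive lookup.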
	int64_t h = hash64Lower_a ( s , len );

	// get from table
	return s_dtable.getScore ( &h );
}		
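
// Example usage (a sketch; the AdultBit instance and word are hypothetical).
// The caller hands isDirty() a single already-tokenized word and gets a
// nonzero value back on an exact match:
//
//   AdultBit ab;
//   char word[] = "porno";
//   if ( ab.isDirty ( word , 5 ) ) { /* count it toward the adult score */ }
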
static bool initEntityTable(){
	if ( ! s_isInitialized ) {
		// set up the hash table
		if ( ! s_table.set ( 8,4,255,NULL,0,false,0,"enttbl" ) )
			return log("build: Could not init table of "
					   "HTML entities.");
		// now add in all the HTML entities
		int32_t n = (int32_t)sizeof(s_entities) / (int32_t)sizeof(Entity);
		for ( int32_t i = 0 ; i < n ; i++ ) {
			int64_t h = hash64b ( s_entities[i].entity );

			// grab the unicode code point
			UChar32 up = s_entities[i].unicode;

			// every entity must have a nonzero code point
			if ( ! up ) { char *xx=NULL;*xx=0; }

			// point to it
			char *buf = (char *)s_entities[i].utf8;

			// encode the code point as UTF-8 into the entity's buffer
			int32_t len = utf8Encode(up,buf);

			//
			// make my own mods to make parsing easier
			//

			if ( up == 160 ) {  // nbsp
				buf[0] = ' ';
				len = 1;
			}

			//
			// end custom mods
			//

			// set length
			s_entities[i].utf8Len = len;
			// check it
			if ( len == 0 ) { char *xx=NULL;*xx=0; }
			// must not exist!
			if ( s_table.isInTable(&h) ) { char*xx=NULL;*xx=0;}
			// store the entity index in the hash table as score
			if ( ! s_table.addTerm ( &h, i+1 ) ) return false;
		}
		s_isInitialized = true;
	} 
	return true;
}
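
// Sketch of a lookup against the entity table built above (the lookup code
// itself lives elsewhere and is not shown here): hash the entity name the
// same way it was stored and use the score as a 1-based index back into
// s_entities[].
//
//   int64_t h = hash64b ( s_entities[0].entity );
//   int32_t i = (int32_t) s_table.getScore ( &h );
//   if ( i > 0 ) { /* s_entities[i-1].utf8 / utf8Len hold the decoded text */ }
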
bool AdultBit::isObscene ( char *s , int32_t len ) {

	static bool       s_isInitialized = false;
	static char      *s_obscene[] = {
		"c**t",
		"clits",
//		"cum",    magna cum laude
		"cums",
		"cumshot",
		"c**t",
		"cunts",
		"milf",
		"rimjob",
		"felch",
		"f**k",
		"f****d",
		"f****r",
		"f*****g",
		"f***s",
		"w***e",
		"w****s"
	};

	if ( ! s_isInitialized ) {
		// set up the hash table
		if ( ! s_otable.set ( 8,4,sizeof(s_obscene)*2,NULL,0,false,0,
				      "obscenetab") ) 
			return log("build: Error initializing "
				    "obscene word hash table." );
		// now add in all the obscene words
		int32_t n = sizeof(s_obscene) / sizeof(char *);
		for ( int32_t i = 0 ; i < n ; i++ ) {
			int64_t h = hash64b ( s_obscene[i] );
			if ( ! s_otable.addTerm ( &h, i+1 ) ) return false;
		}
		s_isInitialized = true;
	} 

	// compute the hash of the word "s"
	int64_t h = hash64Lower_a ( s , len );

	// get from table
	return s_otable.getScore ( &h );
}		
static bool initEntityTable(){
	if ( ! s_isInitialized ) {
		// set up the hash table
		if ( ! s_table.set ( 8,4,4096,NULL,0,false,"enttbl" ) ) {
			log("build: Could not init table of HTML entities.");
			return false;
		}

		// now add in all the html entities
		const int32_t n = (int32_t)sizeof(s_entities) / (int32_t)sizeof(Entity);
		for ( int32_t i = 0 ; i < n ; i++ ) {
			int64_t h = hash64b ( s_entities[i].entity );

			// convert the unicode code points to a UTF-8 string
			char *buf = (char *)s_entities[i].utf8;
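			// NOTE: buf advances as each code point is encoded, so
			// the utf8[] buffer is assumed to be large enough for
			// up to 4 bytes per code point.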
			for(int j=0; j<s_entities[i].codepoints; j++) {
				UChar32 codepoint = s_entities[i].codepoint[j];
				int32_t len = utf8Encode(codepoint,buf);
				if ( len == 0 ) { g_process.shutdownAbort(true); }
				
				// make modification to make parsing easier
				if ( codepoint == 160 ) {  // nbsp
					buf[0] = ' ';
					len = 1;
				}
				buf += len;
				
			}
			s_entities[i].utf8Len = (size_t)(buf-s_entities[i].utf8);
			// must not exist!
			if ( s_table.isInTable(&h) ) { g_process.shutdownAbort(true);}
			// store the entity index in the hash table as score
			if ( ! s_table.addTerm(h, i+1) ) return false;
		}
		s_isInitialized = true;
	} 
	return true;
}
// . cluster the docids based on the clusterRecs
// . returns false and sets g_errno on error
// . if maxDocIdsPerHostname is -1 do not do hostname clustering
bool setClusterLevels ( const key96_t *clusterRecs,
			const int64_t *docIds,
			int32_t        numRecs,
			int32_t        maxDocIdsPerHostname,
			bool           doHostnameClustering,
			bool           familyFilter,
			bool           isDebug,
			// output to clusterLevels[]
			char          *clusterLevels ) {
	
	if ( numRecs <= 0 ) return true;

	// skip if not clustering on anything
	//if ( ! doHostnameClustering && ! familyFilter ) {
	//	memset ( clusterLevels, CR_OK, numRecs );
	//	return true;
	//}

	// how many negative site hashes do we have?
	// count how many docids we got, they are a cgi value, so represented
	// in ascii separated by +'s. i.e. "12345+435322+3439333333"
	//HashTableT <int64_t,char> sht;
	//if ( ! hashFromString ( &sht , noSiteIds ) ) return false;
	//bool checkNegative = ( sht.getNumSlotsUsed() > 0 );

	HashTableX ctab;
	// init to 2*numRecs for speed. use 0 for niceness!
	if ( ! ctab.set ( 8 , 4 , numRecs * 2,NULL,0,false,"clustertab" ) )
		return false;

	// time it
	int64_t startTime = gettimeofdayInMilliseconds();

	// init loop counter vars
	int32_t   count = 0;
	uint32_t  score = 0;
	char     *crec;
	int64_t   h;
	char     *level;
	bool      fakeIt;

	for(int32_t i=0; i<numRecs; i++) {
		crec = (char *)&clusterRecs[i];
		// . set this cluster level
		// . right now will be CR_ERROR_CLUSTERDB or CR_OK...
		level = &clusterLevels[i];

		// sanity check
		if ( *level == CR_UNINIT ) gbshutdownLogicError();
		// and the adult bit, for cleaning the results
		if ( familyFilter && g_clusterdb.hasAdultContent ( crec ) ) {
			*level = CR_DIRTY;
			continue;
		}
		// if the clusterdb lookup failed, fall back to an 8-bit domain hash derived from the docid
		fakeIt = (*level==CR_ERROR_CLUSTERDB);
		// assume ok, show it, it is visible
		*level = CR_OK;
		// site hash comes next
		if(!doHostnameClustering)
			continue;

		// . get the site hash
		// . these are only 32 bits!
		if(fakeIt)
			h = Titledb::getDomHash8FromDocId(docIds[i]);
		else
			h = g_clusterdb.getSiteHash26 ( crec );

		// inc this count!
		if ( fakeIt ) {
			g_stats.m_filterStats[CR_ERROR_CLUSTERDB]++;
		}

		// if it matches a siteid on our black list
		//if ( checkNegative && sht.getSlot((int64_t)h) > 0 ) {
		//	*level = CR_BLACKLISTED_SITE; goto loop; }
		// look it up
		score = ctab.getScore(h) ;
		// if we are still under the per-hostname cap, count this one and keep it visible
		if ( score < (uint32_t)maxDocIdsPerHostname ) {
			if ( ! ctab.addTerm(h))
				return false;
			continue;
		}
		// otherwise, no longer visible
		*level = CR_CLUSTERED;
	}


	// debug
	for ( int32_t i = 0 ; i < numRecs && isDebug ; i++ ) {
		crec = (char *)&clusterRecs[i];
		uint32_t siteHash26=g_clusterdb.getSiteHash26(crec);
		logf(LOG_DEBUG,"query: msg51: hit #%" PRId32") sitehash26=%" PRIu32" "
		     "rec.n0=%" PRIx64" docid=%" PRId64" cl=%" PRId32" (%s)",
		     (int32_t)count++,
		     (int32_t)siteHash26,
		     clusterRecs[i].n0,
		     (int64_t)docIds[i],
		     (int32_t)clusterLevels[i],
		     g_crStrings[(int32_t)clusterLevels[i]] );
	}


	//log(LOG_DEBUG,"build: numVisible=%" PRId32" numClustered=%" PRId32" numErrors=%" PRId32,
	//    *numVisible,*numClustered,*numErrors);
	// show time
	int64_t took = gettimeofdayInMilliseconds() - startTime;
	if ( took > 3 )
		log(LOG_INFO,"build: Took %" PRId64" ms to do clustering.",took);

	// we are all done
	return true;
}
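
// Example call (a sketch; recs, docIds and numRecs are hypothetical and the
// per-host limit of 2 is arbitrary). clusterLevels[] is both input and
// output: each slot must be pre-set (e.g. to CR_OK, or CR_ERROR_CLUSTERDB if
// the clusterdb lookup failed) and comes back as the final cluster level:
//
//   char levels [ numRecs ];
//   memset ( levels , CR_OK , numRecs );
//   if ( ! setClusterLevels ( recs , docIds , numRecs ,
//                             2     ,  // maxDocIdsPerHostname
//                             true  ,  // doHostnameClustering
//                             true  ,  // familyFilter
//                             false ,  // isDebug
//                             levels ) )
//       return false; // g_errno is set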
static bool initEntityTable(){
	if ( ! s_isInitialized ) {
		// set up the hash table
		if ( ! s_table.set ( 8,4,255,NULL,0,false,0,"enttbl" ) )
			return log("build: Could not init table of "
					   "HTML entities.");
		// now add in all the HTML entities
		int32_t n = (int32_t)sizeof(s_entities) / (int32_t)sizeof(Entity);
		for ( int32_t i = 0 ; i < n ; i++ ) {
			int64_t h = hash64b ( s_entities[i].entity );
			// grab the unicode code point
			UChar32 up = s_entities[i].unicode;
			// every entity must have a nonzero code point
			if ( ! up ) { char *xx=NULL;*xx=0; }
			// point to it
			char *buf = (char *)s_entities[i].utf8;
			// encode the code point as UTF-8 into the entity's buffer
			int32_t len = utf8Encode(up,buf);
			//
			// make my own mods to make parsing easier
			//
			if ( up == 160 ) {  // nbsp
				buf[0] = ' '; len = 1; }
			// make all quotes equal '\"' (34 decimal)
			// double and single curling quotes
			//http://www.dwheeler.com/essays/quotes-test-utf-8.html
			// &#x201c, 201d, 2018, 2019 (unicode values, not utf8)
			// &ldquo, &rdquo, &lsquo, &rsquo
			/*
			if ( up == 171 ||
			     up == 187 ||
			     up == 8216 ||
			     up == 8217 ||
			     up == 8218 ||
			     up == 8220 ||
			     up == 8221 ||
			     up == 8222 ||
			     up == 8249 ||
			     up == 8250 ) {
				buf[0] = '\"'; len = 1; }
			// and normalize all dashes (mdash,ndash)
			if ( up == 8211 || up == 8212 ) {
				buf[0] = '-'; len = 1; }
			*/

			//
			// end custom mods
			//

			// set length
			s_entities[i].utf8Len = len;
			// check it
			if ( len == 0 ) { char *xx=NULL;*xx=0; }
			// must not exist!
			if ( s_table.isInTable(&h) ) { char*xx=NULL;*xx=0;}
			// store the entity index in the hash table as score
			if ( ! s_table.addTerm ( &h, i+1 ) ) return false;
		}
		s_isInitialized = true;
	} 
	return true;
}