static bool initProxyTables() {
	// initialize the proxy/url-ip ban tables only once
	// (s_init is defined outside this snippet and evidently starts out true)
	if ( ! s_init ) return true;
	s_init = false;
	s_proxyBannedTable.set(8,0,0,NULL,0,false,"proxban");
	s_banCountTable.set(4,4,0,NULL,0,false,"bancnt");
	return true;
}
bool resetProxyStats ( ) {
	// s_proxyBannedTable.reset();
	// s_banCountTable.reset();
	// s_iptab.reset();
	s_iptab.set(8,sizeof(SpiderProxy),0,NULL,0,false,"siptab",true);
	// skip port part of key magic, and get LSB of the IP as key magic
	s_iptab.m_maskKeyOffset = 5;
	s_proxyBannedTable.set(8,0,0,NULL,0,false,"proxban");
	s_banCountTable.set(4,4,0,NULL,0,false,"bancnt");
	return buildProxyTable();
}
// Slightly modified from getTextEntity
int16_t get_iana_charset(const char *cs, int len)
{
    if (!s_isInitialized){
	// set up the hash table
	if ( ! s_table.set ( 8,4,4096,NULL,0,false,"ianatbl") ) {
		log(LOG_WARN, "build: Could not init table of IANA Charsets.");
		return csUnknown;
	}
	// now add in all the charset entries
	int32_t n = (int32_t)sizeof(s_charsets) / (int32_t)sizeof(IANACharset);
	// turn off quickpolling
	char saved = g_conf.m_useQuickpoll;
	g_conf.m_useQuickpoll = false;
	for ( int32_t i = 0 ; i < n ; i++ ) {
	    int64_t h = hash64Lower_a ( s_charsets[i].name, strlen(s_charsets[i].name) );
	    // store the charset index in the hash table as score
		if ( ! s_table.addTerm(h, i+1) ) {
			log(LOG_WARN, "build: add term failed");
			return csUnknown;
		}
	}
	g_conf.m_useQuickpoll = saved;
	s_isInitialized = true;
    }
    int64_t h = hash64Lower_a ( cs , len );
    // get the charset index from the table (stored in the score field)
    int32_t i = (int32_t) s_table.getScore(h);
    // return csUnknown if no match
    if ( i == 0 ) return csUnknown;
    // return the IANA MIB enum for this charset
    return (int16_t)s_charsets[i-1].mib_enum;
}
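// A minimal usage sketch for the table-backed lookup above; the charset
// literal and the wrapper function name are illustrative assumptions, not
// part of the original file.
static void exampleCharsetLookup ( ) {
	int16_t mib = get_iana_charset ( "utf-8" , 5 );
	if ( mib == csUnknown )
		log(LOG_WARN, "build: unrecognized charset name");
}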
// Slightly modified from getTextEntity
short get_iana_charset(char *cs, int len)
{
    if (!s_isInitialized){
	// set up the hash table
	if ( ! s_table.set ( 8,4,4096,NULL,0,false,0,"ianatbl") )
	    return log("build: Could not init table of "
		       "IANA Charsets.");
	// now add in all the charset entries
	long n = (long)sizeof(s_charsets) / (long)sizeof(IANACharset);
	// turn off quickpolling
	char saved = g_conf.m_useQuickpoll;
	g_conf.m_useQuickpoll = false;
	for ( long i = 0 ; i < n ; i++ ) {
	    long long h = hash64Lower_a ( s_charsets[i].name, strlen(s_charsets[i].name) );
	    // store the charset index in the hash table as score
		if ( ! s_table.addTerm(&h, i+1) ) 
		return log("build: add term failed");
	}
	g_conf.m_useQuickpoll = saved;
	s_isInitialized = true;
    }
    long long h = hash64Lower_a ( cs , len );
    // get the charset index from the table (stored in the score field)
    long i = (long) s_table.getScore ( &h );
    // return csUnknown if no match
    if ( i == 0 ) return csUnknown;
    // return the IANA MIB enum for this charset
    return (short)s_charsets[i-1].mib_enum;
}
// . init s_mimeTable in this call
// . called from HttpServer::init
// . returns false and sets g_errno on error
bool HttpMime::init ( ) {
	// only need to call once
	if ( s_init ) return true;
	// make sure only called once
	s_init = true;
	//s_mimeTable.set ( 256 );
	//s_mimeTable.setLabel("mimetbl");
	if ( ! s_mimeTable.set(4,sizeof(char *),256,NULL,0,false,1,"mimetbl"))
		return false;
	// set table from internal list
	for ( uint32_t i = 0 ; i < sizeof(s_ext)/sizeof(char *) ; i+=2 ) {
		int32_t key = hash32n ( s_ext[i] );
		if ( ! s_mimeTable.addKey ( &key , &s_ext[i+1] ) ) 
			return log("HttpMime::init: failed to set table.");
	}
	// quick test
	const char *tt = getContentTypeFromExtension ( "zip" );
	if ( strcmp(tt,"application/zip") != 0 ) {
		g_errno = EBADENGINEER;
		return log("http: Failed to init mime table correctly.");
	}
	// a more thorough test
	for ( uint32_t i = 0 ; i < sizeof(s_ext)/sizeof(char *) ; i+=2) {
		tt = getContentTypeFromExtension ( s_ext[i] );
		if ( strcmp(tt,s_ext[i+1]) == 0 ) continue;
		g_errno = EBADENGINEER;
		return log("http: Failed to do mime table correctly. i=%" PRId32,i);
	}

	// TODO: set it from a user supplied file here
	return true;
}
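// Once HttpMime::init() has run, extension lookups go through
// getContentTypeFromExtension(). A short sketch using the same "zip" case
// the self-test above already verifies; the wrapper name is hypothetical.
static void exampleMimeLookup ( ) {
	const char *ct = getContentTypeFromExtension ( "zip" );
	// ct is "application/zip" per the init-time self-test
	log("http: zip maps to %s", ct);
}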
bool AdultBit::isDirty ( char *s , int32_t len ) {

	static bool       s_isInitialized = false;
	static char      *s_dirty[] = {
		"anal",
		"analsex",
		"b*****b",
		"blowjobs",
		"boob",
		"boobs",
		"clitoris",
		"c**k",
		"cocks",
		"cum",
		"dick",
		"dicks",
		"g******g",
		"gangbangs",
		"gangbanging",
		"movie",
		"movies",
		"oral",
		"oralsex",
		"p**n",
		"porno",
		"pussy",
		"pussies",
		"sex",
		"sexy",
		"tit",
		"t**s",
		"video",
		"videos",
		"xxx",
		"xxxx",
		"xxxx"
	};

	if ( ! s_isInitialized ) {
		// set up the hash table
		if ( ! s_dtable.set ( 8,4,sizeof(s_dirty  )*2,NULL,0,false,0,
				      "adulttab")) 
			return log("build: Error initializing "
				    "dirty word hash table." );
		// now add in all the dirty words
		int32_t n = (int32_t)sizeof(s_dirty)/ sizeof(char *); 
		for ( int32_t i = 0 ; i < n ; i++ ) {
			int64_t h = hash64b ( s_dirty  [i] );
			if ( ! s_dtable.addTerm (&h, i+1) ) return false;
		}
		s_isInitialized = true;
	} 

	// compute the hash of the word "s"
	int64_t h = hash64Lower_a ( s , len );

	// get from table
	return s_dtable.getScore ( &h );
}		
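// A usage sketch for the dirty-word check. The word literal is illustrative,
// and whether isDirty() needs an AdultBit instance is not shown in this
// snippet, so the default-constructed object below is an assumption.
static void exampleDirtyCheck ( ) {
	AdultBit ab;
	bool dirty = ab.isDirty ( (char *)"sexy" , 4 );
	// any non-zero score from s_dtable converts to true
	if ( dirty ) log("build: word is on the dirty list");
}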
// call this at startup to register the handlers
bool initSpiderProxyStuff() {
	
	// do this for all hosts in case host #0 goes dead, then everyone
	// will, according to Msg13.cpp, send to host #1, the next in line
	// if she is alive
	//if ( g_hostdb.m_myHostId != 0 ) return true;

	// register the handler on every host (not just host #0; see note above)
	if ( ! g_udpServer.registerHandler ( msg_type_54, handleRequest54 ))
		return false;

	// key is ip/port
	s_iptab.set(8,sizeof(SpiderProxy),0,NULL,0,false,"siptab",true);
	// skip port part of key magic, and get LSB of the IP as key magic
	s_iptab.m_maskKeyOffset = 5;

	loadSpiderProxyStats();

	// build the s_iptab hashtable for the first time
	buildProxyTable ();

	// reset spider proxy stats every hour to alleviate false positives (moved from Process.cpp)
	if (!g_loop.registerSleepCallback(3600000, NULL, resetProxyStatWrapper, 0)) {
		gbshutdownResourceError();
	}

	// make the loadtable hashtable
	static bool s_flag = 0;
	if ( s_flag ) return true;
	s_flag = true;
	return s_loadTable.set(4,
			       sizeof(LoadBucket),
			       128,
			       NULL,
			       0,
			       // this slows us down
			       true, // allow dups?
			       "lbtab",
			       true); // use key magic to mix things up

}
static bool initEntityTable(){
	if ( ! s_isInitialized ) {
		// set up the hash table
		if ( ! s_table.set ( 8,4,255,NULL,0,false,0,"enttbl" ) )
			return log("build: Could not init table of "
					   "HTML entities.");
		// now add in all the HTML entities
		int32_t n = (int32_t)sizeof(s_entities) / (int32_t)sizeof(Entity);
		for ( int32_t i = 0 ; i < n ; i++ ) {
			int64_t h = hash64b ( s_entities[i].entity );

			// grab the unicode code point
			UChar32 up = s_entities[i].unicode;

			// sanity: the code point must be non-zero
			if ( ! up ) { char *xx=NULL;*xx=0; }

			// point to it
			char *buf = (char *)s_entities[i].utf8;

			// if uchar32 not 0 then set the utf8 with it
			int32_t len = utf8Encode(up,buf);

			//
			// make my own mods to make parsing easier
			//

			if ( up == 160 ) {  // nbsp
				buf[0] = ' ';
				len = 1;
			}

			//
			// end custom mods
			//

			// set length
			s_entities[i].utf8Len = len;
			// check it
			if ( len == 0 ) { char *xx=NULL;*xx=0; }
			// must not exist!
			if ( s_table.isInTable(&h) ) { char*xx=NULL;*xx=0;}
			// store the entity index in the hash table as score
			if ( ! s_table.addTerm ( &h, i+1 ) ) return false;
		}
		s_isInitialized = true;
	} 
	return true;
}
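// The table above maps hash64b(entity name) to index+1, so a lookup can
// recover the Entity row and the UTF-8 form built during init. The helper
// below is a hypothetical sketch, not part of the original file.
static const char *lookupEntityUtf8 ( char *entity , int32_t *utf8Len ) {
	int64_t h = hash64b ( entity );
	int32_t i = (int32_t) s_table.getScore ( &h );
	if ( i == 0 ) return NULL;              // not a known entity
	*utf8Len = s_entities[i-1].utf8Len;     // score is index+1
	return (const char *)s_entities[i-1].utf8;
}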
bool AdultBit::isObscene ( char *s , int32_t len ) {

	static bool       s_isInitialized = false;
	static char      *s_obscene[] = {
		"c**t",
		"clits",
//		"cum",    magna cum laude
		"cums",
		"cumshot",
		"c**t",
		"cunts",
		"milf",
		"rimjob",
		"felch",
		"f**k",
		"f****d",
		"f****r",
		"f*****g",
		"f***s",
		"w***e",
		"w****s"
	};

	if ( ! s_isInitialized ) {
		// set up the hash table
		if ( ! s_otable.set ( 8,4,sizeof(s_obscene)*2,NULL,0,false,0,
				      "obscenetab") ) 
			return log("build: Error initializing "
				    "obscene word hash table." );
		// now add in all the obscene words
		int32_t n = sizeof(s_obscene) / sizeof(char *);
		for ( int32_t i = 0 ; i < n ; i++ ) {
			int64_t h = hash64b ( s_obscene[i] );
			if ( ! s_otable.addTerm ( &h, i+1 ) ) return false;
		}
		s_isInitialized = true;
	} 

	// compute the hash of the word "s"
	int64_t h = hash64Lower_a ( s , len );

	// get from table
	return s_otable.getScore ( &h );
}		
nodeid_t getTagId ( char *s , NodeType **retp ) {

	// init table?
	static bool s_init = false;
	static HashTableX  s_ht;
	static char s_buf[10000];
	if ( ! s_init ) {
		s_init = true;
		s_ht.set ( 4 ,4,1024,s_buf,10000,false,0,"tagids");//niceness=0
		// how many NodeTypes do we have in g_nodes?
		static int32_t nn = sizeof(g_nodes) / sizeof(NodeType);
		// set the hash table
		for ( int32_t i = 0 ; i < nn ; i++ ) {
			char *name = g_nodes[i].m_nodeName;
			int32_t  nlen = gbstrlen(name);
			int64_t h = hash64Upper_a ( name,nlen,0LL );
			NodeType *nt = &g_nodes[i];
			if ( ! s_ht.addKey(&h,&nt) ) { 
				char *xx=NULL;*xx=0; }
		}
		// sanity
		if ( s_ht.m_numSlots != 1024 ) { char *xx=NULL;*xx=0; }
		// sanity test
		nodeid_t tt = getTagId ( "br" );
		if ( tt != TAG_BR ) { char *xx=NULL;*xx=0; }
	}


	// find end of tag name. hyphens are ok to be in name.
	// facebook uses underscores like <start_time>
	char *e = s; for ( ; *e && (is_alnum_a(*e) || *e=='-'|| *e=='_'); e++);
	// hash it for lookup
	int64_t h = hash64Upper_a ( s , e - s , 0 );
	// look it up
	NodeType **ntp = (NodeType **)s_ht.getValue(&h);
	// assume none
	if ( retp ) *retp = NULL;
	// none?
	if ( ! ntp ) return 0;
	// got one
	if ( retp ) *retp = *ntp;
	// get id otherwise
	return (*ntp)->m_nodeId;
}
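// A usage sketch for the tag-id lookup; "br" mirrors the sanity test above,
// and the wrapper function name is hypothetical.
static void exampleTagLookup ( ) {
	NodeType *nt = NULL;
	nodeid_t id = getTagId ( (char *)"br" , &nt );
	// per the sanity test above, id is TAG_BR and nt points at its
	// g_nodes[] entry; unknown names return 0 and leave nt NULL
	if ( id != TAG_BR || ! nt ) log("build: unexpected tag lookup result");
}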
// . how many keys are dups
// . returns -1 on error
long HashTableX::getNumDups() {
	if ( ! m_allowDups ) return 0;
	HashTableX tmp;
	if ( ! tmp.set ( m_ks, 0, m_numSlots, NULL , 0 , false , m_niceness,
			 "htxtmp") )
		return -1;
	// put into that table
	for ( long i = 0 ; i < m_numSlots ; i++ ) {
		// skip empty bucket
		if ( ! m_flags[i] ) continue;
		// get the key
		char *kp = (char *)getKeyFromSlot(i);
		// add to new table
		if ( ! tmp.addKey ( kp ) ) return -1;
	}
	// the uniques
	long uniques = tmp.m_numSlotsUsed;
	// the dups
	long dups = m_numSlotsUsed - uniques;
	// that's it
	return dups;
}
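// getNumDups() works by re-inserting every key into a temporary table that
// disallows dups: uniques = slots used in the copy, dups = used - uniques.
// The little table below is an illustrative sketch only.
static void exampleDupCount ( ) {
	HashTableX ht;
	ht.set ( 4 , 0 , 32 , NULL , 0 , true /*allowDups*/ , 0 , "duptst" );
	int32_t k = 7;
	ht.addKey ( &k );
	ht.addKey ( &k );
	long dups = ht.getNumDups(); // 1 here; -1 if the temp table allocation failed
	log("build: %li dup keys", dups);
}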
// get the id from a 2 character country code
uint8_t getCountryId ( char *cc ) {
	static bool s_init = false;
	static char buf[2000];
	static HashTableX ht;
	char tmp[4];
	if ( ! s_init ) {
		s_init = true;
		// hash them up
		ht.set ( 4 , 1 , -1,buf,2000,false,MAX_NICENESS,"ctryids");
		// now add in all the country codes
		long n = (long) sizeof(s_countryCode) / sizeof(char *); 
		for ( long i = 0 ; i < n ; i++ ) {
			char *s    = (char *)s_countryCode[i];
			//long  slen = gbstrlen ( s );
			// sanity check
			if ( !s[0] || !s[1] || s[2]) { char *xx=NULL;*xx=0; }
			// map it to a 4 byte key
			tmp[0]=s[0];
			tmp[1]=s[1];
			tmp[2]=0;
			tmp[3]=0;
			// a val of 0 does not mean empty in HashTableX,
			// that is an artifact of HashTableT
			uint8_t val = i; // raw index; no +1 needed here
			if ( ! ht.addKey ( tmp , &val ) ) {
				char *xx=NULL;*xx=0; }
		}
	}
	// lookup
	tmp[0]=to_lower_a(cc[0]);
	tmp[1]=to_lower_a(cc[1]);
	tmp[2]=0;
	tmp[3]=0;
	long slot = ht.getSlot ( tmp );
	if ( slot < 0 ) return 0;
	void *val = ht.getValueFromSlot ( slot );
	return *(uint8_t *)val ;
}
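// A lookup sketch; the country-code literal is illustrative and assumes it
// appears in s_countryCode[]. The lookup is case-insensitive because both
// letters are lowercased before probing the table.
static void exampleCountryLookup ( ) {
	uint8_t cid = getCountryId ( (char *)"de" );
	if ( cid == 0 ) log("build: unknown country code");
}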
static bool initEntityTable(){
	if ( ! s_isInitialized ) {
		// set up the hash table
		if ( ! s_table.set ( 8,4,4096,NULL,0,false,"enttbl" ) ) {
			log("build: Could not init table of HTML entities.");
			return false;
		}

		// now add in all the html entities
		const int32_t n = (int32_t)sizeof(s_entities) / (int32_t)sizeof(Entity);
		for ( int32_t i = 0 ; i < n ; i++ ) {
			int64_t h = hash64b ( s_entities[i].entity );

			// convert the unicode codepoints to an utf8 string
			char *buf = (char *)s_entities[i].utf8;
			for(int j=0; j<s_entities[i].codepoints; j++) {
				UChar32 codepoint = s_entities[i].codepoint[j];
				int32_t len = utf8Encode(codepoint,buf);
				if ( len == 0 ) { g_process.shutdownAbort(true); }
				
				// make modification to make parsing easier
				if ( codepoint == 160 ) {  // nbsp
					buf[0] = ' ';
					len = 1;
				}
				buf += len;
				
			}
			s_entities[i].utf8Len = (size_t)(buf-s_entities[i].utf8);
			// must not exist!
			if ( s_table.isInTable(&h) ) { g_process.shutdownAbort(true);}
			// store the entity index in the hash table as score
			if ( ! s_table.addTerm(h, i+1) ) return false;
		}
		s_isInitialized = true;
	} 
	return true;
}
bool ucInit(char *path, bool verifyFiles){

	char file[384];
	if (path == NULL) path = "./";

	// Might want to move this out of ucInit someday
	// but right now it's the only thing that uses .so files (?)
	char gbLibDir[512];
	snprintf(gbLibDir, 512, "%s/lib",path);
	// i don't think this is used any more because we don't have it!
	//log(LOG_INIT, "ucinit: Setting LD_RUN_PATH to \"%s\"",gbLibDir);
	if (setenv("LD_RUN_PATH", gbLibDir, 1)){
		log(LOG_INIT, "Failed to set LD_RUN_PATH");
	}
	//char *ldpath = getenv("LD_RUN_PATH");
	// i don't think this is used any more because we don't have it!
	//log(LOG_DEBUG, "ucinit: LD_RUN_PATH: %s\n", ldpath);


	strcpy(file, path);
	strcat(file, "/ucdata/uppermap.dat");
	if (!loadUnicodeTable(&g_ucUpperMap,file, 
			      VERIFY_UNICODE_CHECKSUMS, 
			      CHKSUM_UPPERMAP))
		goto failed;
	strcpy(file, path);
	strcat(file, "/ucdata/lowermap.dat");
	if (!loadUnicodeTable(&g_ucLowerMap,file, 
			      VERIFY_UNICODE_CHECKSUMS, 
			      CHKSUM_LOWERMAP))
		goto failed;
	strcpy(file, path);
	strcat(file, "/ucdata/properties.dat");
	if (!loadUnicodeTable(&g_ucProps, file, 
			      VERIFY_UNICODE_CHECKSUMS, 
			      CHKSUM_PROPERTIES))
		goto failed;
	strcpy(file, path);
	strcat(file, "/ucdata/combiningclass.dat");
	if (!loadUnicodeTable(&g_ucCombiningClass, file, 
			      VERIFY_UNICODE_CHECKSUMS, 
			      CHKSUM_COMBININGCLASS))
		goto failed;
	strcpy(file, path);
	strcat(file, "/ucdata/scripts.dat");
	if (!loadUnicodeTable(&g_ucScripts, file, 
			      VERIFY_UNICODE_CHECKSUMS, 
			      CHKSUM_SCRIPTS))
		goto failed;
	// MDW: do we need this for converting from X to utf8? or for
	// the is_alnum(), etc. functions?
	if (!loadDecompTables(path) ||
	    !initCompositionTable())
		goto failed;
	//s_convTable.set(1024);
	if ( ! s_convTable.set(4,sizeof(iconv_t),1024,NULL,0,false,0,"cnvtbl"))
		goto failed;
	
	// dont use these files anymore
	if (verifyFiles){
		if (!openIconvDescriptors())
			return log(LOG_WARN,
				   "uni: unable to open all iconv descriptors");
	}		

	return true;
	
failed:
	return log(LOG_WARN, 
		   "uni: unable to load all property tables");
}
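// ucInit() is normally called once at startup with the directory holding the
// ucdata/ files; the directory expression and wrapper name below are
// illustrative assumptions.
static bool exampleUnicodeStartup ( ) {
	if ( ! ucInit ( g_hostdb.m_dir , true ) )
		return log(LOG_WARN, "uni: failed to initialize unicode tables");
	return true;
}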
static bool isTLD ( char *tld , int32_t tldLen ) {

	int32_t pcount = 0;
	// now they are random!
	for ( int32_t i = 0 ; i < tldLen ; i++ ) {
		// period count
		if ( tld[i] == '.' ) { pcount++; continue; }
		if ( ! is_alnum_a(tld[i]) && tld[i] != '-' ) return false;
	}

	if ( pcount == 0 ) return true;
	if ( pcount >= 2 ) return false;

	// otherwise, if one period, check table to see if qualified

	// we use this as our hashtable
	static bool       s_isInitialized = false;
	// . i shrunk this list a lot
	// . see backups for the hold list
	static const char * const s_tlds[] = {

	  // From: https://data.iana.org/TLD/tlds-alpha-by-domain.txt
	"AAA",
	"AARP",
	"ABB",
	"ABBOTT",
	"ABBVIE",
	"ABOGADO",
	"ABUDHABI",
	"AC",
	"ACADEMY",
	"ACCENTURE",
	"ACCOUNTANT",
	"ACCOUNTANTS",
	"ACO",
	"ACTIVE",
	"ACTOR",
	"AD",
	"ADAC",
	"ADS",
	"ADULT",
	"AE",
	"AEG",
	"AERO",
	"AF",
	"AFL",
	"AG",
	"AGAKHAN",
	"AGENCY",
	"AI",
	"AIG",
	"AIRFORCE",
	"AIRTEL",
	"AKDN",
	"AL",
	"ALIBABA",
	"ALIPAY",
	"ALLFINANZ",
	"ALLY",
	"ALSACE",
	"AM",
	"AMICA",
	"AMSTERDAM",
	"ANALYTICS",
	"ANDROID",
	"ANQUAN",
	"AO",
	"APARTMENTS",
	"APP",
	"APPLE",
	"AQ",
	"AQUARELLE",
	"AR",
	"ARAMCO",
	"ARCHI",
	"ARMY",
	"ARPA",
	"ARTE",
	"AS",
	"ASIA",
	"ASSOCIATES",
	"AT",
	"ATTORNEY",
	"AU",
	"AUCTION",
	"AUDI",
	"AUDIO",
	"AUTHOR",
	"AUTO",
	"AUTOS",
	"AVIANCA",
	"AW",
	"AWS",
	"AX",
	"AXA",
	"AZ",
	"AZURE",
	"BA",
	"BABY",
	"BAIDU",
	"BAND",
	"BANK",
	"BAR",
	"BARCELONA",
	"BARCLAYCARD",
	"BARCLAYS",
	"BAREFOOT",
	"BARGAINS",
	"BAUHAUS",
	"BAYERN",
	"BB",
	"BBC",
	"BBVA",
	"BCG",
	"BCN",
	"BD",
	"BE",
	"BEATS",
	"BEER",
	"BENTLEY",
	"BERLIN",
	"BEST",
	"BET",
	"BF",
	"BG",
	"BH",
	"BHARTI",
	"BI",
	"BIBLE",
	"BID",
	"BIKE",
	"BING",
	"BINGO",
	"BIO",
	"BIZ",
	"BJ",
	"BLACK",
	"BLACKFRIDAY",
	"BLOOMBERG",
	"BLUE",
	"BM",
	"BMS",
	"BMW",
	"BN",
	"BNL",
	"BNPPARIBAS",
	"BO",
	"BOATS",
	"BOEHRINGER",
	"BOM",
	"BOND",
	"BOO",
	"BOOK",
	"BOOTS",
	"BOSCH",
	"BOSTIK",
	"BOT",
	"BOUTIQUE",
	"BR",
	"BRADESCO",
	"BRIDGESTONE",
	"BROADWAY",
	"BROKER",
	"BROTHER",
	"BRUSSELS",
	"BS",
	"BT",
	"BUDAPEST",
	"BUGATTI",
	"BUILD",
	"BUILDERS",
	"BUSINESS",
	"BUY",
	"BUZZ",
	"BV",
	"BW",
	"BY",
	"BZ",
	"BZH",
	"CA",
	"CAB",
	"CAFE",
	"CAL",
	"CALL",
	"CAMERA",
	"CAMP",
	"CANCERRESEARCH",
	"CANON",
	"CAPETOWN",
	"CAPITAL",
	"CAR",
	"CARAVAN",
	"CARDS",
	"CARE",
	"CAREER",
	"CAREERS",
	"CARS",
	"CARTIER",
	"CASA",
	"CASH",
	"CASINO",
	"CAT",
	"CATERING",
	"CBA",
	"CBN",
	"CC",
	"CD",
	"CEB",
	"CENTER",
	"CEO",
	"CERN",
	"CF",
	"CFA",
	"CFD",
	"CG",
	"CH",
	"CHANEL",
	"CHANNEL",
	"CHASE",
	"CHAT",
	"CHEAP",
	"CHLOE",
	"CHRISTMAS",
	"CHROME",
	"CHURCH",
	"CI",
	"CIPRIANI",
	"CIRCLE",
	"CISCO",
	"CITIC",
	"CITY",
	"CITYEATS",
	"CK",
	"CL",
	"CLAIMS",
	"CLEANING",
	"CLICK",
	"CLINIC",
	"CLINIQUE",
	"CLOTHING",
	"CLOUD",
	"CLUB",
	"CLUBMED",
	"CM",
	"CN",
	"CO",
	"COACH",
	"CODES",
	"COFFEE",
	"COLLEGE",
	"COLOGNE",
	"COM",
	"COMMBANK",
	"COMMUNITY",
	"COMPANY",
	"COMPARE",
	"COMPUTER",
	"COMSEC",
	"CONDOS",
	"CONSTRUCTION",
	"CONSULTING",
	"CONTACT",
	"CONTRACTORS",
	"COOKING",
	"COOL",
	"COOP",
	"CORSICA",
	"COUNTRY",
	"COUPON",
	"COUPONS",
	"COURSES",
	"CR",
	"CREDIT",
	"CREDITCARD",
	"CREDITUNION",
	"CRICKET",
	"CROWN",
	"CRS",
	"CRUISES",
	"CSC",
	"CU",
	"CUISINELLA",
	"CV",
	"CW",
	"CX",
	"CY",
	"CYMRU",
	"CYOU",
	"CZ",
	"DABUR",
	"DAD",
	"DANCE",
	"DATE",
	"DATING",
	"DATSUN",
	"DAY",
	"DCLK",
	"DE",
	"DEALER",
	"DEALS",
	"DEGREE",
	"DELIVERY",
	"DELL",
	"DELOITTE",
	"DELTA",
	"DEMOCRAT",
	"DENTAL",
	"DENTIST",
	"DESI",
	"DESIGN",
	"DEV",
	"DIAMONDS",
	"DIET",
	"DIGITAL",
	"DIRECT",
	"DIRECTORY",
	"DISCOUNT",
	"DJ",
	"DK",
	"DM",
	"DNP",
	"DO",
	"DOCS",
	"DOG",
	"DOHA",
	"DOMAINS",
	"DOWNLOAD",
	"DRIVE",
	"DUBAI",
	"DURBAN",
	"DVAG",
	"DZ",
	"EARTH",
	"EAT",
	"EC",
	"EDEKA",
	"EDU",
	"EDUCATION",
	"EE",
	"EG",
	"EMAIL",
	"EMERCK",
	"ENERGY",
	"ENGINEER",
	"ENGINEERING",
	"ENTERPRISES",
	"EPSON",
	"EQUIPMENT",
	"ER",
	"ERNI",
	"ES",
	"ESQ",
	"ESTATE",
	"ET",
	"EU",
	"EUROVISION",
	"EUS",
	"EVENTS",
	"EVERBANK",
	"EXCHANGE",
	"EXPERT",
	"EXPOSED",
	"EXPRESS",
	"EXTRASPACE",
	"FAGE",
	"FAIL",
	"FAIRWINDS",
	"FAITH",
	"FAMILY",
	"FAN",
	"FANS",
	"FARM",
	"FASHION",
	"FAST",
	"FEEDBACK",
	"FERRERO",
	"FI",
	"FILM",
	"FINAL",
	"FINANCE",
	"FINANCIAL",
	"FIRESTONE",
	"FIRMDALE",
	"FISH",
	"FISHING",
	"FIT",
	"FITNESS",
	"FJ",
	"FK",
	"FLICKR",
	"FLIGHTS",
	"FLORIST",
	"FLOWERS",
	"FLSMIDTH",
	"FLY",
	"FM",
	"FO",
	"FOO",
	"FOOTBALL",
	"FORD",
	"FOREX",
	"FORSALE",
	"FORUM",
	"FOUNDATION",
	"FOX",
	"FR",
	"FRESENIUS",
	"FRL",
	"FROGANS",
	"FRONTIER",
	"FTR",
	"FUND",
	"FURNITURE",
	"FUTBOL",
	"FYI",
	"GA",
	"GAL",
	"GALLERY",
	"GALLO",
	"GALLUP",
	"GAME",
	"GARDEN",
	"GB",
	"GBIZ",
	"GD",
	"GDN",
	"GE",
	"GEA",
	"GENT",
	"GENTING",
	"GF",
	"GG",
	"GGEE",
	"GH",
	"GI",
	"GIFT",
	"GIFTS",
	"GIVES",
	"GIVING",
	"GL",
	"GLASS",
	"GLE",
	"GLOBAL",
	"GLOBO",
	"GM",
	"GMAIL",
	"GMBH",
	"GMO",
	"GMX",
	"GN",
	"GOLD",
	"GOLDPOINT",
	"GOLF",
	"GOO",
	"GOOG",
	"GOOGLE",
	"GOP",
	"GOT",
	"GOV",
	"GP",
	"GQ",
	"GR",
	"GRAINGER",
	"GRAPHICS",
	"GRATIS",
	"GREEN",
	"GRIPE",
	"GROUP",
	"GS",
	"GT",
	"GU",
	"GUCCI",
	"GUGE",
	"GUIDE",
	"GUITARS",
	"GURU",
	"GW",
	"GY",
	"HAMBURG",
	"HANGOUT",
	"HAUS",
	"HDFCBANK",
	"HEALTH",
	"HEALTHCARE",
	"HELP",
	"HELSINKI",
	"HERE",
	"HERMES",
	"HIPHOP",
	"HITACHI",
	"HIV",
	"HK",
	"HM",
	"HN",
	"HOCKEY",
	"HOLDINGS",
	"HOLIDAY",
	"HOMEDEPOT",
	"HOMES",
	"HONDA",
	"HORSE",
	"HOST",
	"HOSTING",
	"HOTELES",
	"HOTMAIL",
	"HOUSE",
	"HOW",
	"HR",
	"HSBC",
	"HT",
	"HTC",
	"HU",
	"HYUNDAI",
	"IBM",
	"ICBC",
	"ICE",
	"ICU",
	"ID",
	"IE",
	"IFM",
	"IINET",
	"IL",
	"IM",
	"IMAMAT",
	"IMMO",
	"IMMOBILIEN",
	"IN",
	"INDUSTRIES",
	"INFINITI",
	"INFO",
	"ING",
	"INK",
	"INSTITUTE",
	"INSURANCE",
	"INSURE",
	"INT",
	"INTERNATIONAL",
	"INVESTMENTS",
	"IO",
	"IPIRANGA",
	"IQ",
	"IR",
	"IRISH",
	"IS",
	"ISELECT",
	"ISMAILI",
	"IST",
	"ISTANBUL",
	"IT",
	"ITAU",
	"IWC",
	"JAGUAR",
	"JAVA",
	"JCB",
	"JCP",
	"JE",
	"JETZT",
	"JEWELRY",
	"JLC",
	"JLL",
	"JM",
	"JMP",
	"JNJ",
	"JO",
	"JOBS",
	"JOBURG",
	"JOT",
	"JOY",
	"JP",
	"JPMORGAN",
	"JPRS",
	"JUEGOS",
	"KAUFEN",
	"KDDI",
	"KE",
	"KERRYHOTELS",
	"KERRYLOGISTICS",
	"KERRYPROPERTIES",
	"KFH",
	"KG",
	"KH",
	"KI",
	"KIA",
	"KIM",
	"KINDER",
	"KITCHEN",
	"KIWI",
	"KM",
	"KN",
	"KOELN",
	"KOMATSU",
	"KP",
	"KPMG",
	"KPN",
	"KR",
	"KRD",
	"KRED",
	"KUOKGROUP",
	"KW",
	"KY",
	"KYOTO",
	"KZ",
	"LA",
	"LACAIXA",
	"LAMBORGHINI",
	"LAMER",
	"LANCASTER",
	"LAND",
	"LANDROVER",
	"LANXESS",
	"LASALLE",
	"LAT",
	"LATROBE",
	"LAW",
	"LAWYER",
	"LB",
	"LC",
	"LDS",
	"LEASE",
	"LECLERC",
	"LEGAL",
	"LEXUS",
	"LGBT",
	"LI",
	"LIAISON",
	"LIDL",
	"LIFE",
	"LIFEINSURANCE",
	"LIFESTYLE",
	"LIGHTING",
	"LIKE",
	"LIMITED",
	"LIMO",
	"LINCOLN",
	"LINDE",
	"LINK",
	"LIPSY",
	"LIVE",
	"LIVING",
	"LIXIL",
	"LK",
	"LOAN",
	"LOANS",
	"LOCUS",
	"LOL",
	"LONDON",
	"LOTTE",
	"LOTTO",
	"LOVE",
	"LR",
	"LS",
	"LT",
	"LTD",
	"LTDA",
	"LU",
	"LUPIN",
	"LUXE",
	"LUXURY",
	"LV",
	"LY",
	"MA",
	"MADRID",
	"MAIF",
	"MAISON",
	"MAKEUP",
	"MAN",
	"MANAGEMENT",
	"MANGO",
	"MARKET",
	"MARKETING",
	"MARKETS",
	"MARRIOTT",
	"MBA",
	"MC",
	"MD",
	"ME",
	"MED",
	"MEDIA",
	"MEET",
	"MELBOURNE",
	"MEME",
	"MEMORIAL",
	"MEN",
	"MENU",
	"MEO",
	"MG",
	"MH",
	"MIAMI",
	"MICROSOFT",
	"MIL",
	"MINI",
	"MK",
	"ML",
	"MLS",
	"MM",
	"MMA",
	"MN",
	"MO",
	"MOBI",
	"MOBILY",
	"MODA",
	"MOE",
	"MOI",
	"MOM",
	"MONASH",
	"MONEY",
	"MONTBLANC",
	"MORMON",
	"MORTGAGE",
	"MOSCOW",
	"MOTORCYCLES",
	"MOV",
	"MOVIE",
	"MOVISTAR",
	"MP",
	"MQ",
	"MR",
	"MS",
	"MT",
	"MTN",
	"MTPC",
	"MTR",
	"MU",
	"MUSEUM",
	"MUTUAL",
	"MUTUELLE",
	"MV",
	"MW",
	"MX",
	"MY",
	"MZ",
	"NA",
	"NADEX",
	"NAGOYA",
	"NAME",
	"NATURA",
	"NAVY",
	"NC",
	"NE",
	"NEC",
	"NET",
	"NETBANK",
	"NETWORK",
	"NEUSTAR",
	"NEW",
	"NEWS",
	"NEXT",
	"NEXTDIRECT",
	"NEXUS",
	"NF",
	"NG",
	"NGO",
	"NHK",
	"NI",
	"NICO",
	"NIKON",
	"NINJA",
	"NISSAN",
	"NISSAY",
	"NL",
	"NO",
	"NOKIA",
	"NORTHWESTERNMUTUAL",
	"NORTON",
	"NOWRUZ",
	"NP",
	"NR",
	"NRA",
	"NRW",
	"NTT",
	"NU",
	"NYC",
	"NZ",
	"OBI",
	"OFFICE",
	"OKINAWA",
	"OLAYAN",
	"OM",
	"OMEGA",
	"ONE",
	"ONG",
	"ONL",
	"ONLINE",
	"OOO",
	"ORACLE",
	"ORANGE",
	"ORG",
	"ORGANIC",
	"ORIGINS",
	"OSAKA",
	"OTSUKA",
	"OVH",
	"PA",
	"PAGE",
	"PAMPEREDCHEF",
	"PANERAI",
	"PARIS",
	"PARS",
	"PARTNERS",
	"PARTS",
	"PARTY",
	"PASSAGENS",
	"PE",
	"PET",
	"PF",
	"PG",
	"PH",
	"PHARMACY",
	"PHILIPS",
	"PHOTO",
	"PHOTOGRAPHY",
	"PHOTOS",
	"PHYSIO",
	"PIAGET",
	"PICS",
	"PICTET",
	"PICTURES",
	"PID",
	"PIN",
	"PING",
	"PINK",
	"PIZZA",
	"PK",
	"PL",
	"PLACE",
	"PLAY",
	"PLAYSTATION",
	"PLUMBING",
	"PLUS",
	"PM",
	"PN",
	"POHL",
	"POKER",
	"P**N",
	"POST",
	"PR",
	"PRAXI",
	"PRESS",
	"PRO",
	"PROD",
	"PRODUCTIONS",
	"PROF",
	"PROGRESSIVE",
	"PROMO",
	"PROPERTIES",
	"PROPERTY",
	"PROTECTION",
	"PS",
	"PT",
	"PUB",
	"PW",
	"PWC",
	"PY",
	"QA",
	"QPON",
	"QUEBEC",
	"QUEST",
	"RACING",
	"RE",
	"READ",
	"REALTOR",
	"REALTY",
	"RECIPES",
	"RED",
	"REDSTONE",
	"REDUMBRELLA",
	"REHAB",
	"REISE",
	"REISEN",
	"REIT",
	"REN",
	"RENT",
	"RENTALS",
	"REPAIR",
	"REPORT",
	"REPUBLICAN",
	"REST",
	"RESTAURANT",
	"REVIEW",
	"REVIEWS",
	"REXROTH",
	"RICH",
	"RICOH",
	"RIO",
	"RIP",
	"RO",
	"ROCHER",
	"ROCKS",
	"RODEO",
	"ROOM",
	"RS",
	"RSVP",
	"RU",
	"RUHR",
	"RUN",
	"RW",
	"RWE",
	"RYUKYU",
	"SA",
	"SAARLAND",
	"SAFE",
	"SAFETY",
	"SAKURA",
	"SALE",
	"SALON",
	"SAMSUNG",
	"SANDVIK",
	"SANDVIKCOROMANT",
	"SANOFI",
	"SAP",
	"SAPO",
	"SARL",
	"SAS",
	"SAXO",
	"SB",
	"SBI",
	"SBS",
	"SC",
	"SCA",
	"SCB",
	"SCHAEFFLER",
	"SCHMIDT",
	"SCHOLARSHIPS",
	"SCHOOL",
	"SCHULE",
	"SCHWARZ",
	"SCIENCE",
	"SCOR",
	"SCOT",
	"SD",
	"SE",
	"SEAT",
	"SECURITY",
	"SEEK",
	"SELECT",
	"SENER",
	"SERVICES",
	"SEVEN",
	"SEW",
	"SEX",
	"SEXY",
	"SFR",
	"SG",
	"SH",
	"SHARP",
	"SHAW",
	"SHELL",
	"SHIA",
	"SHIKSHA",
	"SHOES",
	"SHOUJI",
	"SHOW",
	"SHRIRAM",
	"SI",
	"SINA",
	"SINGLES",
	"SITE",
	"SJ",
	"SK",
	"SKI",
	"SKIN",
	"SKY",
	"SKYPE",
	"SL",
	"SM",
	"SMILE",
	"SN",
	"SNCF",
	"SO",
	"SOCCER",
	"SOCIAL",
	"SOFTBANK",
	"SOFTWARE",
	"SOHU",
	"SOLAR",
	"SOLUTIONS",
	"SONG",
	"SONY",
	"SOY",
	"SPACE",
	"SPIEGEL",
	"SPOT",
	"SPREADBETTING",
	"SR",
	"SRL",
	"ST",
	"STADA",
	"STAR",
	"STARHUB",
	"STATEBANK",
	"STATEFARM",
	"STATOIL",
	"STC",
	"STCGROUP",
	"STOCKHOLM",
	"STORAGE",
	"STORE",
	"STREAM",
	"STUDIO",
	"STUDY",
	"STYLE",
	"SU",
	"SUCKS",
	"SUPPLIES",
	"SUPPLY",
	"SUPPORT",
	"SURF",
	"SURGERY",
	"SUZUKI",
	"SV",
	"SWATCH",
	"SWISS",
	"SX",
	"SY",
	"SYDNEY",
	"SYMANTEC",
	"SYSTEMS",
	"SZ",
	"TAB",
	"TAIPEI",
	"TALK",
	"TAOBAO",
	"TATAMOTORS",
	"TATAR",
	"TATTOO",
	"TAX",
	"TAXI",
	"TC",
	"TCI",
	"TD",
	"TEAM",
	"TECH",
	"TECHNOLOGY",
	"TEL",
	"TELECITY",
	"TELEFONICA",
	"TEMASEK",
	"TENNIS",
	"TEVA",
	"TF",
	"TG",
	"TH",
	"THD",
	"THEATER",
	"THEATRE",
	"TICKETS",
	"TIENDA",
	"TIFFANY",
	"TIPS",
	"TIRES",
	"TIROL",
	"TJ",
	"TK",
	"TL",
	"TM",
	"TMALL",
	"TN",
	"TO",
	"TODAY",
	"TOKYO",
	"TOOLS",
	"TOP",
	"TORAY",
	"TOSHIBA",
	"TOTAL",
	"TOURS",
	"TOWN",
	"TOYOTA",
	"TOYS",
	"TR",
	"TRADE",
	"TRADING",
	"TRAINING",
	"TRAVEL",
	"TRAVELERS",
	"TRAVELERSINSURANCE",
	"TRUST",
	"TRV",
	"TT",
	"TUBE",
	"TUI",
	"TUNES",
	"TUSHU",
	"TV",
	"TVS",
	"TW",
	"TZ",
	"UA",
	"UBS",
	"UG",
	"UK",
	"UNICOM",
	"UNIVERSITY",
	"UNO",
	"UOL",
	"US",
	"UY",
	"UZ",
	"VA",
	"VACATIONS",
	"VANA",
	"VC",
	"VE",
	"VEGAS",
	"VENTURES",
	"VERISIGN",
	"VERSICHERUNG",
	"VET",
	"VG",
	"VI",
	"VIAJES",
	"VIDEO",
	"VIG",
	"VIKING",
	"VILLAS",
	"VIN",
	"VIP",
	"VIRGIN",
	"VISION",
	"VISTA",
	"VISTAPRINT",
	"VIVA",
	"VLAANDEREN",
	"VN",
	"VODKA",
	"VOLKSWAGEN",
	"VOTE",
	"VOTING",
	"VOTO",
	"VOYAGE",
	"VU",
	"VUELOS",
	"WALES",
	"WALTER",
	"WANG",
	"WANGGOU",
	"WARMAN",
	"WATCH",
	"WATCHES",
	"WEATHER",
	"WEATHERCHANNEL",
	"WEBCAM",
	"WEBER",
	"WEBSITE",
	"WED",
	"WEDDING",
	"WEIBO",
	"WEIR",
	"WF",
	"WHOSWHO",
	"WIEN",
	"WIKI",
	"WILLIAMHILL",
	"WIN",
	"WINDOWS",
	"WINE",
	"WME",
	"WOLTERSKLUWER",
	"WORK",
	"WORKS",
	"WORLD",
	"WS",
	"WTC",
	"WTF",
	"XBOX",
	"XEROX",
	"XIHUAN",
	"XIN",
	"XN--11B4C3D",
	"XN--1CK2E1B",
	"XN--1QQW23A",
	"XN--30RR7Y",
	"XN--3BST00M",
	"XN--3DS443G",
	"XN--3E0B707E",
	"XN--3PXU8K",
	"XN--42C2D9A",
	"XN--45BRJ9C",
	"XN--45Q11C",
	"XN--4GBRIM",
	"XN--55QW42G",
	"XN--55QX5D",
	"XN--5TZM5G",
	"XN--6FRZ82G",
	"XN--6QQ986B3XL",
	"XN--80ADXHKS",
	"XN--80AO21A",
	"XN--80ASEHDB",
	"XN--80ASWG",
	"XN--8Y0A063A",
	"XN--90A3AC",
	"XN--90AIS",
	"XN--9DBQ2A",
	"XN--9ET52U",
	"XN--9KRT00A",
	"XN--B4W605FERD",
	"XN--BCK1B9A5DRE4C",
	"XN--C1AVG",
	"XN--C2BR7G",
	"XN--CCK2B3B",
	"XN--CG4BKI",
	"XN--CLCHC0EA0B2G2A9GCD",
	"XN--CZR694B",
	"XN--CZRS0T",
	"XN--CZRU2D",
	"XN--D1ACJ3B",
	"XN--D1ALF",
	"XN--E1A4C",
	"XN--ECKVDTC9D",
	"XN--EFVY88H",
	"XN--ESTV75G",
	"XN--FCT429K",
	"XN--FHBEI",
	"XN--FIQ228C5HS",
	"XN--FIQ64B",
	"XN--FIQS8S",
	"XN--FIQZ9S",
	"XN--FJQ720A",
	"XN--FLW351E",
	"XN--FPCRJ9C3D",
	"XN--FZC2C9E2C",
	"XN--G2XX48C",
	"XN--GCKR3F0F",
	"XN--GECRJ9C",
	"XN--H2BRJ9C",
	"XN--HXT814E",
	"XN--I1B6B1A6A2E",
	"XN--IMR513N",
	"XN--IO0A7I",
	"XN--J1AEF",
	"XN--J1AMH",
	"XN--J6W193G",
	"XN--JLQ61U9W7B",
	"XN--JVR189M",
	"XN--KCRX77D1X4A",
	"XN--KPRW13D",
	"XN--KPRY57D",
	"XN--KPU716F",
	"XN--KPUT3I",
	"XN--L1ACC",
	"XN--LGBBAT1AD8J",
	"XN--MGB9AWBF",
	"XN--MGBA3A3EJT",
	"XN--MGBA3A4F16A",
	"XN--MGBA7C0BBN0A",
	"XN--MGBAAM7A8H",
	"XN--MGBAB2BD",
	"XN--MGBAYH7GPA",
	"XN--MGBB9FBPOB",
	"XN--MGBBH1A71E",
	"XN--MGBC0A9AZCG",
	"XN--MGBCA7DZDO",
	"XN--MGBERP4A5D4AR",
	"XN--MGBPL2FH",
	"XN--MGBT3DHD",
	"XN--MGBTX2B",
	"XN--MGBX4CD0AB",
	"XN--MIX891F",
	"XN--MK1BU44C",
	"XN--MXTQ1M",
	"XN--NGBC5AZD",
	"XN--NGBE9E0A",
	"XN--NODE",
	"XN--NQV7F",
	"XN--NQV7FS00EMA",
	"XN--NYQY26A",
	"XN--O3CW4H",
	"XN--OGBPF8FL",
	"XN--P1ACF",
	"XN--P1AI",
	"XN--PBT977C",
	"XN--PGBS0DH",
	"XN--PSSY2U",
	"XN--Q9JYB4C",
	"XN--QCKA1PMC",
	"XN--QXAM",
	"XN--RHQV96G",
	"XN--ROVU88B",
	"XN--S9BRJ9C",
	"XN--SES554G",
	"XN--T60B56A",
	"XN--TCKWE",
	"XN--UNUP4Y",
	"XN--VERMGENSBERATER-CTB",
	"XN--VERMGENSBERATUNG-PWB",
	"XN--VHQUV",
	"XN--VUQ861B",
	"XN--W4R85EL8FHU5DNRA",
	"XN--WGBH1C",
	"XN--WGBL6A",
	"XN--XHQ521B",
	"XN--XKC2AL3HYE2A",
	"XN--XKC2DL3A5EE0H",
	"XN--Y9A3AQ",
	"XN--YFRO4I67O",
	"XN--YGBI2AMMX",
	"XN--ZFR164B",
	"XPERIA",
	"XXX",
	"XYZ",
	"YACHTS",
	"YAHOO",
	"YAMAXUN",
	"YANDEX",
	"YE",
	"YODOBASHI",
	"YOGA",
	"YOKOHAMA",
	"YOU",
	"YOUTUBE",
	"YT",
	"YUN",
	"ZA",
	"ZARA",
	"ZERO",
	"ZIP",
	"ZM",
	"ZONE",
	"ZUERICH",
	"ZW",


	"AB.CA",
	"AC.AE",
	"AC.AT",
	"AC.CN",
	"AC.CR",
	"AC.CY",
	"AC.FJ",
	"AC.GG",
	"AC.ID",
	"AC.IL",
	"AC.IM",
	"AC.IN",
	"AC.JE",
	"AC.JP",
	"AC.KR",
	"AC.NZ",
	"AC.PA",
	"AC.TH",
	"AC.UG",
	"AC.UK",
	"AC.YU",
	"AC.ZA",
	"AD.JP",
	"AH.CN",
	"ALDERNEY.GG",
	"ALT.ZA",
	"ART.BR",
	"ART.DO",
	"ARTS.CO",
	"ARTS.VE",
	"ASN.AU",
	"ASN.LV",
	"BBS.TR",
	"BC.CA",
	"BIB.VE",
	"BJ.CN",
	"CO.AT",
	"CO.AO",
	"CO.CK",
	"CO.CR",
	"CO.GG",
	"CO.HU",
	"CO.ID",
	"CO.IL",
	"CO.IM",
	"CO.IN",
	"CO.JE",
	"CO.JP",
	"CO.KR",
	"COM.AR",
	"COM.AU",
	"COM.AZ",
	"COM.BB",
	"COM.BM",
	"COM.BR",
	"COM.BS",
	"COM.CN",
	"COM.CO",
	"COM.CU",
	"COM.CY",
	"COM.DO",
	"COM.EC",
	"COM.EG",
	"COM.FJ",
	"COM.GE",
	"COM.GU",
	"COM.HK",
	"COM.JO",
	"COM.KH",
	"COM.LA",
	"COM.LB",
	"COM.LC",
	"COM.LV",
	"COM.LY",
	"COM.MM",
	"COM.MO",
	"COM.MT",
	"COM.MX",
	"COM.MY",
	"COM.NA",
	"COM.NC",
	"COM.NI",
	"COM.NP",
	"COM.PA",
	"COM.PE",
	"COM.PH",
	"COM.PL",
	"COM.PY",
	"COM.RU",
	"COM.SG",
	"COM.SH",
	"COM.SY",
	"COM.TN",
	"COM.TR",
	"COM.TW",
	"COM.UA",
	"COM.UY",
	"COM.VE",
	"CONF.AU",
	"CONF.LV",
	"CO.NZ",
	"COOP",
	"CO.AE",
	"CO.SV",
	"CO.TH",
	"CO.UG",
	"CO.UK",
	"CO.VE",
	"CO.VI",
	"CO.YU",
	"CO.ZA",
	"CQ.CN",
	"CSIRO.AU",
	"ED.CR",
	"EDU.BM",
	"EDU.AR",
	"EDU.CN",
	"EDU.CO",
	"EDU.DO",
	"EDU.EC",
	"EDU.EG",
	"EDU.GE",
	"EDU.GU",
	"EDU.JO",
	"EDU.LC",
	"EDU.LV",
	"EDU.MM",
	"EDU.MO",
	"EDU.MY",
	"EDUNET.TN",
	"EDU.PA",
	"EDU.PY",
	"EDU.SG",
	"EDU.SH",
	"EDU.TR",
	"EDU.TW",
	"EDU.UY",
	"EDU.VE",
	"EDU.YU",
	"EDU.ZA",
	"ENS.TN",
	"ERNET.IN",
	"ESP.BR",
	"ETC.BR",
	"EUN.EG",
	"FI.CR",
	"FIN.EC",
	"FIN.TN",
	"FIRM.CO",
	"FIRM.VE",
	"G12.BR",
	"GD.CN",
	"GEN.NZ",
	"GOB.PA",
	"GO.CR",
	"GO.ID",
	"GO.KR",
	"GO.TH",
	"GO.UG",
	"GOV.AE",
	"GOV.AR",
	"GOV.AU",
	"GOV.BM",
	"GOV.BR",
	"GOV.CN",
	"GOV.CO",
	"GOV.CY",
	"GOV.DO",
	"GOV.EC",
	"GOV.EG",
	"GOVE.TW",
	"GOV.FJ",
	"GOV.GE",
	"GOV.GG",
	"GOV.GU",
	"GOV.IL",
	"GOV.IM",
	"GOV.IN",
	"GOV.JE",
	"GOV.JO",
	"GOV.JP",
	"GOV.LB",
	"GOV.LC",
	"GOV.LV",
	"GOV.MM",
	"GOV.MO",
	"GOV.MY",
	"GOV.SG",
	"GOV.SH",
	"GOV.TN",
	"GOVT.NZ",
	"GOV.TR",
	"GOV.UA",
	"GOV.UK",
	"GOV.VE",
	"GOV.ZA",
	"GS.CN",
	"GUERNSEY.GG",
	"GX.CN",
	"GZ.CN",
	"HB.CN",
	"HE.CN",
	"HI.CN",
	"HK.CN",
	"HL.CN",
	"HN.CN",
	"ID.AU",
	"ID.FJ",
	"ID.LV",
	"IND.BR",
	"IND.GG",
	"IND.JE",
	"IND.TN",
	"INF.BR",
	"INFO.AU",
	"INFO.CO",
	"INFO.HU",
	"INFO.TN",
	"INFO.VE",
	"INT.CO",
	"INTL.TN",
	"INT.VE",
	"JERSEY.JE",
	"JL.CN",
	"JS.CN",
	"K12.EC",
	"K12.IL",
	"K12.TR",
	"LKD.CO.IM",
	"LN.CN",
	"LTD.GG",
	"LTD.JE",
	"LTD.UK",
	"MB.CA",
	"MED.EC",
	"MIL.BR",
	"MIL.CO",
	"MIL.DO",
	"MIL.EC",
	"MIL.GE",
	"MIL.GU",
	"MIL.ID",
	"MIL.LB",
	"MIL.LV",
	"MIL.PH",
	"MIL.SH",
	"MIL.TR",
	"MIL.VE",
	"MIL.ZA",
	"MO.CN",
	"MOD.UK",
	"MUNI.IL",
	"MUSEUM",
	"NAME",
	"NAT.TN",
	"NB.CA",
	"NET.AR",
	"NET.AU",
	"NET.AZ",
	"NET.BB",
	"NET.BM",
	"NET.BR",
	"NET.BS",
	"NET.CN",
	"NET.CU",
	"NET.CY",
	"NET.DO",
	"NET.EC",
	"NET.EG",
	"NET.GE",
	"NET.GG",
	"NET.GU",
	"NET.HK",
	"NET.ID",
	"NET.IL",
	"NET.IM",
	"NET.IN",
	"NET.JE",
	"NET.JO",
	"NET.JP",
	"NET.KH",
	"NET.LA",
	"NET.LB",
	"NET.LC",
	"NET.LV",
	"NET.LY",
	"NET.MM",
	"NET.MO",
	"NET.MT",
	"NET.MX",
	"NET.MY",
	"NET.NA",
	"NET.NC",
	"NET.NP",
	"NET.NZ",
	"NET.PA",
	"NET.PE",
	"NET.PH",
	"NET.PL",
	"NET.PY",
	"NET.RU",
	"NET.SG",
	"NET.SH",
	"NET.SY",
	"NET.TH",
	"NET.TN",
	"NET.TR",
	"NET.TW",
	"NET.UA",
	"NET.UK",
	"NET.UY",
	"NET.VE",
	"NET.VI",
	"NET.ZA",
	"NF.CA",
	"NGO.PH",
	"NGO.ZA",
	"NHS.UK",
	"NIC.IM",
	"NIC.IN",
	"NM.CN",
	"NM.KR",
	"NOM.CO",
	"NOM.VE",
	"NOM.ZA",
	"NS.CA",
	"NSK.SU",
	"NT.CA",
	"NUI.HU",
	"NX.CN",
	"ON.CA",
	"OR.CR",
	"ORG.AE",
	"ORG.AR",
	"ORG.AU",
	"ORG.AZ",
	"ORG.BB",
	"ORG.BM",
	"ORG.BR",
	"ORG.BS",
	"ORG.CN",
	"ORG.CO",
	"ORG.CU",
	"ORG.CY",
	"ORG.DO",
	"ORG.EC",
	"ORG.EG",
	"ORG.FJ",
	"ORG.GE",
	"ORG.GG",
	"ORG.GU",
	"ORG.HK",
	"ORG.HU",
	"ORG.IL",
	"ORG.IM",
	"ORG.JE",
	"ORG.JP",
	"ORG.KH",
	"ORG.LA",
	"ORG.LB",
	"ORG.LC",
	"ORG.LV",
	"ORG.LY",
	"ORG.MM",
	"ORG.MO",
	"ORG.MT",
	"ORG.MX",
	"ORG.MY",
	"ORG.NA",
	"ORG.NC",
	"ORG.NZ",
	"ORG.PA",
	"ORG.PE",
	"ORG.PH",
	"ORG.PL",
	"ORG.PY",
	"ORG.RU",
	"ORG.SG",
	"ORG.SH",
	"ORG.SY",
	"ORG.TN",
	"ORG.TR",
	"ORG.TW",
	"ORG.UK",
	"ORG.UY",
	"ORG.VE",
	"ORG.VI",
	"ORG.YU",
	"ORG.ZA",
	"OR.ID",
	"OR.KR",
	"OR.TH",
	"ORT.NP",
	"OR.UG",
	"OZ.AU",
	"PE.CA",
	"PLC.CO.IM",
	"PLC.UK",
	"POLICE.UK",
	"PRIV.HU",
	"PSI.BR",
	"PVT.GE",
	"QC.CA",
	"QH.CN",
	"REC.BR",
	"REC.CO",
	"REC.VE",
	"RE.KR",
	"RES.IN",
	"RNRT.TN",
	"RNS.TN",
	"RNU.TN",
	"SA.CR",
	"SARK.GG",
	"SC.CN",
	"SCH.GG",
	"SCH.JE",
	"SCHOOL.FJ",
	"SCHOOL.ZA",
	"SCH.UK",
	"SCI.EG",
	"SH.CN",
	"SK.CA",
	"SLD.PA",
	"SN.CN",
	"STORE.CO",
	"STORE.VE",
	"SX.CN",
	"TEC.VE",
	"TELEMEMO.AU",
	"TJ.CN",
	"TM.HU",
	"TMP.BR",
	"TM.ZA",
	"TOURISM.TN",
	"TW.CN",
	"WEB.CO",
	"WEB.DO",
	"WEB.VE",
	"WEB.ZA",
	"XJ.CN",
	"XZ.CN",
	"YK.CA",
	"YN.CN",
	"ZJ.CN"
};

	if ( ! s_isInitialized ) {
		// set up the hash table
		if ( ! s_table.set ( 8 , 0, sizeof(s_tlds)*2,NULL,0,false,0,
				     "tldtbl") ) 
			return log("build: Could not init table of TLDs.");
		// now add in all the TLDs
		int32_t n = (int32_t)sizeof(s_tlds)/ sizeof(char *); 
		for ( int32_t i = 0 ; i < n ; i++ ) {
			const char      *d    = s_tlds[i];
			int32_t       dlen = gbstrlen ( d );
			int64_t  dh   = hash64Lower_a ( d , dlen );
			if ( ! s_table.addKey (&dh,NULL) )
				return log("build: dom table failed");
		}
		s_isInitialized = true;
	} 
	int64_t h = hash64Lower_a ( tld , tldLen ); // gbstrlen(tld));
	return s_table.isInTable ( &h );//getScoreFromTermId ( h );
}		
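// A usage sketch for the TLD check above; the string literals are
// illustrative. Strings with no period are accepted outright, two or more
// periods are rejected, and a single period falls through to the s_tlds
// table (which lists "CO.UK" above).
static void exampleTldCheck ( ) {
	bool a = isTLD ( (char *)"com"   , 3 );  // true: no period
	bool b = isTLD ( (char *)"co.uk" , 5 );  // true: listed in s_tlds
	bool c = isTLD ( (char *)"a.b.c" , 5 );  // false: two periods
	log("build: %d %d %d", (int)a, (int)b, (int)c);
}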
// . this returns false if blocks, true otherwise
// . sets g_errno on failure
bool Msg1c::gotList ( ) {

	if ( g_errno ) return true;

	int64_t *tmpDocIds = m_msg3a.getDocIds();
	int32_t       numDocIds = m_msg3a.getNumDocIds();

	if ( m_startNum > 0) {
		numDocIds -= m_startNum;
		tmpDocIds = &tmpDocIds[m_startNum];
	}

	m_numDocIds = numDocIds; // save for reporting
	// log it
	log(LOG_INFO,"admin: Got %" PRId32" docIds for query reindex.", numDocIds);
	// bail if no need
	if ( numDocIds <= 0 ) return true;

	// force spiders on for the entire network. they will propagate from
	// host #0... 
	g_conf.m_spideringEnabled = true;

	int32_t nowGlobal = getTimeGlobal();

	HashTableX dt;
	char dbuf[1024];
	dt.set(8,0,64,dbuf,1024,false,0,"ddocids");

	m_sb.setLabel("reiadd");

	State13 *st = (State13 *)m_state;
	GigablastRequest *gr = &st->m_gr;

	m_numDocIdsAdded = 0;

	// list consists of docIds, loop through each one
 	for(int32_t i = 0; i < numDocIds; i++) {
		int64_t docId = tmpDocIds[i];
		// when searching events we get multiple docids that are same
		if ( dt.isInTable ( &docId ) ) continue;
		// add it
		if ( ! dt.addKey ( &docId ) ) return true;

		SpiderRequest sr;
		sr.reset();

		// url is a docid!
		sprintf ( sr.m_url , "%" PRIu64 , docId );

		// make a fake first ip
		// use only 64k values so we don't stress doledb/waittrees/etc.
		// for large #'s of docids
		int32_t firstIp = (docId & 0x0000ffff);

		// bits 6-13 of the docid are the domain hash so use those
		// when doing a REINDEX (not delete!) to ensure that requests
		// on the same domain go to the same shard, at least when
		// we have up to 256 shards. if we have more than 256 shards
		// at this point some shards will not participate in the
		// query reindex/delete process because of this, so 
		// we'll want to allow more bits in in that case perhaps.
		// check out Hostdb::getShardNum(RDB_SPIDERDB) in Hostdb.cpp
		// to see what shard is responsible for storing and indexing 
		// this SpiderRequest based on the firstIp.
		if ( ! m_forceDel ) { 
			// if we are a REINDEX not a delete because 
			// deletes don't need to spider/redownload the doc
			// so the distribution can be more random
			firstIp >>= 6;
			firstIp &= 0xff;
		}

		// 0 is not a legit val. it'll core below.
		if ( firstIp == 0 ) {
			firstIp = 1;
		}
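		// Worked example of the fake-ip math above (the docid value is
		// illustrative): if the low 16 bits of docId are 0xABCD, a
		// delete keeps firstIp = 0xABCD, while a reindex uses
		// firstIp = (0xABCD >> 6) & 0xff = 0xAF, i.e. bits 6-13 of the
		// docid (its domain-hash bits), so docids from the same domain
		// map to the same shard.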

		// use a fake ip
		sr.m_firstIp        =  firstIp;
		// we are not really injecting...
		sr.m_isInjecting    =  false;//true;
		sr.m_hopCount       = -1;
		sr.m_isPageReindex  =  1;
		sr.m_urlIsDocId     =  1;
		sr.m_fakeFirstIp    =  1;

		// now you can recycle content instead of re-downloading it
		// for every docid
		sr.m_recycleContent = gr->m_recycleContent;
		// if this is zero we end up getting deduped in
		// dedupSpiderList() if there was a SpiderReply whose
		// spider time was > 0
		sr.m_addedTime = nowGlobal;
		sr.m_forceDelete = m_forceDel ? 1 : 0;

		// . complete its m_key member
		// . parentDocId is used to make the key, but only allow one
		//   page reindex spider request per url... so use "0"
		// . this will set "uh48" to hash64b(m_url) which is the docid
		sr.setKey( firstIp, 0LL , false );

		// how big to serialize
		int32_t recSize = sr.getRecSize();

		m_numDocIdsAdded++;
	
		// store it
		if ( ! m_sb.safeMemcpy ( (char *)&sr , recSize ) ) {
			// g_errno must be set
			if ( ! g_errno ) { g_process.shutdownAbort(true); }

			log(LOG_LOGIC,
			    "admin: Query reindex size of %" PRId32" "
			    "too big. Aborting. Bad engineer." , 
			    (int32_t)0);//m_list.getListSize() );
			return true;
		}
	}
// langId is language of the query
long long getSynBaseHash64 ( char *qstr , uint8_t langId ) {
	Words ww;
	ww.set3 ( qstr );
	long nw = ww.getNumWords();
	long long *wids = ww.getWordIds();
	//char **wptrs = ww.getWords();
	//long *wlens = ww.getWordLens();
	long long baseHash64 = 0LL;
	Synonyms syn;
	// assume english if unknown to fix 'pandora's tower'
	// vs 'pandoras tower' where both words are in both
	// english and german so langid is unknown
	if ( langId == langUnknown ) langId = langEnglish;
	// . store re-written query into here then hash that string
	// . this way we can get rid of spaces
	//char rebuf[1024];
	//char *p = rebuf;
	//if ( strstr(qstr,"cheatcodes") )
	//	log("hey");
	// for deduping
	HashTableX dups;
	if ( ! dups.set ( 8,0,1024,NULL,0,false,0,"qhddup") ) return false;
	// scan the words
	for ( long i = 0 ; i < nw ; i++ ) {
		// skip if not alnum
		if ( ! wids[i] ) continue;
		// get its synonyms into tmpBuf
		char tmpBuf[TMPSYNBUFSIZE];
		// . assume niceness of 0 for now
		// . make sure to get all synsets!! ('love' has two synsets)
		long naids = syn.getSynonyms (&ww,i,langId,tmpBuf,0);
		// term freq algo
		//long pop = g_speller.getPhrasePopularity(NULL,
		//					 wids[i],
		//					 true,
		//					 langId);
		// is it a queryStopWord like "the" or "and"?
		bool isQueryStop = ::isQueryStopWord(NULL,0,wids[i]);
		// a more restrictive list
		bool isStop = ::isStopWord(NULL,0,wids[i]);
		if ( ::isCommonQueryWordInEnglish(wids[i]) ) isStop = true;
		// find the smallest one
		unsigned long long min = wids[i];
		//char *minWordPtr = wptrs[i];
		//long  minWordLen = wlens[i];
		// declare up here since we have a goto below
		long j;
		// add to table too
		if ( dups.isInTable ( &min ) ) goto gotdup;
		// add to it
		if ( ! dups.addKey ( &min ) ) return false;
		// now scan the synonyms, they do not include "min" in them
		for ( j = 0 ; j < naids ; j++ ) {
			// get it
			unsigned long long aid64;
			aid64 = (unsigned long long)syn.m_aids[j];
			// if any syn already hashed then skip it and count
			// as a repeated term. we have to do it this way
			// rather than just getting the minimum synonym 
			// word id, because 'love' has two synsets and
			// 'like', a synonym of 'love' only has one synset
			// and they end up having different minimum synonym
			// word ids!!!
			if ( dups.isInTable ( &aid64 ) ) break;
			// add it. this could fail!
			if ( ! dups.addKey ( &aid64 ) ) return false;
			// set it?
			if ( aid64 >= min ) continue;
			// got a new min
			min = aid64;
			//minWordPtr = syn.m_termPtrs[j];
			//minWordLen = syn.m_termLens[j];
			// get largest term freq of all synonyms
			//long pop2 = g_speller.getPhrasePopularity(NULL,aid64,
			//					  true,langId);
			//if ( pop2 > pop ) pop = pop2;
		}
		// early break out means a hit in dups table
		if ( j < naids ) {
		gotdup:
			// do not count as repeat if query stop word
			// because they often repeat
			if ( isQueryStop ) continue;
			// count # of repeated word forms
			//nrwf++;
			continue;
		}
		// hash that now
		// do not include stop words in synbasehash so
		// 'search the web' != 'search web'
		if ( ! isStop ) {
			// no! make it order independent so 'search the web'
			// equals 'web the search' and 'engine search'
			// equals 'search engine'
			//baseHash64 <<= 1LL;
			baseHash64 ^= min;
		}
		// count it, but only if not a query stop word like "and"
		// or "the" or "a". # of unique word forms.
		//if ( ! isQueryStop ) nuwf++;
		// get term freq 
		//if ( pop > maxPop ) maxPop = pop;
		// control word?
		//if ( wids[i] == cw1 ) ncwf++;
	}
	return baseHash64;
}
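// The XOR of per-word minimum synonym ids makes the base hash order
// independent, as the comments above note; a small sketch (query strings and
// the wrapper name are illustrative, and langEnglish is assumed in scope).
static void exampleBaseHash ( ) {
	long long h1 = getSynBaseHash64 ( (char *)"search engine" , langEnglish );
	long long h2 = getSynBaseHash64 ( (char *)"engine search" , langEnglish );
	// per the comments in getSynBaseHash64(), h1 == h2
	if ( h1 == h2 ) log("query: order-independent base hash");
}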
// . so now this adds a list of Synonyms to the m_pools[] and returns a ptr
//   to the first one.
// . then the parent caller can store that ptr in the m_wordToSyn[] array
//   which we pre-alloc upon calling the set() function based on the # of
//   words we got
// . returns # of synonyms stored into "tmpBuf"
long Synonyms::getSynonyms ( Words *words , 
			     long wordNum , 
			     uint8_t langId ,
			     char *tmpBuf ,
			     long niceness ) {

	// punct words have no synonyms
	if ( ! words->m_wordIds[wordNum] ) return 0;

	// store these
	m_words     = words;
	m_docLangId = langId;
	m_niceness = niceness;

	// sanity check
	if ( wordNum > m_words->m_numWords ) { char *xx=NULL;*xx=0; }

	// init the dedup table to dedup wordIds
	HashTableX dt;
	char dbuf[512];
	dt.set(8,0,12,dbuf,512,false,m_niceness,"altwrds");


	long maxSyns = (long)MAX_SYNS;

	char *bufPtr = tmpBuf;

	// point into buffer
	m_aids = (long long *)bufPtr;
	bufPtr += maxSyns * 8;

	// then the word ids
	m_wids0 = (long long *)bufPtr;
	bufPtr += maxSyns * 8;

	// second word ids, for multi alnum word synonyms, i.e. "New Jersey"
	m_wids1 = (long long *)bufPtr;
	bufPtr += maxSyns * 8;

	m_termPtrs = (char **)bufPtr;
	bufPtr += maxSyns * 4;

	m_termLens = (long *)bufPtr;
	bufPtr += maxSyns * 4;

	m_numAlnumWords = (long *)bufPtr;
	bufPtr += maxSyns * 4;

	m_numAlnumWordsInBase = (long *)bufPtr;
	bufPtr += maxSyns * 4;


	// source
	m_src = bufPtr;
	bufPtr += maxSyns;

	// cursors
	m_aidsPtr  = m_aids;
	m_wids0Ptr = m_wids0;
	m_wids1Ptr = m_wids1;
	m_srcPtr   = m_src;
	m_termPtrsPtr = m_termPtrs;
	m_termLensPtr = m_termLens;
	m_numAlnumWordsPtr = m_numAlnumWords;
	m_numAlnumWordsInBasePtr = m_numAlnumWordsInBase;

	
	char *w    = m_words->m_words   [wordNum];
	long  wlen = m_words->m_wordLens[wordNum];

	//
	// NOW hit wiktionary
	// Trust this less than our s_exceptions above, but more than
	// our morph computations below
	//

	char sourceId = SOURCE_WIKTIONARY;
	char *ss = NULL;
	long long bwid;
	char wikiLangId = m_docLangId;
	bool hadSpace ;
	long klen ;
	long baseNumAlnumWords;

 tryOtherLang:

	/*
	// if word only exists in one language, assume that language for word
	// even if m_docLangId is langUnknown (0)
	if ( ! ss &&
	     ! m_docLangId &&
	     ! wikiLangId ) {
		// get raw word id
		bwid = m_words->m_wordIds[wordNum];
		// each lang has its own bit
		long long bits = g_speller.getLangBits64 ( &bwid );
		// skip if not unique
		char count = getNumBitsOn64 ( bits ) ;
		// if we only got one lang we could be, assume that
		if ( count == 1 )
			// get it. bit #0 is english, so add 1
			wikiLangId = getBitPosLL((uint8_t *)&bits) + 1;
		// try setting based on script. greek. russian. etc.
		// if the word was not in the wiktionary.
		// this will be langUnknown if not definitive.
		else
			wikiLangId = getCharacterLanguage(w);
	}
	*/

	// try looking up bigram so "new jersey" gets "nj" as synonym
	if ( wikiLangId && 
	     wordNum+2< m_words->m_numWords &&
	     m_words->m_wordIds[wordNum+2]) {
		// get phrase id bigram then
		long conti = 0;
		bwid = hash64Lower_utf8_cont(w,wlen,0,&conti);
		// then the next word
		char *wp2 = m_words->m_words[wordNum+2];
		long  wlen2 = m_words->m_wordLens[wordNum+2];
		bwid = hash64Lower_utf8_cont(wp2,wlen2,bwid,&conti);
		baseNumAlnumWords = 2;
		ss = g_wiktionary.getSynSet( bwid, wikiLangId );
	}

	// need a language for wiktionary to work with
	if ( wikiLangId && ! ss ) {
		// get raw word id
		bwid = m_words->m_wordIds[wordNum];
		baseNumAlnumWords = 1;
		//if ( bwid == 1424622907102375150LL)
		//	log("a");
		ss = g_wiktionary.getSynSet( bwid, wikiLangId );
		// if that failed try removing 's from word if there
		if ( ! ss && 
		     wlen >= 3 &&
		     w[wlen-2]=='\'' && 
		     w[wlen-1]=='s' ) {
			long long cwid = hash64Lower_utf8(w,wlen-2);
			ss = g_wiktionary.getSynSet( cwid, wikiLangId );
		}
	}

	// even though a document may be in german it often has some
	// english words "pdf download" "copyright" etc. so if the word
	// has no synset in german, try it in english
	if ( //numPresets == 0 &&
	     ! ss &&
	     m_docLangId != langEnglish &&
	     wikiLangId  != langEnglish &&
	     m_docLangId &&
	     g_speller.getSynsInEnglish(w,wlen,m_docLangId,langEnglish) ) {
		// try english
		wikiLangId = langEnglish;
		sourceId   = SOURCE_WIKTIONARY_EN;
		goto tryOtherLang;
	}

	// if it was in wiktionary, just use that synset
	if ( ss ) {
		// prepare the dedup table (only initialized below if needed)
		HashTableX dedup;
		HashTableX *dd = NULL;
		char dbuf[512];
		long count = 0;
	addSynSet:
		// do we have another set following this
		char *next = g_wiktionary.getNextSynSet(bwid,m_docLangId,ss);
		// if so, init the dedup table then
		if ( next && ! dd ) {
			dd = &dedup;
			dd->set ( 8,0,8,dbuf,512,false,m_niceness,"sddbuf");
		}
		// skip over the pipe i guess
		char *pipe = ss + 2;
		// zh_ch?
		if ( *pipe == '_' ) pipe += 3;
		// sanity
		if ( *pipe != '|' ) { char *xx=NULL;*xx=0; }
		// point to word list
		char *p = pipe + 1;
		// hash up the list of words, they are in utf8 and
		char *e = p + 1;
		// save count in case we need to undo
		//long saved = m_numAlts[wordNum];
	hashLoop:


		// skip synonyms that are anagrams because it's too ambiguous;
		// there are mappings like
		// "PC" -> "PC,Personal Computer" 
		// "PC" -> "PC,Probable Cause" ... (lots more!)
		//bool isAnagram = true;
		for ( ; *e !='\n' && *e != ',' ; e++ ) ;
		//	if ( ! is_upper_a(*e) ) isAnagram = false;

		// get it
		long long h = hash64Lower_utf8_nospaces ( p , e - p );

		// skip if same as base word
		if ( h == bwid ) goto getNextSyn;

		// should we check for dups?
		if ( dd ) {
			// skip dups
			if ( dd->isInTable(&h) ) goto getNextSyn;
			// dedup. return false with g_errno set on error
			if ( ! dd->addKey(&h) ) return m_aidsPtr - m_aids;
		}
		// store it
		*m_aidsPtr++ = h;

		// store source
		*m_srcPtr++ = sourceId;

		hadSpace = false;
		klen = e - p;
		for ( long k = 0 ; k < klen ; k++ )
			if ( is_wspace_a(p[k]) ) hadSpace = true;

		*m_termPtrsPtr++ = p;
		*m_termLensPtr++ = e-p;

		// only for multi-word synonyms like "New Jersey"...
		*m_wids0Ptr = 0LL;
		*m_wids1Ptr = 0LL;
		*m_numAlnumWordsPtr = 1;

		// and for multi alnum word synonyms
		if ( hadSpace ) {
			Words sw;
			sw.setx ( p , e - p , m_niceness );
			*(long long *)m_wids0Ptr = sw.m_wordIds[0];
			*(long long *)m_wids1Ptr = sw.m_wordIds[2];
			*(long  *)m_numAlnumWordsPtr = sw.getNumAlnumWords();
		}

		m_wids0Ptr++;
		m_wids1Ptr++;
		m_numAlnumWordsPtr++;

		// how many words did we have to hash to find a synset?
		// i.e. "new jersey" would be 2, to get "nj"
		*m_numAlnumWordsInBasePtr++ = baseNumAlnumWords;

		// do not breach
		if ( ++count >= maxSyns ) goto done;
	getNextSyn:
		// loop for more
		if ( *e == ',' ) { e++; p = e; goto hashLoop; }
		// add in the next syn set, deduped
		if ( next ) { ss = next; goto addSynSet; }
		// wrap it up
	done:
		// all done
		return m_aidsPtr - m_aids;
	}


	// strip marks from THIS word, return -1 w/ g_errno set on error
	if ( ! addStripped ( w , wlen,&dt ) ) return m_aidsPtr - m_aids;

	// returns false with g_errno set
	if ( ! addAmpPhrase ( wordNum, &dt ) ) return m_aidsPtr - m_aids;

	// if we end in apostrophe, strip and add
	if ( wlen>= 3 &&
	     w[wlen-1] == 's' && 
	     w[wlen-2]=='\'' &&
	     ! addWithoutApostrophe ( wordNum, &dt ) )
		return m_aidsPtr - m_aids;

	return m_aidsPtr - m_aids;
}
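// Buffer arithmetic for the carve-up at the top of getSynonyms(): per synonym
// slot the code consumes 8 bytes each for m_aids, m_wids0 and m_wids1, 4 bytes
// each for m_termPtrs, m_termLens, m_numAlnumWords and m_numAlnumWordsInBase
// (32-bit pointers/longs, as this code assumes), plus 1 byte for m_src,
// i.e. 3*8 + 4*4 + 1 = 41 bytes. So tmpBuf must hold at least MAX_SYNS * 41
// bytes; presumably TMPSYNBUFSIZE (not defined in this snippet) is sized
// accordingly.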
void processReply ( char *reply , long replyLen ) {

	// store our current reply
	SafeBuf fb2;
	fb2.safeMemcpy(reply,replyLen );
	fb2.nullTerm();

	// log that we got the reply
	log("qa: got reply(len=%li)(errno=%s)=%s",
	    replyLen,mstrerror(g_errno),reply);

	char *content = NULL;
	long  contentLen = 0;

	// get mime
	if ( reply ) {
		HttpMime mime;
		mime.set ( reply, replyLen , NULL );
		// only hash content since mime has a timestamp in it
		content = mime.getContent();
		contentLen = mime.getContentLen();
		if ( content && contentLen>0 && content[contentLen] ) { 
			char *xx=NULL;*xx=0; }
	}

	if ( ! content ) {
		content = "";
		contentLen = 0;
	}

	s_content = content;

	// take out <responseTimeMS>
	markOut ( content , "<currentTimeUTC>");
	markOut ( content , "<responseTimeMS>");

	// until i figure this one out, take it out
	markOut ( content , "<docsInCollection>");

	// until i figure this one out, take it out
	markOut ( content , "<hits>");

	// for those links in the html pages
	markOut ( content, "rand64=");

	// for json
	markOut ( content , "\"currentTimeUTC\":" );
	markOut ( content , "\"responseTimeMS\":");
	markOut ( content , "\"docsInCollection\":");

	// for xml
	markOut ( content , "<currentTimeUTC>" );
	markOut ( content , "<responseTimeMS>");
	markOut ( content , "<docsInCollection>");

	// indexed 1 day ago
	markOut ( content,"indexed:");
	// modified 1 day ago
	markOut ( content,"modified:");

	// s_gigabitCount... it is perpetually incrementing static counter
	// in PageResults.cpp
	markOut(content,"ccc(");
	markOut(content,"id=fd");
	markOut(content,"id=sd");

	// for some reason the term freq seems to change a little in
	// the scoring table
	markOut(content,"id=tf");

	// make checksum. we ignore back to back spaces so this
	// hash works for <docsInCollection>10 vs <docsInCollection>9
	long contentCRC = 0; 
	if ( content ) contentCRC = qa_hash32 ( content );

	// note it
	log("qa: got contentCRC of %lu",contentCRC);


	// if what we expected, save to disk if not there yet, then
	// call s_callback() to resume the qa pipeline
	/*
	if ( contentCRC == s_expectedCRC ) {
		// save content if good
		char fn3[1024];
		sprintf(fn3,"%sqa/content.%lu",g_hostdb.m_dir,contentCRC);
		File ff; ff.set ( fn3 );
		if ( ! ff.doesExist() ) {
			// if not there yet then save it
			fb2.save(fn3);
		}
		// . continue on with the qa process
		// . which qa function that may be
		//s_callback();
		return;
	}
	*/

	//
	// if crc of content does not match what was expected then do a diff
	// so we can see why not
	//

	// this means caller does not care about the response
	if ( ! s_checkCRC ) {
		//s_callback();
		return;
	}

	//const char *emsg = "qa: bad contentCRC of %li should be %li "
	//	"\n";//"phase=%li\n";
	//fprintf(stderr,emsg,contentCRC,s_expectedCRC);//,s_phase-1);

	// hash url
	long urlHash32 = hash32n ( s_url.getUrl() );

	// combine test function too since two tests may use the same url
	long nameHash = hash32n ( s_qt->m_testName );

	// combine together
	urlHash32 = hash32h ( nameHash , urlHash32 );

	static bool s_init = false;
	if ( ! s_init ) {
		s_init = true;
		s_ht.set(4,4,1024,NULL,0,false,0,"qaht");
		// make symlink
		//char cmd[512];
		//snprintf(cmd,"cd %s/html ;ln -s ../qa ./qa", g_hostdb.m_dir);
		//system(cmd);
		char dir[1024];
		snprintf(dir,1000,"%sqa",g_hostdb.m_dir);
		long status = ::mkdir ( dir ,
					S_IRUSR | S_IWUSR | S_IXUSR | 
					S_IRGRP | S_IWGRP | S_IXGRP | 
					S_IROTH | S_IXOTH );
	        if ( status == -1 && errno != EEXIST && errno )
			log("qa: Failed to make directory %s: %s.",
			    dir,mstrerror(errno));
		// try to load from disk
		SafeBuf fn;
		fn.safePrintf("%s/qa/",g_hostdb.m_dir);
		log("qa: loading crctable.dat");
		s_ht.load ( fn.getBufStart() , "crctable.dat" );
	}

	// break up into lines
	char fn2[1024];
	sprintf(fn2,"%sqa/content.%lu",g_hostdb.m_dir,contentCRC);
	fb2.save ( fn2 );

	// look up in hashtable to see what reply crc should be
	long *val = (long *)s_ht.getValue ( &urlHash32 );

	// just return if the same
	if ( val && contentCRC == *val ) {
		g_qaOutput.safePrintf("<b style=color:green;>"
				      "passed test</b><br>%s : "
				      "<a href=%s>%s</a> (urlhash=%lu "
				      "crc=<a href=/qa/content.%lu>"
				      "%lu</a>)<br>"
				      "<hr>",
				      s_qt->m_testName,
				      s_url.getUrl(),
				      s_url.getUrl(),
				      urlHash32,
				      contentCRC,
				      contentCRC);
		return;
	}



	if ( ! val ) {
		// add it so we know
		s_ht.addKey ( &urlHash32 , &contentCRC );
		g_qaOutput.safePrintf("<b style=color:blue;>"
				      "first time testing</b><br>%s : "
				      "<a href=%s>%s</a> "
				      "(urlhash=%lu "
				      "crc=<a href=/qa/content.%lu>%lu"
				      "</a>)<br>"
				      "<hr>",
				      s_qt->m_testName,
				      s_url.getUrl(),
				      s_url.getUrl(),
				      urlHash32,
				      contentCRC,
				      contentCRC);
		return;
	}


	log("qa: crc changed for url %s from %li to %li",
	    s_url.getUrl(),*val,contentCRC);

	// get response on file
	SafeBuf fb1;
	char fn1[1024];
	sprintf(fn1,"%sqa/content.%lu",g_hostdb.m_dir, *val);
	fb1.load(fn1);
	fb1.nullTerm();

	// do the diff between the two replies so we can see what changed
	char cmd[1024];
	sprintf(cmd,"diff %s %s > /tmp/diffout",fn1,fn2);
	log("qa: %s\n",cmd);
	system(cmd);

	g_numErrors++;
	
	g_qaOutput.safePrintf("<b style=color:red;>FAILED TEST</b><br>%s : "
			      "<a href=%s>%s</a> (urlhash=%lu)<br>"

			      "<input type=checkbox name=urlhash%lu value=1 "
			      // use ajax to update test crc. if you undo your
			      // check then it should put the old val back.
			      // when you first click the checkbox it should
			      // gray out the diff i guess.
			      "onclick=submitchanges(%lu,%lu);> "
			      "Accept changes"

			      "<br>"
			      "original on left, new on right. "
			      "oldcrc = <a href=/qa/content.%lu>%lu</a>"

			      " != <a href=/qa/content.%lu>%lu</a> = newcrc"
			      "<br>diff output follows:<br>"
			      "<pre id=%lu style=background-color:0xffffff;>",
			      s_qt->m_testName,
			      s_url.getUrl(),
			      s_url.getUrl(),
			      urlHash32,

			      // input checkbox name field
			      urlHash32,

			      // submitchanges() parms
			      urlHash32, 
			      contentCRC,

			      // original/old content.%lu
			      *val,
			      *val,

			      // new content.%lu
			      contentCRC,
			      contentCRC,

			      // for the pre tag id:
			      urlHash32);


	// store in output
	SafeBuf sb;
	sb.load("/tmp/diffout");
	g_qaOutput.htmlEncode ( sb.getBufStart() );

	g_qaOutput.safePrintf("</pre><br><hr>");

	// if this is zero allow it to slide by. it is learning mode i guess.
	// so we can learn what crc we need to use.
	// otherwise, stop right there for debugging
	//if ( s_expectedCRC != 0 ) exit(1);

	// keep on going
	//s_callback();
}
void gotDatedbList ( State60 *st ) {

	// must only be run on host #0 since we need just one lock table
	if ( g_hostdb.m_myHost->m_hostId != 0 ) { char *xx=NULL;*xx=0; }

	// load turk lock table if we need to
	static bool s_init = false;
	if ( ! s_init ) {
		s_init = true;
		if ( ! g_turkLocks.set(8,sizeof(TurkLock),256) )
			log("turk: failed to init turk lock table");
		if ( ! g_turkLocks.load(g_conf.m_dir,"turkdir/docidlocks.dat"))
			log("turk: failed to load turk lock table");
	}

	time_t now = getTimeGlobal();
	// shortcut
	RdbList *list = &st->m_list;
	// the best docid
	int64_t best = 0LL;
	// scan the list to get urls/docids to turk out
	for ( ; ! list->isExhausted() ; ) {
		// get rec
		char *k = list->getCurrentKey();
		// skip that
		list->skipCurrentRecord();
		// skip if negative
		if ( (k[0] & 0x01) == 0x00 ) continue;
		// get the docid
		int64_t docid = g_datedb.getDocId ( k );
		// skip if locked
		TurkLock *tt = (TurkLock *)g_turkLocks.getValue(&docid);
		// if there check time
		if ( tt && now - tt->m_lockTime > 3600 ) {
			// remove it
			g_turkLocks.removeKey(&docid);
			// nuke tt
			tt = NULL;
		}
		// if still there, skip it and try next one
		if ( tt ) continue;
		// ok, we got a good docid to dish out
		best = docid;
		break;
	}

	SafeBuf sb;

	// print description so they can click a button to start the turk
	sb.safePrintf("<html>\n"
		      "<title>Event Editor</title>\n"
		      "<body>\n"
		      "<table width=\"100%%\" border=\"0\">\n"
		      "<tr><td style=\"background-color:#0079ba;\">\n"
		      "<center><font color=#00000>"
		      "<h2>Event Editor</h2>\n"
		      "</font></center></td>"
		      "</tr></table>");

	// if we had no docid, give user an empty msg
	if ( ! best ) {
		sb.safePrintf("<center>Nothing currently available to edit. "
			      "Please try again later.</center>"
			      "</body></html>\n");
		sendReply ( &sb );
		return;
	}

	// lock it!
	TurkLock tt;
	strcpy ( tt.m_user , st->m_user );
	tt.m_lockTime = now;
	if ( ! g_turkLocks.addKey ( &best , &tt ) ) {
		sendErrorReply ( st , g_errno );
		return;
	}

	// . fetch the TitleRec
	// . a max cache age of 0 means not to read from the cache
	XmlDoc *xd = &st->m_xd;
	// . when getTitleRec() is called it will load the old one
	//   since XmlDoc::m_setFromTitleRec will be true
	// . niceness is 0
	xd->set3 ( best , st->m_coll , 0 );
	// if it blocks while it loads title rec, it will re-call this routine
	xd->setCallback ( st , processLoopWrapper );
	// good to go!
	return processLoop ( st );
}
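// The lock-table logic above (skip docids somebody already locked, but treat
// locks older than an hour as stale) is a small generic pattern. Below is a
// hedged sketch of it using std::unordered_map and a simplified lock record;
// the one-hour timeout comes from the code above, everything else is assumed.
#include <ctime>
#include <string>
#include <unordered_map>

struct Lock { std::string user; time_t lockTime; };

// returns true if docId was free (and is now locked by user), false if
// another user still holds a fresh lock on it
static bool tryLock(std::unordered_map<long long, Lock> &locks,
		    long long docId, const std::string &user,
		    time_t now, time_t maxAge = 3600) {
	auto it = locks.find(docId);
	if (it != locks.end()) {
		if (now - it->second.lockTime <= maxAge) return false; // locked
		locks.erase(it);                                       // stale
	}
	locks[docId] = Lock{user, now};
	return true;
}

int main() {
	std::unordered_map<long long, Lock> locks;
	time_t now = time(nullptr);
	tryLock(locks, 12345, "editorA", now);             // grabs the lock
	return tryLock(locks, 12345, "editorB", now) ? 1 : 0; // fails: still fresh
}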
static bool initEntityTable(){
	if ( ! s_isInitialized ) {
		// set up the hash table
		if ( ! s_table.set ( 8,4,255,NULL,0,false,0,"enttbl" ) )
			return log("build: Could not init table of "
					   "HTML entities.");
		// now add in all the stop words
		int32_t n = (int32_t)sizeof(s_entities) / (int32_t)sizeof(Entity);
		for ( int32_t i = 0 ; i < n ; i++ ) {
			int64_t h = hash64b ( s_entities[i].entity );
			// grab the unicode code point
			UChar32 up = s_entities[i].unicode;
			// now we are 100% up
			if ( ! up ) { char *xx=NULL;*xx=0; }
			// point to it
			char *buf = (char *)s_entities[i].utf8;
			// if uchar32 not 0 then set the utf8 with it
			int32_t len = utf8Encode(up,buf);
			//
			// make my own mods to make parsing easier
			//
			if ( up == 160 ) {  // nbsp
				buf[0] = ' '; len = 1; }
			// make all quotes equal '\"' (34 decimal)
			// double and single curling quotes
			//http://www.dwheeler.com/essays/quotes-test-utf-8.html
			// &#x201c, 201d, 2018, 2019 (unicode values, not utf8)
			// &ldquo, &rdquo, &lsquo, &rsquo
			/*
			if ( up == 171 ||
			     up == 187 ||
			     up == 8216 ||
			     up == 8217 ||
			     up == 8218 ||
			     up == 8220 ||
			     up == 8221 ||
			     up == 8222 ||
			     up == 8249 ||
			     up == 8250 ) {
				buf[0] = '\"'; len = 1; }
			// and normalize all dashes (mdash,ndash)
			if ( up == 8211 || up == 8212 ) {
				buf[0] = '-'; len = 1; }
			*/

			//
			// end custom mods
			//

			// set length
			s_entities[i].utf8Len = len;
			// check it
			if ( len == 0 ) { char *xx=NULL;*xx=0; }
			// must not exist!
			if ( s_table.isInTable(&h) ) { char*xx=NULL;*xx=0;}
			// store the entity index in the hash table as score
			if ( ! s_table.addTerm ( &h, i+1 ) ) return false;
		}
		s_isInitialized = true;
	} 
	return true;
}
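// Once initEntityTable() has run, decoding an entity is a plain lookup: hash
// the name, read back the stored index, and remember indices are stored as
// i+1 so a score of 0 can mean "no match". A minimal sketch of that lookup
// convention, with a tiny stand-in table instead of the real s_entities[]:
#include <cstdint>
#include <cstdio>
#include <string>
#include <unordered_map>

struct EntityRow { const char *name; const char *utf8; };

// stand-in for s_entities[]; the real table is much larger
static const EntityRow kEntities[] = { {"amp","&"}, {"lt","<"}, {"nbsp"," "} };

int main() {
	// build step: store index+1 so 0 can mean "not found"
	std::unordered_map<std::string,int32_t> table;
	int32_t n = (int32_t)(sizeof(kEntities)/sizeof(kEntities[0]));
	for (int32_t i = 0; i < n; i++) table[kEntities[i].name] = i + 1;

	// lookup step: "lt" decodes to "<"
	auto it = table.find("lt");
	int32_t score = (it == table.end()) ? 0 : it->second;
	if (score) std::printf("decoded to: %s\n", kEntities[score-1].utf8);
	return 0;
}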
Example #22
// . merge all the replies together
// . put final merged docids into m_docIds[],m_bitScores[],m_scores[],...
// . this calls Msg51 to get cluster levels when done merging
// . Msg51 remembers clusterRecs from previous call to avoid repeating lookups
// . returns false if blocked, true otherwise
// . sets g_errno and returns true on error
bool Msg3a::mergeLists ( ) {

    // time how long the merge takes
    if ( m_debug ) {
        logf( LOG_DEBUG, "query: msg3a: --- Final DocIds --- " );
        m_startTime = gettimeofdayInMilliseconds();
    }

    // reset our final docids count here in case we are a re-call
    m_numDocIds = 0;
    // a secondary count, how many unique docids we scanned, and not
    // necessarily added to the m_docIds[] array
    //m_totalDocCount = 0; // long docCount = 0;
    m_moreDocIdsAvail = true;


    // shortcut
    //long numSplits = m_numHosts;//indexdbSplit;

    // . point to the various docids, etc. in each split reply
    // . tcPtr = term count. how many required query terms does the doc
    //   have? formerly called topExplicits in IndexTable2.cpp
    long long     *diPtr [MAX_INDEXDB_SPLIT];
    float         *rsPtr [MAX_INDEXDB_SPLIT];
    key_t         *ksPtr [MAX_INDEXDB_SPLIT];
    long long     *diEnd [MAX_INDEXDB_SPLIT];
    for ( long j = 0; j < m_numHosts ; j++ ) {
        Msg39Reply *mr =m_reply[j];
        // if we have gbdocid:| in query this could be NULL
        if ( ! mr ) {
            diPtr[j] = NULL;
            diEnd[j] = NULL;
            rsPtr[j] = NULL;
            ksPtr[j] = NULL;
            continue;
        }
        diPtr [j] = (long long *)mr->ptr_docIds;
        rsPtr [j] = (float     *)mr->ptr_scores;
        ksPtr [j] = (key_t     *)mr->ptr_clusterRecs;
        diEnd [j] = (long long *)(mr->ptr_docIds +
                                  mr->m_numDocIds * 8);
    }

    // clear if we had it
    if ( m_finalBuf ) {
        mfree ( m_finalBuf, m_finalBufSize, "Msg3aF" );
        m_finalBuf     = NULL;
        m_finalBufSize = 0;
    }

    //
    // HACK: START section stats merge
    //
    m_sectionStats.reset();
    long sneed = 0;
    for ( long j = 0; j < m_numHosts ; j++ ) {
        Msg39Reply *mr = m_reply[j];
        if ( ! mr ) continue;
        sneed += mr->size_siteHashList/4;
    }
    HashTableX dt;
    //char tmpBuf[5000];
    if (sneed&&!dt.set(4,0,sneed,NULL,0,false,
                       m_r->m_niceness,"uniqsit"))
        return true;
    for ( long j = 0; sneed && j < m_numHosts ; j++ ) {
        Msg39Reply *mr =m_reply[j];
        if ( ! mr ) continue;
        SectionStats *src = &mr->m_sectionStats;
        SectionStats *dst = &m_sectionStats;
        dst->m_onSiteDocIds      += src->m_onSiteDocIds;
        dst->m_offSiteDocIds     += src->m_offSiteDocIds;
        // now the list should be the unique site hashes that
        // had the section hash. we need to uniquify them again
        // here.
        long *p = (long *)mr->ptr_siteHashList;
        long np = mr->size_siteHashList / 4;
        for ( long k = 0 ; k < np ; k++ )
            // hash it up, no dups!
            dt.addKey(&p[k]);
        // update our count based on that
        dst->m_numUniqueSites = dt.getNumSlotsUsed();
    }
    if ( m_r->m_getSectionStats ) return true;
    //
    // HACK: END section stats merge
    //


    if ( m_docsToGet <= 0 ) {
        char *xx=NULL;
        *xx=0;
    }

    // . how much do we need to store final merged docids, etc.?
    // . docid=8 score=4 clusterRec=key_t scoreInfoPtr=DocIdScore* clusterLevel=1
    long need = m_docsToGet * (8+4+sizeof(key_t)+sizeof(DocIdScore *)+1);
    // allocate it
    m_finalBuf     = (char *)mmalloc ( need , "finalBuf" );
    m_finalBufSize = need;
    // g_errno should be set if this fails
    if ( ! m_finalBuf ) return true;
    // hook into it
    char *p = m_finalBuf;
    m_docIds        = (long long *)p;
    p += m_docsToGet * 8;
    m_scores        = (float     *)p;
    p += m_docsToGet * sizeof(float);
    m_clusterRecs   = (key_t     *)p;
    p += m_docsToGet * sizeof(key_t);
    m_clusterLevels = (char      *)p;
    p += m_docsToGet * 1;
    m_scoreInfos    = (DocIdScore **)p;
    p+=m_docsToGet*sizeof(DocIdScore *);

    // sanity check
    char *pend = m_finalBuf + need;
    if ( p != pend ) {
        char *xx = NULL;
        *xx =0;
    }
    // . now allocate for hash table
    // . get at least twice as many slots as docids
    HashTableT<long long,char> htable;
    // returns false and sets g_errno on error
    if ( ! htable.set ( m_docsToGet * 2 ) ) return true;
    // hash table for doing site clustering, provided we
    // are fully split and we got the site recs now
    HashTableT<long long,long> htable2;
    if ( m_r->m_doSiteClustering && ! htable2.set ( m_docsToGet * 2 ) )
        return true;

    //
    // ***MERGE ALL SPLITS INTO m_docIds[], etc.***
    //
    // . merge all lists in m_replyDocIds[splitNum]
    // . we may be re-called later after m_docsToGet is increased
    //   if too many docids were clustered/filtered out after the call
    //   to Msg51.
mergeLoop:

    // the winning docid will be diPtr[maxj]
    long maxj = -1;
    //Msg39Reply *mr;
    long hslot;

    // get the next highest-scoring docids from all split lists
    for ( long j = 0; j < m_numHosts; j++ ) {
        // . skip exhausted lists
        // . these both should be NULL if reply was skipped because
        //   we did a gbdocid:| query
        if ( diPtr[j] >= diEnd[j] ) continue;
        // compare the score
        if ( maxj == -1 ) {
            maxj = j;
            continue;
        }
        if ( *rsPtr[j] < *rsPtr[maxj] ) continue;
        if ( *rsPtr[j] > *rsPtr[maxj] ) {
            maxj = j;
            continue;
        }
        // prefer lower docids on top
        if ( *diPtr[j] < *diPtr[maxj] ) {
            maxj = j;
            continue;
        }
    }

    if ( maxj == -1 ) {
        m_moreDocIdsAvail = false;
        goto doneMerge;
    }

    // only do this logic if we have clusterdb recs included
    if ( m_r->m_doSiteClustering     &&
            // if the clusterLevel was set to CR_*errorCode* then this key
            // will be 0, so in that case, it might have been a not found
            // or whatever, so let it through regardless
            ksPtr[maxj]->n0 != 0LL &&
            ksPtr[maxj]->n1 != 0   ) {
        // get the hostname hash, a long
        long sh = g_clusterdb.getSiteHash26 ((char *)ksPtr[maxj]);
        // do we have enough from this hostname already?
        long slot = htable2.getSlot ( sh );
        // if this hostname already visible, do not over-display it...
        if ( slot >= 0 ) {
            // get the count
            long val = htable2.getValueFromSlot ( slot );
            // . if already 2 or more, give up
            // . if the site hash is 0, that usually means a
            //   "not found" in clusterdb, and the accompanying
            //   cluster level would be set as such, but since we
            //   did not copy the cluster levels over in the merge
            //   algo above, we don't know for sure... cluster recs
            //   are set to 0 in the Msg39.cpp clustering.
            if ( sh && val >= 2 ) goto skip;
            // inc the count
            val++;
            // store it
            htable2.setValue ( slot , val );
        }
        // . add it, this should be pre-allocated!
        // . returns false and sets g_errno on error
        else if ( ! htable2.addKey(sh,1) ) return true;
    }

    hslot = htable.getSlot ( *diPtr[maxj] );

    // . only add it to the final list if the docid is "unique"
    // . BUT since different event ids share the same docid, exception!
    if ( hslot < 0 ) {
        // always inc this
        //m_totalDocCount++;
        // only do this if we need more
        if ( m_numDocIds < m_docsToGet ) {
            // get DocIdScore class for this docid
            Msg39Reply *mr = m_reply[maxj];
            // point to the array of DocIdScores
            DocIdScore *ds = (DocIdScore *)mr->ptr_scoreInfo;
            long nds = mr->size_scoreInfo/sizeof(DocIdScore);
            DocIdScore *dp = NULL;
            for ( long i = 0 ; i < nds ; i++ ) {
                if ( ds[i].m_docId != *diPtr[maxj] )  continue;
                dp = &ds[i];
                break;
            }
            // add the max to the final merged lists
            m_docIds    [m_numDocIds] = *diPtr[maxj];

            // wtf?
            if ( ! dp ) {
                // this is empty if no scoring info
                // supplied!
                if ( m_r->m_getDocIdScoringInfo )
                    log("msg3a: CRAP! got empty score "
                        "info for "
                        "d=%lli",
                        m_docIds[m_numDocIds]);
                //char *xx=NULL; *xx=0;  261561804684
                // qry = www.yahoo
            }
            // point to the single DocIdScore for this docid
            m_scoreInfos[m_numDocIds] = dp;

            // reset this just in case
            if ( dp ) {
                dp->m_singleScores = NULL;
                dp->m_pairScores   = NULL;
            }

            // now fix DocIdScore::m_pairScores and m_singleScores
            // ptrs so they reference into the
            // Msg39Reply::ptr_pairScoreBuf and ptr_singleScoreBuf
            // like they should. it seems we do not free the
            // Msg39Replies so we should be ok referencing them.
            if ( dp && dp->m_singlesOffset >= 0 )
                dp->m_singleScores =
                    (SingleScore *)(mr->ptr_singleScoreBuf+
                                    dp->m_singlesOffset) ;
            if ( dp && dp->m_pairsOffset >= 0 )
                dp->m_pairScores =
                    (PairScore *)(mr->ptr_pairScoreBuf +
                                  dp->m_pairsOffset );


            // turn it into a float, that is what rscore_t is.
            // we do this to make it easier for PostQueryRerank.cpp
            m_scores    [m_numDocIds]=(float)*rsPtr[maxj];
            if ( m_r->m_doSiteClustering )
                m_clusterRecs[m_numDocIds]= *ksPtr[maxj];
            // clear this out
            //m_eventIdBits[m_numDocIds].clear();
            // set this for use below
            hslot = m_numDocIds;
            // point to next available slot to add to
            m_numDocIds++;
        }
        // if it has ALL the required query terms, count it
        //if ( *bsPtr[maxj] & 0x60 ) m_numAbove++;
        // . add it, this should be pre-allocated!
        // . returns false and sets g_errno on error
        if ( ! htable.addKey(*diPtr[maxj],1) ) return true;
    }

skip:
    // increment the split pointers from which we took the max
    rsPtr[maxj]++;
    diPtr[maxj]++;
    ksPtr[maxj]++;
    // get the next highest docid and add it in
    if ( m_numDocIds < m_docsToGet ) goto mergeLoop;

doneMerge:

    if ( m_debug ) {
        // show how long it took
        logf( LOG_DEBUG,"query: msg3a: [%lu] merged %li docs from %li "
              "splits in %llu ms. "
              ,
              (unsigned long)this,
              m_numDocIds, (long)m_numHosts,
              gettimeofdayInMilliseconds() - m_startTime
            );
        // show the final merged docids
        for ( long i = 0 ; i < m_numDocIds ; i++ ) {
            long sh = 0;
            if ( m_r->m_doSiteClustering )
                sh=g_clusterdb.getSiteHash26((char *)
                                             &m_clusterRecs[i]);
            // print out score_t
            logf(LOG_DEBUG,"query: msg3a: [%lu] "
                 "%03li) merged docId=%012llu "
                 "score=%.01f hosthash=0x%lx",
                 (unsigned long)this,
                 i,
                 m_docIds    [i] ,
                 (float)m_scores    [i] ,
                 sh );
        }
    }

    // if we had a full split, we should have gotten the cluster recs
    // from each split already
    memset ( m_clusterLevels , CR_OK , m_numDocIds );

    return true;
}
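// Stripped of the clustering and score-info bookkeeping, mergeLists() above
// is a k-way merge: repeatedly pick the best-scoring head across all per-host
// lists (ties broken by lower docid), skip docids already emitted, and stop
// once enough have been collected. This is a compact sketch of just that
// selection loop, assuming each host's list is already sorted by descending
// score; names and types here are illustrative, not the real Msg3a ones.
#include <cstdint>
#include <cstdio>
#include <unordered_set>
#include <vector>

struct Hit { int64_t docId; float score; };

static std::vector<Hit> mergeHostLists(const std::vector<std::vector<Hit>> &lists,
				       size_t docsToGet) {
	std::vector<size_t> pos(lists.size(), 0);  // read cursor per host
	std::unordered_set<int64_t> seen;          // dedup docids across hosts
	std::vector<Hit> out;
	while (out.size() < docsToGet) {
		int maxj = -1;
		for (size_t j = 0; j < lists.size(); j++) {
			if (pos[j] >= lists[j].size()) continue;   // exhausted
			if (maxj < 0) { maxj = (int)j; continue; }
			const Hit &h = lists[j][pos[j]];
			const Hit &m = lists[maxj][pos[maxj]];
			if (h.score > m.score ||
			    (h.score == m.score && h.docId < m.docId))
				maxj = (int)j;     // prefer lower docid on ties
		}
		if (maxj < 0) break;               // every list exhausted
		const Hit &win = lists[maxj][pos[maxj]++];
		if (seen.insert(win.docId).second) out.push_back(win);
	}
	return out;
}

int main() {
	std::vector<std::vector<Hit>> lists = {
		{ {10, 3.0f}, {11, 1.0f} },
		{ {12, 2.5f}, {10, 2.0f} } };
	for (const Hit &h : mergeHostLists(lists, 3))
		std::printf("docId=%lld score=%.1f\n", (long long)h.docId, h.score);
	return 0;
}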
// . when the Conf::m_proxyIps parm is updated we call this to rebuild
//   s_iptab, our table of SpiderProxy instances, which has the proxies and 
//   their performance statistics.
// . we try to maintain stats of ip/ports that did NOT change when rebuilding.
bool buildProxyTable ( ) {

	// scan the NEW list of proxy ip/port pairs in g_conf
	char *p = g_conf.m_proxyIps.getBufStart();

	HashTableX tmptab;
	tmptab.set(8,0,16,NULL,0,false,"tmptab");

	// scan the user inputted space-separated list of ip:ports
	// (optional username:password@ip:port)
	for ( ; *p ; ) {
		// skip white space
		if ( is_wspace_a(*p) ) { p++; continue; }

		// skip http://
		if ( strncasecmp(p,"http://",7) == 0 ) { p += 7; continue; }

		// scan in an ip:port
		char *s = p; char *portStr = NULL;
		int32_t dc = 0, pc = 0, gc = 0, bc = 0;
		const char *msg;

		char *usernamePwd = NULL;
		int32_t usernamePwdLen = 0;
		char *ipStart = p;

		// scan all characters until we hit \0 or another whitespace
		for ( ; *s && !is_wspace_a(*s); s++) {

			if ( *s == '@' ) {
				// must be username:pwd
				if ( pc != 1 ) {
					msg = "bad username:password";
					goto hadError;
				}
				usernamePwd = p;
				usernamePwdLen = s - p;
				if ( usernamePwdLen >= MAXUSERNAMEPWD-2 ) {
					msg = "username:password too long";
					goto hadError;
				}
				dc = 0;
				gc = 0;
				bc = 0;
				pc = 0;
				portStr = NULL;
				ipStart = s+1;
				continue;
			}

			if ( *s == '.' ) { dc++; continue; }
			if ( *s == ':' ) { portStr=s; pc++; continue; }
			if ( is_digit(*s) ) { gc++; continue; }
			bc++;
			continue;
		}
		// ensure it is a legit ip:port combo
		msg = NULL;
		if ( gc < 4 ) 
			msg = "not enough digits for an ip";
		if ( pc > 1 )
			msg = "too many colons";
		if ( dc != 3 )
			msg = "need 3 dots for an ip address";
		if ( bc )
			msg = "got illegal char in ip:port listing";
		if ( msg ) {
		hadError:
			char c = *s;
			*s = '\0';
			log("buf: %s for %s",msg,p);
			*s = c;
			return false;
		}

		// convert it
		int32_t iplen = s - ipStart;
		if ( portStr ) iplen = portStr - ipStart;
		int32_t ip = atoip(ipStart,iplen);
		// another sanity check
		if ( ip == 0 || ip == -1 ) {
			log("spider: got bad proxy ip for %s",p);
			return false;
		}

		// and the port default is 80
		int32_t port = 80;
		if ( portStr ) port = atol2(portStr+1,s-portStr-1);
		if ( port < 0 || port > 65535 ) {
			log("spider: got bad proxy port for %s",p);
			return false;
		}


		// . we got a legit ip:port
		// . see if already in our table
		uint64_t ipKey = (uint32_t)ip;
		ipKey <<= 16;
		ipKey |= (uint16_t)(port & 0xffff);

		// also store into tmptable to see what we need to remove
		tmptab.addKey(&ipKey);

		// see if in table
		int32_t islot = s_iptab.getSlot( &ipKey);

		// advance p
		p = s;

		// if in there, keep it as is
		if ( islot >= 0 ) continue;

		// otherwise add new entry
		SpiderProxy newThing;
		memset ( &newThing , 0 , sizeof(SpiderProxy));
		newThing.m_ip = ip;
		newThing.m_port = port;
		newThing.m_lastDownloadTookMS = -1;
		newThing.m_lastSuccessfulTestMS = -1;

		gbmemcpy(newThing.m_usernamePwd,usernamePwd,usernamePwdLen);
		// ensure it is NULL terminated
		newThing.m_usernamePwd[usernamePwdLen] = '\0';

		if ( ! s_iptab.addKey ( &ipKey, &newThing ) )
			return false;
	}		

 redo:
	int32_t removed = 0;
	// scan all SpiderProxies in tmptab
	for ( int32_t i = 0 ; i < s_iptab.getNumSlots() ; i++ ) {
		// skip empty buckets in hashtable s_iptab
		if ( ! s_iptab.m_flags[i] ) continue;
		// get the key
		int64_t key = *(int64_t *)s_iptab.getKeyFromSlot(i);
		// must also exist in tmptab, otherwise it got removed by user
		if ( tmptab.isInTable ( &key ) ) continue;
		// skip if not in table
		if ( s_iptab.getSlot ( &key ) < 0 ) {
			log("sproxy: iptable hashing messed up");
			continue;
		}
		// shoot, it got removed. not in the new list of ip:ports
		s_iptab.removeKey ( &key );
		removed++;
		// hashtable is messed up now, start over
		//goto redo;
	}
	if ( removed ) goto redo;
	return true;
}
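// s_iptab is keyed on a 64-bit value packing the IPv4 address into the upper
// bits and the port into the low 16, and the pass at the end keeps stats for
// keys still present in the new config while dropping the rest. A small
// hedged sketch of that key packing and keep/drop diff, with standard
// containers standing in for s_iptab and a trimmed-down stats struct:
#include <cstdint>
#include <cstdio>
#include <unordered_map>
#include <unordered_set>

struct ProxyStats { int32_t lastDownloadTookMS = -1; };

// mirror the shift/or used to build ipKey in buildProxyTable()
static uint64_t makeKey(uint32_t ip, uint16_t port) {
	return ((uint64_t)ip << 16) | port;
}

// keep stats for proxies still configured, add new ones, drop removed ones
static void rebuild(std::unordered_map<uint64_t,ProxyStats> &table,
		    const std::unordered_set<uint64_t> &configured) {
	for (uint64_t key : configured)
		table.emplace(key, ProxyStats{});   // no-op if already present
	for (auto it = table.begin(); it != table.end(); ) {
		if (configured.count(it->first)) ++it;
		else it = table.erase(it);          // removed from the config
	}
}

int main() {
	std::unordered_map<uint64_t,ProxyStats> table;
	std::unordered_set<uint64_t> cfg = { makeKey(0x7f000001, 8080) };
	rebuild(table, cfg);
	std::printf("proxies tracked: %zu\n", table.size());
	return 0;
}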
// returns false if blocked, true otherwise
bool Statsdb::gifLoop ( ) {
	// shortcut
	Msg5 *m = &m_msg5;

	//#ifndef _USEPLOTTER_
	//return true;
	//#endif

	// loop over all the lists in the time range, [m_t1,m_t2]
	for ( ; ! m_done ; ) {
		if ( ! m->getList ( (char)RDB_STATSDB	,
				    "statsdb"		, // coll
				    &m_list		,
				    (char *)&m_startKey	,
				    (char *)&m_endKey	,
				    32000	, // requested scan size
				    true 	, // include tree?
				    false	, // add to cache?
				    0		, // max cache age
				    0		, // start file number
				    -1		, // number of files
				    NULL	, // state
				    gotListWrapper, // callback
				    m_niceness	, // niceness
				    false	, // do error correction?
				    NULL	, // cache key pointer
				    0		, // # retries
				    -1		, // max # retries
				    true	, // compensate for merge?
				    -1		, // sync point
				    NULL	) ) // msg5b
			return false;
		// . process list
		// . returns false with g_errno set on error
		if ( ! processList() ) return true;
	}

	// time delta of the graph window [m_t1,m_t2]; used to label the ticks
	long dt = m_t2 - m_t1;

	//#ifdef _USEPLOTTER_

	// gif size
	//char tmp[64];
	// dimensions of the gif
	//sprintf ( tmp , "%lix%li", (long)DX+m_bx*2 , (long)DY+m_by*2 );
	//GIFPlotter::parampl ( "BITMAPSIZE" , (void *)tmp );
	// create one
	//GIFPlotter plotter ( NULL , m_fd , NULL );
	// open it
	//plotter.openpl ( );

	// define the space with boundaries 100 unit wide boundaries
	//plotter.space ( 0 , 0 , DX + m_bx * 2 , DY + m_by * 2 );

	// line thickness in user coordinates (pixels for us)
	//plotter.linewidth ( 1 );       
	// set bg color to gray (r/g/b) 
	//plotter.bgcolor ( 0xd600 , 0xce00 , 0xd600 );
	// erase Plotter's graphics display
	//plotter.erase ();                
	// draw axises in black
	//plotter.pencolorname ("black");    

	//
	// main graphing window
	//
	m_gw.safePrintf("<div style=\"position:relative;"
		      "background-color:#c0c0c0;"
		      //"overflow-y:hidden;"
		      "overflow-x:hidden;"
		      "z-index:-10;"
		      // the tick marks we print below are based on it
		      // being a window of the last 20 seconds... and using
		      // DX pixels
		      "min-width:%lipx;"
		      "min-height:%lipx;"
		      //"width:100%%;"
		      //"min-height:600px;"
		      "margin-top:10px;"
		      "margin-bottom:10px;"
		      "margin-right:10px;"
		      "margin-left:10px;\">"
		      ,(long)DX + 2 *m_bx
			,(long)DY + 2*m_by);


	// draw the x-axis
	//plotter.line ( m_bx , m_by , DX + m_bx , m_by  );

	// x-axis tick marks, one every DX/20 pixels
	for ( int x = DX/20 ; x <= DX ; x += DX/20 ) {
		// tick mark
		//plotter.line ( x , -20 , x , 20 );
		m_gw.safePrintf("<div style=\"position:absolute;"
			      "left:%li;"
			      "bottom:0;"
			      "background-color:#000000;"
			      "z-index:110;"
			      "min-height:20px;"
			      "min-width:3px;\"></div>\n"
			      , m_bx + (long)x-1
			      );
		long xv = (long)(dt * (long long)x/(long long)DX)-(long)dt;
		// LABEL
		m_gw.safePrintf("<div style=\"position:absolute;"
				"left:%li;"
				"bottom:20;"
				//"background-color:#000000;"
				"z-index:110;"
				"min-height:20px;"
				"min-width:3px;\">%lis</div>\n"
				, (long)x-10 + m_bx
				// the label:
				, xv
				);
	}


	HashTableX tmpht;
	tmpht.set(4,0,0,NULL,0,false,m_niceness,"statsparms");

	long col = 0;

	m_sb2->safePrintf("<table border=1 width=100%%>\n");

	// label offset to prevent collisions of superimposing multiple
	// graph calibrations
	long zoff = 0;


	//
	// point to the triplets in m_sb1's buffer (x,y,c)
	//
	char *p    = m_sb1.getBufStart();
	char *pend = p + m_sb1.length();
	for ( ; p < pend ; p += 12 ) {
		// breathe
		QUICKPOLL ( m_niceness );
		// get graph hash of this point
		long  gh = *(long *)(p +8);

		// if we already did this graph, skip it
		if ( tmpht.isInTable ( &gh ) ) continue;

		// . graph this single graph of this color
		// . returns ptr to first point of different color!
		plotGraph ( p , pend , gh , m_gw , zoff );
		// prevent collisions
		zoff += 20;

		// get the label based on graphHash
		Label *bb = getLabel ( gh );

		// add to key
		if ( col == 0 )
			m_sb2->safePrintf("<tr>");

		m_sb2->safePrintf("<td bgcolor=#%06lx>&nbsp; &nbsp;</td>"
				 "<td>%s</td>\n",
				 bb->m_color ,
				 bb->m_keyDesc );

		if ( col == 1 )
			m_sb2->safePrintf("</tr>\n");

		// inc column and wrap
		if ( ++col >= 2 ) col = 0;

		// . do not re-display 
		// . TODO: deal with error
		tmpht.addKey ( &gh );
		
	}



	// clear that up
	m_sb1.reset();

	// now plot the events, horizontal line segments like the performance
	// graph uses
	for ( long i = 0 ; i < m_ht3.m_numSlots ; i++ ) {
		// breathe
		QUICKPOLL ( m_niceness );
		// skip if slot empty
		if ( ! m_ht3.m_flags[i] ) continue;
		// get the offset into m_sb3
		long offset = *(long *)m_ht3.getValueFromSlot(i);
		// get buf start
		char *bufStart = m_sb3.getBufStart();
		// get the ptr
		EventPoint *pp = (EventPoint *)(bufStart + offset);

		// get name of parm
		Parm *m = g_parms.getParmFromParmHash ( pp->m_parmHash );
		// make sure we got it
		if ( ! m ) { 
			log("statsdb: unrecognized parm hash = %li",
			    pp->m_parmHash);
			continue;
			//char *xx=NULL;*xx=0; }
		}

		// set the line width
		//plotter.linewidth ( pp->m_thickness );

		// get parm hash
		long colorHash = pp->m_parmHash;
		// add in old/new values to make it different
		colorHash = hash32h ( (long)pp->m_oldVal , colorHash );
		colorHash = hash32h ( (long)pp->m_newVal , colorHash );
		// . get color
		// . is really the parm hash in disguise
		long c1 = colorHash & 0x00ffffff;
		// use the color specified from addStat_r() for this line/pt
		//plotter.pencolor ( ((c1 >> 16) & 0xff) << 8 ,
		//		   ((c1 >>  8) & 0xff) << 8 ,
		//		   ((c1 >>  0) & 0xff) << 8 );

		long x1 = pp->m_a;
		long x2 = pp->m_b;
		long y1 = *(long *)m_ht3.getKey(i); // i value
		// ensure at least 10 units wide for visibility
		if ( x2 < x1 + 10 ) x2 = x1 + 10;
		// . flip the y so we don't have to scroll the browser down
		// . DY does not include the axis and tick marks
		//long fy1 = DY - y1 + m_by ;
		// plot it
		//plotter.line ( x1 , fy1 , x2 , fy1 );
		drawLine3 ( m_gw , x1 , x2 , y1 , c1 , pp->m_thickness );

		// add to map key? only if we haven't already
		if ( tmpht.isInTable ( &colorHash ) ) continue;

		// add it
		if ( col == 0 )
			m_sb2->safePrintf("<tr>");

		char *title = "unknown parm";
		if ( m ) title = m->m_title;

		m_sb2->safePrintf("<td bgcolor=#%06lx>&nbsp; &nbsp;</td>",c1);

		// print the parm name and old/new values
		m_sb2->safePrintf("<td><b>%s</b>",title);

		if ( pp->m_oldVal != pp->m_newVal )
			m_sb2->safePrintf(" (%.02f -> %.02f)",
					 pp->m_oldVal,pp->m_newVal);

		m_sb2->safePrintf("</td>");

		if ( col == 1 )
			m_sb2->safePrintf("</tr>\n");

		// inc column and wrap
		if ( ++col >= 2 ) col = 0;

		// . do not re-display 
		// . TODO: deal with error
		tmpht.addKey ( &colorHash ) ;
	}
	m_sb2->safePrintf("</table>\n");

	// clear that up
	m_ht3.reset();
	m_sb3.reset();

	// and stat states
	m_ht0.reset();
	m_sb0.reset();

	// all done free some mem
	m_sb1.reset();
	//m_sb2.reset();

	//
	// but not m_sb2 cuz that has the html in it!!
	//

	// all done
	//if ( plotter.closepl () < 0 ) 
	//	log("admin: Could not close performance graph object.");
	// close the file
	//fclose ( m_fd );

	//#endif

	// close main graphing window
	m_gw.safePrintf("</div>\n");

	return true;
}
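// The tick labels above come from one mapping: a pixel offset x in [0,DX]
// corresponds to a time of dt*x/DX - dt seconds relative to "now", so the
// right edge of the graph reads 0s and the left edge reads -dt seconds.
// A minimal sketch of that mapping, with assumed values for DX and dt:
#include <cstdio>

int main() {
	const long DX = 1000;   // assumed graph width in pixels
	const long dt = 20;     // assumed window size in seconds
	// one tick every DX/20 pixels, exactly as in the loop above
	for (long x = DX / 20; x <= DX; x += DX / 20) {
		long xv = (long)((long long)dt * x / DX) - dt;  // label, seconds
		std::printf("tick at pixel %4ld -> %lds\n", x, xv);
	}
	return 0;
}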
Example #25
// returns -1 and sets g_errno on error, because 0 means langUnknown
long Words::getLanguage( Sections *sections ,
			 long maxSamples,
			 long niceness,
			 long *langScore) {
	// calculate scores if not given
	//Scores calcdScores;
	//if ( ! scores ) {
	//	if ( ! calcdScores.set( this,m_version,false ) )
	//		return -1;
	//	scores = &calcdScores;
	//}

	// . take a random sample of words and look them up in the
	//   language dictionary
	//HashTableT<long long, char> ht;
	HashTableX ht;
	long long langCount[MAX_LANGUAGES];
	long long langWorkArea[MAX_LANGUAGES];
	long numWords = m_numWords;
	//long skip = numWords/maxSamples;
	//if ( skip == 0 ) skip = 1;
	// reset the language count
	memset(langCount, 0, sizeof(long long)*MAX_LANGUAGES);
	// sample the words
	//long wordBase  = 0;
	long wordi     = 0;
	//if ( ! ht.set(maxSamples*1.5) ) return -1;
	if ( ! ht.set(8,1,(long)(maxSamples*8.0),NULL,0,false,
		      niceness,"wordslang")) 
		return -1;
 
	// . avoid words in these bad sections
	// . google seems to index SEC_MARQUEE so i took that out of badFlags
	long badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT;
	// shortcuts
	long long *wids  = m_wordIds;
	long      *wlens = m_wordLens;
	char     **wptrs = m_words;

	//long langTotal = 0;
// 	log ( LOG_WARN, "xmldoc: Picking language from %li words with %li skip",
// 			numWords, skip );
	char numOne = 1;
	Section **sp = NULL;
	if ( sections ) sp = sections->m_sectionPtrs;
	// this means null too
	if ( sections && sections->m_numSections == 0 ) sp = NULL;

	long maxCount = 1000;

	while ( wordi < numWords ) {
		// breathe
		QUICKPOLL( niceness );
		// move to the next valid word
		if ( ! wids [wordi]     ) { wordi++; continue; }
		if (   wlens[wordi] < 2 ) { wordi++; continue; }
		// skip if in a bad section
		//long flags = sections->m_sectionPtrs[i]->m_flags;
		// meaning script section ,etc
		if ( sp && ( sp[wordi]->m_flags & badFlags ) ) {
			wordi++; continue; }
		// check the language
		//unsigned char lang = 0;

		// Skip if word is capitalized and not preceded by a tag
		//if(s_isWordCap(getWord(wordi), getWordLen(wordi)) &&
		//   wordi > 0 && !getTagId(wordi - 1)) {
		//	wordi++;
		//	continue;
		//}

		// Skip word if bounded by '/' or '?'; it might be part of a URL
		if(isBounded(wordi)) {
			wordi++;
			continue;
		}

		// is it arabic? sometimes they are spammy pages and repeat
		// a few arabic words over and over again, so don't do deduping
		// with "ht" before checking this.
		char cl = getCharacterLanguage ( wptrs[wordi] );
		if ( cl ) {
		        langCount[(unsigned char)cl]++;
			wordi++;
			continue;
		}

		//if(ht.getSlot(m_wordIds[wordi]) !=-1) {
		if(!ht.isEmpty(&m_wordIds[wordi]) ) {
			wordi++;
			continue;
		}

		// If we can't add the word, it's not that bad.
		// Just gripe about it in the log.
		if(!ht.addKey(&m_wordIds[wordi], &numOne)) {
			log(LOG_WARN, "build: Could not add word to temporary "
			    "table, memory error?\n");
			g_errno = ENOMEM;
			return -1;
		}

		if ( maxCount-- <= 0 ) break;

		// No language was detected from the character set, the phrase is
		// in the dictionary, and the unknown-language slot (index 0) has
		// no positive score. The order of these checks is very important!
		int foundone = 0;
		if ( // lang == 0 &&
		    // we seem to be missing hungarian and thai
		    g_speller.getPhraseLanguages(getWord(wordi),
						 getWordLen(wordi), 
						 langWorkArea) &&
		    // why must it have an "unknown score" of 0?
		    // allow -1... i don't know what that means!!
		    langWorkArea[0] <= 0) {
			
			int lasty = -1;
			for(int y = 1; y < MAX_LANGUAGES; y++) {
				if(langWorkArea[y] == 0) continue;
				langCount[y]++;
				long pop = langWorkArea[y];
				// negative means in an official dictionary
				if ( pop < 0 ) {
					pop *= -1;
					langCount[y] += 1;
				}
				// extra?
				if ( pop > 1000 )
					langCount[y] += 2;
				if ( pop > 10000 )
					langCount[y] += 2;
				lasty = y;
				foundone++;
			}
			// . if it can only belong to one language
			// . helps fix that fact that our unifiedDict is crummy
			//   and identifies some words as being in a lot of languages
			//   like "Pronto" as being in english and not giving
			//   the popularities correctly.
			if ( foundone == 1 )
				// give massive boost
				langCount[lasty] += 10;
		}
		// . try to skip unknown words without killing sample size
		// . we lack russian, hungarian and arabic in the unified
		//   dict, so try to do character detection for those langs.
		// . should prevent them from being detected as unknown
		//   langs and coming up for english search 'gigablast'
		if ( ! foundone ) {
			langCount[langUnknown]++;
			// do not count towards sample size
			maxCount++;
		}

		// skip to the next word
		//wordBase += skip;
		//if ( wordi < wordBase )
		//	wordi = wordBase;
		//else
		wordi++;
	}
	// punish unknown count in case a doc has a lot of proper names
	// or something
	//langCount[langUnknown] /= 2;
	// get the lang with the max score then
	int l = s_findMaxIndex(langCount, MAX_LANGUAGES);
	// if(langCount[l] < 15) return(langUnknown);
	if(langScore) *langScore = langCount[l];
	// return if known now
	return l;
}
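// The sampling loop above is a voting scheme: each sampled word bumps
// langCount[] for every language it can belong to (with extra votes for
// popular or dictionary words, a big boost when only one language matches,
// and an "unknown" bucket that does not count toward the sample size), and
// the winner is simply the index with the highest count. A hedged sketch of
// that final tally, since s_findMaxIndex() itself is not shown here:
#include <cstdio>

// stand-in for s_findMaxIndex(): index of the largest entry
static int findMaxIndex(const long long *counts, int n) {
	int best = 0;
	for (int i = 1; i < n; i++)
		if (counts[i] > counts[best]) best = i;
	return best;
}

int main() {
	const int MAX_LANGS = 4;  // assumed, for illustration only
	long long langCount[MAX_LANGS] = { 3, 12, 7, 0 };  // unknown, en, fr, de
	int l = findMaxIndex(langCount, MAX_LANGS);
	std::printf("winning language index=%d votes=%lld\n", l, langCount[l]);
	return 0;
}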
Example #26
// . cluster the docids based on the clusterRecs
// . returns false and sets g_errno on error
// . if maxDocIdsPerHostname is -1 do not do hostname clustering
bool setClusterLevels ( const key96_t   *clusterRecs,
			const int64_t *docIds,
			int32_t       numRecs              ,
			int32_t       maxDocIdsPerHostname ,
			bool       doHostnameClustering ,
			bool       familyFilter         ,
			bool       isDebug              ,
			// output to clusterLevels[]
			char    *clusterLevels        ) {
	
	if ( numRecs <= 0 ) return true;

	// skip if not clustering on anything
	//if ( ! doHostnameClustering && ! familyFilter ) {
	//	memset ( clusterLevels, CR_OK, numRecs );
	//	return true;
	//}

	// how many negative site hashes do we have?
	// count how many docids we got, they are a cgi value, so represented
	// in ascii separated by +'s. i.e. "12345+435322+3439333333"
	//HashTableT <int64_t,char> sht;
	//if ( ! hashFromString ( &sht , noSiteIds ) ) return false;
	//bool checkNegative = ( sht.getNumSlotsUsed() > 0 );

	HashTableX ctab;
	// init to 2*numRecs for speed. use 0 for niceness!
	if ( ! ctab.set ( 8 , 4 , numRecs * 2,NULL,0,false,"clustertab" ) )
		return false;

	// time it
	uint64_t startTime = gettimeofdayInMilliseconds();

	// init loop counter vars
	int32_t           count = 0;
	uint32_t  score = 0;
	char          *crec ;
	int64_t      h  ;
	char          *level ;
	bool           fakeIt ;

	for(int32_t i=0; i<numRecs; i++) {
		crec = (char *)&clusterRecs[i];
		// . set this cluster level
		// . right now will be CR_ERROR_CLUSTERDB or CR_OK...
		level = &clusterLevels[i];

		// sanity check
		if ( *level == CR_UNINIT ) gbshutdownLogicError();
		// and the adult bit, for cleaning the results
		if ( familyFilter && g_clusterdb.hasAdultContent ( crec ) ) {
			*level = CR_DIRTY;
			continue;
		}
		// if error looking up in clusterdb, use an 8-bit domain hash from the docid
		fakeIt = (*level==CR_ERROR_CLUSTERDB);
		// assume ok, show it, it is visible
		*level = CR_OK;
		// site hash comes next
		if(!doHostnameClustering)
			continue;

		// . get the site hash
		// . these are only 32 bits!
		if(fakeIt)
			h = Titledb::getDomHash8FromDocId(docIds[i]);
		else
			h = g_clusterdb.getSiteHash26 ( crec );

		// inc this count!
		if ( fakeIt ) {
			g_stats.m_filterStats[CR_ERROR_CLUSTERDB]++;
		}

		// if it matches a siteid on our black list
		//if ( checkNegative && sht.getSlot((int64_t)h) > 0 ) {
		//	*level = CR_BLACKLISTED_SITE; goto loop; }
		// look it up
		score = ctab.getScore(h) ;
		// if still visible, just continue
		if ( score < (uint32_t)maxDocIdsPerHostname ) {
			if ( ! ctab.addTerm(h))
				return false;
			continue;
		}
		// otherwise, no longer visible
		*level = CR_CLUSTERED;
	}


	// debug
	for ( int32_t i = 0 ; i < numRecs && isDebug ; i++ ) {
		crec = (char *)&clusterRecs[i];
		uint32_t siteHash26=g_clusterdb.getSiteHash26(crec);
		logf(LOG_DEBUG,"query: msg51: hit #%" PRId32") sitehash26=%" PRIu32" "
		     "rec.n0=%" PRIx64" docid=%" PRId64" cl=%" PRId32" (%s)",
		     (int32_t)count++,
		     (int32_t)siteHash26,
		     clusterRecs[i].n0,
		     (int64_t)docIds[i],
		     (int32_t)clusterLevels[i],
		     g_crStrings[(int32_t)clusterLevels[i]] );
	}


	//log(LOG_DEBUG,"build: numVisible=%" PRId32" numClustered=%" PRId32" numErrors=%" PRId32,
	//    *numVisible,*numClustered,*numErrors);
	// show time
	uint64_t took = gettimeofdayInMilliseconds() - startTime;
	if ( took > 3 )
		log(LOG_INFO,"build: Took %" PRId64" ms to do clustering.",took);

	// we are all done
	return true;
}
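// The clustering rule above reduces to counting results per site hash: a hit
// stays visible (CR_OK) until its hostname has already contributed
// maxDocIdsPerHostname visible results, after which further hits from that
// site are marked clustered. A stripped-down sketch of that counting pass,
// leaving out the family filter and the clusterdb-error fallback:
#include <cstdint>
#include <cstdio>
#include <unordered_map>
#include <vector>

enum SketchLevel : char { SK_OK = 0, SK_CLUSTERED = 1 };

// mark each hit OK until its site already has maxPerHost visible hits
static std::vector<char> clusterBySite(const std::vector<int64_t> &siteHashes,
				       uint32_t maxPerHost) {
	std::unordered_map<int64_t,uint32_t> counts;
	std::vector<char> levels(siteHashes.size(), SK_OK);
	for (size_t i = 0; i < siteHashes.size(); i++) {
		uint32_t &c = counts[siteHashes[i]];
		if (c < maxPerHost) { c++; continue; }  // still visible
		levels[i] = SK_CLUSTERED;               // over the per-host cap
	}
	return levels;
}

int main() {
	std::vector<int64_t> sites = { 42, 42, 42, 7 };
	std::vector<char> lv = clusterBySite(sites, 2);
	for (size_t i = 0; i < lv.size(); i++)
		std::printf("hit %zu: %s\n", i, lv[i] == SK_OK ? "OK" : "CLUSTERED");
	return 0;
}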