// Slightly modified from getTextEntity
int16_t get_iana_charset(const char *cs, int len)
{
    if (!s_isInitialized){
	// set up the hash table
	if ( ! s_table.set ( 8,4,4096,NULL,0,false,"ianatbl") ) {
		log(LOG_WARN, "build: Could not init table of IANA Charsets.");
		return csUnknown;
	}
	// now add in all the charset entries
	int32_t n = (int32_t)sizeof(s_charsets) / (int32_t)sizeof(IANACharset);
	// turn off quickpolling
	char saved = g_conf.m_useQuickpoll;
	g_conf.m_useQuickpoll = false;
	for ( int32_t i = 0 ; i < n ; i++ ) {
		int64_t h = hash64Lower_a ( s_charsets[i].name, strlen(s_charsets[i].name) );
		// store the charset index in the hash table as score
		if ( ! s_table.addTerm(h, i+1) ) {
			log(LOG_WARN, "build: add term failed");
			// restore quickpolling before bailing out
			g_conf.m_useQuickpoll = saved;
			return csUnknown;
		}
	}
	g_conf.m_useQuickpoll = saved;
	s_isInitialized = true;
    }
    int64_t h = hash64Lower_a ( cs , len );
    // get the charset index from the table (stored in the score field)
    int32_t i = (int32_t) s_table.getScore(h);
    // return 0 if no match
    if ( i == 0 ) return csUnknown;
    // return the charset's MIB enum value
    return (int16_t)s_charsets[i-1].mib_enum;
}
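
// Example usage (a sketch; the variable names are hypothetical). The charset
// token would typically come from a Content-Type header or a <meta> tag, and
// csUnknown signals that no IANA entry matched:
//
//   const char *cs = "ISO-8859-1";
//   int16_t mib = get_iana_charset ( cs , strlen(cs) );
//   if ( mib == csUnknown ) { /* fall back to a default charset */ }
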
// Slightly modified from getTextEntity
short get_iana_charset(char *cs, int len)
{
    if (!s_isInitialized){
	// set up the hash table
	if ( ! s_table.set ( 8,4,4096,NULL,0,false,0,"ianatbl") )
	    return log("build: Could not init table of "
		       "IANA Charsets.");
	// now add in all the charset entries
	long n = (long)sizeof(s_charsets) / (long)sizeof(IANACharset);
	// turn off quickpolling
	char saved = g_conf.m_useQuickpoll;
	g_conf.m_useQuickpoll = false;
	for ( long i = 0 ; i < n ; i++ ) {
	    long long h = hash64Lower_a ( s_charsets[i].name, strlen(s_charsets[i].name) );
	    // store the charset index in the hash table as score
	    if ( ! s_table.addTerm(&h, i+1) ) {
		    // restore quickpolling before bailing out
		    g_conf.m_useQuickpoll = saved;
		    return log("build: add term failed");
	    }
	}
	g_conf.m_useQuickpoll = saved;
	s_isInitialized = true;
    }
    long long h = hash64Lower_a ( cs , len );
    // get the charset index from the table (stored in the score field)
    long i = (long) s_table.getScore ( &h );
    // return 0 if no match
    if ( i == 0 ) return csUnknown;
    // return the charset's MIB enum value
    return (short)s_charsets[i-1].mib_enum;
}
bool AdultBit::isDirty ( char *s , int32_t len ) {

	static bool       s_isInitialized = false;
	static char      *s_dirty[] = {
		"anal",
		"analsex",
		"b*****b",
		"blowjobs",
		"boob",
		"boobs",
		"clitoris",
		"c**k",
		"cocks",
		"cum",
		"dick",
		"dicks",
		"g******g",
		"gangbangs",
		"gangbanging",
		"movie",
		"movies",
		"oral",
		"oralsex",
		"p**n",
		"porno",
		"pussy",
		"pussies",
		"sex",
		"sexy",
		"tit",
		"t**s",
		"video",
		"videos",
		"xxx",
		"xxxx",
		"xxxx"
	};

	if ( ! s_isInitialized ) {
		// set up the hash table
		if ( ! s_dtable.set ( 8,4,sizeof(s_dirty  )*2,NULL,0,false,0,
				      "adulttab")) 
			return log("build: Error initializing "
				    "dirty word hash table." );
		// now add in all the dirty words
		int32_t n = (int32_t)sizeof(s_dirty)/ sizeof(char *); 
		for ( int32_t i = 0 ; i < n ; i++ ) {
			int64_t h = hash64b ( s_dirty  [i] );
			if ( ! s_dtable.addTerm (&h, i+1) ) return false;
		}
		s_isInitialized = true;
	} 

	// compute the hash of the word "s"
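	// NOTE: the table above was filled with hash64b() on strings that are
	// already lowercase, so hashing the query word with hash64Lower_a()
	// here is assumed to yield matching keys for a case-insensitive lookup.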
	int64_t h = hash64Lower_a ( s , len );

	// get from table
	return s_dtable.getScore ( &h );
}		
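
// Example usage (a sketch; the AdultBit instance and word are hypothetical).
// The caller hands isDirty() a single already-tokenized word and gets a
// nonzero value back on an exact match:
//
//   AdultBit ab;
//   char word[] = "porno";
//   if ( ab.isDirty ( word , 5 ) ) { /* count it toward the adult score */ }
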
static bool initEntityTable(){
	if ( ! s_isInitialized ) {
		// set up the hash table
		if ( ! s_table.set ( 8,4,255,NULL,0,false,0,"enttbl" ) )
			return log("build: Could not init table of "
					   "HTML entities.");
		// now add in all the HTML entities
		int32_t n = (int32_t)sizeof(s_entities) / (int32_t)sizeof(Entity);
		for ( int32_t i = 0 ; i < n ; i++ ) {
			int64_t h = hash64b ( s_entities[i].entity );

			// grab the unicode code point
			UChar32 up = s_entities[i].unicode;

			// every entity must have a nonzero code point
			if ( ! up ) { char *xx=NULL;*xx=0; }

			// point to it
			char *buf = (char *)s_entities[i].utf8;

			// encode the code point as UTF-8 into the entity's buffer
			int32_t len = utf8Encode(up,buf);

			//
			// make my own mods to make parsing easier
			//

			if ( up == 160 ) {  // nbsp
				buf[0] = ' ';
				len = 1;
			}

			//
			// end custom mods
			//

			// set length
			s_entities[i].utf8Len = len;
			// check it
			if ( len == 0 ) { char *xx=NULL;*xx=0; }
			// must not exist!
			if ( s_table.isInTable(&h) ) { char*xx=NULL;*xx=0;}
			// store the entity index in the hash table as score
			if ( ! s_table.addTerm ( &h, i+1 ) ) return false;
		}
		s_isInitialized = true;
	} 
	return true;
}
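
// Sketch of a lookup against the entity table built above (the lookup code
// itself lives elsewhere and is not shown here): hash the entity name the
// same way it was stored and use the score as a 1-based index back into
// s_entities[].
//
//   int64_t h = hash64b ( s_entities[0].entity );
//   int32_t i = (int32_t) s_table.getScore ( &h );
//   if ( i > 0 ) { /* s_entities[i-1].utf8 / utf8Len hold the decoded text */ }
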
bool AdultBit::isObscene ( char *s , int32_t len ) {

	static bool       s_isInitialized = false;
	static char      *s_obscene[] = {
		"c**t",
		"clits",
//		"cum",    magna cum laude
		"cums",
		"cumshot",
		"c**t",
		"cunts",
		"milf",
		"rimjob",
		"felch",
		"f**k",
		"f****d",
		"f****r",
		"f*****g",
		"f***s",
		"w***e",
		"w****s"
	};

	if ( ! s_isInitialized ) {
		// set up the hash table
		if ( ! s_otable.set ( 8,4,sizeof(s_obscene)*2,NULL,0,false,0,
				      "obscenetab") ) 
			return log("build: Error initializing "
				    "obscene word hash table." );
		// now add in all the obscene words
		int32_t n = sizeof(s_obscene) / sizeof(char *);
		for ( int32_t i = 0 ; i < n ; i++ ) {
			int64_t h = hash64b ( s_obscene[i] );
			if ( ! s_otable.addTerm ( &h, i+1 ) ) return false;
		}
		s_isInitialized = true;
	} 

	// compute the hash of the word "s"
	int64_t h = hash64Lower_a ( s , len );

	// get from table
	return s_otable.getScore ( &h );
}		
static bool initEntityTable(){
	if ( ! s_isInitialized ) {
		// set up the hash table
		if ( ! s_table.set ( 8,4,4096,NULL,0,false,"enttbl" ) ) {
			log("build: Could not init table of HTML entities.");
			return false;
		}

		// now add in all the html entities
		const int32_t n = (int32_t)sizeof(s_entities) / (int32_t)sizeof(Entity);
		for ( int32_t i = 0 ; i < n ; i++ ) {
			int64_t h = hash64b ( s_entities[i].entity );

			// convert the unicode code points to a UTF-8 string
			char *buf = (char *)s_entities[i].utf8;
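			// NOTE: buf advances as each code point is encoded, so
			// the utf8[] buffer is assumed to be large enough for
			// up to 4 bytes per code point.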
			for(int j=0; j<s_entities[i].codepoints; j++) {
				UChar32 codepoint = s_entities[i].codepoint[j];
				int32_t len = utf8Encode(codepoint,buf);
				if ( len == 0 ) { g_process.shutdownAbort(true); }
				
				// make modification to make parsing easier
				if ( codepoint == 160 ) {  // nbsp
					buf[0] = ' ';
					len = 1;
				}
				buf += len;
				
			}
			s_entities[i].utf8Len = (size_t)(buf-s_entities[i].utf8);
			// must not exist!
			if ( s_table.isInTable(&h) ) { g_process.shutdownAbort(true);}
			// store the entity index in the hash table as score
			if ( ! s_table.addTerm(h, i+1) ) return false;
		}
		s_isInitialized = true;
	} 
	return true;
}
// . cluster the docids based on the clusterRecs
// . returns false and sets g_errno on error
// . if maxDocIdsPerHostname is -1 do not do hostname clustering
bool setClusterLevels ( const key96_t *clusterRecs,
			const int64_t *docIds,
			int32_t        numRecs,
			int32_t        maxDocIdsPerHostname,
			bool           doHostnameClustering,
			bool           familyFilter,
			bool           isDebug,
			// output to clusterLevels[]
			char          *clusterLevels ) {
	
	if ( numRecs <= 0 ) return true;

	// skip if not clustering on anything
	//if ( ! doHostnameClustering && ! familyFilter ) {
	//	memset ( clusterLevels, CR_OK, numRecs );
	//	return true;
	//}

	// how many negative site hashes do we have?
	// count how many docids we got, they are a cgi value, so represented
	// in ascii separated by +'s. i.e. "12345+435322+3439333333"
	//HashTableT <int64_t,char> sht;
	//if ( ! hashFromString ( &sht , noSiteIds ) ) return false;
	//bool checkNegative = ( sht.getNumSlotsUsed() > 0 );

	HashTableX ctab;
	// init to 2*numRecs for speed. use 0 for niceness!
	if ( ! ctab.set ( 8 , 4 , numRecs * 2,NULL,0,false,"clustertab" ) )
		return false;

	// time it
	int64_t startTime = gettimeofdayInMilliseconds();

	// init loop counter vars
	int32_t   count = 0;
	uint32_t  score = 0;
	char     *crec;
	int64_t   h;
	char     *level;
	bool      fakeIt;

	for(int32_t i=0; i<numRecs; i++) {
		crec = (char *)&clusterRecs[i];
		// . set this cluster level
		// . right now will be CR_ERROR_CLUSTERDB or CR_OK...
		level = &clusterLevels[i];

		// sanity check
		if ( *level == CR_UNINIT ) gbshutdownLogicError();
		// and the adult bit, for cleaning the results
		if ( familyFilter && g_clusterdb.hasAdultContent ( crec ) ) {
			*level = CR_DIRTY;
			continue;
		}
		// if the clusterdb lookup failed, fall back to an 8-bit domain hash derived from the docid
		fakeIt = (*level==CR_ERROR_CLUSTERDB);
		// assume ok, show it, it is visible
		*level = CR_OK;
		// site hash comes next
		if(!doHostnameClustering)
			continue;

		// . get the site hash
		// . these are only 32 bits!
		if(fakeIt)
			h = Titledb::getDomHash8FromDocId(docIds[i]);
		else
			h = g_clusterdb.getSiteHash26 ( crec );

		// inc this count!
		if ( fakeIt ) {
			g_stats.m_filterStats[CR_ERROR_CLUSTERDB]++;
		}

		// if it matches a siteid on our black list
		//if ( checkNegative && sht.getSlot((int64_t)h) > 0 ) {
		//	*level = CR_BLACKLISTED_SITE; goto loop; }
		// look it up
		score = ctab.getScore(h) ;
		// if we are still under the per-hostname cap, count this one and keep it visible
		if ( score < (uint32_t)maxDocIdsPerHostname ) {
			if ( ! ctab.addTerm(h))
				return false;
			continue;
		}
		// otherwise, no longer visible
		*level = CR_CLUSTERED;
	}


	// debug
	for ( int32_t i = 0 ; i < numRecs && isDebug ; i++ ) {
		crec = (char *)&clusterRecs[i];
		uint32_t siteHash26=g_clusterdb.getSiteHash26(crec);
		logf(LOG_DEBUG,"query: msg51: hit #%" PRId32") sitehash26=%" PRIu32" "
		     "rec.n0=%" PRIx64" docid=%" PRId64" cl=%" PRId32" (%s)",
		     (int32_t)count++,
		     (int32_t)siteHash26,
		     clusterRecs[i].n0,
		     (int64_t)docIds[i],
		     (int32_t)clusterLevels[i],
		     g_crStrings[(int32_t)clusterLevels[i]] );
	}


	//log(LOG_DEBUG,"build: numVisible=%" PRId32" numClustered=%" PRId32" numErrors=%" PRId32,
	//    *numVisible,*numClustered,*numErrors);
	// show time
	int64_t took = gettimeofdayInMilliseconds() - startTime;
	if ( took > 3 )
		log(LOG_INFO,"build: Took %" PRId64" ms to do clustering.",took);

	// we are all done
	return true;
}
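
// Example call (a sketch; recs, docIds and numRecs are hypothetical and the
// per-host limit of 2 is arbitrary). clusterLevels[] is both input and
// output: each slot must be pre-set (e.g. to CR_OK, or CR_ERROR_CLUSTERDB if
// the clusterdb lookup failed) and comes back as the final cluster level:
//
//   char levels [ numRecs ];
//   memset ( levels , CR_OK , numRecs );
//   if ( ! setClusterLevels ( recs , docIds , numRecs ,
//                             2     ,  // maxDocIdsPerHostname
//                             true  ,  // doHostnameClustering
//                             true  ,  // familyFilter
//                             false ,  // isDebug
//                             levels ) )
//       return false; // g_errno is set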
static bool initEntityTable(){
	if ( ! s_isInitialized ) {
		// set up the hash table
		if ( ! s_table.set ( 8,4,255,NULL,0,false,0,"enttbl" ) )
			return log("build: Could not init table of "
					   "HTML entities.");
		// now add in all the HTML entities
		int32_t n = (int32_t)sizeof(s_entities) / (int32_t)sizeof(Entity);
		for ( int32_t i = 0 ; i < n ; i++ ) {
			int64_t h = hash64b ( s_entities[i].entity );
			// grab the unicode code point
			UChar32 up = s_entities[i].unicode;
			// every entity must have a nonzero code point
			if ( ! up ) { char *xx=NULL;*xx=0; }
			// point to it
			char *buf = (char *)s_entities[i].utf8;
			// encode the code point as UTF-8 into the entity's buffer
			int32_t len = utf8Encode(up,buf);
			//
			// make my own mods to make parsing easier
			//
			if ( up == 160 ) {  // nbsp
				buf[0] = ' '; len = 1; }
			// make all quotes equal '\"' (34 decimal)
			// double and single curling quotes
			//http://www.dwheeler.com/essays/quotes-test-utf-8.html
			// &#x201c, 201d, 2018, 2019 (unicode values, not utf8)
			// &ldquo, &rdquo, &lsquo, &rsquo
			/*
			if ( up == 171 ||
			     up == 187 ||
			     up == 8216 ||
			     up == 8217 ||
			     up == 8218 ||
			     up == 8220 ||
			     up == 8221 ||
			     up == 8222 ||
			     up == 8249 ||
			     up == 8250 ) {
				buf[0] = '\"'; len = 1; }
			// and normalize all dashes (mdash,ndash)
			if ( up == 8211 || up == 8212 ) {
				buf[0] = '-'; len = 1; }
			*/

			//
			// end custom mods
			//

			// set length
			s_entities[i].utf8Len = len;
			// check it
			if ( len == 0 ) { char *xx=NULL;*xx=0; }
			// must not exist!
			if ( s_table.isInTable(&h) ) { char*xx=NULL;*xx=0;}
			// store the entity index in the hash table as score
			if ( ! s_table.addTerm ( &h, i+1 ) ) return false;
		}
		s_isInitialized = true;
	} 
	return true;
}