// just index the first bigram for now to give a little bonus bool Synonyms::addAmpPhrase ( long wordNum , HashTableX *dt ) { // . "D & B" --> dandb // . make the "andb" a suffix //char tbuf[100]; if ( wordNum +2 >= m_words->m_numWords ) return true; if ( ! m_words->m_wordIds [wordNum+2] ) return true; if ( m_words->m_wordLens[wordNum+2] > 50 ) return true; if ( ! m_words->hasChar(wordNum+1,'&') ) return true; long wlen = m_words->m_wordLens[wordNum]; char *w = m_words->m_words[wordNum]; // need this for hash continuation procedure long conti = 0; // hack for "d & b" -> "dandb" uint64_t h = hash64Lower_utf8_cont ( w , wlen,0LL,&conti ); // just make it a bigram with the word "and" after it // . we usually ignore stop words like and when someone does the query // but we give out bonus points if the query term's left or right // bigram has that stop word where it should be. // . so Dave & Barry will index "daveand" as a bigram and the // search for 'Dave and Barry' will give bonus points for that // bigram. h = hash64Lower_utf8_cont ( "and", 3,h,&conti); // logic in Phrases.cpp will xor it with 0x768867 // because it contains a stop word. this prevents "st. // and" from matching "stand". h ^= 0x768867; // do not add dups if ( dt->isInTable ( &h ) ) return true; // add to dedup table. return false with g_errno set on error if ( ! dt->addKey ( &h ) ) return false; // store that *m_aidsPtr++ = h; *m_wids0Ptr++ = 0LL; *m_wids1Ptr++ = 0LL; *m_termOffsPtr++ = m_synWordBuf.length(); m_synWordBuf.safeMemcpy ( w , wlen ); m_synWordBuf.safeStrcpy (" and"); m_synWordBuf.pushChar('\0'); *m_termLensPtr++ = wlen+4; *m_termPtrsPtr++ = NULL; *m_numAlnumWordsPtr++ = 1; *m_numAlnumWordsInBasePtr++ = 1; *m_srcPtr++ = SOURCE_GENERATED; return true; }
// . so now this adds a list of Synonyms to the m_pools[] and returns a ptr
//   to the first one.
// . then the parent caller can store that ptr in the m_wordToSyn[] array
//   which we pre-alloc upon calling the set() function based on the # of
//   words we got
// . returns # of synonyms stored into "tmpBuf"
// . "tmpBuf" is carved up into parallel arrays (ids, word ids, term ptrs,
//   lens, counts, source bytes) sized for MAX_SYNS entries; the m_*Ptr
//   cursors advance in lockstep as each synonym is emitted
long Synonyms::getSynonyms ( Words *words ,
			     long wordNum ,
			     uint8_t langId ,
			     char *tmpBuf ,
			     long niceness ) {
	// punct words have no synoyms
	if ( ! words->m_wordIds[wordNum] ) return 0;
	// store these
	m_words     = words;
	m_docLangId = langId;
	m_niceness  = niceness;
	// sanity check (deliberate crash-on-corruption idiom used file-wide)
	if ( wordNum > m_words->m_numWords ) { char *xx=NULL;*xx=0; }
	// init the dedup table to dedup wordIds
	HashTableX dt;
	char dbuf[512];
	dt.set(8,0,12,dbuf,512,false,m_niceness,"altwrds");
	long maxSyns = (long)MAX_SYNS;
	char *bufPtr = tmpBuf;
	// point into buffer: carve tmpBuf into the parallel output arrays.
	// NOTE(review): the "* 4" strides for the char*/long arrays assume
	// 32-bit pointers and longs -- confirm the build target, a 64-bit
	// build would overlap these arrays
	m_aids = (long long *)bufPtr;
	bufPtr += maxSyns * 8;
	// then the word ids
	m_wids0 = (long long *)bufPtr;
	bufPtr += maxSyns * 8;
	// second word ids, for multi alnum word synonyms, i.e. "New Jersey"
	m_wids1 = (long long *)bufPtr;
	bufPtr += maxSyns * 8;
	m_termPtrs = (char **)bufPtr;
	bufPtr += maxSyns * 4;
	m_termLens = (long *)bufPtr;
	bufPtr += maxSyns * 4;
	m_numAlnumWords = (long *)bufPtr;
	bufPtr += maxSyns * 4;
	m_numAlnumWordsInBase = (long *)bufPtr;
	bufPtr += maxSyns * 4;
	// source
	m_src = bufPtr;
	bufPtr += maxSyns;
	// cursors: one write cursor per parallel array, advanced together
	m_aidsPtr  = m_aids;
	m_wids0Ptr = m_wids0;
	m_wids1Ptr = m_wids1;
	m_srcPtr   = m_src;
	m_termPtrsPtr = m_termPtrs;
	m_termLensPtr = m_termLens;
	m_numAlnumWordsPtr = m_numAlnumWords;
	m_numAlnumWordsInBasePtr = m_numAlnumWordsInBase;

	char *w    = m_words->m_words   [wordNum];
	long  wlen = m_words->m_wordLens[wordNum];

	//
	// NOW hit wiktionary
	// Trust this less then our s_exceptions above, but more than
	// our morph computations below
	//

	char sourceId = SOURCE_WIKTIONARY;
	char *ss = NULL;
	long long bwid;
	char wikiLangId = m_docLangId;
	bool hadSpace ;
	long klen ;
	long baseNumAlnumWords;

	// we jump back here with wikiLangId forced to English if the doc
	// language yielded no synset (see below)
 tryOtherLang:

	/*
	// if word only exists in one language, assume that language for word
	// even if m_docLangId is langUnknown (0)
	if ( ! ss && ! m_docLangId && ! wikiLangId ) {
		// get raw word id
		bwid = m_words->m_wordIds[wordNum];
		// each lang has its own bit
		long long bits = g_speller.getLangBits64 ( &bwid );
		// skip if not unique
		char count = getNumBitsOn64 ( bits ) ;
		// if we only got one lang we could be, assume that
		if ( count == 1 )
			// get it. bit #0 is english, so add 1
			wikiLangId = getBitPosLL((uint8_t *)&bits) + 1;
		// try setting based on script. greek. russian. etc.
		// if the word was not in the wiktionary.
		// this will be langUnknown if not definitive.
		else
			wikiLangId = getCharacterLanguage(w);
	}
	*/

	// try looking up bigram so "new jersey" gets "nj" as synonym
	if ( wikiLangId &&
	     wordNum+2< m_words->m_numWords &&
	     m_words->m_wordIds[wordNum+2]) {
		// get phrase id bigram then
		long conti = 0;
		bwid = hash64Lower_utf8_cont(w,wlen,0,&conti);
		// then the next word (wordNum+1 is the punct between them)
		char *wp2  = m_words->m_words[wordNum+2];
		long  wlen2 = m_words->m_wordLens[wordNum+2];
		bwid = hash64Lower_utf8_cont(wp2,wlen2,bwid,&conti);
		baseNumAlnumWords = 2;
		ss = g_wiktionary.getSynSet( bwid, wikiLangId );
	}

	// need a language for wiktionary to work with; fall back to the
	// single-word lookup if the bigram found nothing
	if ( wikiLangId && ! ss ) {
		// get raw word id
		bwid = m_words->m_wordIds[wordNum];
		baseNumAlnumWords = 1;
		//if ( bwid == 1424622907102375150LL)
		//	log("a");
		ss = g_wiktionary.getSynSet( bwid, wikiLangId );
		// if that failed try removing 's from word if there
		if ( ! ss &&
		     wlen >= 3 &&
		     w[wlen-2]=='\'' &&
		     w[wlen-1]=='s' ) {
			long long cwid = hash64Lower_utf8(w,wlen-2);
			ss = g_wiktionary.getSynSet( cwid, wikiLangId );
		}
	}

	// even though a document may be in german it often has some
	// english words "pdf download" "copyright" etc. so if the word
	// has no synset in german, try it in english
	if ( //numPresets == 0 &&
	     ! ss &&
	     m_docLangId != langEnglish &&
	     wikiLangId != langEnglish &&
	     m_docLangId &&
	     g_speller.getSynsInEnglish(w,wlen,m_docLangId,langEnglish) ) {
		// try english
		wikiLangId = langEnglish;
		sourceId   = SOURCE_WIKTIONARY_EN;
		goto tryOtherLang;
	}

	// if it was in wiktionary, just use that synset
	if ( ss ) {
		// prepare th dedup table for multi-synset words
		// (this dbuf shadows the outer one; intentional-looking
		// but worth knowing when reading)
		HashTableX dedup;
		HashTableX *dd = NULL;
		char dbuf[512];
		long count = 0;
	addSynSet:
		// do we have another set following this
		// NOTE(review): this passes m_docLangId while the synset was
		// fetched with wikiLangId (which may have been switched to
		// English above) -- confirm that asymmetry is intended
		char *next = g_wiktionary.getNextSynSet(bwid,m_docLangId,ss);
		// if so, init the dedup table then
		if ( next && ! dd ) {
			dd = &dedup;
			dd->set ( 8,0,8,dbuf,512,false,m_niceness,"sddbuf");
		}
		// skip over the pipe i guess (synset records look like
		// "<2-char lang>|word1,word2,...\n")
		char *pipe = ss + 2;
		// zh_ch?  3-char lang codes carry an extra "_xx"
		if ( *pipe == '_' ) pipe += 3;
		// sanity
		if ( *pipe != '|' ) { char *xx=NULL;*xx=0; }
		// point to word list
		char *p = pipe + 1;
		// hash up the list of words, they are in utf8 and
		// comma-separated; e scans to the end of each term
		char *e = p + 1;
		// save count in case we need to undo
		//long saved = m_numAlts[wordNum];
	hashLoop:
		// skip synonyms that are anagrams because its to ambiguous
		// the are mappings like
		// "PC" -> "PC,Personal Computer"
		// "PC" -> "PC,Probable Cause" ... (lots more!)
		//bool isAnagram = true;
		// advance e to the terminating comma or newline of this term
		for ( ; *e !='\n' && *e != ',' ; e++ ) ;
		//	if ( ! is_upper_a(*e) ) isAnagram = false;
		// get it
		long long h = hash64Lower_utf8_nospaces ( p , e - p );
		// skip if same as base word
		if ( h == bwid ) goto getNextSyn;
		// should we check for dups?
		if ( dd ) {
			// skip dups
			if ( dd->isInTable(&h) ) goto getNextSyn;
			// dedup. return false with g_errno set on error
			if ( ! dd->addKey(&h) ) return m_aidsPtr - m_aids;
		}
		// store it
		*m_aidsPtr++ = h;
		// store source
		*m_srcPtr++ = sourceId;
		// detect multi-word synonyms ("New Jersey") by any
		// whitespace inside the term
		hadSpace = false;
		klen = e - p;
		for ( long k = 0 ; k < klen ; k++ )
			if ( is_wspace_a(p[k]) ) hadSpace = true;
		*m_termPtrsPtr++ = p;
		*m_termLensPtr++ = e-p;
		// only for multi-word synonyms like "New Jersey"...
		*m_wids0Ptr = 0LL;
		*m_wids1Ptr = 0LL;
		*m_numAlnumWordsPtr = 1;
		// and for multi alnum word synonyms: tokenize the term and
		// record the first two alnum word ids
		if ( hadSpace ) {
			Words sw;
			sw.setx ( p , e - p , m_niceness );
			*(long long *)m_wids0Ptr = sw.m_wordIds[0];
			*(long long *)m_wids1Ptr = sw.m_wordIds[2];
			*(long *)m_numAlnumWordsPtr = sw.getNumAlnumWords();
		}
		m_wids0Ptr++;
		m_wids1Ptr++;
		m_numAlnumWordsPtr++;
		// how many words did we have to hash to find a synset?
		// i.e. "new jersey" would be 2, to get "nj"
		*m_numAlnumWordsInBasePtr++ = baseNumAlnumWords;
		// do not breach
		if ( ++count >= maxSyns ) goto done;
	getNextSyn:
		// loop for more
		if ( *e == ',' ) { e++; p = e; goto hashLoop; }
		// add in the next syn set, deduped
		if ( next ) { ss = next; goto addSynSet; }
		// wrap it up
	done:
		// all done
		return m_aidsPtr - m_aids;
	}

	// no wiktionary synset: fall back to generated variants.
	// strip marks from THIS word, return -1 w/ g_errno set on error
	if ( ! addStripped ( w , wlen,&dt ) ) return m_aidsPtr - m_aids;

	// returns false with g_errno set ("D & B" -> "dandb" bigram)
	if ( ! addAmpPhrase ( wordNum, &dt ) ) return m_aidsPtr - m_aids;

	// if we end in apostrophe, strip and add
	if ( wlen>= 3 &&
	     w[wlen-1] == 's' &&
	     w[wlen-2]=='\'' &&
	     ! addWithoutApostrophe ( wordNum, &dt ) )
		return m_aidsPtr - m_aids;

	return m_aidsPtr - m_aids;
}
// . add the phrase that starts with the ith word
// . "read Of Mice and Men" should make 3 phrases:
// . read.ofmice
// . ofmice
// . mice.andmen
// . writes m_phraseIds2[i]/m_phraseIds3[i] (2- and 3-word phrase hashes),
//   m_numWordsTotal2[i]/m_numWordsTotal3[i] (token span including punct),
//   and m_phraseSpam[i] (PSKIP when no phrase can start at word #i)
void Phrases::setPhrase ( int32_t i, int32_t niceness ) {
	// . if the ith word cannot start a phrase then we have no phrase
	// . we indicate NULL phrasesIds with a spam of PSKIP
	// . we now index all regardless! we want to be able to search
	//   for "a thing" or something. so do it!
	//if ( ! m_bits->canStartPhrase ( i ) ) {
	//	m_phraseSpam[i] = PSKIP;
	//	m_phraseIds [i] = 0LL;
	//	return;
	//}
	// MDW: now Weights.cpp should encompass all this logic
	// or if score <= 0, set in Scores.cpp
	//if ( m_wordScores && m_wordScores[i] <= 0 ) {
	//	m_phraseSpam[i] = PSKIP;
	//	m_phraseIds [i] = 0LL;
	//	return;
	//}
	// hash of the phrase (running hash; h2/h3 snapshot it at 2/3 words)
	int64_t h = 0LL;
	// the hash of the two-word phrase (now we do 3,4 and 5 word phrases)
	int64_t h2 = 0LL;
	int64_t h3 = 0LL;
	//int64_t h4 = 0LL;
	//int64_t h5 = 0LL;
	// reset (continuation position for the incremental utf8 hash)
	unsigned char pos = 0;
	// now look for other tokens that should follow the ith token
	int32_t nw = m_words->getNumWords();
	int32_t numWordsInPhrase = 1;
	// use the min spam from all words in the phrase as the spam for phrase
	char minSpam = -1;
	// we need to hash "1 / 8" differently from "1.8" from "1,000" etc.
	char isNum = is_digit(m_wptrs[i][0]);
	// min score
	//int32_t minScore ;
	//if ( m_wordScores ) minScore = m_wordScores[i];
	// if i is not a stop word, it can set the min spam initially
	//if ( ! m_bits->isStopWord(i) &&m_spam ) minSpam = m_spam->getSpam(i);
	// do not include punct/tag words in the m_numWordsTotal[j] count
	// of the total words in the phrase. these are just usesless tails.
	int32_t lastWordj = -1;
	// loop over following words
	int32_t j;
	bool hasHyphen ;
	bool hasStopWord2 ;
	// . NOTE: a token can start a phrase but NOT be in it.
	// . like a large number for example.
	// . wordId is the lower ascii hash of the ith word
	// . NO... this is allowing the query operator PiiPe to start
	//   a phrase but not be in it, then the phrase id ends up just
	//   being the following word's id. causing the synonyms code to
	//   give a synonym which it should not un Synonyms::set()
	if ( ! m_bits->canBeInPhrase(i) )
		// so indeed, skip it then
		goto nophrase;
	//h = hash64 ( h, m_words->getWordId(i));
	h = m_wids[i];
	// set position
	pos = (unsigned char)m_wlens[i];
	//if (m_words->getStripWordId(i))
	//	h2 = hash64 ( h2, m_words->getStripWordId(i));
	//else h2 = h;
	hasHyphen = false;
	hasStopWord2 = m_bits->isStopWord(i);
	// this makes it true now too
	//if ( m_wlens[i] <= 2 ) hasStopWord = true;
	for ( j = i + 1 ; j < nw ; j++ ) {
		QUICKPOLL(niceness);
		// . do not allow more than 32 alnum/punct "words" in a phrase
		// . this prevents phrases with 100,000 words from slowing
		//   us down. would put us in a huge double-nested for loop
		if ( j > i + 32 ) goto nophrase;
		// deal with punct words (wordId 0 means punct/tag token)
		if ( ! m_wids[j] ) {
			// if we cannot pair across word j then break
			if ( ! m_bits->canPairAcross (j) ) break;
			// does it have a hyphen?
			if (j==i+1 && m_words->hasChar(j,'-')) hasHyphen=true;
			/*
			// "D & B" --> dandb
			if (j==i+1 && m_words->hasChar(j,'&')) {
				// set this
				hasStopWord = true;
				// insert "and"
				int32_t conti=pos;
				h = hash64Lower_utf8_cont("and",3,h,&conti);
				pos=conti;
				// the two-word phrase, set it if we need to
				h2 = h;
				m_numWordsTotal2[i] = j-i+1;
			}
			*/
			continue;
		}
		// . if this word can not be in a phrase then continue our
		//   search for a word that can
		// . no punctuation can be in a phrase currently (++?)
		//if ( m_bits->canBeInPhrase (j) ) {
		//}
		// keep this set right
		//if (m_bits->isStopWord(j)||m_wlens[j]<=2) hasStopWord = true;
		//if ( m_bits->isStopWord(j) ) hasStopWord = true;
		// record lastWordj to indicate that word #j was a true word
		lastWordj = j;
		// . stop words should have a 0 spam value so don't count those
		// . added by mdw in march 2002
		/*
		if ( ! m_bits->isStopWord(j) && m_spam ) {
			// maintain the min spam
			char spam = m_spam->getSpam ( j );
			if ( minSpam == -1 || spam < minSpam ) minSpam = spam;
			// . min weight from score vector
			// . normal score here is 256, not 128, so shift
			//   down 3 to normalize it relatively
			//if ( m_wordScores && (m_wordScores[j]>>3)<minScore)
			//	minScore = m_wordScores[j]>>3;
			//if ( m_wordScores && m_wordScores[j] < minScore )
			//	minScore = m_wordScores[j];
		}
		*/
		// if word #j can be in phrase then incorporate it's hash
		if ( m_bits->canBeInPhrase (j) ) {
			// continue the hash
			//unsigned char *p= (unsigned char *)m_wptrs[j];
			//unsigned char *pend = p + m_wlens[j];
			//for ( ; p < pend ; p++ )
			//	h ^= g_hashtab[pos++][*p];
			int32_t conti = pos;
			// . get the punctuation mark separting two numbers
			// . use space if can't find one
			// . 1/234 1,234 1.234 10/11 "1 234" 1-5
			//if (isNum && j==i + 2 && is_digit(m_wptrs[j][0]) ) {
			//	// get punct mark
			//	char c = m_wptrs[i+1][0];
			//	// if space try next
			//	if(c==' '&&m_wlens[i+1]>1) c=m_wptrs[i+1][1];
			//	// treat comma as nothing
			//	if ( c==',' ) c='\0';
			//	// treat / and . and - as they are, everything
			//	// else should be treated as a space
			//	else if(c!='/'&&c !='.'&& c!='-'&&c!=':')c=' ';
			//	// incorporate into hash if c is there
			//	if (c)h=hash64Lower_utf8_cont(&c,1,h,&conti);
			//}
			// hash the jth word into the hash
			h = hash64Lower_utf8_cont(m_wptrs[j],
						  m_wlens[j],
						  h,
						  &conti );
			pos = conti;
			//h = hash64 ( h , m_words->getWordId (j) );
			//if (m_words->getStripWordId(j))
			//	h2 = hash64 ( h2, m_words->getStripWordId(j));
			//else h2 = hash64(h2, m_words->getWordId(j));
			numWordsInPhrase++;
			// N-word phrases?  snapshot the running hash at the
			// 2-word and 3-word marks
			if ( numWordsInPhrase == 2 ) { // h != h2 ) {
				h2 = h;
				m_numWordsTotal2[i] = j-i+1;
				if ( m_bits->isStopWord(j) )
					hasStopWord2 = true;
				continue;
			}
			if ( numWordsInPhrase == 3 ) {
				h3 = h;
				m_numWordsTotal3[i] = j-i+1;
				//continue;
				break;
			}
			/*
			if ( numWordsInPhrase == 4 ) {
				h4 = h;
				m_numWordsTotal4[i] = j-i+1;
				continue;
			}
			if ( numWordsInPhrase == 5 ) {
				h5 = h;
				m_numWordsTotal5[i] = j-i+1;
				continue;
			}
			*/
		}
		// if we cannot pair across word j then break
		if ( ! m_bits->canPairAcross (j) ) break;
		// keep chugging?
		if ( numWordsInPhrase >= 5 ) {
			// if we're not using stop words then break
			if ( ! m_useStopWords ) break;
			// if it's not a stop word then break
			if ( ! m_bits->isStopWord (j) ) break;
		}
		// otherwise, get the next word
	}
	// if we had no phrase then use 0 as id (need 2+ words to be a pharse)
	if ( numWordsInPhrase <= 1 ) {
		// jumped to directly from the guards above as well
	nophrase:
		m_phraseSpam[i] = PSKIP;
		//m_phraseIds [i] = 0LL;
		m_phraseIds2[i] = 0LL;
		m_phraseIds3[i] = 0LL;
		//m_stripPhraseIds [i] = 0LL;
		//m_numWordsTotal[i] = 0;
		m_numWordsTotal2[i] = 0;
		m_numWordsTotal3[i] = 0;
		return;
	}
	// don't jump the edge
	//if ( j >= nw ) j = nw - 1;
	// sanity check (crash-on-corruption idiom used file-wide)
	if ( lastWordj == -1 ) { char *xx = NULL; *xx = 0; }
	// set the phrase length (from word #i upto & including word #j)
	//m_numWordsTotal[i] = j - i + 1;
	//m_numWordsTotal [i] = lastWordj - i + 1;
	// sanity check: spans are stored in a byte elsewhere
	if ( lastWordj - i + 1 > 255 ) { char *xx=NULL;*xx=0; }
	// set the phrase spam
	if ( minSpam == -1 ) minSpam = 0;
	m_phraseSpam[i] = minSpam;
	// return the phraseId
	//m_phraseIds [i] = h;
	// hyphen between numbers does not count (so 1-2 != 12)
	if ( isNum ) hasHyphen = false;
	// . the two word phrase id
	// . "cd rom"    -> cdrom
	// . "fly paper" -> flypaper
	// . "i-phone"   -> iphone
	// . "e-mail"    -> email
	if ( hasHyphen || ! hasStopWord2 ) {
		//m_phraseIds [i] = h;
		m_phraseIds2[i] = h2;
	}
	// . "st. and"    !-> stand
	// . "the rapist" !-> therapist
	// (xor with the magic constant so stop-word phrases cannot collide
	// with a plain concatenation; Synonyms.cpp mirrors this)
	else {
		//m_phraseIds [i] = h ^ 0x768867;
		m_phraseIds2[i] = h2 ^ 0x768867;
	}
	// forget hyphen logic for these
	m_phraseIds3[i] = h3;
	//m_phraseIds4[i] = h4;
	//m_phraseIds5[i] = h5;
	//if ( h != h2 ) m_stripPhraseIds[i] = h2;
	//else           m_stripPhraseIds[i] = 0LL;
	// the score weight, if any
	//if ( m_phraseScores ) m_phraseScores [i] = minScore;
	// sanity check
	//if(m_phraseScores && minScore == 0x7fffffff ) {char *xx =NULL;*xx=0;}
	// debug msg
	//char *w = m_words->getWord(i) ;
	//int32_t wlen = m_words->getWordLen(i) ;
	//for ( int32_t k = 0 ; k < wlen ; k++ )
	//	fprintf(stderr,"%c",w[k]);
	//fprintf(stderr,"--> hash=%"UINT64"\n",(uint64_t)h);
}