bool Synonyms::addWithoutApostrophe ( long wordNum , HashTableX *dt ) { long wlen = m_words->m_wordLens[wordNum]; char *w = m_words->m_words[wordNum]; wlen -= 2; uint64_t h = hash64Lower_utf8 ( w, wlen ); // do not add dups if ( dt->isInTable ( &h ) ) return true; // add to dedup table. return false with g_errno set on error if ( ! dt->addKey ( &h ) ) return false; // store that *m_aidsPtr++ = h; *m_wids0Ptr++ = 0LL; *m_wids1Ptr++ = 0LL; *m_termPtrsPtr++ = NULL; *m_termLensPtr++ = 0; *m_numAlnumWordsPtr++ = 1; *m_numAlnumWordsInBasePtr++ = 1; *m_srcPtr++ = SOURCE_GENERATED; return true; }
// return false and set g_errno on error bool Synonyms::addStripped ( char *w , long wlen , HashTableX *dt ) { // avoid overflow if ( wlen > 200 ) return true; // require utf8 bool hadUtf8 = false; char size; for ( long i = 0 ; i < wlen ; i += size ) { size = getUtf8CharSize(w+i); if ( size == 1 ) continue; hadUtf8 = true; break; } if ( ! hadUtf8 ) return true; // filter out accent marks char abuf[256]; //long alen = utf8ToAscii(abuf,256,(unsigned char *)w,wlen); long alen = stripAccentMarks(abuf,256,(unsigned char *)w,wlen); // skip if can't convert to ascii... (unsupported letter) if ( alen < 0 ) return true; // if same as original word, skip if ( wlen==alen && strncmp(abuf,w,wlen) == 0 ) return true; // hash it uint64_t h2 = hash64Lower_utf8(abuf,alen); // do not add dups if ( dt->isInTable ( &h2 ) ) return true; // add to dedup table. return false with g_errno set if ( ! dt->addKey ( &h2 ) ) return false; // store that *m_aidsPtr++ = h2; *m_wids0Ptr++ = 0LL; *m_wids1Ptr++ = 0LL; *m_termPtrsPtr++ = NULL; *m_termOffsPtr++ = m_synWordBuf.length(); *m_termLensPtr++ = alen; *m_numAlnumWordsPtr++ = 1; *m_numAlnumWordsInBasePtr++ = 1; *m_srcPtr++ = SOURCE_GENERATED; m_synWordBuf.safeStrcpy(abuf); m_synWordBuf.pushChar('\0'); return true; }
// . so now this adds a list of Synonyms to the m_pools[] and returns a ptr // to the first one. // . then the parent caller can store that ptr in the m_wordToSyn[] array // which we pre-alloc upon calling the set() function based on the # of // words we got // . returns # of synonyms stored into "tmpBuf" long Synonyms::getSynonyms ( Words *words , long wordNum , uint8_t langId , char *tmpBuf , long niceness ) { // punct words have no synoyms if ( ! words->m_wordIds[wordNum] ) return 0; // store these m_words = words; m_docLangId = langId; m_niceness = niceness; // sanity check if ( wordNum > m_words->m_numWords ) { char *xx=NULL;*xx=0; } // init the dedup table to dedup wordIds HashTableX dt; char dbuf[512]; dt.set(8,0,12,dbuf,512,false,m_niceness,"altwrds"); long maxSyns = (long)MAX_SYNS; char *bufPtr = tmpBuf; // point into buffer m_aids = (long long *)bufPtr; bufPtr += maxSyns * 8; // then the word ids m_wids0 = (long long *)bufPtr; bufPtr += maxSyns * 8; // second word ids, for multi alnum word synonyms, i.e. 
"New Jersey" m_wids1 = (long long *)bufPtr; bufPtr += maxSyns * 8; m_termPtrs = (char **)bufPtr; bufPtr += maxSyns * 4; m_termLens = (long *)bufPtr; bufPtr += maxSyns * 4; m_numAlnumWords = (long *)bufPtr; bufPtr += maxSyns * 4; m_numAlnumWordsInBase = (long *)bufPtr; bufPtr += maxSyns * 4; // source m_src = bufPtr; bufPtr += maxSyns; // cursors m_aidsPtr = m_aids; m_wids0Ptr = m_wids0; m_wids1Ptr = m_wids1; m_srcPtr = m_src; m_termPtrsPtr = m_termPtrs; m_termLensPtr = m_termLens; m_numAlnumWordsPtr = m_numAlnumWords; m_numAlnumWordsInBasePtr = m_numAlnumWordsInBase; char *w = m_words->m_words [wordNum]; long wlen = m_words->m_wordLens[wordNum]; // // NOW hit wiktionary // Trust this less then our s_exceptions above, but more than // our morph computations below // char sourceId = SOURCE_WIKTIONARY; char *ss = NULL; long long bwid; char wikiLangId = m_docLangId; bool hadSpace ; long klen ; long baseNumAlnumWords; tryOtherLang: /* // if word only exists in one language, assume that language for word // even if m_docLangId is langUnknown (0) if ( ! ss && ! m_docLangId && ! wikiLangId ) { // get raw word id bwid = m_words->m_wordIds[wordNum]; // each lang has its own bit long long bits = g_speller.getLangBits64 ( &bwid ); // skip if not unique char count = getNumBitsOn64 ( bits ) ; // if we only got one lang we could be, assume that if ( count == 1 ) // get it. bit #0 is english, so add 1 wikiLangId = getBitPosLL((uint8_t *)&bits) + 1; // try setting based on script. greek. russian. etc. // if the word was not in the wiktionary. // this will be langUnknown if not definitive. 
else wikiLangId = getCharacterLanguage(w); } */ // try looking up bigram so "new jersey" gets "nj" as synonym if ( wikiLangId && wordNum+2< m_words->m_numWords && m_words->m_wordIds[wordNum+2]) { // get phrase id bigram then long conti = 0; bwid = hash64Lower_utf8_cont(w,wlen,0,&conti); // then the next word char *wp2 = m_words->m_words[wordNum+2]; long wlen2 = m_words->m_wordLens[wordNum+2]; bwid = hash64Lower_utf8_cont(wp2,wlen2,bwid,&conti); baseNumAlnumWords = 2; ss = g_wiktionary.getSynSet( bwid, wikiLangId ); } // need a language for wiktionary to work with if ( wikiLangId && ! ss ) { // get raw word id bwid = m_words->m_wordIds[wordNum]; baseNumAlnumWords = 1; //if ( bwid == 1424622907102375150LL) // log("a"); ss = g_wiktionary.getSynSet( bwid, wikiLangId ); // if that failed try removing 's from word if there if ( ! ss && wlen >= 3 && w[wlen-2]=='\'' && w[wlen-1]=='s' ) { long long cwid = hash64Lower_utf8(w,wlen-2); ss = g_wiktionary.getSynSet( cwid, wikiLangId ); } } // even though a document may be in german it often has some // english words "pdf download" "copyright" etc. so if the word // has no synset in german, try it in english if ( //numPresets == 0 && ! ss && m_docLangId != langEnglish && wikiLangId != langEnglish && m_docLangId && g_speller.getSynsInEnglish(w,wlen,m_docLangId,langEnglish) ) { // try english wikiLangId = langEnglish; sourceId = SOURCE_WIKTIONARY_EN; goto tryOtherLang; } // if it was in wiktionary, just use that synset if ( ss ) { // prepare th HashTableX dedup; HashTableX *dd = NULL; char dbuf[512]; long count = 0; addSynSet: // do we have another set following this char *next = g_wiktionary.getNextSynSet(bwid,m_docLangId,ss); // if so, init the dedup table then if ( next && ! dd ) { dd = &dedup; dd->set ( 8,0,8,dbuf,512,false,m_niceness,"sddbuf"); } // skip over the pipe i guess char *pipe = ss + 2; // zh_ch? 
if ( *pipe == '_' ) pipe += 3; // sanity if ( *pipe != '|' ) { char *xx=NULL;*xx=0; } // point to word list char *p = pipe + 1; // hash up the list of words, they are in utf8 and char *e = p + 1; // save count in case we need to undo //long saved = m_numAlts[wordNum]; hashLoop: // skip synonyms that are anagrams because its to ambiguous // the are mappings like // "PC" -> "PC,Personal Computer" // "PC" -> "PC,Probable Cause" ... (lots more!) //bool isAnagram = true; for ( ; *e !='\n' && *e != ',' ; e++ ) ; // if ( ! is_upper_a(*e) ) isAnagram = false; // get it long long h = hash64Lower_utf8_nospaces ( p , e - p ); // skip if same as base word if ( h == bwid ) goto getNextSyn; // should we check for dups? if ( dd ) { // skip dups if ( dd->isInTable(&h) ) goto getNextSyn; // dedup. return false with g_errno set on error if ( ! dd->addKey(&h) ) return m_aidsPtr - m_aids; } // store it *m_aidsPtr++ = h; // store source *m_srcPtr++ = sourceId; hadSpace = false; klen = e - p; for ( long k = 0 ; k < klen ; k++ ) if ( is_wspace_a(p[k]) ) hadSpace = true; *m_termPtrsPtr++ = p; *m_termLensPtr++ = e-p; // only for multi-word synonyms like "New Jersey"... *m_wids0Ptr = 0LL; *m_wids1Ptr = 0LL; *m_numAlnumWordsPtr = 1; // and for multi alnum word synonyms if ( hadSpace ) { Words sw; sw.setx ( p , e - p , m_niceness ); *(long long *)m_wids0Ptr = sw.m_wordIds[0]; *(long long *)m_wids1Ptr = sw.m_wordIds[2]; *(long *)m_numAlnumWordsPtr = sw.getNumAlnumWords(); } m_wids0Ptr++; m_wids1Ptr++; m_numAlnumWordsPtr++; // how many words did we have to hash to find a synset? // i.e. 
"new jersey" would be 2, to get "nj" *m_numAlnumWordsInBasePtr++ = baseNumAlnumWords; // do not breach if ( ++count >= maxSyns ) goto done; getNextSyn: // loop for more if ( *e == ',' ) { e++; p = e; goto hashLoop; } // add in the next syn set, deduped if ( next ) { ss = next; goto addSynSet; } // wrap it up done: // all done return m_aidsPtr - m_aids; } // strip marks from THIS word, return -1 w/ g_errno set on error if ( ! addStripped ( w , wlen,&dt ) ) return m_aidsPtr - m_aids; // returns false with g_errno set if ( ! addAmpPhrase ( wordNum, &dt ) ) return m_aidsPtr - m_aids; // if we end in apostrophe, strip and add if ( wlen>= 3 && w[wlen-1] == 's' && w[wlen-2]=='\'' && ! addWithoutApostrophe ( wordNum, &dt ) ) return m_aidsPtr - m_aids; return m_aidsPtr - m_aids; }
bool Words::addWords(char *s,long nodeLen,bool computeWordIds, long niceness) { long i = 0; long j; //long k = 0; long wlen; //unsigned long e; //long skip; long badCount = 0; bool hadApostrophe = false; UCScript oldScript = ucScriptCommon; UCScript saved; UCProps props; uptop: // bad utf8 can cause a breach if ( i >= nodeLen ) goto done; if ( ! s[i] ) goto done; if ( ! is_alnum_utf8(s+i) ) { // && m_numWords < m_preCount ) { if ( m_numWords >= m_preCount ) goto done; // tag? if ( s[i]=='<' && m_hasTags && isTagStart(s+i) ) { // get the tag id if ( s[i+1]=='/' ) { // skip over / m_tagIds [m_numWords] = ::getTagId(s+i+2); m_tagIds [m_numWords] |= BACKBIT; } else m_tagIds [m_numWords] = ::getTagId(s+i+1); // word start m_words [m_numWords] = s + i; m_wordIds [m_numWords] = 0LL; // skip till end long tagLen = getTagLen(s+i); // ,niceness); m_wordLens [m_numWords] = tagLen; m_numWords++; // advance i += tagLen; goto uptop; } // it is a punct word, find end of it char *start = s+i; //for (;s[i] && ! is_alnum_utf8(s+i);i+=getUtf8CharSize(s+i)); for ( ; s[i] ; i += getUtf8CharSize(s+i)){ // stop on < if we got tags if ( s[i] == '<' && m_hasTags ) break; // breathe QUICKPOLL(niceness); // if we are simple ascii, skip quickly if ( is_ascii(s[i]) ) { // accumulate NON-alnum chars if ( ! is_alnum_a(s[i]) ) continue; // update oldScript = ucScriptCommon; // otherwise, stop we got alnum break; } // if we are utf8 we stop on special props UChar32 c = utf8Decode ( s+i ); // stop if word char if ( ! ucIsWordChar ( c ) ) continue; // update first though oldScript = ucGetScript ( c ); // then stop break; } m_words [ m_numWords ] = start; m_wordLens [ m_numWords ] = s+i - start; m_wordIds [ m_numWords ] = 0LL; if (m_tagIds) m_tagIds[m_numWords] = 0; m_numWords++; goto uptop; } // get an alnum word j = i; again: //for ( ; is_alnum_utf8 (&s[i] ) ; i += getUtf8CharSize(s+i) ); for ( ; s[i] ; i += getUtf8CharSize(s+i) ) { // breathe QUICKPOLL(niceness); // simple ascii? 
if ( is_ascii(s[i]) ) { // accumulate alnum chars if ( is_alnum_a(s[i]) ) continue; // update oldScript = ucScriptCommon; // otherwise, stop we got punct break; } // get the code point of the utf8 char UChar32 c = utf8Decode ( s+i ); // get props props = ucProperties ( c ); // good stuff? if ( props & (UC_IGNORABLE|UC_EXTEND) ) continue; // stop? if UC_WORCHAR is set, that means its an alnum if ( ! ( props & UC_WORDCHAR ) ) { // reset script between words oldScript = ucScriptCommon; break; } // save it saved = oldScript; // update here oldScript = ucGetScript(c); // treat ucScriptLatin (30) as common so we can have latin1 // like char without breaking the word! if ( oldScript == ucScriptLatin ) oldScript = ucScriptCommon; // stop on this crap too i guess. like japanes chars? if ( props & ( UC_IDEOGRAPH | UC_HIRAGANA | UC_THAI ) ) { // include it i += getUtf8CharSize(s+i); // but stop break; } // script change? if ( saved != oldScript ) break; } // . java++, A++, C++ exception // . A+, C+, exception // . TODO: consider putting in Bits.cpp w/ D_CAN_BE_IN_PHRASE if ( s[i]=='+' ) { if ( s[i+1]=='+' && !is_alnum_utf8(&s[i+2]) ) i += 2; else if ( !is_alnum_utf8(&s[i+1]) ) i++; } // . c#, j#, ... if ( s[i]=='#' && !is_alnum_utf8(&s[i+1]) ) i++; // allow for words like we're dave's and i'm if(s[i]=='\''&&s[i+1]&&is_alnum_utf8(&s[i+1])&&!hadApostrophe){ i++; hadApostrophe = true; goto again; } hadApostrophe = false; // get word length wlen = i - j; if ( m_numWords >= m_preCount ) goto done; m_words [ m_numWords ] = &s[j]; m_wordLens[ m_numWords ] = wlen; // . Lars says it's better to leave the accented chars intact // . google agrees // . but what about "re'sume"? if ( computeWordIds ) { long long h = hash64Lower_utf8(&s[j],wlen); m_wordIds [m_numWords] = h; // until we get an accent removal algo, comment this // out and possibly use the query synonym pipeline // to search without accents. 
MDW //long long h2 = hash64AsciiLowerE(&s[j],wlen); //if ( h2 != h ) m_stripWordIds [m_numWords] = h2; //else m_stripWordIds [m_numWords] = 0LL; //m_stripWordIds[m_numWords] = 0; } if (m_tagIds) m_tagIds[m_numWords] = 0; m_numWords++; m_numAlnumWords++; // break on \0 or MAX_WORDS //if ( ! s[i] ) goto done; // get a punct word goto uptop; /* j = i; // delineate the "punctuation" word for ( ; s[i] && !is_alnum_utf8(&s[i]);i+=getUtf8CharSize(s+i)); // bad utf8 could cause us to breach the node, so watch out! if ( i > nodeLen ) { badCount++; i = nodeLen; } // get word length wlen = i - j; if ( m_numWords >= m_preCount ) goto done; m_words [m_numWords ] = &s[j]; m_wordLens [m_numWords ] = wlen; m_wordIds [m_numWords ] = 0LL; if (m_tagIds) m_tagIds[m_numWords] = 0; m_numWords++; */ done: // bad programming warning if ( m_numWords > m_preCount ) { log(LOG_LOGIC, "build: words: set: Fix counting routine."); char *xx = NULL; *xx = 0; } // compute total length if ( m_numWords <= 0 ) m_totalLen = 0; else m_totalLen = m_words[m_numWords-1] - s + m_wordLens[m_numWords-1]; if ( badCount ) log("words: had %li bad utf8 chars",badCount); return true; }
// . tokenizes the text "s" and appends the resulting words to the
//   parallel arrays m_words[], m_wordLens[], m_wordIds[], m_nodes[] and
//   (when allocated) m_tagIds[]
// . three kinds of words are emitted: tags (only if m_hasTags), punct
//   words (runs of non-alnum chars) and alnum words. tags and punct words
//   get a word id of 0. m_nodes[] is set to 0 for every word.
// . "nodeLen" is only checked between words, so a word that starts before
//   s+nodeLen can run past it. scanning always stops on a \0.
// . alnum words only get a word id if "computeWordIds" is true
// . stops quietly once m_preCount words have been stored
// . always returns true
// . NOTE(review): the punct loop stores the raw ucGetScript() value in
//   oldScript while the alnum loop maps ucScriptLatin to ucScriptCommon
//   before comparing. it looks like a non-ascii latin char right after
//   punctuation would end the alnum word at length 0 -- confirm.
bool Words::addWords( char *s, int32_t nodeLen, bool computeWordIds ) {
	// i is the scan cursor into "s", j is where the alnum word started
	int32_t  i = 0;
	int32_t  j;
	int32_t  wlen;

	// lets one apostrophe per word through. see below.
	bool hadApostrophe = false;

	// script of the last word char seen, to split words on script changes
	UCScript oldScript = ucScriptCommon;
	UCScript saved;
	UCProps props;

 uptop:

	// bad utf8 can cause a breach
	if ( i >= nodeLen ) {
		goto done;
	}

	if ( ! s[i] ) {
		goto done;
	}

	if ( !is_alnum_utf8( s + i ) ) {
		if ( m_numWords >= m_preCount ) {
			goto done;
		}

		// tag?
		if ( s[i]=='<' && m_hasTags && isTagStart(s+i) ) {
			// get the tag id
			if( m_tagIds ) {
				if ( s[i + 1] == '/' ) {
					// skip over / and flag it as a back tag
					m_tagIds[m_numWords] = ::getTagId( s + i + 2 );
					m_tagIds[m_numWords] |= BACKBIT;
				} else {
					m_tagIds[m_numWords] = ::getTagId( s + i + 1 );
				}
			}

			// the whole tag is one word with a word id of 0
			m_words[m_numWords] = s + i;
			m_wordIds[m_numWords] = 0LL;

			// skip till end
			int32_t tagLen = getTagLen( s + i );
			m_wordLens[m_numWords] = tagLen;
			m_nodes[m_numWords] = 0;
			m_numWords++;

			// advance
			i += tagLen;
			goto uptop;
		}

		// it is a punct word, find end of it
		char *start = s+i;
		for ( ; s[i] ; i += getUtf8CharSize(s+i)) {
			// stop on < if we got tags
			if ( s[i] == '<' && m_hasTags ) {
				break;
			}

			// if we are simple ascii, skip quickly
			if ( is_ascii(s[i]) ) {
				// accumulate NON-alnum chars
				if ( ! is_alnum_a(s[i]) ) {
					continue;
				}

				// update
				oldScript = ucScriptCommon;

				// otherwise, stop we got alnum
				break;
			}

			// if we are utf8 we stop on special props
			UChar32 c = utf8Decode ( s+i );

			// keep going unless it is a word char
			if ( ! ucIsWordChar ( c ) ) {
				continue;
			}

			// update first though
			oldScript = ucGetScript ( c );

			// then stop
			break;
		}
		m_words        [ m_numWords  ] = start;
		m_wordLens     [ m_numWords  ] = s+i - start;
		m_wordIds      [ m_numWords  ] = 0LL;
		m_nodes        [ m_numWords  ] = 0;

		if (m_tagIds) {
			m_tagIds[m_numWords] = 0;
		}

		m_numWords++;
		goto uptop;
	}

	// get an alnum word. it starts at s+j.
	j = i;
 again:
	for ( ; s[i] ; i += getUtf8CharSize(s+i) ) {
		// simple ascii?
		if ( is_ascii(s[i]) ) {
			// accumulate alnum chars
			if ( is_alnum_a(s[i]) ) continue;
			// update
			oldScript = ucScriptCommon;
			// otherwise, stop we got punct
			break;
		}
		// get the code point of the utf8 char
		UChar32 c = utf8Decode ( s+i );
		// get props
		props = ucProperties ( c );
		// ignorable and extending chars never end a word
		if ( props & (UC_IGNORABLE|UC_EXTEND) ) continue;
		// stop? if UC_WORDCHAR is set, that means its an alnum
		if ( ! ( props & UC_WORDCHAR ) ) {
			// reset script between words
			oldScript = ucScriptCommon;
			break;
		}
		// save it
		saved = oldScript;
		// update here
		oldScript = ucGetScript(c);
		// treat ucScriptLatin (30) as common so we can have latin1
		// like char without breaking the word!
		if ( oldScript == ucScriptLatin ) oldScript = ucScriptCommon;
		// ideograph, hiragana and thai chars end the word too
		if ( props & ( UC_IDEOGRAPH | UC_HIRAGANA | UC_THAI ) ) {
			// include it
			i += getUtf8CharSize(s+i);
			// but stop
			break;
		}
		// script change?
		if ( saved != oldScript ) break;
	}

	// . java++, A++, C++ exception
	// . A+, C+, exception
	// . TODO: consider putting in Bits.cpp w/ D_CAN_BE_IN_PHRASE
	if ( s[i]=='+' ) {
		if ( s[i+1]=='+' && !is_alnum_utf8(&s[i+2]) ) i += 2;
		else if ( !is_alnum_utf8(&s[i+1]) ) i++;
	}
	// . c#, j#, ...
	if ( s[i]=='#' && !is_alnum_utf8(&s[i+1]) ) i++;

	// . keep numbers with thousands separators like 1,000,000 together
	// . only if the word so far is 1 to 3 bytes and all of them digits
	if ( s[i]==',' &&
	     i-j <= 3 &&
	     is_digit(s[i-1]) ) {
		// if word so far is 2 or 3 chars, make sure digits
		if ( i-j >= 2 && ! is_digit(s[i-2]) ) goto nogo;
		if ( i-j >= 3 && ! is_digit(s[i-3]) ) goto nogo;
		// scan forward, taking each comma that is followed by
		// exactly three digits
		while ( s[i] == ',' &&
			is_digit(s[i+1]) &&
			is_digit(s[i+2]) &&
			is_digit(s[i+3]) &&
			! is_digit(s[i+4]) ) {
			i += 4;
		}
	}

	// decimal point? it must have a digit on both sides.
	if ( s[i] == '.' &&
	     is_digit(s[i-1]) &&
	     is_digit(s[i+1]) ) {
		// allow the decimal point
		i++;
		// skip over string of digits
		while ( is_digit(s[i]) ) i++;
	}

 nogo:

	// allow for words like we're dave's and i'm. only one apostrophe
	// per word though, hadApostrophe sees to that.
	if ( s[i] == '\'' && s[i + 1] && is_alnum_utf8( &s[i + 1] ) && !hadApostrophe ) {
		i++;
		hadApostrophe = true;
		goto again;
	}
	hadApostrophe = false;

	// get word length
	wlen = i - j;
	if ( m_numWords >= m_preCount ) goto done;
	m_words   [ m_numWords  ] = &s[j];
	m_wordLens[ m_numWords  ] = wlen;

	// m_wordIds[] is not written for this word otherwise
	if ( computeWordIds ) {
		int64_t h = hash64Lower_utf8(&s[j],wlen);
		m_wordIds [m_numWords] = h;
	}

	m_nodes[m_numWords] = 0;
	if (m_tagIds) m_tagIds[m_numWords] = 0;
	m_numWords++;
	m_numAlnumWords++;
	// get a punct word
	goto uptop;

 done:
	// bad programming warning
	if ( m_numWords > m_preCount ) {
		log(LOG_LOGIC, "build: words: set: Fix counting routine.");
		gbshutdownLogicError();
	}

	return true;
}
// Returns the lower-cased utf8 hash of the \0 terminated "word", masked
// with TERMID_MASK.
static int64_t hashWord(const char *word) {
	int64_t termId = hash64Lower_utf8(word);
	termId &= TERMID_MASK;
	return termId;
}
// Returns the lower-cased utf8 hash of the \0 terminated "word" combined
// with the hash of the \0 terminated "prefix", masked with TERMID_MASK.
// The prefix is hashed as is, it is not lower-cased.
static int64_t hashWord(const char *prefix, const char *word) {
	const size_t prefixLen = strlen(prefix);
	uint64_t prefixHash = hash64(prefix, prefixLen);

	int64_t termId = hash64(hash64Lower_utf8(word), prefixHash);
	termId &= TERMID_MASK;
	return termId;
}