// just index the first bigram for now to give a little bonus bool Synonyms::addAmpPhrase ( long wordNum , HashTableX *dt ) { // . "D & B" --> dandb // . make the "andb" a suffix //char tbuf[100]; if ( wordNum +2 >= m_words->m_numWords ) return true; if ( ! m_words->m_wordIds [wordNum+2] ) return true; if ( m_words->m_wordLens[wordNum+2] > 50 ) return true; if ( ! m_words->hasChar(wordNum+1,'&') ) return true; long wlen = m_words->m_wordLens[wordNum]; char *w = m_words->m_words[wordNum]; // need this for hash continuation procedure long conti = 0; // hack for "d & b" -> "dandb" uint64_t h = hash64Lower_utf8_cont ( w , wlen,0LL,&conti ); // just make it a bigram with the word "and" after it // . we usually ignore stop words like and when someone does the query // but we give out bonus points if the query term's left or right // bigram has that stop word where it should be. // . so Dave & Barry will index "daveand" as a bigram and the // search for 'Dave and Barry' will give bonus points for that // bigram. h = hash64Lower_utf8_cont ( "and", 3,h,&conti); // logic in Phrases.cpp will xor it with 0x768867 // because it contains a stop word. this prevents "st. // and" from matching "stand". h ^= 0x768867; // do not add dups if ( dt->isInTable ( &h ) ) return true; // add to dedup table. return false with g_errno set on error if ( ! dt->addKey ( &h ) ) return false; // store that *m_aidsPtr++ = h; *m_wids0Ptr++ = 0LL; *m_wids1Ptr++ = 0LL; *m_termOffsPtr++ = m_synWordBuf.length(); m_synWordBuf.safeMemcpy ( w , wlen ); m_synWordBuf.safeStrcpy (" and"); m_synWordBuf.pushChar('\0'); *m_termLensPtr++ = wlen+4; *m_termPtrsPtr++ = NULL; *m_numAlnumWordsPtr++ = 1; *m_numAlnumWordsInBasePtr++ = 1; *m_srcPtr++ = SOURCE_GENERATED; return true; }
// . so now this adds a list of Synonyms to the m_pools[] and returns a ptr
//   to the first one.
// . then the parent caller can store that ptr in the m_wordToSyn[] array
//   which we pre-alloc upon calling the set() function based on the # of
//   words we got
// . returns # of synonyms stored into "tmpBuf"
// . "tmpBuf" is carved up into parallel arrays (ids, word ids, term ptrs,
//   lens, counts, source bytes) sized for MAX_SYNS entries; the m_*Ptr
//   cursors advance in lockstep as each synonym is emitted
long Synonyms::getSynonyms ( Words *words ,
			     long wordNum ,
			     uint8_t langId ,
			     char *tmpBuf ,
			     long niceness ) {
	// punct words have no synoyms
	if ( ! words->m_wordIds[wordNum] ) return 0;
	// store these
	m_words     = words;
	m_docLangId = langId;
	m_niceness  = niceness;
	// sanity check (deliberate crash-on-corruption idiom used file-wide)
	if ( wordNum > m_words->m_numWords ) { char *xx=NULL;*xx=0; }
	// init the dedup table to dedup wordIds
	HashTableX dt;
	char dbuf[512];
	dt.set(8,0,12,dbuf,512,false,m_niceness,"altwrds");
	long maxSyns = (long)MAX_SYNS;
	char *bufPtr = tmpBuf;
	// point into buffer: carve tmpBuf into the parallel output arrays.
	// NOTE(review): the "* 4" strides for the char*/long arrays assume
	// 32-bit pointers and longs -- confirm the build target, a 64-bit
	// build would overlap these arrays
	m_aids = (long long *)bufPtr;
	bufPtr += maxSyns * 8;
	// then the word ids
	m_wids0 = (long long *)bufPtr;
	bufPtr += maxSyns * 8;
	// second word ids, for multi alnum word synonyms, i.e. "New Jersey"
	m_wids1 = (long long *)bufPtr;
	bufPtr += maxSyns * 8;
	m_termPtrs = (char **)bufPtr;
	bufPtr += maxSyns * 4;
	m_termLens = (long *)bufPtr;
	bufPtr += maxSyns * 4;
	m_numAlnumWords = (long *)bufPtr;
	bufPtr += maxSyns * 4;
	m_numAlnumWordsInBase = (long *)bufPtr;
	bufPtr += maxSyns * 4;
	// source
	m_src = bufPtr;
	bufPtr += maxSyns;
	// cursors: one write cursor per parallel array, advanced together
	m_aidsPtr  = m_aids;
	m_wids0Ptr = m_wids0;
	m_wids1Ptr = m_wids1;
	m_srcPtr   = m_src;
	m_termPtrsPtr = m_termPtrs;
	m_termLensPtr = m_termLens;
	m_numAlnumWordsPtr = m_numAlnumWords;
	m_numAlnumWordsInBasePtr = m_numAlnumWordsInBase;

	char *w    = m_words->m_words   [wordNum];
	long  wlen = m_words->m_wordLens[wordNum];

	//
	// NOW hit wiktionary
	// Trust this less then our s_exceptions above, but more than
	// our morph computations below
	//

	char sourceId = SOURCE_WIKTIONARY;
	char *ss = NULL;
	long long bwid;
	char wikiLangId = m_docLangId;
	bool hadSpace ;
	long klen ;
	long baseNumAlnumWords;

	// we jump back here with wikiLangId forced to English if the doc
	// language yielded no synset (see below)
 tryOtherLang:

	/*
	// if word only exists in one language, assume that language for word
	// even if m_docLangId is langUnknown (0)
	if ( ! ss && ! m_docLangId && ! wikiLangId ) {
		// get raw word id
		bwid = m_words->m_wordIds[wordNum];
		// each lang has its own bit
		long long bits = g_speller.getLangBits64 ( &bwid );
		// skip if not unique
		char count = getNumBitsOn64 ( bits ) ;
		// if we only got one lang we could be, assume that
		if ( count == 1 )
			// get it. bit #0 is english, so add 1
			wikiLangId = getBitPosLL((uint8_t *)&bits) + 1;
		// try setting based on script. greek. russian. etc.
		// if the word was not in the wiktionary.
		// this will be langUnknown if not definitive.
		else
			wikiLangId = getCharacterLanguage(w);
	}
	*/

	// try looking up bigram so "new jersey" gets "nj" as synonym
	if ( wikiLangId &&
	     wordNum+2< m_words->m_numWords &&
	     m_words->m_wordIds[wordNum+2]) {
		// get phrase id bigram then
		long conti = 0;
		bwid = hash64Lower_utf8_cont(w,wlen,0,&conti);
		// then the next word (wordNum+1 is the punct between them)
		char *wp2  = m_words->m_words[wordNum+2];
		long  wlen2 = m_words->m_wordLens[wordNum+2];
		bwid = hash64Lower_utf8_cont(wp2,wlen2,bwid,&conti);
		baseNumAlnumWords = 2;
		ss = g_wiktionary.getSynSet( bwid, wikiLangId );
	}

	// need a language for wiktionary to work with; fall back to the
	// single-word lookup if the bigram found nothing
	if ( wikiLangId && ! ss ) {
		// get raw word id
		bwid = m_words->m_wordIds[wordNum];
		baseNumAlnumWords = 1;
		//if ( bwid == 1424622907102375150LL)
		//	log("a");
		ss = g_wiktionary.getSynSet( bwid, wikiLangId );
		// if that failed try removing 's from word if there
		if ( ! ss &&
		     wlen >= 3 &&
		     w[wlen-2]=='\'' &&
		     w[wlen-1]=='s' ) {
			long long cwid = hash64Lower_utf8(w,wlen-2);
			ss = g_wiktionary.getSynSet( cwid, wikiLangId );
		}
	}

	// even though a document may be in german it often has some
	// english words "pdf download" "copyright" etc. so if the word
	// has no synset in german, try it in english
	if ( //numPresets == 0 &&
	     ! ss &&
	     m_docLangId != langEnglish &&
	     wikiLangId != langEnglish &&
	     m_docLangId &&
	     g_speller.getSynsInEnglish(w,wlen,m_docLangId,langEnglish) ) {
		// try english
		wikiLangId = langEnglish;
		sourceId   = SOURCE_WIKTIONARY_EN;
		goto tryOtherLang;
	}

	// if it was in wiktionary, just use that synset
	if ( ss ) {
		// prepare th dedup table for multi-synset words
		// (this dbuf shadows the outer one; intentional-looking
		// but worth knowing when reading)
		HashTableX dedup;
		HashTableX *dd = NULL;
		char dbuf[512];
		long count = 0;
	addSynSet:
		// do we have another set following this
		// NOTE(review): this passes m_docLangId while the synset was
		// fetched with wikiLangId (which may have been switched to
		// English above) -- confirm that asymmetry is intended
		char *next = g_wiktionary.getNextSynSet(bwid,m_docLangId,ss);
		// if so, init the dedup table then
		if ( next && ! dd ) {
			dd = &dedup;
			dd->set ( 8,0,8,dbuf,512,false,m_niceness,"sddbuf");
		}
		// skip over the pipe i guess (synset records look like
		// "<2-char lang>|word1,word2,...\n")
		char *pipe = ss + 2;
		// zh_ch?  3-char lang codes carry an extra "_xx"
		if ( *pipe == '_' ) pipe += 3;
		// sanity
		if ( *pipe != '|' ) { char *xx=NULL;*xx=0; }
		// point to word list
		char *p = pipe + 1;
		// hash up the list of words, they are in utf8 and
		// comma-separated; e scans to the end of each term
		char *e = p + 1;
		// save count in case we need to undo
		//long saved = m_numAlts[wordNum];
	hashLoop:
		// skip synonyms that are anagrams because its to ambiguous
		// the are mappings like
		// "PC" -> "PC,Personal Computer"
		// "PC" -> "PC,Probable Cause" ... (lots more!)
		//bool isAnagram = true;
		// advance e to the terminating comma or newline of this term
		for ( ; *e !='\n' && *e != ',' ; e++ ) ;
		//	if ( ! is_upper_a(*e) ) isAnagram = false;
		// get it
		long long h = hash64Lower_utf8_nospaces ( p , e - p );
		// skip if same as base word
		if ( h == bwid ) goto getNextSyn;
		// should we check for dups?
		if ( dd ) {
			// skip dups
			if ( dd->isInTable(&h) ) goto getNextSyn;
			// dedup. return false with g_errno set on error
			if ( ! dd->addKey(&h) ) return m_aidsPtr - m_aids;
		}
		// store it
		*m_aidsPtr++ = h;
		// store source
		*m_srcPtr++ = sourceId;
		// detect multi-word synonyms ("New Jersey") by any
		// whitespace inside the term
		hadSpace = false;
		klen = e - p;
		for ( long k = 0 ; k < klen ; k++ )
			if ( is_wspace_a(p[k]) ) hadSpace = true;
		*m_termPtrsPtr++ = p;
		*m_termLensPtr++ = e-p;
		// only for multi-word synonyms like "New Jersey"...
		*m_wids0Ptr = 0LL;
		*m_wids1Ptr = 0LL;
		*m_numAlnumWordsPtr = 1;
		// and for multi alnum word synonyms: tokenize the term and
		// record the first two alnum word ids
		if ( hadSpace ) {
			Words sw;
			sw.setx ( p , e - p , m_niceness );
			*(long long *)m_wids0Ptr = sw.m_wordIds[0];
			*(long long *)m_wids1Ptr = sw.m_wordIds[2];
			*(long *)m_numAlnumWordsPtr = sw.getNumAlnumWords();
		}
		m_wids0Ptr++;
		m_wids1Ptr++;
		m_numAlnumWordsPtr++;
		// how many words did we have to hash to find a synset?
		// i.e. "new jersey" would be 2, to get "nj"
		*m_numAlnumWordsInBasePtr++ = baseNumAlnumWords;
		// do not breach
		if ( ++count >= maxSyns ) goto done;
	getNextSyn:
		// loop for more
		if ( *e == ',' ) { e++; p = e; goto hashLoop; }
		// add in the next syn set, deduped
		if ( next ) { ss = next; goto addSynSet; }
		// wrap it up
	done:
		// all done
		return m_aidsPtr - m_aids;
	}

	// no wiktionary synset: fall back to generated variants.
	// strip marks from THIS word, return -1 w/ g_errno set on error
	if ( ! addStripped ( w , wlen,&dt ) ) return m_aidsPtr - m_aids;

	// returns false with g_errno set ("D & B" -> "dandb" bigram)
	if ( ! addAmpPhrase ( wordNum, &dt ) ) return m_aidsPtr - m_aids;

	// if we end in apostrophe, strip and add
	if ( wlen>= 3 &&
	     w[wlen-1] == 's' &&
	     w[wlen-2]=='\'' &&
	     ! addWithoutApostrophe ( wordNum, &dt ) )
		return m_aidsPtr - m_aids;

	return m_aidsPtr - m_aids;
}
// . add the phrase that starts with the ith word
// . "read Of Mice and Men" should make 3 phrases:
// . read.ofmice
// . ofmice
// . mice.andmen
// . writes m_phraseIds2[i]/m_phraseIds3[i] (2- and 3-word phrase hashes),
//   m_numWordsTotal2[i]/m_numWordsTotal3[i] (token span including punct),
//   and m_phraseSpam[i] (PSKIP when no phrase can start at word #i)
void Phrases::setPhrase ( int32_t i, int32_t niceness ) {
	// . if the ith word cannot start a phrase then we have no phrase
	// . we indicate NULL phrasesIds with a spam of PSKIP
	// . we now index all regardless! we want to be able to search
	//   for "a thing" or something. so do it!
	//if ( ! m_bits->canStartPhrase ( i ) ) {
	//	m_phraseSpam[i] = PSKIP;
	//	m_phraseIds [i] = 0LL;
	//	return;
	//}
	// MDW: now Weights.cpp should encompass all this logic
	// or if score <= 0, set in Scores.cpp
	//if ( m_wordScores && m_wordScores[i] <= 0 ) {
	//	m_phraseSpam[i] = PSKIP;
	//	m_phraseIds [i] = 0LL;
	//	return;
	//}
	// hash of the phrase (running hash; h2/h3 snapshot it at 2/3 words)
	int64_t h = 0LL;
	// the hash of the two-word phrase (now we do 3,4 and 5 word phrases)
	int64_t h2 = 0LL;
	int64_t h3 = 0LL;
	//int64_t h4 = 0LL;
	//int64_t h5 = 0LL;
	// reset (continuation position for the incremental utf8 hash)
	unsigned char pos = 0;
	// now look for other tokens that should follow the ith token
	int32_t nw = m_words->getNumWords();
	int32_t numWordsInPhrase = 1;
	// use the min spam from all words in the phrase as the spam for phrase
	char minSpam = -1;
	// we need to hash "1 / 8" differently from "1.8" from "1,000" etc.
	char isNum = is_digit(m_wptrs[i][0]);
	// min score
	//int32_t minScore ;
	//if ( m_wordScores ) minScore = m_wordScores[i];
	// if i is not a stop word, it can set the min spam initially
	//if ( ! m_bits->isStopWord(i) &&m_spam ) minSpam = m_spam->getSpam(i);
	// do not include punct/tag words in the m_numWordsTotal[j] count
	// of the total words in the phrase. these are just usesless tails.
	int32_t lastWordj = -1;
	// loop over following words
	int32_t j;
	bool hasHyphen ;
	bool hasStopWord2 ;
	// . NOTE: a token can start a phrase but NOT be in it.
	// . like a large number for example.
	// . wordId is the lower ascii hash of the ith word
	// . NO... this is allowing the query operator PiiPe to start
	//   a phrase but not be in it, then the phrase id ends up just
	//   being the following word's id. causing the synonyms code to
	//   give a synonym which it should not un Synonyms::set()
	if ( ! m_bits->canBeInPhrase(i) )
		// so indeed, skip it then
		goto nophrase;
	//h = hash64 ( h, m_words->getWordId(i));
	h = m_wids[i];
	// set position
	pos = (unsigned char)m_wlens[i];
	//if (m_words->getStripWordId(i))
	//	h2 = hash64 ( h2, m_words->getStripWordId(i));
	//else h2 = h;
	hasHyphen = false;
	hasStopWord2 = m_bits->isStopWord(i);
	// this makes it true now too
	//if ( m_wlens[i] <= 2 ) hasStopWord = true;
	for ( j = i + 1 ; j < nw ; j++ ) {
		QUICKPOLL(niceness);
		// . do not allow more than 32 alnum/punct "words" in a phrase
		// . this prevents phrases with 100,000 words from slowing
		//   us down. would put us in a huge double-nested for loop
		if ( j > i + 32 ) goto nophrase;
		// deal with punct words (wordId 0 means punct/tag token)
		if ( ! m_wids[j] ) {
			// if we cannot pair across word j then break
			if ( ! m_bits->canPairAcross (j) ) break;
			// does it have a hyphen?
			if (j==i+1 && m_words->hasChar(j,'-')) hasHyphen=true;
			/*
			// "D & B" --> dandb
			if (j==i+1 && m_words->hasChar(j,'&')) {
				// set this
				hasStopWord = true;
				// insert "and"
				int32_t conti=pos;
				h = hash64Lower_utf8_cont("and",3,h,&conti);
				pos=conti;
				// the two-word phrase, set it if we need to
				h2 = h;
				m_numWordsTotal2[i] = j-i+1;
			}
			*/
			continue;
		}
		// . if this word can not be in a phrase then continue our
		//   search for a word that can
		// . no punctuation can be in a phrase currently (++?)
		//if ( m_bits->canBeInPhrase (j) ) {
		//}
		// keep this set right
		//if (m_bits->isStopWord(j)||m_wlens[j]<=2) hasStopWord = true;
		//if ( m_bits->isStopWord(j) ) hasStopWord = true;
		// record lastWordj to indicate that word #j was a true word
		lastWordj = j;
		// . stop words should have a 0 spam value so don't count those
		// . added by mdw in march 2002
		/*
		if ( ! m_bits->isStopWord(j) && m_spam ) {
			// maintain the min spam
			char spam = m_spam->getSpam ( j );
			if ( minSpam == -1 || spam < minSpam ) minSpam = spam;
			// . min weight from score vector
			// . normal score here is 256, not 128, so shift
			//   down 3 to normalize it relatively
			//if ( m_wordScores && (m_wordScores[j]>>3)<minScore)
			//	minScore = m_wordScores[j]>>3;
			//if ( m_wordScores && m_wordScores[j] < minScore )
			//	minScore = m_wordScores[j];
		}
		*/
		// if word #j can be in phrase then incorporate it's hash
		if ( m_bits->canBeInPhrase (j) ) {
			// continue the hash
			//unsigned char *p= (unsigned char *)m_wptrs[j];
			//unsigned char *pend = p + m_wlens[j];
			//for ( ; p < pend ; p++ )
			//	h ^= g_hashtab[pos++][*p];
			int32_t conti = pos;
			// . get the punctuation mark separting two numbers
			// . use space if can't find one
			// . 1/234 1,234 1.234 10/11 "1 234" 1-5
			//if (isNum && j==i + 2 && is_digit(m_wptrs[j][0]) ) {
			//	// get punct mark
			//	char c = m_wptrs[i+1][0];
			//	// if space try next
			//	if(c==' '&&m_wlens[i+1]>1) c=m_wptrs[i+1][1];
			//	// treat comma as nothing
			//	if ( c==',' ) c='\0';
			//	// treat / and . and - as they are, everything
			//	// else should be treated as a space
			//	else if(c!='/'&&c !='.'&& c!='-'&&c!=':')c=' ';
			//	// incorporate into hash if c is there
			//	if (c)h=hash64Lower_utf8_cont(&c,1,h,&conti);
			//}
			// hash the jth word into the hash
			h = hash64Lower_utf8_cont(m_wptrs[j],
						  m_wlens[j],
						  h,
						  &conti );
			pos = conti;
			//h = hash64 ( h , m_words->getWordId (j) );
			//if (m_words->getStripWordId(j))
			//	h2 = hash64 ( h2, m_words->getStripWordId(j));
			//else h2 = hash64(h2, m_words->getWordId(j));
			numWordsInPhrase++;
			// N-word phrases?  snapshot the running hash at the
			// 2-word and 3-word marks
			if ( numWordsInPhrase == 2 ) { // h != h2 ) {
				h2 = h;
				m_numWordsTotal2[i] = j-i+1;
				if ( m_bits->isStopWord(j) )
					hasStopWord2 = true;
				continue;
			}
			if ( numWordsInPhrase == 3 ) {
				h3 = h;
				m_numWordsTotal3[i] = j-i+1;
				//continue;
				break;
			}
			/*
			if ( numWordsInPhrase == 4 ) {
				h4 = h;
				m_numWordsTotal4[i] = j-i+1;
				continue;
			}
			if ( numWordsInPhrase == 5 ) {
				h5 = h;
				m_numWordsTotal5[i] = j-i+1;
				continue;
			}
			*/
		}
		// if we cannot pair across word j then break
		if ( ! m_bits->canPairAcross (j) ) break;
		// keep chugging?
		if ( numWordsInPhrase >= 5 ) {
			// if we're not using stop words then break
			if ( ! m_useStopWords ) break;
			// if it's not a stop word then break
			if ( ! m_bits->isStopWord (j) ) break;
		}
		// otherwise, get the next word
	}
	// if we had no phrase then use 0 as id (need 2+ words to be a pharse)
	if ( numWordsInPhrase <= 1 ) {
		// jumped to directly from the guards above as well
	nophrase:
		m_phraseSpam[i] = PSKIP;
		//m_phraseIds [i] = 0LL;
		m_phraseIds2[i] = 0LL;
		m_phraseIds3[i] = 0LL;
		//m_stripPhraseIds [i] = 0LL;
		//m_numWordsTotal[i] = 0;
		m_numWordsTotal2[i] = 0;
		m_numWordsTotal3[i] = 0;
		return;
	}
	// don't jump the edge
	//if ( j >= nw ) j = nw - 1;
	// sanity check (crash-on-corruption idiom used file-wide)
	if ( lastWordj == -1 ) { char *xx = NULL; *xx = 0; }
	// set the phrase length (from word #i upto & including word #j)
	//m_numWordsTotal[i] = j - i + 1;
	//m_numWordsTotal [i] = lastWordj - i + 1;
	// sanity check: spans are stored in a byte elsewhere
	if ( lastWordj - i + 1 > 255 ) { char *xx=NULL;*xx=0; }
	// set the phrase spam
	if ( minSpam == -1 ) minSpam = 0;
	m_phraseSpam[i] = minSpam;
	// return the phraseId
	//m_phraseIds [i] = h;
	// hyphen between numbers does not count (so 1-2 != 12)
	if ( isNum ) hasHyphen = false;
	// . the two word phrase id
	// . "cd rom"    -> cdrom
	// . "fly paper" -> flypaper
	// . "i-phone"   -> iphone
	// . "e-mail"    -> email
	if ( hasHyphen || ! hasStopWord2 ) {
		//m_phraseIds [i] = h;
		m_phraseIds2[i] = h2;
	}
	// . "st. and"    !-> stand
	// . "the rapist" !-> therapist
	// (xor with the magic constant so stop-word phrases cannot collide
	// with a plain concatenation; Synonyms.cpp mirrors this)
	else {
		//m_phraseIds [i] = h ^ 0x768867;
		m_phraseIds2[i] = h2 ^ 0x768867;
	}
	// forget hyphen logic for these
	m_phraseIds3[i] = h3;
	//m_phraseIds4[i] = h4;
	//m_phraseIds5[i] = h5;
	//if ( h != h2 ) m_stripPhraseIds[i] = h2;
	//else           m_stripPhraseIds[i] = 0LL;
	// the score weight, if any
	//if ( m_phraseScores ) m_phraseScores [i] = minScore;
	// sanity check
	//if(m_phraseScores && minScore == 0x7fffffff ) {char *xx =NULL;*xx=0;}
	// debug msg
	//char *w = m_words->getWord(i) ;
	//int32_t wlen = m_words->getWordLen(i) ;
	//for ( int32_t k = 0 ; k < wlen ; k++ )
	//	fprintf(stderr,"%c",w[k]);
	//fprintf(stderr,"--> hash=%"UINT64"\n",(uint64_t)h);
}