uint8_t CountryCode::getLanguageFromDMOZ(long catid) { if(!m_init) return(0); catcountryrec_t ccr; ccr.lval = 0L; if(s_catToCountry.getNumSlotsUsed() < 1) return(0); long slot = s_catToCountry.getSlot((long)catid); if(slot < 0) return(0); ccr.lval = s_catToCountry.getValueFromSlot(slot); return(ccr.sval.lang); }
// . returns 0.0 to 1.0 // . what percent of the alnum words in "w1" are in "w2" from words in [t0,t1) // . gets 50% points if has all single words, and the other 50% if all phrases // . Scores class applies to w1 only, use NULL if none // . use word popularity information for scoring rarer term matches more // . ONLY CHECKS FIRST 1000 WORDS of w2 for speed float Title::getSimilarity ( Words *w1 , int32_t i0 , int32_t i1 , Words *w2 , int32_t t0 , int32_t t1 ) { // if either empty, that's 0% contained if ( w1->getNumWords() <= 0 ) return 0; if ( w2->getNumWords() <= 0 ) return 0; if ( i0 >= i1 ) return 0; if ( t0 >= t1 ) return 0; // invalids vals if ( i0 < 0 ) return 0; if ( t0 < 0 ) return 0; // . for this to be useful we must use idf // . get the popularity of each word in w1 // . w1 should only be a few words since it is a title candidate // . does not add pop for word #i if scores[i] <= 0 // . take this out for now since i removed the unified dict, // we could use this if we added popularity to g_wiktionary // but it would have to be language dependent Pops pops1; Pops pops2; if ( ! pops1.set ( w1 , i0 , i1 ) ) return -1.0; if ( ! pops2.set ( w2 , t0 , t1 ) ) return -1.0; // now hash the words in w1, the needle in the haystack int32_t nw1 = w1->getNumWords(); if ( i1 > nw1 ) i1 = nw1; HashTable table; // this augments the hash table int64_t lastWid = -1; float lastScore = 0.0; // but we cannot have more than 1024 slots then if ( ! table.set ( 1024 ) ) return -1.0; // and table auto grows when 90% full, so limit us here int32_t count = 0; int32_t maxCount = 20; // sum up everything we add float sum = 0.0; // loop over all words in "w1" and hash them for ( int32_t i = i0 ; i < i1 ; i++ ) { // the word id int64_t wid = w1->getWordId(i); // skip if not indexable if ( wid == 0 ) { continue; } // no room left in table! if ( count++ > maxCount ) { //logf(LOG_DEBUG, "query: Hash table for title " // "generation too small. Truncating words from w1."); break; } // . make this a float. it ranges from 0.0 to 1.0 // . 1.0 means the word occurs in 100% of documents sampled // . 0.0 means it occurs in none of them // . but "val" is the complement of those two statements! float score = 1.0 - pops1.getNormalizedPop(i); // accumulate sum += score; // add to table if ( ! table.addKey ( (int32_t)wid , (int32_t)score , NULL ) ) { return -1.0; } // if no last wid, continue if ( lastWid == -1LL ) { lastWid = wid; lastScore = score; continue; } // . what was his val? // . the "val" of the phrase: float phrScore = score + lastScore; // do not count as much as single words phrScore *= 0.5; // accumulate sum += phrScore; // get the phrase id int64_t pid = hash64 ( wid , lastWid ); // now add that if ( ! table.addKey ( (int32_t)pid , (int32_t)phrScore , NULL ) ) return -1.0; // we are now the last wid lastWid = wid; lastScore = score; } // sanity check. it can't grow cuz we keep lastWids[] 1-1 with it if ( table.getNumSlots() != 1024 ) { log(LOG_LOGIC,"query: Title has logic bug."); return -1.0; } // accumulate scores of words that are found float found = 0.0; // reset lastWid = -1LL; // loop over all words in "w1" and hash them for ( int32_t i = t0 ; i < t1 ; i++ ) { // the word id int64_t wid = w2->getWordId(i); // skip if not indexable if ( wid == 0 ) { continue; } // . make this a float. it ranges from 0.0 to 1.0 // . 1.0 means the word occurs in 100% of documents sampled // . 0.0 means it occurs in none of them // . but "val" is the complement of those two statements! float score = 1.0 - pops2.getNormalizedPop(i); // accumulate sum += score; // is it in table? int32_t slot = table.getSlot ( (int32_t)wid ) ; // . if in table, add that up to "found" // . we essentially find his wid AND our wid, so 2.0 times if ( slot >= 0 ) { found += 2.0 * score; } // now the phrase if ( lastWid == -1LL ) { lastWid = wid; lastScore = score; continue; } // . what was his val? // . the "val" of the phrase: float phrScore = score + lastScore; // do not count as much as single words phrScore *= 0.5; // accumulate sum += phrScore; // get the phrase id int64_t pid = hash64 ( wid , lastWid ); // is it in table? slot = table.getSlot ( (int32_t)pid ) ; // . accumulate if in there // . we essentially find his wid AND our wid, so 2.0 times if ( slot >= 0 ) found += 2.0 * phrScore; // we are now the last wid lastWid = wid; lastScore = score; } // do not divide by zero if ( sum == 0.0 ) return 0.0; // sanity check //if ( found > sum ) { char *xx=NULL;*xx=0; } if ( found < 0.0 || sum < 0.0 ) { char *xx=NULL;*xx=0; } // . return the percentage matched // . will range from 0.0 to 1.0 return found / sum; }