uint8_t CountryCode::getLanguageFromDMOZ(long catid) {
	if(!m_init) return(0);
	catcountryrec_t ccr;
	ccr.lval = 0L;
	if(s_catToCountry.getNumSlotsUsed() < 1) return(0);
	long slot = s_catToCountry.getSlot((long)catid);
	if(slot < 0) return(0);
	ccr.lval = s_catToCountry.getValueFromSlot(slot);
	return(ccr.sval.lang);
}
Exemplo n.º 2
0
// . returns 0.0 to 1.0
// . what percent of the alnum words in "w1" are in "w2" from words in [t0,t1)
// . gets 50% points if has all single words, and the other 50% if all phrases
// . Scores class applies to w1 only, use NULL if none
// . use word popularity information for scoring rarer term matches more
// . ONLY CHECKS FIRST 1000 WORDS of w2 for speed
float Title::getSimilarity ( Words  *w1 , int32_t i0 , int32_t i1 ,
			     Words  *w2 , int32_t t0 , int32_t t1 ) {
	// if either empty, that's 0% contained
	if ( w1->getNumWords() <= 0 ) return 0;
	if ( w2->getNumWords() <= 0 ) return 0;
	if ( i0 >= i1 ) return 0;
	if ( t0 >= t1 ) return 0;

	// invalids vals
	if ( i0 < 0   ) return 0;
	if ( t0 < 0   ) return 0;

	// . for this to be useful we must use idf
	// . get the popularity of each word in w1
	// . w1 should only be a few words since it is a title candidate
	// . does not add pop for word #i if scores[i] <= 0
	// . take this out for now since i removed the unified dict,
	//   we could use this if we added popularity to g_wiktionary
	//   but it would have to be language dependent
	Pops pops1;
	Pops pops2;
	if ( ! pops1.set ( w1 , i0 , i1 ) ) return -1.0;
	if ( ! pops2.set ( w2 , t0 , t1 ) ) return -1.0;

	// now hash the words in w1, the needle in the haystack
	int32_t nw1 = w1->getNumWords();
	if ( i1 > nw1 ) i1 = nw1;
	HashTable table;

	// this augments the hash table
	int64_t lastWid   = -1;
	float     lastScore = 0.0;

	// but we cannot have more than 1024 slots then
	if ( ! table.set ( 1024 ) ) return -1.0;

	// and table auto grows when 90% full, so limit us here
	int32_t count    = 0;
	int32_t maxCount = 20;

	// sum up everything we add
	float sum = 0.0;

	// loop over all words in "w1" and hash them
	for ( int32_t i = i0 ; i < i1 ; i++ ) {
		// the word id
		int64_t wid = w1->getWordId(i);

		// skip if not indexable
		if ( wid == 0 ) {
			continue;
		}

		// no room left in table!
		if ( count++ > maxCount ) {
			//logf(LOG_DEBUG, "query: Hash table for title "
			//    "generation too small. Truncating words from w1.");
			break;
		}

		// . make this a float. it ranges from 0.0 to 1.0
		// . 1.0 means the word occurs in 100% of documents sampled
		// . 0.0 means it occurs in none of them
		// . but "val" is the complement of those two statements!
		float score = 1.0 - pops1.getNormalizedPop(i);

		// accumulate
		sum += score;

		// add to table
		if ( ! table.addKey ( (int32_t)wid , (int32_t)score , NULL ) ) {
			return -1.0;
		}

		// if no last wid, continue
		if ( lastWid == -1LL ) {
			lastWid = wid;
			lastScore = score;
			continue;
		}

		// . what was his val?
		// . the "val" of the phrase: 
		float phrScore = score + lastScore;

		// do not count as much as single words
		phrScore *= 0.5;

		// accumulate
		sum += phrScore;

		// get the phrase id
		int64_t pid = hash64 ( wid , lastWid );

		// now add that
		if ( ! table.addKey ( (int32_t)pid , (int32_t)phrScore , NULL ) )
			return -1.0;
		// we are now the last wid
		lastWid   = wid;
		lastScore = score;
	}

	// sanity check. it can't grow cuz we keep lastWids[] 1-1 with it
	if ( table.getNumSlots() != 1024 ) {
		log(LOG_LOGIC,"query: Title has logic bug.");
		return -1.0;
	}

	// accumulate scores of words that are found
	float found = 0.0;

	// reset
	lastWid = -1LL;

	// loop over all words in "w1" and hash them
	for ( int32_t i = t0 ; i < t1 ; i++ ) {
		// the word id
		int64_t wid = w2->getWordId(i);

		// skip if not indexable
		if ( wid == 0 ) {
			continue;
		}

		// . make this a float. it ranges from 0.0 to 1.0
		// . 1.0 means the word occurs in 100% of documents sampled
		// . 0.0 means it occurs in none of them
		// . but "val" is the complement of those two statements!
		float score = 1.0 - pops2.getNormalizedPop(i);

		// accumulate
		sum += score;

		// is it in table? 
		int32_t slot = table.getSlot ( (int32_t)wid ) ;

		// . if in table, add that up to "found"
		// . we essentially find his wid AND our wid, so 2.0 times
		if ( slot >= 0 ) {
			found += 2.0 * score;
		}

		// now the phrase
		if ( lastWid == -1LL ) {
			lastWid = wid;
			lastScore = score;
			continue;
		}

		// . what was his val?
		// . the "val" of the phrase: 
		float phrScore = score + lastScore;

		// do not count as much as single words
		phrScore *= 0.5;

		// accumulate
		sum += phrScore;

		// get the phrase id
		int64_t pid = hash64 ( wid , lastWid );

		// is it in table? 
		slot = table.getSlot ( (int32_t)pid ) ;

		// . accumulate if in there
		// . we essentially find his wid AND our wid, so 2.0 times
		if ( slot >= 0 ) found += 2.0 * phrScore;

		// we are now the last wid
		lastWid   = wid;
		lastScore = score;
	}

	// do not divide by zero
	if ( sum == 0.0 ) return 0.0;
	// sanity check
	//if ( found > sum              ) { char *xx=NULL;*xx=0; }
	if ( found < 0.0 || sum < 0.0 ) { char *xx=NULL;*xx=0; }
	// . return the percentage matched
	// . will range from 0.0 to 1.0
	return found / sum;
}