void gbiconv_reset(){
	for (long i=0;i<s_convTable.getNumSlots();i++){
		long key = s_convTable.getKey(i);
		if (!key) continue;
		iconv_t conv = (iconv_t)s_convTable.getValueFromSlot(i);
		if (!conv) continue;
		//logf(LOG_DEBUG, "iconv: freeing iconv: 0x%x", (int)iconv);
		g_mem.rmMem((void*)conv, 52, "iconv");
		libiconv_close(conv);
	}
	s_convTable.reset();
}
// . builds catcountry.dat: maps each DMOZ catid to a packed country/lang pair
// . returns 1 on success, 0 on error
// Do not call this function lightly, it takes an hour to run
int CountryCode::createHashTable(void) {
	// the regex table drives lookupCountryFromDMOZTopic/s_getLangIdxFromDMOZ
	if(!fillRegexTable()) return(0);

	char tmpbuf[2048];
	HashTable ht;
	// use ULL literal to match the unsigned long long type
	unsigned long long entries = 0ULL;
	long catid;
	long numcats = g_categories->m_numCats;
	catcountryrec_t ccr;
	SafeBuf sb(tmpbuf, 2048);

	log( "cat: Creating category country/language table.\n");

	if(!ht.set(2,NULL,0,"ctrycode")) {
		log( "cat: Could not allocate memory for table.\n");
		return(0);
	}
	for(long idx = 0; idx < numcats; idx++) {
		catid = g_categories->m_cats[idx].m_catid;
		sb.reset();
		g_categories->printPathFromId(&sb, catid, true);
		if(!sb.getBufStart()) continue;
		// progress report every 1000 categories
		// BUGFIX: was "numcats % 1000" (loop-invariant, so it logged on
		// every iteration or never) with idx/numcats passed in swapped
		// order; log "current/total" as the format string implies
		if(!(idx % 1000))
			log( "init: %ld/%ld Generated %llu so far...\n",
					idx,
					numcats,
					entries);
		ccr.lval = 0L;
		ccr.sval.country = lookupCountryFromDMOZTopic(sb.getBufStart(), sb.length());
		ccr.sval.lang = s_getLangIdxFromDMOZ(sb.getBufStart(), sb.length());
		// no country and no language detected: nothing to store
		if(!ccr.lval) continue;
		// sanity: deliberate crash on out-of-range ids
		// NOTE(review): "> s_numCountryCodes" lets country == s_numCountryCodes
		// through — confirm whether that should be ">="
		if(ccr.sval.lang > 27 || ccr.sval.country > s_numCountryCodes) {
			char *xx = NULL; *xx = 0;
		}
		if(!ht.addKey(catid, ccr.lval)) {
			log( "init: Could not add %ld (%ld)\n", catid, ccr.lval);
			continue;
		}
		entries++;
	}

	ht.save(g_hostdb.m_dir, "catcountry.dat");
	log( "Added %llu country entries from DMOZ to %s/catcountry.dat.\n", entries,g_hostdb.m_dir);
	log( "Slots %ld, Used Slots %ld.\n", ht.getNumSlots(), ht.getNumSlotsUsed());

	freeRegexTable();
	return(1);
}
// Exemplo n.º 3  (code-sample-site artifact; commented out so the file compiles)
// 0
// . returns 0.0 to 1.0
// . what percent of the alnum words in "w1" are in "w2" from words in [t0,t1)
// . gets 50% points if has all single words, and the other 50% if all phrases
// . Scores class applies to w1 only, use NULL if none
// . use word popularity information for scoring rarer term matches more
// . ONLY CHECKS FIRST 1000 WORDS of w2 for speed
// . returns -1.0 on error (Pops/table allocation failure or logic bug)
float Title::getSimilarity ( Words  *w1 , int32_t i0 , int32_t i1 ,
			     Words  *w2 , int32_t t0 , int32_t t1 ) {
	// if either empty, that's 0% contained
	if ( w1->getNumWords() <= 0 ) return 0;
	if ( w2->getNumWords() <= 0 ) return 0;
	if ( i0 >= i1 ) return 0;
	if ( t0 >= t1 ) return 0;

	// invalids vals
	if ( i0 < 0   ) return 0;
	if ( t0 < 0   ) return 0;

	// . for this to be useful we must use idf
	// . get the popularity of each word in w1
	// . w1 should only be a few words since it is a title candidate
	// . does not add pop for word #i if scores[i] <= 0
	// . take this out for now since i removed the unified dict,
	//   we could use this if we added popularity to g_wiktionary
	//   but it would have to be language dependent
	Pops pops1;
	Pops pops2;
	if ( ! pops1.set ( w1 , i0 , i1 ) ) return -1.0;
	if ( ! pops2.set ( w2 , t0 , t1 ) ) return -1.0;

	// now hash the words in w1, the needle in the haystack
	int32_t nw1 = w1->getNumWords();
	// clamp the requested end index to what w1 actually holds
	if ( i1 > nw1 ) i1 = nw1;
	HashTable table;

	// running state for phrase (bigram) hashing: previous word's id/score
	int64_t lastWid   = -1;
	float     lastScore = 0.0;

	// but we cannot have more than 1024 slots then
	if ( ! table.set ( 1024 ) ) return -1.0;

	// and table auto grows when 90% full, so limit us here
	// (maxCount keys kept well below 90% of 1024 so the table never grows)
	int32_t count    = 0;
	int32_t maxCount = 20;

	// sum up everything we add
	float sum = 0.0;

	// loop over all words in "w1" and hash them
	for ( int32_t i = i0 ; i < i1 ; i++ ) {
		// the word id
		int64_t wid = w1->getWordId(i);

		// skip if not indexable
		if ( wid == 0 ) {
			continue;
		}

		// no room left in table!
		if ( count++ > maxCount ) {
			//logf(LOG_DEBUG, "query: Hash table for title "
			//    "generation too small. Truncating words from w1.");
			break;
		}

		// . make this a float. it ranges from 0.0 to 1.0
		// . 1.0 means the word occurs in 100% of documents sampled
		// . 0.0 means it occurs in none of them
		// . but "val" is the complement of those two statements!
		float score = 1.0 - pops1.getNormalizedPop(i);

		// accumulate
		sum += score;

		// add to table
		// NOTE(review): (int32_t)score truncates a 0.0-1.0 float to 0,
		// so the stored value is useless -- but only key PRESENCE is
		// checked later (getSlot), so behavior is unaffected
		if ( ! table.addKey ( (int32_t)wid , (int32_t)score , NULL ) ) {
			return -1.0;
		}

		// if no last wid, continue
		if ( lastWid == -1LL ) {
			lastWid = wid;
			lastScore = score;
			continue;
		}

		// . what was his val?
		// . the "val" of the phrase: 
		float phrScore = score + lastScore;

		// do not count as much as single words
		phrScore *= 0.5;

		// accumulate
		sum += phrScore;

		// get the phrase id
		int64_t pid = hash64 ( wid , lastWid );

		// now add that
		// (same float-truncation note as above; only the key matters)
		if ( ! table.addKey ( (int32_t)pid , (int32_t)phrScore , NULL ) )
			return -1.0;
		// we are now the last wid
		lastWid   = wid;
		lastScore = score;
	}

	// sanity check: nothing above should have triggered an auto-grow
	if ( table.getNumSlots() != 1024 ) {
		log(LOG_LOGIC,"query: Title has logic bug.");
		return -1.0;
	}

	// accumulate scores of words that are found
	float found = 0.0;

	// reset phrase state before scanning the haystack
	lastWid = -1LL;

	// loop over the words in "w2" (the haystack) and probe the table
	for ( int32_t i = t0 ; i < t1 ; i++ ) {
		// the word id
		int64_t wid = w2->getWordId(i);

		// skip if not indexable
		if ( wid == 0 ) {
			continue;
		}

		// . make this a float. it ranges from 0.0 to 1.0
		// . 1.0 means the word occurs in 100% of documents sampled
		// . 0.0 means it occurs in none of them
		// . but "val" is the complement of those two statements!
		float score = 1.0 - pops2.getNormalizedPop(i);

		// accumulate
		sum += score;

		// is it in table? 
		int32_t slot = table.getSlot ( (int32_t)wid ) ;

		// . if in table, add that up to "found"
		// . we essentially find his wid AND our wid, so 2.0 times
		if ( slot >= 0 ) {
			found += 2.0 * score;
		}

		// now the phrase
		if ( lastWid == -1LL ) {
			lastWid = wid;
			lastScore = score;
			continue;
		}

		// . what was his val?
		// . the "val" of the phrase: 
		float phrScore = score + lastScore;

		// do not count as much as single words
		phrScore *= 0.5;

		// accumulate
		sum += phrScore;

		// get the phrase id
		int64_t pid = hash64 ( wid , lastWid );

		// is it in table? 
		slot = table.getSlot ( (int32_t)pid ) ;

		// . accumulate if in there
		// . we essentially find his wid AND our wid, so 2.0 times
		if ( slot >= 0 ) found += 2.0 * phrScore;

		// we are now the last wid
		lastWid   = wid;
		lastScore = score;
	}

	// do not divide by zero
	if ( sum == 0.0 ) return 0.0;
	// sanity check
	//if ( found > sum              ) { char *xx=NULL;*xx=0; }
	if ( found < 0.0 || sum < 0.0 ) { char *xx=NULL;*xx=0; }
	// . return the percentage matched
	// . will range from 0.0 to 1.0
	return found / sum;
}