void gbiconv_reset(){ for (long i=0;i<s_convTable.getNumSlots();i++){ long key = s_convTable.getKey(i); if (!key) continue; iconv_t conv = (iconv_t)s_convTable.getValueFromSlot(i); if (!conv) continue; //logf(LOG_DEBUG, "iconv: freeing iconv: 0x%x", (int)iconv); g_mem.rmMem((void*)conv, 52, "iconv"); libiconv_close(conv); } s_convTable.reset(); }
// Do not call this function lightly, it takes an hour to run int CountryCode::createHashTable(void) { if(!fillRegexTable()) return(0); char tmpbuf[2048]; HashTable ht; unsigned long long entries = 0UL; long catid; long numcats = g_categories->m_numCats; catcountryrec_t ccr; SafeBuf sb(tmpbuf, 2048); log( "cat: Creating category country/language table.\n"); if(!ht.set(2,NULL,0,"ctrycode")) { log( "cat: Could not allocate memory for table.\n"); return(0); } for(long idx = 0; idx < numcats; idx++) { catid = g_categories->m_cats[idx].m_catid; sb.reset(); g_categories->printPathFromId(&sb, catid, true); if(!sb.getBufStart()) continue; if(!(numcats % 1000)) log( "init: %ld/%ld Generated %llu so far...\n", numcats, idx, entries); ccr.lval = 0L; ccr.sval.country = lookupCountryFromDMOZTopic(sb.getBufStart(), sb.length()); ccr.sval.lang = s_getLangIdxFromDMOZ(sb.getBufStart(), sb.length()); if(!ccr.lval) continue; if(ccr.sval.lang > 27 || ccr.sval.country > s_numCountryCodes) { char *xx = NULL; *xx = 0; } if(!ht.addKey(catid, ccr.lval)) { log( "init: Could not add %ld (%ld)\n", catid, ccr.lval); continue; } entries++; } ht.save(g_hostdb.m_dir, "catcountry.dat"); log( "Added %llu country entries from DMOZ to %s/catcountry.dat.\n", entries,g_hostdb.m_dir); log( "Slots %ld, Used Slots %ld.\n", ht.getNumSlots(), ht.getNumSlotsUsed()); freeRegexTable(); return(1); }
// . returns 0.0 to 1.0
// . what percent of the alnum words in "w1" are in "w2" from words in [t0,t1)
// . gets 50% points if has all single words, and the other 50% if all phrases
// . Scores class applies to w1 only, use NULL if none
// . use word popularity information for scoring rarer term matches more
// . ONLY CHECKS FIRST 1000 WORDS of w2 for speed
// . returns -1.0 on error (allocation failure or internal logic bug)
float Title::getSimilarity ( Words *w1 , int32_t i0 , int32_t i1 ,
			     Words *w2 , int32_t t0 , int32_t t1 ) {
	// if either empty, that's 0% contained
	if ( w1->getNumWords() <= 0 ) return 0;
	if ( w2->getNumWords() <= 0 ) return 0;
	// empty ranges also count as 0% contained
	if ( i0 >= i1 ) return 0;
	if ( t0 >= t1 ) return 0;
	// invalids vals
	if ( i0 < 0 ) return 0;
	if ( t0 < 0 ) return 0;
	// . for this to be useful we must use idf
	// . get the popularity of each word in w1
	// . w1 should only be a few words since it is a title candidate
	// . does not add pop for word #i if scores[i] <= 0
	// . take this out for now since i removed the unified dict,
	//   we could use this if we added popularity to g_wiktionary
	//   but it would have to be language dependent
	Pops pops1;
	Pops pops2;
	if ( ! pops1.set ( w1 , i0 , i1 ) ) return -1.0;
	if ( ! pops2.set ( w2 , t0 , t1 ) ) return -1.0;
	// now hash the words in w1, the needle in the haystack
	int32_t nw1 = w1->getNumWords();
	// clamp the needle range to the word count
	if ( i1 > nw1 ) i1 = nw1;
	HashTable table;
	// this augments the hash table: track the previous word so we can
	// form (lastWid,wid) phrase pairs as we go
	int64_t lastWid   = -1;
	float   lastScore = 0.0;
	// but we cannot have more than 1024 slots then
	if ( ! table.set ( 1024 ) ) return -1.0;
	// and table auto grows when 90% full, so limit us here; maxCount
	// caps the number of w1 words hashed so the table never grows
	int32_t count = 0;
	int32_t maxCount = 20;
	// sum up everything we add (denominator of the final ratio)
	float sum = 0.0;
	// loop over all words in "w1" and hash them
	for ( int32_t i = i0 ; i < i1 ; i++ ) {
		// the word id
		int64_t wid = w1->getWordId(i);
		// skip if not indexable
		if ( wid == 0 ) { continue; }
		// no room left in table!
		if ( count++ > maxCount ) {
			//logf(LOG_DEBUG, "query: Hash table for title "
			// "generation too small. Truncating words from w1.");
			break;
		}
		// . make this a float. it ranges from 0.0 to 1.0
		// . 1.0 means the word occurs in 100% of documents sampled
		// . 0.0 means it occurs in none of them
		// . but "val" is the complement of those two statements!
		//   (rare words score high, common words score low)
		float score = 1.0 - pops1.getNormalizedPop(i);
		// accumulate
		sum += score;
		// add to table. NOTE(review): the float score is truncated to
		// int32 here, but the stored value is never read back below --
		// only key presence is tested via getSlot(). verify HashTable
		// semantics before changing this.
		if ( ! table.addKey ( (int32_t)wid , (int32_t)score , NULL ) ) {
			return -1.0;
		}
		// if no last wid, continue (no phrase to form yet)
		if ( lastWid == -1LL ) { lastWid = wid; lastScore = score; continue; }
		// . what was his val?
		// . the "val" of the phrase:
		float phrScore = score + lastScore;
		// do not count as much as single words
		phrScore *= 0.5;
		// accumulate
		sum += phrScore;
		// get the phrase id
		int64_t pid = hash64 ( wid , lastWid );
		// now add that
		if ( ! table.addKey ( (int32_t)pid , (int32_t)phrScore , NULL ) )
			return -1.0;
		// we are now the last wid
		lastWid   = wid;
		lastScore = score;
	}
	// sanity check. it can't grow cuz we keep lastWids[] 1-1 with it
	if ( table.getNumSlots() != 1024 ) {
		log(LOG_LOGIC,"query: Title has logic bug.");
		return -1.0;
	}
	// accumulate scores of words that are found
	float found = 0.0;
	// reset phrase tracking for the second pass
	lastWid = -1LL;
	// loop over all words in "w2", the haystack, probing the table
	for ( int32_t i = t0 ; i < t1 ; i++ ) {
		// the word id
		int64_t wid = w2->getWordId(i);
		// skip if not indexable
		if ( wid == 0 ) { continue; }
		// . make this a float. it ranges from 0.0 to 1.0
		// . 1.0 means the word occurs in 100% of documents sampled
		// . 0.0 means it occurs in none of them
		// . but "val" is the complement of those two statements!
		float score = 1.0 - pops2.getNormalizedPop(i);
		// accumulate
		sum += score;
		// is it in table?
		int32_t slot = table.getSlot ( (int32_t)wid ) ;
		// . if in table, add that up to "found"
		// . we essentially find his wid AND our wid, so 2.0 times
		if ( slot >= 0 ) { found += 2.0 * score; }
		// now the phrase (need a previous word first)
		if ( lastWid == -1LL ) { lastWid = wid; lastScore = score; continue; }
		// . what was his val?
		// . the "val" of the phrase:
		float phrScore = score + lastScore;
		// do not count as much as single words
		phrScore *= 0.5;
		// accumulate
		sum += phrScore;
		// get the phrase id
		int64_t pid = hash64 ( wid , lastWid );
		// is it in table?
		slot = table.getSlot ( (int32_t)pid ) ;
		// . accumulate if in there
		// . we essentially find his wid AND our wid, so 2.0 times
		if ( slot >= 0 ) found += 2.0 * phrScore;
		// we are now the last wid
		lastWid   = wid;
		lastScore = score;
	}
	// do not divide by zero
	if ( sum == 0.0 ) return 0.0;
	// sanity check
	//if ( found > sum ) { char *xx=NULL;*xx=0; }
	if ( found < 0.0 || sum < 0.0 ) { char *xx=NULL;*xx=0; }
	// . return the percentage matched
	// . will range from 0.0 to 1.0
	return found / sum;
}