// . init s_mimeTable in this call // . called from HttpServer::init // . returns false and sets g_errno on error bool HttpMime::init ( ) { // only need to call once if ( s_init ) return true; // make sure only called once s_init = true; //s_mimeTable.set ( 256 ); // set table from internal list for ( unsigned long i = 0 ; i < sizeof(s_ext)/sizeof(char *) ; i+=2 ) { long key = hash32n ( s_ext[i] ); if ( ! s_mimeTable.addKey ( key , (long)s_ext[i+1] ) ) return log("HttpMime::init: failed to set table."); } // quick text const char *tt = getContentTypeFromExtension ( "zip" ); if ( strcmp(tt,"application/zip") != 0 ) { g_errno = EBADENGINEER; return log("http: Failed to init mime table correctly."); } // a more thorough test for ( unsigned long i = 0 ; i < sizeof(s_ext)/sizeof(char *) ; i+=2) { tt = getContentTypeFromExtension ( s_ext[i] ); if ( strcmp(tt,s_ext[i+1]) == 0 ) continue; g_errno = EBADENGINEER; return log("http: Failed to do mime table correctly. i=%li",i); } // TODO: set it from a user supplied file here return true; }
bool canSubmit ( unsigned long h , long now , long maxAddUrlsPerIpDomPerDay ) { // . sometimes no limit // . 0 means no limit because if they don't want any submission they // can just turn off add url and we want to avoid excess // troubleshooting for why a url can't be added if ( maxAddUrlsPerIpDomPerDay <= 0 ) return true; // init the table if ( ! s_init ) { s_htable.set ( 50000 ); s_init = true; } // clean out table every 24 hours if ( now - s_lastTime > 24*60*60 ) { s_lastTime = now; s_htable.clear(); } // . if table almost full clean out ALL slots // . TODO: just clean out oldest slots if ( s_htable.getNumSlotsUsed() > 47000 ) s_htable.clear (); // . how many times has this IP domain submitted? // . allow 10 times per day long n = s_htable.getValue ( h ); // if over 24hr limit then bail if ( n >= maxAddUrlsPerIpDomPerDay ) return false; // otherwise, inc it n++; // add to table, will replace old values s_htable.addKey ( h , n ); return true; }
// Do not call this function lightly, it takes an hour to run int CountryCode::createHashTable(void) { if(!fillRegexTable()) return(0); char tmpbuf[2048]; HashTable ht; unsigned long long entries = 0UL; long catid; long numcats = g_categories->m_numCats; catcountryrec_t ccr; SafeBuf sb(tmpbuf, 2048); log( "cat: Creating category country/language table.\n"); if(!ht.set(2,NULL,0,"ctrycode")) { log( "cat: Could not allocate memory for table.\n"); return(0); } for(long idx = 0; idx < numcats; idx++) { catid = g_categories->m_cats[idx].m_catid; sb.reset(); g_categories->printPathFromId(&sb, catid, true); if(!sb.getBufStart()) continue; if(!(numcats % 1000)) log( "init: %ld/%ld Generated %llu so far...\n", numcats, idx, entries); ccr.lval = 0L; ccr.sval.country = lookupCountryFromDMOZTopic(sb.getBufStart(), sb.length()); ccr.sval.lang = s_getLangIdxFromDMOZ(sb.getBufStart(), sb.length()); if(!ccr.lval) continue; if(ccr.sval.lang > 27 || ccr.sval.country > s_numCountryCodes) { char *xx = NULL; *xx = 0; } if(!ht.addKey(catid, ccr.lval)) { log( "init: Could not add %ld (%ld)\n", catid, ccr.lval); continue; } entries++; } ht.save(g_hostdb.m_dir, "catcountry.dat"); log( "Added %llu country entries from DMOZ to %s/catcountry.dat.\n", entries,g_hostdb.m_dir); log( "Slots %ld, Used Slots %ld.\n", ht.getNumSlots(), ht.getNumSlotsUsed()); freeRegexTable(); return(1); }
iconv_t gbiconv_open( char *tocode, char *fromcode) { // get hash for to/from unsigned long hash1 = hash32Lower_a(tocode, gbstrlen(tocode), 0); unsigned long hash2 = hash32Lower_a(fromcode, gbstrlen(fromcode),0); unsigned long hash = hash32h(hash1, hash2); g_errno = 0; iconv_t conv = (iconv_t)s_convTable.getValue(hash); //log(LOG_DEBUG, "uni: convertor %s -> %s from hash 0x%lx: 0x%lx", // fromcode, tocode, // hash, conv); if (!conv){ //log(LOG_DEBUG, "uni: Allocating new convertor for " // "%s to %s (hash: 0x%lx)", // fromcode, tocode,hash); conv = iconv_open(tocode, fromcode); if (conv == (iconv_t) -1) { log(LOG_WARN, "uni: failed to open converter for " "%s to %s: %s (%d)", fromcode, tocode, strerror(errno), errno); // need to stop if necessary converters don't open //char *xx=NULL; *xx = 0; g_errno = errno; if (errno == EINVAL) g_errno = EBADCHARSET; return conv; } // add mem to table to keep track g_mem.addMem((void*)conv, 52, "iconv", 1); // cache convertor s_convTable.addKey(hash, (long)conv); //log(LOG_DEBUG, "uni: Saved convertor 0x%ld under hash 0x%lx", // conv, hash); } else{ // reset convertor char *dummy = NULL; size_t dummy2 = 0; // JAB: warning abatement //size_t res = iconv(conv,NULL,NULL,&dummy,&dummy2); iconv(conv,NULL,NULL,&dummy,&dummy2); } return conv; }
// . returns 0.0 to 1.0
// . what percent of the alnum words in "w1" are in "w2" from words in [t0,t1)
// . gets 50% points if has all single words, and the other 50% if all phrases
// . Scores class applies to w1 only, use NULL if none
// . use word popularity information for scoring rarer term matches more
// . ONLY CHECKS FIRST 1000 WORDS of w2 for speed
// . returns -1.0 on error (allocation failure or internal logic bug)
float Title::getSimilarity ( Words *w1 , int32_t i0 , int32_t i1 ,
			     Words *w2 , int32_t t0 , int32_t t1 ) {
	// if either empty, that's 0% contained
	if ( w1->getNumWords() <= 0 ) return 0;
	if ( w2->getNumWords() <= 0 ) return 0;
	if ( i0 >= i1 ) return 0;
	if ( t0 >= t1 ) return 0;
	// invalids vals
	if ( i0 < 0 ) return 0;
	if ( t0 < 0 ) return 0;
	// . for this to be useful we must use idf
	// . get the popularity of each word in w1
	// . w1 should only be a few words since it is a title candidate
	// . does not add pop for word #i if scores[i] <= 0
	// . take this out for now since i removed the unified dict,
	//   we could use this if we added popularity to g_wiktionary
	//   but it would have to be language dependent
	Pops pops1;
	Pops pops2;
	if ( ! pops1.set ( w1 , i0 , i1 ) ) return -1.0;
	if ( ! pops2.set ( w2 , t0 , t1 ) ) return -1.0;
	// now hash the words in w1, the needle in the haystack
	int32_t nw1 = w1->getNumWords();
	// clamp i1 to the number of words actually in w1
	if ( i1 > nw1 ) i1 = nw1;
	HashTable table;
	// this augments the hash table: remember the previous word so we
	// can also hash two-word phrases
	int64_t lastWid = -1;
	float lastScore = 0.0;
	// but we cannot have more than 1024 slots then
	if ( ! table.set ( 1024 ) ) return -1.0;
	// and table auto grows when 90% full, so limit us here
	int32_t count = 0;
	int32_t maxCount = 20;
	// sum up everything we add
	float sum = 0.0;
	// loop over all words in "w1" and hash them
	for ( int32_t i = i0 ; i < i1 ; i++ ) {
		// the word id
		int64_t wid = w1->getWordId(i);
		// skip if not indexable
		if ( wid == 0 ) {
			continue;
		}
		// no room left in table!
		if ( count++ > maxCount ) {
			//logf(LOG_DEBUG, "query: Hash table for title "
			//     "generation too small. Truncating words from w1.");
			break;
		}
		// . make this a float. it ranges from 0.0 to 1.0
		// . 1.0 means the word occurs in 100% of documents sampled
		// . 0.0 means it occurs in none of them
		// . but "val" is the complement of those two statements!
		float score = 1.0 - pops1.getNormalizedPop(i);
		// accumulate
		sum += score;
		// . add to table, keyed by the low 32 bits of the word id
		// . NOTE(review): (int32_t)score truncates the 0..1 float to 0,
		//   so the stored value carries no information — only key
		//   presence is ever checked below via getSlot(); confirm the
		//   value is indeed unused before "fixing" this
		if ( ! table.addKey ( (int32_t)wid , (int32_t)score , NULL ) ) {
			return -1.0;
		}
		// if no last wid, continue
		if ( lastWid == -1LL ) {
			lastWid = wid;
			lastScore = score;
			continue;
		}
		// . what was his val?
		// . the "val" of the phrase:
		float phrScore = score + lastScore;
		// do not count as much as single words
		phrScore *= 0.5;
		// accumulate
		sum += phrScore;
		// get the phrase id from this word and the previous one
		int64_t pid = hash64 ( wid , lastWid );
		// now add that
		if ( ! table.addKey ( (int32_t)pid , (int32_t)phrScore , NULL ) )
			return -1.0;
		// we are now the last wid
		lastWid = wid;
		lastScore = score;
	}
	// sanity check. it can't grow cuz we keep lastWids[] 1-1 with it
	// (at most ~2*maxCount keys were added, well under 90% of 1024)
	if ( table.getNumSlots() != 1024 ) {
		log(LOG_LOGIC,"query: Title has logic bug.");
		return -1.0;
	}
	// accumulate scores of words that are found
	float found = 0.0;
	// reset
	lastWid = -1LL;
	// loop over all words in "w2" (the haystack) and probe the table
	for ( int32_t i = t0 ; i < t1 ; i++ ) {
		// the word id
		int64_t wid = w2->getWordId(i);
		// skip if not indexable
		if ( wid == 0 ) {
			continue;
		}
		// . make this a float. it ranges from 0.0 to 1.0
		// . 1.0 means the word occurs in 100% of documents sampled
		// . 0.0 means it occurs in none of them
		// . but "val" is the complement of those two statements!
		float score = 1.0 - pops2.getNormalizedPop(i);
		// accumulate
		sum += score;
		// is it in table?
		int32_t slot = table.getSlot ( (int32_t)wid ) ;
		// . if in table, add that up to "found"
		// . we essentially find his wid AND our wid, so 2.0 times
		if ( slot >= 0 ) {
			found += 2.0 * score;
		}
		// now the phrase
		if ( lastWid == -1LL ) {
			lastWid = wid;
			lastScore = score;
			continue;
		}
		// . what was his val?
		// . the "val" of the phrase:
		float phrScore = score + lastScore;
		// do not count as much as single words
		phrScore *= 0.5;
		// accumulate
		sum += phrScore;
		// get the phrase id
		int64_t pid = hash64 ( wid , lastWid );
		// is it in table?
		slot = table.getSlot ( (int32_t)pid ) ;
		// . accumulate if in there
		// . we essentially find his wid AND our wid, so 2.0 times
		if ( slot >= 0 )
			found += 2.0 * phrScore;
		// we are now the last wid
		lastWid = wid;
		lastScore = score;
	}
	// do not divide by zero
	if ( sum == 0.0 ) return 0.0;
	// sanity check
	//if ( found > sum ) { char *xx=NULL;*xx=0; }
	// deliberate crash (null deref) if scores went negative — logic bug
	if ( found < 0.0 || sum < 0.0 ) { char *xx=NULL;*xx=0; }
	// . return the percentage matched
	// . will range from 0.0 to 1.0
	return found / sum;
}