// . fills s_mimeTable from the built-in s_ext list, then self-checks it
// . s_ext is read as consecutive (extension, content type) string pairs
// . called from HttpServer::init
// . returns true on success, or right away if init() already ran
// . on error returns the result of log() (relied on to be false); the two
//   self-test failures also set g_errno to EBADENGINEER
// . NOTE(review): the addKey() failure path does not set g_errno itself;
//   presumably addKey() does -- confirm
bool HttpMime::init ( ) {
	// only need to call once
	if ( s_init ) return true;
	// . flag it before doing the work
	// . NOTE(review): the flag stays set if a step below fails, so a
	//   second call would return true -- confirm that is intended
	s_init = true;
	//s_mimeTable.set ( 256 );
	// number of strings in s_ext, two per table entry
	const unsigned long numStrings = sizeof(s_ext)/sizeof(char *);
	// . set table from internal list
	// . "i + 1 < numStrings" keeps us from reading a missing content type
	//   should the list ever end on an unpaired extension
	for ( unsigned long i = 0 ; i + 1 < numStrings ; i += 2 ) {
		// key is the hash of the extension, value is the pointer to
		// the content type string
		long key = hash32n ( s_ext[i] );
		if ( ! s_mimeTable.addKey ( key , (long)s_ext[i+1] ) ) 
			return log("HttpMime::init: failed to set table.");
	}
	// quick test
	const char *tt = getContentTypeFromExtension ( "zip" );
	if ( strcmp(tt,"application/zip") != 0 ) {
		g_errno = EBADENGINEER;
		return log("http: Failed to init mime table correctly.");
	}
	// a more thorough test: every extension must map back to its own
	// content type
	for ( unsigned long i = 0 ; i + 1 < numStrings ; i += 2 ) {
		tt = getContentTypeFromExtension ( s_ext[i] );
		if ( strcmp(tt,s_ext[i+1]) == 0 ) continue;
		g_errno = EBADENGINEER;
		// i is unsigned long, so print it with %lu
		return log("http: Failed to do mime table correctly. i=%lu",i);
	}

	// TODO: set it from a user supplied file here
	return true;
}
// . decides whether the submitter keyed by "h" may add another url
// . "h" is the key counted in s_htable (the IP domain per the original notes)
// . "now" is the current time in seconds
// . "maxAddUrlsPerIpDomPerDay" is the cap on counted submissions per key
//   between table wipes; a value <= 0 disables the cap
// . returns true and counts the submission if it is allowed, false if the
//   key has already reached the cap
bool canSubmit ( unsigned long h , long now , long maxAddUrlsPerIpDomPerDay ) {
	// . a non-positive cap means "no limit"
	// . anyone who wants zero submissions can turn add url off instead,
	//   which avoids confusion over why a url could not be added
	if ( maxAddUrlsPerIpDomPerDay <= 0 ) {
		return true;
	}

	// size the counting table the first time we get here
	if ( ! s_init ) {
		s_htable.set ( 50000 );
		s_init = true;
	}

	// forget every count once more than a day has passed since the
	// last wipe
	const int secondsPerDay = 24*60*60;
	if ( now - s_lastTime > secondsPerDay ) {
		s_lastTime = now;
		s_htable.clear();
	}

	// . nearly out of room? then drop ALL slots
	// . TODO: just clean out oldest slots
	if ( s_htable.getNumSlotsUsed() > 47000 ) {
		s_htable.clear ();
	}

	// . submissions counted so far for this key
	// . presumably 0 for a key not in the table -- confirm with HashTable
	long submitted = s_htable.getValue ( h );

	// already at the cap, so refuse
	if ( submitted >= maxAddUrlsPerIpDomPerDay ) {
		return false;
	}

	// count this one; the add replaces any older value for the key
	s_htable.addKey ( h , submitted + 1 );
	return true;
}
// . builds a table mapping category id -> packed (country, language) record
//   by running each category's DMOZ path through the country and language
//   lookups, then saves it as "catcountry.dat" under g_hostdb.m_dir
// . Do not call this function lightly, it takes an hour to run
// . returns 1 when the table was built, 0 if the regex table or the hash
//   table could not be set up
// . NOTE(review): the result of ht.save() is not checked, so 1 is returned
//   even if the save did not succeed -- confirm whether that matters
int CountryCode::createHashTable(void) {
	if(!fillRegexTable()) return(0);

	char tmpbuf[2048];
	HashTable ht;
	unsigned long long entries = 0ULL;
	long catid;
	long numcats = g_categories->m_numCats;
	catcountryrec_t ccr;
	SafeBuf sb(tmpbuf, 2048);

	log( "cat: Creating category country/language table.\n");

	if(!ht.set(2,NULL,0,"ctrycode")) {
		log( "cat: Could not allocate memory for table.\n");
		// fillRegexTable() succeeded above, so undo it before bailing
		freeRegexTable();
		return(0);
	}
	for(long idx = 0; idx < numcats; idx++) {
		catid = g_categories->m_cats[idx].m_catid;
		// rebuild the path string for this category in the scratch buf
		sb.reset();
		g_categories->printPathFromId(&sb, catid, true);
		if(!sb.getBufStart()) continue;
		// progress report every 1000th category: current/total
		if(!(idx % 1000))
			log( "init: %ld/%ld Generated %llu so far...\n",
					idx,
					numcats,
					entries);
		// lval overlays the country/lang fields, so zero it first and
		// test it afterwards to see if either lookup found anything
		ccr.lval = 0L;
		ccr.sval.country = lookupCountryFromDMOZTopic(sb.getBufStart(), sb.length());
		ccr.sval.lang = s_getLangIdxFromDMOZ(sb.getBufStart(), sb.length());
		if(!ccr.lval) continue;
		// sanity check: deliberately crash on out-of-range ids
		if(ccr.sval.lang > 27 || ccr.sval.country > s_numCountryCodes) {
			char *xx = NULL; *xx = 0;
		}
		if(!ht.addKey(catid, ccr.lval)) {
			log( "init: Could not add %ld (%ld)\n", catid, ccr.lval);
			continue;
		}
		entries++;
	}

	ht.save(g_hostdb.m_dir, "catcountry.dat");
	log( "Added %llu country entries from DMOZ to %s/catcountry.dat.\n", entries,g_hostdb.m_dir);
	log( "Slots %ld, Used Slots %ld.\n", ht.getNumSlots(), ht.getNumSlotsUsed());

	freeRegexTable();
	return(1);
}
// . returns an iconv converter for "fromcode" -> "tocode"
// . converters are cached in s_convTable under a hash of both charset names
//   so each pair is only opened once; a cached one is reset to its initial
//   state before being handed back
// . clears g_errno on entry
// . if iconv_open() fails, returns (iconv_t)-1 and sets g_errno to the
//   errno it reported, or to EBADCHARSET when that errno was EINVAL
iconv_t gbiconv_open( char *tocode, char *fromcode) {
	// get hash for to/from
	unsigned long hash1 = hash32Lower_a(tocode, gbstrlen(tocode), 0);
	unsigned long hash2 = hash32Lower_a(fromcode, gbstrlen(fromcode),0);
	unsigned long hash = hash32h(hash1, hash2);

	g_errno = 0;
	// a zero value is treated as "not cached yet"
	iconv_t conv = (iconv_t)s_convTable.getValue(hash);
	if (!conv){
		conv = iconv_open(tocode, fromcode);
		if (conv == (iconv_t) -1) {
			// . save errno right away
			// . strerror() and the logging below are free to
			//   change it before we copy it into g_errno
			const int openErr = errno;
			log(LOG_WARN, "uni: failed to open converter for "
			    "%s to %s: %s (%d)", fromcode, tocode, 
			    strerror(openErr), openErr);
			// need to stop if necessary converters don't open
			//char *xx=NULL; *xx = 0;
			g_errno = openErr;
			if (openErr == EINVAL)
				g_errno = EBADCHARSET;
			
			return conv;
		}
		// . add mem to table to keep track
		// . NOTE(review): 52 is the size reported for each converter;
		//   where that number comes from is not shown here
		g_mem.addMem((void*)conv, 52, "iconv", 1);
		// cache convertor
		s_convTable.addKey(hash, (long)conv);
	}
	else{
		// reset convertor: a NULL input buffer asks iconv() to put
		// the conversion state back to its initial state
		char *dummy = NULL;
		size_t dummy2 = 0;
		iconv(conv,NULL,NULL,&dummy,&dummy2);
	}

	return conv;
}
// Example #5
// 0
// . scores how much the words of "w1" in [i0,i1) overlap the words of "w2"
//   in [t0,t1)
// . each indexable word is weighted by 1.0 minus its normalized popularity,
//   so rarer words count more
// . each pair of consecutive indexable words also counts as a phrase,
//   weighted at half the sum of its two word scores
// . "sum" collects the weights of every word and phrase seen in BOTH ranges;
//   "found" collects twice the weight of each w2 word/phrase that also
//   occurs in w1; the result is found / sum
// . returns 0 if either Words is empty or either range is empty or invalid
// . returns -1.0 if the popularity data or the hash table could not be set up
// . only about the first 20 indexable words of w1 are hashed (see maxCount)
// . NOTE(review): older comments here mentioned a Scores argument and a
//   1000-word cap on w2; neither exists in this function
float Title::getSimilarity ( Words  *w1 , int32_t i0 , int32_t i1 ,
			     Words  *w2 , int32_t t0 , int32_t t1 ) {
	// if either empty, that's 0% contained
	int32_t nw1 = w1->getNumWords();
	int32_t nw2 = w2->getNumWords();
	if ( nw1 <= 0 ) return 0;
	if ( nw2 <= 0 ) return 0;

	// invalid vals
	if ( i0 < 0   ) return 0;
	if ( t0 < 0   ) return 0;

	// . never look past the last word of either Words
	// . do this before anything indexes by i1 or t1, including the
	//   Pops::set() calls below
	if ( i1 > nw1 ) i1 = nw1;
	if ( t1 > nw2 ) t1 = nw2;

	// nothing to compare if either (clamped) range is empty
	if ( i0 >= i1 ) return 0;
	if ( t0 >= t1 ) return 0;

	// . for this to be useful we must use idf
	// . get the popularity of each word in both ranges
	// . w1 should only be a few words since it is a title candidate
	Pops pops1;
	Pops pops2;
	if ( ! pops1.set ( w1 , i0 , i1 ) ) return -1.0;
	if ( ! pops2.set ( w2 , t0 , t1 ) ) return -1.0;

	// now hash the words in w1, the needle in the haystack
	HashTable table;

	// previous indexable word and its score, used to form phrases
	int64_t lastWid   = -1;
	float     lastScore = 0.0;

	// but we cannot have more than 1024 slots then
	if ( ! table.set ( 1024 ) ) return -1.0;

	// . per the original note the table grows on its own when about 90%
	//   full, so cap how many words we add to keep it at 1024 slots
	int32_t count    = 0;
	int32_t maxCount = 20;

	// sum up everything we add
	float sum = 0.0;

	// loop over all words in "w1" and hash them
	for ( int32_t i = i0 ; i < i1 ; i++ ) {
		// the word id
		int64_t wid = w1->getWordId(i);

		// skip if not indexable
		if ( wid == 0 ) {
			continue;
		}

		// no room left in table! truncate the rest of w1
		if ( count++ > maxCount ) {
			break;
		}

		// . make this a float. it ranges from 0.0 to 1.0
		// . a popularity of 1.0 means the word occurs in 100% of
		//   documents sampled, 0.0 means in none of them
		// . the score is the complement, so rare words score high
		float score = 1.0 - pops1.getNormalizedPop(i);

		// accumulate
		sum += score;

		// . add to table, keyed by the low 32 bits of the word id
		// . the stored value is never read back in this function,
		//   only presence is tested via getSlot() below
		if ( ! table.addKey ( (int32_t)wid , (int32_t)score , NULL ) ) {
			return -1.0;
		}

		// first indexable word has no predecessor, so no phrase yet
		if ( lastWid == -1LL ) {
			lastWid = wid;
			lastScore = score;
			continue;
		}

		// the score of the phrase made of the previous word and
		// this one
		float phrScore = score + lastScore;

		// do not count as much as single words
		phrScore *= 0.5;

		// accumulate
		sum += phrScore;

		// get the phrase id
		int64_t pid = hash64 ( wid , lastWid );

		// now add that
		if ( ! table.addKey ( (int32_t)pid , (int32_t)phrScore , NULL ) )
			return -1.0;
		// we are now the last wid
		lastWid   = wid;
		lastScore = score;
	}

	// sanity check. the table must not have grown past its initial size
	if ( table.getNumSlots() != 1024 ) {
		log(LOG_LOGIC,"query: Title has logic bug.");
		return -1.0;
	}

	// accumulate scores of words that are found
	float found = 0.0;

	// reset
	lastWid = -1LL;

	// loop over the words in "w2" and look each one up in the table
	for ( int32_t i = t0 ; i < t1 ; i++ ) {
		// the word id
		int64_t wid = w2->getWordId(i);

		// skip if not indexable
		if ( wid == 0 ) {
			continue;
		}

		// same rarity weighting as above, using w2's popularities
		float score = 1.0 - pops2.getNormalizedPop(i);

		// accumulate
		sum += score;

		// is it in table? 
		int32_t slot = table.getSlot ( (int32_t)wid ) ;

		// . if in table, add that up to "found"
		// . we essentially find his wid AND our wid, so 2.0 times
		if ( slot >= 0 ) {
			found += 2.0 * score;
		}

		// now the phrase; the first indexable word has none
		if ( lastWid == -1LL ) {
			lastWid = wid;
			lastScore = score;
			continue;
		}

		// the score of the phrase made of the previous word and
		// this one
		float phrScore = score + lastScore;

		// do not count as much as single words
		phrScore *= 0.5;

		// accumulate
		sum += phrScore;

		// get the phrase id
		int64_t pid = hash64 ( wid , lastWid );

		// is it in table? 
		slot = table.getSlot ( (int32_t)pid ) ;

		// . accumulate if in there
		// . we essentially find his wid AND our wid, so 2.0 times
		if ( slot >= 0 ) found += 2.0 * phrScore;

		// we are now the last wid
		lastWid   = wid;
		lastScore = score;
	}

	// do not divide by zero
	if ( sum == 0.0 ) return 0.0;
	// sanity check: deliberately crash on a negative total
	//if ( found > sum              ) { char *xx=NULL;*xx=0; }
	if ( found < 0.0 || sum < 0.0 ) { char *xx=NULL;*xx=0; }
	// return the fraction matched
	return found / sum;
}