// . look up a CollectionRec by its collection name
// . a NULL name is handled like the empty name ""
// . forwards to the (name,length) overload and returns its result
//   (NULL if not available, per the original note)
CollectionRec *Collectiondb::getRec ( char *coll ) {
	// substitute the empty name so gbstrlen() never sees a NULL ptr
	char *name = coll ? coll : (char *)"";
	return getRec ( name , gbstrlen(name) );
}
// . launches url pipelines until we run out of urls or hit the cap on
//   outstanding requests
// . we only come back up here 1) in the very beginning or 2) when a url 
//   completes its pipeline of requests
// . returns false if we are maxed out on outstanding requests (blocked)
// . otherwise returns true only if every request launched so far has a reply
// . "starti" is not used; the free-slot scan always starts at slot 0
bool Msge0::launchRequests ( long starti ) {
	// reset any error code
	g_errno = 0;
	// keep going while there are urls left to launch
	while ( m_n < m_numUrls ) {
		// if all hosts are getting a diffbot reply with 50 spiders
		// and they all timeout at the same time we can very easily
		// clog up the udp sockets, so throttle down to one
		// outstanding request when the udp server is busy
		long cap = MAX_OUTSTANDING_MSGE0;
		if ( g_udpServer.m_numUsedSlots > 500 ) cap = 1;
		// if we are maxed out, we basically blocked!
		if ( m_numRequests - m_numReplies >= cap ) return false;
		// urls that need no lookup are accounted for as a request
		// that got its reply immediately
		if ( m_urlFlags ) {
			// . skip if "old"
			// . we are not planning on adding this to spiderdb,
			//   so skip the lookup
			if ( (m_urlFlags[m_n] & LF_OLDLINK) &&
			     m_skipOldLinks ) {
				m_numRequests++;
				m_numReplies++;
				m_n++;
				continue;
			}
			// if url is same host as the tagrec provided, just
			// reference that!
			if ( (m_urlFlags[m_n] & LF_SAMEHOST) &&
			     m_baseTagRec ) {
				m_tagRecPtrs[m_n] = (TagRec *)m_baseTagRec;
				m_numRequests++;
				m_numReplies++;
				m_n++;
				continue;
			}
		}
		// get the next url and its length
		char *url    = m_urlPtrs[m_n];
		long  urlLen = gbstrlen(url);
		// . grab the first free slot
		// . always scan from 0 since the cap now changes
		long slot = 0;
		while ( slot < MAX_OUTSTANDING_MSGE0 && m_used[slot] ) slot++;
		// sanity check: the cap test above should guarantee a free
		// slot, so core if there is none
		if ( slot >= MAX_OUTSTANDING_MSGE0 ) {
			char *xx = NULL; *xx = 0; }
		// normalize the url
		m_urls[slot].set ( url , urlLen );
		// save the url number, "n"
		m_ns  [slot] = m_n;
		// claim it
		m_used[slot] = true;
		// start the pipeline for this url. its return value is not
		// inspected here.
		sendMsg8a ( slot );
		// consider it launched
		m_numRequests++;
		// inc the url count
		m_n++;
	}
	// no more urls. return true if we got all replies! no block.
	return (m_numRequests == m_numReplies);
}
// Example #3
// 0
// . THIS Msg0 class must be alloc'd, i.e. not on the stack, etc.
// . reads the records in [startKey,endKey] of the rdb "rdbId" into "list"
// . if the list is stored locally this tries to get it locally (via Msg5)
// . otherwise tries to get the list from the network: from the single host
//   "hostId" if one was given and is alive, else by multicasting to the
//   group that owns "startKey"
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . "callback" is handed to the async layers along with this object; it is
//   not called directly in this function
// . NOTE: i was having problems with queries being cached too long, you
//   see the cache here is a NETWORK cache, so when the machines that owns
//   the list updates it on disk it can't flush our cache... so use a small
//   maxCacheAge of like , 30 seconds or so...
// . "ip", "port" and "doMerge" are accepted but not used in this function
// . "preferLocalReads" is currently overridden to true below
// . "msg5"/"msg5b", if given, are used for the local read instead of
//   allocating new ones
// . a NULL "coll" is logged and then sent as the empty collection name
bool Msg0::getList ( long long hostId      , // host to ask (-1 if none)
		     long      ip          , // info on hostId
		     short     port        ,
		     long      maxCacheAge , // max cached age in seconds
		     bool      addToCache  , // add net recv'd list to cache?
		     char      rdbId       , // specifies the rdb
		     char     *coll        ,
		     RdbList  *list        ,
		     char     *startKey    ,
		     char     *endKey      ,
		     long      minRecSizes ,  // negative is rejected
		     void     *state       ,
		     void    (* callback)(void *state ),//, RdbList *list ) ,
		     long      niceness    ,
		     bool      doErrorCorrection ,
		     bool      includeTree ,
		     bool      doMerge     ,
		     long      firstHostId   ,
		     long      startFileNum  ,
		     long      numFiles      ,
		     long      timeout       ,
		     long long syncPoint     ,
		     long      preferLocalReads ,
		     Msg5     *msg5             ,
		     Msg5     *msg5b            ,
		     bool      isRealMerge      ,
		     bool      allowPageCache    ,
		     bool      forceLocalIndexdb ,
		     bool      noSplit , // doIndexdbSplit    ,
		     long      forceParitySplit  ) {
	// a NULL collection is a caller bug. note it here; the request
	// building below sends it as "" so it can not crash on it.
	if ( ! coll ) log(LOG_LOGIC,"net: NULL collection. msg0.");

	// reset the list they passed us
	list->reset();
	// get keySize of rdb
	m_ks = getKeySizeFromRdbId ( rdbId );
	// a startKey greater than endKey is a caller bug, core on it
	if ( KEYCMP(startKey,endKey,m_ks)>0 ) { char *xx=NULL;*xx=0; }
	// . reset hostid if it is dead
	// . this is causing UOR queries to take forever when we have a dead
	if ( hostId >= 0 && g_hostdb.isDead ( hostId ) ) hostId = -1;
	// no longer accept negative minrecsize
	if ( minRecSizes < 0 ) {
		g_errno = EBADENGINEER;
		log(LOG_LOGIC,
		    "net: msg0: Negative minRecSizes no longer supported.");
		char *xx=NULL;*xx=0;
		return true;
	}

	// remember these
	m_state         = state;
	m_callback      = callback;
	m_list          = list;
	m_hostId        = hostId;
	m_niceness      = niceness;
	m_addToCache    = addToCache;
	// . these define our request 100%
	KEYSET(m_startKey,startKey,m_ks);
	KEYSET(m_endKey,endKey,m_ks);
	m_minRecSizes   = minRecSizes;
	m_rdbId         = rdbId;
	m_coll          = coll;
	m_isRealMerge   = isRealMerge;
	m_allowPageCache = allowPageCache;

	// . group to ask is based on the first key 
	// . we only do 1 group per call right now
	// . groupMask must turn on higher bits first (count downwards kinda)
	// . titledb and spiderdb use special masks to get groupId

	// did they force it?
	if ( forceParitySplit >= 0 ) 
		m_groupId =  g_hostdb.getGroupId ( forceParitySplit );
	else
		m_groupId = getGroupId ( m_rdbId , startKey , ! noSplit );
	// forcing a local indexdb read overrides the group with our own
	if ( forceLocalIndexdb ) m_groupId = g_hostdb.m_groupId;

	// . get a handle to the rdb in case we can satisfy locally
	// . returns NULL and sets g_errno on error
	QUICKPOLL((m_niceness));
	Rdb *rdb = getRdbFromId ( m_rdbId );
	if ( ! rdb ) return true;
	// we need the fixedDataSize
	m_fixedDataSize = rdb->getFixedDataSize();
	m_useHalfKeys   = rdb->useHalfKeys();
	// . debug msg
	// . Msg2 does this when checking for a cached compound list.
	//   compound lists do not actually exist, they are merges of smaller
	//   UOR'd lists.
	if ( maxCacheAge != 0 && ! addToCache && (numFiles > 0 || includeTree))
		log(LOG_LOGIC,"net: msg0: "
		    "Weird. check but don't add... rdbid=%li.",(long)m_rdbId);

	// . now that we do load balancing we don't want to do a disk lookup
	//   even if local if we are merging or dumping
	// . UNLESS g_conf.m_preferLocalReads is true
	if ( preferLocalReads == -1 ) 
		preferLocalReads = g_conf.m_preferLocalReads;

	// . always prefer local for full split clusterdb
	// . and keep the tfndb/titledb lookups in the same stripe
	// . so basically we can't do biased caches if fully split
	// . this unconditionally overrides both the parameter and g_conf
	preferLocalReads = true;

	// is it stored locally? only when no specific host was requested
	// and the owning group is our own group
	bool isLocal = ( m_hostId == -1 && g_hostdb.m_groupId == m_groupId );
	// only do local lookups if this is true
	if ( ! preferLocalReads ) isLocal = false;

	// but always local if only one host
	if ( g_hostdb.getNumHosts() == 1 ) isLocal = true;

	// . if the group is local then do it locally
	// . Msg5::getList() returns false if blocked, true otherwise
	// . Msg5::getList() sets g_errno on error
	// . don't do this if m_hostId was specified
	if ( isLocal ) {
		// use the caller's Msg5 if given, otherwise make our own
		// and remember that we have to delete it
		if ( msg5 ) {
			m_msg5 = msg5;
			m_deleteMsg5 = false;
		}
		else {
			try { m_msg5 = new ( Msg5 ); } 
			catch ( ... ) {
				// NOTE(review): g_errno stays ENOMEM while
				// we fall back to the remote request below;
				// confirm that is intended
				g_errno = ENOMEM;
				log("net: Local alloc for disk read failed "
				    "while tring to read data for %s. "
				    "Trying remote request.",
				    getDbnameFromId(m_rdbId));
				goto skip;
			}
			mnew ( m_msg5 , sizeof(Msg5) , "Msg0" );
			m_deleteMsg5 = true;
		}

		QUICKPOLL(m_niceness);
		// same for msg5b, but we only make our own for titledb
		if ( msg5b ) {
			m_msg5b = msg5b;
			m_deleteMsg5b = false;
		}
		else if ( m_rdbId == RDB_TITLEDB ) {
			try { m_msg5b = new ( Msg5 ); } 
			catch ( ... ) {
				g_errno = ENOMEM;
				log("net: Local alloc for disk read failed "
				    "while tring to read data for %s. "
				    "Trying remote request. 2.",
				    getDbnameFromId(m_rdbId));
				goto skip;
			}
			mnew ( m_msg5b , sizeof(Msg5) , "Msg0b" );
			m_deleteMsg5b = true;
		}
		QUICKPOLL(m_niceness);
		// do the local read. return false if it blocked.
		if ( ! m_msg5->getList ( rdbId,
					 coll ,
					 m_list ,
					 m_startKey ,
					 m_endKey   ,
					 m_minRecSizes ,
					 includeTree   , // include Tree?
					 addToCache    , // addToCache?
					 maxCacheAge   ,
					 startFileNum  , 
					 numFiles      ,
					 this ,
					 gotListWrapper2   ,
					 niceness          ,
					 doErrorCorrection ,
					 NULL , // cacheKeyPtr
					 0    , // retryNum
					 -1   , // maxRetries
					 true , // compensateForMerge
					 syncPoint ,
					 m_msg5b   ,
					 m_isRealMerge ,
					 m_allowPageCache ) ) return false;
		// the read completed without blocking, nuke our state
		reset();
		return true;
	}
skip:
	// debug msg
	if ( g_conf.m_logDebugQuery )
		log(LOG_DEBUG,"net: msg0: Sending request for data to "
		    "group=%li listPtr=%li minRecSizes=%li termId=%llu "
		    "startKey.n1=%llx,n0=%llx (niceness=%li)",
		    g_hostdb.makeHostId ( m_groupId ) ,(long)m_list,
		    m_minRecSizes, g_posdb.getTermId(m_startKey) , 
		    KEY1(m_startKey,m_ks),KEY0(m_startKey),
		    (long)m_niceness);

	char *replyBuf = NULL;
	long  replyBufMaxSize = 0;
	bool  freeReply = true;

	// adjust niceness for net transmission. this is hard-coded to false
	// so the pre-allocation below is currently never done.
	bool realtime = false;

	// if we're niceness 0 we need to pre-allocate for reply since it
	// might be received within the asynchronous signal handler which
	// cannot call mmalloc()
	if ( realtime ) {
		// . we should not get back more than minRecSizes bytes since 
		//   we are now performing merges
		// . it should not slow things down too much since the hashing
		//   is 10 times slower than merging anyhow...
		// . CAUTION: if rdb is not fixed-datasize then this will
		//            not work for us! it can exceed m_minRecSizes.
		replyBufMaxSize = m_minRecSizes ;
		// . get a little extra to fix the error where we ask for 64 
		//   but get 72
		// . where is that coming from?
		// . when getting titleRecs we often exceed the minRecSizes 
		// . ?Msg8? was having trouble. was short 32 bytes sometimes.
		replyBufMaxSize += 36;
		// make a buffer to hold the reply
		replyBuf = (char *) mmalloc(replyBufMaxSize , "Msg0");
		// g_errno is set and we return true if it failed
		if ( ! replyBuf ) {
			log("net: Failed to pre-allocate %li bytes to hold "
			    "data read remotely from %s: %s.",
			    replyBufMaxSize,getDbnameFromId(m_rdbId),
			    mstrerror(g_errno));
			return true;
		}
	}

	// . make a request with the info above (note: not in network order)
	// . layout: syncPoint(8) minRecSizes(4) startFileNum(4) numFiles(4)
	//   maxCacheAge(4) rdbId(1) addToCache(1) doErrorCorrection(1)
	//   includeTree(1) niceness(1) allowPageCache(1) startKey(m_ks)
	//   endKey(m_ks) collection name (NULL terminated)
	char *p = m_request;
	*(long long *) p = syncPoint        ; p += 8;
	*(long      *) p = m_minRecSizes    ; p += 4;
	*(long      *) p = startFileNum     ; p += 4;
	*(long      *) p = numFiles         ; p += 4;
	*(long      *) p = maxCacheAge      ; p += 4;
	*p               = m_rdbId          ; p++;
	*p               = addToCache       ; p++;
	*p               = doErrorCorrection; p++;
	*p               = includeTree      ; p++;
	*p               = (char)niceness   ; p++;
	*p               = (char)m_allowPageCache; p++;
	KEYSET(p,m_startKey,m_ks); p+=m_ks;
	KEYSET(p,m_endKey,m_ks);   p+=m_ks;
	// . NULL terminated collection name
	// . a NULL "coll" was logged above; send it as the empty name rather
	//   than handing a NULL ptr to strcpy()/gbstrlen()
	// . NOTE(review): the copy is not bounded by the size of m_request,
	//   which is not visible here; confirm it fits any collection name
	char *collName = coll ? coll : (char *)"";
	strcpy ( p , collName ); p += gbstrlen ( collName ); *p++ = '\0';
	m_requestSize    = p - m_request;
	// ask an individual host for this list if hostId is NOT -1
	if ( m_hostId != -1 ) {
		// get Host
		Host *h = g_hostdb.getHost ( m_hostId );
		if ( ! h ) { 
			g_errno = EBADHOSTID; 
			log(LOG_LOGIC,"net: msg0: Bad hostId of %lli.",
			    m_hostId);
			return true;
		}
		QUICKPOLL(m_niceness);
		// always use the regular udp server and the host's port
		UdpServer      *us      = &g_udpServer;
		unsigned short  dstPort = h->m_port;
		// . returns false on error and sets g_errno, true otherwise
		// . calls callback when reply is received (or error)
		// . we return true if it returns false
		if ( ! us->sendRequest ( m_request     ,
					 m_requestSize ,
					 0x00          , // msgType
					 h->m_ip       ,
					 dstPort       ,
					 m_hostId      ,
					 NULL          , // the slotPtr
					 this          ,
					 gotSingleReplyWrapper ,
					 timeout       ,
					 -1            , // backoff
					 -1            , // maxwait
					 replyBuf      ,
					 replyBufMaxSize ,
					 m_niceness     ) ) // cback niceness
			return true;
		// return false cuz it blocked
		return false;
	}
	// timing debug
	if ( g_conf.m_logTimingNet )
		m_startTime = gettimeofdayInMilliseconds();
	else
		m_startTime = 0;

	// . hash the start key down to 32 bits
	// . it is passed to the multicast as its key
	long keyTop = hash32 ( (char *)startKey , m_ks );

	// . otherwise, multicast to a host in group "groupId"
	// . returns false and sets g_errno on error
	// . calls callback on completion
	// . only a single request is ever sent from here
	m_numRequests = 0;
	m_numReplies  = 0;

	QUICKPOLL(m_niceness);
	long  gr  = m_groupId;
	char *buf = replyBuf;

	// get the multicast
	Multicast *m = &m_mcast;

	if ( ! m->send ( m_request    , 
			 m_requestSize, 
			 0x00         , // msgType 0x00
			 false        , // does multicast own request?
			 gr           , // group to send to
			 false        , // send to whole group?
			 keyTop       , // key is passed on startKey
			 this         , // state data
			 NULL         , // state data
			 gotMulticastReplyWrapper0 ,
			 timeout      , // timeout in seconds (was 30)
			 niceness     ,
			 realtime     ,
			 firstHostId  ,
			 buf             ,
			 replyBufMaxSize ,
			 freeReply       , // free reply buf?
			 true            , // do disk load balancing?
			 maxCacheAge     ,
			 // multicast uses it for determining the best
			 // host to send the request to when doing 
			 // disk load balancing. if the host has our 
			 // data cached, then it will probably get to
			 // handle the request. for now we pass no
			 // cache key. TODO: fix...
			 0               , // cacheKey
			 rdbId           ,
			 minRecSizes     ) ) {
		log("net: Failed to send request for data from %s in group "
		    "#%li over network: %s.",
		    getDbnameFromId(m_rdbId),m_groupId, mstrerror(g_errno));
		// multicast will free the reply buf when it is destroyed,
		// but speed it up
		m_errno = g_errno;
		m->reset();
		// m_numRequests was zeroed above, so this only matters if
		// more than one request is ever launched from here
		if ( m_numRequests > 0 )
			return false;
		return true;
	}
	m_numRequests++;

	// we blocked
	return false;
}
char *getMatches2 ( Needle *needles          , 
		    int32_t    numNeedles       ,
		    char   *haystack         , 
		    int32_t    haystackSize     ,
		    char   *linkPos          ,
		    int32_t   *needleNum        ,
		    bool    stopAtFirstMatch ,
		    bool   *hadPreMatch      ,
		    bool    saveQuickTables  ,
		    int32_t    niceness         ) {

	// assume not
	if ( hadPreMatch ) *hadPreMatch = false;
	// empty haystack? then no matches
	if ( ! haystack || haystackSize <= 0 ) return NULL;
	// JAB: no needles? then no matches
	if ( ! needles  || numNeedles   <= 0 ) return NULL;

	//char tmp[8192];
	//char *t    = tmp;
	//char *tend = tmp + 8192;

	// reset counts to 0
	//if ( ! stopAtFirstMatch )
	//	for ( int32_t i=0 ; i < numNeedles ; i++ ) 
	//		needles[i].m_count = 0;

	// are we responsible for init'ing string lengths? this is much
	// faster than having to specify lengths manually.
	for ( int32_t i=0 ; i < numNeedles; i++ ) {
		// breathe
		QUICKPOLL(niceness);
		// clear
		needles[i].m_count      = 0;
		needles[i].m_firstMatch = NULL;
		// set the string size in bytes if not provided
		if ( needles[i].m_stringSize == 0 )
			needles[i].m_stringSize = gbstrlen(needles[i].m_string);
	}

	// . set up the quick tables.
	// . utf16 is not as effective here because half the bytes are zeroes!
	// . TODO: use a static cache of like 4 of these tables where the key
	//         is the Needles ptr ... done
	int32_t numNeedlesToInit = numNeedles;
	char space[256 * 4 * sizeof(BITVEC)];
	char *buf = NULL;

	BITVEC *s0;
	BITVEC *s1;
	BITVEC *s2;
	BITVEC *s3;

	/*
	static bool s_quickTableInit = false;
	static char s_qtbuf[128*(12+1)*2];

	int32_t slot = -1;
	if(saveQuickTables) {
		if ( ! s_quickTableInit ) {
			s_quickTableInit = true;
			s_quickTables.set(8,4,128,s_qtbuf,256*13,false,0,"qx");
		}
		uint64_t key = (uint32_t)needles;
		slot = s_quickTables.getSlot(&key);
		if ( slot >= 0 ) {
			buf = s_quickTables.getValueFromSlot(slot);
			numNeedlesToInit = 0;
		}
	}
	*/

	if(!buf) {
		buf = space;
		memset ( buf , 0 , sizeof(BITVEC)*256*4);
	}

	/*
	if( useQuickTables && slot == -1 ) {
		//buf = (char*)mcalloc(sizeof(uint32_t)*256*5,
		//		     "matches");
		if(buf) s_quickTables.addKey(&key, &buf);
		//sanity check, no reason why there needs to be a 
		//limit, I just don't expect there to be this many
		//static needles at this point.
		if(s_quickTables.getNumSlotsUsed() > 32){
			char *xx=NULL; *xx = 0;
		}
	}
	*/

	// try 64 bit bit vectors now since we doubled # of needles
	int32_t offset = 0;
	s0 = (BITVEC *)(buf + offset);
	offset += sizeof(BITVEC)*256;
	s1 = (BITVEC *)(buf + offset);
	offset += sizeof(BITVEC)*256;
	s2 = (BITVEC *)(buf + offset);
	offset += sizeof(BITVEC)*256;
	s3 = (BITVEC *)(buf + offset);
	offset += sizeof(BITVEC)*256;

	BITVEC mask;

	// set the letter tables, s0[] through sN[], for each needle
	for ( int32_t i = 0 ; i < numNeedlesToInit ; i++ ) {
		// breathe
		QUICKPOLL(niceness);
		unsigned char *w    = (unsigned char *)needles[i].m_string;
		unsigned char *wend = w + needles[i].m_stringSize;
		// BITVEC is now 64 bits
		mask = (1<<(i&0x3f)); // (1<<(i%64));
		// if the needle is small, fill up the remaining letter tables
		// with its mask... so it matches any character in haystack.
		s0[(unsigned char)to_lower_a(*w)] |= mask;
		s0[(unsigned char)to_upper_a(*w)] |= mask;
		w += 1;//step;
		if ( w >= wend ) {
			for ( int32_t j = 0 ; j < 256 ; j++ )  {
				s1[j] |= mask;
				s2[j] |= mask;
				s3[j] |= mask;
			}
			continue;
		}

		s1[(unsigned char)to_lower_a(*w)] |= mask;
		s1[(unsigned char)to_upper_a(*w)] |= mask;
		w += 1;//step;
		if ( w >= wend ) {
			for ( int32_t j = 0 ; j < 256 ; j++ )  {
				s2[j] |= mask;
				s3[j] |= mask;
			}
			continue;
		}

		s2[(unsigned char)to_lower_a(*w)] |= mask;
		s2[(unsigned char)to_upper_a(*w)] |= mask;
		w += 1;//step;
		if ( w >= wend ) {
			for ( int32_t j = 0 ; j < 256 ; j++ )  {
				s3[j] |= mask;
			}
			continue;
		}

		s3[(unsigned char)to_lower_a(*w)] |= mask;
		s3[(unsigned char)to_upper_a(*w)] |= mask;
		w += 1;//step;
	}

	// return a ptr to the first match if we should, this is it
	char *retVal = NULL;
	// debug vars
	//int32_t debugCount = 0;
	//int32_t pp = 0;
	// now find the first needle in the haystack
	unsigned char *p    = (unsigned char *)haystack;
	unsigned char *pend = (unsigned char *)haystack + haystackSize;
	char          *dend = (char *)pend;

	// do not breach!
	pend -= 4;

	for ( ; p < pend ; p++ ) {
		// breathe
		QUICKPOLL(niceness);
		//if ( (char *)p - (char *)haystack >= 12508 )
		//	log("hey");
		// analytics...
		
		// is this a possible match? (this should be VERY fast)
		mask  = s0[*(p+0)];
		if ( ! mask ) continue;
		mask &= s1[*(p+1)];
		if ( ! mask ) continue;
		mask &= s2[*(p+2)];
		if ( ! mask ) continue;
		mask &= s3[*(p+3)];
		if ( ! mask ) continue;
		//debugCount++;
		/*
		// display
		char oo[148];
		char *xx ;
		xx = oo;
		//gbmemcpy ( xx , p , 8 );
		for ( int32_t k = 0 ; k < 5 ; k++ ) {
			*xx++ = p[k];
		}
		gbmemcpy ( xx , "..." , 3 );
		xx += 3;
		*/
		//
		// XXX: do a hashtable lookup here so we have the candidate
		//      matches in a chain... 
		// XXX: for small needles which match frequently let's have
		//      a single char hash table, a 2 byte char hash table,
		//      etc. so if we have small needles we check the hash
		//      in those tables first, but only if mask & SMALL_NEEDLE
		//      is true! the single byte needle hash table can just
		//      be a lookup table. just XOR the bytes together for
		//      the hash.
		// XXX: just hash the mask into a table to get candidate
		//      matches in a chain? but there's 4B hashes!!
		// we got a good candidate, loop through all the needles
		for ( int32_t j = 0 ; j < numNeedles ; j++ ) {
			// skip if does not match mask, will save time
			if ( ! ((1<<(j&0x3f)) & mask) ) continue;
			if( needles[j].m_stringSize > 3) {
				// ensure first 4 bytes matches this needle's
				if (needles[j].m_string[0]!=to_lower_a(*(p+0)))
					continue;
				if (needles[j].m_string[1]!=to_lower_a(*(p+1)))
					continue;
				if (needles[j].m_string[2]!=to_lower_a(*(p+2)))
					continue;
				if (needles[j].m_string[3]!=to_lower_a(*(p+3)))
					continue;
			}
			// get needle size
			int32_t msize = needles[j].m_stringSize;
			// can p possibly be big enough?
			if ( pend - p < msize ) continue;
			// needle is "m" now
			char *m    = needles[j].m_string;
			char *mend = needles[j].m_stringSize + m;
			// use a tmp ptr for ptr into haystack
			char *d = (char *)p;
			// skip first 4 bytes since we know they match
			if(msize > 3) {
				d += 4;
				m += 4;
			}
			// loop over each char in "m"
			//for ( ; *m ; m++ ) {
			for ( ; m < mend ; m++ ) {
				//while ( ! *d && d < dend ) d++;
				//while ( ! *m && m < mend ) m++;
				// if we are a non alnum, that will match
				// any string of non-alnums, like a space
				// for instance. the 0 byte does not count
				// because it is used in utf16 a lot. this
				// may trigger some false matches in utf16
				// but, oh well... this way "link partner"
				// will match "link  - partner" in the haystk
				if ( is_wspace_a(*m) && m < mend ) {
					// skip all in "d" then.
					while (d<dend&&is_wspace_a(*d)) d++;
					// advance m then
					continue;
				}
				// make sure we match otherwise
				if ( *m != to_lower_a(*d) ) break;
				// ok, we matched, go to next
				d++;
			}
			// if not null, keep going
			if ( m < mend ) continue;
			// if this needle is "special" AND it occurs AFTER
			// linkPos, then do not consider it a match. this is
			// if we have a comment section indicator, like
			// "div id=\"comment" AND it occurs AFTER linkPos
			// (the char ptr to our link in the haystack) then
			// the match does not count.
			if ( linkPos && needles[j].m_isSection && 
			     (char *)p>linkPos ) {
				// record this for LinkText.cpp
				if ( hadPreMatch ) *hadPreMatch = true;
				continue;
			}
			// store ptr if NULL
			if ( ! needles[j].m_firstMatch )
				needles[j].m_firstMatch = (char *)p;
			// return ptr to needle in "haystack"
			if ( stopAtFirstMatch ) {
				// ok, we got a match
				if ( needleNum ) *needleNum = j;
				//return (char *)p;
				retVal = (char *)p;
				p = pend;
				break;
			}
			// otherwise, just count it
			needles[j].m_count++;
			// see if we match another needle, fixes bug
			// of matching "anal" but not "analy[tics]"
			continue;
			// advance to next char in the haystack
			break;
		}
		// ok, we did not match any needles, advance p and try again
	}

	//
	// HACK:
	// 
	// repeat above loop but for the last 4 characters in haystack!!
	// this fixes a electric fence mem breach core
	//
	// it is slower because we check for \0
	//
	pend += 4;

	for ( ; p < pend ; p++ ) {
		// breathe
		QUICKPOLL(niceness);
		//if ( (char *)p - (char *)haystack >= 12508 )
		//	log("hey");
		// is this a possible match? (this should be VERY fast)
		mask  = s0[*(p+0)];
		if ( ! mask ) continue;
		if ( p+1 < pend ) {
			mask &= s1[*(p+1)];
			if ( ! mask ) continue;
		}
		if ( p+2 < pend ) {
			mask &= s2[*(p+2)];
			if ( ! mask ) continue;
		}
		if ( p+3 < pend ) {
			mask &= s3[*(p+3)];
			if ( ! mask ) continue;
		}
		//debugCount++;
		/*
		// display
		char oo[148];
		char *xx ;
		xx = oo;
		//gbmemcpy ( xx , p , 8 );
		for ( int32_t k = 0 ; k < 5 ; k++ ) {
			*xx++ = p[k];
		}
		gbmemcpy ( xx , "..." , 3 );
		xx += 3;
		*/
		//
		// XXX: do a hashtable lookup here so we have the candidate
		//      matches in a chain... 
		// XXX: for small needles which match frequently let's have
		//      a single char hash table, a 2 byte char hash table,
		//      etc. so if we have small needles we check the hash
		//      in those tables first, but only if mask & SMALL_NEEDLE
		//      is true! the single byte needle hash table can just
		//      be a lookup table. just XOR the bytes together for
		//      the hash.
		// XXX: just hash the mask into a table to get candidate
		//      matches in a chain? but there's 4B hashes!!
		// we got a good candidate, loop through all the needles
		for ( int32_t j = 0 ; j < numNeedles ; j++ ) {
			// skip if does not match mask, will save time
			if ( ! ((1<<(j&0x3f)) & mask) ) continue;
			if( needles[j].m_stringSize > 3) {
				// ensure first 4 bytes matches this needle's
				if (needles[j].m_string[0]!=to_lower_a(*(p+0)))
					continue;
				if (!p[1] ||
				    needles[j].m_string[1]!=to_lower_a(*(p+1)))
					continue;
				if (!p[2] ||
				    needles[j].m_string[2]!=to_lower_a(*(p+2)))
					continue;
				if (!p[3] ||
				    needles[j].m_string[3]!=to_lower_a(*(p+3)))
					continue;
			}
			// get needle size
			int32_t msize = needles[j].m_stringSize;
			// can p possibly be big enough?
			if ( pend - p < msize ) continue;
			// needle is "m" now
			char *m    = needles[j].m_string;
			char *mend = needles[j].m_stringSize + m;
			// use a tmp ptr for ptr into haystack
			char *d = (char *)p;
			// skip first 4 bytes since we know they match
			if(msize > 3) {
				d += 4;
				m += 4;
			}
			// loop over each char in "m"
			//for ( ; *m ; m++ ) {
			for ( ; m < mend ; m++ ) {
				//while ( ! *d && d < dend ) d++;
				//while ( ! *m && m < mend ) m++;
				// if we are a non alnum, that will match
				// any string of non-alnums, like a space
				// for instance. the 0 byte does not count
				// because it is used in utf16 a lot. this
				// may trigger some false matches in utf16
				// but, oh well... this way "link partner"
				// will match "link  - partner" in the haystk
				if ( is_wspace_a(*m) && m < mend ) {
					// skip all in "d" then.
					while (d<dend&&is_wspace_a(*d)) d++;
					// advance m then
					continue;
				}
				// make sure we match otherwise
				if ( *m != to_lower_a(*d) ) break;
				// ok, we matched, go to next
				d++;
			}
			// if not null, keep going
			if ( m < mend ) continue;
			// if this needle is "special" AND it occurs AFTER
			// linkPos, then do not consider it a match. this is
			// if we have a comment section indicator, like
			// "div id=\"comment" AND it occurs AFTER linkPos
			// (the char ptr to our link in the haystack) then
			// the match does not count.
			if ( linkPos && needles[j].m_isSection && 
			     (char *)p>linkPos ) {
				// record this for LinkText.cpp
				if ( hadPreMatch ) *hadPreMatch = true;
				continue;
			}
			// store ptr if NULL
			if ( ! needles[j].m_firstMatch )
				needles[j].m_firstMatch = (char *)p;
			// return ptr to needle in "haystack"
			if ( stopAtFirstMatch ) {
				// ok, we got a match
				if ( needleNum ) *needleNum = j;
				//return (char *)p;
				retVal = (char *)p;
				p = pend;
				break;
			}
			// otherwise, just count it
			needles[j].m_count++;
			// advance to next char in the haystack
			break;
		}
		// ok, we did not match any needles, advance p and try again
	}


	//if ( debugCount > 0 ) pp = haystackSize / debugCount;
	//log("build: debug count = %"INT32" uc=%"INT32" hsize=%"INT32" "
	//    "1 in %"INT32" chars matches.",
	//    debugCount,(int32_t)isHaystackUtf16,haystackSize,pp);

	// before we exit, clean up
	return retVal;
}
int main ( int argc , char *argv[] ) {
	bool addWWW = true;
	bool stripSession = true;
	// check for arguments
	for (int32_t i = 1; i < argc; i++) {
		if (strcmp(argv[i], "-w") == 0)
			addWWW = false;
		else if (strcmp(argv[i], "-s") == 0)
			stripSession = false;
	}
	// initialize
	//g_mem.init(100*1024);
	hashinit();
	//g_conf.m_tfndbExtBits = 23;
 loop:
	// read a url from stddin
	char sbuf[1024];
	if ( ! fgets ( sbuf , 1024 , stdin ) ) exit(1);
	char *s = sbuf;
	char fbuf[1024];
	// decode if we should
	if ( strncmp(s,"http%3A%2F%2F",13) == 0 ||
	     strncmp(s,"https%3A%2F%2F",13) == 0 ) {
		urlDecode(fbuf,s,gbstrlen(s));
		s = fbuf;
	}
	// old url
	printf("###############\n");
	printf("old: %s",s);
	int32_t slen = gbstrlen(s);
	// remove any www. if !addWWW
	if (!addWWW) {
		if (slen >= 4 &&
		    strncasecmp(s, "www.", 4) == 0) {
			slen -= 4;
			memmove(s, &s[4], slen);
		}
		else {
			// get past a ://
			int32_t si = 0;
			while (si < slen &&
			       ( s[si] != ':' ||
				 s[si+1] != '/' ||
				 s[si+2] != '/' ) )
				si++;
			// remove the www.
			if (si + 7 < slen) {
				si += 3;
				if (strncasecmp(&s[si], "www.", 4) == 0) {
					slen -= 4;
					memmove(&s[si], &s[si+4], slen-si);
				}
			}
		}
	}
	// set it
	Url u;
	u.set ( s , slen ,
		addWWW   ,      /*add www?*/
		stripSession ); /*strip session ids?*/
	// print it
	char out[1024*4];
	char *p = out;
	p += sprintf(p,"tld: ");
	gbmemcpy ( p, u.getTLD(),u.getTLDLen());
	p += u.getTLDLen();
	char c = *p;
	*p = '\0';
	printf("%s\n",out);
	*p = c;
	

	// dom
	p = out;
	sprintf ( p , "dom: ");
	p += gbstrlen ( p );
	gbmemcpy ( p , u.getDomain() , u.getDomainLen() );
	p += u.getDomainLen();
	c = *p;
	*p = '\0';
	printf("%s\n",out);
	*p = c;
	// host
	p = out;
	sprintf ( p , "host: ");
	p += gbstrlen ( p );
	gbmemcpy ( p , u.getHost() , u.getHostLen() );
	p += u.getHostLen();
	c = *p;
	*p = '\0';
	printf("%s\n",out);
	*p = c;
	// then the whole url
	printf("url: %s\n", u.getUrl() );

	/*
	int32_t  siteLen;
	char *site = u.getSite ( &siteLen , NULL , false );
	if ( site ) {
		c = site[siteLen];
		site[siteLen] = '\0';
	}
	printf("site: %s\n", site );
	if ( site ) site[siteLen] = c;
	*/
	SiteGetter sg;
	sg.getSite ( u.getUrl() ,
		     NULL , // tagrec
		     0 , // timestamp
		     NULL, // coll
		     0 , // niceness
		     //false , // addtags
		     NULL , // state
		     NULL ); // callback
	if ( sg.m_siteLen )
		printf("site: %s\n",sg.m_site);

	printf("isRoot: %"INT32"\n",(int32_t)u.isRoot());

	/*
	bool perm = ::isPermalink ( NULL        , // coll
				    NULL        , // Links ptr
				    &u          , // the url
				    CT_HTML     , // contentType
				    NULL        , // LinkInfo ptr
				    false       );// isRSS?
	printf ("isPermalink: %"INT32"\n",(int32_t)perm);
	*/

	// print the path too
	p = out;

	p += sprintf ( p , "path: " );
	gbmemcpy ( p , u.getPath(), u.getPathLen() );
	p += u.getPathLen();

	if ( u.getFilename() ) {
		p += sprintf ( p , "\nfilename: " );
		gbmemcpy ( p , u.getFilename(), u.getFilenameLen() );
		p += u.getFilenameLen();
		*p = '\0';
		printf("%s\n", out );
	}

	// encoded
	char dst[MAX_URL_LEN+200];
	urlEncode ( dst,MAX_URL_LEN+100,
				u.getUrl(), u.getUrlLen(), 
				false ); // are we encoding a request path?
	printf("encoded: %s\n",dst);

	// the probable docid
	int64_t pd = g_titledb.getProbableDocId(&u);
	printf("pdocid: %"UINT64"\n", pd );
	printf("dom8: 0x%"XINT32"\n", (int32_t)g_titledb.getDomHash8FromDocId(pd) );
	//printf("ext23: 0x%"XINT32"\n",g_tfndb.makeExt(&u));
	if ( u.isLinkLoop() ) printf("islinkloop: yes\n");
	else                  printf("islinkloop: no\n");
	int64_t hh64 = u.getHostHash64();
	printf("hosthash64: 0x%016"XINT64"\n",hh64);
	uint32_t hh32 = u.getHostHash32();
	printf("hosthash32: 0x%08"XINT32" (%"UINT32")\n",hh32,hh32);
	int64_t dh64 = u.getDomainHash64();
	printf("domhash64: 0x%016"XINT64"\n",dh64);
	int64_t uh64 = u.getUrlHash64();
	printf("urlhash64: 0x%016"XINT64"\n",uh64);
	//if(isUrlUnregulated(NULL ,0,&u)) printf("unregulated: yes\n");
	//else                            printf("unregulated: no\n");
	goto loop;
}
// . parse the http status line and the header fields we care about out of
//   "mime", which is "mimeLen" bytes long
// . "url" is the url the reply came from and is only used as the base for
//   a "Location:" redirect url. it may be NULL, in which case m_locUrl is
//   left reset.
// . each line is temporarily NUL-terminated in place while it is examined,
//   so "mime" must be writable. the original byte is always restored.
// . m_contentLengthPos, m_cookie, m_locationField and m_contentEncodingPos
//   are set to point INTO "mime", not to copies
// . returns false on bad mime: NULL/empty input, no status code, or a line
//   that is not terminated by \n or \r
bool HttpMime::parse ( char *mime , long mimeLen , Url *url ) {
	// reset locUrl to 0
	m_locUrl.reset();
	// . return if we have no valid complete mime
	// . a NULL ptr or negative length is treated like an empty mime
	//   rather than being handed to atol2() below
	if ( ! mime || mimeLen <= 0 ) return false;
	// status is on first line
	m_status = -1;
	// skip HTTP/x.x till we hit a space
	char *p = mime;
	char *pend = mime + mimeLen;
	while ( p < pend && !is_wspace_a(*p) ) p++;
	// then skip over spaces
	while ( p < pend &&  is_wspace_a(*p) ) p++;
	// return false on a problem
	if ( p == pend ) return false;
	// then read in the http status
	m_status = atol2 ( p , pend - p );
	// if no Content-Type: mime field was provided, assume html
	m_contentType = CT_HTML;
	// assume default charset
	m_charset    = NULL;
	m_charsetLen = 0;
	// . set contentLen, lastModifiedDate, m_cookie, ...
	// . start over at the first line. it matches none of the field
	//   names below so it is just skipped.
	p = mime;
	while ( p < pend ) {
		// compute the length of the string starting at p and ending
		// at a \n or \r
		long len = 0;
		while ( &p[len] < pend && p[len]!='\n' && p[len]!='\r' ) len++;
		// . if we could not find a \n or \r there was an error
		// . MIMEs must always end in \n or \r
		if ( &p[len] >= pend ) return false;
		// . stick a NULL at the end of the line
		// . overwrites \n or \r TEMPORARILY
		char c = p [ len ];
		p [ len ] = '\0';
		// parse out some meaningful data
		if      ( strncasecmp ( p , "Content-Length:" ,15) == 0 ) {
			m_contentLengthPos = p + 15;
			m_contentLen = atol( m_contentLengthPos);
		}
		else if ( strncasecmp ( p , "Last-Modified:"  ,14) == 0 ) {
			m_lastModifiedDate=atotime(p+14);
			// do not let them exceed current time for purposes
			// of sorting by date using datedb (see Msg16.cpp)
			time_t now = time(NULL);
			if (m_lastModifiedDate > now) m_lastModifiedDate = now;
		}
		else if ( strncasecmp ( p , "Content-Type:"   ,13) == 0 )
			m_contentType = getContentTypePrivate ( p + 13 );
		else if ( strncasecmp ( p , "Set-Cookie:"     ,11) == 0 ) {
			// includes any space that follows the colon
			m_cookie = p + 11;
			m_cookieLen = gbstrlen ( p + 11 );
		}
		else if ( strncasecmp ( p , "Location:"       , 9) == 0 ) {
			// point to it
			char *tt = p + 9;
			// skip up to two leading spaces
			if ( *tt == ' ' ) tt++;
			if ( *tt == ' ' ) tt++;
			// at least set this for Msg13.cpp to use
			m_locationField    = tt;
			m_locationFieldLen = gbstrlen(tt);
			// . we don't add the "www." because of slashdot.com
			// . we skip initial spaces in this Url::set() routine
			if(url)
				m_locUrl.set ( url, p + 9, len - 9,
					       false/*addWWW?*/);
		}
		else if ( strncasecmp ( p , "Content-Encoding:", 17) == 0 ) {
			// only support gzip now, it doesn't seem like servers
			// implement the other types much
			m_contentEncodingPos = p+17;
			if(strstr(m_contentEncodingPos, "gzip")) {
				m_contentEncoding = ET_GZIP;
			}
			else if(strstr(m_contentEncodingPos, "deflate")) {
				// zlib's compression
				m_contentEncoding = ET_DEFLATE;
			}
		}
		// re-insert the character that we replaced with a '\0'
		p [ len ] = c;
		// go to next line
		p += len;
		// skip over the cruft at the end of this line
		while ( p < pend && ( *p=='\r' || *p=='\n' ) ) p++;
	}
	return true;
}
// Example #7
// 0
// this should be called when all docs have finished spidering
void Test::stopIt ( ) {

	// sanity
	if ( m_isAdding ) { char *xx=NULL;*xx=0; }
	// flag that we are done
	m_isRunning = false;

	// print time
	log("test: took %lli ms to complete injections.",
	    gettimeofdayInMilliseconds() - m_testStartTime );

	// get this before setting testParserEnabled to false
	char *testDir = g_test.getTestDir();

	// turn this off now too
	g_conf.m_testParserEnabled = false;
	g_conf.m_testSpiderEnabled = false;



	// save all!
	bool disabled = g_threads.m_disabled;
	g_threads.disableThreads();
	// save it blocking style
	g_process.save();
	if ( ! disabled ) g_threads.enableThreads();

	// save ips.txt
	saveTestBuf ( testDir );

	log("test: test completed. making qa.html");

	//
	//
	// NOW MAKE THE qa.html FILE
	//
	//

	// only analyze up to last 7 runs
	long start = m_runId - 7;
	if ( start < 0 ) start = 0;

	SafeBuf sb;
	sb.safePrintf("<table border=1>\n");
	sb.safePrintf("<tr>"
		      "<td><b><nobr>run id</nobr></b></td>"
		      "<td><b><nobr>conf diff</nobr></b></td>"
		      "<td><b><nobr>coll diff</nobr></b></td>"
		      "<td><b><nobr>run info</nobr></b></td>"
		      "</tr>\n");

	// take diffs between this run and the last run for confparms
	for ( long i = m_runId ; i > start ; i-- ) {
		// shortcut
		char *dir = g_hostdb.m_dir;
		// make diff filename
		char diff1[200];
		sprintf(diff1,"%s/%s/run.%li.confparms.txt.diff",dir,
			testDir,i);
		File f1;
		f1.set(diff1);
		if ( ! f1.doesExist() ) {
			char df1[200];
			char df2[200];
			sprintf(df1,"%s/%s/run.%li.confparms.txt",dir,
				testDir,i);
			sprintf(df2,"%s/%s/run.%li.confparms.txt",dir,
				testDir,i-1);
			// do the diff
			char cmd[600];
			sprintf(cmd,"diff %s %s > %s",df1,df2,diff1);
			log("test: system(\"%s\")",cmd);
			system (cmd);
		}
		long fs1 = f1.getFileSize();
		sb.safePrintf("<tr><td>%li</td><td>%li</td>", i,fs1);

		// make diff filename
		char diff2[200];
		sprintf(diff2,"%s/%s/run.%li.collparms.txt.diff",dir,
			testDir,i);
		File f2;
		f2.set(diff2);
		if ( ! f2.doesExist() ) {
			char df1[200];
			char df2[200];
			sprintf(df1,"%s/%s/run.%li.collparms.txt",dir,
				testDir,i);
			sprintf(df2,"%s/%s/run.%li.collparms.txt",dir,
				testDir,i-1);
			// do the diff
			char cmd[600];
			sprintf(cmd,"diff %s %s > %s",df1,df2,diff2);
			log("test: system(\"%s\")",cmd);
			system (cmd);
		}
		long fs2 = f2.getFileSize();
		sb.safePrintf("<td>%li</td>", fs2);

		// the version
		char vf[200]; 
		sprintf(vf,"%s/%s/run.%li.version.txt",dir,testDir,i);
		File f3; 
		f3.set ( vf );
		long fs3 = f3.getFileSize();
		char vbuf[1000];
		vbuf[0] = 0;
		if ( fs3 > 0 ) {
			f3.open(O_RDONLY);
			long rs = f3.read(vbuf,fs3,0);
			vbuf[fs3] = '\0';
			if ( rs <= 0 ) continue;
			f3.close();
		}
		// show it
		sb.safePrintf("<td><pre>%s</pre></td></tr>\n", vbuf);
	}
	sb.safePrintf("</table>\n");
	sb.safePrintf("<br>\n");


	//
	// now diff each parser output file for each url in urls.txt
	//


	//
	// loop over url buf first so we can print one table per url
	//

	char *next = NULL;
	// reset the url buf ptr
	m_urlPtr = m_urlBuf;
	// count em
	long count = 0;

	// ptrs to each url table
	long  un = 0;
	long  uptr [5000]; // offsets now, not char ptr since buf gets reallocd
	char  udiff[5000];
	long  ulen [5000];
	long  uhits[5000]; // critical errors! validateOutput() choked!
	long  uunchecked[5000]; // events/addresses found but were not validatd
	long  umiss[5000];
	long  usort[5000];
	long  uevents[5000];
	SafeBuf tmp;

	long niceness = MAX_NICENESS;

	// advance to next url
	for ( ; m_urlPtr < m_urlEnd ; m_urlPtr = next ) {
		// breathe
		QUICKPOLL(niceness);
		// we converted all non-url chars into \0's so skip those!
		for ( ; m_urlPtr<m_urlEnd && !*m_urlPtr ; m_urlPtr++ );
		// breach check
		if ( m_urlPtr >= m_urlEnd ) break;
		// set this up
		next = m_urlPtr;
		// compute next url ptr
		for ( ; next < m_urlEnd && *next ; next++ );
		// point to this url
		char *u = m_urlPtr;
		// get hash
		long long h = hash64 ( u , gbstrlen(u) );
		// shortcut
		char *dir = g_hostdb.m_dir;


		// print into a secondary safe buf with a ptr to
		// it so we can sort that and transfer into the
		// primary safebuf later
		uptr[un] = tmp.length();
		// assume no diff
		udiff[un] = 0;

		// print number
		tmp.safePrintf("%li) ",count++);
		// . link to our stored http server reply
		// . TODO: link it to our [cached] copy in the test coll!!!
		char local[1200];
		sprintf(local,"/%s/doc.%llu.html",testDir,h);
		tmp.safePrintf("<a href=\"%s\"><b>%s</b></a> ",local,u);
		// link to live page
		tmp.safePrintf(" <a href=\"%s\">live</a> ",u);
		// link to page parser
		char ubuf[2000];
		urlEncode(ubuf,2000,u,gbstrlen(u),true);
		tmp.safePrintf(" <a href=\"/master/parser?c=test&"
			       "u=%s\">parser</a> ",ubuf);
		//tmp.safePrintf(" (%llu)",h);
		tmp.safePrintf("<br>\n");
		//tmp.safePrintf("<br>\n");
		tmp.safePrintf("<table border=1>\n");
		tmp.safePrintf("<tr>"
			      "<td><b><nobr>run id</nobr></b></td>"
			      "<td><b><nobr>crit hits</nobr></b></td>"
			      "<td><b><nobr>crit errors</nobr></b></td>"
			      "<td><b><nobr># e</nobr></b></td>"
			      "<td><b><nobr>unchecked</nobr></b></td>"
			      "<td><b><nobr>diff chars</nobr></b></td>"
			      "<td><b><nobr>diff file</nobr></b></td>"
			      "<td><b><nobr>full output</nobr></b></td>"
			      "</tr>\n");

		//SafeBuf sd;

		// loop over all the runs now, starting with latest run first
		for ( long ri = m_runId ; ri >= start ; ri-- ) {

			QUICKPOLL(niceness);

			// the diff filename
			char pdiff[200];
			sprintf(pdiff,"%s/%s/parse.%llu.%li.html.diff",dir,
				testDir,h,ri);
			File f;
			f.set(pdiff);
			long fs = f.getFileSize();
			if ( ! f.doesExist() && ri > 0 ) {
				// make the parse filename
				char pbuf1[200];
				char pbuf2[200];
				sprintf(pbuf1,"%s/%s/parse.%llu.%li.html",
					dir,testDir,h,ri);
				sprintf(pbuf2,"%s/%s/parse.%llu.%li.html",
					dir,testDir,h,ri-1);
				// sanity check
				//File tf; tf.set(pbuf1);
				//if ( ! tf.doesExist()) {char *xx=NULL;*xx=0;}
				// tmp file name
				char tmp1[200];
				char tmp2[200];
				sprintf(tmp1,"%s/%s/t1.html",dir,testDir);
				sprintf(tmp2,"%s/%s/t2.html",dir,testDir);
				// filter first
				char cmd[600];
				sprintf(cmd,
					"cat %s | "
					"grep -v \"<!--ignore-->\" "
					" > %s", pbuf1,tmp1);
				system(cmd);
				sprintf(cmd,
					"cat %s | "
					"grep -v \"<!--ignore-->\" "
					" > %s", pbuf2,tmp2);
				system(cmd);
				// make the system cmd to do the diff
				sprintf(cmd,
					"echo \"<pre>\" > %s ; "
					"diff -w --text %s %s "
					// ignore this table header row
					//" | grep -v \"R#4\""
					" >> %s",
					pdiff,
					tmp1,tmp2,pdiff);
				log("test: system(\"%s\")",cmd);
				system(cmd);
				// try again
				f.set(pdiff);
				fs = f.getFileSize();
			}

			QUICKPOLL(niceness);

			// this means 0 . it just has the <pre> tag in it!
			if ( fs < 0 || fs == 6 ) fs = 0;
			// . if no diff and NOT current run, do not print it
			// . print it if the run right before the current 
			//   now always too
			if ( ri != m_runId && ri != m_runId-1 && fs == 0 ) 
				continue;
			// relative filename
			char rel[200];
			sprintf(rel,"/%s/parse.%llu.%li.html.diff",
				testDir,h,ri);
			char full[200];
			sprintf(full,"/%s/parse.%llu.%li.html",
				testDir,h,ri);
			char validate[200];
			sprintf(validate,
				"/%s/parse-shortdisplay.%llu.%li.html",
				testDir,h,ri);
			// use red font for current run that has a diff!
			char *t1 = "";
			char *t2 = "";
			if ( ri == m_runId && fs != 0 ) {
				t1 = "<font color=pink><b>";
				t2 = "</b></font>";
				// a diff
				udiff[un] = 1;
			}

			// . get critical errors
			// . i.e. XmlDoc::validateOutput() could not validate
			//   a particular event or address that was in the
			//   url's "validated.uh64.txt" file since the admin
			//   clicked on the checkbox in the page parser output
			// . if we do not find such a tag in the parser output
			//   any more then Spider.cpp creates this file!
			if ( ri == m_runId ) {
				char cfile[256];
				sprintf(cfile,"%s/%s/critical.%llu.%li.txt",
					g_hostdb.m_dir,testDir,h,ri);
				SafeBuf ttt;
				ttt.fillFromFile(cfile);
				// first long is misses, then hits then events
				umiss[un] = 0;
				uhits[un] = 0;
				uevents[un] = 0;
				uunchecked[un] = 0;
				if ( ttt.length() >= 3 )
					sscanf(ttt.getBufStart(),
					       "%li %li %li %li",
					       &umiss[un],
					       &uhits[un],
					       &uevents[un],
					       &uunchecked[un]);
				usort[un] = umiss[un] + uunchecked[un];
				//File cf;
				//cf.set(cfile);
				//if ( cf.doesExist()) ucrit[un] = 1;
				//else                 ucrit[un] = 0;
			}

			// more critical?
			if ( ri == m_runId && umiss[un] != 0 ) {
				t1 = "<font color=red><b>";
				t2 = "</b></font>";
			}

			// . these are good to have
			// . if you don't have 1+ critical hits then you
			//   probably need to be validate by the qa guy
			char *uhb1 = "";
			char *uhb2 = "";
			if ( ri == m_runId && uhits[un] != 0 ) {
				uhb1 = "<font color=green><b>**";
				uhb2 = "**</b></font>";
			}

			QUICKPOLL(niceness);

			char *e1 = "<td>";
			char *e2 = "</td>";
			long ne = uevents[un];
			if ( ne ) { 
				e1="<td bgcolor=orange><b><font color=brown>"; 
				e2="</font></b></td>"; 
			}
			char *u1 = "<td>";
			char *u2 = "</td>";
			if ( uunchecked[un] ) {
				u1="<td bgcolor=purple><b><font color=white>"; 
				u2="</font></b></td>"; 
			}
				
			// print the row!
			tmp.safePrintf("<tr>"
				      "<td>%s%li%s</td>"
				       "<td>%s%li%s</td>" // critical hits
				       "<td>%s%li%s</td>" // critical misses
				       "%s%li%s" // # events
				       "%s%li%s" // unchecked
				       "<td>%s%li%s</td>" // filesize of diff
				      // diff filename
				      "<td><a href=\"%s\">%s%s%s</a></td>"
				      // full parser output
				      "<td>"
				       "<a href=\"%s\">full</a> | "
				       "<a href=\"%s\">validate</a> "
				       "</td>"
				      "</tr>\n",
				      t1,ri,t2,
				      uhb1,uhits[un],uhb2,
				      t1,umiss[un],t2,
				      e1,ne,e2,
				       u1,uunchecked[un],u2,
				      t1,fs,t2,
				      rel,t1,rel,t2,
				      full,
				       validate);


			// only fill "sd" for the most recent guy
			if ( ri != m_runId ) continue;

			// now concatenate the parse-shortdisplay file
			// to this little table so qa admin can check/uncheck
			// validation checkboxes for addresses and events
			//sprintf(cfile,
			//	"%s/test/parse-shortdisplay.%llu.%li.html",
			//	g_hostdb.m_dir,h,ri);
			//sd.fillFromFile ( cfile );
		}
		// end table
		tmp.safePrintf("</table>\n");

		// . and a separate little section for the checkboxes
		// . should already be in tables, etc.
		// . each checkbox should provide its own uh64 when it
		//   calls senddiv() when clicked now
		//tmp.cat ( sd );

		tmp.safePrintf("<br>\n");
		tmp.safePrintf("<br>\n");
		// set this
		ulen[un] = tmp.length() - uptr[un] ;
		// sanity check
		if ( ulen[un] > 10000000 ) { char *xx=NULL;*xx=0; }
		// inc it
		un++;
		// increase the 5000!!
		if ( un >= 5000 ) { char *xx=NULL; *xx=0; }
	}


	char flag ;
 bubble:
	flag = 0;
	// sort the url tables
	for ( long i = 0 ; i < un - 1 ; i++ ) {
		QUICKPOLL(niceness);
		if ( usort[i] >  usort[i+1] ) continue;
		if ( usort[i] == usort[i+1] ) 
			if ( udiff[i] >= udiff[i+1] ) continue;
		// swap em
		long  tp = uptr[i];
		long  td = udiff[i];
		long  um = umiss[i];
		long  us = usort[i];
		long  uh = uhits[i];
		long  tl = ulen [i];
		uptr[i] = uptr[i+1];
		umiss[i] = umiss[i+1];
		usort[i] = usort[i+1];
		uhits[i] = uhits[i+1];
		udiff[i] = udiff[i+1];
		ulen[i]  = ulen[i+1];
		uptr[i+1] = tp;
		umiss[i+1] = um;
		usort[i+1] = us;
		uhits[i+1] = uh;
		udiff[i+1] = td;
		ulen [i+1] = tl;
		flag = 1;
	}
	if ( flag ) goto bubble;

	// transfer into primary safe buf now
	for ( long i = 0 ; i < un ; i++ ) 
		sb.safeMemcpy(tmp.getBufStart() + uptr[i],ulen[i]);


	sb.safePrintf("</html>\n");

	char dfile[200];
	sprintf(dfile,"%s/%s/qa.html",g_hostdb.m_dir,testDir);
	sb.dumpToFile ( dfile );

	// free the buffer of urls
	reset();

	// turn off spiders
	g_conf.m_spideringEnabled = 0;

	// all done
	return;
}
void Blaster::startBlastering(){
	int64_t now=gettimeofdayInMilliseconds();
	if(m_print && m_totalDone>0 && (m_totalDone % 20)==0){
		log("blaster: Processed %"INT32" urls in %"INT32" ms",m_totalDone,
		    (int32_t) (now-m_startTime));
		m_print=false;
	}
	//Launch the maximum number of threads that are allowed
	while ( m_p1 < m_p1end && m_launched < m_maxNumThreads && m_totalUrls){
		// clear any error
		g_errno = 0;
		// make a new state
		StateBD *st;
		try { st = new (StateBD); }
		catch ( ... ) {
			g_errno = ENOMEM;
			log("blaster: Failed. "
			    "Could not allocate %"INT32" bytes for query. "
			    "Returning HTTP status of 500.",
			    (int32_t)sizeof(StateBD));
			return;
		}
		mnew ( st , sizeof(StateBD) , "BlasterDiff3" );
		st->m_buf1=NULL;
		m_totalUrls--;
		// make into a url class. Set both u1 and u2 here.
		//st->m_u1.set ( m_p1 , gbstrlen(m_p1) );
		st->m_u1 = m_p1;
		// is it an injection url
		if ( m_doInjection || m_doInjectionWithLinks ) {
			// get host #0 i guess
			Host *h0 = g_hostdb.getHost(0);
			if ( ! h0 ) { char *xx=NULL;*xx=0; }
			static bool s_flag = true;
			if ( s_flag ) {
				s_flag = false;
				log("blaster: injecting to host #0 at %s on "
				    "http/tcp port %"INT32"",
				    iptoa(h0->m_ip),
				    (int32_t)h0->m_httpPort);
			}
			// use spiderlinks=1 so we add the outlinks to spiderdb
			// but that will slow the spider rate down since it 
			// will have to do a dns lookup on the domain of every
			// outlink.
			st->m_injectUrl.safePrintf("http://127.0.0.1:8000/"
						   "admin/inject?");
			if ( m_doInjectionWithLinks )
				st->m_injectUrl.safePrintf("spiderlinks=1&");
			else
				st->m_injectUrl.safePrintf("spiderlinks=0&");
			st->m_injectUrl.safePrintf("u=");
			st->m_injectUrl.urlEncode(m_p1);
			st->m_injectUrl.pushChar('\0');
			st->m_u1 = st->m_injectUrl.getBufStart();
		}
		// skip to next url
		m_p1 += gbstrlen ( m_p1 ) + 1;
		if (m_blasterDiff){
			//st->m_u2.set ( m_p2 , gbstrlen(m_p2) );
			st->m_u2 = m_p2;
			m_p2 += gbstrlen ( m_p2 ) + 1;
		}

		//		log(LOG_WARN,"\n");
		log(LOG_WARN,"blaster: Downloading %s",st->m_u1);
		// set port if port switch is true
		//if ( m_portSwitch ) {
		//	int32_t r = rand() % 32;
		//	u.setPort ( 8000 + r );
		//}

		// count it
		m_launched++;
		int32_t ip=0;
		int32_t port=0;
		if (m_useProxy){
			ip=atoip("66.154.102.20",13);
			port=3128;
		}
		// get it
		bool status = g_httpServer.getDoc ( st->m_u1 , // url
						    0, // ip
						    0 ,  // offset
						    -1 ,  // size
						    0 , // ifModifiedSince
						    st ,  // state
						    gotDocWrapper1, // callback
						    60*1000, // timeout
						    ip,
						    port,
						    30*1024*1024, //maxLen
						    30*1024*1024);
		// continue if it blocked
		if ( ! status ) continue;
		// If not blocked, there is an error.
		m_launched--;
		// log msg
		log("From file1, got doc1 %s: %s", st->m_u1 , 
		    mstrerror(g_errno) );
		// we gotta wait
		break;
	}
	// bail if not done yet
	//if ( m_launched > 0 ) return;
	if (m_totalUrls) return;
	//otherwise return if launched have not come back
	if (m_launched) return;
	// exit now
	//	g_conf.save();
	//	closeALL(NULL,NULL);
	exit ( 0 );
}
// Example #9
// 0
// . deserialize a raw catdb record, "data" of "dataSize" bytes, into us
// . record layout as parsed below:
//     1 byte             number of catids, n
//     4*n bytes          the catids
//     3 bytes + 1 byte   filenum and version (all four 0xff bytes read as
//                        a filenum of -1 means version 0)
//     the site url       runs to the end of the record for version 0,
//                        otherwise it is NUL-terminated
// . if "data" is NULL or empty the url had no rec, so we just default
//   m_site to the hostname of "url" and return true with m_hadRec false
// . "url" may be NULL
// . "gotByIp" is just recorded in m_gotByIp
// . returns false and sets g_errno to EBUFTOOSMALL if the record does not
//   fit into m_data
// . returns false WITHOUT setting g_errno if the record is truncated or
//   does not deserialize to exactly "dataSize" bytes
bool CatRec::set ( Url *url , char *data , long dataSize , bool gotByIp ) {
	// assume url does not have a rec in tagdb
	m_hadRec = false;
	// . if "data" is NULL or empty the rec did not exist... so make a
	//   dummy rec
	// . MDW: why?
	if ( ! data || dataSize <= 0 ) {
		if ( url ) {
			// default m_site to the hostname
			m_site.set (url->getHost(),url->getHostLen(),
				    false/*addwww?*/);
			// steal ip from url
			m_site.setIp ( url->getIp() );
		}
		m_filenum = 0 ;
		return true;
	}
	// return false and set g_errno if buf too small
	if ( dataSize >= CATREC_BUF_SIZE ) {
		g_errno = EBUFTOOSMALL;
		return false;
	}
	// for the log msgs below
	const char *ustr = url ? url->getUrl() : "(null)";
	// copy the raw data
	memcpy(m_data, data, dataSize);
	m_dataSize = dataSize;
	// set up a parsing ptr into our copy
	char *p    = m_data;
	char *pend = m_data + dataSize;
	// get the catids
	m_numCatids = *(unsigned char*)p;
	p++;
	// the catids and the filenum/version word must fit in the record
	if ( pend - p < 4*(long)m_numCatids + 4 ) {
		log ( "tagdb: Truncated record of %li bytes for url %s so "
		      "ignoring tagdb record.",
		      dataSize , ustr );
		return false;
	}
	m_catids = (long*)p;
	p += 4*m_numCatids;
	// . get the filenum (0 is default) from the low 3 bytes, the 4th
	//   byte is the version
	// . NOTE(review): this reads sizeof(long) bytes which is more than
	//   the 4 stored where long is 64 bits -- confirm the type of
	//   m_filenum and the target platform
	m_filenum  = *(long *) p ;  p += 3;
	// get the version
	if ( m_filenum == -1 ) {
		m_version = 0;
		p++;
	}
	else {
		m_filenum &= 0x00FFFFFF;
		m_version = *p;
		p++;
	}
	// calc site url length
	if ( m_version == 0 ) {
		// everything after the header is the url
		m_urlLen = dataSize - 4;
		m_urlLen -= (4*m_numCatids) + 1;
	}
	else {
		// the url is NUL-terminated, do not scan past the record
		char *nul = (char *)memchr ( p , '\0' , pend - p );
		if ( ! nul ) {
			log ( "tagdb: Truncated record of %li bytes for url "
			      "%s so ignoring tagdb record.",
			      dataSize , ustr );
			return false;
		}
		m_urlLen = nul - p;
	}
	// set our site url
	m_url = p;
	m_site.set ( p , m_urlLen , false/*addwww?*/);
	// move p to end of url
	p += m_urlLen;
	// and over its terminating \0
	if ( m_version >= 1 )
		p++;

	// sanity check, we must have consumed the record exactly
	if ( p - m_data != m_dataSize ) {
		log ( "tagdb: Deserialized datasize %li != %li for url %s so "
		      "ignoring tagdb record.",
		      (long)(p - m_data), (long)m_dataSize , ustr );
		return false;
	}

	// if hostname is same as url we can use the ip from url
	if ( url && m_site.getHostLen() == url->getHostLen() )
		m_site.setIp ( url->getIp() );
	// . this url had it's own rec in the db
	// . Msg16 needs to know this so it won't auto-detect p**n/spam in
	//   this url itself and delete it from tfndb
	m_hadRec = true;
	// if rec was in tagdb, data will be non-null.. did we get the rec
	// from tagdb by matching an IP? (as oppossed to canonical name)
	m_gotByIp = gotByIp;
	return true;
}
void Blaster::runBlaster(char *file1,char *file2,
			 int32_t maxNumThreads, int32_t wait, bool isLogFile,
			 bool verbose,bool justDisplay,
			 bool useProxy ,
			 bool injectUrlWithLinks ,
			 bool injectUrl ) {
	if (!init())
		return;
	m_blasterDiff=true;
	if (!file2)
		m_blasterDiff=false;
	
	// set File class
	File f1;
	f1.set ( file1 );

	// open files
	if ( ! f1.open ( O_RDONLY ) ) {
		log("blaster:open: %s %s",file1,mstrerror(g_errno)); 
		return; 
	}

	// get file size
	int32_t fileSize1 = f1.getFileSize() ;
	// store a \0 at the end
	int32_t m_bufSize1 = fileSize1 + 1;

	m_doInjectionWithLinks = injectUrlWithLinks;
	m_doInjection = injectUrl;

	// make buffers to hold all
	m_buf1 = (char *) mmalloc ( m_bufSize1 , "blaster1" );
	if ( ! m_buf1) {
		log("blaster:mmalloc: %s",mstrerror(errno));
		return;
	}

	//char *bufEnd = buf + bufSize;

	// set m_p1
	m_p1    = m_buf1;
	m_p1end = m_buf1 + m_bufSize1 - 1;

	// read em all in
	if ( ! f1.read ( m_buf1 , fileSize1 , 0 ) ) {
		log("blaster:read: %s %s",file1,mstrerror(g_errno));
		return;
	}

	// change \n to \0
	//char *p = buf;
	int32_t  n = 0;
	for ( int32_t i = 0 ; i < m_bufSize1 ; i++ ) {
		if ( m_buf1[i] != '\n' ) continue;
		m_buf1[i] = '\0';
		n++;
	}


	if (m_blasterDiff){
		File f2;
		f2.set ( file2 );
		if ( ! f2.open ( O_RDONLY ) ) {
			log("blaster:open: %s %s",file2,mstrerror(g_errno)); 
			return; 
		}
		int32_t fileSize2 = f2.getFileSize() ;
		int32_t m_bufSize2 = fileSize2 + 1;
		m_buf2 = (char *) mmalloc ( m_bufSize2 , "blaster2" );
		if ( ! m_buf2) {
			log("blaster:mmalloc: %s",mstrerror(errno));
			return;
		}
		// set m_p2
		m_p2    = m_buf2;
		m_p2end = m_buf2 + m_bufSize2 - 1;
		if ( ! f2.read ( m_buf2 , fileSize2 , 0 ) ) {
			log("blaster:read: %s %s",file2,mstrerror(g_errno));
			return;
		}
		int32_t m=0;
		for ( int32_t i = 0 ; i < m_bufSize2 ; i++ ) {
			if ( m_buf2[i] != '\n' ) continue;
			m_buf2[i] = '\0';
			m++;
		}
		// Working on only the least number of urls from both files, 
		//because we need to work in pairs
		if (m<n) n=m;
		else m=n;
		m_totalUrls=n;

		// should we print out all the logs?
		m_verbose=verbose;
		// Should we use the proxy for getting the first Doc
		m_useProxy=useProxy;
		// Should we just display the not present links and not fetch
		// the page to see if they are actually present ?
		m_justDisplay=justDisplay;
	}
	else{
		m_isLogFile=isLogFile;
		
		/*if reading a gigablast log file, find the lines that have 
		  GET and POST commands for search, and register a sleep
		  callback for those lines with sleepWrapperLog*/
		if(!isLogFile)
			m_totalUrls=n;
		else {
			m_totalUrls=0;
			char *p=m_buf1;
			char *pend=p+m_bufSize1;
			
			// start is the time in milliseconds of the first log 
			// message
			int64_t start=atoll(m_buf1);
			while(p<pend) {
				char *lineStart=p;
				char *urlStart=strstr(p," GET /search");
				if (!urlStart)
					urlStart=strstr(p," POST /search");
				if(!urlStart){
					p+=gbstrlen(p)+1; //goto next line
					continue;
				}
				urlStart++;
				m_wait=atoll(lineStart)-start;
				// register it here
				g_loop.registerSleepCallback(m_wait , 
							     urlStart, 
							     sleepWrapperLog);
				m_totalUrls++;
				p+=gbstrlen(p)+1;
			}
		}
	}
	log(LOG_INIT,"blaster: read %"INT32" urls into memory", 
	    m_totalUrls );

	if(!isLogFile){
		// get min time bewteen each spider in milliseconds
		m_wait = wait;
			
		// # of threads
		m_maxNumThreads = maxNumThreads;
		
		m_launched=0;
		
		m_portSwitch = 0;
		//if ( argc == 4 ) m_portSwitch = 1;
		//else             m_portSwitch = 0;
			
		// start our spider loop
		//startSpidering( );
		
		// wakeup wrapper every X ms
		g_loop.registerSleepCallback ( m_wait , NULL , 
					       sleepWrapper );
	}
	// this print to print how many docs have been processed
	m_print=false;
	m_startTime=gettimeofdayInMilliseconds();
	m_totalDone=0;
	// . now start g_loops main interrupt handling loop
	// . it should block forever
	// . when it gets a signal it dispatches to a server or db to handle it
	if ( ! g_loop.runLoop()    ) {
		log("blaster::runLoop failed" ); return; }
	// dummy return (0-->normal exit status for the shell)
	return;
}
// . callback for the page fetched from the second server for one url
// . "state" is the StateBD shared by all url fetches of one query and "s" is
//   the socket holding both the request we sent and the reply we read
// . logs whether the url (recovered from the request line in s->m_sendBuf)
//   appears among the links of the reply
// . once every sent url fetch has been received, decrements m_launched and
//   frees the StateBD
void Blaster::gotDoc4 ( void *state, TcpSocket *s){
	StateBD *st=(StateBD *)state;
	st->m_numUrlDocsReceived++;
	if (!s) {
		//Shouldn't happen, but still putting a checkpoint
		log (LOG_WARN,"blaster: Got a null s in gotDoc4."
		     "Happened because ip could not be found for gigablast"
		     "server");
		if (st->m_numUrlDocsReceived==st->m_numUrlDocsSent){
			m_launched--;
			// Free stateBD
			freeStateBD(st);
		}
		return;
	}
	// bail if got cut off
	if ( s->m_readOffset == 0 ) {
		log("blasterDiff : lost the Request in gotDoc4");
		if (st->m_numUrlDocsReceived==st->m_numUrlDocsSent){
			m_launched--;
			freeStateBD(st);
		}
		return;
	}
	// split the reply into mime header and content
	char *reply = s->m_readBuf ;
	int32_t  size  = s->m_readOffset;
	HttpMime mime;
	mime.set ( reply , size , NULL );
	char *content    = reply + mime.getMimeLen();
	int32_t  contentLen = size  - mime.getMimeLen();

	// parse the content so we can extract its links; a failure is only
	// logged and we carry on with whatever got set
	Xml xml;
	if (!xml.set(
		     content, 
		     contentLen,
		     false,
		     0,
		     false,
		     TITLEREC_CURRENT_VERSION,
		     true, // setparents
		     0, // niceness
		     CT_XML )){
		log(LOG_WARN,"blaster: Couldn't set XML Class in gotDoc4");
	}
	Links links;
	Url *url=mime.getLocationUrl();
	if (!links.set(0,//siterec xml
		       &xml,
		       url,
		       false,
		       NULL,
		       TITLEREC_CURRENT_VERSION,
		       0,
		       false,
		       NULL)){
		log(LOG_WARN, "blaster: Coudn't set Links class in gotDoc4");
	}
	// in verbose mode log every link that is not one of the well known
	// search engine / archive links
	for (int32_t i=0;i<links.getNumLinks();i++){
		char *ss=links.getLink(i);
		// This page *should* always be a gigablast page. So not adding
		// checks for msn or yahoo or google page.
		if(strstr(ss,"google.")) continue;
		if(strstr(ss,"cache:")) continue;  //googles cache page
		if(strstr(ss,"gigablast.")) continue;
		//older copies on gigablast
		if(strstr(ss,"web.archive.org")) continue;
		//the rest are from gigablast search
		if(strstr(ss,"search.yahoo.com")) continue;
		if(strstr(ss,"search.msn.com")) continue;
		if(strstr(ss,"s.teoma.com")) continue;
		if(strstr(ss,"search.dmoz.org")) continue;
		if(strstr(ss,"www.answers.com")) continue;
		if (m_verbose)
			log(LOG_WARN,"blaster: Link Present on server2=%s",ss);
	}
	
	// So if one of the links that is returned is the exact url,
	// then we know that the url is present.So get the url from the
	// mime, search for it in the links that are returned.
	char tmp[1024];
	char *sendBuf=s->m_sendBuf;
	// both start out NULL so the "not found" test below never reads an
	// indeterminate pointer
	char *p1=NULL;
	char *p2=NULL;

	// First get the Host, which is the domain. Since socket s is going to
	// be useless after this function, changing m_sendBuf instead of using 
	// more space
	if (sendBuf) p1=strstr(sendBuf,"%3A");
	if(p1){
		p1+=3;
		p2=strstr(p1," HTTP");
		if (p2){
			//Since I do not care about the sendbuf anymore
			*p2='\0';
		}
	}
	if (!p1 || !p2){
		log(LOG_WARN,"blasterdiff: Could not find search link"
		    "from m_sendBuf in gotdoc4");
	}
	else{
		// bounded copy: the request line can be longer than tmp, in
		// which case it is truncated rather than smashing the stack
		snprintf(tmp,sizeof(tmp),"%s",p1);
		bool isFound=false;
		// So now we search for tmp in the links
		for (int32_t i=0;i<links.getNumLinks();i++){
			if(strstr(links.getLink(i),tmp) && 
			   links.getLinkLen(i)==(int)gbstrlen(tmp)){
				isFound=true;
				log(LOG_WARN,"blaster: %s in results1 but not"
				    " in results2 for query %s but does exist"
				    " in server2",tmp,st->m_u1);//->getQuery()
			}
		}
		if (!isFound)
			log(LOG_WARN,"blaster: %s in results1 but not"
			    " in results2 for query %s and does NOT exist"
			    " in server2",tmp,st->m_u1); // ->getQuery()
	}
	

	// last outstanding url fetch for this query? then clean up
	if (st->m_numUrlDocsReceived==st->m_numUrlDocsSent){
		m_launched--;
		// Free stateBD
		freeStateBD(st);
	}
	return;
}
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . make a web page displaying the config of this host
// . call g_httpServer.sendDynamicPage() to send it
// . "s" is the socket to reply on and "r" the parsed http request
// . cgi parms read here: q, c, pwd, numRecs, ud, ut, uc, ub, add, del, t, d
//   and score
bool sendPageIndexdb ( TcpSocket *s , HttpRequest *r ) {
	// . get fields from cgi field of the requested url
	// . get the search query
	long  queryLen = 0;
	char *query = r->getString ( "q" , &queryLen , NULL /*default*/);
	// ensure query not too big
	if ( queryLen >= MAX_QUERY_LEN ) { 
		g_errno = EQUERYTOOBIG; 
		return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
	}
	// get the collection
	long  collLen = 0;
	char *coll    = r->getString("c",&collLen);
	if ( ! coll || ! coll[0] ) {
		//coll    = g_conf.m_defaultColl;
		coll = g_conf.getDefaultColl( r->getHost(), r->getHostLen() );
		collLen = gbstrlen(coll);
	}
	// ensure collection not too big
	if ( collLen >= MAX_COLL_LEN ) { 
		g_errno = ECOLLTOOBIG; 
		return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno)); 
	}
	CollectionRec *cr = g_collectiondb.getRec(coll);
	if ( ! cr ) {
		// make sure the reply carries a real error even if the
		// lookup did not set one
		if ( ! g_errno ) g_errno = ENOCOLLREC;
		return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno)); 
	}
	// make a state
	State10 *st ;
	try { st = new (State10); }
	catch ( ... ) {
		g_errno = ENOMEM;
		// sizeof() is a size_t; cast it so it matches the %i
		log("PageIndexdb: new(%i): %s", 
		    (int)sizeof(State10),mstrerror(g_errno));
		return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));}
	mnew ( st , sizeof(State10) , "PageIndexdb" );
	// password, too
	long pwdLen = 0 ;
	char *pwd = r->getString ( "pwd" , &pwdLen );
	if ( pwdLen > 31 ) pwdLen = 31;
	if ( pwdLen > 0 ) strncpy ( st->m_pwd , pwd , pwdLen );
	st->m_pwd[pwdLen]='\0';
	// get # of records to retreive from IndexList
	st->m_numRecs  = r->getLong ( "numRecs" , 100 );
	// use disk, tree, or cache?
	st->m_useDisk  = r->getLong ("ud" , 0 );
	st->m_useTree  = r->getLong ("ut" , 0 );
	st->m_useCache = r->getLong ("uc" , 0 );
	st->m_useDatedb= r->getLong ("ub" , 0 );
	st->m_add      = r->getLong ("add", 0 );
	st->m_del      = r->getLong ("del", 0 );
	// get the termId, if any, from the cgi vars
	st->m_termId = r->getLongLong ("t", 0LL ) ;
	// get docid and score
	st->m_docId  = r->getLongLong ("d", 0LL );
	st->m_score  = r->getLong ("score", 0 );
	// . copy query/collection
	// . "query" is NULL when no "q" was given, so do not hand it to
	//   memcpy in that case
	if ( query && queryLen > 0 )
		memcpy ( st->m_query , query , queryLen );
	st->m_queryLen = queryLen;
	st->m_query [ queryLen ] ='\0';
	// NOTE(review): m_coll only references "coll", it does not copy it.
	// presumably that buffer outlives the state -- confirm
	st->m_coll = coll;
	st->m_collnum = cr->m_collnum;
	// save the TcpSocket
	st->m_socket = s;
	// and if the request is local/internal or not
	st->m_isAdmin = g_conf.isCollAdmin ( s , r );
	st->m_isLocal = r->isLocal();
	st->m_r.copy ( r );
	// . check for add/delete request
	if ( st->m_add || st->m_del ) {
		key_t startKey = g_indexdb.makeStartKey ( st->m_termId );
		key_t endKey   = g_indexdb.makeEndKey   ( st->m_termId );
		// construct the key to add/delete
		st->m_key = g_indexdb.makeKey ( st->m_termId,
						st->m_score ,
						st->m_docId ,
						st->m_del   );
		// make an RdbList out of the key
		st->m_keyList.set ( (char*)&st->m_key,
				    sizeof(key_t),
				    (char*)&st->m_key,
				    sizeof(key_t),
				    startKey,
				    endKey,
				    0,
				    false,
				    true  );
		log ( LOG_INFO, "build: adding indexdb key to indexdb: "
				"%lx %llx", st->m_key.n1, st->m_key.n0 );
		// call msg1 to add/delete key
		if ( ! st->m_msg1.addList ( &st->m_keyList,
					     RDB_INDEXDB,
					     st->m_collnum,
					     st,
					     addedKeyWrapper,
					     false,
					     MAX_NICENESS ) )
			return false;
		// continue to page if no block
		return gotIndexList ( st );
	}

	// no query? then go straight to the page
	if ( ! st->m_query[0] ) return gotIndexList(st);

	// . set query class
	// . a boolFlag of 0 means query is not boolean
	Query q;
	q.set2 ( query , langUnknown , true ); // 0 = boolFlag, not boolean!
	// reset 
	st->m_msg36.m_termFreq = 0LL;
	// if query was provided, use that, otherwise use termId
	if ( q.getNumTerms() > 0 ) st->m_termId = q.getTermId(0);
	// skip if nothing
	else return gotTermFreq ( st );
	// get the termfreq of this term!
	if ( ! st->m_msg36.getTermFreq ( st->m_collnum ,
					 0 , 
					 st->m_termId,
					 st ,
					 gotTermFreqWrapper ) ) return false;
	// otherwise, we didn't block
	return gotTermFreq ( st );
}
예제 #13
0
// . map a mime content-type string like "text/html" to one of the CT_* codes
// . the comparison ignores case and any spaces at the end of "s"
// . returns CT_UNKNOWN for a NULL, empty or unrecognized string
// . any unrecognized "text/..." subtype maps to CT_TEXT and any unrecognized
//   "image/..." subtype maps to CT_IMAGE
int32_t getContentTypeFromStr ( const char *s ) {

	// nothing to look at? previously an empty string made us read s[-1]
	if ( ! s ) return CT_UNKNOWN;

	int32_t slen = gbstrlen(s);
	if ( slen <= 0 ) return CT_UNKNOWN;

	// . trim off spaces at the end
	// . we work on a copy, which holds at most 63 chars of "s"
	char tmp[64];
	if ( s[slen-1] == ' ' ) {
		strncpy(tmp,s,63);
		tmp[63] = '\0';
		int32_t newLen = gbstrlen(tmp);
		s = tmp;
		char *send = tmp + newLen;
		for ( ; send>s && send[-1] == ' '; send-- );
		*send = '\0';
	}

	int32_t ct = CT_UNKNOWN;
	if ( !strncasecmp(s, "text/", 5) ) {
		if ( !strcasecmp(s,"text/html") ) {
			ct = CT_HTML;
		} else if ( !strcasecmp(s,"text/plain" ) ) {
			ct = CT_TEXT;
		} else if ( !strcasecmp(s,"text/xml" ) ) {
			ct = CT_XML;
		} else if ( !strcasecmp(s,"text/txt" ) ) {
			ct = CT_TEXT;
		} else if ( !strcasecmp(s,"text/javascript" ) ) {
			ct = CT_JS;
		} else if ( !strcasecmp(s,"text/x-js" ) ) {
			ct = CT_JS;
		} else if ( !strcasecmp(s,"text/js" ) ) {
			ct = CT_JS;
		} else if ( !strcasecmp(s,"text/css" ) ) {
			ct = CT_CSS;
		} else if ( !strcasecmp(s,"text/x-vcard" ) ) {
			// . semicolon separated list of info, sometimes an element is html
			// . these might have an address in them...
			ct = CT_HTML;
		} else {
			ct = CT_TEXT;
		}
	}
	else if (!strcasecmp(s,"text"                    ) ) ct = CT_TEXT;
	else if (!strcasecmp(s,"txt"                     ) ) ct = CT_TEXT;
	else if (!strcasecmp(s,"application/xml"         ) ) ct = CT_XML;
	// we were not able to spider links on an xhtml doc because
	// this was set to CT_XML, so try CT_HTML
	else if (!strcasecmp(s,"application/xhtml+xml"   ) ) ct = CT_HTML;
	else if (!strcasecmp(s,"application/rss+xml"     ) ) ct = CT_XML;
	else if (!strcasecmp(s,"rss"                     ) ) ct = CT_XML;
	else if (!strcasecmp(s,"application/rdf+xml"     ) ) ct = CT_XML;
	else if (!strcasecmp(s,"application/atom+xml"    ) ) ct = CT_XML;
	else if (!strcasecmp(s,"atom+xml"                ) ) ct = CT_XML;
	else if (!strcasecmp(s,"application/pdf"         ) ) ct = CT_PDF;
	else if (!strcasecmp(s,"application/msword"      ) ) ct = CT_DOC;
	else if (!strcasecmp(s,"application/vnd.ms-excel") ) ct = CT_XLS;
	else if (!strcasecmp(s,"application/vnd.ms-powerpoint")) ct = CT_PPT;
	else if (!strcasecmp(s,"application/mspowerpoint") ) ct = CT_PPT;
	else if (!strcasecmp(s,"application/postscript"  ) ) ct = CT_PS;
	else if (!strcasecmp(s,"application/warc"        ) ) ct = CT_WARC;
	else if (!strcasecmp(s,"application/arc"         ) ) ct = CT_ARC;
	else if (!strcasecmp(s,"image/gif"               ) ) ct = CT_GIF;
	else if (!strcasecmp(s,"image/jpeg"              ) ) ct = CT_JPG;
	else if (!strcasecmp(s,"image/png"               ) ) ct = CT_PNG;
	else if (!strcasecmp(s,"image/tiff"              ) ) ct = CT_TIFF;
	else if (!strncasecmp(s,"image/",6               ) ) ct = CT_IMAGE;
	else if (!strcasecmp(s,"application/javascript"  ) ) ct = CT_JS;
	else if (!strcasecmp(s,"application/x-javascript") ) ct = CT_JS;
	else if (!strcasecmp(s,"application/x-gzip"      ) ) ct = CT_GZ;
	else if (!strcasecmp(s,"application/json"        ) ) ct = CT_JSON;
	// facebook.com:
	else if (!strcasecmp(s,"application/vnd.wap.xhtml+xml") ) ct =CT_HTML;
	// the types below are listed explicitly but stay CT_UNKNOWN
	else if (!strcasecmp(s,"binary/octet-stream") ) ct = CT_UNKNOWN;
	else if (!strcasecmp(s,"application/octet-stream") ) ct = CT_UNKNOWN;
	else if (!strcasecmp(s,"application/binary" ) ) ct = CT_UNKNOWN;
	else if (!strcasecmp(s,"application/x-tar" ) ) ct = CT_UNKNOWN;
	else if ( !strncmp ( s , "audio/",6)  ) ct = CT_UNKNOWN;

	return ct;
}
bool Collectiondb::load ( bool isDump ) {
	char dname[1024];
	// MDW: sprintf ( dname , "%s/collections/" , g_hostdb.m_dir );
	sprintf ( dname , "%s" , g_hostdb.m_dir );
	Dir d; 
	d.set ( dname );
	if ( ! d.open ()) return log("admin: Could not load collection config "
				     "files.");
	// note it
	log(LOG_INIT,"admin: Loading collection config files.");
	// . scan through all subdirs in the collections dir
	// . they should be like, "coll.main/" and "coll.mycollection/"
	char *f;
	while ( ( f = d.getNextFilename ( "*" ) ) ) {
		// skip if first char not "coll."
		if ( strncmp ( f , "coll." , 5 ) != 0 ) continue;
		// must end on a digit (i.e. coll.main.0)
		if ( ! is_digit (f[gbstrlen(f)-1]) ) continue;
		// point to collection
		char *coll = f + 5;
		// NULL terminate at .
		char *pp = strchr ( coll , '.' );
		if ( ! pp ) continue;
		*pp = '\0';
		// get collnum
		collnum_t collnum = atol ( pp + 1 );
		// add it
		if ( !addRec ( coll , NULL , 0 , false , collnum , isDump ,
			       true ) )
			return false;
	}
	// note it
	log(LOG_INIT,"admin: Loaded data for %li collections. Ranging from "
	    "collection #0 to #%li.",m_numRecsUsed,m_numRecs-1);
	// update the time
	updateTime();
	// don't clean the tree if just dumpin
	if ( isDump ) return true;
	// remove any nodes with illegal collnums
	Rdb *r;
	//r = g_indexdb.getRdb();
	//r->m_tree.cleanTree    ((char **)r->m_bases);
	r = g_posdb.getRdb();
	r->m_tree.cleanTree    ((char **)r->m_bases);
	//r = g_datedb.getRdb();
	//r->m_tree.cleanTree    ((char **)r->m_bases);

	r = g_titledb.getRdb();
	r->m_tree.cleanTree    ((char **)r->m_bases);
	//r = g_revdb.getRdb();
	//r->m_tree.cleanTree    ((char **)r->m_bases);
	//r = g_sectiondb.getRdb();
	//r->m_tree.cleanTree    ((char **)r->m_bases);
	//r = g_checksumdb.getRdb();
	//r->m_tree.cleanTree    ((char **)r->m_bases);
	//r = g_tfndb.getRdb();
	//r->m_tree.cleanTree    ((char **)r->m_bases);
	r = g_spiderdb.getRdb();
	r->m_tree.cleanTree    ((char **)r->m_bases);
	r = g_doledb.getRdb();
	r->m_tree.cleanTree    ((char **)r->m_bases);
	// success
	return true;
}
// . callback for one completed url download
// . "state" is the url string that was fetched; when s_server is set the
//   string is freed here with mfree()
// . "s" is the socket holding the reply
// . logs one line per reply and then tries to launch another download,
//   except when the reply was lost
void gotDocWrapper ( void *state , TcpSocket *s ) {
	// no longer launched
	s_launched--;
	char* url = (char*)state;
	// bail if we got no socket at all or got cut off
	if ( ! s || s->m_readOffset == 0 ) {
		log("lost %s",(char *) state);
		if(s_server) mfree(url, gbstrlen(url)+1, "saved url");
		return;
	}
	// got one more result page
	s_total++;
	// allow printing
	s_printIt = true;
	// get time now
	long long now = gettimeofdayInMilliseconds();
	// get hash
	char *reply = s->m_readBuf ;
	long  size  = s->m_readOffset;
	HttpMime mime;
	mime.set ( reply , size , NULL );
	char *content    = reply + mime.getMimeLen();
	long  contentLen = size  - mime.getMimeLen();
	long status      = mime.getHttpStatus();
	unsigned long h = hash32 ( content , contentLen );
	char *p = mime.getMime();
	char *pend = p + mime.getMimeLen();
	char message[256];
	long mlen = 0;

	// parse status message out of response

	// HTTP/1.0
	while ( p < pend && !is_space(*p) ) p++;
	// skip space
	while ( p < pend &&  is_space(*p) ) p++;
	// . copy to end of line
	// . p must advance here, otherwise the same byte is stored over
	//   and over until message[] is full
	while (p < pend && mlen < 255 && *p != '\r' && *p != '\n'){
		message[mlen++] = *p++;
	}
	message[mlen] = '\0';
	// NOTE: message[] is not part of the log lines below

	// log msg
	if ( g_errno ) 
		logf(LOG_INFO,"blaster: got doc (status=%li) (%li) (%lims) %s : "
		     "%s", status,
		      s->m_readOffset      , 
		      (long)(now - s->m_startTime) , 
		      (char *)state        , 
		      mstrerror(g_errno)   );
	else
		logf(LOG_INFO,"blaster: got doc (status=%li) (%li) (%lims) "
		     "(hash=%lx) %s", status,
		      s->m_readOffset      , 
		      (long)(now - s->m_startTime) , 
		      h ,
		      (char *)state        );

	if(s_server) mfree(url, gbstrlen(url)+1, "saved url");
	// try to launch another
	startSpidering();
}
예제 #16
0
// . add the list in "listData"/"listSize" to shard "shardNum"
// . if the shard is our own we first try to add the list to the local rdb
//   directly, and only include ourselves in the multicast if that failed
// . return false if blocked, true otherwise
// . sets g_errno on error
bool Msg1::sendData ( unsigned long shardNum, char *listData , long listSize) {
	// bail if this is an interface machine, don't write to the main
	if ( g_conf.m_interfaceMachine ) return true;
	// return true if no data
	if ( listSize == 0 ) return true;
	// . NOTE: a shortcut that did only a local addList when the shard had
	//         a single host used to live here. it was removed until
	//         ETRYAGAIN errors are handled properly by waiting and
	//         retrying
	// if the data is being added to our group, don't send ourselves
	// a msg1, if we can add it right now
	bool sendToSelf = true;
	if ( shardNum == getMyShardNum() &&
	     ! g_conf.m_interfaceMachine ) {
		// get the rdb to which it belongs, use Msg0::getRdb()
		Rdb *rdb = getRdbFromId ( (char) m_rdbId );
		if ( ! rdb ) goto skip;
		// key size
		long ks = getKeySizeFromRdbId ( m_rdbId );
		// reset g_errno
		g_errno = 0;
		// . make a list from this data
		// . TODO: embed the rdbId in the msgtype or something...
		RdbList list;
		// set the list
		list.set ( listData ,
			   listSize ,
			   listData ,
			   listSize ,
			   rdb->getFixedDataSize() ,
			   false                   ,  // ownData?
			   rdb->useHalfKeys()      ,
			   ks                      ); 
		// this returns false and sets g_errno on error
		rdb->addList ( m_coll , &list , m_niceness );
		// if no error, no need to use a Msg1 UdpSlot for ourselves
		if ( ! g_errno ) sendToSelf = false;
		else {
			log("rdb: msg1 had error: %s",mstrerror(g_errno));
			// do not return here, that was messing up generate
			// catdb's huge rdblist add
		}
		
 		QUICKPOLL(m_niceness);
		// if we're the only one in the group, bail, we're done
		if ( ! sendToSelf &&
		     g_hostdb.getNumHostsPerShard() == 1 ) return true;
	}
skip:
	long collLen = gbstrlen ( m_coll );
	// . sanity check
	// . do this BEFORE allocating the request, otherwise the request
	//   buffer is leaked when we bail
	if ( collLen <= 0 ) {
		log(LOG_LOGIC,"net: No collection specified for list add.");
		//char *xx = NULL; *xx = 0;
		g_errno = ENOCOLLREC;
		return true;
	}
	// . make an add record request to multicast to a bunch of machines
	// . calculate total size of the record
	// . 1 byte for rdbId, 1 byte for flags,
	//   then collection NULL terminated, then list
	long requestLen = 1 + 1 + collLen + 1 + listSize ;
	// . make the request
	// . we return true here if the alloc failed
	char *request = (char *) mmalloc ( requestLen ,"Msg1" );
	if ( ! request ) return true;
	char *p = request;
	// store the rdbId at top of request
	*p++ = m_rdbId;
	// then the flags
	*p = 0;
	if ( m_injecting ) *p |= 0x80;
	p++;
	// then collection name
	memcpy ( p , m_coll , collLen );
	p += collLen;
	*p++ = '\0';
	// store the list after coll
	memcpy ( p , listData , listSize );
 	QUICKPOLL(m_niceness);
	// . multicast to all hosts in group "groupId"
	// . multicast::send() returns false and sets g_errno on error
	// . we return false if we block, true otherwise
	// . will loop indefinitely if a host in this group is down
	key_t k; k.setMin();
	if ( m_mcast.send ( request    , // sets mcast->m_msg    to this
			    requestLen , // sets mcast->m_msgLen to this
			    0x01       , // msgType for add rdb record
			    true       , // does multicast own msg?
			    shardNum   , // group to send to (groupKey)
			    true       , // send to whole group?
			    0          , // key is useless for us
			    this       , // state data
			    NULL       , // state data
			    gotReplyWrapper1 ,
			    60         , // timeout in secs
			    m_niceness , // niceness 
			    false    , // realtime
			    -1    , // first host to try
			    NULL  , // replyBuf        = NULL ,
			    0     , // replyBufMaxSize = 0 ,
			    true  , // freeReplyBuf    = true ,
			    false , // doDiskLoadBalancing = false ,
			    -1    , // no max cache age limit
			    //(key_t)0 , // cache key
			    k    , // cache key
			    RDB_NONE , // bogus rdbId
			    -1    , // unknown minRecSizes read size
			    sendToSelf ))
		return false;

 	QUICKPOLL(m_niceness);
	// g_errno should be set
	log("net: Had error when sending request to add data to %s in shard "
	    "#%lu: %s.", getDbnameFromId(m_rdbId),shardNum,mstrerror(g_errno));
	return true;	
}
int main ( int argc , char *argv[] ) {
	// let's ensure our core file can dump
	struct rlimit lim;
	lim.rlim_cur = lim.rlim_max = RLIM_INFINITY;
	if ( setrlimit(RLIMIT_CORE,&lim) )
		log("blaster::setrlimit: %s", mstrerror(errno) );

	g_conf.m_maxMem = 500000000;

	// init our table for doing zobrist hashing
	if ( ! hashinit() ) {
		log("blaster::hashinit failed" ); return 1; }

	// init the memory class after conf since it gets maxMem from Conf
	//if ( ! g_mem.init ( 20000000 ) ) {
	//	log("blaster::Mem init failed" ); return 1; }
	g_mem.m_maxMem = 200000000;
	// start up log file
	if ( ! g_log.init( "/tmp/blasterLog" )        ) {
		log("blaster::Log open /tmp/blasterLog failed" ); return 1; }

	// get dns ip from /etc/resolv.conf
	g_conf.m_dnsIps[0] = 0;
	FILE *fd = fopen ( "/etc/resolv.conf" , "r" );
	if ( ! fd ) {
		log("blaster::fopen: /etc/resolve.conf %s",
		    mstrerror(errno)); return 1; }

	char tmp[1024];
	while ( fgets ( tmp , 1024 , fd ) ) {
		// tmp buf ptr
		char *p = tmp;
		// skip comments
		if ( *p == '#' ) continue;
		// skip nameserver name
		if ( ! isdigit(*p) ) while ( ! isspace ( *p ) ) p++ ;
		// skip spaces
		while ( isspace ( *p ) ) p++;
		// if this is not a digit, continue
		if ( ! isdigit(*p) ) continue;
		// get ip
		g_conf.m_dnsIps[0] = atoip ( p , gbstrlen(p) );
		// done
		break;
	}
	fclose ( fd );


	// if no dns server found, bail
	if ( g_conf.m_dnsIps[0] == 0 ) {
		log("blaster:: no dns ip found in /etc/resolv.conf");return 1;}

	// hack # of dns servers
	g_conf.m_numDns         = 1;
	g_conf.m_dnsPorts[0]    = 53;
	//g_conf.m_dnsIps  [0]    = atoip ( "192.168.0.1", 11 );
	//g_conf.m_dnsClientPort  = 9909;
	g_conf.m_dnsMaxCacheMem = 1024*10;
	// hack http server port to -1 (none)
	//g_conf.m_httpPort           = 0;
	g_conf.m_httpMaxSockets     = 200;
	//g_conf.m_httpMaxReadBufSize = 102*1024*1024;
	g_conf.m_httpMaxSendBufSize = 16*1024;
	//g_conf.m_httpMaxDownloadSockets = 200;

	if ( argc != 4 && argc != 5 && argc !=6 ) {
	printUsage:
		log("USAGE: blaster [fileOfUrls | -r<num random words><server>] [maxNumThreads] [wait in ms] " 
		    "<lines to skip> <string to append>");
		log("USAGE: examples:");
		log("USAGE:  ./blaster queries.fromlog 10 1");
		log("USAGE:  ./blaster -r3http://www.gigablast.com/index.php?q= 1 100\n");
		return 1; 
	}


	// init the loop
	if ( ! g_loop.init() ) {
		log("blaster::Loop init failed" ); return 1; }
	// . then dns client
	// . server should listen to a socket and register with g_loop
	if ( ! g_dns.init(6000)        ) {
		log("blaster::Dns client init failed" ); return 1; }
	// . then webserver
	// . server should listen to a socket and register with g_loop
	for(long i = 0; i < 50; i++) {
		if ( ! g_httpServer.init( 8333 + i, 9334+i ) ) {
			log("blaster::HttpServer init failed" ); 
			//return 1; 
		}
		else break;
	}
	// set File class
	char *fname = argv[1];
	long fnameLen = gbstrlen(fname);
	long fileSize = 0;
	long bufSize = 0;
	char *buf = NULL;
	long  n = 0;

	//should we generate random queries?
	if(fnameLen > 2 && fname[0] == '-' && fname[1] == 'r') {
		char *p = fname + 2;
		s_numRandWords = atoi( p );
		while(is_digit(*p)) p++;
		getWords();
		
		if(*p == '\0') goto printUsage;
		s_server = p;
		log("blaster server is %s", s_server);
		//		char x[1024];
		// 		while(1) {
		// 			long l = getRandomWords(x, x + 1024, s_numRandWords);
		// 			*(x + l) = '\0';
		// 			log("blaster: %s", x);
		// 		}
		//		exit(1);
	}
	else { //it is a real file
		File f;
		f.set ( fname );

		// open file
		if ( ! f.open ( O_RDONLY ) ) {
			log("blaster::open: %s %s",fname,mstrerror(g_errno)); 
			return 1; 
		}

		// get file size
		fileSize = f.getFileSize() ;

		// store a \0 at the end
		bufSize = fileSize + 1;

		// make buffer to hold all
		buf = (char *) mmalloc ( bufSize , "blaster" );
		if ( ! buf) {log("blaster::mmalloc: %s",mstrerror(errno));return 1;}

		//char *bufEnd = buf + bufSize;

		// set s_p
		s_p    = buf;
		s_pend = buf + bufSize - 1;

		// read em all in
		if ( ! f.read ( buf , fileSize , 0 ) ) {
			log("blaster::read: %s %s",fname,mstrerror(g_errno));return 1;}

		// change \n to \0
		//char *p = buf;
		for ( long i = 0 ; i < bufSize ; i++ ) {
			if ( buf[i] != '\n' ) continue;
			buf[i] = '\0';
			n++;
		}

		f.close();
	}
	// log a msg
	log(LOG_INIT,"blaster: read %li urls into memory", n );

	long linesToSkip = 0;
	if ( argc >=  5 ) {
		linesToSkip = atoi ( argv[4] );
		log (LOG_INIT,"blaster: skipping %li urls",linesToSkip);
	}
	for ( long i = 0; i < linesToSkip && s_p < s_pend; i++ )
		s_p += gbstrlen(s_p) + 1;
	
	if ( argc == 6 ) {
		long len  = gbstrlen ( argv[5] );
		if ( len > 512 )
			len = 512;
		strncpy ( s_append , argv[5] , gbstrlen (argv[5]) );
	}
	else
		s_append[0] = '\0';

	// get min time bewteen each spider in milliseconds
	s_wait = atoi( argv[3] );

	// # of threads
	s_maxNumThreads = 1;
	s_maxNumThreads = atoi ( argv[2] );

	s_portSwitch = 0;
	//if ( argc == 4 ) s_portSwitch = 1;
	//else             s_portSwitch = 0;

	// start our spider loop
	//startSpidering( );

	// wakeup wrapper every X ms
	g_loop.registerSleepCallback ( s_wait , NULL , sleepWrapper );

	//msg10.addUrls ( uu , gbstrlen(uu)+1, NULL,0,time(0),4,true,NULL,NULL);
	// . now start g_loops main interrupt handling loop
	// . it should block forever
	// . when it gets a signal it dispatches to a server or db to handle it
	if ( ! g_loop.runLoop()    ) {
		log("blaster::runLoop failed" ); return 1; }
	// dummy return (0-->normal exit status for the shell)
	return 0;
}
예제 #18
0
// . this is registered in Msg1::set() to handle add rdb record msgs
// . request layout as parsed here: 1 byte rdbId, 1 byte flags, the
//   collection name with its terminating \0, then the list data
// . replies with an error if the request is too short, the rdbId is unknown
//   or the collection name is not terminated inside the request
// . otherwise adds the list to the rdb and hands off to addedList()
// . seems like we should always send back a reply so we don't leave the
//   requester's slot hanging, unless he can kill it after transmit success???
// . TODO: need we send a reply back on success????
// . NOTE: Must always call g_udpServer::sendReply or sendErrorReply() so
//   read/send bufs can be freed
// . "netnice" is not used
void handleRequest1 ( UdpSlot *slot , long netnice ) {

	// extract what we read
	char *readBuf     = slot->m_readBuf;
	long  readBufSize = slot->m_readBufSize;
	long niceness = slot->m_niceness;

	// select udp server based on niceness
	UdpServer *us = &g_udpServer;
	// must at least have an rdbId
	if ( readBufSize <= 4 ) {
		g_errno = EREQUESTTOOSHORT;
		us->sendErrorReply ( slot , g_errno );
		return;
	}
	char *p    = readBuf;
	char *pend = readBuf + readBufSize;
	// extract rdbId
	char rdbId = *p++;
	// get the rdb to which it belongs, use Msg0::getRdb()
	Rdb *rdb = getRdbFromId ( (char) rdbId );
	if ( ! rdb ) { us->sendErrorReply ( slot, EBADRDBID ); return;}
	// keep track of stats
	rdb->readRequestAdd ( readBufSize );
	// reset g_errno
	g_errno = 0;
	// . skip the flags byte
	// . its 0x80 bit means "injecting", but the only code that looked
	//   at it (the tfndb update) is disabled
	p++;
	// then collection
	char *coll = p;
	// . find the \0 ending the collection name, but never look past
	//   the end of what we read
	// . this buffer came off the network, so it may be truncated or
	//   malformed
	while ( p < pend && *p ) p++;
	if ( p >= pend ) {
		g_errno = EREQUESTTOOSHORT;
		us->sendErrorReply ( slot , g_errno );
		return;
	}
	// skip the \0
	p++;
	// . make a list from what is left of the request
	// . TODO: embed the rdbId in the msgtype or something...
	RdbList list;
	// set the list
	list.set ( p        ,
		   pend - p ,
		   p        ,
		   pend - p ,
		   rdb->getFixedDataSize() ,
		   false                   ,  // ownData?
		   rdb->useHalfKeys()      ,
		   rdb->getKeySize ()      ); 
	// this returns false and sets g_errno on error
	rdb->addList ( coll , &list , niceness);
	// retry on some errors
	addedList ( slot , rdb );
}
// . called once g_wiki.m_randPhrase holds a random phrase
// . picks one of the enabled query types at random (0=web, 1=news, 2=blog),
//   builds the google search url for the phrase and indexes that url
//   through m_xd
// . returns without doing anything if the collection rec is missing, if no
//   query type is enabled for it, or if we are in repair mode
// . calls indexedDoc() itself if m_xd.indexDoc() did not block
void Scraper::gotPhrase ( ) {
	// error getting random phrase? we only log it and carry on
	if ( g_errno ) log("scraper: got error getting random phrase: %s",
			   mstrerror(g_errno));

	CollectionRec *cr = g_collectiondb.getRec ( m_coll );
	// collection might be gone
	if ( ! cr ) {
		log("scraper: could not get collection rec. not scraping.");
		return;
	}

	// if nothing is enabled the pick loop below would never terminate
	if ( ! cr->m_scrapingEnabledWeb  &&
	     ! cr->m_scrapingEnabledNews &&
	     ! cr->m_scrapingEnabledBlogs ) return;

	// . what type of query should we do?
	// . keep picking until we hit one that is enabled
	do {
		m_qtype = rand() % 3;
	} while ( ( m_qtype == 0 && ! cr->m_scrapingEnabledWeb   ) ||
		  ( m_qtype == 1 && ! cr->m_scrapingEnabledNews  ) ||
		  ( m_qtype == 2 && ! cr->m_scrapingEnabledBlogs ) );

	// scraping is off when repairing obviously
	if ( g_repairMode ) return;

	// get it
	char *s = g_wiki.m_randPhrase;
	// convert _'s to spaces
	for ( char *p = s ; *p ; p++ )
		if ( *p == '_' ) *p = ' ';
	// . url encode the random phrase
	// . truncate it to 200 bytes to keep things sane
	// . Wiki::doneReadingWiki() keeps it below 128 i think anyway
	char qe[400];
	urlEncode(qe, 200, s , gbstrlen(s) );
	char *end = qe + 390;

	// half the time append a random word from dictionary so that we 
	// discovery those tail-end sites better
	if ( m_qtype == 0 && (rand() % 2) ) { 
		// point into it for appending
		char *p = qe + gbstrlen(qe);
		// add a space, url encoded
		*p++ = '+';
		// append a random word to it from dictionary
		char *rw = g_speller.getRandomWord();
		// append that in
		urlEncode( p , end - p - 1 , rw , gbstrlen(rw) );
	}

	// make a query to scrape
	char buf[2048];

	// printf format of the url to scrape, with a %s for the query
	const char *uf ;
	if      ( m_qtype == 0 )
		uf="http://www.google.com/search?num=50&q=%s&scoring=d"
			"&filter=0";
	// google news query? sort by date.
	else if ( m_qtype == 1 )
		uf="http://news.google.com/news?num=50&q=%s&sort=n"
			"&filter=0";
	// google blog query?
	else if ( m_qtype == 2 ) 
		uf="http://www.google.com/blogsearch?num=50&q=%s&scoring=d"
			"&filter=0";
	// sanity check
	else { char *xx=NULL;*xx=0; }

	// make the url we will download
	snprintf ( buf , sizeof(buf) , uf , qe );

	SpiderRequest sreq;
	// start from a clean request, like Msg7::scrapeQuery() does
	sreq.reset();
	// . set the SpiderRequest
	// . use the url we just built, NOT the format string "uf"
	strcpy(sreq.m_url, buf);
	// . tell it to only add the hosts of each outlink for now!
	// . that will be passed on to when XmlDoc calls Links::set() i guess
	// . xd will not reschedule the scraped url into spiderdb either
	sreq.m_isScraping = 1;
	sreq.m_fakeFirstIp = 1;
	// the fake first ip is a hash of the url, but never 0 or -1
	int32_t firstIp = hash32n(buf);
	if ( firstIp == 0 || firstIp == -1 ) firstIp = 1;
	sreq.m_firstIp = firstIp;
	// parent docid is 0
	sreq.setKey(firstIp,0LL,false);

	// forceDEl = false, niceness = 0
	m_xd.set4 ( &sreq , NULL , m_coll , NULL , 0 ); 

	//m_xd.m_isScraping = true;

	// download without throttling
	//m_xd.m_throttleDownload = false;

	// disregard this
	m_xd.m_useRobotsTxt = false;

	// call this when index completes
	m_xd.setCallback ( NULL , indexedDocWrapper );

	// assume it blocked
	m_numSent++;

	// scraper is special
	m_xd.m_usePosdb     = false;
	//m_xd.m_useDatedb    = false;
	m_xd.m_useClusterdb = false;
	m_xd.m_useLinkdb    = false;
	m_xd.m_useSpiderdb  = true; // only this one i guess
	m_xd.m_useTitledb   = false;
	m_xd.m_useTagdb     = false;
	m_xd.m_usePlacedb   = false;
	//m_xd.m_useTimedb    = false;
	//m_xd.m_useSectiondb = false;
	//m_xd.m_useRevdb     = false;

	// . return false if this blocks
	// . will add the spider recs to spiderdb of the outlinks
	// . will add "ingoogle", etc. tags for each outlink
	if ( ! m_xd.indexDoc ( ) ) return ;

	// we didn't block
	indexedDoc ( );
}
// . scrapes a search engine results page for ir->ptr_queryToScrape and
//   injects the links found on that page
// . every call advances m_round: round 1 queries google, any other round
//   queries bing
// . returns false if m_xd->injectLinks() returned false (presumably it
//   blocked and will call doneInjectingLinksWrapper with "this" as state)
// . returns true otherwise, with g_errno set if there was an error
bool Msg7::scrapeQuery ( ) {

	// advance round now in case we return early
	m_round++;

	InjectionRequest *ir = &m_injectionRequest;

	// the caller must have supplied a query, core if it did not
	char *qts = ir->ptr_queryToScrape;
	if ( ! qts ) { char *xx=NULL;*xx=0; }

	if ( gbstrlen(qts) > 500 ) {
		g_errno = EQUERYTOOBIG;
		return true;
	}

	// first encode the query
	SafeBuf ebuf;
	ebuf.urlEncode ( qts );
	ebuf.nullTerm();

	// . printf-style url format with a single %s for the encoded query
	// . startpage.com and a google custom search engine were tried here
	//   in the past
	const char *uf;
	if ( m_round == 1 )
		uf = "http://www.google.com/search?num=20&"
			"q=%s&scoring=d&filter=0";
	else
		uf = "http://www.bing.com/search?q=%s";

	// . make the url we will download
	// . snprintf() never writes past ubuf and its return value tells us
	//   how long the full url is
	char ubuf[2048];
	int ulen = snprintf ( ubuf , sizeof(ubuf) , uf , ebuf.getBufStart() );

	SpiderRequest sreq;
	sreq.reset();

	// . the encoded query can be longer than the raw query, so make sure
	//   the url fits in ubuf and in the SpiderRequest before we strcpy()
	//   it, otherwise we would write past the end of sreq.m_url
	// . sizeof() relies on sreq.m_url being a char array, which the
	//   strcpy() into a freshly reset request below relies on as well
	if ( ulen < 0 ||
	     ulen >= (int)sizeof(ubuf) ||
	     ulen >= (int)sizeof(sreq.m_url) ) {
		g_errno = EQUERYTOOBIG;
		log("inject: scrape url for query is too big");
		return true;
	}

	// log it
	log("inject: SCRAPING %s",ubuf);

	// set the SpiderRequest
	strcpy(sreq.m_url, ubuf);
	// . tell it to only add the hosts of each outlink for now!
	// . that will be passed on to when XmlDoc calls Links::set() i guess
	// . xd will not reschedule the scraped url into spiderdb either
	sreq.m_isScraping = 1;
	// the first ip is not a real ip, just a hash of the url
	sreq.m_fakeFirstIp = 1;
	int32_t firstIp = hash32n(ubuf);
	// 0 and -1 are never used as the fake ip, map both to 1
	if ( firstIp == 0 || firstIp == -1 ) firstIp = 1;
	sreq.m_firstIp = firstIp;
	// parent docid is 0
	sreq.setKey(firstIp,0LL,false);

	// NOTE(review): cr is dereferenced below without a NULL check.
	// confirm that callers guarantee ir->m_collnum is a valid collection,
	// or return here with the appropriate g_errno when it is not.
	CollectionRec *cr = g_collectiondb.getRec ( ir->m_collnum );

	// need to make a new one now
	XmlDoc *xd;
	try { xd = new (XmlDoc); }
	catch ( ... ) { 
		g_errno = ENOMEM;
		log("PageInject: scrape failed: new(%i): %s", 
		    (int)sizeof(XmlDoc),mstrerror(g_errno));
		return true;
	}
	mnew ( xd, sizeof(XmlDoc) , "PageInject" );

	// save it
	m_xd = xd;

	// forceDEl = false, niceness = 0
	m_xd->set4 ( &sreq , NULL , cr->m_coll , NULL , 0 ); 

	// disregard this
	m_xd->m_useRobotsTxt = false;

	// this will tell it to index ahrefs first before indexing
	// the doc. but do NOT do this if we are from ahrefs.com
	// ourselves to avoid recursive explosion!!
	if ( m_useAhrefs )
		m_xd->m_useAhrefs = true;

	m_xd->m_reallyInjectLinks = true;

	//
	// rather than just add the links of the page to spiderdb,
	// let's inject them!
	//
	m_xd->setCallback ( this , doneInjectingLinksWrapper );

	// niceness is 0
	m_linkDedupTable.set(4,0,512,NULL,0,false,0,"ldtab2");

	// inject the links found on the results page
	if ( ! m_xd->injectLinks ( &m_linkDedupTable ,
				  NULL,
				  this , 
				  doneInjectingLinksWrapper ) ) 
		return false;

	// injectLinks() completed without returning false
	return true;
}
bool sendPageThesaurus( TcpSocket *s, HttpRequest *r ) {
	SafeBuf p;
	char getBuf[64]; // holds extra values for GET method
	char formBuf[256]; // holds extra values for forms
	snprintf(getBuf, 64, "c=%s", 
		 r->getString("c", 0, ""));
	snprintf(formBuf, 256, 
		 "<input type=hidden name=\"c\" value=\"%s\">",
		 //"<input type=hidden name=\"pwd\" value=\"%s\">",
		 r->getString("c", 0, ""));
	g_pages.printAdminTop( &p, s, r);
	
	if (r->getLong("cancel", 0) != 0) {
		g_thesaurus.cancelRebuild();
		p.safePrintf("<br><br>\n");
		p.safePrintf(
		  "<center><b><font color=#ff0000>"
		  "rebuild canceled"
		  "</font></b></center>");
	}

	if (r->getLong("rebuild", 0) != 0) {
		bool full = r->getLong("full", 0);
		p.safePrintf("<br><br>\n");
		if (g_thesaurus.rebuild(0, full)) {
			p.safePrintf(
			  "<center><b><font color=#ff0000>"
			  "error starting rebuild, check log for details"
			  "</font></b></center>");
		} else {
			p.safePrintf(
			  "<center><b><font color=#ff0000>"
			  "rebuild started"
			  "</font></b></center>");
		}
	}
	
	if (r->getLong("rebuildaff", 0) != 0) {
		bool full = r->getLong("full", 0);
		p.safePrintf("<br><br>\n");
		if (g_thesaurus.rebuildAffinity(0, full)) {
			p.safePrintf(
			  "<center><b><font color=#ff0000>"
			  "error starting rebuild, check log for details"
			  "</font></b></center>");
		} else {
			p.safePrintf(
			  "<center><b><font color=#ff0000>"
			  "rebuild started"
			  "</font></b></center>");
		}
	}

	if (r->getLong("distribute", 0) != 0) {
		char cmd[1024];
		p.safePrintf("<br><br>\n");
		if (g_thesaurus.m_affinityState) {
			p.safePrintf(
			  "<center><b><font color=#ff0000>"
			  "cannot distribute during rebuild"
			  "</font></b></center>");
		} else {
			for ( long i = 0; i < g_hostdb.getNumHosts() ; i++ ) {
				Host *h = g_hostdb.getHost(i);
				snprintf(cmd, 512,
					"rcp -r "
					"./dict/thesaurus.* "
					"%s:%s/dict/ &",
					iptoa(h->m_ip),
					h->m_dir);
				log(LOG_INFO, "admin: %s", cmd);
				system( cmd );
			}
			p.safePrintf(
			  "<center><b><font color=#ff0000>"
			  "data distributed"
			  "</font></b></center>");
		}	
	}

	if (r->getLong("reload", 0) != 0) {
		p.safePrintf("<br><br>\n");
		if (r->getLong("cast", 0) != 0) {
			p.safePrintf(
			  "<center><b><font color=#ff0000>"
			  "reload command broadcast"
			  "</font></b></center>");
		} else if (g_thesaurus.init()) {
			p.safePrintf(
			  "<center><b><font color=#ff0000>"
			  "thesaurus data reloaded"
			  "</font></b></center>");
		} else {
			p.safePrintf(
			  "<center><b><font color=#ff0000>"
			  "error reloading thesaurus data"
			  "</font></b></center>");
		}
	}

	long manualAddLen = 0;
	char *manualAdd = NULL;
	SafeBuf manualAddBuf;
	if ((manualAdd = r->getString("manualadd", &manualAddLen))) {
		trimWhite(manualAdd);
		manualAddLen = gbstrlen(manualAdd);
		File manualFile;
		manualFile.set(g_hostdb.m_dir, "dict/thesaurus-manual.txt");
		if (manualFile.open(O_WRONLY | O_CREAT | O_TRUNC) &&
			(manualFile.write(manualAdd, manualAddLen, 0) ==
			 manualAddLen)) {
			char newl = '\n'; // for write()
			if (manualAdd[manualAddLen-1] != '\n')
				manualFile.write(&newl, 1, manualAddLen);
			p.safePrintf(
			  "<center><b><font color=#ff0000>"
			  "updated manual add file sucessfully"
			  "</font></b></center>");
		} else {
			p.safePrintf(
			  "<center><b><font color=#ff0000>"
			  "error writing manual add file"
			  "</font></b></center>");
		}
	} else {
		char ff[PATH_MAX];
		snprintf(ff, PATH_MAX, "%sdict/thesaurus-manual.txt",
			g_hostdb.m_dir);
		if (manualAddBuf.fillFromFile(ff)) {
			if (*(manualAddBuf.getBuf()-1) != '\n')
				manualAddBuf.pushChar('\n');
			manualAdd = manualAddBuf.getBufStart();
			manualAddLen = manualAddBuf.length();
		}
	}

	long affinityAddLen = 0;
	char *affinityAdd = NULL;
	SafeBuf affinityAddBuf;
	if ((affinityAdd = r->getString("affinityadd", &affinityAddLen))) {
		trimWhite(affinityAdd);
		affinityAddLen = gbstrlen(affinityAdd);
		File affinityFile;
		affinityFile.set(g_hostdb.m_dir, 
			"dict/thesaurus-affinity.txt");
		if (affinityFile.open(O_WRONLY | O_CREAT | O_TRUNC) &&
			(affinityFile.write(affinityAdd, affinityAddLen, 0) ==
			 affinityAddLen)) {
			char newl = '\n'; // for write()
			if (affinityAdd[affinityAddLen-1] != '\n')
				affinityFile.write(&newl, 1, affinityAddLen);
			p.safePrintf(
			  "<center><b><font color=#ff0000>"
			  "updated affinity add file sucessfully"
			  "</font></b></center>");
		} else {
			p.safePrintf(
			  "<center><b><font color=#ff0000>"
			  "error writing affinity add file"
			  "</font></b></center>");
		}
	} else {
		char ff[PATH_MAX];
		snprintf(ff, PATH_MAX, "%sdict/thesaurus-affinity.txt",
			g_hostdb.m_dir);
		if (affinityAddBuf.fillFromFile(ff)) {
			if (*(affinityAddBuf.getBuf()-1) != '\n')
				affinityAddBuf.pushChar('\n');
			affinityAdd = affinityAddBuf.getBufStart();
			affinityAddLen = affinityAddBuf.length();
		}
	}
	

	char *syn = r->getString("synonym");
	long len = 0;
	if (syn) len = gbstrlen(syn);

	if (len) {
		SynonymInfo info;
		bool r = g_thesaurus.getAllInfo(syn, &info, len, SYNBIT_ALL);
		p.safePrintf("<br><br>\n");
		p.safePrintf ( 
		  "<table cellpadding=4 width=100%% bgcolor=#%s border=1>"
		  "<tr>"
		  "<td colspan=2 bgcolor=#%s>"
		  "<center><b>Synonym List (%ld)</b></center>"
		  "</td>"
		  "</tr>\n",
		  LIGHT_BLUE, DARK_BLUE, info.m_numSyns);
		if (r) {
			p.safePrintf("<tr>"
			  "<td align=right><tt>%s</tt></td>"
			  "<td align=left>"
			  "<tt>1.000/%08lX (1.000/%08lX)</tt>"
			  "</td>"
			  "</tr>\n", syn, MAX_AFFINITY, MAX_AFFINITY);
			for (long i = 0; i < info.m_numSyns; i++) {
				// get the reverse affinity as well
				long aff = g_thesaurus.getAffinity(
					info.m_syn[i], syn,
					info.m_len[i], len);
				p.safePrintf( 
				  "<tr>"
				  "<td width=40%% align=right>"
				  "<tt>");
				p.safeMemcpy(info.m_syn[i], info.m_len[i]);
				p.safePrintf("</tt>"
				  "</td>"
				  "<td width=60%% align=left>"
				  "<tt>");
				if (info.m_affinity[i] >= 0) {
					p.safePrintf("%0.3f/%08lX ",
				  	  (float)info.m_affinity[i] 
					  	/ MAX_AFFINITY,
					  info.m_affinity[i]);
				} else {
					p.safePrintf("u ");
				}
				if (aff >= 0) {
					p.safePrintf("(%0.3f/%08lX) ",
					  (float)aff / MAX_AFFINITY, 
					  aff);
				} else {
					p.safePrintf("(u) ");
				}
				p.safePrintf("(%ld) (%ld) (%ld) (%ld) "
					     "(%lld) (%lld)",
				  (long)info.m_type[i], (long)info.m_sort[i],
				  info.m_firstId[i], info.m_lastId[i],
				  info.m_leftSynHash[i], 
				  info.m_rightSynHash[i]);
				for (int j = info.m_firstId[i]; 
					j <= info.m_lastId[i];
					j++) {
					p.safePrintf(" (%lld)",
						info.m_termId[j]);
				}
				p.safePrintf(
				  "</tt>"
				  "</td>"
				  "</tr>\n");
			}
			p.safePrintf("</table>");
		} else {
			p.safePrintf("<tr>"
			  "<td align=center><font color=#FF0000>"
			  "synonym not found: %s"
			  "</font></td>"
			  "</tr>\n",
			  syn);
		}
	}

	p.safePrintf ( "<br><br>\n" );

	p.safePrintf ( 
		  "<table cellpadding=4 width=100%% bgcolor=#%s border=1>"
		  "<tr>"
		  "<td colspan=2 bgcolor=#%s>"
		  "<center><b>Thesaurus Controls"
		  "</b></center></td>"
		  "</tr>\n",
		  LIGHT_BLUE, DARK_BLUE);
	
	p.safePrintf (
		  "<tr>"
		  "<td width=37%%><b>rebuild all data</b><br>"
		  "<font size=1>"
		  "rebuilds synonyms and then begins the rebuild process for "
		  "affinity data; this should only be run on one host, as the "
		  "data is copied when the process is finished; full rebuild "
		  "does not use existing affinity data"
		  "</font>"
		  "</td>"
		  "<td width=12%% bgcolor=#0000ff>"
		  "<center><b><a href=\"/master/thesaurus?rebuild=1&%s\">"
		  "rebuild all data</a> <a href=\"/master/thesaurus?"
		  "rebuild=1&full=1&%s\">(full)</a></b></center>"
		  "</td>"
		  "</tr>\n", getBuf, getBuf);

	p.safePrintf (
		  "<tr>"
		  "<td width=37%%><b>distribute data</b><br>"
		  "<font size=1>"
		  "distributes all thesaurus data to all hosts, this is "
		  "normally done automatically but if there was a problem "
		  "with the copy, this lets you do it manually"
		  "</font>"
		  "</td>"
		  "<td width=12%% bgcolor=#0000ff>"
		  "<center><b><a href=\"/master/thesaurus?distribute=1&%s\">"
		  "distribute data</a></b></center>"
		  "</td>"
		  "</tr>\n", getBuf);

	p.safePrintf (
		  "<tr>"
		  "<td width=37%%><b>reload data</b><br>"
		  "<font size=1>"
		  "reloads the synonyms and affinity table on this host only"
		  "</font>"
		  "</td>"
		  "<td width=12%% bgcolor=#0000ff>"
		  "<center><b>"
		  "<a href=\"/master/thesaurus?reload=1&cast=0&%s\">"
		  "reload data</a></b></center>"
		  "</td>"
		  "</tr>\n", getBuf);

	p.safePrintf (
		  "<tr>"
		  "<td width=37%%><b>reload data (all hosts)</b><br>"
		  "<font size=1>"
		  "reloads the synonyms and affinity table on all hosts"
		  "</font>"
		  "</td>"
		  "<td width=12%% bgcolor=#0000ff>"
		  "<center><b>"
		  "<a href=\"/master/thesaurus?reload=1&cast=1&%s\">"
		  "reload data (all hosts)</a></b></center>"
		  "</td>"
		  "</tr>\n", getBuf);

	p.safePrintf (
		  "<tr>"
		  "<td width=37%%><b>list synonyms</b><br>"
		  "<font size=1>"
		  "enter a word here to list all synonym entries and their "
		  "affinities"
		  "</font>"
		  "</td>"
		  "<td width=12%%>"
		  "<form action=\"/master/thesaurus>\">"
		  "<input type=text name=synonym size=20>"
		  "<input type=submit value=Submit>"
		  "%s"
		  "</form></td>"
		  "</tr>\n", formBuf);
		
	p.safePrintf (
		  "<tr>"
		  "<td colspan=2 bgcolor=#%s>"
		  "<center><b>Affinity Controls"
		  "</b></center></td>"
		  "</tr>\n",
		  DARK_BLUE);

	p.safePrintf (
		  "<tr>"
		  "<td width=37%%><b>cancel running rebuild</b><br>"
		  "<font size=1>"
		  "cancels the rebuild and throws all intermediate data away"
		  "</font>"
		  "</td>"
		  "<td width=12%% bgcolor=#0000ff>"
		  "<center><b><a href=\"/master/thesaurus?cancel=1&%s\">"
		  "cancel running rebuild</a></b></center>"
		  "</td>"
		  "</tr>\n", getBuf);
	
	p.safePrintf (
		  "<tr>"
		  "<td width=37%%><b>rebuild affinity only</b><br>"
		  "<font size=1>"
		  "begins the rebuild process for affinity data, has no "
		  "effect if a rebuild is already in progress; full rebuild "
		  "does not reuse existing affinity data"
		  "</font>"
		  "</td>"
		  "<td width=12%% bgcolor=#0000ff>"
		  "<center><b><a href=\"/master/thesaurus?rebuildaff=1&%s\">"
		  "rebuild affinity</a> <a href=\"/master/thesaurus?"
		  "rebuildaff=1&full=1&%s\">(full)</a></b></center>"
		  "</td>"
		  "</tr>\n", getBuf, getBuf);
	
	p.safePrintf (
		  "<tr>"
		  "<td colspan=2 bgcolor=#%s>"
		  "<center><b>Manual File Controls"
		  "</b></td>"
		  "</tr>\n",
		  DARK_BLUE);

	p.safePrintf (
		  "<tr>"
		  "<td align=center colspan=2>");
	
	p.safePrintf(
		  "<b>manually added pairs</b><br>\n"
		  "<font size=1>place word pairs here that should be linked "
		  "as synonyms, one pair per line, seperated by a pipe '|' "
		  "character, optionally followed by another pipe and a type "
		  "designation; any badly formatted lines will be silently "
		  "ignored</font><br>\n"
		  "<form action=\"/master/thesaurus\" method=post>"
		  "<textarea name=\"manualadd\" rows=20 cols=80>");

	if (manualAdd && manualAddLen) {
		p.htmlEncode(manualAdd, manualAddLen, true);
	}
	
	p.safePrintf (
		  "</textarea><br>"
		  "<input type=submit value=Submit>"
		  "<input type=reset value=Reset>"
		  "%s"
		  "</form></td>"
		  "</tr>\n",
		  formBuf);

	
	p.safePrintf (
		  "<tr>"
		  "<td align=center colspan=2>"
		  "<b>affinity value overrides</b><br>\n"
		  "<font size=1>place word/phrase pairs here that should have "
		  "there affinity values overridden, format is "
		  "\"word1|word2|value\", where value is a floating point, "
		  "integer (either decimal or hex), or the word \"max\"; "
		  "any badly formatted lines will be silently ignored; note "
		  "that these pairs will only work if the thesaurus otherwise "
		  "has an entry for them, so add them to the manual add file "
		  "above if need be</font><br>\n"
		  "<form action=\"/master/thesaurus\" method=post>"
		  "<textarea name=\"affinityadd\" rows=20 cols=80>");

	if (affinityAdd && affinityAddLen) {
		p.htmlEncode(affinityAdd, affinityAddLen, true);
	}
	
	p.safePrintf (
		  "</textarea><br>"
		  "<input type=submit value=Submit>"
		  "<input type=reset value=Reset>"
		  "%s"
		  "</form></td>"
		  "</tr>\n", 
		  formBuf);


	p.safePrintf ( "</table>\n" );
	p.safePrintf ( "<br><br>\n" );

	p.safePrintf (
		  "<table cellpadding=4 width=100%% bgcolor=#%s border=1>"
		  "<tr>"
		  "<td colspan=2 bgcolor=#%s>"
		  "<center><b>Affinity Builder Status"
		  "</b></td>"
		  "</tr>\n",
		  LIGHT_BLUE, DARK_BLUE);

	long long a, b, c, d, e, f, g, h, i, j, k;
	StateAffinity *aff = g_thesaurus.m_affinityState;
	if (!aff) {
		p.safePrintf (
		  "<tr><td colspan=2>"
		  "<center><b>Not running</b></center>"
		  "</td></tr>\n");
		a = b = c = d = e = f = g = h = i = j = k = 0;
	} else {
		a = aff->m_oldTable->getNumSlotsUsed();
		b = aff->m_oldTable->getNumSlotsUsed() - aff->m_n;
		c = aff->m_n;
		d = (gettimeofdayInMilliseconds() - aff->m_time) / 1000;
		if (!d || !(c / d)) { 
			e = 0;
		} else {
			e = b / (c / d);
		}
		f = aff->m_sent;
		g = aff->m_recv;
		h = aff->m_errors;
		i = aff->m_old;
		j = aff->m_cache;
		k = aff->m_hitsTable.getNumSlotsUsed();
	}
	p.safePrintf (
		  "<tr><td><b># of total pairs</b></td>"
		  "<td>%lli</td></tr>\n"
		  "<tr><td><b># of pairs remaining</b></td>"
		  "<td>%lli</td></tr>\n"
		  "<tr><td><b># of pairs processed</b></td>"
		  "<td>%lli</td></tr>\n"
		  "<tr><td><b>elapsed time in seconds</b></td>"
		  "<td>%lli</td></tr>\n"
		  "<tr><td><b>estimated remaining time in seconds</b></td>"
		  "<td>%lli</td></tr>\n"
		  "<tr><td><b># of requests sent</b></td>"
		  "<td>%lli</td></tr>\n"
		  "<tr><td><b># of requests received</b></td>"
		  "<td>%lli</td></tr>\n"
		  "<tr><td><b># of request errors</b></td>"
		  "<td>%lli</td></tr>\n"
		  "<tr><td><b># of old values reused</b></td>"
		  "<td>%lli</td></tr>\n"
		  "<tr><td><b># of cache hits</b></td>"
		  "<td>%lli</td></tr>\n"
		  "<tr><td><b>cache size</b></td>"
		  "<td>%lli</td></tr>\n",
		  a, b, c, d, e, f, g, h, i, j, k);
	p.safePrintf ( "</table>\n" );

	return g_httpServer.sendDynamicPage ( s, p.getBufStart(), p.length() );
}
// . sets m_fileOffset and m_bf
// . returns false and sets g_errno on error
// . returns false if nothing to read too... but does not set g_errno
bool ImportState::setCurrentTitleFileAndOffset ( ) {

	// leave m_bf and m_fileOffset alone if there is more to read
	if ( m_fileOffset < m_bfFileSize )
		return true;

	CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
	if ( ! cr ) return false;

	log("import: import finding next file");
	
	// if ( m_offIsValid ) {
	// 	//*off = m_fileOffset;
	// 	return &m_bf; 
	// }
	//m_offIsValid = true;

	// look for titledb0001.dat etc. files in the 
	// workingDir/inject/ subdir
	SafeBuf ddd;
	ddd.safePrintf("%sinject",cr->m_importDir.getBufStart());
	// now use the one provided. we should also provide the # of threads
	if ( cr->m_importDir.getBufStart() && 
	     cr->m_importDir.getBufStart()[0] ) {
		ddd.reset();
		ddd.safeStrcpy ( cr->m_importDir.getBufStart() );
	}

	//
	// assume we are the first filename
	// set s_fileId to the minimum
	//
	Dir dir;
	dir.set(ddd.getBufStart());

	if ( ! dir.open() ) return false;

	// assume none
	int32_t minFileId = -1;

	// getNextFilename() writes into this
	char pattern[64]; strcpy ( pattern , "titledb*" );
	char *filename;
	while ( ( filename = dir.getNextFilename ( pattern ) ) ) {
		// filename must be a certain length
		int32_t filenameLen = gbstrlen(filename);
		// we need at least "titledb0001.dat"
		if ( filenameLen < 15 ) continue;
		// ensure filename starts w/ our m_dbname
		if ( strncmp ( filename , "titledb", 7 ) != 0 )
			continue;
		// skip if not .dat file
		if ( ! strstr ( filename , ".dat" ) )
			continue;
		// then a 4 digit number should follow
		char *s = filename + 7;
		if ( ! isdigit(*(s+0)) ) continue;
		if ( ! isdigit(*(s+1)) ) continue;
		if ( ! isdigit(*(s+2)) ) continue;
		if ( ! isdigit(*(s+3)) ) continue;
		// convert digit to id
		int32_t id = atol(s);
		// . do not accept files we've already processed
		// . -1 means we haven't processed any yet
		if ( m_bfFileId >= 0 && id <= m_bfFileId ) continue;
		// the min of those we haven't yet processed/injected
		if ( id < minFileId || minFileId < 0 ) minFileId = id;
	}

	// get where we left off
	if ( ! m_loadedPlaceHolder ) {
		// read where we left off from file if possible
		char fname[256];
		sprintf(fname,"%slasttitledbinjectinfo.dat",g_hostdb.m_dir);
		SafeBuf ff;
		ff.fillFromFile(fname);
		if ( ff.length() > 1 ) {
			m_loadedPlaceHolder = true;
			// get the placeholder
			sscanf ( ff.getBufStart() 
				 , "%"UINT64",%"INT32""
				 , &m_fileOffset
				 , &minFileId
				 );
		}
	}

	// if no files! return false to indicate we are done
	if ( minFileId == -1 ) return false;

	// set up s_bf then
	//if ( m_bfFileId != minFileId ) {
	SafeBuf tmp;
	tmp.safePrintf("titledb%04"INT32"-000.dat"
		       //,dir.getDirname()
		       ,minFileId);
	m_bf.set ( dir.getDirname() ,tmp.getBufStart() );
	if ( ! m_bf.open( O_RDONLY ) ) {
		log("inject: import: could not open %s%s for reading",
		    dir.getDirname(),tmp.getBufStart());
		return false;
	}
	m_bfFileId = minFileId;
	// reset ptr into file
	//*off = 0;
	// and set this
	m_bfFileSize = m_bf.getFileSize();

	m_fileOffset = 0;
	//}

	log("import: importing from file %s",m_bf.getFilename());

	return true;//&m_bf;
}
// . returns false if blocked, true otherwise
// . returns true on error and sets g_errno
bool SiteGetter::getSiteList ( ) {

top:
	// . setSite() will return TRUE and set g_errno on error, and returns
	//   false if it blocked adding a tag, which will call callback once
	//   tag is added
	// . stop at this point
	if ( m_pathDepth >= 3 ) return setSite();
	// or if no more
	if ( m_pathDepth >= m_maxPathDepth ) return setSite();

	// . make the termid
	// . but here we get are based on "m_pathDepth" which ranges
	//   from 1 to N
	// . if m_pathDepth==0 use "www.xyz.com" as site
	// . if m_pathDepth==1 use "www.xyz.com/foo/" as site ...
	char *pend = getPathEnd ( m_url , m_pathDepth );
	// hash up to that
	//char *host = m_u.getHost();
	char *host = getHostFast ( m_url , NULL );
	// hash the prefix first to match XmlDoc::hashNoSplit()
	char *prefix = "siteterm";
	// hash that and we will incorporate it to match XmlDoc::hashNoSplit()
	int64_t ph = hash64 ( prefix , gbstrlen(prefix) );
	// . this should match basically what is in XmlDoc.cpp::hash()
	// . and this now does not include pages that have no outlinks 
	//   "underneath" them.
	int64_t termId = hash64 ( host , pend - host , ph ) & TERMID_MASK;

	// get all pages that have this as their termid!
	key144_t start ;
	key144_t end   ;
	g_posdb.makeStartKey ( &start, termId );
	g_posdb.makeEndKey   ( &end  , termId );

	// . now see how many urls art at this path depth from this hostname
	// . if it is a huge # then we know they are all subsites!
	//   because it is too bushy to be anything else
	// . i'd say 100 nodes is good enough to qualify as a homestead site

	int32_t minRecSizes = 5000000;
	// get the group this list is in
	//uint32_t gid ;
	//gid = getGroupId ( RDB_POSDB , (char *)&start , false ); //split?
	//uint32_t shardNum ;
	//shardNum = getShardNum( RDB_POSDB , (char *)&start , false ); //split?

	// i guess this is split by termid and not docid????
	int32_t shardNum = g_hostdb.getShardNumByTermId ( &start );

	// we need a group #. the column #.
	//int32_t split = g_hostdb.getGroupNum ( gid );
	// int16_tcut
	Msg0 *m = &m_msg0;
	// get the list. returns false if blocked.
	if ( ! m->getList ( -1                 , // hostId
			    0                  , // ip
			    0                  , // port
			    0                  , // maxCacheAge
			    false              , // addToCache
			    RDB_POSDB        ,
			    m_collnum             ,
			    &m_list            ,
			    (char *)&start     ,
			    (char *)&end       ,
			    minRecSizes        ,
			    this               ,
			    gotSiteListWrapper ,
			    m_niceness         , // MAX_NICENESS
			    // default parms follow
			    true  ,  // doErrorCorrection?
			    true  ,  // includeTree?
			    true  ,  // doMerge?
			    -1    ,  // firstHostId
			    0     ,  // startFileNum
			    -1    ,  // numFiles
			    999999,  // timeout
			    -1    ,  // syncPoint
			    -1    ,  // preferLocalReads
			    NULL  ,  // msg5
			    NULL  ,  // msg5b
			    false ,  // isrealmerge?
			    true  ,  // allowpagecache?
			    false ,  // forceLocalIndexdb?
			    false ,  // doIndexdbSplit? nosplit
			    shardNum ) )//split ))
		return false;

	// return false if this blocked
	if ( ! gotSiteList() ) return false;
	// error?
	if ( g_errno ) return true;
	// or all done
	if ( m_allDone ) return true;
	// otherwise, try the next path component!
	goto top;
}
// . "ir" must be this Msg7's own m_injectionRequest; it is serialized and
//   sent over udp to the host picked by getHostToHandleInjection()
// . this is called from the http interface, as well as from
//   XmlDoc::indexWarcOrArc() to inject individual recs/docs from the warc/arc
// . "state" and "callback" are saved in m_state and m_callback; the udp
//   reply itself goes to gotUdpReplyWrapper with "this" as its state
// . returns false and sets g_errno on error, true on success
bool Msg7::sendInjectionRequestToHost ( InjectionRequest *ir , 
					void *state ,
					void (* callback)(void *) ) {

	// ensure it is our own
	if ( &m_injectionRequest != ir ) { char *xx=NULL;*xx=0; }

	// ensure url not beyond limit
	if ( ir->ptr_url &&
	     gbstrlen(ir->ptr_url) > MAX_URL_LEN ) {
		g_errno = EURLTOOBIG;
		return log("inject: url too big.");
	}

	// hack fix core
	if ( ir->size_metadata == 0 ) ir->ptr_metadata = NULL;

	// serialize the request and its strings into one new buffer
	int32_t sirSize = 0;
	char *sir = serializeMsg2 ( ir ,
				    sizeof(InjectionRequest),
				    &ir->ptr_url,
				    &ir->size_url ,
				    &sirSize );
	// oom?
	if ( ! sir ) 
		return log("inject: failed to serialize request");

	// free any old one if we are being reused
	if ( m_sir ) {
		mfree ( m_sir , m_sirSize , "m7ir" );
		m_sir = NULL;
	}

	m_state = state;
	m_callback = callback;

	// save it for freeing later
	m_sir = sir;
	m_sirSize = sirSize;

	// forward it to another shard?
	Host *host = getHostToHandleInjection ( ir->ptr_url );

	// . the url may be NULL (see the length check above), so do not
	//   hand it to "%s" directly
	// . the spaces around INT32 keep C++11 and later from parsing the
	//   macro as a literal suffix
	const char *urlForLog = ir->ptr_url ? ir->ptr_url : "(null)";
	log("inject: sending injection request of url %s reqsize=%i "
	    "to host #%" INT32 "",
	    urlForLog,(int)sirSize,host->m_hostId);

	// . ok, forward it to another host now
	// . and call gotUdpReplyWrapper when reply comes in
	// . returns false and sets g_errno on error
	// . returns true on success
	if ( g_udpServer.sendRequest ( sir , // req
					 sirSize,
					 0x07 , // msgtype
					 host->m_ip , // ip
					 host->m_port , // port
					 host->m_hostId,
					 NULL, // retslot
					 this, // state
					 gotUdpReplyWrapper, // callback
					 99999999 , // timeout
					 -1 , // backoff
					 -1 , // maxwait
					 NULL, // replybuf
					 0, // replybufmaxsize
					 MAX_NICENESS // niceness
				       ) )
		// we also return true on success, false on error
		return true;

	// sendRequest() failed so g_errno must be set, core if it is not
	if ( ! g_errno ) { char *xx=NULL;*xx=0; }
	// there was an error, g_errno should be set
	return false;
}
// Example #25
// 0
bool AutoBan::printTable( TcpSocket *s , HttpRequest *r ) {
	SafeBuf sb(512 * 512,"autobbuf");
	//read in all of the possible cgi parms off the bat:
	//long  user     = g_pages.getUserType( s , r );
	//char *username = g_users.getUsername(r);
	//char *pwd  = r->getString ("pwd");

	char *coll = r->getString ("c");

	long banIpsLen;
	char *banIps = r->getString ("banIps" , &banIpsLen , NULL);

	long allowIpsLen;
	char *allowIps = r->getString ("allowIps" , &allowIpsLen , NULL);

 	long clearLen;
 	char *clear = r->getString ("clear" , &clearLen , NULL);

	bool changed = false;

 	long validCodesLen;
 	char *validCodes = r->getString ("validCodes", &validCodesLen, NULL);

	long showAllIps = r->getLong("showAllIps", 0);
	long showLongView = r->getLong("longview", 0);

	// do it all from parm now
	//long banRegexLen;
	//char *banRegex = r->getString("banRegex", &banRegexLen, NULL);
	

// 	char *ss = sb.getBuf();
// 	char *ssend = sb.getBufEnd();
	g_pages.printAdminTop ( &sb, s , r );

	//sb.incrementLength(sss - ss);

	// MDW: moved to here

	long now = getTime();
	
	long days;
	long hours;
	long minutes;
	long secs;
	long msecs;

	if(r->getLong("resetcodes", 0)) {
		setCodesFromConf();
	}

	sb.safePrintf("\n<br><br><table %s>\n",TABLE_STYLE);

	getCalendarFromMs((now - m_codeResetTime) * 1000,
			  &days, 
			  &hours, 
			  &minutes, 
			  &secs,
			  &msecs);
	sb.safePrintf("<tr><td colspan=18 bgcolor=#%s>"
		      "<center><b>Code Usage "
		      "(<a href=\"/admin/"
		      "autoban?c=%s&resetcodes=1\">reset</a> "
		      "%li days %li hours %li "
		      "minutes %li sec ago)"
		      "</b></center></td></tr>", 
		      DARK_BLUE,
		      coll,
		      days, 
		      hours, 
		      minutes, 
		      secs);
	sb.safePrintf("<tr bgcolor=#%s>"
		      "<td><center><b>Code</b></center></td>"
		      "<td><center><b>IP</b></center></td>"
		      "<td><center><b>Query Count</b></center></td>"

		      "<td><center><b>Bytes Read</b></center></td>"
		      "<td><center><b>Bytes Sent</b></center></td>"
		      
		      "<td><center><b>Outstanding Count</b></center></td>"
		      "<td><center><b>Most Ever Outstanding</b></center></td>"
		      "<td><center><b>Max Outstanding</b></center></td>"
		      "</tr>", 
		      LIGHT_BLUE);


	for(long i = 0; i < m_ht.getNumSlots(); i++) {
		if ( m_ht.getKey ( i ) == 0 ) continue;
		CodeVal *cv = m_ht.getValuePointerFromSlot ( i );
		if ( ! cv ) continue;
		
		sb.safePrintf("<tr>");
		sb.safePrintf("<td>");
		sb.copyToken(cv->m_code);//m_codeVals[i].m_code);
		sb.safePrintf("</td>");
		sb.safePrintf("<td><center>%s</center> </td>",
			      iptoa(cv->m_ip));
		sb.safePrintf("<td><center>%lli</center></td>", 
			      cv->m_count);

		sb.safePrintf("<td><center>%lli</center></td>", 
			      cv->m_bytesRead);
		sb.safePrintf("<td><center>%lli</center></td>", 
			      cv->m_bytesSent);

		sb.safePrintf("<td><center>%li</center></td>", 
			      cv->m_outstanding);
		sb.safePrintf("<td><center>%li</center></td>", 
			      cv->m_maxEver);
		if ( cv->m_maxOutstanding != 50 )
			sb.safePrintf("<td><center><b>%li</b></center></td>", 
				      cv->m_maxOutstanding);
		else
			sb.safePrintf("<td><center>%li</center></td>", 
				      cv->m_maxOutstanding);

		sb.safePrintf("</tr>");
		
	}
	sb.safePrintf ("</table><br><br>\n" );


 	if(clear && clearLen < 64) {
 		long ip = atoip(clear, clearLen);
 		if(ip) {
			removeIp(ip);
			char *beginning;
			char ipbuf[64];//gotta NULL terminate for strstr
			memcpy(ipbuf, clear, clearLen);
			ipbuf[clearLen] = '\0';
			beginning = findToken(g_conf.m_banIps, ipbuf, 
					      clearLen);
			if(beginning) {
				char *to = beginning;
				char *from = beginning + clearLen;
				while(*to) *to++ = *from++;
			}
			beginning = findToken(g_conf.m_allowIps, ipbuf,
					      clearLen);
			if(beginning) {
				char *to = beginning;
				char *from = beginning + clearLen;
				while(*to) *to++ = *from++;
			}
			changed = true;
 		}
 	}

 	long allowLen;
 	char *allow = r->getString ( "allow" , &allowLen , NULL );
 	if(allow && allowLen < 64) {
 		long ip = atoip(allow, allowLen);
		
 		if(ip) {
			char *beginning;
			char ipbuf[64];//gotta NULL terminate for strstr
			memcpy(ipbuf, allow, allowLen);
			ipbuf[allowLen] = '\0';
			beginning = findToken(g_conf.m_allowIps, ipbuf, 
					      allowLen);
			if(!beginning) {
				//its not present, so add it.
				char *p = g_conf.m_allowIps;
				while(*p) p++;
				if(p - g_conf.m_allowIps + allowLen + 2 
				   < AUTOBAN_TEXT_SIZE) {
					*p++ = '\n';
					memcpy(p, ipbuf,allowLen);
					*(p + allowLen) = '\0';
				}
				else {
					sb.safePrintf("<font color=red>"
						      "Not enough stack space "
						      "to fit allowIps.  "
						      "Increase "
						      "AUTOBAN_TEXT_SIZE in "
						      "Conf.h. "
						      "Had %i need %li."
						      "</font>", 
						      AUTOBAN_TEXT_SIZE,
						      p - g_conf.m_allowIps + 
						      allowLen + 2);
					goto dontRemove1;
				}
			}
			beginning = findToken(g_conf.m_banIps, ipbuf, 
					      allowLen);
			if(beginning) {
				//remove it from banned if present.
				char *to = beginning;
				char *from = beginning + allowLen;
				while(*to) *to++ = *from++;
			}

			changed = true;
 		}
 	}
 dontRemove1:
 	long denyLen;
 	char *deny = r->getString ( "deny" , &denyLen , NULL );
 	if(deny && denyLen < 64) {
 		long ip = atoip(deny, denyLen);
		
 		if(ip) {
			char *beginning;
			char ipbuf[64];//gotta NULL terminate for strstr
			memcpy(ipbuf, deny, denyLen);
			ipbuf[denyLen] = '\0';
			beginning = findToken(g_conf.m_banIps, ipbuf, denyLen);
			if(!beginning) {
				//its not present, so add it.
				char *p =g_conf.m_banIps;
				while(*p) p++;
				if(p - g_conf.m_banIps + denyLen + 2 < 
				   AUTOBAN_TEXT_SIZE) {
					*p++ = '\n';
					memcpy(p, ipbuf,denyLen);
					*(p + denyLen) = '\0';
				}
				else {
					sb.safePrintf("<font color=red>Not "
						      "enough stack space "
						      "to fit bannedIPs.  "
						      "Increase "
						      "AUTOBAN_TEXT_SIZE in "
						      "Conf.h. "
						      "Had %i need %li."
						      "</font>", 
						      AUTOBAN_TEXT_SIZE,
						      p - g_conf.m_banIps +
						      denyLen + 2);
					goto dontRemove2;
				}
			}
			beginning = findToken(g_conf.m_allowIps, ipbuf,
					      denyLen);
			if(beginning) {
				//remove it from allowed list if present.
				char *to = beginning;
				char *from = beginning + denyLen;
				while(*to) *to++ = *from++;
			}
			changed = true;
 		}
 	}
 dontRemove2:

	if(!g_conf.m_doAutoBan) {
		sb.safePrintf("<center><font color=red><b>Autoban is disabled, "
			      "turn it on in Master Controls.</b></font></center><br>");
	}

 	if(validCodes) {
		if(validCodesLen >= AUTOBAN_TEXT_SIZE) {
			sb.safePrintf("<font color=red>Not enough stack space "
				      "to fit codes.  "
				      "Increase AUTOBAN_TEXT_SIZE in Conf.h. "
				      "Had %i need %li.</font>", 
				      AUTOBAN_TEXT_SIZE,
				      validCodesLen);
			validCodes = NULL;
			validCodesLen = 0;
		}
		else {
			memcpy(g_conf.m_validCodes, validCodes, validCodesLen);
			g_conf.m_validCodes[validCodesLen] = '\0';
			trimWhite(g_conf.m_validCodes);
			setCodesFromConf();
		}
	}



	//first remove all of the ips in the conf, then add the passed in 
	//  ones to the conf parm; 
	if (banIps) {
		//ack, the browser puts in crlf when this comes back, so
		//we will have a longer string here than the one we sent 
		//out. trim back all extrainious whitespace before we do
		//bounds checking.
		trimWhite(banIps);
		banIpsLen = gbstrlen(banIps);
		if(banIpsLen >= AUTOBAN_TEXT_SIZE) {
			sb.safePrintf("<font color=red>Not enough stack space "
				      "to fit bannedIps.  "
				      "Increase AUTOBAN_TEXT_SIZE in Conf.h. "
				      "Had %i need %li.</font>", 
				      AUTOBAN_TEXT_SIZE,
				      banIpsLen);
			banIpsLen = AUTOBAN_TEXT_SIZE - 1;
		}
		for(long i = 0; i < m_tableSize; i++) {
			if(m_detectKeys[i] == 0) continue;
			//check the 'set from conf' bit, and clear those.
			if(m_detectVals[i].m_flags & FROMCONF) {
				removeIp(m_detectKeys[i]);
			}
		}
		memcpy(g_conf.m_banIps, banIps, banIpsLen);
		g_conf.m_banIps[banIpsLen] = '\0';
		changed = true;
	}
	if (allowIps) {
		trimWhite(allowIps);
		allowIpsLen = gbstrlen(allowIps);

		if(allowIpsLen >= AUTOBAN_TEXT_SIZE) {
			sb.safePrintf("<font color=red>Not enough stack space "
				      "to fit allowIps.  "
				      "Increase AUTOBAN_TEXT_SIZE in Conf.h. "
				      "Had %i need %li.</font>", 
				      AUTOBAN_TEXT_SIZE,
				      allowIpsLen);
			allowIpsLen = AUTOBAN_TEXT_SIZE - 1;
		}
		for(long i = 0; i < m_tableSize; i++) {
			if(m_detectKeys[i] == 0) continue;
			//check the 'set from conf' bit, and clear those.
			if(m_detectVals[i].m_flags & FROMCONF) {
				removeIp(m_detectKeys[i]);
			}
		}
		memcpy(g_conf.m_allowIps, allowIps, allowIpsLen);
		g_conf.m_allowIps[allowIpsLen] = '\0';
		changed = true;
	}
	if(changed) {
		trimWhite(g_conf.m_allowIps);
		trimWhite(g_conf.m_banIps);
		setFromConf();
	}



	sb.safePrintf("\n<table %s>\n",TABLE_STYLE);
	sb.safePrintf("<tr><td colspan=2 bgcolor=#%s>"
		      "<center><b>Add IPs</b></center></td></tr>", 
		      DARK_BLUE);

// 	ss = sb.getBuf();
// 	ssend = sb.getBufEnd();
	g_parms.printParms (&sb, s, r);
	//	sb.incrementLength(sss - ss);



	sb.safePrintf ("<tr><td>"
		       "<center>" 
		       "<input type=submit value=\"Update\" "
		       "method=\"POST\" border=0>"
		       "</center></td></tr>");

	sb.safePrintf ("</table><br><br>\n" );



	if(!showLongView) {
		sb.safePrintf("<b><a href=\"autoban"
			      "?c=%s"
			      "&showAllIps=%li"
			      "&longview=1\">Show watched ips table...</a></b>",
			      coll,
			      showAllIps);
		return g_httpServer.sendDynamicPage ( s , 
						      sb.getBufStart() , 
						      sb.length() , 
						      -1 , 
						      false);
	}

	/////////////////////////////////////////////////////////////////////

	sb.safePrintf("\n<table %s>\n",TABLE_STYLE);

	sb.safePrintf("<tr><td colspan=3 bgcolor=#%s>"
		      "<center><b>Watched Ips</b></center></td></tr>", 
		      DARK_BLUE);

	sb.safePrintf("<tr bgcolor=#%s>"
		      "<td><center><b>IP</b></center></td>"
		      "<td><center><b>Description</b></center></td>"
		      //		      "<td><center><b>Time Added</b></center></td>"
		      "<td><center><b>Allow/Deny/Clear</b></center></td>"
		      "</tr>", 
		      LIGHT_BLUE);




	long *sortedIndices = (long*)mmalloc(m_tableSize * sizeof(long), 
					     "AutoBanH");

	if(!sortedIndices) {
		return g_httpServer.sendErrorReply(s,500,mstrerror(ENOMEM));
	}

	long numEntries = 0;
	for(long i = 0; i < m_tableSize; i++) {
		if(m_detectKeys[i] == 0) continue;
		sortedIndices[numEntries++] = i;
	}
	SorterTable = m_detectKeys;

        gbsort(sortedIndices, numEntries, sizeof(long), ip_cmp);


	//lets put each class of watched ip in its own safebuf then cat 
	//them together at the end.
	
	SafeBuf allowed;
	SafeBuf banned; 
	SafeBuf feedLeachers; 
	SafeBuf cowBots; 
	SafeBuf *e;

	for(long j = 0; j < numEntries; j++) {
		long i = sortedIndices[j];
		if(m_detectKeys[i] == 0) continue;
		//if(!(m_detectVals[i].m_flags & FROMCONF)) continue;
		bool allow =  m_detectVals[i].m_flags & ALLOW && 
			m_detectVals[i].m_flags & FROMCONF;
		bool deny  =  m_detectVals[i].m_flags & DENY && 
			m_detectVals[i].m_flags & FROMCONF;
		bool explicitban = deny && m_detectVals[i].m_flags & FROMCONF;
		unsigned short dayCount = m_detectVals[i].m_dayCount;
		unsigned char minuteCount = m_detectVals[i].m_minuteCount;

		bool day =    dayCount >= g_conf.m_numFreeQueriesPerDay;
		bool minute = minuteCount >= g_conf.m_numFreeQueriesPerMinute;

		char *description;
		char *color;

		if(allow) {
			color = GREEN;
			description = "Allowed";
			e = &allowed;
		} 
		else if(explicitban) {
			color = RED;
			description = "Banned";
			e = &banned;
		}
		else if(minute) {
			color = RED;
			description = "Cow Bot";
			e = &cowBots;
		}
		else if(day) {
			color = RED;
			description = "Feed Leacher";
			e = &feedLeachers;
		}
		else {
			//this can happen when someone was banned due to 
			//exceeding the quota, then the quota was lowered.
			
			m_detectVals[i].m_flags &= ~DENY;
			//log("autoban: ohshit-banning %s",iptoa(s->m_ip));
			continue;
		}

		
		e->safePrintf("<tr>");

		e->safePrintf("<td bgcolor=#%s><center>%s</center></td><td>"
			      "<center>%s</center></td>"

// 			      "<td><center>"
// 			      "%li days %li hrs %li min ago"
// 			      "</center></td>"

			      "<td><center><a href=\"/admin/"
			      "autoban?c=%s&allow=%s&showAllIps=%li\">" 
			      "allow/</a>"

			      "<a href=\"/admin/"
			      "autoban?c=%s&deny=%s&showAllIps=%li\">" 
			      "deny/</a>"

			      "<a href=\"/admin/"
			      "autoban?c=%s&clear=%s&showAllIps=%li\">"
			      "clear</a></center>"
			      "</td>",color, 
			      iptoa(m_detectKeys[i]),
			      description,

			      //      days,hours,minutes,

			      coll,
			      iptoa(m_detectKeys[i]),
			      showAllIps,
			      coll,
			      iptoa(m_detectKeys[i]),
			      showAllIps,
			      coll,
			      iptoa(m_detectKeys[i]),
			      showAllIps);
		e->safePrintf("</tr>");
	}

	sb.cat(allowed);
	sb.cat(banned); 
	sb.cat(feedLeachers); 
	sb.cat(cowBots); 

	sb.safePrintf ("</table><br><br>\n" );


	// MDW moved from here

	sb.safePrintf("\n<br><br><table %s>\n",TABLE_STYLE);

	sb.safePrintf("<tr><td colspan=5 bgcolor=#%s>"
		      "<center><b>Control Panel</b></center></td></tr>", 
		      DARK_BLUE);

	sb.safePrintf("<tr>"
		      "<td bgcolor=#%s><center><b>Show Ips by Number of Queries"
		      "</b></center></td>",
		      LIGHT_BLUE);
	sb.safePrintf("<td><center><font color=red><b><a href=\"/admin/"
		      "autoban?c=%s&showAllIps=0\">"
		      "0 Queries</a></b>"
		      "</font></center></td>",
		      coll);
	sb.safePrintf("<td><center><font color=red><b><a href=\"/admin/"
		      "autoban?c=%s&showAllIps=1\">"
		      "1 Query</a></b>"
		      "</font></center></td>",
		      coll);
	sb.safePrintf("<td><center><font color=red><b><a href=\"/admin/"
		      "autoban?c=%s&showAllIps=10\">"
		      "10 Queries</a></b>"
		      "</font></center></td>",
		      coll);
	sb.safePrintf("<td><center><font color=red><b><a href=\"/admin/"
		      "autoban?c=%s&showAllIps=100\">"
		      "100 Queries</a></b>"
		      "</font></center></td></tr>",
		      coll);

	sb.safePrintf ("</table><br><br>\n");



	if(!showAllIps) {

		char* ss = (char*) sb.getBufStart();
		long sslen = sb.length();
		mfree(sortedIndices, m_tableSize * sizeof(long),"AutoBanH");

		return g_httpServer.sendDynamicPage ( s , ss , sslen , -1 , false);
	}
	

	sb.safePrintf("\n<br><br><table %s>\n",TABLE_STYLE);

	sb.safePrintf("<tr><td colspan=6 bgcolor=#%s>"
		      "<center><b>Queries Today</b></center></td></tr>", 
		      DARK_BLUE);

	sb.safePrintf("<tr bgcolor=#%s>"
		      "<td><center><b>IP</b></center></td>"
		      "<td><center><b>Minute count</b></center></td>"
		      "<td><center><b>Day count</b></center></td>"
		      "<td><center><b>Time Until Reset</b></center></td>"
		      "<td><center><b>Times Banned</b></center></td>"
		      "<td><center><b>Allow/Deny</b></center></td>"
		      "</tr>", 
		      LIGHT_BLUE);


	char minBuf[128];
	char dayBuf[128];
	unsigned long lastIpGroup = 0;
	for(long j = 0; j < numEntries; j++) {
		long i = sortedIndices[j];
		long  dayCount = m_detectVals[i].m_dayCount;
		unsigned char minuteCount = m_detectVals[i].m_minuteCount;

		if(!(m_detectVals[i].m_flags & FROMCONF)) {
			if(m_detectVals[i].m_minuteExpires < now) 
				minuteCount = 0;
			if(!(m_detectVals[i].m_flags & DENY) && 
			   m_detectVals[i].m_dayExpires < now) 
				dayCount = 0;
		}
		//a hack:
		if( dayCount < showAllIps) continue;

		char *color = YELLOW;
		
		if(m_detectVals[i].m_flags & ALLOW) {
			color = GREEN;
			snprintf(minBuf, 128, "--");
			snprintf(dayBuf, 128, "%li", dayCount);
		}
		else if(m_detectVals[i].m_flags & DENY) {
			color = RED;
			snprintf(minBuf, 128, "--");
			snprintf(dayBuf, 128, "%li", dayCount);
		} 
		else {
			snprintf(minBuf, 128, "%li", (long)minuteCount);
			snprintf(dayBuf, 128, "%li", (long)dayCount);
		}

		unsigned long thisIpGroup = (unsigned long)m_detectKeys[i] & 
			0x00ffffff;

		sb.safePrintf("<tr><center>");

		if(m_detectVals[i].m_flags & FROMCONF) {
			sb.safePrintf("<td bgcolor=#%s><center>%s%s%s</center></td>"
				      "<td><center>%s</center> </td>"
				      "<td><center>%s</center></td>" 
				      "<td><center><font color=red>"
				      "<b>NEVER</b>"
				      "</font></center></td>"
				      "<td><center>--</center></td>",
				      color, 
				      (thisIpGroup == lastIpGroup)?"<b>":"",
				      iptoa(m_detectKeys[i]),
				      (thisIpGroup == lastIpGroup)?"</b>":"",
				      minBuf,
				      dayBuf);
		}
		else {
			//they haven't done a query since being unbanned,
			//unban them now so we don't get negative resets displayed.
			/*
			  no, don't unban the bots!!! MDW yippy project
			if(m_detectVals[i].m_dayExpires < now) {
				m_detectVals[i].m_flags &= ~DENY; 
				//log("autoban: dayexpire-unbanning %s",
				//    iptoa(ip));
				m_detectVals[i].m_dayExpires = now + ONE_DAY;
				m_detectVals[i].m_minuteExpires = now + 60;
				m_detectVals[i].m_dayCount = 0;
				m_detectVals[i].m_minuteCount = 0;
				sb.safePrintf("</center></tr>");
				continue;
			}
			*/

			getCalendarFromMs((m_detectVals[i].m_dayExpires - now)* 1000,
					  &days, 
					  &hours, 
					  &minutes, 
					  &secs,
					  &msecs);

			sb.safePrintf("<td bgcolor=#%s><center>%s%s%s</center></td>"
				      "<td><center>%s</center> </td>"
				      "<td><center>%s</center></td>" 
				      "<td><center><font color=red>"
				      "<b>%li days %li hrs %li min %li sec</b>"
				      "</font></center></td>"
				      "<td><center>%i</center></td>",
				      color, 
				      (thisIpGroup == lastIpGroup)?"<b>":"",
				      iptoa(m_detectKeys[i]),
				      (thisIpGroup == lastIpGroup)?"</b>":"",
				      minBuf,
				      dayBuf,
				      days, hours, minutes, secs,
				      m_detectVals[i].m_timesBanned);
		}
		sb.safePrintf("<td><center>"
			      "<a href=\"/admin/"
			      "autoban?c=%s&allow=%s&showAllIps=%li\">" 
			      "allow/</a>"
			      "<a href=\"/admin/"
			      "autoban?c=%s&deny=%s&showAllIps=%li\">" 
			      "deny</a></center>"
			      "</td>",
			      coll,
			      iptoa(m_detectKeys[i]),
			      showAllIps,
			      coll,
			      iptoa(m_detectKeys[i]),
			      showAllIps);

		sb.safePrintf("</center></tr>");
		lastIpGroup = thisIpGroup;
	}


	sb.safePrintf ("</table><br><br>\n" );


	char* ss = (char*) sb.getBufStart();
	long sslen = sb.length();

	mfree(sortedIndices, m_tableSize * sizeof(long),"AutoBanH");

	return g_httpServer.sendDynamicPage ( s , ss , sslen , -1 , false);
}
bool sendHttpReply ( void *state ) {
	// get the state properly
	Msg7 *msg7= (Msg7 *) state;

	InjectionRequest *ir = &msg7->m_injectionRequest;

	// extract info from state
	TcpSocket *sock = msg7->m_socket;

	//XmlDoc *xd = msg7->m_xd;

	int64_t docId  = msg7->m_replyDocId; // xd->m_docId;

	// might already be EURLTOOBIG set from above
	if ( ! g_errno ) g_errno = msg7->m_replyIndexCode;

	int32_t      hostId = 0;//msg7->m_msg7.m_hostId;

	// set g_errno to index code
	//if ( xd->m_indexCodeValid && xd->m_indexCode && ! g_errno )
	//	g_errno = xd->m_indexCode;


	char format = msg7->m_format;

	// no url parm?
	if ( ! g_errno && ! ir->ptr_url && format != FORMAT_HTML )
		g_errno = EMISSINGINPUT;

	if ( g_errno && g_errno != EDOCUNCHANGED ) {
		int32_t save = g_errno;
		mdelete ( msg7, sizeof(Msg7) , "PageInject" );
		delete (msg7);
		g_errno = save;
		char *msg = mstrerror(g_errno);
		return g_httpServer.sendErrorReply(sock,save,msg,NULL);
	}

	char abuf[320];
	SafeBuf am(abuf,320,0,false);
	am.setLabel("injbuf");
	char *ct = NULL;

	// a success reply, include docid and url i guess
	if ( format == FORMAT_XML ) {
		am.safePrintf("<response>\n");
		am.safePrintf("\t<statusCode>%"INT32"</statusCode>\n",
			      (int32_t)g_errno);
		am.safePrintf("\t<statusMsg><![CDATA[");
		am.cdataEncode(mstrerror(g_errno));
		am.safePrintf("]]></statusMsg>\n");
		// if xmldoc was a container of subdocs that XmlDoc::indexDoc()
		// call indexWarcOrArc() on then docid is not valid since
		// we do not index container docs.
		//int64_t docId = xd->m_docId;
		//if ( ! xd->m_docIdValid ) docId = 0;
		am.safePrintf("\t<docId>%"INT64"</docId>\n",docId);
		// this will have to be re-tooled if we deem necessary.
		// was being use to do section voting for diffbot
		// upon a url being injected.
		/*
		if ( ir->m_getSections ) {
			SafeBuf *secBuf = xd->getInlineSectionVotingBuf();
			am.safePrintf("\t<htmlSrc><![CDATA[");
			if ( secBuf->length() ) 
				am.cdataEncode(secBuf->getBufStart());
			am.safePrintf("]]></htmlSrc>\n");
		}
		*/
		am.safePrintf("</response>\n");
		ct = "text/xml";
	}

	if ( format == FORMAT_JSON ) {
		am.safePrintf("{\"response\":{\n");
		am.safePrintf("\t\"statusCode\":%"INT32",\n",(int32_t)g_errno);
		am.safePrintf("\t\"statusMsg\":\"");
		am.jsonEncode(mstrerror(g_errno));
		am.safePrintf("\",\n");
		am.safePrintf("\t\"docId\":%"INT64",\n",docId);//xd->m_docId);
		// this will have to be re-tooled if we deem necessary.
		// was being use to do section voting for diffbot
		// upon a url being injected.
		/*

		if ( ir->m_getSections ) {
			SafeBuf *secBuf = xd->getInlineSectionVotingBuf();
			am.safePrintf("\t\"htmlSrc\":\"");
			if ( secBuf->length() ) 
				am.jsonEncode(secBuf->getBufStart());
			am.safePrintf("\",\n");
		}
		*/
		// subtract ",\n"
		am.m_length -= 2;
		am.safePrintf("\n}\n}\n");
		ct = "application/json";
	}

	if ( format == FORMAT_XML || format == FORMAT_JSON ) {
		mdelete ( msg7, sizeof(Msg7) , "PageInject" );
		delete (msg7);
		return g_httpServer.sendDynamicPage(sock,
						    am.getBufStart(),
						    am.length(),
						    0,
						    false,
						    ct );
	}

	//
	// debug
	//

	/*
	// now get the meta list, in the process it will print out a 
	// bunch of junk into msg7->m_pbuf
	if ( xd->m_docId ) {
		char *metalist = xd->getMetaList ( 1,1,1,1,1,1 );
		if ( ! metalist || metalist==(void *)-1){char *xx=NULL;*xx=0;}
		// print it out
		SafeBuf *pbuf = &msg7->m_sbuf;
		xd->printDoc( pbuf );
		bool status = g_httpServer.sendDynamicPage( msg7->m_socket , 
							   pbuf->getBufStart(),
							    pbuf->length() ,
							    -1, //cachtime
							    false ,//postreply?
							    NULL, //ctype
							    -1 , //httpstatus
							    NULL,//cookie
							    "utf-8");
		// delete the state now
		mdelete ( st , sizeof(Msg7) , "PageInject" );
		delete (st);
		// return the status
		return status;
	}
	*/
	//
	// end debug
	//

	char *url = ir->ptr_url;
	
	// . if we're talking w/ a robot he doesn't care about this crap
	// . send him back the error code (0 means success)
	if ( url && ir->m_shortReply ) {
		char buf[1024*32];
		char *p = buf;
		// return docid and hostid
		if ( ! g_errno ) p += sprintf ( p , 
						"0,docId=%"INT64","
						"hostId=%"INT32"," , 
						docId , hostId );
		// print error number here
		else  p += sprintf ( p , "%"INT32",0,0,", (int32_t)g_errno );
		// print error msg out, too or "Success"
		p += sprintf ( p , "%s", mstrerror(g_errno));
		mdelete ( msg7, sizeof(Msg7) , "PageInject" );
		delete (msg7);
		return g_httpServer.sendDynamicPage ( sock,buf, gbstrlen(buf) ,
						      -1/*cachetime*/);
	}

	SafeBuf sb;

	// print admin bar
	g_pages.printAdminTop ( &sb, sock , &msg7->m_hr );

	// print a response msg if rendering the page after a submission
	if ( g_errno )
		sb.safePrintf ( "<center>Error injecting url: <b>%s[%i]</b>"
				"</center>", 
				mstrerror(g_errno) , g_errno);
	else if ( (ir->ptr_url && ir->ptr_url[0]) ||
		  (ir->ptr_queryToScrape&&ir->ptr_queryToScrape[0]) )
		sb.safePrintf ( "<center><b>Sucessfully injected %s"
				"</center><br>"
				, ir->ptr_url
				//, xd->m_firstUrl.m_url
				);


	// print the table of injection parms
	g_parms.printParmTable ( &sb , sock , &msg7->m_hr );


	// clear g_errno, if any, so our reply send goes through
	g_errno = 0;
	// calculate buffer length
	//int32_t bufLen = p - buf;
	// nuke state
	mdelete ( msg7, sizeof(Msg7) , "PageInject" );
	delete (msg7);
	// . send this page
	// . encapsulates in html header and tail
	// . make a Mime
	// . i thought we need -2 for cacheTime, but i guess not
	return g_httpServer.sendDynamicPage (sock, 
					     sb.getBufStart(),
					     sb.length(), 
					     -1/*cachetime*/);
}
예제 #27
0
// . returns true if the string "tld" of length "tldLen" is accepted as a
//   top-level domain suffix
// . "tld" is read for exactly "tldLen" bytes
// . every char must pass is_alnum_a() or be '-' or '.', otherwise false
// . with NO period anything is accepted (new TLDs appear all the time)
// . with TWO OR MORE periods nothing is accepted
// . with exactly ONE period (like "co.uk") the string must be in s_tlds[],
//   which is loaded into the hash table "s_table" on the first call that
//   gets that far
// . because of those early returns only the one-period entries of s_tlds[]
//   can influence the result; the single-label entries and the two-period
//   entries ("LKD.CO.IM", "PLC.CO.IM") are loaded but never decide anything
// . if the table can not be set up we return the value of log(), which is
//   presumably false -- TODO confirm
static bool isTLD ( char *tld , int32_t tldLen ) {

	int32_t pcount = 0;
	// now they are random!
	for ( int32_t i = 0 ; i < tldLen ; i++ ) {
		// period count
		if ( tld[i] == '.' ) { pcount++; continue; }
		if ( ! is_alnum_a(tld[i]) && tld[i] != '-' ) return false;
	}

	// a single label (this includes the empty string) is always ok
	if ( pcount == 0 ) return true;
	if ( pcount >= 2 ) return false;

	// otherwise, if one period, check table to see if qualified

	// we use this as our hashtable
	static bool       s_isInitialized = false;
	// . i shrunk this list a lot
	// . see backups for the hold list
	static const char * const s_tlds[] = {

	  // From: https://data.iana.org/TLD/tlds-alpha-by-domain.txt
	"AAA",
	"AARP",
	"ABB",
	"ABBOTT",
	"ABBVIE",
	"ABOGADO",
	"ABUDHABI",
	"AC",
	"ACADEMY",
	"ACCENTURE",
	"ACCOUNTANT",
	"ACCOUNTANTS",
	"ACO",
	"ACTIVE",
	"ACTOR",
	"AD",
	"ADAC",
	"ADS",
	"ADULT",
	"AE",
	"AEG",
	"AERO",
	"AF",
	"AFL",
	"AG",
	"AGAKHAN",
	"AGENCY",
	"AI",
	"AIG",
	"AIRFORCE",
	"AIRTEL",
	"AKDN",
	"AL",
	"ALIBABA",
	"ALIPAY",
	"ALLFINANZ",
	"ALLY",
	"ALSACE",
	"AM",
	"AMICA",
	"AMSTERDAM",
	"ANALYTICS",
	"ANDROID",
	"ANQUAN",
	"AO",
	"APARTMENTS",
	"APP",
	"APPLE",
	"AQ",
	"AQUARELLE",
	"AR",
	"ARAMCO",
	"ARCHI",
	"ARMY",
	"ARPA",
	"ARTE",
	"AS",
	"ASIA",
	"ASSOCIATES",
	"AT",
	"ATTORNEY",
	"AU",
	"AUCTION",
	"AUDI",
	"AUDIO",
	"AUTHOR",
	"AUTO",
	"AUTOS",
	"AVIANCA",
	"AW",
	"AWS",
	"AX",
	"AXA",
	"AZ",
	"AZURE",
	"BA",
	"BABY",
	"BAIDU",
	"BAND",
	"BANK",
	"BAR",
	"BARCELONA",
	"BARCLAYCARD",
	"BARCLAYS",
	"BAREFOOT",
	"BARGAINS",
	"BAUHAUS",
	"BAYERN",
	"BB",
	"BBC",
	"BBVA",
	"BCG",
	"BCN",
	"BD",
	"BE",
	"BEATS",
	"BEER",
	"BENTLEY",
	"BERLIN",
	"BEST",
	"BET",
	"BF",
	"BG",
	"BH",
	"BHARTI",
	"BI",
	"BIBLE",
	"BID",
	"BIKE",
	"BING",
	"BINGO",
	"BIO",
	"BIZ",
	"BJ",
	"BLACK",
	"BLACKFRIDAY",
	"BLOOMBERG",
	"BLUE",
	"BM",
	"BMS",
	"BMW",
	"BN",
	"BNL",
	"BNPPARIBAS",
	"BO",
	"BOATS",
	"BOEHRINGER",
	"BOM",
	"BOND",
	"BOO",
	"BOOK",
	"BOOTS",
	"BOSCH",
	"BOSTIK",
	"BOT",
	"BOUTIQUE",
	"BR",
	"BRADESCO",
	"BRIDGESTONE",
	"BROADWAY",
	"BROKER",
	"BROTHER",
	"BRUSSELS",
	"BS",
	"BT",
	"BUDAPEST",
	"BUGATTI",
	"BUILD",
	"BUILDERS",
	"BUSINESS",
	"BUY",
	"BUZZ",
	"BV",
	"BW",
	"BY",
	"BZ",
	"BZH",
	"CA",
	"CAB",
	"CAFE",
	"CAL",
	"CALL",
	"CAMERA",
	"CAMP",
	"CANCERRESEARCH",
	"CANON",
	"CAPETOWN",
	"CAPITAL",
	"CAR",
	"CARAVAN",
	"CARDS",
	"CARE",
	"CAREER",
	"CAREERS",
	"CARS",
	"CARTIER",
	"CASA",
	"CASH",
	"CASINO",
	"CAT",
	"CATERING",
	"CBA",
	"CBN",
	"CC",
	"CD",
	"CEB",
	"CENTER",
	"CEO",
	"CERN",
	"CF",
	"CFA",
	"CFD",
	"CG",
	"CH",
	"CHANEL",
	"CHANNEL",
	"CHASE",
	"CHAT",
	"CHEAP",
	"CHLOE",
	"CHRISTMAS",
	"CHROME",
	"CHURCH",
	"CI",
	"CIPRIANI",
	"CIRCLE",
	"CISCO",
	"CITIC",
	"CITY",
	"CITYEATS",
	"CK",
	"CL",
	"CLAIMS",
	"CLEANING",
	"CLICK",
	"CLINIC",
	"CLINIQUE",
	"CLOTHING",
	"CLOUD",
	"CLUB",
	"CLUBMED",
	"CM",
	"CN",
	"CO",
	"COACH",
	"CODES",
	"COFFEE",
	"COLLEGE",
	"COLOGNE",
	"COM",
	"COMMBANK",
	"COMMUNITY",
	"COMPANY",
	"COMPARE",
	"COMPUTER",
	"COMSEC",
	"CONDOS",
	"CONSTRUCTION",
	"CONSULTING",
	"CONTACT",
	"CONTRACTORS",
	"COOKING",
	"COOL",
	"COOP",
	"CORSICA",
	"COUNTRY",
	"COUPON",
	"COUPONS",
	"COURSES",
	"CR",
	"CREDIT",
	"CREDITCARD",
	"CREDITUNION",
	"CRICKET",
	"CROWN",
	"CRS",
	"CRUISES",
	"CSC",
	"CU",
	"CUISINELLA",
	"CV",
	"CW",
	"CX",
	"CY",
	"CYMRU",
	"CYOU",
	"CZ",
	"DABUR",
	"DAD",
	"DANCE",
	"DATE",
	"DATING",
	"DATSUN",
	"DAY",
	"DCLK",
	"DE",
	"DEALER",
	"DEALS",
	"DEGREE",
	"DELIVERY",
	"DELL",
	"DELOITTE",
	"DELTA",
	"DEMOCRAT",
	"DENTAL",
	"DENTIST",
	"DESI",
	"DESIGN",
	"DEV",
	"DIAMONDS",
	"DIET",
	"DIGITAL",
	"DIRECT",
	"DIRECTORY",
	"DISCOUNT",
	"DJ",
	"DK",
	"DM",
	"DNP",
	"DO",
	"DOCS",
	"DOG",
	"DOHA",
	"DOMAINS",
	"DOWNLOAD",
	"DRIVE",
	"DUBAI",
	"DURBAN",
	"DVAG",
	"DZ",
	"EARTH",
	"EAT",
	"EC",
	"EDEKA",
	"EDU",
	"EDUCATION",
	"EE",
	"EG",
	"EMAIL",
	"EMERCK",
	"ENERGY",
	"ENGINEER",
	"ENGINEERING",
	"ENTERPRISES",
	"EPSON",
	"EQUIPMENT",
	"ER",
	"ERNI",
	"ES",
	"ESQ",
	"ESTATE",
	"ET",
	"EU",
	"EUROVISION",
	"EUS",
	"EVENTS",
	"EVERBANK",
	"EXCHANGE",
	"EXPERT",
	"EXPOSED",
	"EXPRESS",
	"EXTRASPACE",
	"FAGE",
	"FAIL",
	"FAIRWINDS",
	"FAITH",
	"FAMILY",
	"FAN",
	"FANS",
	"FARM",
	"FASHION",
	"FAST",
	"FEEDBACK",
	"FERRERO",
	"FI",
	"FILM",
	"FINAL",
	"FINANCE",
	"FINANCIAL",
	"FIRESTONE",
	"FIRMDALE",
	"FISH",
	"FISHING",
	"FIT",
	"FITNESS",
	"FJ",
	"FK",
	"FLICKR",
	"FLIGHTS",
	"FLORIST",
	"FLOWERS",
	"FLSMIDTH",
	"FLY",
	"FM",
	"FO",
	"FOO",
	"FOOTBALL",
	"FORD",
	"FOREX",
	"FORSALE",
	"FORUM",
	"FOUNDATION",
	"FOX",
	"FR",
	"FRESENIUS",
	"FRL",
	"FROGANS",
	"FRONTIER",
	"FTR",
	"FUND",
	"FURNITURE",
	"FUTBOL",
	"FYI",
	"GA",
	"GAL",
	"GALLERY",
	"GALLO",
	"GALLUP",
	"GAME",
	"GARDEN",
	"GB",
	"GBIZ",
	"GD",
	"GDN",
	"GE",
	"GEA",
	"GENT",
	"GENTING",
	"GF",
	"GG",
	"GGEE",
	"GH",
	"GI",
	"GIFT",
	"GIFTS",
	"GIVES",
	"GIVING",
	"GL",
	"GLASS",
	"GLE",
	"GLOBAL",
	"GLOBO",
	"GM",
	"GMAIL",
	"GMBH",
	"GMO",
	"GMX",
	"GN",
	"GOLD",
	"GOLDPOINT",
	"GOLF",
	"GOO",
	"GOOG",
	"GOOGLE",
	"GOP",
	"GOT",
	"GOV",
	"GP",
	"GQ",
	"GR",
	"GRAINGER",
	"GRAPHICS",
	"GRATIS",
	"GREEN",
	"GRIPE",
	"GROUP",
	"GS",
	"GT",
	"GU",
	"GUCCI",
	"GUGE",
	"GUIDE",
	"GUITARS",
	"GURU",
	"GW",
	"GY",
	"HAMBURG",
	"HANGOUT",
	"HAUS",
	"HDFCBANK",
	"HEALTH",
	"HEALTHCARE",
	"HELP",
	"HELSINKI",
	"HERE",
	"HERMES",
	"HIPHOP",
	"HITACHI",
	"HIV",
	"HK",
	"HM",
	"HN",
	"HOCKEY",
	"HOLDINGS",
	"HOLIDAY",
	"HOMEDEPOT",
	"HOMES",
	"HONDA",
	"HORSE",
	"HOST",
	"HOSTING",
	"HOTELES",
	"HOTMAIL",
	"HOUSE",
	"HOW",
	"HR",
	"HSBC",
	"HT",
	"HTC",
	"HU",
	"HYUNDAI",
	"IBM",
	"ICBC",
	"ICE",
	"ICU",
	"ID",
	"IE",
	"IFM",
	"IINET",
	"IL",
	"IM",
	"IMAMAT",
	"IMMO",
	"IMMOBILIEN",
	"IN",
	"INDUSTRIES",
	"INFINITI",
	"INFO",
	"ING",
	"INK",
	"INSTITUTE",
	"INSURANCE",
	"INSURE",
	"INT",
	"INTERNATIONAL",
	"INVESTMENTS",
	"IO",
	"IPIRANGA",
	"IQ",
	"IR",
	"IRISH",
	"IS",
	"ISELECT",
	"ISMAILI",
	"IST",
	"ISTANBUL",
	"IT",
	"ITAU",
	"IWC",
	"JAGUAR",
	"JAVA",
	"JCB",
	"JCP",
	"JE",
	"JETZT",
	"JEWELRY",
	"JLC",
	"JLL",
	"JM",
	"JMP",
	"JNJ",
	"JO",
	"JOBS",
	"JOBURG",
	"JOT",
	"JOY",
	"JP",
	"JPMORGAN",
	"JPRS",
	"JUEGOS",
	"KAUFEN",
	"KDDI",
	"KE",
	"KERRYHOTELS",
	"KERRYLOGISTICS",
	"KERRYPROPERTIES",
	"KFH",
	"KG",
	"KH",
	"KI",
	"KIA",
	"KIM",
	"KINDER",
	"KITCHEN",
	"KIWI",
	"KM",
	"KN",
	"KOELN",
	"KOMATSU",
	"KP",
	"KPMG",
	"KPN",
	"KR",
	"KRD",
	"KRED",
	"KUOKGROUP",
	"KW",
	"KY",
	"KYOTO",
	"KZ",
	"LA",
	"LACAIXA",
	"LAMBORGHINI",
	"LAMER",
	"LANCASTER",
	"LAND",
	"LANDROVER",
	"LANXESS",
	"LASALLE",
	"LAT",
	"LATROBE",
	"LAW",
	"LAWYER",
	"LB",
	"LC",
	"LDS",
	"LEASE",
	"LECLERC",
	"LEGAL",
	"LEXUS",
	"LGBT",
	"LI",
	"LIAISON",
	"LIDL",
	"LIFE",
	"LIFEINSURANCE",
	"LIFESTYLE",
	"LIGHTING",
	"LIKE",
	"LIMITED",
	"LIMO",
	"LINCOLN",
	"LINDE",
	"LINK",
	"LIPSY",
	"LIVE",
	"LIVING",
	"LIXIL",
	"LK",
	"LOAN",
	"LOANS",
	"LOCUS",
	"LOL",
	"LONDON",
	"LOTTE",
	"LOTTO",
	"LOVE",
	"LR",
	"LS",
	"LT",
	"LTD",
	"LTDA",
	"LU",
	"LUPIN",
	"LUXE",
	"LUXURY",
	"LV",
	"LY",
	"MA",
	"MADRID",
	"MAIF",
	"MAISON",
	"MAKEUP",
	"MAN",
	"MANAGEMENT",
	"MANGO",
	"MARKET",
	"MARKETING",
	"MARKETS",
	"MARRIOTT",
	"MBA",
	"MC",
	"MD",
	"ME",
	"MED",
	"MEDIA",
	"MEET",
	"MELBOURNE",
	"MEME",
	"MEMORIAL",
	"MEN",
	"MENU",
	"MEO",
	"MG",
	"MH",
	"MIAMI",
	"MICROSOFT",
	"MIL",
	"MINI",
	"MK",
	"ML",
	"MLS",
	"MM",
	"MMA",
	"MN",
	"MO",
	"MOBI",
	"MOBILY",
	"MODA",
	"MOE",
	"MOI",
	"MOM",
	"MONASH",
	"MONEY",
	"MONTBLANC",
	"MORMON",
	"MORTGAGE",
	"MOSCOW",
	"MOTORCYCLES",
	"MOV",
	"MOVIE",
	"MOVISTAR",
	"MP",
	"MQ",
	"MR",
	"MS",
	"MT",
	"MTN",
	"MTPC",
	"MTR",
	"MU",
	"MUSEUM",
	"MUTUAL",
	"MUTUELLE",
	"MV",
	"MW",
	"MX",
	"MY",
	"MZ",
	"NA",
	"NADEX",
	"NAGOYA",
	"NAME",
	"NATURA",
	"NAVY",
	"NC",
	"NE",
	"NEC",
	"NET",
	"NETBANK",
	"NETWORK",
	"NEUSTAR",
	"NEW",
	"NEWS",
	"NEXT",
	"NEXTDIRECT",
	"NEXUS",
	"NF",
	"NG",
	"NGO",
	"NHK",
	"NI",
	"NICO",
	"NIKON",
	"NINJA",
	"NISSAN",
	"NISSAY",
	"NL",
	"NO",
	"NOKIA",
	"NORTHWESTERNMUTUAL",
	"NORTON",
	"NOWRUZ",
	"NP",
	"NR",
	"NRA",
	"NRW",
	"NTT",
	"NU",
	"NYC",
	"NZ",
	"OBI",
	"OFFICE",
	"OKINAWA",
	"OLAYAN",
	"OM",
	"OMEGA",
	"ONE",
	"ONG",
	"ONL",
	"ONLINE",
	"OOO",
	"ORACLE",
	"ORANGE",
	"ORG",
	"ORGANIC",
	"ORIGINS",
	"OSAKA",
	"OTSUKA",
	"OVH",
	"PA",
	"PAGE",
	"PAMPEREDCHEF",
	"PANERAI",
	"PARIS",
	"PARS",
	"PARTNERS",
	"PARTS",
	"PARTY",
	"PASSAGENS",
	"PE",
	"PET",
	"PF",
	"PG",
	"PH",
	"PHARMACY",
	"PHILIPS",
	"PHOTO",
	"PHOTOGRAPHY",
	"PHOTOS",
	"PHYSIO",
	"PIAGET",
	"PICS",
	"PICTET",
	"PICTURES",
	"PID",
	"PIN",
	"PING",
	"PINK",
	"PIZZA",
	"PK",
	"PL",
	"PLACE",
	"PLAY",
	"PLAYSTATION",
	"PLUMBING",
	"PLUS",
	"PM",
	"PN",
	"POHL",
	"POKER",
	"PORN",
	"POST",
	"PR",
	"PRAXI",
	"PRESS",
	"PRO",
	"PROD",
	"PRODUCTIONS",
	"PROF",
	"PROGRESSIVE",
	"PROMO",
	"PROPERTIES",
	"PROPERTY",
	"PROTECTION",
	"PS",
	"PT",
	"PUB",
	"PW",
	"PWC",
	"PY",
	"QA",
	"QPON",
	"QUEBEC",
	"QUEST",
	"RACING",
	"RE",
	"READ",
	"REALTOR",
	"REALTY",
	"RECIPES",
	"RED",
	"REDSTONE",
	"REDUMBRELLA",
	"REHAB",
	"REISE",
	"REISEN",
	"REIT",
	"REN",
	"RENT",
	"RENTALS",
	"REPAIR",
	"REPORT",
	"REPUBLICAN",
	"REST",
	"RESTAURANT",
	"REVIEW",
	"REVIEWS",
	"REXROTH",
	"RICH",
	"RICOH",
	"RIO",
	"RIP",
	"RO",
	"ROCHER",
	"ROCKS",
	"RODEO",
	"ROOM",
	"RS",
	"RSVP",
	"RU",
	"RUHR",
	"RUN",
	"RW",
	"RWE",
	"RYUKYU",
	"SA",
	"SAARLAND",
	"SAFE",
	"SAFETY",
	"SAKURA",
	"SALE",
	"SALON",
	"SAMSUNG",
	"SANDVIK",
	"SANDVIKCOROMANT",
	"SANOFI",
	"SAP",
	"SAPO",
	"SARL",
	"SAS",
	"SAXO",
	"SB",
	"SBI",
	"SBS",
	"SC",
	"SCA",
	"SCB",
	"SCHAEFFLER",
	"SCHMIDT",
	"SCHOLARSHIPS",
	"SCHOOL",
	"SCHULE",
	"SCHWARZ",
	"SCIENCE",
	"SCOR",
	"SCOT",
	"SD",
	"SE",
	"SEAT",
	"SECURITY",
	"SEEK",
	"SELECT",
	"SENER",
	"SERVICES",
	"SEVEN",
	"SEW",
	"SEX",
	"SEXY",
	"SFR",
	"SG",
	"SH",
	"SHARP",
	"SHAW",
	"SHELL",
	"SHIA",
	"SHIKSHA",
	"SHOES",
	"SHOUJI",
	"SHOW",
	"SHRIRAM",
	"SI",
	"SINA",
	"SINGLES",
	"SITE",
	"SJ",
	"SK",
	"SKI",
	"SKIN",
	"SKY",
	"SKYPE",
	"SL",
	"SM",
	"SMILE",
	"SN",
	"SNCF",
	"SO",
	"SOCCER",
	"SOCIAL",
	"SOFTBANK",
	"SOFTWARE",
	"SOHU",
	"SOLAR",
	"SOLUTIONS",
	"SONG",
	"SONY",
	"SOY",
	"SPACE",
	"SPIEGEL",
	"SPOT",
	"SPREADBETTING",
	"SR",
	"SRL",
	"ST",
	"STADA",
	"STAR",
	"STARHUB",
	"STATEBANK",
	"STATEFARM",
	"STATOIL",
	"STC",
	"STCGROUP",
	"STOCKHOLM",
	"STORAGE",
	"STORE",
	"STREAM",
	"STUDIO",
	"STUDY",
	"STYLE",
	"SU",
	"SUCKS",
	"SUPPLIES",
	"SUPPLY",
	"SUPPORT",
	"SURF",
	"SURGERY",
	"SUZUKI",
	"SV",
	"SWATCH",
	"SWISS",
	"SX",
	"SY",
	"SYDNEY",
	"SYMANTEC",
	"SYSTEMS",
	"SZ",
	"TAB",
	"TAIPEI",
	"TALK",
	"TAOBAO",
	"TATAMOTORS",
	"TATAR",
	"TATTOO",
	"TAX",
	"TAXI",
	"TC",
	"TCI",
	"TD",
	"TEAM",
	"TECH",
	"TECHNOLOGY",
	"TEL",
	"TELECITY",
	"TELEFONICA",
	"TEMASEK",
	"TENNIS",
	"TEVA",
	"TF",
	"TG",
	"TH",
	"THD",
	"THEATER",
	"THEATRE",
	"TICKETS",
	"TIENDA",
	"TIFFANY",
	"TIPS",
	"TIRES",
	"TIROL",
	"TJ",
	"TK",
	"TL",
	"TM",
	"TMALL",
	"TN",
	"TO",
	"TODAY",
	"TOKYO",
	"TOOLS",
	"TOP",
	"TORAY",
	"TOSHIBA",
	"TOTAL",
	"TOURS",
	"TOWN",
	"TOYOTA",
	"TOYS",
	"TR",
	"TRADE",
	"TRADING",
	"TRAINING",
	"TRAVEL",
	"TRAVELERS",
	"TRAVELERSINSURANCE",
	"TRUST",
	"TRV",
	"TT",
	"TUBE",
	"TUI",
	"TUNES",
	"TUSHU",
	"TV",
	"TVS",
	"TW",
	"TZ",
	"UA",
	"UBS",
	"UG",
	"UK",
	"UNICOM",
	"UNIVERSITY",
	"UNO",
	"UOL",
	"US",
	"UY",
	"UZ",
	"VA",
	"VACATIONS",
	"VANA",
	"VC",
	"VE",
	"VEGAS",
	"VENTURES",
	"VERISIGN",
	"VERSICHERUNG",
	"VET",
	"VG",
	"VI",
	"VIAJES",
	"VIDEO",
	"VIG",
	"VIKING",
	"VILLAS",
	"VIN",
	"VIP",
	"VIRGIN",
	"VISION",
	"VISTA",
	"VISTAPRINT",
	"VIVA",
	"VLAANDEREN",
	"VN",
	"VODKA",
	"VOLKSWAGEN",
	"VOTE",
	"VOTING",
	"VOTO",
	"VOYAGE",
	"VU",
	"VUELOS",
	"WALES",
	"WALTER",
	"WANG",
	"WANGGOU",
	"WARMAN",
	"WATCH",
	"WATCHES",
	"WEATHER",
	"WEATHERCHANNEL",
	"WEBCAM",
	"WEBER",
	"WEBSITE",
	"WED",
	"WEDDING",
	"WEIBO",
	"WEIR",
	"WF",
	"WHOSWHO",
	"WIEN",
	"WIKI",
	"WILLIAMHILL",
	"WIN",
	"WINDOWS",
	"WINE",
	"WME",
	"WOLTERSKLUWER",
	"WORK",
	"WORKS",
	"WORLD",
	"WS",
	"WTC",
	"WTF",
	"XBOX",
	"XEROX",
	"XIHUAN",
	"XIN",
	"XN--11B4C3D",
	"XN--1CK2E1B",
	"XN--1QQW23A",
	"XN--30RR7Y",
	"XN--3BST00M",
	"XN--3DS443G",
	"XN--3E0B707E",
	"XN--3PXU8K",
	"XN--42C2D9A",
	"XN--45BRJ9C",
	"XN--45Q11C",
	"XN--4GBRIM",
	"XN--55QW42G",
	"XN--55QX5D",
	"XN--5TZM5G",
	"XN--6FRZ82G",
	"XN--6QQ986B3XL",
	"XN--80ADXHKS",
	"XN--80AO21A",
	"XN--80ASEHDB",
	"XN--80ASWG",
	"XN--8Y0A063A",
	"XN--90A3AC",
	"XN--90AIS",
	"XN--9DBQ2A",
	"XN--9ET52U",
	"XN--9KRT00A",
	"XN--B4W605FERD",
	"XN--BCK1B9A5DRE4C",
	"XN--C1AVG",
	"XN--C2BR7G",
	"XN--CCK2B3B",
	"XN--CG4BKI",
	"XN--CLCHC0EA0B2G2A9GCD",
	"XN--CZR694B",
	"XN--CZRS0T",
	"XN--CZRU2D",
	"XN--D1ACJ3B",
	"XN--D1ALF",
	"XN--E1A4C",
	"XN--ECKVDTC9D",
	"XN--EFVY88H",
	"XN--ESTV75G",
	"XN--FCT429K",
	"XN--FHBEI",
	"XN--FIQ228C5HS",
	"XN--FIQ64B",
	"XN--FIQS8S",
	"XN--FIQZ9S",
	"XN--FJQ720A",
	"XN--FLW351E",
	"XN--FPCRJ9C3D",
	"XN--FZC2C9E2C",
	"XN--G2XX48C",
	"XN--GCKR3F0F",
	"XN--GECRJ9C",
	"XN--H2BRJ9C",
	"XN--HXT814E",
	"XN--I1B6B1A6A2E",
	"XN--IMR513N",
	"XN--IO0A7I",
	"XN--J1AEF",
	"XN--J1AMH",
	"XN--J6W193G",
	"XN--JLQ61U9W7B",
	"XN--JVR189M",
	"XN--KCRX77D1X4A",
	"XN--KPRW13D",
	"XN--KPRY57D",
	"XN--KPU716F",
	"XN--KPUT3I",
	"XN--L1ACC",
	"XN--LGBBAT1AD8J",
	"XN--MGB9AWBF",
	"XN--MGBA3A3EJT",
	"XN--MGBA3A4F16A",
	"XN--MGBA7C0BBN0A",
	"XN--MGBAAM7A8H",
	"XN--MGBAB2BD",
	"XN--MGBAYH7GPA",
	"XN--MGBB9FBPOB",
	"XN--MGBBH1A71E",
	"XN--MGBC0A9AZCG",
	"XN--MGBCA7DZDO",
	"XN--MGBERP4A5D4AR",
	"XN--MGBPL2FH",
	"XN--MGBT3DHD",
	"XN--MGBTX2B",
	"XN--MGBX4CD0AB",
	"XN--MIX891F",
	"XN--MK1BU44C",
	"XN--MXTQ1M",
	"XN--NGBC5AZD",
	"XN--NGBE9E0A",
	"XN--NODE",
	"XN--NQV7F",
	"XN--NQV7FS00EMA",
	"XN--NYQY26A",
	"XN--O3CW4H",
	"XN--OGBPF8FL",
	"XN--P1ACF",
	"XN--P1AI",
	"XN--PBT977C",
	"XN--PGBS0DH",
	"XN--PSSY2U",
	"XN--Q9JYB4C",
	"XN--QCKA1PMC",
	"XN--QXAM",
	"XN--RHQV96G",
	"XN--ROVU88B",
	"XN--S9BRJ9C",
	"XN--SES554G",
	"XN--T60B56A",
	"XN--TCKWE",
	"XN--UNUP4Y",
	"XN--VERMGENSBERATER-CTB",
	"XN--VERMGENSBERATUNG-PWB",
	"XN--VHQUV",
	"XN--VUQ861B",
	"XN--W4R85EL8FHU5DNRA",
	"XN--WGBH1C",
	"XN--WGBL6A",
	"XN--XHQ521B",
	"XN--XKC2AL3HYE2A",
	"XN--XKC2DL3A5EE0H",
	"XN--Y9A3AQ",
	"XN--YFRO4I67O",
	"XN--YGBI2AMMX",
	"XN--ZFR164B",
	"XPERIA",
	"XXX",
	"XYZ",
	"YACHTS",
	"YAHOO",
	"YAMAXUN",
	"YANDEX",
	"YE",
	"YODOBASHI",
	"YOGA",
	"YOKOHAMA",
	"YOU",
	"YOUTUBE",
	"YT",
	"YUN",
	"ZA",
	"ZARA",
	"ZERO",
	"ZIP",
	"ZM",
	"ZONE",
	"ZUERICH",
	"ZW",


	// second-level suffixes; these are the entries the lookup below
	// can actually match
	"AB.CA",
	"AC.AE",
	"AC.AT",
	"AC.CN",
	"AC.CR",
	"AC.CY",
	"AC.FJ",
	"AC.GG",
	"AC.ID",
	"AC.IL",
	"AC.IM",
	"AC.IN",
	"AC.JE",
	"AC.JP",
	"AC.KR",
	"AC.NZ",
	"AC.PA",
	"AC.TH",
	"AC.UG",
	"AC.UK",
	"AC.YU",
	"AC.ZA",
	"AD.JP",
	"AH.CN",
	"ALDERNEY.GG",
	"ALT.ZA",
	"ART.BR",
	"ART.DO",
	"ARTS.CO",
	"ARTS.VE",
	"ASN.AU",
	"ASN.LV",
	"BBS.TR",
	"BC.CA",
	"BIB.VE",
	"BJ.CN",
	"CO.AT",
	"CO.AO",
	"CO.CK",
	"CO.CR",
	"CO.GG",
	"CO.HU",
	"CO.ID",
	"CO.IL",
	"CO.IM",
	"CO.IN",
	"CO.JE",
	"CO.JP",
	"CO.KR",
	"COM.AR",
	"COM.AU",
	"COM.AZ",
	"COM.BB",
	"COM.BM",
	"COM.BR",
	"COM.BS",
	"COM.CN",
	"COM.CO",
	"COM.CU",
	"COM.CY",
	"COM.DO",
	"COM.EC",
	"COM.EG",
	"COM.FJ",
	"COM.GE",
	"COM.GU",
	"COM.HK",
	"COM.JO",
	"COM.KH",
	"COM.LA",
	"COM.LB",
	"COM.LC",
	"COM.LV",
	"COM.LY",
	"COM.MM",
	"COM.MO",
	"COM.MT",
	"COM.MX",
	"COM.MY",
	"COM.NA",
	"COM.NC",
	"COM.NI",
	"COM.NP",
	"COM.PA",
	"COM.PE",
	"COM.PH",
	"COM.PL",
	"COM.PY",
	"COM.RU",
	"COM.SG",
	"COM.SH",
	"COM.SY",
	"COM.TN",
	"COM.TR",
	"COM.TW",
	"COM.UA",
	"COM.UY",
	"COM.VE",
	"CONF.AU",
	"CONF.LV",
	"CO.NZ",
	"COOP",
	"CO.AE",
	"CO.SV",
	"CO.TH",
	"CO.UG",
	"CO.UK",
	"CO.VE",
	"CO.VI",
	"CO.YU",
	"CO.ZA",
	"CQ.CN",
	"CSIRO.AU",
	"ED.CR",
	"EDU.BM",
	"EDU.AR",
	"EDU.CN",
	"EDU.CO",
	"EDU.DO",
	"EDU.EC",
	"EDU.EG",
	"EDU.GE",
	"EDU.GU",
	"EDU.JO",
	"EDU.LC",
	"EDU.LV",
	"EDU.MM",
	"EDU.MO",
	"EDU.MY",
	"EDUNET.TN",
	"EDU.PA",
	"EDU.PY",
	"EDU.SG",
	"EDU.SH",
	"EDU.TR",
	"EDU.TW",
	"EDU.UY",
	"EDU.VE",
	"EDU.YU",
	"EDU.ZA",
	"ENS.TN",
	"ERNET.IN",
	"ESP.BR",
	"ETC.BR",
	"EUN.EG",
	"FI.CR",
	"FIN.EC",
	"FIN.TN",
	"FIRM.CO",
	"FIRM.VE",
	"G12.BR",
	"GD.CN",
	"GEN.NZ",
	"GOB.PA",
	"GO.CR",
	"GO.ID",
	"GO.KR",
	"GO.TH",
	"GO.UG",
	"GOV.AE",
	"GOV.AR",
	"GOV.AU",
	"GOV.BM",
	"GOV.BR",
	"GOV.CN",
	"GOV.CO",
	"GOV.CY",
	"GOV.DO",
	"GOV.EC",
	"GOV.EG",
	"GOVE.TW",
	"GOV.FJ",
	"GOV.GE",
	"GOV.GG",
	"GOV.GU",
	"GOV.IL",
	"GOV.IM",
	"GOV.IN",
	"GOV.JE",
	"GOV.JO",
	"GOV.JP",
	"GOV.LB",
	"GOV.LC",
	"GOV.LV",
	"GOV.MM",
	"GOV.MO",
	"GOV.MY",
	"GOV.SG",
	"GOV.SH",
	"GOV.TN",
	"GOVT.NZ",
	"GOV.TR",
	"GOV.UA",
	"GOV.UK",
	"GOV.VE",
	"GOV.ZA",
	"GS.CN",
	"GUERNSEY.GG",
	"GX.CN",
	"GZ.CN",
	"HB.CN",
	"HE.CN",
	"HI.CN",
	"HK.CN",
	"HL.CN",
	"HN.CN",
	"ID.AU",
	"ID.FJ",
	"ID.LV",
	"IND.BR",
	"IND.GG",
	"IND.JE",
	"IND.TN",
	"INF.BR",
	"INFO.AU",
	"INFO.CO",
	"INFO.HU",
	"INFO.TN",
	"INFO.VE",
	"INT.CO",
	"INTL.TN",
	"INT.VE",
	"JERSEY.JE",
	"JL.CN",
	"JS.CN",
	"K12.EC",
	"K12.IL",
	"K12.TR",
	"LKD.CO.IM",
	"LN.CN",
	"LTD.GG",
	"LTD.JE",
	"LTD.UK",
	"MB.CA",
	"MED.EC",
	"MIL.BR",
	"MIL.CO",
	"MIL.DO",
	"MIL.EC",
	"MIL.GE",
	"MIL.GU",
	"MIL.ID",
	"MIL.LB",
	"MIL.LV",
	"MIL.PH",
	"MIL.SH",
	"MIL.TR",
	"MIL.VE",
	"MIL.ZA",
	"MO.CN",
	"MOD.UK",
	"MUNI.IL",
	"MUSEUM",
	"NAME",
	"NAT.TN",
	"NB.CA",
	"NET.AR",
	"NET.AU",
	"NET.AZ",
	"NET.BB",
	"NET.BM",
	"NET.BR",
	"NET.BS",
	"NET.CN",
	"NET.CU",
	"NET.CY",
	"NET.DO",
	"NET.EC",
	"NET.EG",
	"NET.GE",
	"NET.GG",
	"NET.GU",
	"NET.HK",
	"NET.ID",
	"NET.IL",
	"NET.IM",
	"NET.IN",
	"NET.JE",
	"NET.JO",
	"NET.JP",
	"NET.KH",
	"NET.LA",
	"NET.LB",
	"NET.LC",
	"NET.LV",
	"NET.LY",
	"NET.MM",
	"NET.MO",
	"NET.MT",
	"NET.MX",
	"NET.MY",
	"NET.NA",
	"NET.NC",
	"NET.NP",
	"NET.NZ",
	"NET.PA",
	"NET.PE",
	"NET.PH",
	"NET.PL",
	"NET.PY",
	"NET.RU",
	"NET.SG",
	"NET.SH",
	"NET.SY",
	"NET.TH",
	"NET.TN",
	"NET.TR",
	"NET.TW",
	"NET.UA",
	"NET.UK",
	"NET.UY",
	"NET.VE",
	"NET.VI",
	"NET.ZA",
	"NF.CA",
	"NGO.PH",
	"NGO.ZA",
	"NHS.UK",
	"NIC.IM",
	"NIC.IN",
	"NM.CN",
	"NM.KR",
	"NOM.CO",
	"NOM.VE",
	"NOM.ZA",
	"NS.CA",
	"NSK.SU",
	"NT.CA",
	"NUI.HU",
	"NX.CN",
	"ON.CA",
	"OR.CR",
	"ORG.AE",
	"ORG.AR",
	"ORG.AU",
	"ORG.AZ",
	"ORG.BB",
	"ORG.BM",
	"ORG.BR",
	"ORG.BS",
	"ORG.CN",
	"ORG.CO",
	"ORG.CU",
	"ORG.CY",
	"ORG.DO",
	"ORG.EC",
	"ORG.EG",
	"ORG.FJ",
	"ORG.GE",
	"ORG.GG",
	"ORG.GU",
	"ORG.HK",
	"ORG.HU",
	"ORG.IL",
	"ORG.IM",
	"ORG.JE",
	"ORG.JP",
	"ORG.KH",
	"ORG.LA",
	"ORG.LB",
	"ORG.LC",
	"ORG.LV",
	"ORG.LY",
	"ORG.MM",
	"ORG.MO",
	"ORG.MT",
	"ORG.MX",
	"ORG.MY",
	"ORG.NA",
	"ORG.NC",
	"ORG.NZ",
	"ORG.PA",
	"ORG.PE",
	"ORG.PH",
	"ORG.PL",
	"ORG.PY",
	"ORG.RU",
	"ORG.SG",
	"ORG.SH",
	"ORG.SY",
	"ORG.TN",
	"ORG.TR",
	"ORG.TW",
	"ORG.UK",
	"ORG.UY",
	"ORG.VE",
	"ORG.VI",
	"ORG.YU",
	"ORG.ZA",
	"OR.ID",
	"OR.KR",
	"OR.TH",
	"ORT.NP",
	"OR.UG",
	"OZ.AU",
	"PE.CA",
	"PLC.CO.IM",
	"PLC.UK",
	"POLICE.UK",
	"PRIV.HU",
	"PSI.BR",
	"PVT.GE",
	"QC.CA",
	"QH.CN",
	"REC.BR",
	"REC.CO",
	"REC.VE",
	"RE.KR",
	"RES.IN",
	"RNRT.TN",
	"RNS.TN",
	"RNU.TN",
	"SA.CR",
	"SARK.GG",
	"SC.CN",
	"SCH.GG",
	"SCH.JE",
	"SCHOOL.FJ",
	"SCHOOL.ZA",
	"SCH.UK",
	"SCI.EG",
	"SH.CN",
	"SK.CA",
	"SLD.PA",
	"SN.CN",
	"STORE.CO",
	"STORE.VE",
	"SX.CN",
	"TEC.VE",
	"TELEMEMO.AU",
	"TJ.CN",
	"TM.HU",
	"TMP.BR",
	"TM.ZA",
	"TOURISM.TN",
	"TW.CN",
	"WEB.CO",
	"WEB.DO",
	"WEB.VE",
	"WEB.ZA",
	"XJ.CN",
	"XZ.CN",
	"YK.CA",
	"YN.CN",
	"ZJ.CN"
};

	// load the table the first time we need it
	if ( ! s_isInitialized ) {
		// . set up the hash table
		// . keys are the 8-byte hashes, there is no data
		if ( ! s_table.set ( 8 , 0, sizeof(s_tlds)*2,NULL,0,false,0,
				     "tldtbl") ) 
			return log("build: Could not init table of TLDs.");
		// now add in all the tlds
		int32_t n = (int32_t)sizeof(s_tlds)/ sizeof(char *); 
		for ( int32_t i = 0 ; i < n ; i++ ) {
			const char      *d    = s_tlds[i];
			int32_t       dlen = gbstrlen ( d );
			int64_t  dh   = hash64Lower_a ( d , dlen );
			if ( ! s_table.addKey (&dh,NULL) )
				return log("build: dom table failed");
		}
		s_isInitialized = true;
	} 
	// hash the candidate the same way the table entries were hashed
	int64_t h = hash64Lower_a ( tld , tldLen ); // gbstrlen(tld));
	return s_table.isInTable ( &h );//getScoreFromTermId ( h );
}
// . launch fetches for the next batch of urls, keeping at most
//   s_maxNumThreads requests outstanding
// . if s_server is set, each url is s_server followed by random words and
//   s_append. otherwise urls are read from the NUL-separated list between
//   s_p and s_pend, each with s_append tacked on if it fits
// . logs throughput every 20 urls
// . calls exit(0) once we are not in server mode and the url list is used up
// . does nothing if called again within s_wait ms of the last launch round
void startSpidering ( ) {
	// url class for parsing/normalizing url
	Url u;
	// time of the very first call, the baseline for the throughput stats
	static long long s_startTime = 0;
	// set startTime
	if ( s_startTime == 0 ) s_startTime = gettimeofdayInMilliseconds();
	// get time now
	long long now = gettimeofdayInMilliseconds();
	// elapsed time to do all urls, in seconds
	double took = (double)(now - s_startTime) / 1000.0 ;
	// urls per second. "took" is 0 on the very first call so do not
	// divide by it blindly
	double rate = 0.0;
	if ( took > 0.0 ) rate = ((double)s_total) / took;
	// log this every 20 urls
	if ( s_printIt && s_total > 0 && ( s_total % 20 ) == 0 ) {
		logf(LOG_INFO,"did %li urls in %f seconds. %f urls per second."
		    " threads now = %li.",
		    s_total ,  took , rate , s_launched);
		s_printIt = false;
	}
	// did we wait long enough?
	if ( now - s_lastTime < s_wait ) return;
	s_lastTime = now;
	// . use HttpServer.getDoc() to fetch it
	// . fetch X at a time
	while ( (s_server || s_p < s_pend) && s_launched < s_maxNumThreads ) {
		// clear any error
		g_errno = 0;
		// the url we build: base + optional random words + s_append
		char url[MAX_URL_LEN];
		char *p = url;
		char *pend = url + MAX_URL_LEN;
		// state handed to the callback
		char *t = NULL;

		if(s_server) {
			long len = gbstrlen(s_server);
			// never copy more than the buffer holds, leaving
			// room for the terminating NUL
			if ( len > MAX_URL_LEN - 1 ) len = MAX_URL_LEN - 1;
			memcpy ( p, s_server, len);
			p += len;
			p += getRandomWords(p, pend, s_numRandWords);
			long appendLen = gbstrlen(s_append);
			// only append if it fits along with the NUL
			if ( p + appendLen < pend ) {
				memcpy ( p, s_append, appendLen );
				p += appendLen;
			}
			// make sure the NUL itself lands inside the buffer
			if ( p >= pend ) p = pend - 1;
			*p++ = '\0';
			// NOTE(review): the length passed here counts the
			// terminating NUL, unlike the url-list branch below.
			// kept as is -- confirm against Url::set().
			u.set ( url , p - url);
			t = g_mem.strdup(url, "saved url");
		}
		else {
			long urlLen    = gbstrlen ( s_p );
			long appendLen = gbstrlen ( s_append );
			// save s_p, it is the callback state
			t = s_p;
			// skip to next url
			s_p += urlLen + 1;
			// a url that does not fit would overflow url[], so
			// skip it rather than smash the stack
			if ( urlLen >= MAX_URL_LEN ) {
				log("blaster: skipping url of %li bytes, "
				    "max is %li.",
				    urlLen,(long)MAX_URL_LEN-1);
				continue;
			}
			memcpy ( p, t, urlLen );
			p += urlLen;
			// . append only if it fits
			// . only advance p when we actually appended,
			//   otherwise the NUL below would land beyond bytes
			//   we never wrote, possibly beyond the buffer
			if ( urlLen + appendLen < MAX_URL_LEN ) {
				memcpy ( p, s_append, appendLen );
				p += appendLen;
			}
			//null end
			*p ='\0';

			// make into a url class
			u.set ( url , gbstrlen(url) );
			// set port if port switch is true
			//if ( s_portSwitch ) {
			//	long r = rand() % 32;
			//	u.setPort ( 8000 + r );
			//}
		}
		// count it
		s_launched++;
		// get it
		bool status = g_httpServer.getDoc ( &u , // url
						    0 ,  // offset
						    -1 ,  // size
						    0 , // ifModifiedSince
						    (void *)t ,  // state
						    gotDocWrapper, // callback
						    20*1000, // timeout
						    0, // proxy ip
						    0, // proxy port
						    30*1024*1024, //maxLen
						    30*1024*1024);//maxOtherLen
		// continue if it blocked
		if ( ! status ) continue;
		// otherwise, got it right away
		s_launched--;
		// log msg
		log("got doc1 %s: %s", u.getUrl() , mstrerror(g_errno) );
		// we gotta wait
		break;
	}
	// bail if not done yet
	// NOTE(review): with the s_launched check disabled we exit below
	// even if requests are still outstanding -- confirm that is intended
	//if ( s_launched > 0 ) return;
	if ( s_server || s_p < s_pend ) return;
	// otherwise, we're all done
	logf(LOG_INFO,"blaster: did %li urls in %f seconds. %f urls per "
	     "second.",
	    s_total ,  took , rate );
	// exit now
	exit ( 0 );
}
// . send the reply for an injection request and free the Msg7 state
// . "state" is the Msg7 allocated for this request. it is ALWAYS deleted
//   here, on every path, so the caller must not touch it afterwards
// . the reply format comes from the http request (gr->m_hr):
//   - on error (other than EDOCUNCHANGED) an http error reply is sent
//   - FORMAT_XML / FORMAT_JSON get a small status document
//   - if a url was given and gr->m_shortReply is set, a one-line
//     comma-separated status is sent
//   - otherwise the html admin page with the injection parms is rendered
// . returns whatever the HttpServer send function returns
bool sendReply ( void *state ) {
	// get the state properly
	Msg7 *msg7= (Msg7 *) state;

	GigablastRequest *gr = &msg7->m_gr;

	// extract info from state
	TcpSocket *sock = gr->m_socket;

	XmlDoc *xd = &msg7->m_xd;
	// log it
	//if ( msg7->m_url[0] ) xd->logIt();

	// msg7 has the docid for what we injected, iff g_errno is not set
	//long long docId  = msg7->m_msg7.m_docId;
	//long      hostId = msg7->m_msg7.m_hostId;
	long long docId  = xd->m_docId;
	long      hostId = 0;//msg7->m_msg7.m_hostId;

	// set g_errno to index code
	if ( xd->m_indexCodeValid && xd->m_indexCode && ! g_errno )
		g_errno = xd->m_indexCode;

	char format = gr->m_hr.getReplyFormat();

	// no url parm? that is only ok for html, which just shows the form
	if ( ! g_errno && ! gr->m_url && format != FORMAT_HTML )
		g_errno = EMISSINGINPUT;

	// EDOCUNCHANGED is not treated as a failure, it falls through to
	// the normal replies below
	if ( g_errno && g_errno != EDOCUNCHANGED ) {
		// save it in case freeing the state clobbers g_errno
		long save = g_errno;
		mdelete ( msg7, sizeof(Msg7) , "PageInject" );
		delete (msg7);
		g_errno = save;
		char *msg = mstrerror(g_errno);
		return g_httpServer.sendErrorReply(sock,save,msg,NULL);
	}

	// buffer for the xml/json reply, starts out on the stack
	char abuf[320];
	SafeBuf am(abuf,320,0,false);
	am.setLabel("injbuf");
	// content type of the xml/json reply
	char *ct = NULL;

	// NOTE(review): secBuf below is dereferenced without a NULL check --
	// confirm getInlineSectionVotingBuf() can not fail here.

	// a success reply, include docid and url i guess
	if ( format == FORMAT_XML ) {
		am.safePrintf("<response>\n");
		am.safePrintf("\t<statusCode>%li</statusCode>\n",
			      (long)g_errno);
		am.safePrintf("\t<statusMsg><![CDATA[");
		am.cdataEncode(mstrerror(g_errno));
		am.safePrintf("]]></statusMsg>\n");
		am.safePrintf("\t<docId>%lli</docId>\n",xd->m_docId);
		if ( gr->m_getSections ) {
			SafeBuf *secBuf = xd->getInlineSectionVotingBuf();
			am.safePrintf("\t<htmlSrc><![CDATA[");
			if ( secBuf->length() ) 
				am.cdataEncode(secBuf->getBufStart());
			am.safePrintf("]]></htmlSrc>\n");
		}
		am.safePrintf("</response>\n");
		ct = "text/xml";
	}

	if ( format == FORMAT_JSON ) {
		am.safePrintf("{\"response\":{\n");
		am.safePrintf("\t\"statusCode\":%li,\n",(long)g_errno);
		am.safePrintf("\t\"statusMsg\":\"");
		am.jsonEncode(mstrerror(g_errno));
		am.safePrintf("\",\n");
		am.safePrintf("\t\"docId\":%lli,\n",xd->m_docId);
		if ( gr->m_getSections ) {
			SafeBuf *secBuf = xd->getInlineSectionVotingBuf();
			am.safePrintf("\t\"htmlSrc\":\"");
			if ( secBuf->length() ) 
				am.jsonEncode(secBuf->getBufStart());
			am.safePrintf("\",\n");
		}
		// . subtract ",\n"
		// . the last member printed always ends in those two chars
		//   and json does not allow a trailing comma
		am.m_length -= 2;
		am.safePrintf("\n}\n}\n");
		ct = "application/json";
	}

	if ( format == FORMAT_XML || format == FORMAT_JSON ) {
		// "am" does not live in msg7 so it is safe to free the
		// state before sending
		// NOTE(review): g_errno may still be EDOCUNCHANGED here; the
		// html path below clears it before sending -- confirm the
		// send is unaffected.
		mdelete ( msg7, sizeof(Msg7) , "PageInject" );
		delete (msg7);
		return g_httpServer.sendDynamicPage(sock,
						    am.getBufStart(),
						    am.length(),
						    0,
						    false,
						    ct );
	}

	//
	// debug
	//

	/*
	// now get the meta list, in the process it will print out a 
	// bunch of junk into msg7->m_pbuf
	if ( xd->m_docId ) {
		char *metalist = xd->getMetaList ( 1,1,1,1,1,1 );
		if ( ! metalist || metalist==(void *)-1){char *xx=NULL;*xx=0;}
		// print it out
		SafeBuf *pbuf = &msg7->m_sbuf;
		xd->printDoc( pbuf );
		bool status = g_httpServer.sendDynamicPage( msg7->m_socket , 
							   pbuf->getBufStart(),
							    pbuf->length() ,
							    -1, //cachtime
							    false ,//postreply?
							    NULL, //ctype
							    -1 , //httpstatus
							    NULL,//cookie
							    "utf-8");
		// delete the state now
		mdelete ( st , sizeof(Msg7) , "PageInject" );
		delete (st);
		// return the status
		return status;
	}
	*/
	//
	// end debug
	//

	char *url = gr->m_url;
	
	// . if we're talking w/ a robot he doesn't care about this crap
	// . send him back the error code (0 means success)
	if ( url && gr->m_shortReply ) {
		char buf[1024*32];
		char *p = buf;
		// return docid and hostid
		if ( ! g_errno ) p += sprintf ( p , 
					   "0,docId=%lli,hostId=%li," , 
					   docId , hostId );
		// print error number here
		else  p += sprintf ( p , "%li,0,0,", (long)g_errno );
		// print error msg out, too or "Success"
		p += sprintf ( p , "%s", mstrerror(g_errno));
		mdelete ( msg7, sizeof(Msg7) , "PageInject" );
		delete (msg7);
		return g_httpServer.sendDynamicPage ( sock,buf, gbstrlen(buf) ,
						      -1/*cachetime*/);
	}

	// the html page
	SafeBuf sb;

	// print admin bar
	g_pages.printAdminTop ( &sb, sock , &gr->m_hr );

	// print a response msg if rendering the page after a submission
	if ( g_errno )
		sb.safePrintf ( "<center>Error injecting url: <b>%s[%i]</b>"
				"</center>", 
				mstrerror(g_errno) , g_errno);
	// . the <b> tag used to be left open here, bolding the rest of the
	//   page, and "Successfully" was misspelled
	// . NOTE(review): the url is printed without html escaping --
	//   confirm it can not carry markup
	else if ( (gr->m_url&&gr->m_url[0]) ||
		  (gr->m_queryToScrape&&gr->m_queryToScrape[0]) )
		sb.safePrintf ( "<center><b>Successfully injected %s</b>"
				"</center><br>"
				, xd->m_firstUrl.m_url
				);


	// print the table of injection parms
	g_parms.printParmTable ( &sb , sock , &gr->m_hr );


	// clear g_errno, if any, so our reply send goes through
	g_errno = 0;
	// calculate buffer length
	//long bufLen = p - buf;
	// nuke state. "sb" is a local so it outlives this.
	mdelete ( msg7, sizeof(Msg7) , "PageInject" );
	delete (msg7);
	// . send this page
	// . encapsulates in html header and tail
	// . make a Mime
	// . i thought we need -2 for cacheTime, but i guess not
	return g_httpServer.sendDynamicPage (sock, 
					     sb.getBufStart(),
					     sb.length(), 
					     -1/*cachetime*/);
}
// . MDW: TODO: bring this back when we have a subdir for each collection
// . add a new rec
// . returns false and sets g_errno on error
// . use a collnum_t of -1 if it is new
// . "coll" is the collection name, only alphanumerics and '-' are allowed
//   and it may be at most MAX_COLL_LEN chars
// . "cpc"/"cpclen", if given, name an existing collection whose config is
//   copied over the defaults
// . if "isNew" the directory "<g_hostdb.m_dir>coll.<coll>.<collnum>/" is
//   created and the rec saved into it, otherwise the rec is loaded from disk
// . if "isDump" we stop after the rec is set up and do not tell the rdbs
// . "saveIt" is not used by this function
bool Collectiondb::addRec ( char *coll , char *cpc , long cpclen , bool isNew ,
			    collnum_t collnum , bool isDump ,
			    bool saveIt ) {
	// sanity check
	if ( ( isNew && collnum >= 0) ||
	     (!isNew && collnum <  0) ) {
		log(LOG_LOGIC,"admin: Bad parms passed to addRec.");
		// deliberate core
		char *xx = NULL; *xx = 0;
	}
	// . ensure coll name is legit
	// . do not dereference a NULL name here, it is rejected with a
	//   proper error a little further down
	char *p = coll;
	for ( ; p && *p ; p++ ) {
		if ( is_alnum_a(*p) ) continue;
		if ( *p == '-' ) continue;
		break;
	}
	if ( p && *p ) {
		g_errno = EBADENGINEER;
		log("admin: \"%s\" is a malformed collection name because it "
		    "contains the '%c' character.",coll,*p);
		return false;
	}
	// . scan for holes
	// . i is also known as the collection id
	long i ;
	if ( collnum >= 0 ) i = (long)collnum;
	else for ( i = 0 ; i < m_numRecs ; i++ ) if ( ! m_recs[i] ) break;
	// ceiling?
	if ( i >= MAX_COLLS ) {
		g_errno = ENOBUFS;
		return log("admin: Limit of %li collection reached. "
			   "Collection not created.",(long)MAX_COLLS);
	}
	// if empty... bail, no longer accepted, use "main"
	if ( ! coll || !coll[0] ) {
		g_errno = EBADENGINEER;
		return log("admin: Trying to create a new collection "
			   "but no collection name provided. Use the \"c\" "
			   "cgi parameter to specify it.");
	}
	// or if too big
	if ( gbstrlen(coll) > MAX_COLL_LEN ) {
		g_errno = ENOBUFS;
		// cast so the argument really matches the %i
		return log("admin: Trying to create a new collection "
			   "whose name \"%s\" of %i chars is longer than the "
			   "max of %li chars.",coll,(int)gbstrlen(coll),
			   (long)MAX_COLL_LEN);
	}
		
	// ensure does not already exist in memory
	if ( getCollnum ( coll ) >= 0 ) {
		g_errno = EEXIST;
		return log("admin: Trying to create collection \"%s\" but "
			   "already exists in memory.",coll);
	}
	// MDW: ensure not created on disk since time of last load
	char dname[512];
	// bounded print, the length of g_hostdb.m_dir is not known here
	int dlen = snprintf(dname, sizeof(dname), "%scoll.%s.%li/",
			    g_hostdb.m_dir,coll,i);
	if ( dlen < 0 || dlen >= (int)sizeof(dname) ) {
		g_errno = ENOBUFS;
		return log("admin: Directory name for collection \"%s\" "
			   "is too long.",coll);
	}
	if ( isNew ) {
		DIR *dir = opendir ( dname );
		if ( dir ) {
			// do not leak the directory handle
			closedir ( dir );
			g_errno = EEXIST;
			return log("admin: Trying to create collection %s "
				   "but directory %s already exists on disk.",
				   coll,dname);
		}
	}
	//char fname[512];
	// ending '/' is ALWAYS included in g_hostdb.m_dir
	//sprintf ( fname , "%s%li.%s.conf",g_hostdb.m_dir,i,coll);
	//File f;
	//f.set ( fname );
	//if ( f.doesExist() ) {
	//	g_errno = EEXIST;
	//	return log("admin: Trying to create collection \"%s\" but "
	//		   "file %s already exists on disk.",coll,fname);
	//}
	// create the record in memory
	m_recs[i] = new (CollectionRec);
	if ( ! m_recs[i] ) {
		// we promise to set g_errno on error
		if ( ! g_errno ) g_errno = ENOMEM;
		return log("admin: Failed to allocated %li bytes for new "
			   "collection record for \"%s\".",
			   (long)sizeof(CollectionRec),coll);
	}
	mnew ( m_recs[i] , sizeof(CollectionRec) , "CollectionRec" ); 
	// get copy collection
	CollectionRec *cpcrec = NULL;
	if ( cpc && cpc[0] ) cpcrec = getRec ( cpc , cpclen );
	// not fatal, we just go without the copy
	if ( cpc && cpc[0] && ! cpcrec )
		log("admin: Collection \"%s\" to copy config from does not "
		    "exist.",cpc);
	// start out with the built-in defaults
	g_parms.setToDefault( (char *)m_recs[i] );

	// get the default.conf from working dir if there
	if ( isNew ) {
		// the default conf file
		char tmp1[1024];
		snprintf ( tmp1 , sizeof(tmp1) , "%sdefault.conf" ,
			   g_hostdb.m_dir );
		// . set our parms from the file.
		// . accepts OBJ_COLLECTIONREC or OBJ_CONF
		g_parms.setFromFile ( m_recs[i] , NULL , tmp1 );
	}

	// this will override all
	if ( cpcrec ) {
		// copy it, but not the timedb hashtable, etc.
		long size = (char *)&(cpcrec->m_END_COPY) - (char *)cpcrec;
		// JAB: bad memcpy - no donut!
		// this is not how objects are supposed to be copied!!!
		memcpy ( m_recs[i] , cpcrec , size);//sizeof(CollectionRec) );
		// perform the cleanup that a copy constructor might do...
		//for (int rx = 0; rx < MAX_FILTERS; rx++)
		//	m_recs[i]->m_pRegExParser[rx] = NULL;
		// don't NUKE the filters!
		// m_recs[i]->m_numRegExs = 0;
		// OK - done with cleaning up...
		// but never copy over the collection hostname, that is
		// problematic
		m_recs[i]->m_collectionHostname [0] = '\0';
		m_recs[i]->m_collectionHostname1[0] = '\0';
		m_recs[i]->m_collectionHostname2[0] = '\0';
	}

	// . set coll id and coll name for coll id #i
	// . the name length was checked against MAX_COLL_LEN above
	strcpy ( m_recs[i]->m_coll , coll );
	m_recs[i]->m_collLen = gbstrlen ( coll );
	m_recs[i]->m_collnum = i;

	// point to this, so Rdb and RdbBase can reference it
	coll = m_recs[i]->m_coll;

	// . if has no password or ip add the default password, footbar
	// . no, just don't have any password, just use the 127.0.0.1 ip
	//   that is the loopback
	/*
	if ( m_recs[i]->m_numAdminIps  == 0 &&
	     m_recs[i]->m_numAdminPwds == 0    ) {
		m_recs[i]->m_numAdminIps = 1;
		m_recs[i]->m_adminIps[0] = atoip("0.0.0.0",7);
		//strcpy ( m_recs[i]->m_adminPwds[0] , "footbar23" );
		//m_recs[i]->m_numAdminPwds = 1;
		//log("admin: Using default password for new collection of "
		//    "'footbar23'.");
	}
	*/

	// collection name HACK for backwards compatibility
	//if ( strcmp ( coll , "main" ) == 0 ) {
	//	m_recs[i]->m_coll[0] = '\0';
	//	m_recs[i]->m_collLen = 0;
	//	//coll[0] = '\0';
	//}

	// MDW: create the new directory
	if ( isNew ) {
	retry22:
		if ( ::mkdir ( dname , 
			       S_IRUSR | S_IWUSR | S_IXUSR | 
			       S_IRGRP | S_IWGRP | S_IXGRP | 
			       S_IROTH | S_IXOTH ) ) {
			// valgrind?
			if ( errno == EINTR ) goto retry22;
			g_errno = errno;
			mdelete ( m_recs[i] , sizeof(CollectionRec) , 
				  "CollectionRec" ); 
			delete ( m_recs[i]);
			m_recs[i] = NULL;
			return log("admin: Creating directory %s had error: "
				   "%s.", dname,mstrerror(g_errno));
		}
		// save it into this dir... might fail!
		if ( ! m_recs[i]->save() ) {
			mdelete ( m_recs[i] , sizeof(CollectionRec) , 
				  "CollectionRec" ); 
			delete ( m_recs[i]);
			m_recs[i] = NULL;
			return log("admin: Failed to save file %s: %s",
				   dname,mstrerror(g_errno));
		}
	}
	// load if not new
	if ( ! isNew && ! m_recs[i]->load ( coll , i ) ) {
		mdelete ( m_recs[i], sizeof(CollectionRec), "CollectionRec" ); 
		delete ( m_recs[i]);
		m_recs[i] = NULL;
		return log("admin: Failed to load conf for collection "
			   "\"%s\".",coll);
	}
	// . it was just saved or loaded so it matches what is on disk
	// . NOTE(review): the old comment here said "mark it as needing to
	//   be saved instead" which contradicts the code -- confirm intent
	m_recs[i]->m_needsSave = false;
	// force this to off for now
	//m_recs[i]->m_queryExpansion = false;
	// reserve it
	if ( i >= m_numRecs ) m_numRecs = i + 1;
	// count it
	m_numRecsUsed++;
	// update the time
	updateTime();
	// if we are doing a dump from the command line, skip this stuff
	if ( isDump ) return true;
	// only verify for collections that already existed
	bool verify = true;
	if(isNew) verify = false;
	// tell rdbs to add one, too
	//if ( ! g_indexdb.addColl    ( coll, verify ) ) goto hadError;
	if ( ! g_posdb.addColl    ( coll, verify ) ) goto hadError;
	//if ( ! g_datedb.addColl     ( coll, verify ) ) goto hadError;

	if ( ! g_titledb.addColl    ( coll, verify ) ) goto hadError;
	//if ( ! g_revdb.addColl      ( coll, verify ) ) goto hadError;
	//if ( ! g_sectiondb.addColl  ( coll, verify ) ) goto hadError;
	if ( ! g_tagdb.addColl      ( coll, verify ) ) goto hadError;
	//if ( ! g_catdb.addColl      ( coll, verify ) ) goto hadError;
	//if ( ! g_checksumdb.addColl ( coll, verify ) ) goto hadError;
	if ( ! g_spiderdb.addColl   ( coll, verify ) ) goto hadError;
	if ( ! g_doledb.addColl     ( coll, verify ) ) goto hadError;
	//if ( ! g_tfndb.addColl      ( coll, verify ) ) goto hadError;
	if ( ! g_clusterdb.addColl  ( coll, verify ) ) goto hadError;
	if ( ! g_linkdb.addColl     ( coll, verify ) ) goto hadError;
	// debug message
	log ( LOG_INFO, "admin: added collection \"%s\" (%li).",coll,(long)i);
	// tell SpiderCache about this collection, it will create a 
	// SpiderCollection class for it.
	//g_spiderCache.reset1();

	// . make it set is CollectionRec::m_sortByDateTable now
	// . everyone else uses setTimeOfDayInMilliseconds() in fctypes.cpp
	//   to call this function once their clock is synced with host #0
	//if ( g_hostdb.m_initialized && g_hostdb.m_hostId == 0 )
	//	initSortByDateTable(coll);
	//else if ( g_hostdb.m_initialized && isClockInSync() )
	//	initSortByDateTable(coll);
	// . do it for all regard-less
	// . once clock is in sync with host #0 we may do it again!
	//if ( g_hostdb.m_initialized )
	//	initSortByDateTable(coll);

	// success
	return true;
 hadError:
	log("admin: Had error adding new collection: %s.",mstrerror(g_errno));
	// do not delete it, might have failed to add because not enough
	// memory to read in the tree *-saved.dat file on disk!! and if
	// you delete in then core the *-saved.dat file gets overwritten!!!
	return false;
	/*
	g_indexdb.getRdb()->delColl    ( coll );
	g_datedb.getRdb()->delColl     ( coll );
	g_timedb.getRdb()->delColl     ( coll );
	g_titledb.getRdb()->delColl    ( coll );
	g_revdb.getRdb()->delColl      ( coll );
	g_sectiondb.getRdb()->delColl  ( coll );
	g_placedb.getRdb()->delColl    ( coll );
	g_tagdb.getRdb()->delColl      ( coll );
	//g_catdb.getRdb()->delColl      ( coll );
	//g_checksumdb.getRdb()->delColl ( coll );
	g_spiderdb.getRdb()->delColl   ( coll );
	g_doledb.getRdb()->delColl     ( coll );
	g_tfndb.getRdb()->delColl      ( coll );
	g_clusterdb.getRdb()->delColl  ( coll );
	g_linkdb.getRdb()->delColl     ( coll );
	deleteRec                      ( coll );
	return false;
	*/
}