bool sendPageTurk ( TcpSocket *s , HttpRequest *r ) {
	// get the current timestamp
	int32_t now = getTimeGlobal ();
	


	char *coll =  r->getString("c");
	if ( ! coll )
		return g_httpServer.sendErrorReply( s, 500, "No collection");
		

	// make a state for callback
	State60 *st ;
	try {  st = new ( State60 ); }
	catch ( ... ) {
		g_errno = ENOMEM;
		log( "pgrank: new(%i): %s", sizeof(State60),
		     mstrerror(g_errno) );
		return g_httpServer.sendErrorReply(s,500,mstrerrno(g_errno));
	}
	mnew ( st , sizeof(State60) , "PageTurk" );

	// get username
	char *username = r->getStringFromCookie("username", NULL);
	if ( !username ) username = r->getString("username",NULL);
	if ( !username ) username = r->getString("user",NULL);
	if ( !username ) username = r->getString("code",NULL);

	if ( ! username )
		return g_httpServer.sendErrorReply(s,500,"No username");
	int32_t ulen = gbsrlen(username);
	if ( ulen >= MAX_USER_SIZE )
		return g_httpServer.sendErrorReply(s,500,"Bad username");

			
	// save crap. don't we need to copy "r" into our own? yeah...
	st->m_s = s;
	// save username
	strcpy(st->m_username,username,ulen+1);
	// assume no url
	//st->m_url[0] = 0;
	// copy coll
	strcpy(st->m_coll,coll);
	// this is 1 to imply to edit a page
	st->m_editMode = r->getString("edit",0);
	st->m_docId    = r->getLongLong("docid",0LL);

	// get url
	//char *url = r->getString ("url", NULL);

	// if no url is given then present their stats page
	if ( ! edit ) return sendPageTurkStats (st);

	// copy url
	//strcpy ( st->m_url , url );

	// otherwise, send them the eval page
	return sendPageTurkEval (st);
}
time_t genDate( char *date, long dateLen ) {

	time_t result = -1;
	// the date string should always be the same length
	if ( ! date || dateLen != 16 )
		return result;

	struct tm tmRef;
	struct tm tmBuild;

	//*
	memset( (char *)&tmRef, 0, sizeof( tmRef ) );
	time_t now = (time_t)getTimeGlobal();
	localtime_r( &now, &tmRef );
	now = mktime( &tmRef );
	// */

	char tmp[18];
	char *p = tmp;
	memcpy( p, date, dateLen );

	p[2]  = '\0';
	p[5]  = '\0';
	p[10] = '\0';
	p[13] = '\0';
	p[16] = '\0';

	memset( (char *)&tmBuild, 0, sizeof( tmBuild ) );
	tmBuild.tm_mon   = atoi( p ) - 1;	p += 3;
	tmBuild.tm_mday  = atoi( p );		p += 3;
	tmBuild.tm_year  = atoi( p ) - 1900;	p += 5;
	tmBuild.tm_hour  = atoi( p );		p += 3;
	tmBuild.tm_min   = atoi( p );		p += 3;
	tmBuild.tm_isdst = tmRef.tm_isdst;	p += 3;

	// We must manually adjust for DST difference
	// if the current state of DST does not match
	// that of the date that was requested.
	/*
	struct tm nowDST;
	struct tm resultDST;
	localtime_r( &now, &nowDST );
	localtime_r( &result, &resultDST );
	if ( nowDST.tm_isdst && !resultDST.tm_isdst )
		tmBuild.tm_hour++;
	else if ( !nowDST.tm_isdst && resultDST.tm_isdst )
		tmBuild.tm_hour--;

	memcpy( p, date, dateLen );
	p[16] = '\0';
	log ( LOG_DEBUG, "stats: user string        [%s]", p );
	log ( LOG_DEBUG, "stats: user provided time [%s]", ctime( &result ) );
	log ( LOG_DEBUG, "stats: our timestamp      [%s]", ctime( &now    ) );
	// */
	
	result = mktime( &tmBuild );
	return result;
}
// . this returns false if blocks, true otherwise
// . sets g_errno on failure
bool Msg1c::gotList ( ) {

	if ( g_errno ) return true;

	int64_t *tmpDocIds = m_msg3a.getDocIds();
	int32_t       numDocIds = m_msg3a.getNumDocIds();

	if ( m_startNum > 0) {
		numDocIds -= m_startNum;
		tmpDocIds = &tmpDocIds[m_startNum];
	}

	m_numDocIds = numDocIds; // save for reporting
	// log it
	log(LOG_INFO,"admin: Got %" PRId32" docIds for query reindex.", numDocIds);
	// bail if no need
	if ( numDocIds <= 0 ) return true;

	// force spiders on on entire network. they will progagate from 
	// host #0... 
	g_conf.m_spideringEnabled = true;

	int32_t nowGlobal = getTimeGlobal();

	HashTableX dt;
	char dbuf[1024];
	dt.set(8,0,64,dbuf,1024,false,0,"ddocids");

	m_sb.setLabel("reiadd");

	State13 *st = (State13 *)m_state;
	GigablastRequest *gr = &st->m_gr;

	m_numDocIdsAdded = 0;

	// list consists of docIds, loop through each one
 	for(int32_t i = 0; i < numDocIds; i++) {
		int64_t docId = tmpDocIds[i];
		// when searching events we get multiple docids that are same
		if ( dt.isInTable ( &docId ) ) continue;
		// add it
		if ( ! dt.addKey ( &docId ) ) return true;

		SpiderRequest sr;
		sr.reset();

		// url is a docid!
		sprintf ( sr.m_url , "%" PRIu64 , docId );

		// make a fake first ip
		// use only 64k values so we don't stress doledb/waittrees/etc.
		// for large #'s of docids
		int32_t firstIp = (docId & 0x0000ffff);

		// bits 6-13 of the docid are the domain hash so use those
		// when doing a REINDEX (not delete!) to ensure that requests
		// on the same domain go to the same shard, at least when
		// we have up to 256 shards. if we have more than 256 shards
		// at this point some shards will not participate in the
		// query reindex/delete process because of this, so 
		// we'll want to allow more bits in in that case perhaps.
		// check out Hostdb::getShardNum(RDB_SPIDERDB) in Hostdb.cpp
		// to see what shard is responsible for storing and indexing 
		// this SpiderRequest based on the firstIp.
		if ( ! m_forceDel ) { 
			// if we are a REINDEX not a delete because 
			// deletes don't need to spider/redownload the doc
			// so the distribution can be more random
			firstIp >>= 6;
			firstIp &= 0xff;
		}

		// 0 is not a legit val. it'll core below.
		if ( firstIp == 0 ) {
			firstIp = 1;
		}

		// use a fake ip
		sr.m_firstIp        =  firstIp;
		// we are not really injecting...
		sr.m_isInjecting    =  false;//true;
		sr.m_hopCount       = -1;
		sr.m_isPageReindex  =  1;
		sr.m_urlIsDocId     =  1;
		sr.m_fakeFirstIp    =  1;

		// now you can recycle content instead of re-downloading it
		// for every docid
		sr.m_recycleContent = gr->m_recycleContent;
		// if this is zero we end up getting deduped in
		// dedupSpiderList() if there was a SpiderReply whose
		// spider time was > 0
		sr.m_addedTime = nowGlobal;
	    sr.m_forceDelete = m_forceDel ? 1 : 0;

		// . complete its m_key member
		// . parentDocId is used to make the key, but only allow one
		//   page reindex spider request per url... so use "0"
		// . this will set "uh48" to hash64b(m_url) which is the docid
		sr.setKey( firstIp, 0LL , false );

		// how big to serialize
		int32_t recSize = sr.getRecSize();

		m_numDocIdsAdded++;
	
		// store it
		if ( ! m_sb.safeMemcpy ( (char *)&sr , recSize ) ) {
			// g_errno must be set
			if ( ! g_errno ) { g_process.shutdownAbort(true); }

			log(LOG_LOGIC,
			    "admin: Query reindex size of %" PRId32" "
			    "too big. Aborting. Bad engineer." , 
			    (int32_t)0);//m_list.getListSize() );
			return true;
		}
	}
Exemplo n.º 4
0
void handleRequest12 ( UdpSlot *udpSlot , int32_t niceness ) {
	// get request
	char *request = udpSlot->m_readBuf;
	int32_t  reqSize = udpSlot->m_readBufSize;
	// shortcut
	UdpServer *us = &g_udpServer;
	// breathe
	QUICKPOLL ( niceness );

	// shortcut
	char *reply = udpSlot->m_tmpBuf;

	//
	// . is it confirming that he got all the locks?
	// . if so, remove the doledb record and dock the doleiptable count
	//   before adding a waiting tree entry to re-pop the doledb record
	//
	if ( reqSize == sizeof(ConfirmRequest) ) {
		char *msg = NULL;
		ConfirmRequest *cq = (ConfirmRequest *)request;

		// confirm the lock
		HashTableX *ht = &g_spiderLoop.m_lockTable;
		int32_t slot = ht->getSlot ( &cq->m_lockKeyUh48 );
		if ( slot < 0 ) { 
			log("spider: got a confirm request for a key not "
			    "in the table! coll must have been deleted "
			    " or reset "
			    "while lock request was outstanding.");
			g_errno = EBADENGINEER;
			
			log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
			us->sendErrorReply ( udpSlot , g_errno );
			return;
			//char *xx=NULL;*xx=0; }
		}
		UrlLock *lock = (UrlLock *)ht->getValueFromSlot ( slot );
		lock->m_confirmed = true;

		// note that
		if ( g_conf.m_logDebugSpider ) // Wait )
			log("spider: got confirm lock request for ip=%s",
			    iptoa(lock->m_firstIp));

		// get it
		SpiderColl *sc = g_spiderCache.getSpiderColl(cq->m_collnum);
		// make it negative
		cq->m_doledbKey.n0 &= 0xfffffffffffffffeLL;
		// and add the negative rec to doledb (deletion operation)
		Rdb *rdb = &g_doledb.m_rdb;
		if ( ! rdb->addRecord ( cq->m_collnum,
					(char *)&cq->m_doledbKey,
					NULL , // data
					0    , //dataSize
					1 )){ // niceness
			// tree is dumping or something, probably ETRYAGAIN
			if ( g_errno != ETRYAGAIN ) {msg = "error adding neg rec to doledb";	log("spider: %s %s",msg,mstrerror(g_errno));
			}
			//char *xx=NULL;*xx=0;
			
			log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
			us->sendErrorReply ( udpSlot , g_errno );
			return;
		}
		// now remove from doleiptable since we removed from doledb
		if ( sc ) sc->removeFromDoledbTable ( cq->m_firstIp );

		// how many spiders outstanding for this coll and IP?
		//int32_t out=g_spiderLoop.getNumSpidersOutPerIp ( cq->m_firstIp);

		// DO NOT add back to waiting tree if max spiders
		// out per ip was 1 OR there was a crawldelay. but better
		// yet, take care of that in the winReq code above.

		// . now add to waiting tree so we add another spiderdb
		//   record for this firstip to doledb
		// . true = callForScan
		// . do not add to waiting tree if we have enough outstanding
		//   spiders for this ip. we will add to waiting tree when
		//   we receive a SpiderReply in addSpiderReply()
		if ( sc && //out < cq->m_maxSpidersOutPerIp &&
		     // this will just return true if we are not the 
		     // responsible host for this firstip
		    // DO NOT populate from this!!! say "false" here...
		     ! sc->addToWaitingTree ( 0 , cq->m_firstIp, false ) &&
		     // must be an error...
		     g_errno ) {
			msg = "FAILED TO ADD TO WAITING TREE";
			log("spider: %s %s",msg,mstrerror(g_errno));
			
			log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
			us->sendErrorReply ( udpSlot , g_errno );
			return;
		}
		// success!!
		reply[0] = 1;
		us->sendReply_ass ( reply , 1 , reply , 1 , udpSlot );
		return;
	}



	// sanity check
	if ( reqSize != sizeof(LockRequest) ) {
		log("spider: bad msg12 request size of %" PRId32,reqSize);
		
		log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
		us->sendErrorReply ( udpSlot , EBADREQUEST );
		return;
	}
	// deny it if we are not synced yet! otherwise we core in 
	// getTimeGlobal() below
	if ( ! isClockInSync() ) { 
		// log it so we can debug it
		//log("spider: clock not in sync with host #0. so "
		//    "returning etryagain for lock reply");
		// let admin know why we are not spidering
		
		log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
		us->sendErrorReply ( udpSlot , ETRYAGAIN );
		return;
	}

	LockRequest *lr = (LockRequest *)request;
	//uint64_t lockKey = *(int64_t *)request;
	//int32_t lockSequence = *(int32_t *)(request+8);
	// is this a remove operation? assume not
	//bool remove = false;
	// get top bit
	//if ( lockKey & 0x8000000000000000LL ) remove = true;

	// mask it out
	//lockKey &= 0x7fffffffffffffffLL;
	// sanity check, just 6 bytes! (48 bits)
	if ( lr->m_lockKeyUh48 &0xffff000000000000LL ) { char *xx=NULL;*xx=0; }
	// note it
	if ( g_conf.m_logDebugSpider )
		log("spider: got msg12 request uh48=%" PRId64" remove=%" PRId32,
		    lr->m_lockKeyUh48, (int32_t)lr->m_removeLock);
	// get time
	int32_t nowGlobal = getTimeGlobal();
	// shortcut
	HashTableX *ht = &g_spiderLoop.m_lockTable;

	int32_t hostId = g_hostdb.getHostId ( udpSlot->m_ip , udpSlot->m_port );
	// this must be legit - sanity check
	if ( hostId < 0 ) { char *xx=NULL;*xx=0; }

	// remove expired locks from locktable
	removeExpiredLocks ( hostId );

	int64_t lockKey = lr->m_lockKeyUh48;

	// check tree
	int32_t slot = ht->getSlot ( &lockKey ); // lr->m_lockKeyUh48 );
	// put it here
	UrlLock *lock = NULL;
	// if there say no no
	if ( slot >= 0 ) lock = (UrlLock *)ht->getValueFromSlot ( slot );

	// if doing a remove operation and that was our hostid then unlock it
	if ( lr->m_removeLock && 
	     lock && 
	     lock->m_hostId == hostId &&
	     lock->m_lockSequence == lr->m_lockSequence ) {
		// note it for now
		if ( g_conf.m_logDebugSpider )
			log("spider: removing lock for lockkey=%" PRIu64" hid=%" PRId32,
			    lr->m_lockKeyUh48,hostId);
		// unlock it
		ht->removeSlot ( slot );
		// it is gone
		lock = NULL;
	}
	// ok, at this point all remove ops return
	if ( lr->m_removeLock ) {
		reply[0] = 1;
		us->sendReply_ass ( reply , 1 , reply , 1 , udpSlot );
		return;
	}

	/////////
	//
	// add new lock
	//
	/////////


	// if lock > 1 hour old then remove it automatically!!
	if ( lock && nowGlobal - lock->m_timestamp > MAX_LOCK_AGE ) {
		// note it for now
		log("spider: removing lock after %" PRId32" seconds "
		    "for lockKey=%" PRIu64" hid=%" PRId32,
		    (nowGlobal - lock->m_timestamp),
		    lr->m_lockKeyUh48,hostId);
		// unlock it
		ht->removeSlot ( slot );
		// it is gone
		lock = NULL;
	}
	// if lock still there, do not grant another lock
	if ( lock ) {
		// note it for now
		if ( g_conf.m_logDebugSpider )
			log("spider: refusing lock for lockkey=%" PRIu64" hid=%" PRId32,
			    lr->m_lockKeyUh48,hostId);
		reply[0] = 0;
		us->sendReply_ass ( reply , 1 , reply , 1 , udpSlot );
		return;
	}
	// make the new lock
	UrlLock tmp;
	tmp.m_hostId       = hostId;
	tmp.m_lockSequence = lr->m_lockSequence;
	tmp.m_timestamp    = nowGlobal;
	tmp.m_expires      = 0;
	tmp.m_firstIp      = lr->m_firstIp;
	tmp.m_collnum      = lr->m_collnum;

	// when the spider returns we remove its lock on reception of the
	// spiderReply, however, we actually just set the m_expires time
	// to 5 seconds into the future in case there is a current request
	// to get a lock for that url in progress. but, we do need to
	// indicate that the spider has indeed completed by setting
	// m_spiderOutstanding to true. this way, addToWaitingTree() will
	// not count it towards a "max spiders per IP" quota when deciding
	// on if it should add a new entry for this IP.
	tmp.m_spiderOutstanding = true;
	// this is set when all hosts in the group (shard) have granted the
	// lock and the host sends out a confirmLockAcquisition() request.
	// until then we do not know if the lock will be granted by all hosts
	// in the group (shard)
	tmp.m_confirmed    = false;

	// put it into the table
	if ( ! ht->addKey ( &lockKey , &tmp ) ) {
		// return error if that failed!
		
		log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
		us->sendErrorReply ( udpSlot , g_errno );
		return;
	}
	// note it for now
	if ( g_conf.m_logDebugSpider )
		log("spider: granting lock for lockKey=%" PRIu64" hid=%" PRId32,
		    lr->m_lockKeyUh48,hostId);
	// grant the lock
	reply[0] = 1;
	us->sendReply_ass ( reply , 1 , reply , 1 , udpSlot );
	return;
}
Exemplo n.º 5
0
// returns true if all done, false if waiting for more replies
bool Msg12::gotLockReply ( UdpSlot *slot ) {

	// no longer use this
	char *xx=NULL;*xx=0;

	// got reply
	m_numReplies++;
	// don't let udpserver free the request, it's our m_request[]
	slot->m_sendBufAlloc = NULL;
	// check for a hammer reply
	char *reply     = slot->m_readBuf;
	int32_t  replySize = slot->m_readBufSize;
	// if error, treat as a not grant
	if ( g_errno ) {
		bool logIt = true;
		// note it
		if ( g_conf.m_logDebugSpider )
			log("spider: got msg12 reply error = %s",
			    mstrerror(g_errno));
		// if we got an ETRYAGAIN when trying to confirm our lock
		// that means doledb was saving/dumping to disk and we 
		// could not remove the record from doledb and add an
		// entry to the waiting tree, so we need to keep trying
		if ( g_errno == ETRYAGAIN && m_confirming ) {
			// c ount it again
			m_numRequests++;
			// use what we were using
			char *request     = (char *)&m_confirmRequest;
			int32_t  requestSize = sizeof(ConfirmRequest);
			Host *h = g_hostdb.getHost(slot->m_hostId);
			// send request to him
			UdpServer *us = &g_udpServer;
			if ( ! us->sendRequest ( request      ,
						 requestSize  ,
						 0x12         , // msgType
						 h->m_ip      ,
						 h->m_port    ,
						 h->m_hostId  ,
						 NULL         , // retSlotPtrPt
						 this         , // state data
						 gotLockReplyWrapper ,
						 udpserver_sendrequest_infinite_timeout ) ) 
				return false;
			// error?
			// don't spam the log!
			static int32_t s_last = 0;
			int32_t now = getTimeLocal();
			if ( now - s_last >= 1 ) {
				s_last = now;
				log("spider: error re-sending confirm "
				    "request: %s",  mstrerror(g_errno));
			}
		}
		// only log every 10 seconds for ETRYAGAIN
		if ( g_errno == ETRYAGAIN ) {
			static time_t s_lastTime = 0;
			time_t now = getTimeLocal();
			logIt = false;
			if ( now - s_lastTime >= 3 ) {
				logIt = true;
				s_lastTime = now;
			}
		}
		if ( logIt )
			log ( "sploop: host had error getting lock url=%s"
			      ": %s" ,
			      m_url,mstrerror(g_errno) );
	}
	// grant or not
	if ( replySize == 1 && ! g_errno && *reply == 1 ) m_grants++;
	// wait for all to get back
	if ( m_numReplies < m_numRequests ) return false;
	// all done if we were removing
	if ( m_removing ) {
		// note it
		if ( g_conf.m_logDebugSpider )
		      logf(LOG_DEBUG,"spider: done removing all locks "
			   "(replies=%" PRId32") for %s",
			   m_numReplies,m_url);//m_sreq->m_url);
		// we are done
		m_gettingLocks = false;
		return true;
	}
	// all done if we were confirming
	if ( m_confirming ) {
		// note it
		if ( g_conf.m_logDebugSpider )
		      logf(LOG_DEBUG,"spider: done confirming all locks "
			   "for %s uh48=%" PRId64,m_url,m_origUh48);//m_sreq->m_url);
		// we are done
		m_gettingLocks = false;
		// . keep processing
		// . if the collection was nuked from under us the spiderUrl2
		//   will return true and set g_errno
		if ( ! m_callback ) return g_spiderLoop.spiderUrl2();
		// if we had a callback let our parent call it
		return true;
	}

	// if got ALL locks, spider it
	if ( m_grants == m_numReplies ) {
		// note it
		if ( g_conf.m_logDebugSpider )
		      logf(LOG_DEBUG,"spider: got lock for docid=lockkey=%" PRIu64,
			   m_lockKeyUh48);
		// flag this
		m_hasLock = true;
		// we are done
		//m_gettingLocks = false;


		///////
		//
		// now tell our group (shard) to remove from doledb
		// and re-add to waiting tree. the evalIpLoop() function
		// should skip this probable docid because it is in the 
		// LOCK TABLE!
		//
		// This logic should allow us to spider multiple urls
		// from the same IP at the same time.
		//
		///////

		// returns false if would block
		if ( ! confirmLockAcquisition ( ) ) return false;
		// . we did it without blocking, maybe cuz we are a single node
		// . ok, they are all back, resume loop
		// . if the collection was nuked from under us the spiderUrl2
		//   will return true and set g_errno
		if ( ! m_callback ) g_spiderLoop.spiderUrl2 ( );
		// all done
		return true;

	}
	// note it
	if ( g_conf.m_logDebugSpider )
		logf(LOG_DEBUG,"spider: missed lock for %s lockkey=%" PRIu64" "
		     "(grants=%" PRId32")",   m_url,m_lockKeyUh48,m_grants);

	// . if it was locked by another then add to our lock cache so we do
	//   not try to lock it again
	// . if grants is not 0 then one host granted us the lock, but not
	//   all hosts, so we should probably keep trying on it until it is
	//   locked up by one host
	if ( m_grants == 0 ) {
		int32_t now = getTimeGlobal();
		g_spiderLoop.m_lockCache.addLong(0,m_lockKeyUh48,now,NULL);
	}

	// reset again
	m_numRequests = 0;
	m_numReplies  = 0;
	// no need to remove them if none were granted because another
	// host in our group might have it 100% locked. 
	if ( m_grants == 0 ) {
		// no longer in locks operation mode
		m_gettingLocks = false;
		// ok, they are all back, resume loop
		//if ( ! m_callback ) g_spiderLoop.spiderUrl2 ( );
		// all done
		return true;
	}
	// note that
	if ( g_conf.m_logDebugSpider )
		logf(LOG_DEBUG,"spider: sending request to all in shard to "
		     "remove lock uh48=%" PRIu64". grants=%" PRId32,
		     m_lockKeyUh48,(int32_t)m_grants);
	// remove all locks we tried to get, BUT only if from our hostid!
	// no no! that doesn't quite work right... we might be the ones
	// locking it! i.e. another one of our spiders has it locked...
	if ( ! removeAllLocks ( ) ) return false; // true;
	// if did not block, how'd that happen?
	log("sploop: did not block in removeAllLocks: %s",mstrerror(g_errno));
	return true;
}
Exemplo n.º 6
0
// . returns false if blocked true otherwise
// . sets g_errno on error
bool Msg35::getToken ( void *state, 
		       void (*callback )(void *state), char priority ){
	// if threads are disabled, we are probably repairing dbs
	// from main.cpp fixTitleRecs() or makeDbs() so no token needed
	if ( g_threads.areThreadsDisabled() ) return true;
	// you can also disable the token so twins can merge as the same time
	if ( ! g_conf.m_useMergeToken ) return true;
	// disable this until it works again
	return true;
	// . if only one host per group, you always have the token
	// . no, they can only have one merge going at a time
	//if ( g_hostdb.getNumHostsPerShard() == 1 ) return true;
	// . ensure not already registered
	// . this can happen if a client's get request arrives before their
	//   release request... so allow for that now
	for ( int32_t i = 0 ; i < 64 ; i++ ) {
		if ( m_clientWaits[i].m_isEmpty        ) continue;
		if ( m_clientWaits[i].m_state != state ) continue;
		//g_errno = EBADENGINEER;
		log(LOG_REMIND,"merge: Already queued merge token request.");
		// return false since they'll be called when token comes
		//return false;
		break;
	}
	// get next available slot
	int32_t i;
	for ( i = 0 ; i < 64 ; i++ )
		if ( m_clientWaits[i].m_isEmpty ) break;
	// . if none empty bitch and return
	// . this should never happen, if it does than increase the limit
	//   otherwise, his callback will never be called!!
	if ( i >= 64 ) {
		g_errno = EBADENGINEER;
		log(LOG_LOGIC,"merge: msg35: Too many waiting for token.");
		return true;
	}
	ClientWait *c = &m_clientWaits [ i ];
	// get current time
	int32_t timestamp = getTimeGlobal();
	// the request is just the priority really
	char *p = c->m_buf;
	*p         = REQUEST_GETTOKEN ; p += 1;
	*(int32_t *)p = g_hostdb.m_hostId; p += 4;
	*(int32_t *)p = timestamp        ; p += 4;
	*p         = priority         ; p += 1;
	*p         = i;               ; p += 1; // client slot #
	// . send to the governing host, he must be up
	// . this returns NULL and sets g_errno on error
	Host *h = getTokenManager ( );
	// . the priority of this msg is low, use g_udpServer
        // . returns false and sets g_errno on error
	// . if there is a sending error, we will try sending token manager
	//   our client queue (queue of requests) during call to sync()
        if ( !  g_udpServer.sendRequest ( c->m_buf   ,
					  11         , // requestLen
					  0x35       , // msgType 0x35
					  h->m_ip    , // low priority ip
					  h->m_port  , // low priority port
					  h->m_hostId,
					  NULL       , // slotPtr
					  this       , // state data
					  gotReplyWrapper35 ,
					  31536000 ) ) { // 1 yr timeout
		log("merge: Got error sending merge token request: %s.",
		    mstrerror(g_errno));
		g_errno = 0;
	}
	// save callback info even if request not launched successfully since
	// it will be retried during call to sync()
	c->m_state     = state;
	c->m_callback  = callback;
	c->m_priority  = priority;
	c->m_timestamp = timestamp;
	c->m_isEmpty   = false;
	if ( i > m_topUsedClient ) m_topUsedClient = i;
	// we blocked waiting for the reply
	return false;
}
void gotDatedbList ( State60 *st ) {

	// must only be run on host #0 since we need just one lock table
	if ( g_hostdb.m_myHost->m_hostId != 0 ) { char *xx=NULL;*xx=0; }

	// load turk lock table if we need to
	bool s_init = false;
	if ( ! s_init ) {
		s_init = true;
		if ( ! g_turkLocks.set(8,sizeof(TurkLock),256) )
			log("turk: failed to init turk lock table");
		if ( ! g_turkLocks.load(g_conf.m_dir,"turkdir/docidlocks.dat"))
			log("turk: failed to load turk lock table");
	}

	time_t now = getTimeGlobal();
	// int16_tcut
	RdbList *list = &st->m_list;
	// the best docid
	int64_t best = 0LL;
	// scan the list to get urls/docids to turk out
	for ( ; ! list->isExhausted() ; ) {
		// get rec
		char *k = list->getCurrentKey();
		// skip that
		list->skipCurrentRecord();
		// skip if negative
		if ( (k[0] & 0x01) == 0x00 ) continue;
		// get the docid
		int64_t docid = g_datedb.getDocId ( k );
		// skip if locked
		TurkLock *tt = (TurkLock *)g_turkLock.getValue(&docid);
		// if there check time
		if ( tt && now - tt->m_lockTime > 3600 ) {
			// remove it
			g_turkLock.removeKey(&docId);
			// nuke tt
			tt = NULL;
		}
		// if still there, skip it and try next one
		if ( tt ) continue;
		// ok, we got a good docid to dish out
		best = docId;
		break;
	}

	SafeBuf sb;

	// print description so they can clikc a button to start the turk
	sb.safePrintf("<html>\n"
		      "<title>Event Editor</title>\n"
		      "<body>\n"
		      "<table width=\"100%%\" border=\"0\">\n"
		      "<tr><td style=\"background-color:#0079ba;\">\n"
		      "<center><font color=#00000>"
		      "<h2>Event Editor</h2>\n"
		      "</font></center></td>"
		      "</tr></table>");

	// if we had no docid, give user an empty msg
	if ( ! best ) {
		sb.safePrintf("<center>Nothing currently available to edit. "
			      "Please try again later.</center>"
			      "</body></html>\n");
		sendReply ( &sb );
		return;
	}

	// lock it!
	TurkLock tt;
	strcpy ( tt.m_user , st->m_user );
	tt.m_lockTime = now;
	if ( ! g_lockTable.addLock ( &tt ) ) {
		sendErrorReply ( st , g_errno );
		return;
	}

	// . fetch the TitleRec
	// . a max cache age of 0 means not to read from the cache
	XmlDoc *xd = &st->m_xd;
	// . when getTitleRec() is called it will load the old one
	//   since XmlDoc::m_setFromTitleRec will be true
	// . niceness is 0
	xd->set3 ( best , st->m_coll , 0 );
	// if it blocks while it loads title rec, it will re-call this routine
	xd->setCallback ( st , processLoopWrapper );
	// good to go!
	return processLoop ( st );
}
// . displays the stats for a username
// . show stats for every day we have them for
// . in a big list
// . if they click the day display all docids evaluated for that day
// . show the accuracy for that day too
// . how many docs they edited
// . how many of those docs were verified by another
// . and if there was consensus
void gotTransdbList ( State60 *st ) {

	// get today's time range
	time_t now = getTimeGlobal();
	// get start of today
	time_t dayStart = now / (24*3600);

	SafeBuf sb;

	// int16_tcut
	TcpSocket *s = st->m_s;

	// make about 200k of mem to write into
	if ( ! sb.reserve ( 200000 ) ) 
		return g_httpServer.sendErrorReply(s,500,mstrerrno(g_errno));

	// print description so they can clikc a button to start the turk
	sb.safePrintf("<html>\n"
		      "<title>Event Editor</title>\n"
		      "<body>\n"
		      "<table width=\"100%%\" border=\"0\">\n"
		      "<tr><td style=\"background-color:#0079ba;\">\n"
		      "<center><font color=#00000>"
		      "<h2>Event Editor</h2>\n"
		      "</font></center></td>"
		      "</tr></table>");
	// print the content
	sb.safePrintf("<center><font size=4><blink>"
		      "<b><a href=\"/pageturk?c=%s&edit=1\">"
		      "Click here to start editing.</a></b></blink>"
		      "</font><br><i>Please take your "
		      "time to read the information below before you begin"
		      "</i><br><font color=\"red\" size=2> Warning: Adult "
		      "content might be presented to you."
		      " You should be above 18 years of age to continue."
		      "</center></font>",st->m_coll);

	sb.safePrintf("<font face=arial,sans-serif color=black size=3>"
		      "<p>By clicking <i>Start Voting</i>, you will be "
		       "presented with an interface for editing events. "
		      "The editor will display a modified web page that "
		      "contains one or more events. Each event's description "
		      "will be highlight with a blue background. You can "
		      "toggle whether a particular event is displayed by "
		      "clicking on that event's ID. You can highlight one or "
		      "multiple event descriptions at the same time. "
		      "</p><p>"
		      "By clicking on the section icons in the web page you "
		      "can tell the editor that a virtual fence should be "
		      "erected around that section. The fence will make sure "
		      "that event descriptions can not span across it. Each "
		      "event description must be fully contained either "
		      "inside or outside the fence. However, you can also "
		      "declare a section as a title section, which means that "
		      "the text that the title section contains is free to be "
		      "used by any event description."
		      "</p>\n"
		      "<p>When you are done erecting section fences, you "
		      "submit your changes. The more changes you make the "
		      "more points you earn. Other users may evaluate " 
		      "your edits for accuracy. You will be paid based on the "
		      "points you earn as well as your accuracy. All "
		      "transactions are listed in the table below.</p>"
		      "<p>You may not change your username or password "
		      "but you can change your email address. Your email "
		      "address will be used to pay you with PayPal every "
		      "Friday. Paypal fees will be deducted on your end. By "
		      "using this service you agree to all stated Terms & "
		      "Conditions.</p>"
		      "</font>\n");

	// get the user record
	User *uu = g_users.getUser ( username );
	// print out their info, like paypal email
	sb.safePrintf("<table>\n"
		      "<tr><td colspan=10><center>Your Info</center>"
		      "</td></tr>\n"
		      "<tr>"
		      "<td>Email</td>"
		      "<td><input type=text value=%s></td>"
		      "<td>email address used to pay with paypal</td>"
		      "</tr>\n"
		      "<tr><td colspan=10><input type=submit value=update>"
		      "</td></tr>\n"
		      "</table>\n" ,
		      uu->m_payPalEmail );

	// print your stats here now
	sb.safePrintf("<table>\n"
		      "<tr><td colspan=10><center>Your Stats</center>"
		      "</td></tr>\n"
		      "<tr>"
		      "<td>date</td>"
		      "<td>action</td>"
		      "<td>amount</td>"
		      "<td>desc</td>"
		      "</tr>\n");

	// int16_tcut
	RdbList *list = &st->m_list;

	int32_t lastDay        = -1;
	int32_t totalReceives  = 0;
	int32_t totalSubmits   = 0;
	int32_t totalPasses    = 0;
	int32_t totalFails     = 0;

	// scan the list
	for ( ; ! list->isExhausted() ; ) {
		// get rec
		char *rec      = list->getCurrentRecord();
		char *data     = list->getCurrentData();
		int32_t  dataSize = list->getCurrentDataSize();
		// skip that
		list->skipCurrentRecord();
		// skip if negative
		if ( (rec[0] & 0x01) == 0x00 ) continue;
		// get the time (global time - sync'd with host #0)
		time_t tt = g_transdb.getTimeStamp ( rec );
		// get day #
		int32_t daynum = tt / (24*3600);
		// is it today?
		bool isToday = ( daynum >= dayStart );
		// point to the Transaction
		Trans *trans = (Trans *)data;
		// if is today, print it out verbatim
		if ( isToday ) {
			// print it in html row format to match table above
			//printTrans ( &sb , rec );
			sb.safePrintf("<tr>");
			// make it into a nice date
			time_t dd = lastDay * 86400;
			struct tm *timeStruct = localtime ( &dd );
			char ppp[100];
			strftime(ppp,100,"%H:%M:%S",timeStruct);
			// print last days stats first
			sb.safePrintf("<td>%s</td>",ppp);
			// then stats
			if ( trans->m_actionType == AT_RECEIVE_DOC )
				sb.safePrintf("<td>receive</td>"
					      "<td>%"INT32" pts</td>"
					      "<td>docid=%"UINT64"</td>",
					      (int32_t)trans->m_number,
					      trans->m_docId);
			else if ( trans->m_actionType == AT_SUBMIT_DOC )
				sb.safePrintf("<td>submit</td>"
					      "<td>%"INT32" pts</td>"
					      "<td>docid=%"UINT64"</td>",
					      (int32_t)trans->m_number,
					      trans->m_docId);
			else if ( trans->m_actionType == AT_PASS_DOC )
				sb.safePrintf("<td>verify</td>"
					      "<td>%"INT32" pts</td>"
					      "<td>docid=%"UINT64" was verified "
					      "by user=\"%s\"</td>",
					      (int32_t)trans->m_number,
					      trans->m_docId,
					      trans->m_desc);
			else if ( trans->m_actionType == AT_FAIL_DOC )
				sb.safePrintf("<td>verify</td>"
					      "<td>%"INT32" pts</td>"
					      "<td>docid=%"UINT64" was deemed to "
					      "be incorrect "
					      "by user=\"%s\"</td>",
					      (int32_t)trans->m_number,
					      trans->m_docId,
					      trans->m_desc);
			else if ( trans->m_actionType == AT_ACCURACY_EVAL)
				sb.safePrintf("<td>accuracy eval</td>"
					      "<td>%.02f</td>"
					      "<td>docid=%"UINT64"</td>",
					      trans->m_number,
					      trans->m_docId);
			else if ( trans->m_actionType == AT_CHARGE)
				sb.safePrintf("<td>credit</td>"
					      "<td>%.02f</td>"
					      "<td>You made money.</td>",
					      trans->m_number);
			else if ( trans->m_actionType == AT_PAYMENT)
				sb.safePrintf("<td>payment</td>"
					      "<td>%.02f</td>"
					      "<td>We paid you.</td>",
					      trans->m_number);
			else if ( trans->m_actionType == AT_LOGIN)
				sb.safePrintf("<td>login</td>"
					      "<td>-</td>"
					      "<td>You logged in.</td>");
			else if ( trans->m_actionType == AT_LOGOUT)
				sb.safePrintf("<td>logout</td>"
					      "<td>-</td>"
					      "<td>You logged out.</td>");
			else if ( trans->m_actionType == AT_AUTO_LOGOUT)
				sb.safePrintf("<td>logout</td>"
					      "<td>-</td>"
					      "<td>You were auto "
					      "logged out.</td>");
			else {
				char *xx=NULL;*xx=0; }
			sb.safePrintf("</tr>\n");
			continue;
		}
		// if does not match last day, print out that last day's stats
		// and reset for next guy
		if ( daynum != lastDay && lastDay != -1 ) {
			// make it into a nice date
			time_t dd = lastDay * 86400;
			struct tm *timeStruct = localtime ( &dd );
			char ppp[100];
			strftime(ppp,100,"%b-%d-%Y",timeStruct);
			// print last days stats first
			sb.safePrintf("<td>%s</td>",ppp);
			// then stats
			sb.safePrintf("<tr>"
				      "<td>receive</td>"
				      "<td>%"INT32"</td>"
				      "<td>Total received</td>"
				      "</tr>\n",
				      totalReceives);
			sb.safePrintf("<tr>"
				      "<td>submit</td>"
				      "<td>%"INT32"</td>"
				      "<td>Total submitted</td>"
				      "</tr>\n",
				      totalSubmits);
			sb.safePrintf("<tr>"
				      "<td>pass</td>"
				      "<td>%"INT32"</td>"
				      "<td>Total accuracy tests passed</td>"
				      "</tr>\n",
				      totalPasses);
			sb.safePrintf("<tr>"
				      "<td>fail</td>"
				      "<td>%"INT32"</td>"
				      "<td>Total accuracy tests failed</td>"
				      "</tr>\n",
				      totalFails);
			// reset as well
			totalReceived = 0;
			totalSubmits  = 0;
			totalPasses   = 0;
			totalFails    = 0;
		}
		// remember last day # we processed for accumulating stats
		lastDay = daynum;
		// accum stats
		if ( trans->m_actionType == AT_RECEIVE_DOC )
			totalReceives++;
		if ( trans->m_actionType == AT_SUBMIT_DOC )
			totalSubmits++;
		if ( trans->m_actionType == AT_PASS_DOC )
			totalPasses++;
		if ( trans->m_actionType == AT_FAIL_DOC )
			totalFails++;
	}

	sb.safePrintf("</body></html>\n");

	sendReply ( &sb );
}
// . returns false if blocked, otherwise true
// . sets g_errno on error
bool sendPageStatsdb ( TcpSocket *s, HttpRequest *r ) {
	
	char *cgi;
	long cgiLen;
	StateStatsdb *st;
	try { st = new StateStatsdb; }
	catch ( ... ) {
		g_errno = ENOMEM;
		log(LOG_INFO, "PageStatsdb: failed to allocate state memory.");
		return true;
	}
	mnew( st, sizeof(StateStatsdb), "PageStatsdb" );

	st->m_niceness     = MAX_NICENESS;

	st->m_socket  	   = s;
	st->m_request 	   = *r;

	// hostId must be one of the following:
	// 	 0-n - a valid hostId
	//	-1   - a sample (subset) of the hosts
	//	-2   - all hosts
	//	-3   - this host
	st->m_hostId = r->getLong( "host", -3 );
	if ( st->m_hostId == -3 )
		st->m_hostId = g_hostdb.getMyHostId();

        // If we are pulling from multiple hosts, are we merging
        // the data into a single graph?
	// TODO:
	// - Make sure this always happens. Now our only concern
	//   is how many stats we will be drawing.
	//st->m_mergeResults = (bool    )r->getLong( "merge_results" , 1 );

	// get session parameters
	st->m_cacti	   = (bool   )r->getLong( "cacti"       , 0 );

	// get date parameters
	cgi = r->getString( "sdate" , &cgiLen , NULL );
	st->m_startDate = genDate( cgi, cgiLen );
	cgi = r->getString( "edate" , &cgiLen , NULL );
	st->m_endDate = genDate( cgi, cgiLen );
	st->m_dateCustom   = (bool)r->getLong( "custom",  0 );
	// default to 10 hours, i would do 1 day except that there are
	// some bugs that mess up the display a lot when i do that
	st->m_datePeriod   = r->getLong( "date_period" , 36000 );
	st->m_dateUnits    = r->getLong( "date_units"  , 1 );//SECS_PER_MIN
	st->m_now	   = (bool)r->getLong( "date_now"   , 1 );
	st->m_autoUpdate   = (bool)r->getLong( "auto_update" , 0 );

	// # samples in moving average
	st->m_samples      = r->getLong( "samples" , 300 );


	//if ( st->m_columns < MIN_COLUMNS || st->m_columns > MAX_COLUMNS )
	//	st->m_columns = DEF_COLUMNS;

	if ( st->m_now )
		st->m_startDate = (time_t)getTimeGlobal();

	st->m_startDateR = st->m_startDate;
	st->m_endDateR   = st->m_endDate; 

	if ( ! st->m_dateCustom ) {
		st->m_endDateR = st->m_startDateR - ( st->m_datePeriod *
						      st->m_dateUnits );
		st->m_endDate = st->m_endDateR;
	}


	if ( ! g_statsdb.makeGIF ( st->m_endDateR   ,
				   st->m_startDateR ,
				   st->m_samples ,
				   &st->m_sb2 ,
				   st               ,
				   sendReply        ) )
		return false;

	// if we didn't block call it ourselves directly
	sendReply ( st );

	return true;
}
Exemplo n.º 10
0
bool Msg3a::gotAllSplitReplies ( ) {

    // if any of the split requests had an error, give up and set m_errno
    // but don't set if for non critical errors like query truncation
    if ( m_errno ) {
        g_errno = m_errno;
        return true;
    }

    // also reset the finalbuf and the oldNumTopDocIds
    if ( m_finalBuf ) {
        mfree ( m_finalBuf, m_finalBufSize, "Msg3aF" );
        m_finalBuf     = NULL;
        m_finalBufSize = 0;
    }

    // update our estimated total hits
    m_numTotalEstimatedHits = 0;

    for ( long i = 0; i < m_numHosts ; i++ ) {
        // get that host that gave us the reply
        //Host *h = g_hostdb.getHost(i);
        // . get the reply from multicast
        // . multicast should have destroyed all slots, but saved reply
        // . we are responsible for freeing the reply
        // . we need to call this even if g_errno or m_errno is
        //   set so we can free the replies in Msg3a::reset()
        // . if we don't call getBestReply() on it multicast should
        //   free it, because Multicast::m_ownReadBuf is still true
        Multicast *m = &m_mcast[i];
        bool freeit = false;
        long  replySize = 0;
        long  replyMaxSize;
        char *rbuf;
        Msg39Reply *mr;
        // . only get it if the reply not already full
        // . if reply already processed, skip
        // . perhaps it had no more docids to give us or all termlists
        //   were exhausted on its disk and this is a re-call
        // . we have to re-process it for count m_numTotalEstHits, etc.
        rbuf = m->getBestReply ( &replySize    ,
                                 &replyMaxSize ,
                                 &freeit       ,
                                 true          ); //stealIt?
        // cast it
        mr = (Msg39Reply *)rbuf;
        // in case of mem leak, re-label from "mcast" to this so we
        // can determine where it came from, "Msg3a-GBR"
        relabel( rbuf, replyMaxSize , "Msg3a-GBR" );
        // . we must be able to free it... we must own it
        // . this is true if we should free it, but we should not have
        //   to free it since it is owned by the slot?
        if ( freeit ) {
            log(LOG_LOGIC,"query: msg3a: Steal failed.");
            char *xx = NULL;
            *xx=0;
        }
        // bad reply?
        if ( ! mr ) {
            log(LOG_LOGIC,"query: msg3a: Bad NULL reply.");
            m_reply       [i] = NULL;
            m_replyMaxSize[i] = 0;
            // it might have been timd out, just ignore it!!
            continue;
            // if size is 0 it can be Msg39 giving us an error!
            g_errno = EBADREPLYSIZE;
            m_errno = EBADREPLYSIZE;
            // all reply buffers should be freed on reset()
            return true;
        }
        // how did this happen?
        if ( replySize < 29 && ! mr->m_errno ) {
            // if size is 0 it can be Msg39 giving us an error!
            g_errno = EBADREPLYSIZE;
            m_errno = EBADREPLYSIZE;
            log(LOG_LOGIC,"query: msg3a: Bad reply size of %li.",
                replySize);
            // all reply buffers should be freed on reset()
            return true;
        }

        // can this be non-null? we shouldn't be overwriting one
        // without freeing it...
        if ( m_reply[i] )
            // note the mem leak now
            log("query: mem leaking a 0x39 reply");

        // cast it and set it
        m_reply       [i] = mr;
        m_replyMaxSize[i] = replyMaxSize;
        // deserialize it (just sets the ptr_ and size_ member vars)
        //mr->deserialize ( );
        deserializeMsg ( sizeof(Msg39Reply) ,
                         &mr->size_docIds,
                         &mr->size_clusterRecs,
                         &mr->ptr_docIds,
                         mr->m_buf );

        // sanity check
        if ( mr->m_nqt != m_q->getNumTerms() ) {
            g_errno = EBADREPLY;
            m_errno = EBADREPLY;
            log("query: msg3a: Split reply qterms=%li != %li.",
                (long)mr->m_nqt,(long)m_q->getNumTerms() );
            return true;
        }
        // return if split had an error, but not for a non-critical
        // error like query truncation
        if ( mr->m_errno && mr->m_errno != EQUERYTRUNCATED ) {
            g_errno = mr->m_errno;
            m_errno = mr->m_errno;
            log("query: msg3a: Split had error: %s",
                mstrerror(g_errno));
            return true;
        }
        // skip down here if reply was already set
        //skip:
        // add of the total hits from each split, this is how many
        // total results the lastest split is estimated to be able to
        // return
        // . THIS should now be exact since we read all termlists
        //   of posdb...
        m_numTotalEstimatedHits += mr->m_estimatedHits;

        // debug log stuff
        if ( ! m_debug ) continue;
        // cast these for printing out
        long long *docIds    = (long long *)mr->ptr_docIds;
        score_t   *scores    = (score_t   *)mr->ptr_scores;
        // print out every docid in this split reply
        for ( long j = 0; j < mr->m_numDocIds ; j++ ) {
            // print out score_t
            logf( LOG_DEBUG,
                  "query: msg3a: [%lu] %03li) "
                  "split=%li docId=%012llu domHash=0x%02lx "
                  "score=%lu"                     ,
                  (unsigned long)this                      ,
                  j                                        ,
                  i                                        ,
                  docIds [j] ,
                  (long)g_titledb.getDomHash8FromDocId(docIds[j]),
                  (long)scores[j] );
        }
    }

    // this seems to always return true!
    mergeLists ( );

    if ( ! m_r->m_useSeoResultsCache ) return true;

    // now cache the reply
    SafeBuf cr;
    long dataSize = 4 + 4 + 4 + m_numDocIds * (8+4+4);
    long need = sizeof(key_t) + 4 + dataSize;
    bool status = cr.reserve ( need );
    // sanity
    if ( ( m_ckey.n0 & 0x01 ) == 0x00 ) {
        char *xx=NULL;
        *xx=0;
    }
    // ignore errors
    g_errno = 0;
    // return on error with g_errno cleared if cache add failed
    if ( ! status ) return true;
    // add to buf otherwise
    cr.safeMemcpy ( &m_ckey , sizeof(key_t) );
    cr.safeMemcpy ( &dataSize , 4 );
    long now = getTimeGlobal();
    cr.pushLong ( now );
    cr.pushLong ( m_numDocIds );
    cr.pushLong ( m_numTotalEstimatedHits );//Results );
    long max = m_numDocIds;
    // then the docids
    for ( long i = 0 ; i < max ; i++ )
        cr.pushLongLong(m_docIds[i] );
    for ( long i = 0 ; i < max ; i++ )
        cr.pushFloat(m_scores[i]);
    for ( long i = 0 ; i < max ; i++ )
        cr.pushLong(getSiteHash26(i));
    // sanity
    if ( cr.length() != need ) {
        char *xx=NULL;
        *xx=0;
    }
    // make these
    key_t startKey;
    key_t endKey;
    startKey = m_ckey;
    // clear delbit
    startKey.n0 &= 0xfffffffffffffffeLL;
    // end key is us
    endKey = m_ckey;
    // that is the single record
    m_seoCacheList.set ( cr.getBufStart() ,
                         cr.length(),
                         cr.getBufStart(), // alloc
                         cr.getCapacity(), // alloc size
                         (char *)&startKey,
                         (char *)&endKey,
                         -1, // fixeddatasize
                         true, // owndata?
                         false,// use half keys?
                         sizeof(key_t) );
    // do not allow cr to free it, msg1 will
    cr.detachBuf();
    // note it
    //log("seopipe: storing ckey=%s q=%s"
    //    ,KEYSTR(&m_ckey,12)
    //    ,m_r->ptr_query
    //    );
    //log("msg1: sending niceness=%li",(long)m_r->m_niceness);
    // this will often block, but who cares!? it just sends a request off
    if ( ! m_msg1.addList ( &m_seoCacheList ,
                            RDB_SERPDB,//RDB_CACHEDB,
                            m_r->ptr_coll,
                            this, // state
                            gotSerpdbReplyWrapper, // callback
                            false, // forcelocal?
                            m_r->m_niceness ) ) {
        //log("blocked");
        return false;
    }

    // we can safely delete m_msg17... just return true
    return true;
}
// a cacheTime of -1 means browser should not cache at all
void HttpMime::makeMime  ( long    totalContentLen    , 
			   long    cacheTime          ,
			   time_t  lastModified       ,
			   long    offset             , 
			   long    bytesToSend        ,
			   char   *ext                ,
			   bool    POSTReply          ,
			   char   *contentType        ,
			   char   *charset            ,
			   long    httpStatus         ,
			   char   *cookie             ) {
	// assume UTF-8
	//if ( ! charset ) charset = "utf-8";
	// . make the content type line
	// . uses a static buffer
	if ( ! contentType ) 
		contentType = (char *)getContentTypeFromExtension ( ext );

	// do not cache plug ins
	if ( contentType && strcmp(contentType,"application/x-xpinstall")==0)
		cacheTime = -2;

	// assume UTF-8, but only if content type is text
	// . No No No!!!  
	// . This prevents charset specification in html files
	// . -partap

	//if ( ! charset && contentType && strncmp(contentType,"text",4)==0) 
	//	charset = "utf-8";
	// this is used for bz2 and gz files (mp3?)
	const char *contentEncoding = getContentEncodingFromExtension ( ext );
	// the string
	char enc[128];
	if ( contentEncoding ) 
		sprintf ( enc , "Content-Encoding: %s\r\n", contentEncoding );
	else
		enc[0] = '\0';
	// get the time now
	//time_t now = getTimeGlobal();
	time_t now;
	if ( isClockInSync() ) now = getTimeGlobal();
	else                   now = getTimeLocal();
	// get the greenwhich mean time (GMT)
	char ns[128];
	struct tm *timeStruct = gmtime ( &now );
	// Wed, 20 Mar 2002 16:47:30 GMT
	strftime ( ns , 126 , "%a, %d %b %Y %T GMT" , timeStruct );
	// if lastModified is 0 use now
	if ( lastModified == 0 ) lastModified = now;
	// convert lastModified greenwhich mean time (GMT)
	char lms[128];
	timeStruct = gmtime ( &lastModified );
	// Wed, 20 Mar 2002 16:47:30 GMT
	strftime ( lms , 126 , "%a, %d %b %Y %T GMT" , timeStruct );
	// . the pragma no cache string (used just for proxy servers?)
	// . also use cache-control: for the browser itself (HTTP1.1, though)
	// . pns = "Pragma: no-cache\nCache-Control: no-cache\nExpires: -1\n";
	char tmp[128];
	char *pns ;
	// with cache-control on, when you hit the back button, it reloads
	// the page, this is bad for most things... so we only avoid the
	// cache for index.html and PageAddUrl.cpp (the main and addurl page)
	if      ( cacheTime == -2 ) pns =  "Cache-Control: no-cache\r\n"
					   "Pragma: no-cache\r\n"
					   "Expires: -1\r\n";
	// so when we click on a control link, it responds correctly.
	// like turning spiders on.
	else if  ( cacheTime == -1 ) pns = "Pragma: no-cache\r\n"
					   "Expires: -1\r\n";
	// don't specify cache times if it's 0 (let browser regulate it)
	else if ( cacheTime == 0 ) pns = "";
	// otherwise, expire tag: "Expires: Wed, 23 Dec 2001 10:23:01 GMT"
	else {
		time_t  expDate = now + cacheTime;
		timeStruct = gmtime ( &expDate );
		strftime ( tmp , 100 , "Expires: %a, %d %b %Y %T GMT\r\n", 
			   timeStruct );
		pns = tmp;
	}
	// . set httpStatus
	// . a reply to a POST (not a GET or HEAD) should be 201
	char *p = m_buf;
	char *smsg = "";
	if ( POSTReply ) {
		if ( httpStatus == -1 ) httpStatus = 200;
		if ( httpStatus == 200 ) smsg = " OK";
		if ( ! charset ) charset = "utf-8";
		//sprintf ( m_buf , 
		p += sprintf ( p,
			  "HTTP/1.0 %li%s\r\n"
			  "Date: %s\r\n"
			       //"P3P: CP=\"CAO PSA OUR\"\r\n"
			  "Server: Gigablast/1.0\r\n"
			  "Content-Length: %li\r\n"
			  //"Expires: Wed, 23 Dec 2003 10:23:01 GMT\r\n"
			  //"Expires: -1\r\n"
			  "Connection: Close\r\n"
			  "%s"
			  "Content-Type: %s\r\n\r\n",
			  //"Connection: Keep-Alive\r\n"
			  //"%s"
			  //"Location: f**k\r\n"
			  //"Location: http://192.168.0.4:8000/cgi/3.cgi\r\n"
			  //"Last-Modified: %s\r\n\r\n" ,
			  httpStatus , smsg ,
			  ns , totalContentLen , enc , contentType  );
			  //pns ,
	                  //ns );
			  //lms );
	}
	// . is it partial content?
	// . if bytesToSend is < 0 it means "totalContentLen"
	else if ( offset > 0 || bytesToSend != -1 ) {
		if ( httpStatus == -1 ) httpStatus = 206;
		if ( ! charset ) charset = "utf-8";
		//sprintf ( m_buf , 
		p += sprintf( p,
			      "HTTP/1.0 %li Partial content\r\n"
			      "%s"
			      "Content-Length: %li\r\n"
			      "Content-Range: %li-%li(%li)\r\n"// added "bytes"
			      "Connection: Close\r\n"
			      //"P3P: CP=\"CAO PSA OUR\"\r\n"
			      "Server: Gigablast/1.0\r\n"
			      "%s"
			      "Date: %s\r\n"
			      "Last-Modified: %s\r\n" 
			      "Content-Type: %s\r\n",
			      httpStatus ,
			      enc ,bytesToSend ,
			      offset , offset + bytesToSend , 
			      totalContentLen ,
			      pns ,
			      ns , 
			      lms , contentType );
		// otherwise, do a normal mime
	}
	else {
		char encoding[256];
		if (charset) sprintf(encoding, "; charset=%s", charset);
		else encoding[0] = '\0';
		
		
		if ( httpStatus == -1 ) httpStatus = 200;
		if ( httpStatus == 200 ) smsg = " OK";
		//sprintf ( m_buf , 
		p += sprintf( p,
			      "HTTP/1.0 %li%s\r\n"
			      // make it at least 4 spaces so we can change
			      // the length of the content should we insert
			      // a login bar in Proxy::storeLoginBar()
			      "Content-Length: %04li\r\n"
			      "%s"
			      "Content-Type: %s",
			      httpStatus , smsg ,
			      totalContentLen , enc , contentType );
		if ( charset ) p += sprintf ( p , "; charset=%s", charset );
		p += sprintf ( p , "\r\n");
		p += sprintf ( p ,
			       //"Connection: Keep-Alive\r\n"
			       "Connection: Close\r\n"
			       //"P3P: CP=\"CAO PSA OUR\"\r\n"
			       "Server: Gigablast/1.0\r\n"
			       "%s"
			       "Date: %s\r\n"
			       "Last-Modified: %s\r\n" ,
			       pns ,
			       ns , 
			       lms );
	}
	// write the cookie if we have one
	if (cookie) {
		// now it is a list of Set-Cookie: x=y\r\n lines
		//p += sprintf ( p, "Set-Cookie: %s\r\n", cookie);
		if ( strncmp(cookie,"Set-Cookie",10 ) )
			p += sprintf(p,"Set-Cookie: ");
		p += sprintf ( p, "%s", cookie);
		if ( p[-1] != '\n' && p[-2] != '\r' ) {
			*p++ = '\r';
			*p++ = '\n';
		}
	}
			
	// write another line to end the mime
	p += sprintf(p, "\r\n");
	// set the mime's length
	//m_bufLen = gbstrlen ( m_buf );
	m_bufLen = p - m_buf;
}
Exemplo n.º 12
0
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . dumps the RdbTree, m_tree, into m_file
// . also sets and writes the RdbMap for m_file
// . we methodically get RdbLists from the RdbTree 
// . dumped recs are ordered by key if "orderedDump" was true in call to set()
//   otherwise, lists are ordered by node #
// . we write each list of recs to the file until the whole tree has been done
// . we delete all records in list from the tree after we've written the list
// . if a cache was provided we incorporate the list into the cache before
//   deleting it from the tree to keep the cache in sync. NO we do NOT!
// . called again by writeBuf() when it's done writing the whole list
bool RdbDump::dumpTree ( bool recall ) {
	// set up some vars
	//int32_t  nextNode;
	//key_t maxEndKey;
	//maxEndKey.setMax();
	char maxEndKey[MAX_KEY_BYTES];
	KEYMAX(maxEndKey,m_ks);
	// if dumping statsdb, we can only dump records 30 seconds old or
	// more because Statsdb.cpp can "back modify" such records in the tree
	// because it may have a query that took 10 seconds come in then it
	// needs to add a partial stat to the last 10 stats for those 10 secs.
	// we use Global time at this juncture
	if ( m_rdb->m_rdbId == RDB_STATSDB ) {
		int32_t nowSecs = getTimeGlobal();
		StatKey *sk = (StatKey *)maxEndKey;
		sk->m_zero      = 0x01;
		sk->m_labelHash = 0xffffffff;
		// leave last 60 seconds in there just to be safe
		sk->m_time1     = nowSecs - 60;
	}

	// this list will hold the list of nodes/recs from m_tree
	m_list = &m_ourList;
	// convert coll to collnum
	//collnum_t collnum = g_collectiondb.getCollnum ( m_coll );
	// a collnum of -1 is for collectionless rdbs
	//if ( collnum < 0 ) {
	//	//if ( g_catdb->getRdb() == m_rdb )
	//	if ( ! m_rdb->m_isCollectionLess ) {
	//		char *xx=NULL;*xx=0; //return true;
	//	}
	//	g_errno = 0;
	//	collnum = 0;
	//}
	// getMemOccupiedForList2() can take some time, so breathe
	int32_t niceness = 1;
 loop:
	// if the lastKey was the max end key last time then we're done
	if ( m_rolledOver     ) return true;
	// this is set to -1 when we're done with our unordered dump
	if ( m_nextNode == -1 ) return true;
	// . NOTE: list's buffer space should be re-used!! (TODO)
	// . "lastNode" is set to the last node # in the list
	bool status = true;
	//if ( ! m_orderedDump ) {
	//	status = ((RdbTree *)m_tree)->getListUnordered ( m_nextNode ,
	//							 m_maxBufSize ,
	//							 m_list , 
	//							 &nextNode );
	//	// this is -1 when no more nodes are left
	//	m_nextNode = nextNode;
	//}
	// "lastKey" is set to the last key in the list
	//else {
	{

		// can we remove neg recs?
		// class RdbBase *base = m_rdb->getBase(m_collnum);
		// bool removeNegRecs = false;
		// if ( base->m_numFiles <= 0 ) removeNegRecs = true;

		if ( recall ) goto skip;

		// debug msg
		//log("RdbDump:: getting list");
		m_t1 = gettimeofdayInMilliseconds();
		if(m_tree)
			status = m_tree->getList ( m_collnum       ,
					   m_nextKey     , 
					   maxEndKey     ,
					   m_maxBufSize  , // max recSizes
					   m_list        , 
					   &m_numPosRecs   ,
					   &m_numNegRecs   ,
					   m_useHalfKeys ,
						   niceness );
		else if(m_buckets)
			status = m_buckets->getList ( m_collnum,
					   m_nextKey     , 
					   maxEndKey     ,
					   m_maxBufSize  , // max recSizes
					   m_list        , 
					   &m_numPosRecs   ,
					   &m_numNegRecs   ,
					   m_useHalfKeys );


		// don't dump out any neg recs if it is our first time dumping
		// to a file for this rdb/coll. TODO: implement this later.
		//if ( removeNegRecs )
		//	m_list.removeNegRecs();

 		// if(!m_list->checkList_r ( false , // removeNegRecs?
 		// 			 false , // sleep on problem?
 		// 			 m_rdb->m_rdbId )) {
 		// 	log("db: list to dump is not sane!");
		// 	char *xx=NULL;*xx=0;
 		// }


	skip:
		int64_t t2;
		//key_t lastKey;
		char *lastKey;
		// if error getting list (out of memory?)
		if ( ! status ) goto hadError;
		// debug msg
		t2 = gettimeofdayInMilliseconds();
		log(LOG_INFO,"db: Get list took %"INT64" ms. "
		    "%"INT32" positive. %"INT32" negative.",
		    t2 - m_t1 , m_numPosRecs , m_numNegRecs );
		// keep a total count for reporting when done
		m_totalPosDumped += m_numPosRecs;
		m_totalNegDumped += m_numNegRecs;
		// . check the list we got from the tree for problems
		// . ensures keys are ordered from lowest to highest as well
		//#ifdef GBSANITYCHECK
		if ( g_conf.m_verifyWrites ) {
			char *s = "none";
			if ( m_rdb ) s = getDbnameFromId(m_rdb->m_rdbId);
			log("dump: verifying list before dumping (rdb=%s)",s);
			m_list->checkList_r ( false , // removeNegRecs?
					      false , // sleep on problem?
					      m_rdb->m_rdbId );
		}
		// if list is empty, we're done!
		if ( status && m_list->isEmpty() ) {
			// consider that a rollover?
			if ( m_rdb->m_rdbId == RDB_STATSDB )
				m_rolledOver = true;
			return true;
		}
		// get the last key of the list
		lastKey = m_list->getLastKey();
		// advance m_nextKey
		//m_nextKey  = lastKey ;
		//m_nextKey += (uint32_t)1;
		//if ( m_nextKey < lastKey ) m_rolledOver = true;
		KEYSET(m_nextKey,lastKey,m_ks);
		KEYADD(m_nextKey,1,m_ks);
		if (KEYCMP(m_nextKey,lastKey,m_ks)<0) m_rolledOver = true;
	      // debug msg
	      //log(0,"RdbDump:lastKey.n1=%"UINT32",n0=%"UINT64"",lastKey.n1,lastKey.n0);
	      //log(0,"RdbDump:next.n1=%"UINT32",n0=%"UINT64"",m_nextKey.n1,m_nextKey.n0);
	}
	// . return true on error, g_errno should have been set
	// . this is probably out of memory error
	if ( ! status ) {
	hadError:
		log("db: Had error getting data for dump: %s. Retrying.", 
		    mstrerror(g_errno));
		// debug msg
		//log("RdbDump::getList: sleeping and retrying");
		// retry for the remaining two types of errors
		if (!g_loop.registerSleepCallback(1000,this,tryAgainWrapper2)){
			log(
			    "db: Retry failed. Could not register callback.");
			return true;
		}
		// wait for sleep
		return false;
	}
	// if list is empty, we're done!
	if ( m_list->isEmpty() ) return true;
	// . set m_firstKeyInQueue and m_lastKeyInQueue
	// . this doesn't work if you're doing an unordered dump, but we should
	//   not allow adds when closing
	m_lastKeyInQueue  = m_list->getLastKey();
	//m_firstKeyInQueue = m_list->getCurrentKey();
	m_list->getCurrentKey(m_firstKeyInQueue);
	// . write this list to disk
	// . returns false if blocked, true otherwise
	// . sets g_errno on error
	// . if this blocks it should call us (dumpTree() back)
	if ( ! dumpList ( m_list , m_niceness , false ) ) return false;
	// close up shop on a write/dumpList error
	if ( g_errno ) return true;
	// . if dumpList() did not block then keep on truckin'
	// . otherwise, wait for callback of dumpTree()
	goto loop;
}
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
	// . get fields from cgi field of the requested url
	// . get the search query
	long  urlLen = 0;
	char *url = r->getString ( "u" , &urlLen , NULL /*default*/);

	// see if they provided a url of a file of urls if they did not
	// provide a url to add directly
	//bool isAdmin = g_collectiondb.isAdmin ( r , s );
	bool isAdmin = r->getIsLocal();
	long  ufuLen = 0;
	char *ufu = NULL;
	if ( isAdmin )
		// get the url of a file of urls (ufu)
		ufu = r->getString ( "ufu" , &ufuLen , NULL );

	// can't be too long, that's obnoxious
	if ( urlLen > MAX_URL_LEN || ufuLen > MAX_URL_LEN ) {
		g_errno = EBUFTOOSMALL;
		g_msg = " (error: url too long)";
		return g_httpServer.sendErrorReply(s,500,"url too long");
	}
	// get the collection
	long  collLen = 0;
	char *coll    = r->getString("c",&collLen);
	if ( ! coll || ! coll[0] ) {
		//coll    = g_conf.m_defaultColl;
		coll = g_conf.getDefaultColl( r->getHost(), r->getHostLen() );
		collLen = gbstrlen(coll);
	}
	// get collection rec
	CollectionRec *cr = g_collectiondb.getRec ( coll );
	// bitch if no collection rec found
	if ( ! cr ) {
		g_errno = ENOCOLLREC;
		g_msg = " (error: no collection)";
		return g_httpServer.sendErrorReply(s,500,"no coll rec");
	}
	// . make sure the ip is not banned
	// . we may also have an exclusive list of IPs for private collections
	if ( ! cr->hasSearchPermission ( s ) ) {
		g_errno = ENOPERM;
		g_msg = " (error: permission denied)";
		return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
	}
	// make a new state
	State1 *st1 ;
	try { st1 = new (State1); }
	catch ( ... ) { 
		g_errno = ENOMEM;
		log("PageAddUrl: new(%i): %s", 
		    sizeof(State1),mstrerror(g_errno));
		return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno)); }
	mnew ( st1 , sizeof(State1) , "PageAddUrl" );
	// save socket and isAdmin
	st1->m_socket  = s;
	st1->m_isAdmin = isAdmin;

	// assume no url buf yet, set below
	//st1->m_ubuf      = NULL;
	//st1->m_ubufAlloc = NULL;
	//st1->m_metaList  = NULL;

	// save the url
	st1->m_url[0] = '\0';
	if ( url ) {
		// normalize and add www. if it needs it
		Url uu;
		uu.set ( url , gbstrlen(url) , true );
		// remove >'s i guess and store in st1->m_url[] buffer
		st1->m_urlLen=cleanInput ( st1->m_url,
					   MAX_URL_LEN, 
					   uu.getUrl(),
					   uu.getUrlLen() );
		// point to that as the url "buf" to add
		//st1->m_ubuf      = st1->m_url;
		//st1->m_ubufSize  = urlLen;
		//st1->m_ubufAlloc = NULL; // do not free it!
	}

	// save the "ufu" (url of file of urls)
	st1->m_ufu[0] = '\0';
	st1->m_ufuLen  = ufuLen;
	memcpy ( st1->m_ufu , ufu , ufuLen );
	st1->m_ufu[ufuLen] = '\0';

	st1->m_doTuringTest = cr->m_doTuringTest;
	char *username     = g_users.getUsername(r);
	if(username) strcpy(st1->m_username,username);
	//st1->m_user    = g_pages.getUserType ( s , r );
	st1->m_spiderLinks = true;
	st1->m_strip   = true;
	//st1->m_raw = r->getLong("raw",0);

	// init state2
	for ( long i = 0; i < 5; i++ ){
		st1->m_state2[i].m_buf = NULL;
		st1->m_state2[i].m_bufLen = 0;
		st1->m_state2[i].m_bufMaxLen = 0;
	}

	// save the collection name in the State1 class
	if ( collLen > MAX_COLL_LEN ) collLen = MAX_COLL_LEN;
	strncpy ( st1->m_coll , coll , collLen );
	st1->m_coll [ collLen ] = '\0';

	// assume they answered turing test correctly
	st1->m_goodAnswer = true;
	// if addurl is turned off, just print "disabled" msg
	if ( ! g_conf.m_addUrlEnabled ) return sendReply ( st1 , false );
	// can also be turned off in the collection rec
	if ( ! cr->m_addUrlEnabled    ) return sendReply ( st1 , false );
	// or if in read-only mode
	if (   g_conf.m_readOnlyMode  ) return sendReply ( st1 , false );
	// cannot add if another Msg10 from here is still in progress
	if ( s_inprogress ) return sendReply ( st1 , true );
	// use now as the spiderTime

	// get ip of submitter
	//unsigned long h = ipdom ( s->m_ip );
	// . use top 2 bytes now, some isps have large blocks
	// . if this causes problems, then they can do pay for inclusion
	unsigned long h = iptop ( s->m_ip );
	long codeLen;
	char* code = r->getString("code", &codeLen);
	if(g_autoBan.hasCode(code, codeLen, s->m_ip)) {
		long uipLen = 0;
		char* uip = r->getString("uip",&uipLen);
		long hip = 0;
		//use the uip when we have a raw query to test if 
		//we can submit
		if(uip) {
			hip = atoip(uip, uipLen);
			h = iptop( hip );
		}
	}


	st1->m_strip = r->getLong("strip",0);
	// Remember, for cgi, if the box is not checked, then it is not 
	// reported in the request, so set default return value to 0
	long spiderLinks = r->getLong("spiderLinks",-1);
	// also support all lowercase like PageInject.cpp uses
	if ( spiderLinks == -1 )
		spiderLinks = r->getLong("spiderlinks",0);

	// . should we force it into spiderdb even if already in there
	// . use to manually update spider times for a url
	// . however, will not remove old scheduled spider times
	// . mdw: made force on the default
	st1->m_forceRespider = r->getLong("force",1); // 0);

	long now = getTimeGlobal();

	// . allow 1 submit every 1 hour
	// . restrict by submitter domain ip
	if ( ! st1->m_isAdmin &&
	     ! canSubmit ( h , now , cr->m_maxAddUrlsPerIpDomPerDay ) ) {
		// return error page
		g_errno = ETOOEARLY;
		return sendReply ( st1 , true );
	}


	//st1->m_query = r->getString( "qts", &st1->m_queryLen );


	// check it, if turing test is enabled for this collection
	if ( ! st1->m_isAdmin && cr->m_doTuringTest && 
	     ! g_turingTest.isHuman(r) )  {
		// log note so we know it didn't make it
		g_msg = " (error: bad answer)";
		//log("PageAddUrl:: addurl failed for %s : bad answer",
		//    iptoa(s->m_ip));
		st1->m_goodAnswer = false;
		return sendReply ( st1 , true /*addUrl enabled?*/ );
	}

	//if ( st1->m_queryLen > 0 )
	//	return getPages( st1 );

	// if no url given, just print a blank page
	if ( ! url ) return sendReply (  st1 , true );


	//
	// make a SpiderRequest
	//

	SpiderRequest *sreq = &st1->m_sreq;
	// reset it
	sreq->reset();
	// make the probable docid
	long long probDocId = g_titledb.getProbableDocId ( st1->m_url );
	// make one up, like we do in PageReindex.cpp
	long firstIp = (probDocId & 0xffffffff);
	// . now fill it up
	// . TODO: calculate the other values... lazy!!! (m_isRSSExt, 
	//         m_siteNumInlinks,...)
	sreq->m_isNewOutlink = 1;
	sreq->m_isAddUrl     = 1;
	sreq->m_addedTime    = now;
	sreq->m_fakeFirstIp   = 1;
	sreq->m_probDocId     = probDocId;
	sreq->m_firstIp       = firstIp;
	sreq->m_hopCount      = 0;
	// its valid if root
	Url uu; uu.set ( st1->m_url );
	if ( uu.isRoot() ) sreq->m_hopCountValid = true;
	// too big?
	//long len = st1->m_urlLen;
	// the url! includes \0
	strcpy ( sreq->m_url , st1->m_url );
	// call this to set sreq->m_dataSize now
	sreq->setDataSize();
	// make the key dude -- after setting url
	sreq->setKey ( firstIp , 0LL, false );
	// need a fake first ip lest we core!
	//sreq->m_firstIp = (pdocId & 0xffffffff);
	// how to set m_firstIp? i guess addurl can be throttled independently
	// of the other urls???  use the hash of the domain for it!
	long  dlen;
	char *dom = getDomFast ( st1->m_url , &dlen );
	// fake it for this...
	//sreq->m_firstIp = hash32 ( dom , dlen );
	// sanity
	if ( ! dom ) {
		g_errno = EBADURL;
		return sendReply ( st1 , true );
	}
	// shortcut
	Msg4 *m = &st1->m_msg4;
	// now add that to spiderdb using msg4
	if ( ! m->addMetaList ( (char *)sreq    ,
				sreq->getRecSize() ,
				coll            ,
				st1             , // state
				addedStuff      ,
				MAX_NICENESS    ,
				RDB_SPIDERDB    ) )
		// we blocked
		return false;

	// send back the reply
	return sendReply ( st1 , true );
}