// . continuation run once st->m_msg36 holds the term frequency
// . copies the frequency into the state, rewinds st->m_i and starts the
//   per-split requests
// . returns false if launchRequests() blocked, otherwise whatever
//   gotIndexList() returns
bool gotTermFreq ( State10 *st ) {
	// remember what the msg36 lookup produced
	st->m_termFreq = st->m_msg36.getTermFreq();
	// rewind the request cursor
	st->m_i = 0;
	// fan out to each indexdb/datedb split; false means we blocked
	const bool launchedAll = launchRequests ( st );
	if ( ! launchedAll ) return false;
	// nothing blocked, so carry straight on
	return gotIndexList ( (void *) st );
}
// . handles one msg36 reply, or just finalizes when "msg36" is NULL
// . returns false while launchRequests() reports more work outstanding
// . returns true once done (or once an error finished us)
// . sets g_errno to the first recorded error, if any
bool Msg37::gotTermFreq ( Msg36 *msg36 ) {
	// a NULL msg36 means there is no reply to account for; go straight
	// to the final error check below
	if ( msg36 ) {
		// latch the first error seen, never overwrite an earlier one
		if ( g_errno && ! m_errno ) m_errno = g_errno;
		// one more reply is in
		m_numReplies++;
		// the msg36 carries the term index and the slot it occupies
		const long termNum = msg36->m_i ;
		const long slot    = msg36->m_j ;
		// the slot must really be this msg36, otherwise core on purpose
		if ( msg36 != &m_msg36[slot] ) {
			log("query: msg37 failed sanity check 3.");
			char *xx = NULL; *xx = 0;
		}
		// only store the term freq for a reply that had no error
		if ( ! g_errno ) m_termFreqs[termNum] = msg36->m_termFreq;
		// the slot must have been marked busy, otherwise core on purpose
		if ( ! m_inUse[slot] ) {
			log("query: msg37 failed sanity check 2.");
			char *xx = NULL; *xx = 0;
		}
		// release the slot for reuse
		m_inUse[slot] = 0;
		// keep the pipeline full; false means we are not done yet
		if ( ! launchRequests() ) return false;
	}
	// surface any error that a reply left behind
	if ( m_errno ) g_errno = m_errno;
	// done either way; g_errno may be set
	return true;
}
// Example #3
// 0
// . called once m_list is available; decides whether to go on and launch
//   the per-image requests
// . returns false if blocked, true otherwise
bool Images::gotTermFreq ( ) {
	// an error is already pending, nothing more to do
	if ( g_errno ) return true;
	// count the keys from the list size: every key but the first is
	// 12 bytes (compressed)
	const long long numKeys = ( m_list.getListSize() - 6 ) / 12;
	// . the intended rule is to skip image extraction unless at least 10
	//   pages share this template, since otherwise a bad image is too
	//   likely
	// . MDW: for debugging that requirement is disabled, so only an
	//   impossible count bails out here (without setting g_errno)
	if ( numKeys < -2 ) return true;
	// see which of the image urls are unique; false means we blocked
	const bool finished = launchRequests ();
	if ( ! finished ) return false;
	// did not block
	return true;
}
// . looks up a term frequency for each of "numTerms" termIds and stores
//   them in "termFreqs"
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . "termIds" and "termFreqs" are kept by pointer, so they must NOT live on
//   the caller's stack in case we block
// . modelled on ../titled/Msg25.cpp, which also has several msgs out at once
bool Msg37::getTermFreqs ( collnum_t collnum,//char       *coll       ,
			   long        maxAge     ,
			   long long  *termIds    ,
			   long        numTerms   ,
			   long long  *termFreqs  ,
			   void       *state      ,
			   void (* callback)(void *state ) ,
			   long        niceness   ,
			   bool        exactCount ) {

	// complain about a bad collection number, but carry on regardless
	if ( collnum < 0 ) log(LOG_LOGIC,"net: bad collection. msg37.");

	// what we were asked to look up
	m_collnum     = collnum;
	m_termIds     = termIds;
	m_numTerms    = numTerms;
	m_termFreqs   = termFreqs;
	m_maxAge      = maxAge;
	m_exactCount  = exactCount;
	m_niceness    = niceness;
	// who to tell when we are done
	m_state       = state;
	m_callback    = callback;
	// nothing sent, nothing received, no error yet
	m_numRequests = 0;
	m_numReplies  = 0;
	m_errno       = 0;

	// default every freq to 1 so an errored lookup still leaves a value
	long n = 0;
	while ( n < m_numTerms ) m_termFreqs[n++] = 1LL;

	// start from the first term with every slot free
	m_i = 0;
	memset ( m_inUse , 0 , MAX_MSG36_OUT );

	// fire off the requests; false means at least one blocked
	if ( ! launchRequests() ) return false;
	// nothing blocked: finalize (NULL = no reply to process)
	gotTermFreq ( NULL );
	// we did not block
	return true;
}
// . make a web page from results stored in msg40
// . send it on TcpSocket "s" when done
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool gotIndexList ( void *state ) {
	// the state
	State10 *st = (State10 *) state;
	// launch more
	if ( ! launchRequests ( st ) ) return false;
	/*
	// get the date list
	//fprintf(stderr,"termId now=%lli\n",st->m_termId);
	//fprintf(stderr,"should be=%lli\n",(st->m_termId & TERMID_MASK));
	// . now get the indexList for this termId
	// . date is complemented, so start with bigger one first
	key128_t startKey = g_datedb.makeStartKey ( st->m_termId ,0xffffffff);
	key128_t endKey   = g_datedb.makeEndKey   ( st->m_termId ,0x0);
	// get the rdb ptr to titledb's rdb
	//Rdb *rdb = g_indexdb.getRdb();
	// -1 means read from all files in Indexdb
	long numFiles = -1;
	// make it zero if caller doesn't want to hit the disk
	if ( ! st->m_useDisk ) numFiles = 0;
	// get the title rec at or after this docId
	if ( ! st->m_msg0.getList ( -1 ,
				    0  ,
				    0  ,
				    0  ,    // max cache age
				    false , // add to cache?
				    RDB_DATEDB  , // rdbId of 2 = indexdb
				    st->m_coll ,
				    &st->m_list2  ,
				    (char *)&startKey  ,
				    (char *)&endKey    ,
				    st->m_numRecs * sizeof(key128_t),//recSizes
				    //st->m_useTree   , // include tree?
				    //st->m_useCache  , // include cache?
				    //false     , // add to cache?
				    //0         , // startFileNum
				    //numFiles  , // numFiles
				    st        , // state
				    gotIndexListWrapper2 ,
				    0  ) )  // niceness
		return false;
	// otherwise call gotResults which returns false if blocked, true else
	// and sets g_errno on error
	return gotIndexList2 ( (void *) st , NULL );
}


void gotIndexListWrapper2 ( void *state , RdbList *list ) {
	gotIndexList2 ( state , list );
}

void addedKeyWrapper ( void *state ) {
	gotIndexList2 ( state, NULL );
}

// . make a web page from results stored in msg40
// . send it on TcpSocket "s" when done
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool gotIndexList2 ( void *state , RdbList *list ) {
	// the state
	State10 *st = (State10 *) state;
	*/
	// get the socket
	TcpSocket *s = st->m_socket;
	// don't allow pages bigger than 128k in cache
	//char  buf [ 64*1024 ];
	// a ptr into "buf"
	//char *p    = buf;
	//char *pend = buf + 64*1024;
	/*
	// get termId
	key_t k = *(key_t *)st->m_list.getStartKey();
	long long termId = g_indexdb.getTermId ( k );
	// get groupId from termId
	//unsigned long groupId = k.n1 & g_hostdb.m_groupMask;
	unsigned long groupId = g_indexdb.getGroupIdFromKey ( &k );
	long hostnum = g_hostdb.makeHostId ( groupId );
	*/
	// check box " checked" strings
	char *ubs = "";
	char *uts = "";
	char *uds = "";
	char *ucs = "";
	char *add = "";
	char *del = "";
	if ( st->m_useDatedb) ubs = " checked";
	if ( st->m_useTree  ) uts = " checked";
	if ( st->m_useDisk  ) uds = " checked";
	if ( st->m_useCache ) ucs = " checked";
	if ( st->m_add      ) add = " checked";
	if ( st->m_del      ) del = " checked";

	SafeBuf *pbuf = &st->m_pbuf;

	g_pages.printAdminTop ( pbuf , st->m_socket , &st->m_r );

	// get base, returns NULL and sets g_errno to ENOCOLLREC on error
	RdbBase *base; 
	if (!(base=getRdbBase((uint8_t)RDB_INDEXDB,st->m_coll))) return true;

	// print the standard header for admin pages
	pbuf->safePrintf ( 
		  "<center>\n"
		  "<table cellpadding=2><tr><td colspan=4>"
		  "useDatedb:<input type=checkbox value=1 name=ub%s> "
		  "useTree:<input type=checkbox value=1 name=ut%s> "
		  "useDisk:<input type=checkbox value=1 name=ud%s> "
		  "useCache:<input type=checkbox value=1 name=uc%s> "
		  "ADD:<input type=checkbox value=1 name=add%s> "
		  "DELETE:<input type=checkbox value=1 name=del%s>"
		  "</td></tr><tr><td>"
		  "query:"
		  "</td><td>"
		  "<input type=text name=q value=\"%s\" size=20>"
		  "</td><td>"
		  "collection:"
		  "</td><td>"
		  "<input type=text name=c value=\"%s\" size=10>"
		  "</td></tr><tr><td>"
		  "termId:"
		  "</td><td>"
		  "<input type=text name=t value=%lli size=20>"
		  "</td><td>"
		  "numRecs:"
		  "</td><td>"
		  "<input type=text name=numRecs value=%li size=10> "
		  "</td></tr><tr><td>"
		  "docId:"
		  "</td><td>"
		  "<input type=text name=d value=%lli size=20> "
		  "</td><td>"
		  "score:"
		  "</td><td>"
		  "<input type=text name=score value=%li size=10> "
		  "</td><td>"
		  "<input type=submit value=ok border=0>"
		  "</td></tr>"
		  "<tr><td colspan=2>"
		  "term appears in about %lli docs +/- %li"
		  "</td></tr>"
		  //"<tr><td colspan=2>"
		  //"this indexlist held by host #%li and twins"
		  //"</td></tr>"
		  "</table>"
		  "</form><br><br>" ,
		  ubs, uts, uds, ucs, add, del,
		  st->m_query , st->m_coll , st->m_termId  , 
		  st->m_numRecs  ,
		  st->m_docId , (long)st->m_score ,
		  st->m_termFreq ,
		  2 * (long)GB_INDEXDB_PAGE_SIZE / 6 * 
		  base->getNumFiles() );
		  //hostnum );

	if ( g_errno || (st->m_list.isEmpty() ) ) {//&&st->m_list2.isEmpty())){
		if (g_errno)pbuf->safePrintf("Error = %s",mstrerror(g_errno));
		else        pbuf->safePrintf("List is empty");
		pbuf->safePrintf("</center>");
		// erase g_errno for sending
		g_errno = 0;
		// now encapsulate it in html head/tail and send it off
		bool status = g_httpServer.sendDynamicPage(s , 
							   pbuf->getBufStart(),
							   pbuf->length() );
		// delete it
		mdelete ( st , sizeof(State10) , "PageIndexdb" );
		delete (st);
		return status;
	}

	pbuf->safePrintf ( 
		  "<table cellpadding=1 border=1>" 
		  "<tr><td>#</td><td>score</td>"
		  "<td>docId</td><td>domHash</td></tr>");

	//if ( searchingEvents

	// now print the score/docId of indexlist
	long i = 0;
	for (   st->m_list.resetListPtr () ;
	      ! st->m_list.isExhausted  () ;
		st->m_list.skipCurrentRecord () ) {
		// break if buf is low
		//if ( p + 1024 >= pend ) break;
		// but set the ip/port to a host that has this titleRec
		// stored locally!
		long long     docId   = st->m_list.getCurrentDocId () ;
		unsigned long groupId = getGroupIdFromDocId ( docId );
		// get the first host's hostId in this groupId
		Host *h = g_hostdb.getFastestHostInGroup ( groupId );
		// . pick the first host to handle the cached titleRec request
		// . we assume it has the best time and is up!! TODO: fix!
		// . use local ip though if it was an internal request
		// . otherwise, use the external ip
		//unsigned long  ip   = h->m_externalIp;
		unsigned long  ip   = h->m_ip;
		// use the NAT mapped port
		unsigned short port = h->m_externalHttpPort;
		// log the first docid so we can blaster url: queries
		// to PageIndexdb and see if they are in indexdb
		if ( i == 0 ) 
			logf(LOG_INFO,"indexdb: %llu %s",docId,st->m_query);
		// adjust ip/port if local
		if ( st->m_isLocal ) {
			ip   = h->m_ip;
			port = h->m_httpPort;
		}
		unsigned long date = 0;
		if ( st->m_useDatedb )
			date = (unsigned long)st->m_list.getCurrentDate();
		uint8_t dh = g_titledb.getDomHash8FromDocId ( docId );
		char ds[32];
		ds[0]=0;
		if ( st->m_useDatedb ) sprintf (ds,"%lu/",date);
		pbuf->safePrintf ( 
			  "<tr><td>%li.</td>"
			  "<td>%s%i</td>"
			  "<td>"
			  //"<a href=http://%s:%hu/master/titledb?d=%llu>"
			  "<a href=/master/titledb?c=%s&d=%llu>"
			  "%llu"
			  //"<td><a href=/cgi/4.cgi?d=%llu>%llu"
			  "</td>"
			  "<td>"
			  "0x%02lx"
			  "</td>"
			  "</tr>\n" ,
			  i++,
			  ds, (int)st->m_list.getCurrentScore() ,
			  //iptoa(ip) , port ,
			  st->m_coll,
			  docId , 
			  docId ,
			  (long)dh );
	}	
	pbuf->safePrintf ( "</table>" );

	/*
	if ( ! st->m_list2.isEmpty() ) 
		p += sprintf ( p ,
			       "<br>"
			       "<br>"
			       "<table cellpadding=1 border=1>" 
			       "<tr><td>#</td><td>termId</td>"
			       "<td>date</td><td>score</td>"
			       "<td>docId</td></tr>");

	// now print the score/docId of datedb list
	i = 0;
	for (   st->m_list2.resetListPtr () ;
	      ! st->m_list2.isExhausted  () ;
		st->m_list2.skipCurrentRecord () ) {
		// break if buf is low
		if ( p + 1024 >= pend ) break;
		// but set the ip/port to a host that has this titleRec
		// stored locally!
		long long     docId   = st->m_list2.getCurrentDocId () ;
		unsigned long groupId = g_titledb.getGroupId ( docId );
		// get the first host's hostId in this groupId
		Host *h = g_hostdb.getFastestHostInGroup ( groupId );
		// . pick the first host to handle the cached titleRec request
		// . we assume it has the best time and is up!! TODO: fix!
		// . use local ip though if it was an internal request
		// . otherwise, use the external ip
		//unsigned long  ip   = h->m_externalIp;
		unsigned long  ip   = h->m_ip;
		// use the NAT mapped port
		unsigned short port = h->m_externalHttpPort;
		// adjust ip/port if local
		if ( st->m_isLocal ) {
			ip   = h->m_ip;
			port = h->m_httpPort;
		}
		// debug
		char kb[16];
		st->m_list2.getCurrentKey(kb);
		//log(LOG_INFO,"debug: n1=%016llx n0=%016llx",
		//    *(long long *)(kb+8),*(long long *)(kb+0));
		//if ( (unsigned long)st->m_list2.getCurrentDate() == 0 )
		//	log("STOP");
		sprintf ( p , 
			  "<tr><td>%li.</td>"
			  "<td>%llu</td>"
			  "<td>%lu</td><td>%i</td>"
			  "<td>"
			  //"<a href=http://%s:%hu/master/titledb?d=%llu>"
			  "<a href=/master/titledb?c=%s&d=%llu>"
			  "%llu"
			  //"<td><a href=/cgi/4.cgi?d=%llu>%llu"
			  "</td></tr>\n" ,
			  i++,
			  st->m_list2.getTermId16(kb) ,
			  (unsigned long)st->m_list2.getCurrentDate() ,
			  (int)st->m_list2.getCurrentScore() ,
			  //iptoa(ip) , port ,
			  st->m_coll,
			  docId , 
			  docId );
		p += gbstrlen ( p );
	}	
	*/
	if ( ! st->m_list.isEmpty() ) 
		pbuf->safePrintf ( "</table>" );


	// print msg if we could fit all into buf
	//if ( p + 1024 >= pend ) {
	//	sprintf ( p ,"... truncated ... no mem" );
	//	p += gbstrlen ( p );		
	//}
	// print the final tail
	//p += g_httpServer.printTail ( p , pend - p );
	pbuf->safePrintf ( "</center>\n");
	// now encapsulate it in html head/tail and send it off
	bool status = g_httpServer.sendDynamicPage ( s , 
						     pbuf->getBufStart() ,
						     pbuf->length() );
	// delete the state
	mdelete ( st , sizeof(State10) , "PageIndexdb" );
	delete (st) ;
	return status;
}
// . get the tag rec for each url in a list of urls
// . used to be called getSiteRecs()
// . "urlPtrs"/"urlFlags" are parallel arrays of "numUrls" entries
// . if skipOldLinks is set, urls whose urlFlags[i] has LF_OLDLINK are
//   meant to be skipped
// . "callback" is stored together with "state"; NOTE(review): presumably
//   it is invoked when a blocked lookup completes -- confirm in
//   launchRequests()
// . returns false if blocked, true otherwise (including when there are no
//   urls or the result buffer could not be allocated)
bool Msge0::getTagRecs ( char        **urlPtrs           ,
			 linkflags_t  *urlFlags          , //Links::m_linkFlags
			 long          numUrls           ,
			// if skipOldLinks && urlFlags[i]&LF_OLDLINK, skip it
			 bool          skipOldLinks      ,
			 TagRec       *baseTagRec        ,
			 collnum_t     collnum,
			 long          niceness          ,
			 void         *state             ,
			 void        (*callback)(void *state) ) {
	reset();
	// bail if no urls or linkee
	if ( numUrls <= 0 ) return true;

	// save all input parms
	m_urlPtrs          = urlPtrs;
	m_urlFlags         = urlFlags;
	m_numUrls          = numUrls;
	m_skipOldLinks     = skipOldLinks;
	m_baseTagRec       = baseTagRec;
	m_collnum          = collnum;
	m_niceness         = niceness;
	m_state            = state;
	m_callback         = callback;

	// . how much mem to alloc? three parallel arrays, one entry per url
	// . sized with sizeof rather than a hard-coded 4 bytes each: where
	//   long or pointers are wider than 4 bytes the old "4+4+4" sizing
	//   made the pointer arrays run past the end of the buffer
	const long errBytes  = numUrls * (long)sizeof(long);     // error
	const long tagBytes  = numUrls * (long)sizeof(TagRec *); // tag ptr
	const long slabBytes = numUrls * (long)sizeof(char *);   // slab ptr
	const long need      = errBytes + tagBytes + slabBytes;
	// allocate the buffer to hold all the info we gather
	m_buf = (char *)mcalloc ( need , "Msge0buf" );
	if ( ! m_buf ) return true;
	m_bufSize = need;
	// clear it all
	memset ( m_buf , 0 , m_bufSize );
	// set the ptrs! each array starts where the previous one ends
	char *p = m_buf;
	m_tagRecErrors      = (long    *)p ; p += errBytes;
	m_tagRecPtrs        = (TagRec **)p ; p += tagBytes;
	m_slab              = (char   **)p ;
	// initialize
	m_numRequests = 0;
	m_numReplies  = 0;
	// . point to first url to process
	// . url # m_n
	m_n = 0;
	// clear the m_used flags
	memset ( m_used , 0 , MAX_OUTSTANDING_MSGE0 );

	// . launch the requests
	// . when a reply returns, the next request is launched for that url
	// . returns false if any of them blocked
	if ( ! launchRequests ( 0 ) ) return false;

	// none blocked, we are done
	return true;
}