Code example #1
bool Titledb::isLocal ( int64_t docId ) {
	// shift it up (64 minus 38) bits so we can mask it
	//key96_t key = makeTitleRecKey ( docId , false /*isDelKey?*/ );
	// mask upper bits of the top 4 bytes
	//return ( getGroupIdFromDocId ( docId ) == g_hostdb.m_groupId ) ;
	return ( getShardNumFromDocId(docId) == getMyShardNum() );
}
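For orientation, here is a minimal sketch of the routing idea behind isLocal(): a docId maps deterministically to a shard number, so any host can decide locality without network traffic. The modulo mapping below is an illustrative assumption, not the actual getShardNumFromDocId() implementation.

#include <stdint.h>

// Illustrative only: assumes shard = docId mod numShards; the real
// getShardNumFromDocId() may mask or hash the docId differently.
static uint32_t sketchShardNumFromDocId ( int64_t docId , uint32_t numShards ) {
	if ( numShards == 0 ) return 0; // guard for the sketch
	// cast to unsigned first so a negative docId cannot yield a
	// negative remainder
	return (uint32_t)( (uint64_t)docId % numShards );
}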
Code example #2
Host *getHostToHandleInjection ( char *url ) {
	Url norm;
	norm.set ( url );
	int64_t docId = g_titledb.getProbableDocId ( &norm );
	// get the shard number from docId
	uint32_t shardNum = getShardNumFromDocId ( docId );
	// from Msg22.cpp
	Host *group = g_hostdb.getShard ( shardNum );
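	// deterministically pick one twin within the shard from the docId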
	int32_t hostNum = docId % g_hostdb.m_numHostsPerShard;
	Host *host = &group[hostNum];

	bool isWarcInjection = false;
	int32_t ulen = gbstrlen(url);
	if ( ulen > 10 && strcmp(url+ulen-8,".warc.gz") == 0 )
		isWarcInjection = true;
	if ( ulen > 10 && strcmp(url+ulen-5,".warc") == 0 )
		isWarcInjection = true;

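	// non-warc injections stick with the docid-derived host chosen above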
	if ( ! isWarcInjection ) return host;

	// warc files end up calling XmlDoc::indexWarcOrArc() which spawns
	// a msg7 injection request for each doc in the warc/arc file
	// so let's do load balancing differently for them so one host
	// doesn't end up doing a bunch of wget/gunzips on warc files 
	// thereby bottlenecking the cluster. get the first hostid that
	// has no msg7 injection request from us still outstanding
	for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) {
		Host *h = g_hostdb.getHost(i);
		h->m_tmpCount = 0;
	}
	for ( UdpSlot *slot = g_udpServer.m_head2 ; 
	      slot ; 
	      slot = slot->m_next2 ) {
		// skip if not injection request
		if ( slot->m_msgType != 0x07 ) continue;
		//if ( ! slot->m_weInitiated ) continue;
		// if we did not initiate the injection request, i.e. if
		// it is to us, skip it
		if ( ! slot->m_callback ) continue;
		// who is it from?
		int32_t hostId = slot->m_hostId;
		if ( hostId < 0 ) continue;
		Host *h = g_hostdb.getHost ( hostId );
		if ( ! h ) continue;
		h->m_tmpCount++;
	}
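	// prefer a host with no outstanding injections; otherwise take the
	// host with the fewest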
	int32_t min = 999999;
	Host *minh = NULL;
	for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) {
		Host *h = g_hostdb.getHost(i);
		if ( h->m_tmpCount == 0 ) return h;
		if ( h->m_tmpCount >= min ) continue;
		min  = h->m_tmpCount;
		minh = h;
	}
	if ( minh ) return minh;
	// how can this happen?
	return host;
}
Code example #3
// . make a web page from results stored in msg40
// . send it on TcpSocket "s" when done
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool gotTitleRec ( void *state ) {
	// cast the State4 out
	State4 *st = (State4 *) state;
	// get the socket
	TcpSocket *s = st->m_socket;

	SafeBuf sb;
	// get its docId
	long long docId = st->m_docId;
	// make the query string for passing to different hosts
	char  qs[64];
	sprintf(qs,"&d=%lli",docId);
	if ( docId==0LL ) qs[0] = 0;
	// print standard header
	sb.reserve2x ( 32768 );
	g_pages.printAdminTop (&sb, st->m_socket, &st->m_r );
	//PAGE_TITLEDB,
	//		       st->m_username,//NULL ,
	//		       st->m_coll , st->m_pwd , s->m_ip , qs );
	// shortcut
	XmlDoc *xd = &st->m_xd;

	// . deal with errors
	// . print none if there is no title rec at or after the provided docId
	if ( g_errno || docId == 0LL || xd->m_titleRecBuf.length() <= 0 ) {
		// print docId in box
		sb.safePrintf (  "<center>\nEnter docId: "
				 "<input type=text name=d value=%lli size=15>",
				 docId);
		sb.safePrintf ( "</form><br>\n" );
		if ( docId == 0 ) 
			sb.safePrintf("<br>");
		else if ( g_errno ) 
			sb.safePrintf("<br><br>Error = %s",mstrerror(g_errno));
		else 
			sb.safePrintf("<br><br>No titleRec for that docId "
				      "or higher");
		// print where it should be
		//unsigned long gid = getGroupIdFromDocId ( docId );
		//Host *hosts = g_hostdb.getGroup(gid);
		long shardNum = getShardNumFromDocId ( docId );
		Host *hosts = g_hostdb.getShard ( shardNum );
		long hostId = -1;
		if ( hosts ) hostId = hosts[0].m_hostId;
		sb.safePrintf("<br><br>docId on host #%li and twins.",hostId);
		sb.safePrintf ( "\n</center>" );
		mdelete ( st , sizeof(State4) , "PageTitledb");
		delete (st);
		// erase g_errno for sending
		g_errno = 0;
		// now encapsulate it in html head/tail and send it off
		return g_httpServer.sendDynamicPage ( s , 
						      sb.getBufStart(),
						      sb.length() );
	}
	// print docId in box
	sb.safePrintf ("<br>\n"
		       "<center>Enter docId: "
		       "<input type=text name=d value=%lli size=15>", docId );
	// print where it should be
	//unsigned long gid = getGroupIdFromDocId ( docId );
	//Host *hosts = g_hostdb.getGroup(gid);
	long shardNum = getShardNumFromDocId ( docId );
	Host *hosts = g_hostdb.getShard ( shardNum );
	long hostId = -1;
	if ( hosts ) hostId = hosts[0].m_hostId;
	sb.safePrintf("<br><br>docId on host #%li and twins.",hostId);
	sb.safePrintf ( "</form><br>\n" );

	//char *coll    = st->m_coll;

	Title *ti = xd->getTitle();
	if ( ! ti ) {
		log ( "admin: Could not set title" );
		return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
	}
	// sanity check. should not block
	if ( ! xd->m_titleValid ) { char *xx=NULL;*xx=0; }

	// print it out
	xd->printDoc ( &sb );

	// don't forget to cleanup
	mdelete ( st , sizeof(State4) , "PageTitledb");
	delete (st);
	// now encapsulate it in html head/tail and send it off
	return g_httpServer.sendDynamicPage (s, sb.getBufStart(), sb.length());
}
Code example #4
// . if url is NULL use the docId to get the titleRec
// . if titleRec is NULL use our own internal m_myTitleRec
// . sets g_errno to ENOTFOUND if TitleRec does not exist for this url/docId
// . if g_errno is ENOTFOUND m_docId will be set to the best available docId
//   for this url to use if we're adding it to Titledb
// . if g_errno is ENOTFOUND and m_docId is 0 then no docIds were available
// . "url" must be NULL terminated
bool Msg22::getTitleRec ( Msg22Request  *r              ,
			  char          *url            ,
			  long long      docId          ,
			  char          *coll           ,
			  char         **titleRecPtrPtr ,
			  long          *titleRecSizePtr,
			  bool           justCheckTfndb ,
			  // when indexing spider replies we just want
			  // a unique docid... "docId" should be the desired
			  // one, but we might have to change it.
			  bool           getAvailDocIdOnly  ,
			  void          *state          ,
			  void         (* callback) (void *state) ,
			  long           niceness       ,
			  bool           addToCache     ,
			  long           maxCacheAge    ,
			  long           timeout        ,
			  bool           doLoadBalancing ) {

	// sanity
	if ( getAvailDocIdOnly && justCheckTfndb ) { char *xx=NULL;*xx=0; }
	if ( getAvailDocIdOnly && url            ) { char *xx=NULL;*xx=0; }

	//if ( m_url ) log(LOG_DEBUG,"build: getting TitleRec for %s",m_url);
	// sanity checks
	if ( url    && docId!=0LL ) { char *xx=NULL;*xx=0; }
	if ( url    && !url[0]    ) { char *xx=NULL;*xx=0; }
	if ( docId!=0LL && url    ) { char *xx=NULL;*xx=0; }
	if ( ! coll               ) { char *xx=NULL;*xx=0; }
	if ( ! callback           ) { char *xx=NULL;*xx=0; }
	if ( r->m_inUse           ) { char *xx=NULL;*xx=0; }
	if ( m_outstanding        ) { char *xx = NULL;*xx=0; }
	// sanity check
	if ( ! justCheckTfndb && ! getAvailDocIdOnly ) {
		if ( ! titleRecPtrPtr  ) { char *xx=NULL;*xx=0; }
		if ( ! titleRecSizePtr ) { char *xx=NULL;*xx=0; }
	}

	// remember, the caller wants us to set this
	m_titleRecPtrPtr  = titleRecPtrPtr;
	m_titleRecSizePtr = titleRecSizePtr;
	// assume not found. this can be NULL if justCheckTfndb is true,
	// like when it is called from XmlDoc::getIsNew()
	if ( titleRecPtrPtr  ) *titleRecPtrPtr  = NULL;
	if ( titleRecSizePtr ) *titleRecSizePtr = 0;

	// save callback
	m_state           = state;
	m_callback        = callback;

	// save it
	m_r = r;
	// set request
	r->m_docId           = docId;
	r->m_niceness        = niceness;
	r->m_justCheckTfndb  = (bool)justCheckTfndb;
	r->m_getAvailDocIdOnly   = (bool)getAvailDocIdOnly;
	r->m_doLoadBalancing = (bool)doLoadBalancing;
	r->m_collnum         = g_collectiondb.getCollnum ( coll );
	r->m_addToCache      = false;
	r->m_maxCacheAge     = 0;
	// url must start with http(s)://. must be normalized.
	if ( url && url[0] != 'h' ) {
		log("msg22: BAD URL! does not start with 'h'");
		m_errno = g_errno = EBADENGINEER;
		return true;
	}
	// store url
	if ( url ) strcpy(r->m_url,url);
	else r->m_url[0] = '\0';

	// if no docid provided, use probable docid
	if ( ! docId ) 
		docId = g_titledb.getProbableDocId ( url );

	// get the shard number from docId
	uint32_t shardNum = getShardNumFromDocId ( docId );
	// generate cacheKey, just use docid now
	key_t cacheKey ; cacheKey.n1 = 0; cacheKey.n0 = docId;
	// do load balancing iff we're the spider because if we send this
	// request to a merging host, and prefer local reads is true, the
	// resulting disk read will be starved somewhat. otherwise, we save
	// time by not having to cast a Msg36
	bool balance = false;

	/*
	// if clusterdb, do bias
	long firstHostId = -1;
	// i don't see why not to always bias it, this makes tfndb page cache
	// twice as effective for all lookups
	long numTwins = g_hostdb.getNumHostsPerShard();
	//long long bias=((0x0000003fffffffffLL)/(long long)numTwins);
	long long sectionWidth = (DOCID_MASK/(long long)numTwins) + 1;
	long hostNum = (docId & DOCID_MASK) / sectionWidth;
	long numHosts = g_hostdb.getNumHostsPerShard();
	Host *hosts = g_hostdb.getGroup ( groupId );
	if ( hostNum >= numHosts ) { char *xx = NULL; *xx = 0; }
	firstHostId = hosts [ hostNum ].m_hostId ;
	*/

	// get our group
	long  allNumHosts = g_hostdb.getNumHostsPerShard();
	Host *allHosts    = g_hostdb.getShard ( shardNum );//Group ( groupId );

	// put all alive hosts in this array
	Host *cand[32];
	long long  nc = 0;
	for ( long i = 0 ; i < allNumHosts ; i++ ) {
		// get that host
		Host *hh = &allHosts[i];
		// skip if dead
		if ( g_hostdb.isDead(hh) ) continue;
		// add it if alive
		cand[nc++] = hh;
	}
	// if none alive, make them all candidates then
	bool allDead = (nc == 0);
	for ( long i = 0 ; allDead && i < allNumHosts ; i++ ) 
		cand[nc++] = &allHosts[i];

	// route based on docid region, not parity, because we want to hit
	// the urldb page cache as much as possible
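	// e.g. with nc = 2 live twins: sectionWidth = 64M+1, so docIds whose
	// low 27 bits fall in [0,64M] go to cand[0] and the rest to cand[1]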
	long long sectionWidth =((128LL*1024*1024)/nc)+1;//(DOCID_MASK/nc)+1LL;
	// we mod by 128M since tied scores resort to sorting by docid
	// so we don't want to overload the host responsible for the lowest
	// range of docids. CAUTION: do this for msg22 too!
	// in this way we should still ensure a pretty good biased urldb
	// cache... 
	// . TODO: fix the urldb cache preload logic
	long hostNum = (docId % (128LL*1024*1024)) / sectionWidth;
	if ( hostNum < 0 ) hostNum = 0; // watch out for negative docids
	if ( hostNum >= nc ) { char *xx = NULL; *xx = 0; }
	long firstHostId = cand [ hostNum ]->m_hostId ;

	// while this prevents tfndb seeks, it also causes bottlenecks
	// if one host is particularly slow, because load balancing is
	// bypassed.
	//if ( ! g_conf.m_useBiasedTfndb ) firstHostId = -1;
	// flag it
	m_outstanding = true;
	r->m_inUse    = 1;

	// . send this request to the least-loaded host that can handle it
	// . returns false and sets g_errno on error
	// . use a pre-allocated buffer to hold the reply
	// . TMPBUFSIZE is how much a UdpSlot can hold w/o allocating
        if ( ! m_mcast.send ( (char *)r       , 
			      r->getSize()    ,
			      0x22            , // msgType 0x22
			      false           , // m_mcast own m_request?
			      shardNum        , // send to group (groupKey)
			      false           , // send to whole group?
			      //hostKey         , // key is lower bits of docId
			      0               , // key is lower bits of docId
			      this            , // state data
			      NULL            , // state data
			      gotReplyWrapper22 ,
			      timeout         , // 60 second time out
			      r->m_niceness   , // nice, reply size can be huge
			      false           , // realtime?
			      firstHostId     , // first hostid
			      NULL            , // replyBuf
			      0               , // replyBufMaxSize
			      false           , // free reply buf?
			      balance         , // do disk load balancing?
			      maxCacheAge     , // maxCacheAge
			      cacheKey        , // cacheKey
			      RDB_TITLEDB     , // rdbId of titledb
			      32*1024       ) ){// minRecSizes avg
		log("db: Requesting title record had error: %s.",
		    mstrerror(g_errno) );
		// set m_errno
		m_errno = g_errno;
		// no, multicast will free since he owns it!
		//if (replyBuf) mfree ( replyBuf , replyBufMaxSize , "Msg22" );
		return true;	
	}
	// otherwise, we blocked and gotReplyWrapper will be called
	return false;
}
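A hedged call-site sketch for the signature above; the wrapper function, its callback, and the argument values are assumptions for illustration, not code from the Gigablast source.

// Hypothetical caller (assumed names; only the parameter order and types
// come from the getTitleRec() declaration above):
static void gotTitleRecWrapper ( void *state ) {
	// invoked when the lookup completes; the titleRec pointer/size
	// passed to getTitleRec() are now filled in, or g_errno is set
	// (ENOTFOUND if no titleRec exists for this docId)
}

bool lookupTitleRecByDocId ( Msg22 *msg22 , Msg22Request *r ,
			     long long docId , char *coll ,
			     char **recPtr , long *recSize ,
			     void *state , long timeout ) {
	// returns true if it completed (or failed with g_errno set)
	// immediately, false if it blocked and the wrapper will be called
	return msg22->getTitleRec ( r , NULL /*url*/ , docId , coll ,
				    recPtr , recSize ,
				    false /*justCheckTfndb*/ ,
				    false /*getAvailDocIdOnly*/ ,
				    state , gotTitleRecWrapper ,
				    0     /*niceness*/ ,
				    false /*addToCache*/ ,
				    0     /*maxCacheAge*/ ,
				    timeout ,
				    false /*doLoadBalancing*/ );
}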
Code example #5
// returns true and sets g_errno on error, otherwise, blocks and returns false
bool Msg20::getSummary ( Msg20Request *req ) {
	// reset ourselves in case recycled
	reset();

	// consider it "launched"
	m_launched = true;

	// save it
	m_requestDocId = req->m_docId;
	m_state        = req->m_state;
	m_callback     = req->m_callback;
	m_callback2    = NULL;

	// does this ever happen?
	if ( g_hostdb.getNumHosts() <= 0 ) {
		log("build: hosts2.conf is not in working directory, or "
		    "contains no valid hosts.");
		g_errno = EBADENGINEER;
		return true;
	}

	if ( req->m_docId < 0 && ! req->ptr_ubuf ) {
		log("msg20: docid<0 and no url for msg20::getsummary");
		g_errno = EBADREQUEST;
		return true;
	}

	// get the shard number from docId, if positive
	uint32_t shardNum;
	if ( req->m_docId >= 0 ) 
		shardNum = g_hostdb.getShardNumFromDocId(req->m_docId);
	else {
		int64_t pdocId = Titledb::getProbableDocId(req->ptr_ubuf);
		shardNum = getShardNumFromDocId(pdocId);
	}

	// we might be getting inlinks for a spider request
	// so make sure the timeout is infinite for that...
	const int32_t timeout = (req->m_niceness==0)
	                      ? multicast_msg20_summary_timeout
	                      : multicast_infinite_send_timeout;

	// get our group
	int32_t  allNumHosts = g_hostdb.getNumHostsPerShard();
	Host *allHosts    = g_hostdb.getShard ( shardNum );

	// put all alive hosts in this array
	Host *cand[32];
	int64_t  nc = 0;
	for ( int32_t i = 0 ; i < allNumHosts ; i++ ) {
		// get that host
		Host *hh = &allHosts[i];
		// skip if dead
		if ( g_hostdb.isDead(hh) ) continue;

		// Respect no-spider, no-query directives from hosts.conf 
		if ( !req->m_getLinkInfo && ! hh->m_queryEnabled ) continue;
		if ( req->m_getLinkInfo && ! hh->m_spiderEnabled ) continue;
		// add it if alive
		cand[nc++] = hh;
	}
	// if none alive, make them all candidates then
	bool allDead = (nc == 0);
	for ( int32_t i = 0 ; allDead && i < allNumHosts ; i++ ) {
		// NEVER add a noquery host to the candidate list, even
		// if the query host is dead
		if ( ! allHosts[i].m_queryEnabled ) continue;
		cand[nc++] = &allHosts[i];
	}

	if ( nc == 0 ) {
		log("msg20: error sending mcast: no queryable hosts "
		    "availble to handle summary generation");
		g_errno = EBADENGINEER;
		m_gotReply = true;
		return true;
	}

	// route based on docid region, not parity, because we want to hit
	// the urldb page cache as much as possible
	int64_t sectionWidth =((128LL*1024*1024)/nc)+1;
	int64_t probDocId    = req->m_docId;
	// i think reference pages just pass in a url to get the summary
	if ( probDocId < 0 && req->size_ubuf ) 
		probDocId = Titledb::getProbableDocId ( req->ptr_ubuf );
	if ( probDocId < 0        ) {
		log("query: Got bad docid/url combo.");
		probDocId = 0;
	}
	// we mod by 128M since tied scores resort to sorting by docid
	// so we don't want to overload the host responsible for the lowest
	// range of docids. CAUTION: do this for msg22 too!
	// in this way we should still ensure a pretty good biased urldb
	// cache... 
	// . TODO: fix the urldb cache preload logic
	int32_t hostNum = (probDocId % (128LL*1024*1024)) / sectionWidth;
	if ( hostNum < 0 ) hostNum = 0; // watch out for negative docids
	if ( hostNum >= nc ) { g_process.shutdownAbort(true); }
	int32_t firstHostId = cand [ hostNum ]->m_hostId ;

	m_requestSize = 0;
	m_request = req->serialize ( &m_requestSize );
	// . it sets g_errno on error and returns NULL
	// . we MUST call gotReply() here to set m_gotReply
	//   otherwise Msg40.cpp can end up looping forever
	//   calling Msg40::launchMsg20s()
	if ( ! m_request ) { gotReply(NULL); return true; }

	// . otherwise, multicast to a host in group "groupId"
	// . returns false and sets g_errno on error
	// . use a pre-allocated buffer to hold the reply
	// . TMPBUFSIZE is how much a UdpSlot can hold w/o allocating
	if (!m_mcast.send(m_request, m_requestSize, msg_type_20, false, shardNum, false, probDocId, this, NULL, gotReplyWrapper20, timeout, req->m_niceness, firstHostId, false)) {
		// sendto() sometimes returns "Network is down" so i guess
		// we just had an "error reply".
		log("msg20: error sending mcast %s",mstrerror(g_errno));
		m_gotReply = true;
		return true;
	}

	// we are officially "in progress"
	m_inProgress = true;

	// we blocked
	return false;
}
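Similarly, a minimal call-site sketch under assumptions: the Msg20Request fields set below appear in getSummary() above, but the callback signature and the surrounding setup are illustrative guesses, not taken from the source.

// Hypothetical caller (assumed callback type and setup):
static void gotSummaryWrapper ( void *state ) {
	// the summary reply is ready here, or g_errno is set
}

bool requestSummary ( Msg20 *msg20 , int64_t docId , void *state ) {
	Msg20Request req;
	req.m_docId    = docId;             // route by docId (non-negative)
	req.m_niceness = 0;                 // query-time priority
	req.m_state    = state;
	req.m_callback = gotSummaryWrapper;
	// true  = finished (or failed with g_errno set) right away;
	// false = blocked, gotSummaryWrapper() fires when the reply arrives
	return msg20->getSummary ( &req );
}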
Code example #6
// returns true and sets g_errno on error, otherwise, blocks and returns false
bool Msg20::getSummary ( Msg20Request *req ) {

	// reset ourselves in case recycled
	reset();

	// consider it "launched"
	m_launched = true;

	// save it
	m_requestDocId = req->m_docId;
	m_state        = req->m_state;
	m_callback     = req->m_callback;
	m_callback2    = req->m_callback2;
	m_expected     = req->m_expected;
	m_eventId      = req->m_eventId;

	// clear this
	//m_eventIdBits.clear();
	// set this
	//if ( req->m_eventId ) m_eventIdBits.addEventId(req->m_eventId);

	Hostdb *hostdb = req->m_hostdb;
	// ensure hostdb has a host in it
	if ( ! hostdb ) hostdb = &g_hostdb;
	// does this ever happen?
	if ( hostdb->getNumHosts() <= 0 ) {
		log("build: hosts2.conf is not in working directory, or "
		    "contains no valid hosts.");
		g_errno = EBADENGINEER;
		return true;
	}

	// do not re-route to twins if accessing an external network
	if ( hostdb != &g_hostdb ) req->m_expected = false;

	// get the shard number from docId, if positive
	unsigned long shardNum;
	if ( req->m_docId >= 0 ) 
		shardNum = hostdb->getShardNumFromDocId(req->m_docId);
	else {
		long long pdocId = g_titledb.getProbableDocId(req->ptr_ubuf);
		shardNum = getShardNumFromDocId(pdocId);
	}

	// we might be getting inlinks for a spider request
	// so make sure the timeout is infinite for that...
	long timeout = 9999999; // 10 million seconds, basically inf.
	if ( req->m_niceness == 0 ) timeout = 20;

	// get our group
	long  allNumHosts = hostdb->getNumHostsPerShard();
	Host *allHosts    = hostdb->getShard ( shardNum );//getGroup(groupId );

	// put all alive hosts in this array
	Host *cand[32];
	long long  nc = 0;
	for ( long i = 0 ; i < allNumHosts ; i++ ) {
		// get that host
		Host *hh = &allHosts[i];
		// skip if dead
		if ( g_hostdb.isDead(hh) ) continue;
		// add it if alive
		cand[nc++] = hh;
	}
	// if none alive, make them all candidates then
	bool allDead = (nc == 0);
	for ( long i = 0 ; allDead && i < allNumHosts ; i++ ) 
		cand[nc++] = &allHosts[i];

	// route based on docid region, not parity, because we want to hit
	// the urldb page cache as much as possible
	long long sectionWidth =((128LL*1024*1024)/nc)+1;//(DOCID_MASK/nc)+1LL;
	long long probDocId    = req->m_docId;
	// i think reference pages just pass in a url to get the summary
	if ( probDocId < 0 && req->size_ubuf ) 
		probDocId = g_titledb.getProbableDocId ( req->ptr_ubuf );
	if ( probDocId < 0        ) {
		log("query: Got bad docid/url combo.");
		probDocId = 0;
	}
	// we mod by 128M since tied scores resort to sorting by docid
	// so we don't want to overload the host responsible for the lowest
	// range of docids. CAUTION: do this for msg22 too!
	// in this way we should still ensure a pretty good biased urldb
	// cache... 
	// . TODO: fix the urldb cache preload logic
	long hostNum = (probDocId % (128LL*1024*1024)) / sectionWidth;
	if ( hostNum < 0 ) hostNum = 0; // watch out for negative docids
	if ( hostNum >= nc ) { char *xx = NULL; *xx = 0; }
	long firstHostId = cand [ hostNum ]->m_hostId ;

	// . make buffer m_request to hold the request
	// . tries to use m_requestBuf[] if it is big enough to hold it
	// . allocs a new buf if MAX_MSG20_REQUEST_SIZE is too small
	// . serializes the request into m_request
	// . sets m_requestSize to the size of the serialized request
	m_requestSize = 0;
	m_request = req->serialize ( &m_requestSize, m_requestBuf ,
				     MAX_MSG20_REQUEST_SIZE );
	// . it sets g_errno on error and returns NULL
	// . we MUST call gotReply() here to set m_gotReply
	//   otherwise Msg40.cpp can end up looping forever
	//   calling Msg40::launchMsg20s()
	if ( ! m_request ) { gotReply(NULL); return true; }

        // . otherwise, multicast to a host in group "groupId"
	// . returns false and sets g_errno on error
	// . use a pre-allocated buffer to hold the reply
	// . TMPBUFSIZE is how much a UdpSlot can hold w/o allocating
        if ( ! m_mcast.send ( m_request         ,
			      m_requestSize     , 
			      0x20              , // msgType 0x20
			      false             , // m_mcast own m_request?
			      shardNum          , // send to group (groupKey)
			      false             , // send to whole group?
			      probDocId         , // key is lower bits of docId
			      this              , // state data
			      NULL              , // state data
			      gotReplyWrapper20 ,
			      timeout           , // 60 second time out
			      req->m_niceness   ,
			      false             , // real time?
			      firstHostId       , // first hostid
			      NULL,//m_replyBuf        ,
			      0,//MSG20_MAX_REPLY_SIZE,//m_replyMaxSize
			      false             , // free reply buf?
			      false             , // do disk load balancing?
			      -1                , // max cache age
			      0                 , // cacheKey
			      0                 , // bogus rdbId
			      -1                , // minRecSizes(unknownRDsize)
			      true              , // sendToSelf
			      true              , // retry forever
			      hostdb            )) {
		// sendto() sometimes returns "Network is down" so i guess
		// we just had an "error reply".
		log("msg20: error sending mcast %s",mstrerror(g_errno));
		m_gotReply = true;
		return true;
	}

	// we are officially "in progress"
	m_inProgress = true;

	// we blocked
	return false;
}
Code example #7
// . if url is NULL use the docId to get the titleRec
// . if titleRec is NULL use our own internal m_myTitleRec
// . sets g_errno to ENOTFOUND if TitleRec does not exist for this url/docId
// . if g_errno is ENOTFOUND m_docId will be set to the best available docId
//   for this url to use if we're adding it to Titledb
// . if g_errno is ENOTFOUND and m_docId is 0 then no docIds were available
// . "url" must be NULL terminated
bool Msg22::getTitleRec ( Msg22Request  *r              ,
			  char          *url            ,
			  int64_t      docId          ,
			  char          *coll           ,
			  char         **titleRecPtrPtr ,
			  int32_t          *titleRecSizePtr,
			  bool           justCheckTfndb ,
			  // when indexing spider replies we just want
			  // a unique docid... "docId" should be the desired
			  // one, but we might have to change it.
			  bool           getAvailDocIdOnly  ,
			  void          *state          ,
			  void         (* callback) (void *state) ,
			  int32_t           niceness       ,
			  bool           addToCache     ,
			  int32_t           maxCacheAge    ,
			  int32_t           timeout ) {

	m_availDocId = 0;

	// sanity
	if ( getAvailDocIdOnly && justCheckTfndb ) { char *xx=NULL;*xx=0; }
	if ( getAvailDocIdOnly && url            ) { char *xx=NULL;*xx=0; }

	//if ( url ) log(LOG_DEBUG,"build: getting TitleRec for %s",url);
	// sanity checks
	if ( url    && docId!=0LL ) { char *xx=NULL;*xx=0; }
	if ( url    && !url[0]    ) { char *xx=NULL;*xx=0; }
	if ( docId!=0LL && url    ) { char *xx=NULL;*xx=0; }
	if ( ! coll               ) { char *xx=NULL;*xx=0; }
	if ( ! callback           ) { char *xx=NULL;*xx=0; }
	if ( r->m_inUse           ) { char *xx=NULL;*xx=0; }
	if ( m_outstanding        ) { char *xx = NULL;*xx=0; }
	// sanity check
	if ( ! justCheckTfndb && ! getAvailDocIdOnly ) {
		if ( ! titleRecPtrPtr  ) { char *xx=NULL;*xx=0; }
		if ( ! titleRecSizePtr ) { char *xx=NULL;*xx=0; }
	}

	// remember, the caller wants us to set this
	m_titleRecPtrPtr  = titleRecPtrPtr;
	m_titleRecSizePtr = titleRecSizePtr;
	// assume not found. this can be NULL if justCheckTfndb is true,
	// like when it is called from XmlDoc::getIsNew()
	if ( titleRecPtrPtr  ) *titleRecPtrPtr  = NULL;
	if ( titleRecSizePtr ) *titleRecSizePtr = 0;

	// save callback
	m_state           = state;
	m_callback        = callback;

	// save it
	m_r = r;
	// set request
	r->m_docId           = docId;
	r->m_niceness        = niceness;
	r->m_justCheckTfndb  = (bool)justCheckTfndb;
	r->m_getAvailDocIdOnly   = (bool)getAvailDocIdOnly;
	r->m_collnum         = g_collectiondb.getCollnum ( coll );
	r->m_addToCache      = false;
	r->m_maxCacheAge     = 0;
	// url must start with http(s)://. must be normalized.
	if ( url && url[0] != 'h' ) {
		log("msg22: BAD URL! does not start with 'h'");
		m_errno = g_errno = EBADENGINEER;
		return true;
	}
	// store url
	if ( url ) strcpy(r->m_url,url);
	else r->m_url[0] = '\0';

	// if no docid provided, use probable docid
	if ( ! docId ) 
		docId = g_titledb.getProbableDocId ( url );

	// get the shard number from docId
	uint32_t shardNum = getShardNumFromDocId ( docId );
	// generate cacheKey, just use docid now
	key_t cacheKey ; cacheKey.n1 = 0; cacheKey.n0 = docId;
	// do load balancing iff we're the spider because if we send this
	// request to a merging host, and prefer local reads is true, the
	// resulting disk read will be starved somewhat. otherwise, we save
	// time by not having to cast a Msg36
	bool balance = false;

	Host *firstHost ;
	// if niceness 0 can't pick noquery host.
	// if niceness 1 can't pick nospider host.
	firstHost = g_hostdb.getLeastLoadedInShard ( shardNum, r->m_niceness );
	int32_t firstHostId = firstHost->m_hostId;

	m_outstanding = true;
	r->m_inUse    = 1;

	// . send this request to the least-loaded host that can handle it
	// . returns false and sets g_errno on error
	// . use a pre-allocated buffer to hold the reply
	// . TMPBUFSIZE is how much a UdpSlot can hold w/o allocating
        if ( ! m_mcast.send ( (char *)r       , 
			      r->getSize()    ,
			      0x22            , // msgType 0x22
			      false           , // m_mcast own m_request?
			      shardNum        , // send to group (groupKey)
			      false           , // send to whole group?
			      //hostKey         , // key is lower bits of docId
			      0               , // key is lower bits of docId
			      this            , // state data
			      NULL            , // state data
			      gotReplyWrapper22 ,
			      timeout*1000    , // timeout
			      r->m_niceness   , // nice, reply size can be huge
			      firstHostId     , // first hostid
			      NULL            , // replyBuf
			      0               , // replyBufMaxSize
			      false           , // free reply buf?
			      balance         , // do disk load balancing?
			      maxCacheAge     , // maxCacheAge
			      cacheKey        , // cacheKey
			      RDB_TITLEDB     , // rdbId of titledb
			      32*1024       ) ){// minRecSizes avg
		log("db: Requesting title record had error: %s.",
		    mstrerror(g_errno) );
		// set m_errno
		m_errno = g_errno;
		// no, multicast will free since he owns it!
		//if (replyBuf) mfree ( replyBuf , replyBufMaxSize , "Msg22" );
		return true;	
	}
	// otherwise, we blocked and gotReplyWrapper will be called
	return false;
}
Code example #8
// . make a web page from results stored in msg40
// . send it on TcpSocket "s" when done
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool gotIndexList ( void *state ) {
	// the state
	State10 *st = (State10 *) state;
	// launch more
	if ( ! launchRequests ( st ) ) return false;
	/*
	// get the date list
	//fprintf(stderr,"termId now=%lli\n",st->m_termId);
	//fprintf(stderr,"should be=%lli\n",(st->m_termId & TERMID_MASK));
	// . now get the indexList for this termId
	// . date is complemented, so start with bigger one first
	key128_t startKey = g_datedb.makeStartKey ( st->m_termId ,0xffffffff);
	key128_t endKey   = g_datedb.makeEndKey   ( st->m_termId ,0x0);
	// get the rdb ptr to titledb's rdb
	//Rdb *rdb = g_indexdb.getRdb();
	// -1 means read from all files in Indexdb
	long numFiles = -1;
	// make it zero if caller doesn't want to hit the disk
	if ( ! st->m_useDisk ) numFiles = 0;
	// get the title rec at or after this docId
	if ( ! st->m_msg0.getList ( -1 ,
				    0  ,
				    0  ,
				    0  ,    // max cache age
				    false , // add to cache?
				    RDB_DATEDB  , // rdbId of 2 = indexdb
				    st->m_coll ,
				    &st->m_list2  ,
				    (char *)&startKey  ,
				    (char *)&endKey    ,
				    st->m_numRecs * sizeof(key128_t),//recSizes
				    //st->m_useTree   , // include tree?
				    //st->m_useCache  , // include cache?
				    //false     , // add to cache?
				    //0         , // startFileNum
				    //numFiles  , // numFiles
				    st        , // state
				    gotIndexListWrapper2 ,
				    0  ) )  // niceness
		return false;
	// otherwise call gotResults which returns false if blocked, true else
	// and sets g_errno on error
	return gotIndexList2 ( (void *) st , NULL );
}


void gotIndexListWrapper2 ( void *state , RdbList *list ) {
	gotIndexList2 ( state , list );
}

void addedKeyWrapper ( void *state ) {
	gotIndexList2 ( state, NULL );
}

// . make a web page from results stored in msg40
// . send it on TcpSocket "s" when done
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool gotIndexList2 ( void *state , RdbList *list ) {
	// the state
	State10 *st = (State10 *) state;
	*/
	// get the socket
	TcpSocket *s = st->m_socket;
	// don't allow pages bigger than 128k in cache
	//char  buf [ 64*1024 ];
	// a ptr into "buf"
	//char *p    = buf;
	//char *pend = buf + 64*1024;
	/*
	// get termId
	key_t k = *(key_t *)st->m_list.getStartKey();
	long long termId = g_indexdb.getTermId ( k );
	// get groupId from termId
	//unsigned long groupId = k.n1 & g_hostdb.m_groupMask;
	unsigned long groupId = g_indexdb.getGroupIdFromKey ( &k );
	long hostnum = g_hostdb.makeHostId ( groupId );
	*/
	// check box " checked" strings
	char *ubs = "";
	char *uts = "";
	char *uds = "";
	char *ucs = "";
	char *add = "";
	char *del = "";
	if ( st->m_useDatedb) ubs = " checked";
	if ( st->m_useTree  ) uts = " checked";
	if ( st->m_useDisk  ) uds = " checked";
	if ( st->m_useCache ) ucs = " checked";
	if ( st->m_add      ) add = " checked";
	if ( st->m_del      ) del = " checked";

	SafeBuf *pbuf = &st->m_pbuf;

	g_pages.printAdminTop ( pbuf , st->m_socket , &st->m_r );

	// get base, returns NULL and sets g_errno to ENOCOLLREC on error
	RdbBase *base; 
	if (!(base=getRdbBase((uint8_t)RDB_INDEXDB,st->m_collnum)))return true;

	// print the standard header for admin pages
	pbuf->safePrintf ( 
		  "<center>\n"
		  "<table cellpadding=2><tr><td colspan=4>"
		  "useDatedb:<input type=checkbox value=1 name=ub%s> "
		  "useTree:<input type=checkbox value=1 name=ut%s> "
		  "useDisk:<input type=checkbox value=1 name=ud%s> "
		  "useCache:<input type=checkbox value=1 name=uc%s> "
		  "ADD:<input type=checkbox value=1 name=add%s> "
		  "DELETE:<input type=checkbox value=1 name=del%s>"
		  "</td></tr><tr><td>"
		  "query:"
		  "</td><td>"
		  "<input type=text name=q value=\"%s\" size=20>"
		  "</td><td>"
		  "collection:"
		  "</td><td>"
		  "<input type=text name=c value=\"%s\" size=10>"
		  "</td></tr><tr><td>"
		  "termId:"
		  "</td><td>"
		  "<input type=text name=t value=%lli size=20>"
		  "</td><td>"
		  "numRecs:"
		  "</td><td>"
		  "<input type=text name=numRecs value=%li size=10> "
		  "</td></tr><tr><td>"
		  "docId:"
		  "</td><td>"
		  "<input type=text name=d value=%lli size=20> "
		  "</td><td>"
		  "score:"
		  "</td><td>"
		  "<input type=text name=score value=%li size=10> "
		  "</td><td>"
		  "<input type=submit value=ok border=0>"
		  "</td></tr>"
		  "<tr><td colspan=2>"
		  "term appears in about %lli docs +/- %li"
		  "</td></tr>"
		  //"<tr><td colspan=2>"
		  //"this indexlist held by host #%li and twins"
		  //"</td></tr>"
		  "</table>"
		  "</form><br><br>" ,
		  ubs, uts, uds, ucs, add, del,
		  st->m_query , st->m_coll , st->m_termId  , 
		  st->m_numRecs  ,
		  st->m_docId , (long)st->m_score ,
		  st->m_termFreq ,
		  2 * (long)GB_INDEXDB_PAGE_SIZE / 6 * 
		  base->getNumFiles() );
		  //hostnum );

	if ( g_errno || (st->m_list.isEmpty() ) ) {//&&st->m_list2.isEmpty())){
		if (g_errno)pbuf->safePrintf("Error = %s",mstrerror(g_errno));
		else        pbuf->safePrintf("List is empty");
		pbuf->safePrintf("</center>");
		// erase g_errno for sending
		g_errno = 0;
		// now encapsulate it in html head/tail and send it off
		bool status = g_httpServer.sendDynamicPage(s , 
							   pbuf->getBufStart(),
							   pbuf->length() );
		// delete it
		mdelete ( st , sizeof(State10) , "PageIndexdb" );
		delete (st);
		return status;
	}

	pbuf->safePrintf ( 
		  "<table cellpadding=1 border=1>" 
		  "<tr><td>#</td><td>score</td>"
		  "<td>docId</td><td>domHash</td></tr>");

	//if ( searchingEvents

	// now print the score/docId of indexlist
	long i = 0;
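	// standard RdbList iteration: reset the list pointer, then advance
	// record by record until the list is exhausted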
	for (   st->m_list.resetListPtr () ;
	      ! st->m_list.isExhausted  () ;
		st->m_list.skipCurrentRecord () ) {
		// break if buf is low
		//if ( p + 1024 >= pend ) break;
		// but set the ip/port to a host that has this titleRec
		// stored locally!
		long long     docId   = st->m_list.getCurrentDocId () ;
		//unsigned long groupId = getGroupIdFromDocId ( docId );
		long shardNum = getShardNumFromDocId ( docId );
		// get the first host's hostId in this groupId
		//Host *h = g_hostdb.getFastestHostInGroup ( groupId );
		Host *hosts = g_hostdb.getShard ( shardNum );
		// just pick a host now...
		Host *h = &hosts[0];
		// . pick the first host to handle the cached titleRec request
		// . we assume it has the best time and is up!! TODO: fix!
		// . use local ip though if it was an internal request
		// . otherwise, use the external ip
		//unsigned long  ip   = h->m_externalIp;
		unsigned long  ip   = h->m_ip;
		// use the NAT mapped port
		unsigned short port = h->m_externalHttpPort;
		// log the first docid so we can blaster url: queries
		// to PageIndexdb and see if they are in indexdb
		if ( i == 0 ) 
			logf(LOG_INFO,"indexdb: %llu %s",docId,st->m_query);
		// adjust ip/port if local
		if ( st->m_isLocal ) {
			ip   = h->m_ip;
			port = h->m_httpPort;
		}
		unsigned long date = 0;
		if ( st->m_useDatedb )
			date = (unsigned long)st->m_list.getCurrentDate();
		uint8_t dh = g_titledb.getDomHash8FromDocId ( docId );
		char ds[32];
		ds[0]=0;
		if ( st->m_useDatedb ) sprintf (ds,"%lu/",date);
		pbuf->safePrintf ( 
			  "<tr><td>%li.</td>"
			  "<td>%s%i</td>"
			  "<td>"
			  //"<a href=http://%s:%hu/admin/titledb?d=%llu>"
			  "<a href=/admin/titledb?c=%s&d=%llu>"
			  "%llu"
			  //"<td><a href=/cgi/4.cgi?d=%llu>%llu"
			  "</td>"
			  "<td>"
			  "0x%02lx"
			  "</td>"
			  "</tr>\n" ,
			  i++,
			  ds, (int)st->m_list.getCurrentScore() ,
			  //iptoa(ip) , port ,
			  st->m_coll,
			  docId , 
			  docId ,
			  (long)dh );
	}	
	pbuf->safePrintf ( "</table>" );

	/*
	if ( ! st->m_list2.isEmpty() ) 
		p += sprintf ( p ,
			       "<br>"
			       "<br>"
			       "<table cellpadding=1 border=1>" 
			       "<tr><td>#</td><td>termId</td>"
			       "<td>date</td><td>score</td>"
			       "<td>docId</td></tr>");

	// now print the score/docId of datedb list
	i = 0;
	for (   st->m_list2.resetListPtr () ;
	      ! st->m_list2.isExhausted  () ;
		st->m_list2.skipCurrentRecord () ) {
		// break if buf is low
		if ( p + 1024 >= pend ) break;
		// but set the ip/port to a host that has this titleRec
		// stored locally!
		long long     docId   = st->m_list2.getCurrentDocId () ;
		unsigned long groupId = g_titledb.getGroupId ( docId );
		// get the first host's hostId in this groupId
		Host *h = g_hostdb.getFastestHostInGroup ( groupId );
		// . pick the first host to handle the cached titleRec request
		// . we assume it has the best time and is up!! TODO: fix!
		// . use local ip though if it was an internal request
		// . otherwise, use the external ip
		//unsigned long  ip   = h->m_externalIp;
		unsigned long  ip   = h->m_ip;
		// use the NAT mapped port
		unsigned short port = h->m_externalHttpPort;
		// adjust ip/port if local
		if ( st->m_isLocal ) {
			ip   = h->m_ip;
			port = h->m_httpPort;
		}
		// debug
		char kb[16];
		st->m_list2.getCurrentKey(kb);
		//log(LOG_INFO,"debug: n1=%016llx n0=%016llx",
		//    *(long long *)(kb+8),*(long long *)(kb+0));
		//if ( (unsigned long)st->m_list2.getCurrentDate() == 0 )
		//	log("STOP");
		sprintf ( p , 
			  "<tr><td>%li.</td>"
			  "<td>%llu</td>"
			  "<td>%lu</td><td>%i</td>"
			  "<td>"
			  //"<a href=http://%s:%hu/admin/titledb?d=%llu>"
			  "<a href=/admin/titledb?c=%s&d=%llu>"
			  "%llu"
			  //"<td><a href=/cgi/4.cgi?d=%llu>%llu"
			  "</td></tr>\n" ,
			  i++,
			  st->m_list2.getTermId16(kb) ,
			  (unsigned long)st->m_list2.getCurrentDate() ,
			  (int)st->m_list2.getCurrentScore() ,
			  //iptoa(ip) , port ,
			  st->m_coll,
			  docId , 
			  docId );
		p += gbstrlen ( p );
	}	
	*/
	if ( ! st->m_list.isEmpty() ) 
		pbuf->safePrintf ( "</table>" );


	// print msg if we could fit all into buf
	//if ( p + 1024 >= pend ) {
	//	sprintf ( p ,"... truncated ... no mem" );
	//	p += gbstrlen ( p );		
	//}
	// print the final tail
	//p += g_httpServer.printTail ( p , pend - p );
	pbuf->safePrintf ( "</center>\n");
	// now encapsulate it in html head/tail and send it off
	bool status = g_httpServer.sendDynamicPage ( s , 
						     pbuf->getBufStart() ,
						     pbuf->length() );
	// delete the state
	mdelete ( st , sizeof(State10) , "PageIndexdb" );
	delete (st) ;
	return status;
}