// . returns true if the titlerec for "docId" lives on this host's shard
// . a docid maps to exactly one shard; we are "local" iff that shard is ours
bool Titledb::isLocal ( int64_t docId ) {
	// which shard is responsible for storing this docid's titlerec?
	uint32_t shardNum = getShardNumFromDocId ( docId );
	// compare against the shard this host belongs to
	return shardNum == getMyShardNum();
}
// . pick the host that should handle an injection request for "url"
// . default routing: hash the url to a probable docid and pick a host in the
//   shard that owns that docid (deterministic, cache-friendly)
// . warc/arc archive urls are instead routed to the host with the fewest
//   outstanding msg7 injection requests, so a single host does not end up
//   doing all the expensive wget/gunzip work
Host *getHostToHandleInjection ( char *url ) {
	Url norm;
	norm.set ( url );
	int64_t docId = g_titledb.getProbableDocId ( &norm );
	// get shardNum from docId
	uint32_t shardNum = getShardNumFromDocId ( docId );
	// from Msg22.cpp
	Host *group = g_hostdb.getShard ( shardNum );
	// NOTE(review): assumes docId is non-negative; a negative docId would
	// make hostNum negative and index before group[] — confirm
	// getProbableDocId() never returns a negative value
	int32_t hostNum = docId % g_hostdb.m_numHostsPerShard;
	Host *host = &group[hostNum];

	// detect warc/arc archive urls by suffix
	bool isWarcInjection = false;
	int32_t ulen = gbstrlen(url);
	if ( ulen > 10 && strcmp(url+ulen-8,".warc.gz") == 0 )
		isWarcInjection = true;
	// NOTE(review): the "ulen > 10" guard looks copied from the .warc.gz
	// check; 6 would suffice for a bare ".warc" suffix — harmless either way
	if ( ulen > 10 && strcmp(url+ulen-5,".warc") == 0 )
		isWarcInjection = true;

	// non-archive urls use the simple docid-based routing above
	if ( ! isWarcInjection ) return host;

	// warc files end up calling XmlDoc::indexWarcOrArc() which spawns
	// a msg7 injection request for each doc in the warc/arc file
	// so let's do load balancing differently for them so one host
	// doesn't end up doing a bunch of wget/gunzips on warc files
	// thereby bottlenecking the cluster. get the first hostid that
	// we have not sent a msg7 injection request to that is still out

	// zero out the per-host outstanding-injection counters. m_tmpCount is
	// scratch space, so this function must not run concurrently with other
	// users of m_tmpCount
	for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) {
		Host *h = g_hostdb.getHost(i);
		h->m_tmpCount = 0;
	}

	// walk the outstanding udp slots and count msg7 (0x07) injection
	// requests that WE initiated, per destination host
	for ( UdpSlot *slot = g_udpServer.m_head2 ;
	      slot ;
	      slot = slot->m_next2 ) {
		// skip if not injection request
		if ( slot->m_msgType != 0x07 ) continue;
		//if ( ! slot->m_weInitiated ) continue;
		// if we did not initiate the injection request, i.e. if
		// it is to us, skip it
		if ( ! slot->m_callback ) continue;
		// who is it from?
		int32_t hostId = slot->m_hostId;
		if ( hostId < 0 ) continue;
		Host *h = g_hostdb.getHost ( hostId );
		if ( ! h ) continue;
		h->m_tmpCount++;
	}

	// pick the host with the fewest outstanding injections; a host with
	// zero outstanding wins immediately
	int32_t min  = 999999;
	Host *minh = NULL;
	for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) {
		Host *h = g_hostdb.getHost(i);
		if ( h->m_tmpCount == 0 ) return h;
		if ( h->m_tmpCount >= min ) continue;
		min  = h->m_tmpCount;
		minh = h;
	}
	if ( minh ) return minh;
	// how can this happen? fall back to the docid-routed host
	return host;
}
// . make a web page from results stored in msg40 // . send it on TcpSocket "s" when done // . returns false if blocked, true otherwise // . sets g_errno on error bool gotTitleRec ( void *state ) { // cast the State4 out State4 *st = (State4 *) state; // get the socket TcpSocket *s = st->m_socket; SafeBuf sb; // get it's docId long long docId = st->m_docId; // make the query string for passing to different hosts char qs[64]; sprintf(qs,"&d=%lli",docId); if ( docId==0LL ) qs[0] = 0; // print standard header sb.reserve2x ( 32768 ); g_pages.printAdminTop (&sb, st->m_socket, &st->m_r ); //PAGE_TITLEDB, // st->m_username,//NULL , // st->m_coll , st->m_pwd , s->m_ip , qs ); // shortcut XmlDoc *xd = &st->m_xd; // . deal with errors // . print none if non title rec at or after the provided docId if ( g_errno || docId == 0LL || xd->m_titleRecBuf.length() <= 0 ) { // print docId in box sb.safePrintf ( "<center>\nEnter docId: " "<input type=text name=d value=%lli size=15>", docId); sb.safePrintf ( "</form><br>\n" ); if ( docId == 0 ) sb.safePrintf("<br>"); else if ( g_errno ) sb.safePrintf("<br><br>Error = %s",mstrerror(g_errno)); else sb.safePrintf("<br><br>No titleRec for that docId " "or higher"); // print where it should be //unsigned long gid = getGroupIdFromDocId ( docId ); //Host *hosts = g_hostdb.getGroup(gid); long shardNum = getShardNumFromDocId ( docId ); Host *hosts = g_hostdb.getShard ( shardNum ); long hostId = -1; if ( hosts ) hostId = hosts[0].m_hostId; sb.safePrintf("<br><br>docId on host #%li and twins.",hostId); sb.safePrintf ( "\n</center>" ); mdelete ( st , sizeof(State4) , "PageTitledb"); delete (st); // erase g_errno for sending g_errno = 0; // now encapsulate it in html head/tail and send it off return g_httpServer.sendDynamicPage ( s , sb.getBufStart(), sb.length() ); } // print docId in box sb.safePrintf ("<br>\n" "<center>Enter docId: " "<input type=text name=d value=%lli size=15>", docId ); // print where it should be //unsigned long gid = 
getGroupIdFromDocId ( docId ); //Host *hosts = g_hostdb.getGroup(gid); long shardNum = getShardNumFromDocId ( docId ); Host *hosts = g_hostdb.getShard ( shardNum ); long hostId = -1; if ( hosts ) hostId = hosts[0].m_hostId; sb.safePrintf("<br><br>docId on host #%li and twins.",hostId); sb.safePrintf ( "</form><br>\n" ); //char *coll = st->m_coll; Title *ti = xd->getTitle(); if ( ! ti ) { log ( "admin: Could not set title" ); return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno)); } // sanity check. should not block if ( ! xd->m_titleValid ) { char *xx=NULL;*xx=0; } // print it out xd->printDoc ( &sb ); // don't forget to cleanup mdelete ( st , sizeof(State4) , "PageTitledb"); delete (st); // now encapsulate it in html head/tail and send it off return g_httpServer.sendDynamicPage (s, sb.getBufStart(), sb.length()); }
// . if url is NULL use the docId to get the titleRec
// . if titleRec is NULL use our own internal m_myTitleRec
// . sets g_errno to ENOTFOUND if TitleRec does not exist for this url/docId
// . if g_errno is ENOTFOUND m_docId will be set to the best available docId
//   for this url to use if we're adding it to Titledb
// . if g_errno is ENOTFOUND and m_docId is 0 then no docIds were available
// . "url" must be NULL terminated
// . exactly one of "url"/"docId" must be supplied (enforced by the sanity
//   checks below)
// . returns true (with m_errno/g_errno set) on immediate error, false when
//   the request was cast and "callback" will be invoked later
// . NOTE(review): "addToCache" and "maxCacheAge" params are accepted but
//   r->m_addToCache is forced false and r->m_maxCacheAge forced 0 below —
//   presumably intentional dead parameters; confirm before relying on them
bool Msg22::getTitleRec ( Msg22Request *r ,
			  char *url ,
			  long long docId ,
			  char *coll ,
			  char **titleRecPtrPtr ,
			  long *titleRecSizePtr,
			  bool justCheckTfndb ,
			  // when indexing spider replies we just want
			  // a unique docid... "docId" should be the desired
			  // one, but we might have to change it.
			  bool getAvailDocIdOnly ,
			  void *state ,
			  void (* callback) (void *state) ,
			  long niceness ,
			  bool addToCache ,
			  long maxCacheAge ,
			  long timeout ,
			  bool doLoadBalancing ) {
	// sanity: the two "special" modes are mutually exclusive, and
	// getAvailDocIdOnly works off a docid, never a url
	if ( getAvailDocIdOnly && justCheckTfndb ) { char *xx=NULL;*xx=0; }
	if ( getAvailDocIdOnly && url            ) { char *xx=NULL;*xx=0; }
	//if ( m_url ) log(LOG_DEBUG,"build: getting TitleRec for %s",m_url);
	// sanity checks: caller must supply url XOR docId, a collection, a
	// callback, and this Msg22/request must not already be in flight
	if ( url && docId!=0LL ) { char *xx=NULL;*xx=0; }
	if ( url && !url[0]    ) { char *xx=NULL;*xx=0; }
	if ( docId!=0LL && url ) { char *xx=NULL;*xx=0; }
	if ( ! coll            ) { char *xx=NULL;*xx=0; }
	if ( ! callback        ) { char *xx=NULL;*xx=0; }
	if ( r->m_inUse        ) { char *xx=NULL;*xx=0; }
	if ( m_outstanding     ) { char *xx = NULL;*xx=0; }
	// sanity check: output pointers are only optional for the two
	// modes that do not return a titlerec
	if ( ! justCheckTfndb && ! getAvailDocIdOnly ) {
		if ( ! titleRecPtrPtr  ) { char *xx=NULL;*xx=0; }
		if ( ! titleRecSizePtr ) { char *xx=NULL;*xx=0; }
	}
	// remember, caller want us to set this
	m_titleRecPtrPtr  = titleRecPtrPtr;
	m_titleRecSizePtr = titleRecSizePtr;
	// assume not found. this can be NULL if justCheckTfndb is true,
	// like when it is called from XmlDoc::getIsNew()
	if ( titleRecPtrPtr  ) *titleRecPtrPtr  = NULL;
	if ( titleRecSizePtr ) *titleRecSizePtr = 0;
	// save callback
	m_state    = state;
	m_callback = callback;
	// save it
	m_r = r;
	// set request
	r->m_docId             = docId;
	r->m_niceness          = niceness;
	r->m_justCheckTfndb    = (bool)justCheckTfndb;
	r->m_getAvailDocIdOnly = (bool)getAvailDocIdOnly;
	r->m_doLoadBalancing   = (bool)doLoadBalancing;
	r->m_collnum           = g_collectiondb.getCollnum ( coll );
	r->m_addToCache        = false;
	r->m_maxCacheAge       = 0;
	// url must start with http(s)://. must be normalized.
	if ( url && url[0] != 'h' ) {
		log("msg22: BAD URL! does not start with 'h'");
		m_errno = g_errno = EBADENGINEER;
		return true;
	}
	// store url
	// NOTE(review): unbounded strcpy — assumes r->m_url is sized for any
	// normalized url the callers pass in; confirm MAX_URL_LEN bound
	if ( url ) strcpy(r->m_url,url);
	else       r->m_url[0] = '\0';
	// if no docid provided, use probable docid
	if ( ! docId ) docId = g_titledb.getProbableDocId ( url );
	// get shardNum from docId
	uint32_t shardNum = getShardNumFromDocId ( docId );
	// generate cacheKey, just use docid now
	key_t cacheKey ; cacheKey.n1 = 0; cacheKey.n0 = docId;
	// do load balancing iff we're the spider because if we send this
	// request to a merging host, and prefer local reads is true, the
	// resulting disk read will be starved somewhat. otherwise, we save
	// time by not having to cast a Msg36
	bool balance = false;
	/*
	// if clusterdb, do bias
	long firstHostId = -1;
	// i don't see why not to always bias it, this makes tfndb page cache
	// twice as effective for all lookups
	long numTwins = g_hostdb.getNumHostsPerShard();
	//long long bias=((0x0000003fffffffffLL)/(long long)numTwins);
	long long sectionWidth = (DOCID_MASK/(long long)numTwins) + 1;
	long hostNum = (docId & DOCID_MASK) / sectionWidth;
	long numHosts = g_hostdb.getNumHostsPerShard();
	Host *hosts = g_hostdb.getGroup ( groupId );
	if ( hostNum >= numHosts ) { char *xx = NULL; *xx = 0; }
	firstHostId = hosts [ hostNum ].m_hostId ;
	*/
	// get our group (the shard that owns this docid)
	long  allNumHosts = g_hostdb.getNumHostsPerShard();
	Host *allHosts    = g_hostdb.getShard ( shardNum );//Group ( groupId );

	// put all alive hosts in this array
	// NOTE(review): no bounds check on cand[32] — assumes a shard never
	// has more than 32 hosts; confirm against hosts.conf limits
	Host *cand[32];
	long long nc = 0;
	for ( long i = 0 ; i < allNumHosts ; i++ ) {
		// get that host
		Host *hh = &allHosts[i];
		// skip if dead
		if ( g_hostdb.isDead(hh) ) continue;
		// add it if alive
		cand[nc++] = hh;
	}
	// if none alive, make them all candidates then
	bool allDead = (nc == 0);
	for ( long i = 0 ; allDead && i < allNumHosts ; i++ )
		cand[nc++] = &allHosts[i];

	// route based on docid region, not parity, because we want to hit
	// the urldb page cache as much as possible
	long long sectionWidth =((128LL*1024*1024)/nc)+1;//(DOCID_MASK/nc)+1LL;
	// we mod by 1MB since tied scores resort to sorting by docid
	// so we don't want to overload the host responsible for the lowest
	// range of docids. CAUTION: do this for msg22 too!
	// in this way we should still ensure a pretty good biased urldb
	// cache...
	// . TODO: fix the urldb cache preload logic
	long hostNum = (docId % (128LL*1024*1024)) / sectionWidth;
	if ( hostNum < 0 ) hostNum = 0; // watch out for negative docids
	if ( hostNum >= nc ) { char *xx = NULL; *xx = 0; }
	long firstHostId = cand [ hostNum ]->m_hostId ;

	// while this prevents tfndb seeks, it also causes bottlenecks
	// if one host is particularly slow, because load balancing is
	// bypassed.
	//if ( ! g_conf.m_useBiasedTfndb ) firstHostId = -1;

	// flag it: request is now in flight
	m_outstanding = true;
	r->m_inUse    = 1;

	// . send this request to the least-loaded host that can handle it
	// . returns false and sets g_errno on error
	// . use a pre-allocated buffer to hold the reply
	// . TMPBUFSIZE is how much a UdpSlot can hold w/o allocating
	if ( ! m_mcast.send ( (char *)r ,
			      r->getSize() ,
			      0x22 ,  // msgType 0x22
			      false , // m_mcast own m_request?
			      shardNum , // send to group (groupKey)
			      false , // send to whole group?
			      //hostKey , // key is lower bits of docId
			      0 , // key is lower bits of docId
			      this , // state data
			      NULL , // state data
			      gotReplyWrapper22 ,
			      timeout , // 60 second time out
			      r->m_niceness , // nice, reply size can be huge
			      false , // realtime?
			      firstHostId , // first hostid
			      NULL , // replyBuf
			      0 , // replyBufMaxSize
			      false , // free reply buf?
			      balance , // do disk load balancing?
			      maxCacheAge , // maxCacheAge
			      cacheKey , // cacheKey
			      RDB_TITLEDB , // rdbId of titledb
			      32*1024 ) ){// minRecSizes avg
		log("db: Requesting title record had error: %s.",
		    mstrerror(g_errno) );
		// set m_errno
		m_errno = g_errno;
		// no, multicast will free since he owns it!
		//if (replyBuf) mfree ( replyBuf , replyBufMaxSize , "Msg22" );
		return true;
	}
	// otherwise, we blocked and gotReplyWrapper will be called
	return false;
}
// returns true and sets g_errno on error, otherwise, blocks and returns false bool Msg20::getSummary ( Msg20Request *req ) { // reset ourselves in case recycled reset(); // consider it "launched" m_launched = true; // save it m_requestDocId = req->m_docId; m_state = req->m_state; m_callback = req->m_callback; m_callback2 = NULL; // does this ever happen? if ( g_hostdb.getNumHosts() <= 0 ) { log("build: hosts2.conf is not in working directory, or " "contains no valid hosts."); g_errno = EBADENGINEER; return true; } if ( req->m_docId < 0 && ! req->ptr_ubuf ) { log("msg20: docid<0 and no url for msg20::getsummary"); g_errno = EBADREQUEST; return true; } // get groupId from docId, if positive uint32_t shardNum; if ( req->m_docId >= 0 ) shardNum = g_hostdb.getShardNumFromDocId(req->m_docId); else { int64_t pdocId = Titledb::getProbableDocId(req->ptr_ubuf); shardNum = getShardNumFromDocId(pdocId); } // we might be getting inlinks for a spider request // so make sure timeout is inifinite for that... const int32_t timeout = (req->m_niceness==0) ? multicast_msg20_summary_timeout : multicast_infinite_send_timeout; // get our group int32_t allNumHosts = g_hostdb.getNumHostsPerShard(); Host *allHosts = g_hostdb.getShard ( shardNum ); // put all alive hosts in this array Host *cand[32]; int64_t nc = 0; for ( int32_t i = 0 ; i < allNumHosts ; i++ ) { // get that host Host *hh = &allHosts[i]; // skip if dead if ( g_hostdb.isDead(hh) ) continue; // Respect no-spider, no-query directives from hosts.conf if ( !req->m_getLinkInfo && ! hh->m_queryEnabled ) continue; if ( req->m_getLinkInfo && ! hh->m_spiderEnabled ) continue; // add it if alive cand[nc++] = hh; } // if none alive, make them all candidates then bool allDead = (nc == 0); for ( int32_t i = 0 ; allDead && i < allNumHosts ; i++ ) { // NEVER add a noquery host to the candidate list, even // if the query host is dead if ( ! 
allHosts[i].m_queryEnabled ) continue; cand[nc++] = &allHosts[i]; } if ( nc == 0 ) { log("msg20: error sending mcast: no queryable hosts " "availble to handle summary generation"); g_errno = EBADENGINEER; m_gotReply = true; return true; } // route based on docid region, not parity, because we want to hit // the urldb page cache as much as possible int64_t sectionWidth =((128LL*1024*1024)/nc)+1; int64_t probDocId = req->m_docId; // i think reference pages just pass in a url to get the summary if ( probDocId < 0 && req->size_ubuf ) probDocId = Titledb::getProbableDocId ( req->ptr_ubuf ); if ( probDocId < 0 ) { log("query: Got bad docid/url combo."); probDocId = 0; } // we mod by 1MB since tied scores resort to sorting by docid // so we don't want to overload the host responsible for the lowest // range of docids. CAUTION: do this for msg22 too! // in this way we should still ensure a pretty good biased urldb // cache... // . TODO: fix the urldb cache preload logic int32_t hostNum = (probDocId % (128LL*1024*1024)) / sectionWidth; if ( hostNum < 0 ) hostNum = 0; // watch out for negative docids if ( hostNum >= nc ) { g_process.shutdownAbort(true); } int32_t firstHostId = cand [ hostNum ]->m_hostId ; m_requestSize = 0; m_request = req->serialize ( &m_requestSize ); // . it sets g_errno on error and returns NULL // . we MUST call gotReply() here to set m_gotReply // otherwise Msg40.cpp can end up looping forever // calling Msg40::launchMsg20s() if ( ! m_request ) { gotReply(NULL); return true; } // . otherwise, multicast to a host in group "groupId" // . returns false and sets g_errno on error // . use a pre-allocated buffer to hold the reply // . TMPBUFSIZE is how much a UdpSlot can hold w/o allocating if (!m_mcast.send(m_request, m_requestSize, msg_type_20, false, shardNum, false, probDocId, this, NULL, gotReplyWrapper20, timeout, req->m_niceness, firstHostId, false)) { // sendto() sometimes returns "Network is down" so i guess // we just had an "error reply". 
log("msg20: error sending mcast %s",mstrerror(g_errno)); m_gotReply = true; return true; } // we are officially "in progress" m_inProgress = true; // we blocked return false; }
// . older variant of Msg20::getSummary that supports querying an external
//   network via req->m_hostdb (falls back to g_hostdb when NULL)
// . returns true and sets g_errno on error, otherwise, blocks and returns
//   false and the reply arrives via gotReplyWrapper20
bool Msg20::getSummary ( Msg20Request *req ) {
	// reset ourselves in case recycled
	reset();
	// consider it "launched"
	m_launched = true;
	// save it
	m_requestDocId = req->m_docId;
	m_state     = req->m_state;
	m_callback  = req->m_callback;
	m_callback2 = req->m_callback2;
	m_expected  = req->m_expected;
	m_eventId   = req->m_eventId;
	// clear this
	//m_eventIdBits.clear();
	// set this
	//if ( req->m_eventId ) m_eventIdBits.addEventId(req->m_eventId);
	Hostdb *hostdb = req->m_hostdb;
	// ensure hostdb has a host in it
	if ( ! hostdb ) hostdb = &g_hostdb;
	// does this ever happen?
	if ( hostdb->getNumHosts() <= 0 ) {
		log("build: hosts2.conf is not in working directory, or "
		    "contains no valid hosts.");
		g_errno = EBADENGINEER;
		return true;
	}
	// do not re-route to twins if accessing an external network
	if ( hostdb != &g_hostdb ) req->m_expected = false;
	// get shardNum from docId, if positive, else from the url's
	// probable docid
	unsigned long shardNum;
	if ( req->m_docId >= 0 )
		shardNum = hostdb->getShardNumFromDocId(req->m_docId);
	else {
		long long pdocId = g_titledb.getProbableDocId(req->ptr_ubuf);
		shardNum = getShardNumFromDocId(pdocId);
	}
	// we might be getting inlinks for a spider request
	// so make sure timeout is infinite for that...
	// NOTE(review): units here are presumably seconds given the "10
	// million seconds" comment, while the newer variant uses multicast_*
	// timeout constants — confirm what m_mcast.send() expects
	long timeout = 9999999; // 10 million seconds, basically inf.
	if ( req->m_niceness == 0 ) timeout = 20;
	// get our group (shard)
	long  allNumHosts = hostdb->getNumHostsPerShard();
	Host *allHosts    = hostdb->getShard ( shardNum );//getGroup(groupId );
	// put all alive hosts in this array
	// NOTE(review): no bounds check on cand[32] — assumes a shard never
	// has more than 32 hosts; confirm against hosts.conf limits
	Host *cand[32];
	long long nc = 0;
	for ( long i = 0 ; i < allNumHosts ; i++ ) {
		// get that host
		Host *hh = &allHosts[i];
		// skip if dead
		if ( g_hostdb.isDead(hh) ) continue;
		// add it if alive
		cand[nc++] = hh;
	}
	// if none alive, make them all candidates then
	bool allDead = (nc == 0);
	for ( long i = 0 ; allDead && i < allNumHosts ; i++ )
		cand[nc++] = &allHosts[i];
	// route based on docid region, not parity, because we want to hit
	// the urldb page cache as much as possible
	long long sectionWidth =((128LL*1024*1024)/nc)+1;//(DOCID_MASK/nc)+1LL;
	long long probDocId = req->m_docId;
	// i think reference pages just pass in a url to get the summary
	if ( probDocId < 0 && req->size_ubuf )
		probDocId = g_titledb.getProbableDocId ( req->ptr_ubuf );
	if ( probDocId < 0 ) {
		log("query: Got bad docid/url combo.");
		probDocId = 0;
	}
	// we mod by 1MB since tied scores resort to sorting by docid
	// so we don't want to overload the host responsible for the lowest
	// range of docids. CAUTION: do this for msg22 too!
	// in this way we should still ensure a pretty good biased urldb
	// cache...
	// . TODO: fix the urldb cache preload logic
	long hostNum = (probDocId % (128LL*1024*1024)) / sectionWidth;
	if ( hostNum < 0 ) hostNum = 0; // watch out for negative docids
	if ( hostNum >= nc ) { char *xx = NULL; *xx = 0; }
	long firstHostId = cand [ hostNum ]->m_hostId ;
	// . make buffer m_request to hold the request
	// . tries to use m_requestBuf[] if it is big enough to hold it
	// . allocs a new buf if MAX_MSG20_REQUEST_SIZE is too small
	// . serializes the request into m_request
	// . sets m_requestSize to the size of the serialized request
	m_requestSize = 0;
	m_request = req->serialize ( &m_requestSize,
				     m_requestBuf ,
				     MAX_MSG20_REQUEST_SIZE );
	// . it sets g_errno on error and returns NULL
	// . we MUST call gotReply() here to set m_gotReply
	//   otherwise Msg40.cpp can end up looping forever
	//   calling Msg40::launchMsg20s()
	if ( ! m_request ) { gotReply(NULL); return true; }
	// . otherwise, multicast to a host in group "groupId"
	// . returns false and sets g_errno on error
	// . use a pre-allocated buffer to hold the reply
	// . TMPBUFSIZE is how much a UdpSlot can hold w/o allocating
	if ( ! m_mcast.send ( m_request ,
			      m_requestSize ,
			      0x20 ,  // msgType 0x20
			      false , // m_mcast own m_request?
			      shardNum , // send to group (groupKey)
			      false , // send to whole group?
			      probDocId , // key is lower bits of docId
			      this , // state data
			      NULL , // state data
			      gotReplyWrapper20 ,
			      timeout , // 60 second time out
			      req->m_niceness ,
			      false , // real time?
			      firstHostId , // first hostid
			      NULL,//m_replyBuf ,
			      0,//MSG20_MAX_REPLY_SIZE,//m_replyMaxSize
			      false , // free reply buf?
			      false , // do disk load balancing?
			      -1 , // max cache age
			      0 , // cacheKey
			      0 , // bogus rdbId
			      -1 , // minRecSizes(unknownRDsize)
			      true , // sendToSelf
			      true , // retry forever
			      hostdb )) {
		// sendto() sometimes returns "Network is down" so i guess
		// we just had an "error reply".
		log("msg20: error sending mcast %s",mstrerror(g_errno));
		m_gotReply = true;
		return true;
	}
	// we are officially "in progress"
	m_inProgress = true;
	// we blocked
	return false;
}
// . if url is NULL use the docId to get the titleRec
// . if titleRec is NULL use our own internal m_myTitleRec
// . sets g_errno to ENOTFOUND if TitleRec does not exist for this url/docId
// . if g_errno is ENOTFOUND m_docId will be set to the best available docId
//   for this url to use if we're adding it to Titledb
// . if g_errno is ENOTFOUND and m_docId is 0 then no docIds were available
// . "url" must be NULL terminated
// . "timeout" is in seconds (converted to ms at the m_mcast.send call)
// . returns true (with m_errno/g_errno set) on immediate error, false when
//   the request was cast and "callback" will be invoked later
// . NOTE(review): "addToCache" and "maxCacheAge" params are accepted but
//   r->m_addToCache is forced false and r->m_maxCacheAge forced 0 below —
//   presumably intentional dead parameters; confirm before relying on them
bool Msg22::getTitleRec ( Msg22Request *r  ,
			  char *url ,
			  int64_t docId ,
			  char *coll ,
			  char **titleRecPtrPtr ,
			  int32_t *titleRecSizePtr,
			  bool justCheckTfndb ,
			  // when indexing spider replies we just want
			  // a unique docid... "docId" should be the desired
			  // one, but we might have to change it.
			  bool getAvailDocIdOnly ,
			  void *state ,
			  void (* callback) (void *state) ,
			  int32_t niceness ,
			  bool addToCache ,
			  int32_t maxCacheAge ,
			  int32_t timeout ) {

	m_availDocId = 0;

	// sanity: the two "special" modes are mutually exclusive, and
	// getAvailDocIdOnly works off a docid, never a url
	if ( getAvailDocIdOnly && justCheckTfndb ) { char *xx=NULL;*xx=0; }
	if ( getAvailDocIdOnly && url            ) { char *xx=NULL;*xx=0; }

	//if ( url ) log(LOG_DEBUG,"build: getting TitleRec for %s",url);

	// sanity checks: caller must supply url XOR docId, a collection, a
	// callback, and this Msg22/request must not already be in flight
	if ( url && docId!=0LL ) { char *xx=NULL;*xx=0; }
	if ( url && !url[0]    ) { char *xx=NULL;*xx=0; }
	if ( docId!=0LL && url ) { char *xx=NULL;*xx=0; }
	if ( ! coll            ) { char *xx=NULL;*xx=0; }
	if ( ! callback        ) { char *xx=NULL;*xx=0; }
	if ( r->m_inUse        ) { char *xx=NULL;*xx=0; }
	if ( m_outstanding     ) { char *xx = NULL;*xx=0; }

	// sanity check: output pointers are only optional for the two
	// modes that do not return a titlerec
	if ( ! justCheckTfndb && ! getAvailDocIdOnly ) {
		if ( ! titleRecPtrPtr  ) { char *xx=NULL;*xx=0; }
		if ( ! titleRecSizePtr ) { char *xx=NULL;*xx=0; }
	}

	// remember, caller want us to set this
	m_titleRecPtrPtr  = titleRecPtrPtr;
	m_titleRecSizePtr = titleRecSizePtr;
	// assume not found. this can be NULL if justCheckTfndb is true,
	// like when it is called from XmlDoc::getIsNew()
	if ( titleRecPtrPtr  ) *titleRecPtrPtr  = NULL;
	if ( titleRecSizePtr ) *titleRecSizePtr = 0;

	// save callback
	m_state    = state;
	m_callback = callback;

	// save it
	m_r = r;
	// set request
	r->m_docId             = docId;
	r->m_niceness          = niceness;
	r->m_justCheckTfndb    = (bool)justCheckTfndb;
	r->m_getAvailDocIdOnly = (bool)getAvailDocIdOnly;
	r->m_collnum           = g_collectiondb.getCollnum ( coll );
	r->m_addToCache        = false;
	r->m_maxCacheAge       = 0;

	// url must start with http(s)://. must be normalized.
	if ( url && url[0] != 'h' ) {
		log("msg22: BAD URL! does not start with 'h'");
		m_errno = g_errno = EBADENGINEER;
		return true;
	}

	// store url
	// NOTE(review): unbounded strcpy — assumes r->m_url is sized for any
	// normalized url the callers pass in; confirm MAX_URL_LEN bound
	if ( url ) strcpy(r->m_url,url);
	else       r->m_url[0] = '\0';

	// if no docid provided, use probable docid
	if ( ! docId ) docId = g_titledb.getProbableDocId ( url );
	// get shardNum from docId
	uint32_t shardNum = getShardNumFromDocId ( docId );
	// generate cacheKey, just use docid now
	key_t cacheKey ; cacheKey.n1 = 0; cacheKey.n0 = docId;

	// do load balancing iff we're the spider because if we send this
	// request to a merging host, and prefer local reads is true, the
	// resulting disk read will be starved somewhat. otherwise, we save
	// time by not having to cast a Msg36
	bool balance = false;

	Host *firstHost ;
	// if niceness 0 can't pick noquery host.
	// if niceness 1 can't pick nospider host.
	firstHost = g_hostdb.getLeastLoadedInShard ( shardNum, r->m_niceness );
	int32_t firstHostId = firstHost->m_hostId;

	// flag it: request is now in flight
	m_outstanding = true;
	r->m_inUse    = 1;

	// . send this request to the least-loaded host that can handle it
	// . returns false and sets g_errno on error
	// . use a pre-allocated buffer to hold the reply
	// . TMPBUFSIZE is how much a UdpSlot can hold w/o allocating
	if ( ! m_mcast.send ( (char *)r ,
			      r->getSize() ,
			      0x22 ,  // msgType 0x22
			      false , // m_mcast own m_request?
			      shardNum , // send to group (groupKey)
			      false , // send to whole group?
			      //hostKey , // key is lower bits of docId
			      0 , // key is lower bits of docId
			      this , // state data
			      NULL , // state data
			      gotReplyWrapper22 ,
			      timeout*1000 , // timeout (seconds -> ms)
			      r->m_niceness , // nice, reply size can be huge
			      firstHostId , // first hostid
			      NULL , // replyBuf
			      0 , // replyBufMaxSize
			      false , // free reply buf?
			      balance , // do disk load balancing?
			      maxCacheAge , // maxCacheAge
			      cacheKey , // cacheKey
			      RDB_TITLEDB , // rdbId of titledb
			      32*1024 ) ){// minRecSizes avg
		log("db: Requesting title record had error: %s.",
		    mstrerror(g_errno) );
		// set m_errno
		m_errno = g_errno;
		// no, multicast will free since he owns it!
		//if (replyBuf) mfree ( replyBuf , replyBufMaxSize , "Msg22" );
		return true;
	}
	// otherwise, we blocked and gotReplyWrapper will be called
	return false;
}
// . build the /admin/indexdb web page showing the index list for a termId
// . send it on TcpSocket "s" when done
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . most of the old two-phase (indexdb + datedb) logic is commented out;
//   only the indexdb list in st->m_list is rendered now
bool gotIndexList ( void *state ) {
	// the state
	State10 *st = (State10 *) state;
	// launch more
	if ( ! launchRequests ( st ) ) return false;

	/*
	// get the date list
	//fprintf(stderr,"termId now=%lli\n",st->m_termId);
	//fprintf(stderr,"should be=%lli\n",(st->m_termId & TERMID_MASK));
	// . now get the indexList for this termId
	// . date is complemented, so start with bigger one first
	key128_t startKey = g_datedb.makeStartKey ( st->m_termId ,0xffffffff);
	key128_t endKey   = g_datedb.makeEndKey   ( st->m_termId ,0x0);
	// get the rdb ptr to titledb's rdb
	//Rdb *rdb = g_indexdb.getRdb();
	// -1 means read from all files in Indexdb
	long numFiles = -1;
	// make it zero if caller doesn't want to hit the disk
	if ( ! st->m_useDisk ) numFiles = 0;
	// get the title rec at or after this docId
	if ( ! st->m_msg0.getList ( -1 ,
				    0 ,
				    0 ,
				    0 , // max cache age
				    false , // add to cache?
				    RDB_DATEDB , // rdbId of 2 = indexdb
				    st->m_coll ,
				    &st->m_list2 ,
				    (char *)&startKey ,
				    (char *)&endKey ,
				    st->m_numRecs * sizeof(key128_t),//recSizes
				    //st->m_useTree , // include tree?
				    //st->m_useCache , // include cache?
				    //false , // add to cache?
				    //0 , // startFileNum
				    //numFiles , // numFiles
				    st , // state
				    gotIndexListWrapper2 ,
				    0 ) ) // niceness
		return false;
	// otherwise call gotResults which returns false if blocked, true else
	// and sets g_errno on error
	return gotIndexList2 ( (void *) st , NULL );
}

void gotIndexListWrapper2 ( void *state , RdbList *list ) {
	gotIndexList2 ( state , list );
}

void addedKeyWrapper ( void *state ) {
	gotIndexList2 ( state, NULL );
}

// . make a web page from results stored in msg40
// . send it on TcpSocket "s" when done
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool gotIndexList2 ( void *state , RdbList *list ) {
	// the state
	State10 *st = (State10 *) state;
	*/

	// get the socket
	TcpSocket *s = st->m_socket;

	// don't allow pages bigger than 128k in cache
	//char  buf [ 64*1024 ];
	// a ptr into "buf"
	//char *p    = buf;
	//char *pend = buf + 64*1024;

	/*
	// get termId
	key_t k = *(key_t *)st->m_list.getStartKey();
	long long termId = g_indexdb.getTermId ( k );
	// get groupId from termId
	//unsigned long groupId = k.n1 & g_hostdb.m_groupMask;
	unsigned long groupId = g_indexdb.getGroupIdFromKey ( &k );
	long hostnum = g_hostdb.makeHostId ( groupId );
	*/

	// check box " checked" strings for the html form below
	char *ubs = "";
	char *uts = "";
	char *uds = "";
	char *ucs = "";
	char *add = "";
	char *del = "";
	if ( st->m_useDatedb) ubs = " checked";
	if ( st->m_useTree  ) uts = " checked";
	if ( st->m_useDisk  ) uds = " checked";
	if ( st->m_useCache ) ucs = " checked";
	if ( st->m_add      ) add = " checked";
	if ( st->m_del      ) del = " checked";

	SafeBuf *pbuf = &st->m_pbuf;

	g_pages.printAdminTop ( pbuf , st->m_socket , &st->m_r );

	// get base, returns NULL and sets g_errno to ENOCOLLREC on error
	RdbBase *base;
	if (!(base=getRdbBase((uint8_t)RDB_INDEXDB,st->m_collnum)))return true;

	// print the standard header for admin pages
	pbuf->safePrintf ( "<center>\n"
			   "<table cellpadding=2><tr><td colspan=4>"
			   "useDatedb:<input type=checkbox value=1 name=ub%s> "
			   "useTree:<input type=checkbox value=1 name=ut%s> "
			   "useDisk:<input type=checkbox value=1 name=ud%s> "
			   "useCache:<input type=checkbox value=1 name=uc%s> "
			   "ADD:<input type=checkbox value=1 name=add%s> "
			   "DELETE:<input type=checkbox value=1 name=del%s>"
			   "</td></tr><tr><td>"
			   "query:"
			   "</td><td>"
			   "<input type=text name=q value=\"%s\" size=20>"
			   "</td><td>"
			   "collection:"
			   "</td><td>"
			   "<input type=text name=c value=\"%s\" size=10>"
			   "</td></tr><tr><td>"
			   "termId:"
			   "</td><td>"
			   "<input type=text name=t value=%lli size=20>"
			   "</td><td>"
			   "numRecs:"
			   "</td><td>"
			   "<input type=text name=numRecs value=%li size=10> "
			   "</td></tr><tr><td>"
			   "docId:"
			   "</td><td>"
			   "<input type=text name=d value=%lli size=20> "
			   "</td><td>"
			   "score:"
			   "</td><td>"
			   "<input type=text name=score value=%li size=10> "
			   "</td><td>"
			   "<input type=submit value=ok border=0>"
			   "</td></tr>"
			   "<tr><td colspan=2>"
			   "term appears in about %lli docs +/- %li"
			   "</td></tr>"
			   //"<tr><td colspan=2>"
			   //"this indexlist held by host #%li and twins"
			   //"</td></tr>"
			   "</table>"
			   "</form><br><br>" ,
			   ubs, uts, uds, ucs, add, del,
			   st->m_query ,
			   st->m_coll ,
			   st->m_termId ,
			   st->m_numRecs ,
			   st->m_docId ,
			   (long)st->m_score ,
			   st->m_termFreq ,
			   2 * (long)GB_INDEXDB_PAGE_SIZE / 6 *
			   base->getNumFiles() );
			   //hostnum );

	// error or empty list: report and clean up
	if ( g_errno || (st->m_list.isEmpty() ) ) {//&&st->m_list2.isEmpty())){
		if (g_errno)pbuf->safePrintf("Error = %s",mstrerror(g_errno));
		else        pbuf->safePrintf("List is empty");
		pbuf->safePrintf("</center>");
		// erase g_errno for sending
		g_errno = 0;
		// now encapsulate it in html head/tail and send it off
		bool status = g_httpServer.sendDynamicPage(s ,
							   pbuf->getBufStart(),
							   pbuf->length() );
		// delete it
		mdelete ( st , sizeof(State10) , "PageIndexdb" );
		delete (st);
		return status;
	}

	pbuf->safePrintf ( "<table cellpadding=1 border=1>"
			   "<tr><td>#</td><td>score</td>"
			   "<td>docId</td><td>domHash</td></tr>");

	//if ( searchingEvents

	// now print the score/docId of indexlist
	long i = 0;
	for ( st->m_list.resetListPtr () ;
	      ! st->m_list.isExhausted () ;
	      st->m_list.skipCurrentRecord () ) {
		// break if buf is low
		//if ( p + 1024 >= pend ) break;
		// but set the ip/port to a host that has this titleRec
		// stored locally!
		long long docId = st->m_list.getCurrentDocId () ;
		//unsigned long groupId = getGroupIdFromDocId ( docId );
		long shardNum = getShardNumFromDocId ( docId );
		// get the first host's hostId in this groupId
		//Host *h = g_hostdb.getFastestHostInGroup ( groupId );
		Host *hosts = g_hostdb.getShard ( shardNum );
		// just pick a host now...
		Host *h = &hosts[0];
		// . pick the first host to handle the cached titleRec request
		// . we assume it has the best time and is up!! TODO: fix!
		// . use local ip though if it was an internal request
		// . otherwise, use the external ip
		//unsigned long ip = h->m_externalIp;
		// NOTE(review): ip/port are computed but no longer used by
		// the safePrintf below (the href that used them is commented
		// out) — kept for when that link is restored
		unsigned long ip = h->m_ip;
		// use the NAT mapped port
		unsigned short port = h->m_externalHttpPort;
		// log the first docid so we can blaster url: queries
		// to PageIndexdb and see if they are in indexdb
		if ( i == 0 )
			logf(LOG_INFO,"indexdb: %llu %s",docId,st->m_query);
		// adjust ip/port if local
		if ( st->m_isLocal ) {
			ip   = h->m_ip;
			port = h->m_httpPort;
		}
		unsigned long date = 0;
		if ( st->m_useDatedb )
			date = (unsigned long)st->m_list.getCurrentDate();
		uint8_t dh = g_titledb.getDomHash8FromDocId ( docId );
		char ds[32];
		ds[0]=0;
		if ( st->m_useDatedb ) sprintf (ds,"%lu/",date);
		pbuf->safePrintf ( "<tr><td>%li.</td>"
				   "<td>%s%i</td>"
				   "<td>"
				   //"<a href=http://%s:%hu/admin/titledb?d=%llu>"
				   "<a href=/admin/titledb?c=%s&d=%llu>"
				   "%llu"
				   //"<td><a href=/cgi/4.cgi?d=%llu>%llu"
				   "</td>"
				   "<td>"
				   "0x%02lx"
				   "</td>"
				   "</tr>\n" ,
				   i++,
				   ds,
				   (int)st->m_list.getCurrentScore() ,
				   //iptoa(ip) , port ,
				   st->m_coll,
				   docId ,
				   docId ,
				   (long)dh );
	}
	pbuf->safePrintf ( "</table>" );

	/*
	if ( ! st->m_list2.isEmpty() )
		p += sprintf ( p ,
			       "<br>"
			       "<br>"
			       "<table cellpadding=1 border=1>"
			       "<tr><td>#</td><td>termId</td>"
			       "<td>date</td><td>score</td>"
			       "<td>docId</td></tr>");

	// now print the score/docId of datedb list
	i = 0;
	for ( st->m_list2.resetListPtr () ;
	      ! st->m_list2.isExhausted () ;
	      st->m_list2.skipCurrentRecord () ) {
		// break if buf is low
		if ( p + 1024 >= pend ) break;
		// but set the ip/port to a host that has this titleRec
		// stored locally!
		long long docId = st->m_list2.getCurrentDocId () ;
		unsigned long groupId = g_titledb.getGroupId ( docId );
		// get the first host's hostId in this groupId
		Host *h = g_hostdb.getFastestHostInGroup ( groupId );
		// . pick the first host to handle the cached titleRec request
		// . we assume it has the best time and is up!! TODO: fix!
		// . use local ip though if it was an internal request
		// . otherwise, use the external ip
		//unsigned long ip = h->m_externalIp;
		unsigned long ip = h->m_ip;
		// use the NAT mapped port
		unsigned short port = h->m_externalHttpPort;
		// adjust ip/port if local
		if ( st->m_isLocal ) {
			ip   = h->m_ip;
			port = h->m_httpPort;
		}
		// debug
		char kb[16];
		st->m_list2.getCurrentKey(kb);
		//log(LOG_INFO,"debug: n1=%016llx n0=%016llx",
		//    *(long long *)(kb+8),*(long long *)(kb+0));
		//if ( (unsigned long)st->m_list2.getCurrentDate() == 0 )
		//	log("STOP");
		sprintf ( p ,
			  "<tr><td>%li.</td>"
			  "<td>%llu</td>"
			  "<td>%lu</td><td>%i</td>"
			  "<td>"
			  //"<a href=http://%s:%hu/admin/titledb?d=%llu>"
			  "<a href=/admin/titledb?c=%s&d=%llu>"
			  "%llu"
			  //"<td><a href=/cgi/4.cgi?d=%llu>%llu"
			  "</td></tr>\n" ,
			  i++,
			  st->m_list2.getTermId16(kb) ,
			  (unsigned long)st->m_list2.getCurrentDate() ,
			  (int)st->m_list2.getCurrentScore() ,
			  //iptoa(ip) , port ,
			  st->m_coll,
			  docId ,
			  docId );
		p += gbstrlen ( p );
	}
	*/

	if ( ! st->m_list.isEmpty() )
		pbuf->safePrintf ( "</table>" );

	// print msg if we could fit all into buf
	//if ( p + 1024 >= pend ) {
	//	sprintf ( p ,"... truncated ... no mem" );
	//	p += gbstrlen ( p );
	//}
	// print the final tail
	//p += g_httpServer.printTail ( p , pend - p );

	pbuf->safePrintf ( "</center>\n");

	// now encapsulate it in html head/tail and send it off
	bool status = g_httpServer.sendDynamicPage ( s ,
						     pbuf->getBufStart() ,
						     pbuf->length() );
	// delete the state
	mdelete ( st , sizeof(State10) , "PageIndexdb" );
	delete (st) ;
	return status;
}