bool gotTermFreq ( State10 *st ) {
	// set the term freq
	st->m_termFreq = st->m_msg36.getTermFreq();
	// reset
	st->m_i = 0;
	// . query each indexdb/datedb split
	// . returns false if blocked, true otherwise
	if ( ! launchRequests ( st ) ) return false;
	// if it completed, keep on chugging
	return gotIndexList ( (void *) st );
}
// . returns false if not all replies have been received (or timed out /
//   errored out)
// . returns true if done (or an error finished us)
// . sets g_errno on error
bool Msg37::gotTermFreq ( Msg36 *msg36 ) {
	long i ;
	long j;
	// if called from above skip down to bottom
	if ( ! msg36 ) goto skip;
	// . set our m_errno if there was an error so everyone else knows
	// . don't overwrite it if it's already set
	if ( g_errno && ! m_errno ) m_errno = g_errno;
	// . now m_linkInfo[i] (for some i, i dunno which) is filled
	m_numReplies++;
	// extract info we stored in there
	i = msg36->m_i ;
	j = msg36->m_j ;
	// sanity check
	if ( &m_msg36[j] != msg36 ) {
		log("query: msg37 failed sanity check 3.");
		char *xx = NULL; *xx = 0;
	}
	// if no error set our term freq
	if ( ! g_errno ) m_termFreqs[i] = msg36->m_termFreq;
	// sanity check
	if ( ! m_inUse[j] ) {
		log("query: msg37 failed sanity check 2.");
		char *xx = NULL; *xx = 0;
	}
	// mark as available
	m_inUse[j] = 0;
	// try to launch more, returns true if all done though
	if ( ! launchRequests() ) return false;
	// wait until we got all the replies before we attempt to merge
	//if ( m_numReplies < m_numRequests ) return false;
 skip:
	// . did we have an error from any reply?
	// . return true if we got all replies
	// . do not merge since someone had an error
	if ( m_errno ) { g_errno = m_errno ; return true; }
	// set all to 1 in case there's an error
	//for ( long i = 0 ; i < m_numTerms ; i++ ) {
	//	// skip if ignored
	//	//if ( m_termFreqs[i] == 0LL ) continue;
	//	m_termFreqs[i] = m_msg36[i].getTermFreq();
	//}
	// . return true cuz we're done
	// . g_errno may be set though
	return true;
}
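// . a rough sketch of the slot-based launch loop that a launchRequests()
//   in this pattern implies -- an illustration reconstructed from the
//   bookkeeping above (m_i, m_inUse[], m_numRequests/m_numReplies), NOT
//   the actual Msg37::launchRequests() source
// . returns true only when every term has been requested and every reply
//   has come back, matching "returns true if all done though"
/*
bool Msg37::launchRequests ( ) {
	// keep going while terms remain
	while ( m_i < m_numTerms ) {
		// find a free msg36 slot
		long j = 0;
		while ( j < MAX_MSG36_OUT && m_inUse[j] ) j++;
		// all slots busy? wait for a reply to free one up
		if ( j >= MAX_MSG36_OUT ) return false;
		// claim the slot; gotTermFreq() clears it on reply
		m_inUse[j] = 1;
		// remember which term and which slot this msg36 serves
		m_msg36[j].m_i = m_i;
		m_msg36[j].m_j = j;
		m_numRequests++;
		// ... launch m_msg36[j] for termId m_termIds[m_i] here ...
		m_i++;
	}
	// everything launched; done only when all replies are in
	return ( m_numReplies >= m_numRequests );
}
*/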
// returns false if blocked, true otherwise
bool Images::gotTermFreq ( ) {
	// error?
	if ( g_errno ) return true;
	// bail if less than 10
	//long long nt = m_msg36.getTermFreq();
	// each key but the first is 12 bytes (compressed)
	long long nt = (m_list.getListSize() - 6) / 12;
	// . return true, without g_errno set, we are done
	// . if we do not have 10 or more webpages that share this same
	//   template then do not do image extraction at all, it is too risky
	//   that we get a bad image
	// . MDW: for debugging, do not require 10 pages of same template
	//if ( nt < 10 ) return true;
	if ( nt < -2 ) return true;
	// now see which of the image urls are unique
	if ( ! launchRequests () ) return false;
	// i guess we did not block
	return true;
}
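// . a worked example of the key-count arithmetic above in
//   Images::gotTermFreq(), for illustration only: assuming the first key
//   in m_list is a full (uncompressed) key carrying 6 extra bytes on top
//   of the 12-byte compressed keys that follow, a list of n keys takes
//   6 + 12*n bytes, so n = (getListSize() - 6) / 12
// . e.g. 5 keys -> 18 + 4*12 = 66 bytes of list data, and
//   (66 - 6) / 12 = 5 keys recovered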
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . "termIds/termFreqs" should NOT be on the stack in case we block
// . i based this on ../titled/Msg25.cpp since it sends out multiple msgs at
//   the same time, too
bool Msg37::getTermFreqs ( collnum_t collnum , //char *coll ,
			   long maxAge ,
			   long long *termIds ,
			   long numTerms ,
			   long long *termFreqs ,
			   void *state ,
			   void (* callback)(void *state ) ,
			   long niceness ,
			   bool exactCount ) {
	// warning
	if ( collnum < 0 ) log(LOG_LOGIC,"net: bad collection. msg37.");
	// we haven't got any responses as of yet or sent any requests
	m_callback    = callback;
	m_state       = state;
	m_exactCount  = exactCount;
	m_niceness    = niceness;
	m_numReplies  = 0;
	m_numRequests = 0;
	m_errno       = 0;
	m_numTerms    = numTerms;
	m_termFreqs   = termFreqs;
	m_collnum     = collnum;
	//m_coll      = coll;
	m_maxAge      = maxAge;
	m_termIds     = termIds;
	// set all to 1 in case there's an error
	for ( long i = 0 ; i < m_numTerms ; i++ ) {
		//if ( ignore[i] ) m_termFreqs[i] = 0LL;
		//else             m_termFreqs[i] = 1LL;
		m_termFreqs[i] = 1LL;
	}
	// reset
	m_i = 0;
	memset ( m_inUse , 0 , MAX_MSG36_OUT );
	// launch the requests
	if ( ! launchRequests() ) return false;
	// set our array
	gotTermFreq ( NULL );
	// we did not block, return true
	return true;
}
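// . a minimal caller sketch for the blocking/callback contract above,
//   for illustration only: "TermFreqState", "gotTermFreqsWrapper" and
//   "requestTermFreqs" are hypothetical names, not part of Msg37
// . assumes Msg37.h (and thus collnum_t) is in scope
// . the point being shown: termIds/termFreqs live inside a heap-held
//   state object, never on the caller's stack, since getTermFreqs()
//   may block and complete later through the callback
struct TermFreqState {
	Msg37     m_msg37;
	long long m_termIds  [ 32 ];
	long long m_termFreqs[ 32 ];
	long      m_numTerms;
};

static void gotTermFreqsWrapper ( void *state ) {
	TermFreqState *st = (TermFreqState *)state;
	// g_errno is set if any reply had an error; a term whose reply
	// failed keeps the default freq of 1 set by getTermFreqs()
	// ... consume st->m_termFreqs[] here, then free "st" ...
}

// returns false if blocked, true otherwise, matching the convention above
static bool requestTermFreqs ( TermFreqState *st , collnum_t collnum ) {
	if ( ! st->m_msg37.getTermFreqs ( collnum             ,
					  0                   , // maxAge
					  st->m_termIds       ,
					  st->m_numTerms      ,
					  st->m_termFreqs     ,
					  st                  , // state
					  gotTermFreqsWrapper , // callback
					  0                   , // niceness
					  false             ) ) // exactCount
		return false; // blocked; wrapper fires later
	// did not block; st->m_termFreqs[] is already filled in
	return true;
}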
// . make a web page from results stored in msg40
// . send it on TcpSocket "s" when done
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool gotIndexList ( void *state ) {
	// the state
	State10 *st = (State10 *) state;
	// launch more
	if ( ! launchRequests ( st ) ) return false;

	/*
	// get the date list
	//fprintf(stderr,"termId now=%lli\n",st->m_termId);
	//fprintf(stderr,"should be=%lli\n",(st->m_termId & TERMID_MASK));
	// . now get the indexList for this termId
	// . date is complemented, so start with bigger one first
	key128_t startKey = g_datedb.makeStartKey ( st->m_termId ,0xffffffff);
	key128_t endKey   = g_datedb.makeEndKey   ( st->m_termId ,0x0);
	// get the rdb ptr to titledb's rdb
	//Rdb *rdb = g_indexdb.getRdb();
	// -1 means read from all files in Indexdb
	long numFiles = -1;
	// make it zero if caller doesn't want to hit the disk
	if ( ! st->m_useDisk ) numFiles = 0;
	// get the title rec at or after this docId
	if ( ! st->m_msg0.getList ( -1 , 0 , 0 , 0 , // max cache age
				    false        , // add to cache?
				    RDB_DATEDB   , // rdbId of 2 = indexdb
				    st->m_coll   ,
				    &st->m_list2 ,
				    (char *)&startKey ,
				    (char *)&endKey ,
				    st->m_numRecs * sizeof(key128_t),//recSizes
				    //st->m_useTree  , // include tree?
				    //st->m_useCache , // include cache?
				    //false          , // add to cache?
				    //0              , // startFileNum
				    //numFiles       , // numFiles
				    st , // state
				    gotIndexListWrapper2 ,
				    0 ) ) // niceness
		return false;
	// otherwise call gotResults which returns false if blocked, true else
	// and sets g_errno on error
	return gotIndexList2 ( (void *) st , NULL );
}

void gotIndexListWrapper2 ( void *state , RdbList *list ) {
	gotIndexList2 ( state , list );
}

void addedKeyWrapper ( void *state ) {
	gotIndexList2 ( state, NULL );
}

// . make a web page from results stored in msg40
// . send it on TcpSocket "s" when done
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool gotIndexList2 ( void *state , RdbList *list ) {
	// the state
	State10 *st = (State10 *) state;
	*/

	// get the socket
	TcpSocket *s = st->m_socket;

	// don't allow pages bigger than 128k in cache
	//char buf [ 64*1024 ];
	// a ptr into "buf"
	//char *p    = buf;
	//char *pend = buf + 64*1024;

	/*
	// get termId
	key_t k = *(key_t *)st->m_list.getStartKey();
	long long termId = g_indexdb.getTermId ( k );
	// get groupId from termId
	//unsigned long groupId = k.n1 & g_hostdb.m_groupMask;
	unsigned long groupId = g_indexdb.getGroupIdFromKey ( &k );
	long hostnum = g_hostdb.makeHostId ( groupId );
	*/

	// check box " checked" strings
	char *ubs = "";
	char *uts = "";
	char *uds = "";
	char *ucs = "";
	char *add = "";
	char *del = "";
	if ( st->m_useDatedb ) ubs = " checked";
	if ( st->m_useTree   ) uts = " checked";
	if ( st->m_useDisk   ) uds = " checked";
	if ( st->m_useCache  ) ucs = " checked";
	if ( st->m_add       ) add = " checked";
	if ( st->m_del       ) del = " checked";

	SafeBuf *pbuf = &st->m_pbuf;

	g_pages.printAdminTop ( pbuf , st->m_socket , &st->m_r );

	// get base, returns NULL and sets g_errno to ENOCOLLREC on error
	RdbBase *base;
	if (!(base=getRdbBase((uint8_t)RDB_INDEXDB,st->m_coll))) return true;

	// print the standard header for admin pages
	pbuf->safePrintf ( "<center>\n"
			   "<table cellpadding=2><tr><td colspan=4>"
			   "useDatedb:<input type=checkbox value=1 name=ub%s> "
			   "useTree:<input type=checkbox value=1 name=ut%s> "
			   "useDisk:<input type=checkbox value=1 name=ud%s> "
			   "useCache:<input type=checkbox value=1 name=uc%s> "
			   "ADD:<input type=checkbox value=1 name=add%s> "
			   "DELETE:<input type=checkbox value=1 name=del%s>"
			   "</td></tr><tr><td>"
			   "query:"
			   "</td><td>"
			   "<input type=text name=q value=\"%s\" size=20>"
			   "</td><td>"
			   "collection:"
			   "</td><td>"
			   "<input type=text name=c value=\"%s\" size=10>"
			   "</td></tr><tr><td>"
			   "termId:"
			   "</td><td>"
			   "<input type=text name=t value=%lli size=20>"
			   "</td><td>"
			   "numRecs:"
			   "</td><td>"
			   "<input type=text name=numRecs value=%li size=10> "
			   "</td></tr><tr><td>"
			   "docId:"
			   "</td><td>"
			   "<input type=text name=d value=%lli size=20> "
			   "</td><td>"
			   "score:"
			   "</td><td>"
			   "<input type=text name=score value=%li size=10> "
			   "</td><td>"
			   "<input type=submit value=ok border=0>"
			   "</td></tr>"
			   "<tr><td colspan=2>"
			   "term appears in about %lli docs +/- %li"
			   "</td></tr>"
			   //"<tr><td colspan=2>"
			   //"this indexlist held by host #%li and twins"
			   //"</td></tr>"
			   "</table>"
			   "</form><br><br>" ,
			   ubs, uts, uds, ucs, add, del,
			   st->m_query ,
			   st->m_coll ,
			   st->m_termId ,
			   st->m_numRecs ,
			   st->m_docId ,
			   (long)st->m_score ,
			   st->m_termFreq ,
			   2 * (long)GB_INDEXDB_PAGE_SIZE / 6 *
			   base->getNumFiles() );
			   //hostnum );

	if ( g_errno || (st->m_list.isEmpty() ) ) {//&&st->m_list2.isEmpty())){
		if (g_errno)pbuf->safePrintf("Error = %s",mstrerror(g_errno));
		else        pbuf->safePrintf("List is empty");
		pbuf->safePrintf("</center>");
		// erase g_errno for sending
		g_errno = 0;
		// now encapsulate it in html head/tail and send it off
		bool status = g_httpServer.sendDynamicPage(s ,
							   pbuf->getBufStart(),
							   pbuf->length() );
		// delete it
		mdelete ( st , sizeof(State10) , "PageIndexdb" );
		delete (st);
		return status;
	}

	pbuf->safePrintf ( "<table cellpadding=1 border=1>"
			   "<tr><td>#</td><td>score</td>"
			   "<td>docId</td><td>domHash</td></tr>");

	//if ( searchingEvents

	// now print the score/docId of indexlist
	long i = 0;
	for (   st->m_list.resetListPtr () ;
	      ! st->m_list.isExhausted  () ;
	        st->m_list.skipCurrentRecord () ) {
		// break if buf is low
		//if ( p + 1024 >= pend ) break;
		// but set the ip/port to a host that has this titleRec
		// stored locally!
		long long docId = st->m_list.getCurrentDocId () ;
		unsigned long groupId = getGroupIdFromDocId ( docId );
		// get the first host's hostId in this groupId
		Host *h = g_hostdb.getFastestHostInGroup ( groupId );
		// . pick the first host to handle the cached titleRec request
		// . we assume it has the best time and is up!! TODO: fix!
		// . use local ip though if it was an internal request
		// . otherwise, use the external ip
		//unsigned long ip = h->m_externalIp;
		unsigned long ip = h->m_ip;
		// use the NAT mapped port
		unsigned short port = h->m_externalHttpPort;
		// log the first docid so we can blaster url: queries
		// to PageIndexdb and see if they are in indexdb
		if ( i == 0 )
			logf(LOG_INFO,"indexdb: %llu %s",docId,st->m_query);
		// adjust ip/port if local
		if ( st->m_isLocal ) {
			ip   = h->m_ip;
			port = h->m_httpPort;
		}
		unsigned long date = 0;
		if ( st->m_useDatedb )
			date = (unsigned long)st->m_list.getCurrentDate();
		uint8_t dh = g_titledb.getDomHash8FromDocId ( docId );
		char ds[32];
		ds[0] = 0;
		if ( st->m_useDatedb ) sprintf (ds,"%lu/",date);
		pbuf->safePrintf ( "<tr><td>%li.</td>"
				   "<td>%s%i</td>"
				   "<td>"
				   //"<a href=http://%s:%hu/master/titledb?d=%llu>"
				   "<a href=/master/titledb?c=%s&d=%llu>"
				   "%llu"
				   //"<td><a href=/cgi/4.cgi?d=%llu>%llu"
				   "</td>"
				   "<td>"
				   "0x%02lx"
				   "</td>"
				   "</tr>\n" ,
				   i++,
				   ds,
				   (int)st->m_list.getCurrentScore() ,
				   //iptoa(ip) , port ,
				   st->m_coll,
				   docId ,
				   docId ,
				   (long)dh );
	}

	pbuf->safePrintf ( "</table>" );

	/*
	if ( ! st->m_list2.isEmpty() )
		p += sprintf ( p ,
			       "<br>"
			       "<br>"
			       "<table cellpadding=1 border=1>"
			       "<tr><td>#</td><td>termId</td>"
			       "<td>date</td><td>score</td>"
			       "<td>docId</td></tr>");

	// now print the score/docId of datedb list
	i = 0;
	for (   st->m_list2.resetListPtr () ;
	      ! st->m_list2.isExhausted  () ;
	        st->m_list2.skipCurrentRecord () ) {
		// break if buf is low
		if ( p + 1024 >= pend ) break;
		// but set the ip/port to a host that has this titleRec
		// stored locally!
		long long docId = st->m_list2.getCurrentDocId () ;
		unsigned long groupId = g_titledb.getGroupId ( docId );
		// get the first host's hostId in this groupId
		Host *h = g_hostdb.getFastestHostInGroup ( groupId );
		// . pick the first host to handle the cached titleRec request
		// . we assume it has the best time and is up!! TODO: fix!
		// . use local ip though if it was an internal request
		// . otherwise, use the external ip
		//unsigned long ip = h->m_externalIp;
		unsigned long ip = h->m_ip;
		// use the NAT mapped port
		unsigned short port = h->m_externalHttpPort;
		// adjust ip/port if local
		if ( st->m_isLocal ) {
			ip   = h->m_ip;
			port = h->m_httpPort;
		}
		// debug
		char kb[16];
		st->m_list2.getCurrentKey(kb);
		//log(LOG_INFO,"debug: n1=%016llx n0=%016llx",
		//    *(long long *)(kb+8),*(long long *)(kb+0));
		//if ( (unsigned long)st->m_list2.getCurrentDate() == 0 )
		//	log("STOP");
		sprintf ( p ,
			  "<tr><td>%li.</td>"
			  "<td>%llu</td>"
			  "<td>%lu</td><td>%i</td>"
			  "<td>"
			  //"<a href=http://%s:%hu/master/titledb?d=%llu>"
			  "<a href=/master/titledb?c=%s&d=%llu>"
			  "%llu"
			  //"<td><a href=/cgi/4.cgi?d=%llu>%llu"
			  "</td></tr>\n" ,
			  i++,
			  st->m_list2.getTermId16(kb) ,
			  (unsigned long)st->m_list2.getCurrentDate() ,
			  (int)st->m_list2.getCurrentScore() ,
			  //iptoa(ip) , port ,
			  st->m_coll,
			  docId ,
			  docId );
		p += gbstrlen ( p );
	}
	*/

	if ( ! st->m_list.isEmpty() ) pbuf->safePrintf ( "</table>" );

	// print msg if we could fit all into buf
	//if ( p + 1024 >= pend ) {
	//	sprintf ( p ,"... truncated ... no mem" );
	//	p += gbstrlen ( p );
	//}

	// print the final tail
	//p += g_httpServer.printTail ( p , pend - p );

	pbuf->safePrintf ( "</center>\n");

	// now encapsulate it in html head/tail and send it off
	bool status = g_httpServer.sendDynamicPage ( s ,
						     pbuf->getBufStart() ,
						     pbuf->length() );
	// delete the state
	mdelete ( st , sizeof(State10) , "PageIndexdb" );
	delete (st) ;
	return status;
}
// . get various information for each url in a list of urls
// . urls in "urlBuf" are \0 terminated
// . used to be called getSiteRecs()
// . you can pass in a list of docIds rather than urlPtrs
bool Msge0::getTagRecs ( char        **urlPtrs ,
			 linkflags_t  *urlFlags , //Links::m_linkFlags
			 long          numUrls ,
			 // if skipOldLinks && urlFlags[i]&LF_OLDLINK, skip it
			 bool          skipOldLinks ,
			 TagRec       *baseTagRec ,
			 collnum_t     collnum ,
			 long          niceness ,
			 void         *state ,
			 void        (*callback)(void *state) ) {
	reset();
	// bail if no urls or linkee
	if ( numUrls <= 0 ) return true;

	// save all input parms
	m_urlPtrs      = urlPtrs;
	m_urlFlags     = urlFlags;
	m_numUrls      = numUrls;
	m_skipOldLinks = skipOldLinks;
	m_baseTagRec   = baseTagRec;
	m_collnum      = collnum;
	m_niceness     = niceness;
	m_state        = state;
	m_callback     = callback;

	// . how much mem to alloc?
	// . include an extra 4 bytes for each one to hold possible errno
	long need = 4 + // error
		    4 + // tag ptr
		    4 ; // slab ptr
	// one per url
	need *= numUrls;
	// allocate the buffer to hold all the info we gather
	m_buf = (char *)mcalloc ( need , "Msge0buf" );
	if ( ! m_buf ) return true;
	m_bufSize = need;
	// clear it all
	memset ( m_buf , 0 , m_bufSize );
	// set the ptrs!
	char *p = m_buf;
	m_tagRecErrors = (long    *)p ; p += numUrls * 4;
	m_tagRecPtrs   = (TagRec **)p ; p += numUrls * 4;
	m_slab         = (char   **)p ; p += numUrls * 4;
	// initialize
	m_numRequests = 0;
	m_numReplies  = 0;
	// . point to first url to process
	// . url # m_n
	m_n = 0;
	// clear the m_used flags
	memset ( m_used , 0 , MAX_OUTSTANDING_MSGE0 );
	// . launch the requests
	// . a request can be a msg8a, msgc, msg50 or msg20 request depending
	//   on what we need to get
	// . when a reply returns, the next request is launched for that url
	// . we keep a msgESlot state for each active url in the buffer
	// . we can have up to MAX_ACTIVE urls active
	if ( ! launchRequests ( 0 ) ) return false;
	// none blocked, we are done
	return true;
}
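// . a worked example of the buffer carve-up in Msge0::getTagRecs() above,
//   for illustration only (the hard-coded 4-byte slots match sizeof(long)
//   and the pointer size on the 32-bit builds this code assumes):
// . with numUrls = 100:
//       need           = (4 + 4 + 4) * 100 = 1200 bytes
//       m_tagRecErrors = m_buf +   0   (100 longs,      400 bytes)
//       m_tagRecPtrs   = m_buf + 400   (100 TagRec *'s, 400 bytes)
//       m_slab         = m_buf + 800   (100 char *'s,   400 bytes)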