// Unregister and destroy the per-request State22.
static void msg22FreeState ( State22 *st ) {
	mdelete ( st , sizeof(State22) , "Msg22" );
	delete st;
}

// Log the current g_errno, send it back as an error reply on the request's
// udp slot and free the request state. g_errno must be non-zero on entry.
static void msg22SendErrorAndFree ( State22 *st ) {
	log("db: Had error getting title record from titledb: %s.",
	    mstrerror(g_errno));
	// sanity: an error path with g_errno unset is a programming error,
	// so deliberately core here
	if ( ! g_errno ) { char *xx=NULL;*xx=0; }
	g_udpServer.sendErrorReply ( st->m_slot , g_errno );
	msg22FreeState ( st );
}

// Completion callback for the titledb list read of a Msg22 request.
//
// "state" is the State22 of the request. The records scanned are those in
// st->m_tlist; the "list" and "msg5" parameters are not used here.
//
// Exactly one reply is sent on st->m_slot and the State22 is freed before
// returning, on every path:
//  . g_errno set on entry, probable-docid mismatch, reply allocation failure,
//    or a docid-based request with no match -> error reply
//  . a matching positive record, with m_justCheckTfndb set -> empty reply
//  . a matching positive record otherwise -> that record is the reply
//  . no match, and the request had a url, m_justCheckTfndb or
//    m_getAvailDocIdOnly -> 8-byte reply holding an available docid
//    (0 if none was available)
void gotTitleList ( void *state , RdbList *list , Msg5 *msg5 ) {

	State22 *st = (State22 *)state;
	UdpServer *us = &g_udpServer;
	// shortcut
	Msg22Request *r = st->m_r;
	// breathe
	QUICKPOLL(r->m_niceness);

	// send error reply on error
	if ( g_errno ) {
		msg22SendErrorAndFree ( st );
		return;
	}

	// convenience var
	RdbList *tlist = &st->m_tlist;

	// set probable docid; it must agree with the one stored in the state
	long long pd = 0LL;
	if ( r->m_url[0] ) {
		pd = g_titledb.getProbableDocId(r->m_url);
		if ( pd != st->m_pd ) {
			log("db: crap probable docids do not match! u=%s",
			    r->m_url);
			g_errno = EBADENGINEER;
			msg22SendErrorAndFree ( st );
			return;
		}
	}

	// the probable docid is the PREFERRED docid in this case
	if ( r->m_getAvailDocIdOnly ) pd = st->m_r->m_docId;

	// . these are both meant to be available docids
	// . if ad2 gets exhausted we use ad1
	long long ad1 = st->m_docId1;
	long long ad2 = pd;

	// scan the titleRecs in the list
	for ( ; ! tlist->isExhausted() ; tlist->skipCurrentRecord ( ) ) {
		// breathe
		QUICKPOLL ( r->m_niceness );
		// get the rec and its key
		char *rec     = tlist->getCurrentRec();
		long  recSize = tlist->getCurrentRecSize();
		key_t *k = (key_t *)rec;
		// skip negative recs (low key bit clear)
		if ( ( k->n0 & 0x01 ) == 0x00 ) continue;
		// get docid of that titlerec
		long long dd = g_titledb.getDocId(k);

		if ( r->m_getAvailDocIdOnly ) {
			// a docid already in use is not available; step past
			if ( dd == ad1 ) ad1++;
			if ( dd == ad2 ) ad2++;
			continue;
		}
		// if we had a url make sure uh48 matches
		else if ( r->m_url[0] ) {
			long long uh48 = g_titledb.getUrlHash48(k);
			// sanity check
			if ( st->m_uh48 == 0 ) { char *xx=NULL;*xx=0; }
			// a docid already in use is not available; step past
			if ( dd == ad1 ) ad1++;
			if ( dd == ad2 ) ad2++;
			// we must match this exactly
			if ( uh48 != st->m_uh48 ) continue;
		}
		// otherwise the request is docid based
		else {
			if ( r->m_docId != dd ) continue;
		}

		// . this record matches the request
		// . if just "checking tfndb" no need to go further
		if ( r->m_justCheckTfndb ) {
			// send back a good reply (empty means found!)
			us->sendReply_ass ( NULL,0,NULL,0,st->m_slot);
			msg22FreeState ( st );
			return;
		}

		// . send this rec back, it's a match
		// . if the rec spans the list's whole allocation, steal the
		//   list's memory, otherwise copy the rec into a new buffer
		char *reply = rec;
		if ( recSize != tlist->getAllocSize() ) {
			reply = (char *)mmalloc (recSize, "Msg22");
			if ( ! reply ) {
				msg22SendErrorAndFree ( st );
				return;
			}
			memcpy ( reply , rec , recSize );
		}
		else {
			// we stole this from list, so the list must not
			// treat the buffer as its own any more
			tlist->m_ownData = false;
		}
		// off ya go
		us->sendReply_ass(reply,recSize,reply,recSize,st->m_slot);
		msg22FreeState ( st );
		return;
	}

	// maybe no available docid if we breached our range
	if ( ad1 >= pd           ) ad1 = 0LL;
	if ( ad2 >  st->m_docId2 ) ad2 = 0LL;
	// get best, but wrap around to ad1 if we need to
	long long ad = ad2;
	if ( ad == 0LL ) ad = ad1;
	// remember it. this might be zero if none exist!
	st->m_availDocId = ad;
	// note it
	if ( ad == 0LL && (r->m_getAvailDocIdOnly || r->m_url[0]) )
		log("msg22: avail docid is 0 for pd=%lli!",pd);

	// ok, return an available docid
	if ( r->m_url[0] || r->m_justCheckTfndb || r->m_getAvailDocIdOnly ) {
		// store docid in the slot's temporary buffer
		char *p = st->m_slot->m_tmpBuf;
		// memcpy rather than a casted store: nothing visible here
		// guarantees the char buffer is aligned for a long long
		long long avail = st->m_availDocId;
		memcpy ( p , &avail , sizeof(avail) );
		// send it
		us->sendReply_ass ( p , 8 , p , 8 , st->m_slot );
		msg22FreeState ( st );
		return;
	}

	// not found! and it was a docid based request...
	log("msg22: could not find title rec for docid %llu",r->m_docId);
	g_errno = ENOTFOUND;
	msg22SendErrorAndFree ( st );
}
int main(int argc, char **argv) { if (argc < 3) { print_usage(argv[0]); return 1; } if (strcmp(argv[1], "--h") == 0 || strcmp(argv[1], "--help") == 0 ) { print_usage(argv[0]); return 1; } g_log.m_disabled = true; // initialize library g_mem.init(); hashinit(); // current dir char path[PATH_MAX]; realpath(argv[1], path); size_t pathLen = strlen(path); if (path[pathLen] != '/') { strcat(path, "/"); } g_hostdb.init(-1, NULL, false, false, path); g_conf.init(path); ucInit(); // initialize rdbs g_loop.init(); g_collectiondb.loadAllCollRecs(); g_statsdb.init(); g_posdb.init(); g_titledb.init(); g_tagdb.init(); g_spiderdb.init(); g_doledb.init(); g_spiderCache.init(); g_clusterdb.init(); g_linkdb.init(); g_collectiondb.addRdbBaseToAllRdbsForEachCollRec(); g_log.m_disabled = false; g_log.m_logPrefix = false; uint64_t docId = strtoul(argv[2], NULL, 10); logf(LOG_TRACE, "Getting titlerec for docId=%" PRIu64, docId); Msg5 msg5; RdbList list; key96_t startKey = Titledb::makeFirstKey(docId); key96_t endKey = Titledb::makeLastKey(docId); msg5.getList(RDB_TITLEDB, 0, &list, startKey, endKey, 500000000, true, 0, 0, -1, NULL, NULL, 0, true, NULL, 0, -1, -1LL, false, true); if (list.getNumRecs() != 1) { logf(LOG_TRACE, "Unable to find titlerec for docId=%" PRIu64, docId); cleanup(); exit(1); } XmlDoc xmlDoc; if (!xmlDoc.set2(list.getCurrentRec(), list.getCurrentRecSize(), "main", NULL, 0)) { logf(LOG_TRACE, "Unable to set XmlDoc for docId=%" PRIu64, docId); cleanup(); exit(1); } logf(LOG_TRACE, "XmlDoc info"); logf(LOG_TRACE, "\tfirstUrl : %.*s", xmlDoc.size_firstUrl, xmlDoc.ptr_firstUrl); logf(LOG_TRACE, "\tredirUrl : %.*s", xmlDoc.size_redirUrl, xmlDoc.ptr_redirUrl); logf(LOG_TRACE, "\trootTitle : %.*s", xmlDoc.size_rootTitleBuf, xmlDoc.ptr_rootTitleBuf); // logf(LOG_TRACE, "\timageData :"); logf(LOG_TRACE, "\t"); loghex(LOG_TRACE, xmlDoc.ptr_utf8Content, xmlDoc.size_utf8Content, "\tutf8Content:"); logf(LOG_TRACE, "\tsite : %.*s", xmlDoc.size_site, xmlDoc.ptr_site); 
logf(LOG_TRACE, "\tlinkInfo"); LinkInfo* linkInfo = xmlDoc.getLinkInfo1(); logf(LOG_TRACE, "\t\tm_numGoodInlinks : %d", linkInfo->m_numGoodInlinks); logf(LOG_TRACE, "\t\tm_numInlinksInternal : %d", linkInfo->m_numInlinksInternal); logf(LOG_TRACE, "\t\tm_numStoredInlinks : %d", linkInfo->m_numStoredInlinks); int i = 0; for (Inlink *inlink = linkInfo->getNextInlink(NULL); inlink; inlink = linkInfo->getNextInlink(inlink)) { logf(LOG_TRACE, "\t\tinlink #%d", i++); logf(LOG_TRACE, "\t\t\tdocId : %" PRIu64, inlink->m_docId); logf(LOG_TRACE, "\t\t\turl : %s", inlink->getUrl()); logf(LOG_TRACE, "\t\t\tlinktext : %s", inlink->getLinkText()); logf(LOG_TRACE, "\t\t\tcountry : %s", getCountryCode(inlink->m_country)); logf(LOG_TRACE, "\t\t\tlanguage : %s", getLanguageAbbr(inlink->m_language)); } loghex(LOG_TRACE, xmlDoc.ptr_linkdbData, xmlDoc.size_linkdbData, "\tlinkdbData"); logf(LOG_TRACE, "\ttagRec"); TagRec *tagRec = xmlDoc.getTagRec(); for (Tag *tag = tagRec->getFirstTag(); tag; tag = tagRec->getNextTag(tag)) { SafeBuf sb; tag->printDataToBuf(&sb); logf(LOG_TRACE, "\t\t%-12s: %s", getTagStrFromType(tag->m_type), sb.getBufStart()); } logf(LOG_TRACE, "\t"); logf(LOG_TRACE, "Links info"); g_log.m_disabled = true; Links *links = xmlDoc.getLinks(); g_log.m_disabled = false; for (int i = 0; i < links->getNumLinks(); ++i) { logf(LOG_TRACE, "\tlink : %.*s", links->getLinkLen(i), links->getLinkPtr(i)); } cleanup(); return 0; }