// . send as many zids with a msg4d request as you can // . that way we check a bunch off all at once bool Syncdb::gotList ( ) { // bigLoop() can be called again now m_outstanding = false; // error? if ( g_errno ) { log("sync: had error in msg5: %s",mstrerror(g_errno)); return false; } // shortcut RdbList *m = &m_list; // just in case m->resetListPtr(); // get the rec char *rec = m->getCurrentRec(); // get key key128_t k = *(key128_t *)rec; // sanity check if ( k != m_addMe[m_ia] ) { char *xx=NULL;*xx=0;} // . add it using msg4.cpp::addMetaList() // . sets g_errno and returns false on error if ( ! addMetaList ( rec ) ) return false; // we no longer have to add it! m_qt.deleteNode ( 0 , (char *)&k , true ); // point to next m_ia++; // free it m_list.reset(); // success return true; }
// . slot should be auto-nuked upon transmission or error // . TODO: ensure if this sendReply() fails does it really nuke the slot? void gotListWrapper ( void *state , RdbList *listb , Msg5 *msg5xx ) { logTrace( g_conf.m_logTraceMsg0, "BEGIN" ); // get the state State00 *st0 = (State00 *)state; // extract the udp slot and list and msg5 UdpSlot *slot = st0->m_slot; RdbList *list = &st0->m_list; Msg5 *msg5 = &st0->m_msg5; UdpServer *us = st0->m_us; // timing debug if ( g_conf.m_logTimingNet || g_conf.m_logDebugNet ) { //log("Msg0:hndled request %" PRIu64,gettimeofdayInMilliseconds()); int32_t size = -1; if ( list ) size = list->getListSize(); log(LOG_TIMING|LOG_DEBUG, "net: msg0: Handled request for data. " "Now sending data termId=%" PRIu64" size=%" PRId32 " transId=%" PRId32" ip=%s port=%i took=%" PRId64" " "(niceness=%" PRId32").", g_posdb.getTermId(msg5->m_startKey), size,slot->m_transId, iptoa(slot->m_ip),slot->m_port, gettimeofdayInMilliseconds() - st0->m_startTime , st0->m_niceness ); } // on error nuke the list and it's data if ( g_errno ) { mdelete ( st0 , sizeof(State00) , "Msg0" ); delete (st0); // TODO: free "slot" if this send fails log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__); us->sendErrorReply ( slot , g_errno ); return; } QUICKPOLL(st0->m_niceness); // point to the serialized list in "list" char *data = list->getList(); int32_t dataSize = list->getListSize(); char *alloc = list->getAlloc(); int32_t allocSize = list->getAllocSize(); // tell list not to free the data since it is a reply so UdpServer // will free it when it destroys the slot list->setOwnData ( false ); // keep track of stats Rdb *rdb = getRdbFromId ( st0->m_rdbId ); if ( rdb ) rdb->sentReplyGet ( dataSize ); // TODO: can we free any memory here??? // keep track of how long it takes to complete the send st0->m_startTime = gettimeofdayInMilliseconds(); // debug point int32_t oldSize = msg5->m_minRecSizes; int32_t newSize = msg5->m_minRecSizes + 20; // watch for wrap around if ( newSize < oldSize ) newSize = 0x7fffffff; if ( dataSize > newSize && list->getFixedDataSize() == 0 && // do not annoy me with these linkdb msgs dataSize > newSize+100 ) log(LOG_LOGIC,"net: msg0: Sending more data than what was " "requested. Ineffcient. Bad engineer. dataSize=%" PRId32" " "minRecSizes=%" PRId32".",dataSize,oldSize); // // for linkdb lists, remove all the keys that have the same IP32 // and store a count of what we removed somewhere // if ( st0->m_rdbId == RDB_LINKDB ) { // store compressed list on itself char *dst = list->m_list; // keep stats int32_t totalOrigLinks = 0; int32_t ipDups = 0; int32_t lastIp32 = 0; char *listEnd = list->getListEnd(); // compress the list for ( ; ! list->isExhausted() ; list->skipCurrentRecord() ) { // breathe QUICKPOLL ( st0->m_niceness ); // count it totalOrigLinks++; // get rec char *rec = list->getCurrentRec(); int32_t ip32 = g_linkdb.getLinkerIp_uk((key224_t *)rec ); // same as one before? if ( ip32 == lastIp32 && // are we the last rec? include that for // advancing the m_nextKey in Linkdb more // efficiently. rec + LDBKS < listEnd ) { ipDups++; continue; } // store it gbmemcpy (dst , rec , LDBKS ); dst += LDBKS; // update it lastIp32 = ip32; } // . if we removed one key, store the stats // . caller should recognize reply is not a multiple of // the linkdb key size LDBKS and no its there! if ( ipDups ) { //*(int32_t *)dst = totalOrigLinks; //dst += 4; //*(int32_t *)dst = ipDups; //dst += 4; } // update list parms list->m_listSize = dst - list->m_list; list->m_listEnd = list->m_list + list->m_listSize; data = list->getList(); dataSize = list->getListSize(); } //log("sending replySize=%" PRId32" min=%" PRId32,dataSize,msg5->m_minRecSizes); // . TODO: dataSize may not equal list->getListMaxSize() so // Mem class may show an imblanace // . now g_udpServer is responsible for freeing data/dataSize // . the "true" means to call doneSending_ass() from the signal handler // if need be st0->m_us->sendReply_ass( data, dataSize, alloc, allocSize, slot, st0, doneSending_ass, -1, -1, true ); logTrace( g_conf.m_logTraceMsg0, "END" ); }
// . slot should be auto-nuked upon transmission or error // . TODO: ensure if this sendReply() fails does it really nuke the slot? void gotListWrapper ( void *state , RdbList *listb , Msg5 *msg5xx ) { // get the state State00 *st0 = (State00 *)state; // extract the udp slot and list and msg5 UdpSlot *slot = st0->m_slot; RdbList *list = &st0->m_list; Msg5 *msg5 = &st0->m_msg5; UdpServer *us = st0->m_us; // sanity check -- ensure they match //if ( niceness != st0->m_niceness ) // log("Msg0: niceness mismatch"); // debug msg //if ( niceness != 0 ) // log("HEY! niceness is not 0"); // timing debug if ( g_conf.m_logTimingNet || g_conf.m_logDebugNet ) { //log("Msg0:hndled request %"UINT64"",gettimeofdayInMilliseconds()); int32_t size = -1; if ( list ) size = list->getListSize(); log(LOG_TIMING|LOG_DEBUG, "net: msg0: Handled request for data. " "Now sending data termId=%"UINT64" size=%"INT32"" " transId=%"INT32" ip=%s port=%i took=%"INT64" " "(niceness=%"INT32").", g_posdb.getTermId(msg5->m_startKey), size,slot->m_transId, iptoa(slot->m_ip),slot->m_port, gettimeofdayInMilliseconds() - st0->m_startTime , st0->m_niceness ); } // debug //if ( ! msg5->m_includeTree ) // log("hotit\n"); // on error nuke the list and it's data if ( g_errno ) { mdelete ( st0 , sizeof(State00) , "Msg0" ); delete (st0); // TODO: free "slot" if this send fails us->sendErrorReply ( slot , g_errno ); return; } QUICKPOLL(st0->m_niceness); // point to the serialized list in "list" char *data = list->getList(); int32_t dataSize = list->getListSize(); char *alloc = list->getAlloc(); int32_t allocSize = list->getAllocSize(); // tell list not to free the data since it is a reply so UdpServer // will free it when it destroys the slot list->setOwnData ( false ); // keep track of stats Rdb *rdb = getRdbFromId ( st0->m_rdbId ); if ( rdb ) rdb->sentReplyGet ( dataSize ); // TODO: can we free any memory here??? // keep track of how long it takes to complete the send st0->m_startTime = gettimeofdayInMilliseconds(); // debug point int32_t oldSize = msg5->m_minRecSizes; int32_t newSize = msg5->m_minRecSizes + 20; // watch for wrap around if ( newSize < oldSize ) newSize = 0x7fffffff; if ( dataSize > newSize && list->getFixedDataSize() == 0 && // do not annoy me with these linkdb msgs dataSize > newSize+100 ) log(LOG_LOGIC,"net: msg0: Sending more data than what was " "requested. Ineffcient. Bad engineer. dataSize=%"INT32" " "minRecSizes=%"INT32".",dataSize,oldSize); /* // always compress these lists if ( st0->m_rdbId == RDB_SECTIONDB ) { // && 1 == 3) { // get sh48, the sitehash key128_t *startKey = (key128_t *)msg5->m_startKey ; int64_t sh48 = g_datedb.getTermId(startKey); // debug //log("msg0: got sectiondblist from disk listsize=%"INT32"", // list->getListSize()); if ( dataSize > 50000 ) log("msg0: sending back list rdb=%"INT32" " "listsize=%"INT32" sh48=0x%"XINT64"", (int32_t)st0->m_rdbId, dataSize, sh48); // save it int32_t origDataSize = dataSize; // store compressed list on itself char *dst = list->m_list; // warn if niceness is 0! if ( st0->m_niceness == 0 ) log("msg0: compressing sectiondb list at niceness 0!"); // compress the list uint32_t lastVoteHash32 = 0LL; SectionVote *lastVote = NULL; for ( ; ! list->isExhausted() ; list->skipCurrentRecord() ) { // breathe QUICKPOLL ( st0->m_niceness ); // get rec char *rec = list->getCurrentRec(); // for ehre key128_t *key = (key128_t *)rec; // the score is the bit which is was set in // Section::m_flags for that docid int32_t secType = g_indexdb.getScore ( (char *)key ); // 0 means it probably used to count # of voters // from this site, so i don't think xmldoc uses // that any more if ( secType == SV_SITE_VOTER ) continue; // treat key like a datedb key and get the taghash uint32_t h32 = g_datedb.getDate ( key ); // get data/vote from the current record in the // sectiondb list SectionVote *sv=(SectionVote *)list->getCurrentData (); // get the average score for this doc float avg = sv->m_score ; if ( sv->m_numSampled > 0.0 ) avg /= sv->m_numSampled; // if same as last guy, add to it if ( lastVoteHash32 == h32 && lastVote ) { // turn possible multi-vote into single docid // into a single vote, with the score averaged. lastVote->m_score += avg; lastVote->m_numSampled++; continue; } // otherwise, add in a new guy! *(key128_t *)dst = *key; dst += sizeof(key128_t); // the new vote SectionVote *dsv = (SectionVote *)dst; dsv->m_score = avg; dsv->m_numSampled = 1; // set this lastVote = dsv; lastVoteHash32 = h32; // skip over dst += sizeof(SectionVote); } // update the list size now for sending back dataSize = dst - data; // if the list was over the requested minrecsizes we need // to set a flag so that the caller will do a re-call. // so making the entire odd, will be the flag. if ( origDataSize > msg5->m_minRecSizes && dataSize < origDataSize ) { *dst++ = '\0'; dataSize++; } // debug //log("msg0: compressed sectiondblist from disk " // "newlistsize=%"INT32"", dataSize); // use this timestamp int32_t now = getTimeLocal();//Global(); // finally, cache this sucker s_sectiondbCache.addRecord ( msg5->m_coll, (char *)startKey,//(char *)&sh48 data, dataSize , now ); // ignore errors g_errno = 0; } */ // // for linkdb lists, remove all the keys that have the same IP32 // and store a count of what we removed somewhere // if ( st0->m_rdbId == RDB_LINKDB ) { // store compressed list on itself char *dst = list->m_list; // keep stats int32_t totalOrigLinks = 0; int32_t ipDups = 0; int32_t lastIp32 = 0; char *listEnd = list->getListEnd(); // compress the list for ( ; ! list->isExhausted() ; list->skipCurrentRecord() ) { // breathe QUICKPOLL ( st0->m_niceness ); // count it totalOrigLinks++; // get rec char *rec = list->getCurrentRec(); int32_t ip32 = g_linkdb.getLinkerIp_uk((key224_t *)rec ); // same as one before? if ( ip32 == lastIp32 && // are we the last rec? include that for // advancing the m_nextKey in Linkdb more // efficiently. rec + LDBKS < listEnd ) { ipDups++; continue; } // store it gbmemcpy (dst , rec , LDBKS ); dst += LDBKS; // update it lastIp32 = ip32; } // . if we removed one key, store the stats // . caller should recognize reply is not a multiple of // the linkdb key size LDBKS and no its there! if ( ipDups ) { //*(int32_t *)dst = totalOrigLinks; //dst += 4; //*(int32_t *)dst = ipDups; //dst += 4; } // update list parms list->m_listSize = dst - list->m_list; list->m_listEnd = list->m_list + list->m_listSize; data = list->getList(); dataSize = list->getListSize(); } //log("sending replySize=%"INT32" min=%"INT32"",dataSize,msg5->m_minRecSizes); // . TODO: dataSize may not equal list->getListMaxSize() so // Mem class may show an imblanace // . now g_udpServer is responsible for freeing data/dataSize // . the "true" means to call doneSending_ass() from the signal handler // if need be st0->m_us->sendReply_ass ( data , dataSize , alloc , // alloc allocSize , // alloc size slot , 60 , st0 , doneSending_ass , -1 , -1 , true ); }
void gotTitleList ( void *state , RdbList *list , Msg5 *msg5 ) { State22 *st = (State22 *)state; // if niceness is 0, use the higher priority udpServer UdpServer *us = &g_udpServer; // shortcut Msg22Request *r = st->m_r; // breathe QUICKPOLL(r->m_niceness); // send error reply on error if ( g_errno ) { hadError: log("db: Had error getting title record from titledb: %s.", mstrerror(g_errno)); if ( ! g_errno ) { char *xx=NULL;*xx=0; } us->sendErrorReply ( st->m_slot , g_errno ); mdelete ( st , sizeof(State22) , "Msg22" ); delete ( st ); return ; } // convenience var RdbList *tlist = &st->m_tlist; // set probable docid long long pd = 0LL; if ( r->m_url[0] ) { pd = g_titledb.getProbableDocId(r->m_url); if ( pd != st->m_pd ) { log("db: crap probable docids do not match! u=%s", r->m_url); g_errno = EBADENGINEER; goto hadError; } // sanity //if ( pd != st->m_pd ) { char *xx=NULL;*xx=0; } } // the probable docid is the PREFERRED docid in this case if ( r->m_getAvailDocIdOnly ) pd = st->m_r->m_docId; // . these are both meant to be available docids // . if ad2 gets exhausted we use ad1 long long ad1 = st->m_docId1; long long ad2 = pd; bool docIdWasFound = false; // scan the titleRecs in the list for ( ; ! tlist->isExhausted() ; tlist->skipCurrentRecord ( ) ) { // breathe QUICKPOLL ( r->m_niceness ); // get the rec char *rec = tlist->getCurrentRec(); long recSize = tlist->getCurrentRecSize(); // get that key key_t *k = (key_t *)rec; // skip negative recs, first one should not be negative however if ( ( k->n0 & 0x01 ) == 0x00 ) continue; // get docid of that titlerec long long dd = g_titledb.getDocId(k); if ( r->m_getAvailDocIdOnly ) { // make sure our available docids are availble! if ( dd == ad1 ) ad1++; if ( dd == ad2 ) ad2++; continue; } // if we had a url make sure uh48 matches else if ( r->m_url[0] ) { // get it long long uh48 = g_titledb.getUrlHash48(k); // sanity check if ( st->m_uh48 == 0 ) { char *xx=NULL;*xx=0; } // make sure our available docids are availble! if ( dd == ad1 ) ad1++; if ( dd == ad2 ) ad2++; // we must match this exactly if ( uh48 != st->m_uh48 ) continue; } // otherwise, check docid else { // compare that if ( r->m_docId != dd ) continue; } // flag that we matched m_docId docIdWasFound = true; // do not set back titlerec if just want avail docid //if ( r->m_getAvailDocIdOnly ) continue; // ok, if just "checking tfndb" no need to go further if ( r->m_justCheckTfndb ) { // send back a good reply (empty means found!) us->sendReply_ass ( NULL,0,NULL,0,st->m_slot); // don't forget to free the state mdelete ( st , sizeof(State22) , "Msg22" ); delete ( st ); return; } // use rec as reply char *reply = rec; // . send this rec back, it's a match // . if only one rec in list, steal the list's memory if ( recSize != tlist->getAllocSize() ) { // otherwise, alloc space for the reply reply = (char *)mmalloc (recSize, "Msg22"); if ( ! reply ) goto hadError; memcpy ( reply , rec , recSize ); } // otherwise we send back the whole list! else { // we stole this from list tlist->m_ownData = false; } // off ya go us->sendReply_ass(reply,recSize,reply,recSize,st->m_slot); // don't forget to free the state mdelete ( st , sizeof(State22) , "Msg22" ); delete ( st ); // all done return; } // maybe no available docid if we breached our range if ( ad1 >= pd ) ad1 = 0LL; if ( ad2 > st->m_docId2 ) ad2 = 0LL; // get best long long ad = ad2; // but wrap around if we need to if ( ad == 0LL ) ad = ad1; // if "docId" was unmatched that should be the preferred available // docid then... //if(! docIdWasFound && r->m_getAvailDocIdOnly && ad != r->m_docId ) { // char *xx=NULL;*xx=0; } // remember it. this might be zero if none exist! st->m_availDocId = ad; // note it if ( ad == 0LL && (r->m_getAvailDocIdOnly || r->m_url[0]) ) log("msg22: avail docid is 0 for pd=%lli!",pd); // . ok, return an available docid if ( r->m_url[0] || r->m_justCheckTfndb || r->m_getAvailDocIdOnly ) { // store docid in reply char *p = st->m_slot->m_tmpBuf; // send back the available docid *(long long *)p = st->m_availDocId; // send it us->sendReply_ass ( p , 8 , p , 8 , st->m_slot ); // don't forget to free state mdelete ( st , sizeof(State22) , "Msg22" ); delete ( st ); return; } // not found! and it was a docid based request... log("msg22: could not find title rec for docid %llu",r->m_docId); g_errno = ENOTFOUND; goto hadError; }
int main(int argc, char **argv) { if (argc < 3) { print_usage(argv[0]); return 1; } if (strcmp(argv[1], "--h") == 0 || strcmp(argv[1], "--help") == 0 ) { print_usage(argv[0]); return 1; } g_log.m_disabled = true; // initialize library g_mem.init(); hashinit(); // current dir char path[PATH_MAX]; realpath(argv[1], path); size_t pathLen = strlen(path); if (path[pathLen] != '/') { strcat(path, "/"); } g_hostdb.init(-1, NULL, false, false, path); g_conf.init(path); ucInit(); // initialize rdbs g_loop.init(); g_collectiondb.loadAllCollRecs(); g_statsdb.init(); g_posdb.init(); g_titledb.init(); g_tagdb.init(); g_spiderdb.init(); g_doledb.init(); g_spiderCache.init(); g_clusterdb.init(); g_linkdb.init(); g_collectiondb.addRdbBaseToAllRdbsForEachCollRec(); g_log.m_disabled = false; g_log.m_logPrefix = false; uint64_t docId = strtoul(argv[2], NULL, 10); logf(LOG_TRACE, "Getting titlerec for docId=%" PRIu64, docId); Msg5 msg5; RdbList list; key96_t startKey = Titledb::makeFirstKey(docId); key96_t endKey = Titledb::makeLastKey(docId); msg5.getList(RDB_TITLEDB, 0, &list, startKey, endKey, 500000000, true, 0, 0, -1, NULL, NULL, 0, true, NULL, 0, -1, -1LL, false, true); if (list.getNumRecs() != 1) { logf(LOG_TRACE, "Unable to find titlerec for docId=%" PRIu64, docId); cleanup(); exit(1); } XmlDoc xmlDoc; if (!xmlDoc.set2(list.getCurrentRec(), list.getCurrentRecSize(), "main", NULL, 0)) { logf(LOG_TRACE, "Unable to set XmlDoc for docId=%" PRIu64, docId); cleanup(); exit(1); } logf(LOG_TRACE, "XmlDoc info"); logf(LOG_TRACE, "\tfirstUrl : %.*s", xmlDoc.size_firstUrl, xmlDoc.ptr_firstUrl); logf(LOG_TRACE, "\tredirUrl : %.*s", xmlDoc.size_redirUrl, xmlDoc.ptr_redirUrl); logf(LOG_TRACE, "\trootTitle : %.*s", xmlDoc.size_rootTitleBuf, xmlDoc.ptr_rootTitleBuf); // logf(LOG_TRACE, "\timageData :"); logf(LOG_TRACE, "\t"); loghex(LOG_TRACE, xmlDoc.ptr_utf8Content, xmlDoc.size_utf8Content, "\tutf8Content:"); logf(LOG_TRACE, "\tsite : %.*s", xmlDoc.size_site, xmlDoc.ptr_site); logf(LOG_TRACE, "\tlinkInfo"); LinkInfo* linkInfo = xmlDoc.getLinkInfo1(); logf(LOG_TRACE, "\t\tm_numGoodInlinks : %d", linkInfo->m_numGoodInlinks); logf(LOG_TRACE, "\t\tm_numInlinksInternal : %d", linkInfo->m_numInlinksInternal); logf(LOG_TRACE, "\t\tm_numStoredInlinks : %d", linkInfo->m_numStoredInlinks); int i = 0; for (Inlink *inlink = linkInfo->getNextInlink(NULL); inlink; inlink = linkInfo->getNextInlink(inlink)) { logf(LOG_TRACE, "\t\tinlink #%d", i++); logf(LOG_TRACE, "\t\t\tdocId : %" PRIu64, inlink->m_docId); logf(LOG_TRACE, "\t\t\turl : %s", inlink->getUrl()); logf(LOG_TRACE, "\t\t\tlinktext : %s", inlink->getLinkText()); logf(LOG_TRACE, "\t\t\tcountry : %s", getCountryCode(inlink->m_country)); logf(LOG_TRACE, "\t\t\tlanguage : %s", getLanguageAbbr(inlink->m_language)); } loghex(LOG_TRACE, xmlDoc.ptr_linkdbData, xmlDoc.size_linkdbData, "\tlinkdbData"); logf(LOG_TRACE, "\ttagRec"); TagRec *tagRec = xmlDoc.getTagRec(); for (Tag *tag = tagRec->getFirstTag(); tag; tag = tagRec->getNextTag(tag)) { SafeBuf sb; tag->printDataToBuf(&sb); logf(LOG_TRACE, "\t\t%-12s: %s", getTagStrFromType(tag->m_type), sb.getBufStart()); } logf(LOG_TRACE, "\t"); logf(LOG_TRACE, "Links info"); g_log.m_disabled = true; Links *links = xmlDoc.getLinks(); g_log.m_disabled = false; for (int i = 0; i < links->getNumLinks(); ++i) { logf(LOG_TRACE, "\tlink : %.*s", links->getLinkLen(i), links->getLinkPtr(i)); } cleanup(); return 0; }