void handleRequest22 ( UdpSlot *slot , long netnice ) { // shortcut UdpServer *us = &g_udpServer; // get the request Msg22Request *r = (Msg22Request *)slot->m_readBuf; // get this //char *coll = g_collectiondb.getCollName ( r->m_collnum ); // sanity check long requestSize = slot->m_readBufSize; if ( requestSize < r->getMinSize() ) { log("db: Got bad request size of %li bytes for title record. " "Need at least 28.", requestSize ); us->sendErrorReply ( slot , EBADREQUESTSIZE ); return; } // get base, returns NULL and sets g_errno to ENOCOLLREC on error RdbBase *tbase; if ( ! (tbase=getRdbBase(RDB_TITLEDB,r->m_collnum) ) ) { log("db: Could not get title rec in collection # %li " "because rdbbase is null.", (long)r->m_collnum); g_errno = EBADENGINEER; us->sendErrorReply ( slot , g_errno ); return; } // overwrite what is in there so niceness conversion algo works r->m_niceness = netnice; // if just checking tfndb, do not do the cache lookup in clusterdb if ( r->m_justCheckTfndb ) r->m_maxCacheAge = 0; // keep track of stats //if (r->m_justCheckTfndb) // g_tfndb.getRdb()->readRequestGet(requestSize); // else g_titledb.getRdb()->readRequestGet (requestSize); // breathe QUICKPOLL ( r->m_niceness); // sanity check if ( r->m_collnum < 0 ) { char *xx=NULL;*xx=0; } // make the state now State22 *st ; try { st = new (State22); } catch ( ... ) { g_errno = ENOMEM; log("query: Msg22: new(%i): %s", sizeof(State22), mstrerror(g_errno)); us->sendErrorReply ( slot , g_errno ); return; } mnew ( st , sizeof(State22) , "Msg22" ); // store ptr to the msg22request st->m_r = r; // save for sending back reply st->m_slot = slot; // then tell slot not to free it since m_r references it! // so we'll have to free it when we destroy State22 st->m_slotAllocSize = slot->m_readBufMaxSize; st->m_slotReadBuf = slot->m_readBuf; slot->m_readBuf = NULL; // . make the keys for getting recs from tfndb // . url recs map docid to the title file # that contains the titleRec //key_t uk1 ; //key_t uk2 ; // . 
if docId was explicitly specified... // . we may get multiple tfndb recs if ( ! r->m_url[0] ) { // there are no del bits in tfndb //uk1 = g_tfndb.makeMinKey ( r->m_docId ); //uk2 = g_tfndb.makeMaxKey ( r->m_docId ); st->m_docId1 = r->m_docId; st->m_docId2 = r->m_docId; } // but if we are requesting an available docid, it might be taken // so try the range if ( r->m_getAvailDocIdOnly ) { long long pd = r->m_docId; long long d1 = g_titledb.getFirstProbableDocId ( pd ); long long d2 = g_titledb.getLastProbableDocId ( pd ); // sanity - bad url with bad subdomain? if ( pd < d1 || pd > d2 ) { char *xx=NULL;*xx=0; } // make sure we get a decent sample in titledb then in // case the docid we wanted is not available st->m_docId1 = d1; st->m_docId2 = d2; } // . otherwise, url was given, like from Msg15 // . we may get multiple tfndb recs if ( r->m_url[0] ) { long dlen = 0; // this causes ip based urls to be inconsistent with the call // to getProbableDocId(url) below char *dom = getDomFast ( r->m_url , &dlen ); // bogus url? if ( ! dom ) { log("msg22: got bad url in request: %s",r->m_url); g_errno = EBADURL; us->sendErrorReply ( slot , g_errno ); mdelete ( st , sizeof(State22) , "Msg22" ); delete ( st ); return; } long long pd = g_titledb.getProbableDocId (r->m_url,dom,dlen); long long d1 = g_titledb.getFirstProbableDocId ( pd ); long long d2 = g_titledb.getLastProbableDocId ( pd ); // sanity - bad url with bad subdomain? if ( pd < d1 || pd > d2 ) { char *xx=NULL;*xx=0; } // there are no del bits in tfndb //uk1 = g_tfndb.makeMinKey ( d1 ); //uk2 = g_tfndb.makeMaxKey ( d2 ); // store these st->m_pd = pd; st->m_docId1 = d1; st->m_docId2 = d2; st->m_uh48 = hash64b ( r->m_url ) & 0x0000ffffffffffffLL; } QUICKPOLL ( r->m_niceness ); /* // shortcut Rdb *tdb = g_titledb.getRdb(); // init this st->m_tfn2 = -1; // skip tfndb lookup if we can. saves some time. if ( g_conf.m_readOnlyMode && // must not be a *url* lookup, it must be a docid lookup ! 
r->m_url[0] && // tree must be empty too i guess tdb->getTree()->getNumUsedNodes() ==0 ) { // the RdbBase contains the BigFiles for tfndb RdbBase *base = tdb->m_bases[r->m_collnum]; // can only have one titledb file if ( base->getNumFiles() == 1 ) { // now we can get RdbBase st->m_tfn2 = base->m_fileIds2[0]; // sanity check if ( st->m_tfn2 < 0 ) { char *xx = NULL; *xx = 0; } } } // check the tree for this docid RdbTree *tt = tdb->getTree(); // make titledb keys key_t startKey = g_titledb.makeFirstKey ( st->m_docId1 ); key_t endKey = g_titledb.makeLastKey ( st->m_docId2 ); long n = tt->getNextNode ( r->m_collnum , startKey ); // there should only be one match, one titlerec per docid! for ( ; n >= 0 ; n = tt->getNextNode ( n ) ) { // break if collnum does not match. we exceeded our tree range. if ( tt->getCollnum ( n ) != r->m_collnum ) break; // get the key of this node key_t k = *(key_t *)tt->getKey(n); // if passed limit, break out, no match if ( k > endKey ) break; // if we had a url make sure uh48 matches if ( r->m_url[0] ) { // get it long long uh48 = g_titledb.getUrlHash48(&k); // sanity check if ( st->m_uh48 == 0 ) { char *xx=NULL;*xx=0; } // we must match this exactly if ( uh48 != st->m_uh48 ) continue; } // . if we matched a negative key, then skip // . just break out here and enter the normal logic // . it should load tfndb and find that it is not in tfndb // because when you add a negative key to titledb in // Rdb::addList, it adds a negative rec to tfndb immediately // . NO! because we add the negative key to the tree when we // delete the old titledb rec, then we add the new one! // when a negative key is added Rdb::addRecord() removes // the positive key (and vice versa) from the tree. 
if ( KEYNEG((char *)&k) ) continue; // if just checking for its existence, we are done if ( r->m_justCheckTfndb ) { us->sendReply_ass ( NULL,0,NULL,0,slot); // don't forget to free the state mdelete ( st , sizeof(State22) , "Msg22" ); delete ( st ); return; } // ok, we got a match, return it char *data = tt->getData ( n ); long dataSize = tt->getDataSize ( n ); // wierd! if ( dataSize == 0 ) { char *xx=NULL;*xx=0; } // send the whole rec back long need = 12 + 4 + dataSize; // will this copy it? not! char *buf = (char *)mmalloc ( need , "msg22t" ); if ( ! buf ) { us->sendErrorReply ( slot , g_errno ); mdelete ( st , sizeof(State22) , "Msg22" ); delete ( st ); return; } // log it if ( g_conf.m_logDebugSpider ) logf(LOG_DEBUG,"spider: found %s in titledb tree", r->m_url); // store in the buf for sending char *p = buf; // store key *(key_t *)p = k; p += sizeof(key_t); // then dataSize *(long *)p = dataSize; p += 4; // then the data memcpy ( p , data , dataSize ); p += dataSize; // send off the record us->sendReply_ass (buf, need,buf, need,slot); // don't forget to free the state mdelete ( st , sizeof(State22) , "Msg22" ); delete ( st ); return; } // if we did not need to consult tfndb cuz we only have one file if ( st->m_tfn2 >= 0 ) { gotUrlListWrapper ( st , NULL , NULL ); return; } // . get the list of url recs for this docid range // . this should not block, tfndb SHOULD all be in memory all the time // . use 500 million for min recsizes to get all in range // . no, using 500MB causes problems for RdbTree::getList, so use // 100k. how many recs can there be? if ( ! st->m_msg5.getList ( RDB_TFNDB , coll , &st->m_ulist , uk1 , // startKey uk2 , // endKey // use 0x7fffffff preceisely because it // will determine eactly how long the // tree list needs to allocate in Msg5.cpp 0x7fffffff , // minRecSizes true , // includeTree? false , // addToCache? 
0 , // max cache age 0 , // startFileNum -1 , // numFiles (-1 =all) st , gotUrlListWrapper , r->m_niceness , true ))// error correction? return ; // we did not block gotUrlListWrapper ( st , NULL , NULL ); } static void gotTitleList ( void *state , RdbList *list , Msg5 *msg5 ) ; void gotUrlListWrapper ( void *state , RdbList *list , Msg5 *msg5 ) { // shortcuts State22 *st = (State22 *)state; UdpServer *us = &g_udpServer; // bail on error if ( g_errno ) { log("db: Had error getting info from tfndb: %s.", mstrerror(g_errno)); log("db: uk1.n1=%li n0=%lli uk2.n1=%li n0=%lli " "d1=%lli d2=%lli.", ((key_t *)st->m_msg5.m_startKey)->n1 , ((key_t *)st->m_msg5.m_startKey)->n0 , ((key_t *)st->m_msg5.m_endKey)->n1 , ((key_t *)st->m_msg5.m_endKey)->n0 , st->m_docId1 , st->m_docId2 ); us->sendErrorReply ( st->m_slot , g_errno ); mdelete ( st , sizeof(State22) , "Msg22" ); delete ( st ); return; } // shortcuts RdbList *ulist = &st->m_ulist; Msg22Request *r = st->m_r; char *coll = g_collectiondb.getCollName ( r->m_collnum ); // point to top just in case ulist->resetListPtr(); // get base, returns NULL and sets g_errno to ENOCOLLREC on error RdbBase *tbase = getRdbBase(RDB_TITLEDB,coll); // set probable docid long long pd = 0LL; if ( r->m_url[0] ) { pd = g_titledb.getProbableDocId(r->m_url); // sanity if ( pd != st->m_pd ) { char *xx=NULL;*xx=0; } } // . these are both meant to be available docids // . if ad2 gets exhausted we use ad1 long long ad1 = st->m_docId1; long long ad2 = pd; long tfn = -1; // sanity check. make sure did not load from tfndb if did not need to if ( ! ulist->isExhausted() && st->m_tfn2 >= 0 ) {char *xx=NULL;*xx=0;} // if only one titledb file and none in memory use it if ( st->m_tfn2 >= 0 ) tfn = st->m_tfn2; // we may have multiple tfndb recs but we should NEVER have to read // multiple titledb files... for ( ; ! 
ulist->isExhausted() ; ulist->skipCurrentRecord() ) { // breathe QUICKPOLL ( r->m_niceness ); // get first rec key_t k = ulist->getCurrentKey(); // . skip negative keys // . seems to happen when we have tfndb in the tree... if ( KEYNEG((char *)&k) ) continue; // if we have a url and no docid, we gotta check uh48! if ( r->m_url[0] && g_tfndb.getUrlHash48(&k)!=st->m_uh48){ // get docid of that guy long long dd = g_tfndb.getDocId(&k); // if matches avail docid, inc it if ( dd == ad1 ) ad1++; if ( dd == ad2 ) ad2++; // try next tfndb key continue; } // . get file num this rec is stored in // . this is updated right after the file num is merged by // scanning all records in tfndb. this is very quick if all // of tfndb is in memory, otherwise, it might take a few // seconds. update call done in RdbMerge::incorporateMerge(). tfn = g_tfndb.getTfn ( &k ); // i guess we got a good match! break; } // sanity check. 255 used to mean in spiderdb or in tree if ( tfn >= 255 ) { char *xx=NULL;*xx=0; } // maybe no available docid if we breached our range if ( ad1 >= pd ) ad1 = 0LL; if ( ad2 > st->m_docId2 ) ad2 = 0LL; // get best long long ad = ad2; // but wrap around if we need to if ( ad == 0LL ) ad = ad1; // breathe QUICKPOLL ( r->m_niceness); // . log if different // . if our url rec was in there, this could still be different // if there was another url rec in there with the same docid and // a diferent extension, but with a tfn of 255, meaning that it // is just in spiderdb and not in titledb yet. so it hasn't been // assigned a permanent docid... // . another way "ad" may be different now is from the old bug which // did not chain the docid properly because it limited the docid // chaining to one titleRec file. so conceivably we can have // different docs sharing the same docids, but with different // url hash extensions. 
for instance, on host #9 we have: // 00f3b2ff63aec3a9 docId=261670033643 e=0x58 tfn=117 clean=0 half=0 // 00f3b2ff63af66c9 docId=261670033643 e=0x6c tfn=217 clean=0 half=0 // . Msg16 will only use the avail docid if the titleRec is not found if ( r->m_url[0] && pd != ad ) { //log(LOG_INFO,"build: Docid %lli collided. %s Changing " // // http://www.airliegardens.org/events.asp?dt=2&date=8/5/2011 // // COLLIDES WITH // // http://www.bbonline.com/i/chicago.html // // collision alert! log("spider: Docid %lli collided. %s Changing " "to %lli.", r->m_docId , r->m_url , ad ); // debug this for now //char *xx=NULL;*xx=0; } // remember it st->m_availDocId = ad; // if tfn is -1 then it was not in titledb if ( tfn == -1 ) { // store docid in reply char *p = st->m_slot->m_tmpBuf; // send back the available docid *(long long *)p = ad; // send it us->sendReply_ass ( p , 8 , p , 8 , st->m_slot ); // don't forget to free state mdelete ( st , sizeof(State22) , "Msg22" ); delete ( st ); return; } // sanity if ( tfn < 0 ) { char *xx=NULL;*xx=0; } // breathe QUICKPOLL ( r->m_niceness ); // ok, if just "checking tfndb" no need to go further if ( r->m_justCheckTfndb ) { // send back a good reply (empty means found!) us->sendReply_ass ( NULL,0,NULL,0,st->m_slot); // don't forget to free the state mdelete ( st , sizeof(State22) , "Msg22" ); delete ( st ); return; } // . compute the file scan range // . tfn is now equivalent to Rdb's id2, a secondary file id, it // follows the hyphen in "titledb0001-023.dat" // . default to just scan the root file AND the tree, cuz we're // assuming restrictToRoot was set to true so we did not get a tfndb // list // . even if a file number is given, always check the tree in case // it got re-spidered // . shit, but we can still miss it if it gets dumped right after // our thread is spawned, in which case we'd fall back to the old // version. no. because if its in the tree now we get it before // spawning a thread. there is no blocking. TRICKY. 
so if it is in // the tree at this point we'll get it, but may end up scanning the // file with the older version of the doc... not too bad. long startFileNum = tbase->getFileNumFromId2 ( tfn ); // if tfn refers to a missing titledb file... if ( startFileNum < 0 ) { if ( r->m_url[0] ) log("db: titledb missing url %s",r->m_url); else log("db: titledb missing docid %lli", r->m_docId); us->sendErrorReply ( st->m_slot,ENOTFOUND ); mdelete ( st , sizeof(State22) , "Msg22" ); delete ( st ); return ; } // save this st->m_tfn = tfn; */ // make the cacheKey ourself, since Msg5 would make the key wrong // since it would base it on startFileNum and numFiles key_t cacheKey ; cacheKey.n1 = 0; cacheKey.n0 = r->m_docId; // make titledb keys key_t startKey = g_titledb.makeFirstKey ( st->m_docId1 ); key_t endKey = g_titledb.makeLastKey ( st->m_docId2 ); // . load the list of title recs from disk now // . our file range should be solid // . use 500 million for min recsizes to get all in range if ( ! st->m_msg5.getList ( RDB_TITLEDB , r->m_collnum , &st->m_tlist , startKey , // startKey endKey , // endKey 500000000 , // minRecSizes true , // includeTree false,//r->m_addToCache , // addToCache? 0,//r->m_maxCacheAge , // max cache age 0,//startFileNum , -1 , // numFiles st , // state , gotTitleList , r->m_niceness , true , // do error correct? &cacheKey , 0 , // retry num -1 , // maxRetries true , // compensate for merge -1LL , // sync point &st->m_msg5b ) ) return ; // we did not block, nice... in cache? gotTitleList ( st , NULL , NULL ); }
// . handle an incoming Msg22 request for a title record
// . the serialized Msg22Request lives in slot->m_readBuf; we reply over
//   g_udpServer with a title rec list (via gotTitleList) or an error
// . netnice overrides the niceness stored in the request
void handleRequest22 ( UdpSlot *slot , int32_t netnice ) {
	// shortcut
	UdpServer *us = &g_udpServer;
	// get the request
	Msg22Request *r = (Msg22Request *)slot->m_readBuf;

	// sanity check: the request must at least hold the fixed-size header
	int32_t requestSize = slot->m_readBufSize;
	if ( requestSize < r->getMinSize() ) {
		log("db: Got bad request size of %" PRId32" bytes for title record. "
		    "Need at least 28.", requestSize );
		log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
		us->sendErrorReply ( slot , EBADREQUESTSIZE );
		return;
	}

	// get base, returns NULL and sets g_errno to ENOCOLLREC on error
	RdbBase *tbase = getRdbBase( RDB_TITLEDB, r->m_collnum );
	if ( ! tbase ) {
		log("db: Could not get title rec in collection # %" PRId32" because rdbbase is null.", (int32_t)r->m_collnum);
		g_errno = EBADENGINEER;
		log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
		us->sendErrorReply ( slot , g_errno );
		return;
	}

	// overwrite what is in there so niceness conversion algo works
	r->m_niceness = netnice;

	// if just checking tfndb, do not do the cache lookup in clusterdb
	if ( r->m_justCheckTfndb ) {
		r->m_maxCacheAge = 0;
	}

	// keep track of read-request stats for titledb
	g_titledb.getRdb()->readRequestGet (requestSize);

	// breathe
	QUICKPOLL ( r->m_niceness);

	// sanity check
	if ( r->m_collnum < 0 ) { char *xx=NULL;*xx=0; }

	// make the state now
	State22 *st ;
	try { st = new (State22); }
	catch ( ... ) {
		g_errno = ENOMEM;
		log("query: Msg22: new(%" PRId32"): %s", (int32_t)sizeof(State22), mstrerror(g_errno));
		log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
		us->sendErrorReply ( slot , g_errno );
		return;
	}
	mnew ( st , sizeof(State22) , "Msg22" );

	// store ptr to the msg22request
	st->m_r = r;
	// save for sending back reply
	st->m_slot = slot;

	// . then tell slot not to free the read buffer since m_r references it!
	// . so we'll have to free it when we destroy State22
	st->m_slotAllocSize = slot->m_readBufMaxSize;
	st->m_slotReadBuf = slot->m_readBuf;
	slot->m_readBuf = NULL;

	// . if docId was explicitly specified...
	// . we may get multiple tfndb recs
	if ( ! r->m_url[0] ) {
		st->m_docId1 = r->m_docId;
		st->m_docId2 = r->m_docId;
	}

	// but if we are requesting an available docid, it might be taken
	// so try the range
	if ( r->m_getAvailDocIdOnly ) {
		int64_t pd = r->m_docId;
		int64_t d1 = g_titledb.getFirstProbableDocId ( pd );
		int64_t d2 = g_titledb.getLastProbableDocId ( pd );
		// sanity - bad url with bad subdomain?
		if ( pd < d1 || pd > d2 ) { char *xx=NULL;*xx=0; }
		// make sure we get a decent sample in titledb then in
		// case the docid we wanted is not available
		st->m_docId1 = d1;
		st->m_docId2 = d2;
	}

	// . otherwise, url was given, like from Msg15
	// . we may get multiple tfndb recs
	if ( r->m_url[0] ) {
		int32_t dlen = 0;
		// this causes ip based urls to be inconsistent with the call
		// to getProbableDocId(url) below
		char *dom = getDomFast ( r->m_url , &dlen );
		// bogus url?
		if ( ! dom ) {
			log("msg22: got bad url in request: %s from "
			    "hostid %" PRId32" for msg22 call ",
			    r->m_url,slot->m_host->m_hostId);
			g_errno = EBADURL;
			log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__);
			us->sendErrorReply ( slot , g_errno );
			mdelete ( st , sizeof(State22) , "Msg22" );
			delete ( st );
			return;
		}
		int64_t pd = g_titledb.getProbableDocId (r->m_url,dom,dlen);
		int64_t d1 = g_titledb.getFirstProbableDocId ( pd );
		int64_t d2 = g_titledb.getLastProbableDocId ( pd );
		// sanity - bad url with bad subdomain?
		if ( pd < d1 || pd > d2 ) { char *xx=NULL;*xx=0; }
		// store these
		st->m_pd = pd;
		st->m_docId1 = d1;
		st->m_docId2 = d2;
		st->m_uh48 = hash64b ( r->m_url ) & 0x0000ffffffffffffLL;
	}

	QUICKPOLL ( r->m_niceness );

	// make the cacheKey ourself, since Msg5 would make the key wrong
	// since it would base it on startFileNum and numFiles
	key_t cacheKey ; cacheKey.n1 = 0; cacheKey.n0 = r->m_docId;
	// make titledb keys
	key_t startKey = g_titledb.makeFirstKey ( st->m_docId1 );
	key_t endKey   = g_titledb.makeLastKey  ( st->m_docId2 );

	// . load the list of title recs from disk now
	// . our file range should be solid
	// . use 500 million for min recsizes to get all in range
	if ( ! st->m_msg5.getList ( RDB_TITLEDB    ,
				    r->m_collnum   ,
				    &st->m_tlist   ,
				    startKey       , // startKey
				    endKey         , // endKey
				    500000000      , // minRecSizes
				    true           , // includeTree
				    false,//r->m_addToCache , // addToCache?
				    0,//r->m_maxCacheAge , // max cache age
				    0,//startFileNum ,
				    -1             , // numFiles
				    st             , // state ,
				    gotTitleList   ,
				    r->m_niceness  ,
				    true           , // do error correct?
				    &cacheKey      ,
				    0              , // retry num
				    -1             , // maxRetries
				    true           , // compensate for merge
				    -1LL ) )         // sync point
		return ;

	// we did not block, nice... in cache?
	gotTitleList ( st , NULL , NULL );
}