// init our rdb
bool Titledb::init ( ) {
    // key sanity tests
    int64_t uh48  = 0x1234567887654321LL & 0x0000ffffffffffffLL;
    int64_t docId = 123456789;
    key96_t k = makeKey(docId,uh48,false);
    if ( getDocId(&k) != docId ) { g_process.shutdownAbort(true); }
    if ( getUrlHash48(&k) != uh48 ) { g_process.shutdownAbort(true); }

    const char *url = "http://.ezinemark.com/int32_t-island-child-custody-attorneys-new-york-visitation-lawyers-melville-legal-custody-law-firm-45f00bbed18.html";
    Url uu;
    uu.set(url);
    const char *d1 = uu.getDomain();
    int32_t dlen1 = uu.getDomainLen();
    int32_t dlen2 = 0;
    const char *d2 = getDomFast ( url , &dlen2 );
    if ( !d1 || !d2 ) { g_process.shutdownAbort(true); }
    if ( dlen1 != dlen2 ) { g_process.shutdownAbort(true); }

    // another one
    url = "http://ok/";
    uu.set(url);
    const char *d1a = uu.getDomain();
    dlen1 = uu.getDomainLen();
    dlen2 = 0;
    const char *d2a = getDomFast ( url , &dlen2 );
    if ( d1a || d2a ) { g_process.shutdownAbort(true); }
    if ( dlen1 != dlen2 ) { g_process.shutdownAbort(true); }

    // . what's max # of tree nodes?
    // . assume avg TitleRec size (compressed html doc) is about 1k, so we get:
    // . NOTE: overhead is about 32 bytes per node
    int32_t maxTreeNodes = g_conf.m_titledbMaxTreeMem / (1*1024);

    // initialize our own internal rdb
    return m_rdb.init ( "titledb" ,
                        -1 ,    // fixed record size
                        //g_conf.m_titledbMinFilesToMerge ,
                        // this should not really be changed...
                        -1,
                        g_conf.m_titledbMaxTreeMem ,
                        maxTreeNodes ,
                        false,  // half keys?
                        12,     // key size
                        false,  // isCollectionLess
                        false); // useIndexFile

    // validate
    //return verify ( );
}

// init our rdb
bool Titledb::init ( ) {
    // key sanity tests
    int64_t uh48  = 0x1234567887654321LL & 0x0000ffffffffffffLL;
    int64_t docId = 123456789;
    key_t k = makeKey(docId,uh48,false);
    if ( getDocId(&k) != docId ) { char *xx=NULL;*xx=0; }
    if ( getUrlHash48(&k) != uh48 ) { char *xx=NULL;*xx=0; }

    char *url = "http://.ezinemark.com/int32_t-island-child-custody-attorneys-new-york-visitation-lawyers-melville-legal-custody-law-firm-45f00bbed18.html";
    Url uu;
    uu.set(url);
    char *d1 = uu.getDomain();
    int32_t dlen1 = uu.getDomainLen();
    int32_t dlen2 = 0;
    char *d2 = getDomFast ( url , &dlen2 );
    if ( dlen1 != dlen2 ) { char *xx=NULL;*xx=0; }

    // another one
    url = "http://ok/";
    uu.set(url);
    d1 = uu.getDomain();
    dlen1 = uu.getDomainLen();
    dlen2 = 0;
    d2 = getDomFast ( url , &dlen2 );
    if ( dlen1 != dlen2 ) { char *xx=NULL;*xx=0; }

    int64_t maxMem = 200000000; // 200MB

    // . what's max # of tree nodes?
    // . assume avg TitleRec size (compressed html doc) is about 1k, so we get:
    // . NOTE: overhead is about 32 bytes per node
    int32_t maxTreeNodes = maxMem / (1*1024);

    // . we now use a disk page cache for titledb as opposed to the
    //   old rec cache. i am trying to do away with the Rdb::m_cache rec
    //   cache in favor of cleverly used disk page caches, because
    //   the rec caches are not real-time and get stale.
    // . just hard-code 30MB for now
    int32_t pcmem = 30000000; // = g_conf.m_titledbMaxDiskPageCacheMem;
    // f**k that we need all the mem!
    //pcmem = 0;
    // do not use any page cache if doing tmp cluster in order to
    // prevent swapping
    if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
    int32_t pageSize = GB_INDEXDB_PAGE_SIZE;
    // init the page cache
    // . MDW: "minimize disk seeks" not working otherwise i'd enable it!
    if ( ! m_pc.init ( "titledb", RDB_TITLEDB, pcmem , pageSize ) )
        return log("db: Titledb init failed.");

    // each entry in the cache is usually just a single record, no lists
    //int32_t maxCacheNodes = g_conf.m_titledbMaxCacheMem / (10*1024);

    // initialize our own internal rdb
    if ( ! m_rdb.init ( g_hostdb.m_dir ,
                        "titledb" ,
                        true ,  // dedup same keys?
                        -1 ,    // fixed record size
                        //g_hostdb.m_groupMask ,
                        //g_hostdb.m_groupId ,
                        //g_conf.m_titledbMinFilesToMerge ,
                        // this should not really be changed...
                        -1,     //3,//230 minfilestomerge mintomerge
                        maxMem, // g_conf.m_titledbMaxTreeMem ,
                        maxTreeNodes ,
                        // now we balance so Sync.cpp can order huge lists
                        true ,  // balance tree?
                        // turn off cache for now because the page cache
                        // is just as fast and does not get out of date
                        // so bad??
                        //0 ,
                        0,      //g_conf.m_titledbMaxCacheMem ,
                        0,      //maxCacheNodes ,
                        false , // half keys?
                        false , // g_conf.m_titledbSav
                        &m_pc , // page cache ptr
                        true ) )// is titledb?
        return false;
    return true;

    // validate
    //return verify ( );
}
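// ---------------------------------------------------------------------------
// Illustrative sketch only: the sanity checks in Titledb::init() above just
// verify that makeKey() / getDocId() / getUrlHash48() round-trip a docid and
// a 48-bit url hash through the 96-bit titledb key. The snippet below shows
// the same round-trip with a hypothetical stand-in key type and a made-up bit
// layout; the real packing lives in Titledb.h and may differ.
#include <cassert>
#include <cstdint>

struct FakeTitleKey {      // hypothetical stand-in for key96_t
    uint32_t n1;           // high 32 bits
    uint64_t n0;           // low 64 bits
};

static FakeTitleKey fakeMakeKey ( int64_t docId , int64_t uh48 , bool isDel ) {
    // made-up packing: docid's low 15 bits in n0[49..63], urlhash48 in
    // n0[1..48], positive/negative flag in n0[0], rest of docid in n1
    FakeTitleKey k;
    k.n1 = (uint32_t)(docId >> 15);
    k.n0 = ((uint64_t)(docId & 0x7fff) << 49)
         | ((uint64_t)(uh48 & 0x0000ffffffffffffLL) << 1)
         | (isDel ? 0 : 1);
    return k;
}

static int64_t fakeGetDocId ( const FakeTitleKey *k ) {
    return ((int64_t)k->n1 << 15) | (int64_t)(k->n0 >> 49);
}

static int64_t fakeGetUrlHash48 ( const FakeTitleKey *k ) {
    return (int64_t)((k->n0 >> 1) & 0x0000ffffffffffffLL);
}

static void fakeKeyRoundTripTest ( ) {
    // mirrors the checks at the top of Titledb::init()
    int64_t uh48  = 0x1234567887654321LL & 0x0000ffffffffffffLL;
    int64_t docId = 123456789;
    FakeTitleKey k = fakeMakeKey ( docId , uh48 , false );
    assert ( fakeGetDocId     ( &k ) == docId );
    assert ( fakeGetUrlHash48 ( &k ) == uh48  );
}
// ---------------------------------------------------------------------------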
void Blaster::gotDoc2 ( void *state, TcpSocket *s ) {
    StateBD *st=(StateBD *)state;
    // bail if got cut off
    if ( s->m_readOffset == 0 ) {
        log("blaster: Lost the Request in gotDoc2");
        m_launched--;
        //No need to point p2
        // Free stateBD
        freeStateBD(st);
        return;
    }

    // . don't let TcpServer free m_buf when socket is recycled/closed
    // . we own it now and are responsible for freeing it
    // s->m_readBuf = NULL;

    long long now = gettimeofdayInMilliseconds();

    // So now after getting both docIds, get their contents
    char *reply1 = st->m_buf1 ;
    long  size1  = st->m_buf1Len;
    HttpMime mime1;
    mime1.set ( reply1 , size1 , NULL );
    char *content1    = reply1 + mime1.getMimeLen();
    long  content1Len = size1  - mime1.getMimeLen();
    unsigned long h = hash32 ( content1 , content1Len );

    // log msg
    if ( g_errno )
        logf(LOG_INFO,"blaster: got doc (%li) (%li ms) %s : %s",
             s->m_readOffset ,
             (long)(now - s->m_startTime) ,
             st->m_u2 ,
             mstrerror(g_errno) );
    else
        logf(LOG_INFO,"blaster: got doc (%li) (%li ms) "
             "(hash=%lx) %s",
             s->m_readOffset ,
             (long)(now - s->m_startTime) ,
             h ,
             st->m_u2 );

    if (m_verbose){
        log(LOG_WARN,"blaster: content1len=%li, Content1 is =%s",
            content1Len,content1);
        log(LOG_WARN,"\n");
    }

    char *reply2 = s->m_readBuf ;
    long  size2  = s->m_readOffset;
    HttpMime mime2;
    mime2.set ( reply2 , size2 , NULL );
    char *content2    = reply2 + mime2.getMimeLen();
    long  content2Len = size2  - mime2.getMimeLen();

    if (m_verbose)
        log(LOG_WARN,"blaster: content2len=%li, Content2 is =%s",
            content2Len,content2);

    // Now that we've got the contents, let's get the url links out
    // of these pages. Passing them to function getSearchLinks should
    // get the first x links found out.
    /*st->m_links1=(char *) mmalloc(200*MAX_URL_LEN,"Blaster3");
      st->m_links2=st->m_links1+100*MAX_URL_LEN;
      st->m_numLinks1=100;
      st->m_numLinks2=100;*/

    /*long numLinks1=getSearchLinks(content1,content1Len,
                                    st->m_links1,st->m_numLinks1);
      long numLinks2=getSearchLinks(content2,content2Len,
                                    st->m_links2,st->m_numLinks2);*/

    content1[content1Len]='\0';
    //short csEnum1= get_iana_charset(mime1.getCharset(),
    //                                mime1.getCharsetLen());
    /*if (csEnum1== csUnknown)
          log(LOG_DEBUG, "blaster: Unknown charset : %s",
              mime2.getCharset());*/

    Xml xml1;
    // assume utf8
    if (!xml1.set(content1, content1Len,
                  false, 0, false, TITLEREC_CURRENT_VERSION)){
        log(LOG_WARN,"blaster: Couldn't set XML1 Class in gotDoc2");
    }

    Links links1;
    Url parent;
    parent.set ( st->m_u1 );
    if (!links1.set(false ,  // useRelNoFollow
                    &xml1,
                    &parent, //mime1.getLocationUrl(), parent Url
                    false,   // setLinkHashes
                    NULL ,   // baseUrl
                    TITLEREC_CURRENT_VERSION, // version
                    0 ,      // niceness
                    false ,  // parent is permalink?
                    NULL )) { // oldLinks
        log(LOG_WARN,"blaster: Couldn't set Links Class in gotDoc2");
    }

    content2[content2Len]='\0';
    //short csEnum2= get_iana_charset(mime2.getCharset(),
    //                                mime2.getCharsetLen());
    /*if (csEnum2== csUnknown)
          log(LOG_DEBUG, "blaster: Unknown charset : %s",
              mime2.getCharset());*/

    Xml xml2;
    if (!xml2.set(content2, content2Len,
                  false, 0, false, TITLEREC_CURRENT_VERSION)){
        log(LOG_WARN,"blaster: Couldn't set XML2 Class in gotDoc2");
    }

    Links links2;
    parent.set(st->m_u2);
    if (!links2.set(0,       //siterec xml
                    &xml2,
                    &parent, //&st->m_u2,//mime2.getLocationUrl(),
                    false,
                    NULL,
                    TITLEREC_CURRENT_VERSION,
                    0,
                    false,
                    NULL)){
        log(LOG_WARN,"blaster: Couldn't set links2 Class in gotDoc2");
    }

    // put the hash of the sites into a hashtable, since we have
    // about 100 or so of them
    HashTableT<unsigned long, bool> urlHash;

    // put the urls from doc2 into the hashtable, but first check if
    // they are links to google or gigablast (for now). For msn and
    // yahoo we have to add other checks.
    char domain2[256];
    long dlen = 0;
    char *dom = getDomFast ( st->m_u2 , &dlen );
    if ( dom ) strncpy(domain2,dom,dlen);
    domain2[dlen]='\0';

    for (long i=0;i<links2.getNumLinks();i++){
        // The dots check if exactly google or gigablast are present
        // in the link
        char *ss=links2.getLink(i);
        char *p;
        p=strstr(ss,domain2);            if(p) continue;
        p=strstr(ss,"google.");          if(p) continue;
        p=strstr(ss,"cache:");           if(p) continue; // google's cache page
        p=strstr(ss,"gigablast.");       if(p) continue;
        p=strstr(ss,"web.archive.org");  if(p) continue; // older copies on gigablast
        p=strstr(ss,"search.yahoo.com"); if(p) continue; // from gigablast search
        p=strstr(ss,"search.msn.com");   if(p) continue; // from gigablast search
        p=strstr(ss,"s.teoma.com");      if(p) continue; // from gigablast search
        p=strstr(ss,"search.dmoz.org");  if(p) continue; // from gigablast search
        p=strstr(ss,"www.answers.com");  if(p) continue; // from gigablast search
        p=strstr(ss,"cc.msncache.com");  if(p) continue; // msn's cache page

        if (m_verbose)
            log(LOG_WARN,"blaster: link in Doc2=%s",
                links2.getLink(i));
        unsigned long h=hash32Lower_a(links2.getLink(i),
                                      links2.getLinkLen(i));
        // should i check for conflict? no, because it doesn't matter
        urlHash.addKey(h,1);
    }

    // now check if the urls from doc1 are in doc2. save the
    // ones that are not in there for later.
    /*long numUrlsToCheck=links2.getNumLinks();*/
    long numUrlsNotFound=0;
    /*if (numLinks1<numUrlsToCheck)
          numUrlsToCheck=numLinks1;*/
    char domain1[256];
    dlen = 0;
    dom = getDomFast ( st->m_u1 ,&dlen );
    if ( dom ) strncpy(domain1,dom,dlen);
    domain1[dlen]='\0';

    for (long i=0;i<links1.getNumLinks();i++){
        char *ss=links1.getLink(i);
        char *p;
        p=strstr(ss,domain1);            if(p) continue;
        p=strstr(ss,"google.");          if(p) continue;
        p=strstr(ss,"cache:");           if(p) continue; // google's cache page
        p=strstr(ss,"gigablast.");       if(p) continue;
        p=strstr(ss,"web.archive.org");  if(p) continue; // older copies on gigablast
        p=strstr(ss,"search.yahoo.com"); if(p) continue; // from gigablast search
        p=strstr(ss,"search.msn.com");   if(p) continue; // from gigablast search
        p=strstr(ss,"s.teoma.com");      if(p) continue; // from gigablast search
        p=strstr(ss,"search.dmoz.org");  if(p) continue; // from gigablast search
        p=strstr(ss,"www.answers.com");  if(p) continue; // from gigablast search
        p=strstr(ss,"cc.msncache.com");  if(p) continue; // msn's cache page

        if (m_verbose)
            log(LOG_WARN,"blaster: link in Doc1=%s",
                links1.getLink(i));
        unsigned long h=hash32Lower_a(links1.getLink(i),
                                      links1.getLinkLen(i));
        long slot= urlHash.getSlot(h);
        if(slot!=-1) continue;

        // if url is not present, get its doc.
        if (m_verbose || m_justDisplay)
            log(LOG_WARN,"blaster: NOT FOUND %s in %s",
                links1.getLink(i),domain2);
        numUrlsNotFound++;
        // Don't do anything else if we just have to display the urls
        if (m_justDisplay) continue;

        // now get the doc of these urls
        // initialize
        st->m_numUrlDocsReceived=0;

        StateBD2 *st2;
        try { st2 = new (StateBD2); }
        catch ( ... ) {
            g_errno = ENOMEM;
            log("blaster: Failed. "
                "Could not allocate %li bytes for query. "
                "Returning HTTP status of 500.",
                (long)sizeof(StateBD2));
            return;
        }
        mnew ( st2 , sizeof(StateBD2) , "Blaster4" );
        // Point to the big state
        st2->m_st=st;
        // Msg16 does 6 redirects, so I do 6 too
        st2->m_numRedirects=6;
        //st2->m_url.set(links1.getLink(i),links1.getLinkLen(i));
        st2->m_url = links1.getLink(i);

        // No need for a proxy ip here, since we are fetching
        // docs from different IPs. Faster this way
        bool status = g_httpServer.getDoc ( st2->m_url,     // url
                                            0,              // ip
                                            0 ,             // offset
                                            -1 ,            // size
                                            0 ,             // ifModifiedSince
                                            st2,            // state
                                            gotDocWrapper3, // callback
                                            60*1000,        // timeout
                                            0,              // proxy ip
                                            0,              // proxy port
                                            30*1024*1024,   // maxLen
                                            30*1024*1024);  // maxOtherLen
        // continue if it blocked
        if ( ! status ) continue;
        // If not blocked, there is an error.
        st->m_numUrlDocsReceived++;
    }

    st->m_numUrlDocsSent=numUrlsNotFound;

    // There might have been an error while sending the docs, so
    // check for that here
    if ( st->m_numUrlDocsReceived > 0 &&
         st->m_numUrlDocsReceived <= st->m_numUrlDocsSent ){
        log(LOG_WARN,"blaster: %li docs could not be sent due to "
            "error",st->m_numUrlDocsReceived);
        m_launched--;
        freeStateBD(st);
        return;
    }

    if (numUrlsNotFound==0){
        // job done for this pair
        log(LOG_WARN,"blaster: All urls from %s found in %s",
            domain1,domain2);
        m_launched--;
        // Free stateBD
        freeStateBD(st);
        return;
    }

    log(LOG_WARN,"blaster: %li urls from %s Not found in %s",
        numUrlsNotFound,domain1,domain2);

    if(m_justDisplay){
        m_launched--;
        // Free stateBD
        freeStateBD(st);
    }
    return;
}
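// ---------------------------------------------------------------------------
// The two loops in gotDoc2() above boil down to a set difference: every link
// from doc2 is hashed into urlHash, then each doc1 link whose hash is absent
// is counted as "not found". A minimal STL sketch of that idea follows; it is
// illustrative only -- the real code uses HashTableT plus hash32Lower_a() and
// also filters out search-engine/cache links before hashing.
#include <string>
#include <unordered_set>
#include <vector>

static std::vector<std::string> linksMissingFromDoc2 (
        const std::vector<std::string> &links1 ,
        const std::vector<std::string> &links2 ) {
    // index every doc2 link once
    std::unordered_set<std::string> seen ( links2.begin() , links2.end() );
    // collect doc1 links that doc2 does not have
    std::vector<std::string> missing;
    for ( const std::string &u : links1 )
        if ( seen.find(u) == seen.end() )
            missing.push_back ( u );
    return missing;
}
// ---------------------------------------------------------------------------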
void handleRequest22 ( UdpSlot *slot , long netnice ) {
    // shortcut
    UdpServer *us = &g_udpServer;
    // get the request
    Msg22Request *r = (Msg22Request *)slot->m_readBuf;

    // get this
    //char *coll = g_collectiondb.getCollName ( r->m_collnum );

    // sanity check
    long requestSize = slot->m_readBufSize;
    if ( requestSize < r->getMinSize() ) {
        log("db: Got bad request size of %li bytes for title record. "
            "Need at least 28.", requestSize );
        us->sendErrorReply ( slot , EBADREQUESTSIZE );
        return;
    }

    // get base, returns NULL and sets g_errno to ENOCOLLREC on error
    RdbBase *tbase;
    if ( ! (tbase=getRdbBase(RDB_TITLEDB,r->m_collnum) ) ) {
        log("db: Could not get title rec in collection # %li "
            "because rdbbase is null.", (long)r->m_collnum);
        g_errno = EBADENGINEER;
        us->sendErrorReply ( slot , g_errno );
        return;
    }

    // overwrite what is in there so niceness conversion algo works
    r->m_niceness = netnice;

    // if just checking tfndb, do not do the cache lookup in clusterdb
    if ( r->m_justCheckTfndb ) r->m_maxCacheAge = 0;

    // keep track of stats
    //if (r->m_justCheckTfndb)
    //    g_tfndb.getRdb()->readRequestGet(requestSize);
    // else
    g_titledb.getRdb()->readRequestGet (requestSize);

    // breathe
    QUICKPOLL ( r->m_niceness);

    // sanity check
    if ( r->m_collnum < 0 ) { char *xx=NULL;*xx=0; }

    // make the state now
    State22 *st ;
    try { st = new (State22); }
    catch ( ... ) {
        g_errno = ENOMEM;
        log("query: Msg22: new(%i): %s",
            (int)sizeof(State22), mstrerror(g_errno));
        us->sendErrorReply ( slot , g_errno );
        return;
    }
    mnew ( st , sizeof(State22) , "Msg22" );

    // store ptr to the msg22request
    st->m_r = r;
    // save for sending back reply
    st->m_slot = slot;

    // then tell slot not to free it since m_r references it!
    // so we'll have to free it when we destroy State22
    st->m_slotAllocSize = slot->m_readBufMaxSize;
    st->m_slotReadBuf   = slot->m_readBuf;
    slot->m_readBuf = NULL;

    // . make the keys for getting recs from tfndb
    // . url recs map docid to the title file # that contains the titleRec
    //key_t uk1 ;
    //key_t uk2 ;

    // . if docId was explicitly specified...
    // . we may get multiple tfndb recs
    if ( ! r->m_url[0] ) {
        // there are no del bits in tfndb
        //uk1 = g_tfndb.makeMinKey ( r->m_docId );
        //uk2 = g_tfndb.makeMaxKey ( r->m_docId );
        st->m_docId1 = r->m_docId;
        st->m_docId2 = r->m_docId;
    }

    // but if we are requesting an available docid, it might be taken
    // so try the range
    if ( r->m_getAvailDocIdOnly ) {
        long long pd = r->m_docId;
        long long d1 = g_titledb.getFirstProbableDocId ( pd );
        long long d2 = g_titledb.getLastProbableDocId  ( pd );
        // sanity - bad url with bad subdomain?
        if ( pd < d1 || pd > d2 ) { char *xx=NULL;*xx=0; }
        // make sure we get a decent sample in titledb then in
        // case the docid we wanted is not available
        st->m_docId1 = d1;
        st->m_docId2 = d2;
    }

    // . otherwise, url was given, like from Msg15
    // . we may get multiple tfndb recs
    if ( r->m_url[0] ) {
        long dlen = 0;
        // this causes ip based urls to be inconsistent with the call
        // to getProbableDocId(url) below
        char *dom = getDomFast ( r->m_url , &dlen );
        // bogus url?
        if ( ! dom ) {
            log("msg22: got bad url in request: %s",r->m_url);
            g_errno = EBADURL;
            us->sendErrorReply ( slot , g_errno );
            mdelete ( st , sizeof(State22) , "Msg22" );
            delete ( st );
            return;
        }
        long long pd = g_titledb.getProbableDocId (r->m_url,dom,dlen);
        long long d1 = g_titledb.getFirstProbableDocId ( pd );
        long long d2 = g_titledb.getLastProbableDocId  ( pd );
        // sanity - bad url with bad subdomain?
        if ( pd < d1 || pd > d2 ) { char *xx=NULL;*xx=0; }
        // there are no del bits in tfndb
        //uk1 = g_tfndb.makeMinKey ( d1 );
        //uk2 = g_tfndb.makeMaxKey ( d2 );
        // store these
        st->m_pd     = pd;
        st->m_docId1 = d1;
        st->m_docId2 = d2;
        st->m_uh48   = hash64b ( r->m_url ) & 0x0000ffffffffffffLL;
    }

    QUICKPOLL ( r->m_niceness );

    /*
    // shortcut
    Rdb *tdb = g_titledb.getRdb();

    // init this
    st->m_tfn2 = -1;
    // skip tfndb lookup if we can. saves some time.
    if ( g_conf.m_readOnlyMode &&
         // must not be a *url* lookup, it must be a docid lookup
         ! r->m_url[0] &&
         // tree must be empty too i guess
         tdb->getTree()->getNumUsedNodes() ==0 ) {
        // the RdbBase contains the BigFiles for tfndb
        RdbBase *base = tdb->m_bases[r->m_collnum];
        // can only have one titledb file
        if ( base->getNumFiles() == 1 ) {
            // now we can get RdbBase
            st->m_tfn2 = base->m_fileIds2[0];
            // sanity check
            if ( st->m_tfn2 < 0 ) { char *xx = NULL; *xx = 0; }
        }
    }

    // check the tree for this docid
    RdbTree *tt = tdb->getTree();
    // make titledb keys
    key_t startKey = g_titledb.makeFirstKey ( st->m_docId1 );
    key_t endKey   = g_titledb.makeLastKey  ( st->m_docId2 );
    long n = tt->getNextNode ( r->m_collnum , startKey );
    // there should only be one match, one titlerec per docid!
    for ( ; n >= 0 ; n = tt->getNextNode ( n ) ) {
        // break if collnum does not match. we exceeded our tree range.
        if ( tt->getCollnum ( n ) != r->m_collnum ) break;
        // get the key of this node
        key_t k = *(key_t *)tt->getKey(n);
        // if passed limit, break out, no match
        if ( k > endKey ) break;
        // if we had a url make sure uh48 matches
        if ( r->m_url[0] ) {
            // get it
            long long uh48 = g_titledb.getUrlHash48(&k);
            // sanity check
            if ( st->m_uh48 == 0 ) { char *xx=NULL;*xx=0; }
            // we must match this exactly
            if ( uh48 != st->m_uh48 ) continue;
        }
        // . if we matched a negative key, then skip
        // . just break out here and enter the normal logic
        // . it should load tfndb and find that it is not in tfndb
        //   because when you add a negative key to titledb in
        //   Rdb::addList, it adds a negative rec to tfndb immediately
        // . NO! because we add the negative key to the tree when we
        //   delete the old titledb rec, then we add the new one!
        //   when a negative key is added Rdb::addRecord() removes
        //   the positive key (and vice versa) from the tree.
        if ( KEYNEG((char *)&k) ) continue;
        // if just checking for its existence, we are done
        if ( r->m_justCheckTfndb ) {
            us->sendReply_ass ( NULL,0,NULL,0,slot);
            // don't forget to free the state
            mdelete ( st , sizeof(State22) , "Msg22" );
            delete ( st );
            return;
        }
        // ok, we got a match, return it
        char *data     = tt->getData     ( n );
        long  dataSize = tt->getDataSize ( n );
        // weird!
        if ( dataSize == 0 ) { char *xx=NULL;*xx=0; }
        // send the whole rec back
        long need = 12 + 4 + dataSize;
        // will this copy it? not!
        char *buf = (char *)mmalloc ( need , "msg22t" );
        if ( ! buf ) {
            us->sendErrorReply ( slot , g_errno );
            mdelete ( st , sizeof(State22) , "Msg22" );
            delete ( st );
            return;
        }
        // log it
        if ( g_conf.m_logDebugSpider )
            logf(LOG_DEBUG,"spider: found %s in titledb tree",
                 r->m_url);
        // store in the buf for sending
        char *p = buf;
        // store key
        *(key_t *)p = k;        p += sizeof(key_t);
        // then dataSize
        *(long *)p = dataSize;  p += 4;
        // then the data
        memcpy ( p , data , dataSize ); p += dataSize;
        // send off the record
        us->sendReply_ass (buf, need,buf, need,slot);
        // don't forget to free the state
        mdelete ( st , sizeof(State22) , "Msg22" );
        delete ( st );
        return;
    }

    // if we did not need to consult tfndb cuz we only have one file
    if ( st->m_tfn2 >= 0 ) {
        gotUrlListWrapper ( st , NULL , NULL );
        return;
    }

    // . get the list of url recs for this docid range
    // . this should not block, tfndb SHOULD all be in memory all the time
    // . use 500 million for min recsizes to get all in range
    // . no, using 500MB causes problems for RdbTree::getList, so use
    //   100k. how many recs can there be?
    if ( ! st->m_msg5.getList ( RDB_TFNDB         ,
                                coll              ,
                                &st->m_ulist      ,
                                uk1               , // startKey
                                uk2               , // endKey
                                // use 0x7fffffff precisely because it
                                // will determine exactly how long the
                                // tree list needs to allocate in Msg5.cpp
                                0x7fffffff        , // minRecSizes
                                true              , // includeTree?
                                false             , // addToCache?
                                0                 , // max cache age
                                0                 , // startFileNum
                                -1                , // numFiles (-1 =all)
                                st                ,
                                gotUrlListWrapper ,
                                r->m_niceness     ,
                                true              ))// error correction?
        return ;

    // we did not block
    gotUrlListWrapper ( st , NULL , NULL );
    }

    static void gotTitleList ( void *state , RdbList *list , Msg5 *msg5 ) ;

    void gotUrlListWrapper ( void *state , RdbList *list , Msg5 *msg5 ) {
    // shortcuts
    State22   *st = (State22 *)state;
    UdpServer *us = &g_udpServer;

    // bail on error
    if ( g_errno ) {
        log("db: Had error getting info from tfndb: %s.",
            mstrerror(g_errno));
        log("db: uk1.n1=%li n0=%lli uk2.n1=%li n0=%lli "
            "d1=%lli d2=%lli.",
            ((key_t *)st->m_msg5.m_startKey)->n1 ,
            ((key_t *)st->m_msg5.m_startKey)->n0 ,
            ((key_t *)st->m_msg5.m_endKey)->n1 ,
            ((key_t *)st->m_msg5.m_endKey)->n0 ,
            st->m_docId1 ,
            st->m_docId2 );
        us->sendErrorReply ( st->m_slot , g_errno );
        mdelete ( st , sizeof(State22) , "Msg22" );
        delete ( st );
        return;
    }

    // shortcuts
    RdbList      *ulist = &st->m_ulist;
    Msg22Request *r     = st->m_r;
    char         *coll  = g_collectiondb.getCollName ( r->m_collnum );

    // point to top just in case
    ulist->resetListPtr();

    // get base, returns NULL and sets g_errno to ENOCOLLREC on error
    RdbBase *tbase = getRdbBase(RDB_TITLEDB,coll);

    // set probable docid
    long long pd = 0LL;
    if ( r->m_url[0] ) {
        pd = g_titledb.getProbableDocId(r->m_url);
        // sanity
        if ( pd != st->m_pd ) { char *xx=NULL;*xx=0; }
    }

    // . these are both meant to be available docids
    // . if ad2 gets exhausted we use ad1
    long long ad1 = st->m_docId1;
    long long ad2 = pd;

    long tfn = -1;

    // sanity check. make sure did not load from tfndb if did not need to
    if ( ! ulist->isExhausted() && st->m_tfn2 >= 0 ) {char *xx=NULL;*xx=0;}

    // if only one titledb file and none in memory use it
    if ( st->m_tfn2 >= 0 ) tfn = st->m_tfn2;

    // we may have multiple tfndb recs but we should NEVER have to read
    // multiple titledb files...
    for ( ; ! ulist->isExhausted() ; ulist->skipCurrentRecord() ) {
        // breathe
        QUICKPOLL ( r->m_niceness );
        // get first rec
        key_t k = ulist->getCurrentKey();
        // . skip negative keys
        // . seems to happen when we have tfndb in the tree...
        if ( KEYNEG((char *)&k) ) continue;
        // if we have a url and no docid, we gotta check uh48!
        if ( r->m_url[0] && g_tfndb.getUrlHash48(&k)!=st->m_uh48){
            // get docid of that guy
            long long dd = g_tfndb.getDocId(&k);
            // if matches avail docid, inc it
            if ( dd == ad1 ) ad1++;
            if ( dd == ad2 ) ad2++;
            // try next tfndb key
            continue;
        }
        // . get file num this rec is stored in
        // . this is updated right after the file num is merged by
        //   scanning all records in tfndb. this is very quick if all
        //   of tfndb is in memory, otherwise, it might take a few
        //   seconds. update call done in RdbMerge::incorporateMerge().
        tfn = g_tfndb.getTfn ( &k );
        // i guess we got a good match!
        break;
    }

    // sanity check. 255 used to mean in spiderdb or in tree
    if ( tfn >= 255 ) { char *xx=NULL;*xx=0; }

    // maybe no available docid if we breached our range
    if ( ad1 >= pd )          ad1 = 0LL;
    if ( ad2 > st->m_docId2 ) ad2 = 0LL;
    // get best
    long long ad = ad2;
    // but wrap around if we need to
    if ( ad == 0LL ) ad = ad1;

    // breathe
    QUICKPOLL ( r->m_niceness);

    // . log if different
    // . if our url rec was in there, this could still be different
    //   if there was another url rec in there with the same docid and
    //   a different extension, but with a tfn of 255, meaning that it
    //   is just in spiderdb and not in titledb yet. so it hasn't been
    //   assigned a permanent docid...
    // . another way "ad" may be different now is from the old bug which
    //   did not chain the docid properly because it limited the docid
    //   chaining to one titleRec file. so conceivably we can have
    //   different docs sharing the same docids, but with different
    //   url hash extensions. for instance, on host #9 we have:
    //   00f3b2ff63aec3a9 docId=261670033643 e=0x58 tfn=117 clean=0 half=0
    //   00f3b2ff63af66c9 docId=261670033643 e=0x6c tfn=217 clean=0 half=0
    // . Msg16 will only use the avail docid if the titleRec is not found
    if ( r->m_url[0] && pd != ad ) {
        //log(LOG_INFO,"build: Docid %lli collided. %s Changing "
        //
        // http://www.airliegardens.org/events.asp?dt=2&date=8/5/2011
        //
        // COLLIDES WITH
        //
        // http://www.bbonline.com/i/chicago.html
        //
        // collision alert!
        log("spider: Docid %lli collided. %s Changing "
            "to %lli.", r->m_docId , r->m_url , ad );
        // debug this for now
        //char *xx=NULL;*xx=0;
    }

    // remember it
    st->m_availDocId = ad;

    // if tfn is -1 then it was not in titledb
    if ( tfn == -1 ) {
        // store docid in reply
        char *p = st->m_slot->m_tmpBuf;
        // send back the available docid
        *(long long *)p = ad;
        // send it
        us->sendReply_ass ( p , 8 , p , 8 , st->m_slot );
        // don't forget to free state
        mdelete ( st , sizeof(State22) , "Msg22" );
        delete ( st );
        return;
    }

    // sanity
    if ( tfn < 0 ) { char *xx=NULL;*xx=0; }

    // breathe
    QUICKPOLL ( r->m_niceness );

    // ok, if just "checking tfndb" no need to go further
    if ( r->m_justCheckTfndb ) {
        // send back a good reply (empty means found!)
        us->sendReply_ass ( NULL,0,NULL,0,st->m_slot);
        // don't forget to free the state
        mdelete ( st , sizeof(State22) , "Msg22" );
        delete ( st );
        return;
    }

    // . compute the file scan range
    // . tfn is now equivalent to Rdb's id2, a secondary file id, it
    //   follows the hyphen in "titledb0001-023.dat"
    // . default to just scan the root file AND the tree, cuz we're
    //   assuming restrictToRoot was set to true so we did not get a tfndb
    //   list
    // . even if a file number is given, always check the tree in case
    //   it got re-spidered
    // . shit, but we can still miss it if it gets dumped right after
    //   our thread is spawned, in which case we'd fall back to the old
    //   version. no. because if its in the tree now we get it before
    //   spawning a thread. there is no blocking. TRICKY. so if it is in
    //   the tree at this point we'll get it, but may end up scanning the
    //   file with the older version of the doc... not too bad.
    long startFileNum = tbase->getFileNumFromId2 ( tfn );

    // if tfn refers to a missing titledb file...
    if ( startFileNum < 0 ) {
        if ( r->m_url[0] ) log("db: titledb missing url %s",r->m_url);
        else               log("db: titledb missing docid %lli",
                               r->m_docId);
        us->sendErrorReply ( st->m_slot,ENOTFOUND );
        mdelete ( st , sizeof(State22) , "Msg22" );
        delete ( st );
        return ;
    }

    // save this
    st->m_tfn = tfn;
    */

    // make the cacheKey ourself, since Msg5 would make the key wrong
    // since it would base it on startFileNum and numFiles
    key_t cacheKey ; cacheKey.n1 = 0; cacheKey.n0 = r->m_docId;

    // make titledb keys
    key_t startKey = g_titledb.makeFirstKey ( st->m_docId1 );
    key_t endKey   = g_titledb.makeLastKey  ( st->m_docId2 );

    // . load the list of title recs from disk now
    // . our file range should be solid
    // . use 500 million for min recsizes to get all in range
    if ( ! st->m_msg5.getList ( RDB_TITLEDB   ,
                                r->m_collnum  ,
                                &st->m_tlist  ,
                                startKey      , // startKey
                                endKey        , // endKey
                                500000000     , // minRecSizes
                                true          , // includeTree
                                false         , //r->m_addToCache // addToCache?
                                0             , //r->m_maxCacheAge // max cache age
                                0             , //startFileNum
                                -1            , // numFiles
                                st            , // state
                                gotTitleList  ,
                                r->m_niceness ,
                                true          , // do error correct?
                                &cacheKey     ,
                                0             , // retry num
                                -1            , // maxRetries
                                true          , // compensate for merge
                                -1LL          , // sync point
                                &st->m_msg5b  ) )
        return ;

    // we did not block, nice... in cache?
    gotTitleList ( st , NULL , NULL );
}
void handleRequest22 ( UdpSlot *slot , int32_t netnice ) {
    // shortcut
    UdpServer *us = &g_udpServer;
    // get the request
    Msg22Request *r = (Msg22Request *)slot->m_readBuf;

    // sanity check
    int32_t requestSize = slot->m_readBufSize;
    if ( requestSize < r->getMinSize() ) {
        log("db: Got bad request size of %" PRId32" bytes for title record. "
            "Need at least 28.", requestSize );
        log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.",
            __FILE__, __func__, __LINE__);
        us->sendErrorReply ( slot , EBADREQUESTSIZE );
        return;
    }

    // get base, returns NULL and sets g_errno to ENOCOLLREC on error
    RdbBase *tbase = getRdbBase( RDB_TITLEDB, r->m_collnum );
    if ( ! tbase ) {
        log("db: Could not get title rec in collection # %" PRId32
            " because rdbbase is null.", (int32_t)r->m_collnum);
        g_errno = EBADENGINEER;
        log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.",
            __FILE__, __func__, __LINE__);
        us->sendErrorReply ( slot , g_errno );
        return;
    }

    // overwrite what is in there so niceness conversion algo works
    r->m_niceness = netnice;

    // if just checking tfndb, do not do the cache lookup in clusterdb
    if ( r->m_justCheckTfndb ) {
        r->m_maxCacheAge = 0;
    }

    g_titledb.getRdb()->readRequestGet (requestSize);

    // breathe
    QUICKPOLL ( r->m_niceness);

    // sanity check
    if ( r->m_collnum < 0 ) { char *xx=NULL;*xx=0; }

    // make the state now
    State22 *st ;
    try { st = new (State22); }
    catch ( ... ) {
        g_errno = ENOMEM;
        log("query: Msg22: new(%" PRId32"): %s",
            (int32_t)sizeof(State22), mstrerror(g_errno));
        log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.",
            __FILE__, __func__, __LINE__);
        us->sendErrorReply ( slot , g_errno );
        return;
    }
    mnew ( st , sizeof(State22) , "Msg22" );

    // store ptr to the msg22request
    st->m_r = r;
    // save for sending back reply
    st->m_slot = slot;

    // then tell slot not to free it since m_r references it!
    // so we'll have to free it when we destroy State22
    st->m_slotAllocSize = slot->m_readBufMaxSize;
    st->m_slotReadBuf   = slot->m_readBuf;
    slot->m_readBuf = NULL;

    // . if docId was explicitly specified...
    // . we may get multiple tfndb recs
    if ( ! r->m_url[0] ) {
        st->m_docId1 = r->m_docId;
        st->m_docId2 = r->m_docId;
    }

    // but if we are requesting an available docid, it might be taken
    // so try the range
    if ( r->m_getAvailDocIdOnly ) {
        int64_t pd = r->m_docId;
        int64_t d1 = g_titledb.getFirstProbableDocId ( pd );
        int64_t d2 = g_titledb.getLastProbableDocId  ( pd );
        // sanity - bad url with bad subdomain?
        if ( pd < d1 || pd > d2 ) { char *xx=NULL;*xx=0; }
        // make sure we get a decent sample in titledb then in
        // case the docid we wanted is not available
        st->m_docId1 = d1;
        st->m_docId2 = d2;
    }

    // . otherwise, url was given, like from Msg15
    // . we may get multiple tfndb recs
    if ( r->m_url[0] ) {
        int32_t dlen = 0;
        // this causes ip based urls to be inconsistent with the call
        // to getProbableDocId(url) below
        char *dom = getDomFast ( r->m_url , &dlen );
        // bogus url?
        if ( ! dom ) {
            log("msg22: got bad url in request: %s from "
                "hostid %" PRId32" for msg22 call ",
                r->m_url,slot->m_host->m_hostId);
            g_errno = EBADURL;
            log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.",
                __FILE__, __func__, __LINE__);
            us->sendErrorReply ( slot , g_errno );
            mdelete ( st , sizeof(State22) , "Msg22" );
            delete ( st );
            return;
        }
        int64_t pd = g_titledb.getProbableDocId (r->m_url,dom,dlen);
        int64_t d1 = g_titledb.getFirstProbableDocId ( pd );
        int64_t d2 = g_titledb.getLastProbableDocId  ( pd );
        // sanity - bad url with bad subdomain?
        if ( pd < d1 || pd > d2 ) { char *xx=NULL;*xx=0; }
        // store these
        st->m_pd     = pd;
        st->m_docId1 = d1;
        st->m_docId2 = d2;
        st->m_uh48   = hash64b ( r->m_url ) & 0x0000ffffffffffffLL;
    }

    QUICKPOLL ( r->m_niceness );

    // make the cacheKey ourself, since Msg5 would make the key wrong
    // since it would base it on startFileNum and numFiles
    key_t cacheKey ; cacheKey.n1 = 0; cacheKey.n0 = r->m_docId;

    // make titledb keys
    key_t startKey = g_titledb.makeFirstKey ( st->m_docId1 );
    key_t endKey   = g_titledb.makeLastKey  ( st->m_docId2 );

    // . load the list of title recs from disk now
    // . our file range should be solid
    // . use 500 million for min recsizes to get all in range
    if ( ! st->m_msg5.getList ( RDB_TITLEDB   ,
                                r->m_collnum  ,
                                &st->m_tlist  ,
                                startKey      , // startKey
                                endKey        , // endKey
                                500000000     , // minRecSizes
                                true          , // includeTree
                                false         , //r->m_addToCache // addToCache?
                                0             , //r->m_maxCacheAge // max cache age
                                0             , //startFileNum
                                -1            , // numFiles
                                st            , // state
                                gotTitleList  ,
                                r->m_niceness ,
                                true          , // do error correct?
                                &cacheKey     ,
                                0             , // retry num
                                -1            , // maxRetries
                                true          , // compensate for merge
                                -1LL          ) ) // sync point
        return ;

    // we did not block, nice... in cache?
    gotTitleList ( st , NULL , NULL );
}
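// ---------------------------------------------------------------------------
// Sketch of the availability scan that the gotTitleList() callback (not shown
// in this section) performs over [st->m_docId1, st->m_docId2]: prefer the
// probable docid and otherwise walk the candidate range for the first docid
// that is free. Illustrative only -- the function name, range bounds and the
// "taken" test below are placeholders; the real range comes from
// getFirstProbableDocId()/getLastProbableDocId() and the real check inspects
// the titledb list that Msg5 just loaded.
#include <cstdint>
#include <functional>

static int64_t pickAvailDocIdSketch ( int64_t firstDocId ,
                                      int64_t lastDocId  ,
                                      int64_t preferred  ,
                                      const std::function<bool(int64_t)> &isTaken ) {
    // no collision: keep the probable docid
    if ( ! isTaken ( preferred ) ) return preferred;
    // otherwise take the first free docid in the probable range
    for ( int64_t d = firstDocId ; d <= lastDocId ; d++ )
        if ( d != preferred && ! isTaken ( d ) ) return d;
    // 0 means no docid available in this range
    return 0;
}
// ---------------------------------------------------------------------------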
// . returns true if all done!
// . returns false if still doing stuff
bool Test::injectLoop ( ) {
    long  dlen ;
    char *dom ;
    long  fakeIp ;

 loop:
    // advance to next url
    for ( ; m_urlPtr < m_urlEnd && ! *m_urlPtr ; m_urlPtr++ ) ;

    // all done?
    if ( m_urlPtr >= m_urlEnd ) {
        // flush em out
        if ( ! flushMsg4Buffers ( this , injectedWrapper ) )
            return false;
        // note it
        m_isAdding = false;
        // all done
        return true;
    }

    // error means all done
    if ( m_errno ) {
        m_isAdding = false;
        return true;
    }

    // point to it
    char *u = m_urlPtr;
    // advance to point to the next url for the next loop!
    for ( ; m_urlPtr < m_urlEnd && *m_urlPtr ; m_urlPtr++ ) ;

    // hash it
    long long h = hash64b ( u );

    // dedup it lest we freeze up and stopIt() never gets called because
    // m_urlsAdded is never decremented all the way to zero in Spider.cpp
    if ( m_dt.isInTable ( &h ) ) goto loop;

    // add it. return true with g_errno set on error
    if ( ! m_dt.addKey ( &h ) ) goto hadError;

    // make the SpiderRequest from it
    m_sreq.reset();
    // url
    strcpy ( m_sreq.m_url , u );
    // get domain of url
    dom = getDomFast ( m_sreq.m_url , &dlen );
    // make a fake ip
    fakeIp = 0x123456;
    // use domain if we got that
    if ( dom && dlen ) fakeIp = hash32 ( dom , dlen );
    // first ip is fake
    m_sreq.m_firstIp = fakeIp; // 0x123456;
    // these too
    m_sreq.m_domHash32  = fakeIp;
    m_sreq.m_hostHash32 = fakeIp;
    m_sreq.m_siteHash32 = fakeIp;
    m_sreq.m_probDocId  = g_titledb.getProbableDocId( m_sreq.m_url );
    // this crap is fake
    m_sreq.m_isInjecting = 1;
    // use test-spider subdir for storing pages and spider times?
    if ( g_conf.m_testSpiderEnabled ) m_sreq.m_useTestSpiderDir = 1;
    // use this later
    m_sreq.m_hasContent = 0;
    // injected requests use this as the spider time i guess
    // so we can sort them by this
    m_sreq.m_addedTime = ++s_count;
    // no, because to compute XmlDoc::m_min/maxPubDate we need this to
    // be valid for our test run.. no no we will fix it to be
    // basically 2 days before spider time in the code...
    //m_sreq.m_addedTime = spiderTime;
    m_sreq.m_fakeFirstIp = 1;

    // make the key (parentDocId=0)
    m_sreq.setKey ( fakeIp, 0LL , false );

    // test it
    if ( g_spiderdb.getFirstIp(&m_sreq.m_key) != fakeIp ) {
        char *xx=NULL;*xx=0;}

    // sanity check. check for http(s)://
    if ( m_sreq.m_url[0] != 'h' ) { char *xx=NULL;*xx=0; }

    // reset this
    g_errno = 0;

    // count it
    m_urlsAdded++;

    // note it
    //log("crazyout: %s",m_sreq.m_url );
    logf(LOG_DEBUG,"spider: injecting test url %s",m_sreq.m_url);

    // the receiving end will realize that we are injecting into the test
    // collection and use the "/test/" subdir to load the file
    // "ips.txt" to do our ip lookups, and search for any downloads in
    // that subdirectory as well.
    if ( ! m_msg4.addMetaList ( (char *)&m_sreq     ,
                                m_sreq.getRecSize() ,
                                m_coll              ,
                                NULL                ,
                                injectedWrapper     ,
                                MAX_NICENESS        ,
                                RDB_SPIDERDB        ) )
        // return false if blocked
        return false;

    // error?
    if ( g_errno ) {
        // jump down here from above on error
    hadError:
        // save it
        m_errno = g_errno;
        // flag it
        m_isAdding = false;
        // note it
        log("test: inject had error: %s",mstrerror(g_errno));
        // stop, we are all done!
        return true;
    }

    // add the next spider request
    goto loop;
}
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
    // . get fields from cgi field of the requested url
    // . get the search query
    long  urlLen = 0;
    char *url = r->getString ( "u" , &urlLen , NULL /*default*/);

    // see if they provided a url of a file of urls if they did not
    // provide a url to add directly
    //bool isAdmin = g_collectiondb.isAdmin ( r , s );
    bool  isAdmin = r->getIsLocal();
    long  ufuLen  = 0;
    char *ufu     = NULL;
    if ( isAdmin )
        // get the url of a file of urls (ufu)
        ufu = r->getString ( "ufu" , &ufuLen , NULL );

    // can't be too long, that's obnoxious
    if ( urlLen > MAX_URL_LEN || ufuLen > MAX_URL_LEN ) {
        g_errno = EBUFTOOSMALL;
        g_msg = " (error: url too long)";
        return g_httpServer.sendErrorReply(s,500,"url too long");
    }

    // get the collection
    long  collLen = 0;
    char *coll = r->getString("c",&collLen);
    if ( ! coll || ! coll[0] ) {
        //coll = g_conf.m_defaultColl;
        coll = g_conf.getDefaultColl( r->getHost(), r->getHostLen() );
        collLen = gbstrlen(coll);
    }

    // get collection rec
    CollectionRec *cr = g_collectiondb.getRec ( coll );
    // bitch if no collection rec found
    if ( ! cr ) {
        g_errno = ENOCOLLREC;
        g_msg = " (error: no collection)";
        return g_httpServer.sendErrorReply(s,500,"no coll rec");
    }

    // . make sure the ip is not banned
    // . we may also have an exclusive list of IPs for private collections
    if ( ! cr->hasSearchPermission ( s ) ) {
        g_errno = ENOPERM;
        g_msg = " (error: permission denied)";
        return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
    }

    // make a new state
    State1 *st1 ;
    try { st1 = new (State1); }
    catch ( ... ) {
        g_errno = ENOMEM;
        log("PageAddUrl: new(%i): %s",
            (int)sizeof(State1),mstrerror(g_errno));
        return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
    }
    mnew ( st1 , sizeof(State1) , "PageAddUrl" );

    // save socket and isAdmin
    st1->m_socket  = s;
    st1->m_isAdmin = isAdmin;

    // assume no url buf yet, set below
    //st1->m_ubuf      = NULL;
    //st1->m_ubufAlloc = NULL;
    //st1->m_metaList  = NULL;

    // save the url
    st1->m_url[0] = '\0';
    if ( url ) {
        // normalize and add www. if it needs it
        Url uu;
        uu.set ( url , gbstrlen(url) , true );
        // remove >'s i guess and store in st1->m_url[] buffer
        st1->m_urlLen=cleanInput ( st1->m_url,
                                   MAX_URL_LEN,
                                   uu.getUrl(),
                                   uu.getUrlLen() );
        // point to that as the url "buf" to add
        //st1->m_ubuf      = st1->m_url;
        //st1->m_ubufSize  = urlLen;
        //st1->m_ubufAlloc = NULL; // do not free it!
    }

    // save the "ufu" (url of file of urls)
    st1->m_ufu[0] = '\0';
    st1->m_ufuLen = ufuLen;
    if ( ufu && ufuLen > 0 ) memcpy ( st1->m_ufu , ufu , ufuLen );
    st1->m_ufu[ufuLen] = '\0';

    st1->m_doTuringTest = cr->m_doTuringTest;
    char *username = g_users.getUsername(r);
    if(username) strcpy(st1->m_username,username);
    //st1->m_user = g_pages.getUserType ( s , r );

    st1->m_spiderLinks = true;
    st1->m_strip = true;
    //st1->m_raw = r->getLong("raw",0);

    // init state2
    for ( long i = 0; i < 5; i++ ){
        st1->m_state2[i].m_buf       = NULL;
        st1->m_state2[i].m_bufLen    = 0;
        st1->m_state2[i].m_bufMaxLen = 0;
    }

    // save the collection name in the State1 class
    if ( collLen > MAX_COLL_LEN ) collLen = MAX_COLL_LEN;
    strncpy ( st1->m_coll , coll , collLen );
    st1->m_coll [ collLen ] = '\0';

    // assume they answered turing test correctly
    st1->m_goodAnswer = true;

    // if addurl is turned off, just print "disabled" msg
    if ( ! g_conf.m_addUrlEnabled ) return sendReply ( st1 , false );
    // can also be turned off in the collection rec
    if ( ! cr->m_addUrlEnabled    ) return sendReply ( st1 , false );
    // or if in read-only mode
    if ( g_conf.m_readOnlyMode    ) return sendReply ( st1 , false );

    // cannot add if another Msg10 from here is still in progress
    if ( s_inprogress ) return sendReply ( st1 , true );

    // use now as the spiderTime

    // get ip of submitter
    //unsigned long h = ipdom ( s->m_ip );
    // . use top 2 bytes now, some isps have large blocks
    // . if this causes problems, then they can do pay for inclusion
    unsigned long h = iptop ( s->m_ip );

    long  codeLen;
    char* code = r->getString("code", &codeLen);
    if(g_autoBan.hasCode(code, codeLen, s->m_ip)) {
        long  uipLen = 0;
        char* uip = r->getString("uip",&uipLen);
        long  hip = 0;
        // use the uip when we have a raw query to test if
        // we can submit
        if(uip) {
            hip = atoip(uip, uipLen);
            h = iptop( hip );
        }
    }

    st1->m_strip = r->getLong("strip",0);
    // Remember, for cgi, if the box is not checked, then it is not
    // reported in the request, so set default return value to 0
    long spiderLinks = r->getLong("spiderLinks",-1);
    // also support all lowercase like PageInject.cpp uses
    if ( spiderLinks == -1 )
        spiderLinks = r->getLong("spiderlinks",0);

    // . should we force it into spiderdb even if already in there
    // . use to manually update spider times for a url
    // . however, will not remove old scheduled spider times
    // . mdw: made force on the default
    st1->m_forceRespider = r->getLong("force",1); // 0);

    long now = getTimeGlobal();

    // . allow 1 submit every 1 hour
    // . restrict by submitter domain ip
    if ( ! st1->m_isAdmin &&
         ! canSubmit ( h , now , cr->m_maxAddUrlsPerIpDomPerDay ) ) {
        // return error page
        g_errno = ETOOEARLY;
        return sendReply ( st1 , true );
    }

    //st1->m_query = r->getString( "qts", &st1->m_queryLen );

    // check it, if turing test is enabled for this collection
    if ( ! st1->m_isAdmin && cr->m_doTuringTest &&
         ! g_turingTest.isHuman(r) ) {
        // log note so we know it didn't make it
        g_msg = " (error: bad answer)";
        //log("PageAddUrl:: addurl failed for %s : bad answer",
        //    iptoa(s->m_ip));
        st1->m_goodAnswer = false;
        return sendReply ( st1 , true /*addUrl enabled?*/ );
    }

    //if ( st1->m_queryLen > 0 )
    //    return getPages( st1 );

    // if no url given, just print a blank page
    if ( ! url ) return sendReply ( st1 , true );

    //
    // make a SpiderRequest
    //
    SpiderRequest *sreq = &st1->m_sreq;
    // reset it
    sreq->reset();

    // make the probable docid
    long long probDocId = g_titledb.getProbableDocId ( st1->m_url );
    // make one up, like we do in PageReindex.cpp
    long firstIp = (probDocId & 0xffffffff);

    // . now fill it up
    // . TODO: calculate the other values... lazy!!! (m_isRSSExt,
    //   m_siteNumInlinks,...)
    sreq->m_isNewOutlink = 1;
    sreq->m_isAddUrl     = 1;
    sreq->m_addedTime    = now;
    sreq->m_fakeFirstIp  = 1;
    sreq->m_probDocId    = probDocId;
    sreq->m_firstIp      = firstIp;
    sreq->m_hopCount     = 0;

    // its valid if root
    Url uu;
    uu.set ( st1->m_url );
    if ( uu.isRoot() ) sreq->m_hopCountValid = true;

    // too big?
    //long len = st1->m_urlLen;

    // the url! includes \0
    strcpy ( sreq->m_url , st1->m_url );
    // call this to set sreq->m_dataSize now
    sreq->setDataSize();
    // make the key dude -- after setting url
    sreq->setKey ( firstIp , 0LL, false );

    // need a fake first ip lest we core!
    //sreq->m_firstIp = (pdocId & 0xffffffff);

    // how to set m_firstIp? i guess addurl can be throttled independently
    // of the other urls??? use the hash of the domain for it!
    long  dlen;
    char *dom = getDomFast ( st1->m_url , &dlen );
    // fake it for this...
    //sreq->m_firstIp = hash32 ( dom , dlen );
    // sanity
    if ( ! dom ) {
        g_errno = EBADURL;
        return sendReply ( st1 , true );
    }

    // shortcut
    Msg4 *m = &st1->m_msg4;

    // now add that to spiderdb using msg4
    if ( ! m->addMetaList ( (char *)sreq       ,
                            sreq->getRecSize() ,
                            coll               ,
                            st1                , // state
                            addedStuff         ,
                            MAX_NICENESS       ,
                            RDB_SPIDERDB       ) )
        // we blocked
        return false;

    // send back the reply
    return sendReply ( st1 , true );
}
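// ---------------------------------------------------------------------------
// The add-url throttle above keys on iptop(s->m_ip), i.e. the top two bytes
// of the submitter's IP, so an entire /16 block shares one daily budget
// (cr->m_maxAddUrlsPerIpDomPerDay). A minimal sketch of that kind of counter
// follows; it is illustrative only -- the real canSubmit() is defined
// elsewhere and its bookkeeping differs, and every name below is made up.
#include <cstdint>
#include <unordered_map>

struct AddUrlBudget {
    int32_t m_day;    // day number the count applies to
    int32_t m_count;  // submissions seen from this /16 that day
};

static bool canSubmitSketch ( uint32_t ipTop ,
                              int64_t  nowSecs ,
                              int32_t  maxPerDay ,
                              std::unordered_map<uint32_t,AddUrlBudget> &table ) {
    int32_t today = (int32_t)( nowSecs / 86400 );
    // operator[] value-initializes a missing entry to {0,0}
    AddUrlBudget &b = table[ipTop];
    // new day: reset the counter
    if ( b.m_day != today ) { b.m_day = today; b.m_count = 0; }
    // over budget for today
    if ( b.m_count >= maxPerDay ) return false;
    b.m_count++;
    return true;
}
// ---------------------------------------------------------------------------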