bool AdultBit::isDirty ( char *s , int32_t len ) {
    static bool s_isInitialized = false;
    static char *s_dirty[] = {
        "anal", "analsex", "b*****b", "blowjobs", "boob", "boobs",
        "clitoris", "c**k", "cocks", "cum", "dick", "dicks",
        "g******g", "gangbangs", "gangbanging", "movie", "movies",
        "oral", "oralsex", "p**n", "porno", "pussy", "pussies",
        "sex", "sexy", "tit", "t**s", "video", "videos",
        "xxx", "xxxx", "xxxx"
    };
    if ( ! s_isInitialized ) {
        // set up the hash table
        if ( ! s_dtable.set ( 8,4,sizeof(s_dirty)*2,NULL,0,false,0,
                              "adulttab"))
            return log("build: Error initializing "
                       "dirty word hash table." );
        // now add in all the dirty words
        int32_t n = (int32_t)sizeof(s_dirty) / sizeof(char *);
        for ( int32_t i = 0 ; i < n ; i++ ) {
            int64_t h = hash64b ( s_dirty[i] );
            if ( ! s_dtable.addTerm (&h, i+1) ) return false;
        }
        s_isInitialized = true;
    }
    // compute the hash of the word "s"
    int64_t h = hash64Lower_a ( s , len );
    // get from table
    return s_dtable.getScore ( &h );
}
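// Illustrative sketch (not from the original source): one plausible way a
// caller might run AdultBit::isDirty() over whitespace-separated tokens of a
// document. countDirtyWords() and its inline whitespace test are hypothetical;
// only isDirty(char*,int32_t) above is taken from the code itself.
static bool isWhitespaceChar ( char c ) {
    return c == ' ' || c == '\t' || c == '\n' || c == '\r';
}
static int32_t countDirtyWords ( AdultBit *ab , char *doc , int32_t docLen ) {
    int32_t count = 0;
    int32_t i = 0;
    while ( i < docLen ) {
        // skip leading whitespace
        while ( i < docLen &&   isWhitespaceChar(doc[i]) ) i++;
        // mark the start of the token
        int32_t start = i;
        // advance to the end of the token
        while ( i < docLen && ! isWhitespaceChar(doc[i]) ) i++;
        // score the token against the dirty-word hash table
        if ( i > start && ab->isDirty ( doc + start , i - start ) )
            count++;
    }
    return count;
}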
static bool initEntityTable(){
    if ( ! s_isInitialized ) {
        // set up the hash table
        if ( ! s_table.set ( 8,4,255,NULL,0,false,0,"enttbl" ) )
            return log("build: Could not init table of "
                       "HTML entities.");
        // now add in all the html entities
        int32_t n = (int32_t)sizeof(s_entities) / (int32_t)sizeof(Entity);
        for ( int32_t i = 0 ; i < n ; i++ ) {
            int64_t h = hash64b ( s_entities[i].entity );
            // grab the unicode code point
            UChar32 up = s_entities[i].unicode;
            // it must be non-zero
            if ( ! up ) { char *xx=NULL;*xx=0; }
            // point to the utf8 buffer
            char *buf = (char *)s_entities[i].utf8;
            // encode the code point into utf8
            int32_t len = utf8Encode(up,buf);
            //
            // make my own mods to make parsing easier
            //
            if ( up == 160 ) { // nbsp
                buf[0] = ' ';
                len = 1;
            }
            //
            // end custom mods
            //
            // set length
            s_entities[i].utf8Len = len;
            // check it
            if ( len == 0 ) { char *xx=NULL;*xx=0; }
            // must not exist!
            if ( s_table.isInTable(&h) ) { char *xx=NULL;*xx=0; }
            // store the entity index in the hash table as the score
            if ( ! s_table.addTerm ( &h, i+1 ) ) return false;
        }
        s_isInitialized = true;
    }
    return true;
}
bool AdultBit::isObscene ( char *s , int32_t len ) {
    static bool s_isInitialized = false;
    static char *s_obscene[] = {
        "c**t", "clits",
        //"cum", magna cum laude
        "cums", "cumshot", "c**t", "cunts", "milf", "rimjob",
        "felch", "f**k", "f****d", "f****r", "f*****g", "f***s",
        "w***e", "w****s"
    };
    if ( ! s_isInitialized ) {
        // set up the hash table
        if ( ! s_otable.set ( 8,4,sizeof(s_obscene)*2,NULL,0,false,0,
                              "obscenetab") )
            return log("build: Error initializing "
                       "obscene word hash table." );
        // now add in all the obscene words
        int32_t n = sizeof(s_obscene) / sizeof(char *);
        for ( int32_t i = 0 ; i < n ; i++ ) {
            int64_t h = hash64b ( s_obscene[i] );
            if ( ! s_otable.addTerm ( &h, i+1 ) ) return false;
        }
        s_isInitialized = true;
    }
    // compute the hash of the word "s"
    int64_t h = hash64Lower_a ( s , len );
    // get from table
    return s_otable.getScore ( &h );
}
static bool initEntityTable(){
    if ( ! s_isInitialized ) {
        // set up the hash table
        if ( ! s_table.set ( 8,4,4096,NULL,0,false,"enttbl" ) ) {
            log("build: Could not init table of HTML entities.");
            return false;
        }
        // now add in all the html entities
        const int32_t n = (int32_t)sizeof(s_entities) / (int32_t)sizeof(Entity);
        for ( int32_t i = 0 ; i < n ; i++ ) {
            int64_t h = hash64b ( s_entities[i].entity );
            // convert the unicode codepoints to a utf8 string
            char *buf = (char *)s_entities[i].utf8;
            for ( int j = 0 ; j < s_entities[i].codepoints ; j++ ) {
                UChar32 codepoint = s_entities[i].codepoint[j];
                int32_t len = utf8Encode(codepoint,buf);
                if ( len == 0 ) { g_process.shutdownAbort(true); }
                // make modification to make parsing easier
                if ( codepoint == 160 ) { // nbsp
                    buf[0] = ' ';
                    len = 1;
                }
                buf += len;
            }
            s_entities[i].utf8Len = (size_t)(buf - s_entities[i].utf8);
            // must not exist!
            if ( s_table.isInTable(&h) ) { g_process.shutdownAbort(true); }
            // store the entity index in the hash table as the score
            if ( ! s_table.addTerm(h, i+1) ) return false;
        }
        s_isInitialized = true;
    }
    return true;
}
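// Illustrative sketch (not from the original source): how the entity table
// built above might be consulted to translate an entity name such as "nbsp"
// into its utf8 bytes. getEntityUtf8() is a hypothetical helper; it assumes
// a getScore(&h)-style accessor like the one used on s_dtable/s_otable
// elsewhere in this code, returning the i+1 index stored by addTerm().
static char *getEntityUtf8 ( char *name , int32_t *utf8Len ) {
    // make sure the table is ready
    if ( ! initEntityTable() ) return NULL;
    // hash the entity name the same way initEntityTable() did
    int64_t h = hash64b ( name );
    // the stored score is the entity index plus one, or 0 if absent
    int32_t score = s_table.getScore ( &h );
    if ( score <= 0 ) return NULL;
    // recover the Entity record and hand back its utf8 encoding
    Entity *e = &s_entities[score - 1];
    *utf8Len = (int32_t)e->utf8Len;
    return (char *)e->utf8;
}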
// . make a key for caching the search results page based on this input
// . do not use all vars, like the m_*ToDisplay should not be included
key_t SearchInput::makeKey ( ) {
    // hash the query
    long       n       = m_q->getNumTerms ();
    long long *termIds = m_q->getTermIds  ();
    char      *signs   = m_q->getTermSigns();
    key_t k;
    k.n1 = 0;
    k.n0 = hash64 ( (char *)termIds , n * sizeof(long long) );
    k.n0 = hash64 ( (char *)signs   , n , k.n0 );
    // user defined weights, for weighting each query term separately
    for ( long i = 0 ; i < n ; i++ ) {
        k.n0 = hash64 ((char *)&m_q->m_qterms[i].m_userWeight,4, k.n0);
        k.n0 = hash64 ((char *)&m_q->m_qterms[i].m_userType  ,1, k.n0);
    }
    // space separated, NULL terminated, list of meta tag names to display
    if ( m_displayMetas )
        k.n0 = hash64b ( m_displayMetas , k.n0 );
    // name of collection in external cluster to get titleRecs for
    // related pages from
    if ( m_rp_getExternalPages && m_rp_externalColl )
        k.n0 = hash64b ( m_rp_externalColl , k.n0 );
    // collection we import from
    if ( m_importColl )
        k.n0 = hash64b ( m_importColl , k.n0 );
    // the special query parm
    if ( m_sq && m_sqLen > 0 )
        k.n0 = hash64 ( m_sq , m_sqLen , k.n0 );
    if ( m_noDocIds && m_noDocIdsLen )
        k.n0 = hash64 ( m_noDocIds , m_noDocIdsLen , k.n0 );
    if ( m_noSiteIds && m_noSiteIdsLen )
        k.n0 = hash64 ( m_noSiteIds , m_noSiteIdsLen , k.n0 );
    // no need to hash these again separately, they are in between
    // m_START and m_END_HASH
    // language
    //if ( m_language )
    //    k.n0 = hash64 ( m_language , k.n0 );
    //if ( m_gblang )
    //    k.n0 = hash64 ( m_gblang , k.n0 );
    // . now include the hash of the search parameters
    // . not including m_docsToScanForTopics since we got TopicGroups
    char *a = ((char *)&m_START) + 4 ; // msg40->m_dpf;
    char *b =  (char *)&m_END_HASH ;   // msg40->m_topicGroups;
    long size = b - a;
    // push and flush some parms that should not contribute
    //long save1 = m_refs_numToDisplay;
    //long save2 = m_rp_numToDisplay;
    //long save3 = m_numTopicsToDisplay;
    //m_refs_numToDisplay  = 0;
    //m_rp_numToDisplay    = 0;
    //m_numTopicsToDisplay = 0;
    // and hash it all up
    k.n0 = hash64 ( a , size , k.n0 );
    // and pop out the parms that did not contribute
    //m_refs_numToDisplay  = save1;
    //m_rp_numToDisplay    = save2;
    //m_numTopicsToDisplay = save3;
    // hash each topic group
    for ( long i = 0 ; i < m_numTopicGroups ; i++ ) {
        TopicGroup *t = &m_topicGroups[i];
        //k.n0 = hash64 ( t->m_numTopics , k.n0 );
        k.n0 = hash64 ( t->m_maxTopics           , k.n0 );
        k.n0 = hash64 ( t->m_docsToScanForTopics , k.n0 );
        k.n0 = hash64 ( t->m_minTopicScore       , k.n0 );
        k.n0 = hash64 ( t->m_maxWordsPerTopic    , k.n0 );
        k.n0 = hash64b( t->m_meta                , k.n0 );
        k.n0 = hash64 ( t->m_delimeter           , k.n0 );
        k.n0 = hash64 ( t->m_useIdfForTopics     , k.n0 );
        k.n0 = hash64 ( t->m_dedup               , k.n0 );
    }
    // . boolean queries have operators (AND OR NOT ( ) ) that we need
    //   to consider in this hash as well, so just hash the whole query
    if ( m_q->m_isBoolean ) {
        char *q    = m_q->getQuery();
        long  qlen = m_q->getQueryLen();
        k.n0 = hash64 ( q , qlen , k.n0 );
    }
    // language stuff
    k.n0 = hash64(m_defaultSortLanguage, m_defaultSortLanguageLen, k.n0);
    k.n0 = hash64(m_defaultSortCountry , m_defaultSortCountryLen , k.n0);
    // debug
    //logf(LOG_DEBUG,"query: q=%s k.n0=%llu",m_q->getQuery(),k.n0);
    //Msg1aParms *m1p = msg40->getReferenceParms();
    //if ( m1p ) {
    //    k.n0 = hash64(((char *)m1p)+sizeof(long),
    //                  sizeof(Msg1aParms)-8,k.n0);
    //}
    return k;
}
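// Illustrative sketch (not from the original source): the incremental
// hashing idiom makeKey() relies on. Each hash64(buf,len,seed) call folds
// another field into the running 64-bit value, so the final key changes if
// any hashed input changes while ignored display-only parms stay out of it.
// makeDemoKey() and its parameters are hypothetical.
static key_t makeDemoKey ( char *query , long queryLen ,
                           long firstResultNum , long resultsWanted ) {
    key_t k;
    k.n1 = 0;
    // start the chain with the query string
    k.n0 = hash64 ( query , queryLen );
    // fold in each parm that should distinguish cached result pages
    k.n0 = hash64 ( (char *)&firstResultNum , sizeof(long) , k.n0 );
    k.n0 = hash64 ( (char *)&resultsWanted  , sizeof(long) , k.n0 );
    return k;
}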
void handleRequest22 ( UdpSlot *slot , long netnice ) {
    // shortcut
    UdpServer *us = &g_udpServer;
    // get the request
    Msg22Request *r = (Msg22Request *)slot->m_readBuf;
    // get this
    //char *coll = g_collectiondb.getCollName ( r->m_collnum );
    // sanity check
    long requestSize = slot->m_readBufSize;
    if ( requestSize < r->getMinSize() ) {
        log("db: Got bad request size of %li bytes for title record. "
            "Need at least 28.", requestSize );
        us->sendErrorReply ( slot , EBADREQUESTSIZE );
        return;
    }
    // get base, returns NULL and sets g_errno to ENOCOLLREC on error
    RdbBase *tbase;
    if ( ! (tbase=getRdbBase(RDB_TITLEDB,r->m_collnum) ) ) {
        log("db: Could not get title rec in collection # %li "
            "because rdbbase is null.", (long)r->m_collnum);
        g_errno = EBADENGINEER;
        us->sendErrorReply ( slot , g_errno );
        return;
    }
    // overwrite what is in there so niceness conversion algo works
    r->m_niceness = netnice;
    // if just checking tfndb, do not do the cache lookup in clusterdb
    if ( r->m_justCheckTfndb ) r->m_maxCacheAge = 0;
    // keep track of stats
    //if (r->m_justCheckTfndb)
    //    g_tfndb.getRdb()->readRequestGet(requestSize);
    //else
    g_titledb.getRdb()->readRequestGet (requestSize);
    // breathe
    QUICKPOLL ( r->m_niceness);
    // sanity check
    if ( r->m_collnum < 0 ) { char *xx=NULL;*xx=0; }
    // make the state now
    State22 *st ;
    try { st = new (State22); }
    catch ( ... ) {
        g_errno = ENOMEM;
        log("query: Msg22: new(%i): %s", sizeof(State22),
            mstrerror(g_errno));
        us->sendErrorReply ( slot , g_errno );
        return;
    }
    mnew ( st , sizeof(State22) , "Msg22" );
    // store ptr to the msg22request
    st->m_r = r;
    // save for sending back reply
    st->m_slot = slot;
    // then tell slot not to free it since m_r references it!
    // so we'll have to free it when we destroy State22
    st->m_slotAllocSize = slot->m_readBufMaxSize;
    st->m_slotReadBuf   = slot->m_readBuf;
    slot->m_readBuf = NULL;
    // . make the keys for getting recs from tfndb
    // . url recs map docid to the title file # that contains the titleRec
    //key_t uk1 ;
    //key_t uk2 ;
    // . if docId was explicitly specified...
    // . we may get multiple tfndb recs
    if ( ! r->m_url[0] ) {
        // there are no del bits in tfndb
        //uk1 = g_tfndb.makeMinKey ( r->m_docId );
        //uk2 = g_tfndb.makeMaxKey ( r->m_docId );
        st->m_docId1 = r->m_docId;
        st->m_docId2 = r->m_docId;
    }
    // but if we are requesting an available docid, it might be taken
    // so try the range
    if ( r->m_getAvailDocIdOnly ) {
        long long pd = r->m_docId;
        long long d1 = g_titledb.getFirstProbableDocId ( pd );
        long long d2 = g_titledb.getLastProbableDocId  ( pd );
        // sanity - bad url with bad subdomain?
        if ( pd < d1 || pd > d2 ) { char *xx=NULL;*xx=0; }
        // make sure we get a decent sample in titledb then in
        // case the docid we wanted is not available
        st->m_docId1 = d1;
        st->m_docId2 = d2;
    }
    // . otherwise, url was given, like from Msg15
    // . we may get multiple tfndb recs
    if ( r->m_url[0] ) {
        long dlen = 0;
        // this causes ip based urls to be inconsistent with the call
        // to getProbableDocId(url) below
        char *dom = getDomFast ( r->m_url , &dlen );
        // bogus url?
        if ( ! dom ) {
            log("msg22: got bad url in request: %s",r->m_url);
            g_errno = EBADURL;
            us->sendErrorReply ( slot , g_errno );
            mdelete ( st , sizeof(State22) , "Msg22" );
            delete ( st );
            return;
        }
        long long pd = g_titledb.getProbableDocId (r->m_url,dom,dlen);
        long long d1 = g_titledb.getFirstProbableDocId ( pd );
        long long d2 = g_titledb.getLastProbableDocId  ( pd );
        // sanity - bad url with bad subdomain?
        if ( pd < d1 || pd > d2 ) { char *xx=NULL;*xx=0; }
        // there are no del bits in tfndb
        //uk1 = g_tfndb.makeMinKey ( d1 );
        //uk2 = g_tfndb.makeMaxKey ( d2 );
        // store these
        st->m_pd     = pd;
        st->m_docId1 = d1;
        st->m_docId2 = d2;
        st->m_uh48   = hash64b ( r->m_url ) & 0x0000ffffffffffffLL;
    }

    QUICKPOLL ( r->m_niceness );

    /*
    // shortcut
    Rdb *tdb = g_titledb.getRdb();
    // init this
    st->m_tfn2 = -1;
    // skip tfndb lookup if we can. saves some time.
    if ( g_conf.m_readOnlyMode &&
         // must not be a *url* lookup, it must be a docid lookup
         ! r->m_url[0] &&
         // tree must be empty too i guess
         tdb->getTree()->getNumUsedNodes() == 0 ) {
        // the RdbBase contains the BigFiles for tfndb
        RdbBase *base = tdb->m_bases[r->m_collnum];
        // can only have one titledb file
        if ( base->getNumFiles() == 1 ) {
            // now we can get RdbBase
            st->m_tfn2 = base->m_fileIds2[0];
            // sanity check
            if ( st->m_tfn2 < 0 ) { char *xx = NULL; *xx = 0; }
        }
    }
    // check the tree for this docid
    RdbTree *tt = tdb->getTree();
    // make titledb keys
    key_t startKey = g_titledb.makeFirstKey ( st->m_docId1 );
    key_t endKey   = g_titledb.makeLastKey  ( st->m_docId2 );
    long n = tt->getNextNode ( r->m_collnum , startKey );
    // there should only be one match, one titlerec per docid!
    for ( ; n >= 0 ; n = tt->getNextNode ( n ) ) {
        // break if collnum does not match. we exceeded our tree range.
        if ( tt->getCollnum ( n ) != r->m_collnum ) break;
        // get the key of this node
        key_t k = *(key_t *)tt->getKey(n);
        // if passed limit, break out, no match
        if ( k > endKey ) break;
        // if we had a url make sure uh48 matches
        if ( r->m_url[0] ) {
            // get it
            long long uh48 = g_titledb.getUrlHash48(&k);
            // sanity check
            if ( st->m_uh48 == 0 ) { char *xx=NULL;*xx=0; }
            // we must match this exactly
            if ( uh48 != st->m_uh48 ) continue;
        }
        // . if we matched a negative key, then skip
        // . just break out here and enter the normal logic
        // . it should load tfndb and find that it is not in tfndb
        //   because when you add a negative key to titledb in
        //   Rdb::addList, it adds a negative rec to tfndb immediately
        // . NO! because we add the negative key to the tree when we
        //   delete the old titledb rec, then we add the new one!
        //   when a negative key is added Rdb::addRecord() removes
        //   the positive key (and vice versa) from the tree.
        if ( KEYNEG((char *)&k) ) continue;
        // if just checking for its existence, we are done
        if ( r->m_justCheckTfndb ) {
            us->sendReply_ass ( NULL,0,NULL,0,slot);
            // don't forget to free the state
            mdelete ( st , sizeof(State22) , "Msg22" );
            delete ( st );
            return;
        }
        // ok, we got a match, return it
        char *data     = tt->getData     ( n );
        long  dataSize = tt->getDataSize ( n );
        // weird!
        if ( dataSize == 0 ) { char *xx=NULL;*xx=0; }
        // send the whole rec back
        long need = 12 + 4 + dataSize;
        // will this copy it? not!
        char *buf = (char *)mmalloc ( need , "msg22t" );
        if ( ! buf ) {
            us->sendErrorReply ( slot , g_errno );
            mdelete ( st , sizeof(State22) , "Msg22" );
            delete ( st );
            return;
        }
        // log it
        if ( g_conf.m_logDebugSpider )
            logf(LOG_DEBUG,"spider: found %s in titledb tree",
                 r->m_url);
        // store in the buf for sending
        char *p = buf;
        // store key
        *(key_t *)p = k;        p += sizeof(key_t);
        // then dataSize
        *(long *)p = dataSize;  p += 4;
        // then the data
        memcpy ( p , data , dataSize ); p += dataSize;
        // send off the record
        us->sendReply_ass (buf, need, buf, need, slot);
        // don't forget to free the state
        mdelete ( st , sizeof(State22) , "Msg22" );
        delete ( st );
        return;
    }
    // if we did not need to consult tfndb cuz we only have one file
    if ( st->m_tfn2 >= 0 ) {
        gotUrlListWrapper ( st , NULL , NULL );
        return;
    }
    // . get the list of url recs for this docid range
    // . this should not block, tfndb SHOULD all be in memory all the time
    // . use 500 million for min recsizes to get all in range
    // . no, using 500MB causes problems for RdbTree::getList, so use
    //   100k. how many recs can there be?
    if ( ! st->m_msg5.getList ( RDB_TFNDB         ,
                                coll              ,
                                &st->m_ulist      ,
                                uk1               , // startKey
                                uk2               , // endKey
                                // use 0x7fffffff precisely because it
                                // will determine exactly how long the
                                // tree list needs to allocate in Msg5.cpp
                                0x7fffffff        , // minRecSizes
                                true              , // includeTree?
                                false             , // addToCache?
                                0                 , // max cache age
                                0                 , // startFileNum
                                -1                , // numFiles (-1 = all)
                                st                ,
                                gotUrlListWrapper ,
                                r->m_niceness     ,
                                true              ) ) // error correction?
        return ;
    // we did not block
    gotUrlListWrapper ( st , NULL , NULL );
}

static void gotTitleList ( void *state , RdbList *list , Msg5 *msg5 ) ;

void gotUrlListWrapper ( void *state , RdbList *list , Msg5 *msg5 ) {
    // shortcuts
    State22   *st = (State22 *)state;
    UdpServer *us = &g_udpServer;
    // bail on error
    if ( g_errno ) {
        log("db: Had error getting info from tfndb: %s.",
            mstrerror(g_errno));
        log("db: uk1.n1=%li n0=%lli uk2.n1=%li n0=%lli "
            "d1=%lli d2=%lli.",
            ((key_t *)st->m_msg5.m_startKey)->n1 ,
            ((key_t *)st->m_msg5.m_startKey)->n0 ,
            ((key_t *)st->m_msg5.m_endKey)->n1 ,
            ((key_t *)st->m_msg5.m_endKey)->n0 ,
            st->m_docId1 , st->m_docId2 );
        us->sendErrorReply ( st->m_slot , g_errno );
        mdelete ( st , sizeof(State22) , "Msg22" );
        delete ( st );
        return;
    }
    // shortcuts
    RdbList      *ulist = &st->m_ulist;
    Msg22Request *r     = st->m_r;
    char *coll = g_collectiondb.getCollName ( r->m_collnum );
    // point to top just in case
    ulist->resetListPtr();
    // get base, returns NULL and sets g_errno to ENOCOLLREC on error
    RdbBase *tbase = getRdbBase(RDB_TITLEDB,coll);
    // set probable docid
    long long pd = 0LL;
    if ( r->m_url[0] ) {
        pd = g_titledb.getProbableDocId(r->m_url);
        // sanity
        if ( pd != st->m_pd ) { char *xx=NULL;*xx=0; }
    }
    // . these are both meant to be available docids
    // . if ad2 gets exhausted we use ad1
    long long ad1 = st->m_docId1;
    long long ad2 = pd;
    long tfn = -1;
    // sanity check. make sure did not load from tfndb if did not need to
    if ( ! ulist->isExhausted() && st->m_tfn2 >= 0 ) { char *xx=NULL;*xx=0; }
    // if only one titledb file and none in memory use it
    if ( st->m_tfn2 >= 0 ) tfn = st->m_tfn2;
    // we may have multiple tfndb recs but we should NEVER have to read
    // multiple titledb files...
    for ( ; ! ulist->isExhausted() ; ulist->skipCurrentRecord() ) {
        // breathe
        QUICKPOLL ( r->m_niceness );
        // get first rec
        key_t k = ulist->getCurrentKey();
        // . skip negative keys
        // . seems to happen when we have tfndb in the tree...
        if ( KEYNEG((char *)&k) ) continue;
        // if we have a url and no docid, we gotta check uh48!
        if ( r->m_url[0] && g_tfndb.getUrlHash48(&k) != st->m_uh48 ) {
            // get docid of that guy
            long long dd = g_tfndb.getDocId(&k);
            // if matches avail docid, inc it
            if ( dd == ad1 ) ad1++;
            if ( dd == ad2 ) ad2++;
            // try next tfndb key
            continue;
        }
        // . get file num this rec is stored in
        // . this is updated right after the file num is merged by
        //   scanning all records in tfndb. this is very quick if all
        //   of tfndb is in memory, otherwise, it might take a few
        //   seconds. update call done in RdbMerge::incorporateMerge().
        tfn = g_tfndb.getTfn ( &k );
        // i guess we got a good match!
        break;
    }
    // sanity check. 255 used to mean in spiderdb or in tree
    if ( tfn >= 255 ) { char *xx=NULL;*xx=0; }
    // maybe no available docid if we breached our range
    if ( ad1 >= pd           ) ad1 = 0LL;
    if ( ad2 >  st->m_docId2 ) ad2 = 0LL;
    // get best
    long long ad = ad2;
    // but wrap around if we need to
    if ( ad == 0LL ) ad = ad1;
    // breathe
    QUICKPOLL ( r->m_niceness);
    // . log if different
    // . if our url rec was in there, this could still be different
    //   if there was another url rec in there with the same docid and
    //   a different extension, but with a tfn of 255, meaning that it
    //   is just in spiderdb and not in titledb yet. so it hasn't been
    //   assigned a permanent docid...
    // . another way "ad" may be different now is from the old bug which
    //   did not chain the docid properly because it limited the docid
    //   chaining to one titleRec file. so conceivably we can have
    //   different docs sharing the same docids, but with different
    //   url hash extensions. for instance, on host #9 we have:
    //   00f3b2ff63aec3a9 docId=261670033643 e=0x58 tfn=117 clean=0 half=0
    //   00f3b2ff63af66c9 docId=261670033643 e=0x6c tfn=217 clean=0 half=0
    // . Msg16 will only use the avail docid if the titleRec is not found
    if ( r->m_url[0] && pd != ad ) {
        //log(LOG_INFO,"build: Docid %lli collided. %s Changing "
        // http://www.airliegardens.org/events.asp?dt=2&date=8/5/2011
        // COLLIDES WITH
        // http://www.bbonline.com/i/chicago.html
        // collision alert!
        log("spider: Docid %lli collided. %s Changing "
            "to %lli.", r->m_docId , r->m_url , ad );
        // debug this for now
        //char *xx=NULL;*xx=0;
    }
    // remember it
    st->m_availDocId = ad;
    // if tfn is -1 then it was not in titledb
    if ( tfn == -1 ) {
        // store docid in reply
        char *p = st->m_slot->m_tmpBuf;
        // send back the available docid
        *(long long *)p = ad;
        // send it
        us->sendReply_ass ( p , 8 , p , 8 , st->m_slot );
        // don't forget to free state
        mdelete ( st , sizeof(State22) , "Msg22" );
        delete ( st );
        return;
    }
    // sanity
    if ( tfn < 0 ) { char *xx=NULL;*xx=0; }
    // breathe
    QUICKPOLL ( r->m_niceness );
    // ok, if just "checking tfndb" no need to go further
    if ( r->m_justCheckTfndb ) {
        // send back a good reply (empty means found!)
        us->sendReply_ass ( NULL,0,NULL,0,st->m_slot);
        // don't forget to free the state
        mdelete ( st , sizeof(State22) , "Msg22" );
        delete ( st );
        return;
    }
    // . compute the file scan range
    // . tfn is now equivalent to Rdb's id2, a secondary file id, it
    //   follows the hyphen in "titledb0001-023.dat"
    // . default to just scan the root file AND the tree, cuz we're
    //   assuming restrictToRoot was set to true so we did not get a tfndb
    //   list
    // . even if a file number is given, always check the tree in case
    //   it got re-spidered
    // . shit, but we can still miss it if it gets dumped right after
    //   our thread is spawned, in which case we'd fall back to the old
    //   version. no. because if its in the tree now we get it before
    //   spawning a thread. there is no blocking. TRICKY. so if it is in
    //   the tree at this point we'll get it, but may end up scanning the
    //   file with the older version of the doc... not too bad.
    long startFileNum = tbase->getFileNumFromId2 ( tfn );
    // if tfn refers to a missing titledb file...
    if ( startFileNum < 0 ) {
        if ( r->m_url[0] ) log("db: titledb missing url %s",r->m_url);
        else               log("db: titledb missing docid %lli",
                               r->m_docId);
        us->sendErrorReply ( st->m_slot,ENOTFOUND );
        mdelete ( st , sizeof(State22) , "Msg22" );
        delete ( st );
        return ;
    }
    // save this
    st->m_tfn = tfn;
    */

    // make the cacheKey ourself, since Msg5 would make the key wrong
    // since it would base it on startFileNum and numFiles
    key_t cacheKey ;
    cacheKey.n1 = 0;
    cacheKey.n0 = r->m_docId;
    // make titledb keys
    key_t startKey = g_titledb.makeFirstKey ( st->m_docId1 );
    key_t endKey   = g_titledb.makeLastKey  ( st->m_docId2 );
    // . load the list of title recs from disk now
    // . our file range should be solid
    // . use 500 million for min recsizes to get all in range
    if ( ! st->m_msg5.getList ( RDB_TITLEDB   ,
                                r->m_collnum  ,
                                &st->m_tlist  ,
                                startKey      , // startKey
                                endKey        , // endKey
                                500000000     , // minRecSizes
                                true          , // includeTree
                                false         , //r->m_addToCache // addToCache?
                                0             , //r->m_maxCacheAge // max cache age
                                0             , //startFileNum
                                -1            , // numFiles
                                st            , // state
                                gotTitleList  ,
                                r->m_niceness ,
                                true          , // do error correct?
                                &cacheKey     ,
                                0             , // retry num
                                -1            , // maxRetries
                                true          , // compensate for merge
                                -1LL          , // sync point
                                &st->m_msg5b  ) )
        return ;
    // we did not block, nice... in cache?
    gotTitleList ( st , NULL , NULL );
}
static bool initEntityTable(){
    if ( ! s_isInitialized ) {
        // set up the hash table
        if ( ! s_table.set ( 8,4,255,NULL,0,false,0,"enttbl" ) )
            return log("build: Could not init table of "
                       "HTML entities.");
        // now add in all the html entities
        int32_t n = (int32_t)sizeof(s_entities) / (int32_t)sizeof(Entity);
        for ( int32_t i = 0 ; i < n ; i++ ) {
            int64_t h = hash64b ( s_entities[i].entity );
            // grab the unicode code point
            UChar32 up = s_entities[i].unicode;
            // it must be non-zero
            if ( ! up ) { char *xx=NULL;*xx=0; }
            // point to the utf8 buffer
            char *buf = (char *)s_entities[i].utf8;
            // encode the code point into utf8
            int32_t len = utf8Encode(up,buf);
            //
            // make my own mods to make parsing easier
            //
            if ( up == 160 ) { // nbsp
                buf[0] = ' ';
                len = 1;
            }
            // make all quotes equal '\"' (34 decimal)
            // double and single curling quotes
            // http://www.dwheeler.com/essays/quotes-test-utf-8.html
            // 201c, 201d, 2018, 2019 (unicode values, not utf8)
            // &ldquo, &rdquo, &lsquo, &rsquo
            /*
            if ( up == 171  || up == 187  || up == 8216 ||
                 up == 8217 || up == 8218 || up == 8220 ||
                 up == 8221 || up == 8222 || up == 8249 ||
                 up == 8250 ) {
                buf[0] = '\"';
                len = 1;
            }
            // and normalize all dashes (mdash,ndash)
            if ( up == 8211 || up == 8212 ) {
                buf[0] = '-';
                len = 1;
            }
            */
            //
            // end custom mods
            //
            // set length
            s_entities[i].utf8Len = len;
            // check it
            if ( len == 0 ) { char *xx=NULL;*xx=0; }
            // must not exist!
            if ( s_table.isInTable(&h) ) { char *xx=NULL;*xx=0; }
            // store the entity index in the hash table as the score
            if ( ! s_table.addTerm ( &h, i+1 ) ) return false;
        }
        s_isInitialized = true;
    }
    return true;
}
void handleRequest22 ( UdpSlot *slot , int32_t netnice ) {
    // shortcut
    UdpServer *us = &g_udpServer;
    // get the request
    Msg22Request *r = (Msg22Request *)slot->m_readBuf;
    // sanity check
    int32_t requestSize = slot->m_readBufSize;
    if ( requestSize < r->getMinSize() ) {
        log("db: Got bad request size of %" PRId32" bytes for title record. "
            "Need at least 28.", requestSize );
        log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.",
            __FILE__, __func__, __LINE__);
        us->sendErrorReply ( slot , EBADREQUESTSIZE );
        return;
    }
    // get base, returns NULL and sets g_errno to ENOCOLLREC on error
    RdbBase *tbase = getRdbBase( RDB_TITLEDB, r->m_collnum );
    if ( ! tbase ) {
        log("db: Could not get title rec in collection # %" PRId32
            " because rdbbase is null.", (int32_t)r->m_collnum);
        g_errno = EBADENGINEER;
        log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.",
            __FILE__, __func__, __LINE__);
        us->sendErrorReply ( slot , g_errno );
        return;
    }
    // overwrite what is in there so niceness conversion algo works
    r->m_niceness = netnice;
    // if just checking tfndb, do not do the cache lookup in clusterdb
    if ( r->m_justCheckTfndb ) {
        r->m_maxCacheAge = 0;
    }
    g_titledb.getRdb()->readRequestGet (requestSize);
    // breathe
    QUICKPOLL ( r->m_niceness);
    // sanity check
    if ( r->m_collnum < 0 ) { char *xx=NULL;*xx=0; }
    // make the state now
    State22 *st ;
    try { st = new (State22); }
    catch ( ... ) {
        g_errno = ENOMEM;
        log("query: Msg22: new(%" PRId32"): %s",
            (int32_t)sizeof(State22), mstrerror(g_errno));
        log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.",
            __FILE__, __func__, __LINE__);
        us->sendErrorReply ( slot , g_errno );
        return;
    }
    mnew ( st , sizeof(State22) , "Msg22" );
    // store ptr to the msg22request
    st->m_r = r;
    // save for sending back reply
    st->m_slot = slot;
    // then tell slot not to free it since m_r references it!
    // so we'll have to free it when we destroy State22
    st->m_slotAllocSize = slot->m_readBufMaxSize;
    st->m_slotReadBuf   = slot->m_readBuf;
    slot->m_readBuf = NULL;
    // . if docId was explicitly specified...
    // . we may get multiple tfndb recs
    if ( ! r->m_url[0] ) {
        st->m_docId1 = r->m_docId;
        st->m_docId2 = r->m_docId;
    }
    // but if we are requesting an available docid, it might be taken
    // so try the range
    if ( r->m_getAvailDocIdOnly ) {
        int64_t pd = r->m_docId;
        int64_t d1 = g_titledb.getFirstProbableDocId ( pd );
        int64_t d2 = g_titledb.getLastProbableDocId  ( pd );
        // sanity - bad url with bad subdomain?
        if ( pd < d1 || pd > d2 ) { char *xx=NULL;*xx=0; }
        // make sure we get a decent sample in titledb then in
        // case the docid we wanted is not available
        st->m_docId1 = d1;
        st->m_docId2 = d2;
    }
    // . otherwise, url was given, like from Msg15
    // . we may get multiple tfndb recs
    if ( r->m_url[0] ) {
        int32_t dlen = 0;
        // this causes ip based urls to be inconsistent with the call
        // to getProbableDocId(url) below
        char *dom = getDomFast ( r->m_url , &dlen );
        // bogus url?
        if ( ! dom ) {
            log("msg22: got bad url in request: %s from "
                "hostid %" PRId32" for msg22 call ",
                r->m_url,slot->m_host->m_hostId);
            g_errno = EBADURL;
            log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.",
                __FILE__, __func__, __LINE__);
            us->sendErrorReply ( slot , g_errno );
            mdelete ( st , sizeof(State22) , "Msg22" );
            delete ( st );
            return;
        }
        int64_t pd = g_titledb.getProbableDocId (r->m_url,dom,dlen);
        int64_t d1 = g_titledb.getFirstProbableDocId ( pd );
        int64_t d2 = g_titledb.getLastProbableDocId  ( pd );
        // sanity - bad url with bad subdomain?
        if ( pd < d1 || pd > d2 ) { char *xx=NULL;*xx=0; }
        // store these
        st->m_pd     = pd;
        st->m_docId1 = d1;
        st->m_docId2 = d2;
        st->m_uh48   = hash64b ( r->m_url ) & 0x0000ffffffffffffLL;
    }

    QUICKPOLL ( r->m_niceness );

    // make the cacheKey ourself, since Msg5 would make the key wrong
    // since it would base it on startFileNum and numFiles
    key_t cacheKey ;
    cacheKey.n1 = 0;
    cacheKey.n0 = r->m_docId;
    // make titledb keys
    key_t startKey = g_titledb.makeFirstKey ( st->m_docId1 );
    key_t endKey   = g_titledb.makeLastKey  ( st->m_docId2 );
    // . load the list of title recs from disk now
    // . our file range should be solid
    // . use 500 million for min recsizes to get all in range
    if ( ! st->m_msg5.getList ( RDB_TITLEDB   ,
                                r->m_collnum  ,
                                &st->m_tlist  ,
                                startKey      , // startKey
                                endKey        , // endKey
                                500000000     , // minRecSizes
                                true          , // includeTree
                                false         , //r->m_addToCache // addToCache?
                                0             , //r->m_maxCacheAge // max cache age
                                0             , //startFileNum
                                -1            , // numFiles
                                st            , // state
                                gotTitleList  ,
                                r->m_niceness ,
                                true          , // do error correct?
                                &cacheKey     ,
                                0             , // retry num
                                -1            , // maxRetries
                                true          , // compensate for merge
                                -1LL          ) ) // sync point
        return ;
    // we did not block, nice... in cache?
    gotTitleList ( st , NULL , NULL );
}
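// Illustrative sketch (not from the original source): what a minimal
// docid-only Msg22Request for this handler might look like. The field names
// come from the request fields read above; buildDocIdRequest() is
// hypothetical and omits whatever other fields the real struct requires.
static void buildDocIdRequest ( Msg22Request *r , int32_t collnum ,
                                int64_t docId ) {
    // empty url means "look up by docid", per the ! r->m_url[0] branch
    r->m_url[0]            = '\0';
    r->m_docId             = docId;
    r->m_collnum           = collnum;
    r->m_justCheckTfndb    = 0;
    r->m_getAvailDocIdOnly = 0;
    r->m_maxCacheAge       = 0;
    // handleRequest22() overwrites m_niceness with the net niceness anyway
    r->m_niceness          = 0;
}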
// . returns true if all done!
// . returns false if still doing stuff
bool Test::injectLoop ( ) {
    long  dlen ;
    char *dom ;
    long  fakeIp ;
 loop:
    // advance to next url
    for ( ; m_urlPtr < m_urlEnd && ! *m_urlPtr ; m_urlPtr++ ) ;
    // all done?
    if ( m_urlPtr >= m_urlEnd ) {
        // flush em out
        if ( ! flushMsg4Buffers ( this , injectedWrapper ) )
            return false;
        // note it
        m_isAdding = false;
        // all done
        return true;
    }
    // error means all done
    if ( m_errno ) {
        m_isAdding = false;
        return true;
    }
    // point to it
    char *u = m_urlPtr;
    // advance to point to the next url for the next loop!
    for ( ; m_urlPtr < m_urlEnd && *m_urlPtr ; m_urlPtr++ ) ;
    // hash it
    long long h = hash64b ( u );
    // dedup it lest we freeze up and stopIt() never gets called because
    // m_urlsAdded is never decremented all the way to zero in Spider.cpp
    if ( m_dt.isInTable ( &h ) ) goto loop;
    // add it. return true with g_errno set on error
    if ( ! m_dt.addKey ( &h ) ) goto hadError;
    // make the SpiderRequest from it
    m_sreq.reset();
    // url
    strcpy ( m_sreq.m_url , u );
    // get domain of url
    dom = getDomFast ( m_sreq.m_url , &dlen );
    // make a fake ip
    fakeIp = 0x123456;
    // use domain if we got that
    if ( dom && dlen ) fakeIp = hash32 ( dom , dlen );
    // first ip is fake
    m_sreq.m_firstIp = fakeIp; // 0x123456;
    // these too
    m_sreq.m_domHash32  = fakeIp;
    m_sreq.m_hostHash32 = fakeIp;
    m_sreq.m_siteHash32 = fakeIp;
    m_sreq.m_probDocId  = g_titledb.getProbableDocId( m_sreq.m_url );
    // this crap is fake
    m_sreq.m_isInjecting = 1;
    // use test-spider subdir for storing pages and spider times?
    if ( g_conf.m_testSpiderEnabled ) m_sreq.m_useTestSpiderDir = 1;
    // use this later
    m_sreq.m_hasContent = 0;
    // injected requests use this as the spider time i guess
    // so we can sort them by this
    m_sreq.m_addedTime = ++s_count;
    // no, because to compute XmlDoc::m_min/maxPubDate we need this to
    // be valid for our test run.. no no we will fix it to be
    // basically 2 days before spider time in the code...
    //m_sreq.m_addedTime = spiderTime;
    m_sreq.m_fakeFirstIp = 1;
    // make the key (parentDocId=0)
    m_sreq.setKey ( fakeIp, 0LL , false );
    // test it
    if ( g_spiderdb.getFirstIp(&m_sreq.m_key) != fakeIp ) {
        char *xx=NULL;*xx=0; }
    // sanity check. check for http(s)://
    if ( m_sreq.m_url[0] != 'h' ) { char *xx=NULL;*xx=0; }
    // reset this
    g_errno = 0;
    // count it
    m_urlsAdded++;
    // note it
    //log("crazyout: %s",m_sreq.m_url );
    logf(LOG_DEBUG,"spider: injecting test url %s",m_sreq.m_url);
    // the receiving end will realize that we are injecting into the test
    // collection and use the "/test/" subdir to load the file
    // "ips.txt" to do our ip lookups, and search for any downloads in
    // that subdirectory as well.
    if ( ! m_msg4.addMetaList ( (char *)&m_sreq ,
                                m_sreq.getRecSize() ,
                                m_coll ,
                                NULL ,
                                injectedWrapper ,
                                MAX_NICENESS ,
                                RDB_SPIDERDB ) )
        // return false if blocked
        return false;
    // error?
    if ( g_errno ) {
        // jump down here from above on error
    hadError:
        // save it
        m_errno = g_errno;
        // flag it
        m_isAdding = false;
        // note it
        log("test: inject had error: %s",mstrerror(g_errno));
        // stop, we are all done!
        return true;
    }
    // add the next spider request
    goto loop;
}
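// Illustrative sketch (not from the original source): the hash64b()-based
// dedup idiom injectLoop() uses before queueing each url. dedupUrls() is a
// hypothetical helper; it assumes a HashTableX-style table with the same
// isInTable(&h)/addKey(&h) calls used on m_dt above.
static long dedupUrls ( HashTableX *dt , char **urls , long numUrls ) {
    long added = 0;
    for ( long i = 0 ; i < numUrls ; i++ ) {
        // hash the url exactly the way injectLoop() does
        long long h = hash64b ( urls[i] );
        // skip urls we have already queued
        if ( dt->isInTable ( &h ) ) continue;
        // remember it so later duplicates are skipped
        if ( ! dt->addKey ( &h ) ) return -1;
        added++;
    }
    return added;
}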