// . get collectionRec from name // . returns NULL if not available CollectionRec *Collectiondb::getRec ( char *coll ) { if ( ! coll ) coll = ""; return getRec ( coll , gbstrlen(coll) ); }
// we only come back up here 1) in the very beginning or 2) when a url // completes its pipeline of requests bool Msge0::launchRequests ( long starti ) { // reset any error code g_errno = 0; loop: // stop if no more urls. return true if we got all replies! no block. if ( m_n >= m_numUrls ) return (m_numRequests == m_numReplies); // if all hosts are getting a diffbot reply with 50 spiders and they // all timeout at the same time we can very easily clog up the // udp sockets, so use this to limit... i've seen the whole // spider tables stuck with "getting outlink tag rec vector"statuses long maxOut = MAX_OUTSTANDING_MSGE0; if ( g_udpServer.m_numUsedSlots > 500 ) maxOut = 1; // if we are maxed out, we basically blocked! if (m_numRequests - m_numReplies >= maxOut ) return false; // . skip if "old" // . we are not planning on adding this to spiderdb, so Msg16 // want to skip the ip lookup, etc. if ( m_urlFlags && (m_urlFlags[m_n] & LF_OLDLINK) && m_skipOldLinks ) { m_numRequests++; m_numReplies++; m_n++; goto loop; } // if url is same host as the tagrec provided, just reference that! if ( m_urlFlags && (m_urlFlags[m_n] & LF_SAMEHOST) && m_baseTagRec) { m_tagRecPtrs[m_n] = (TagRec *)m_baseTagRec; m_numRequests++; m_numReplies++; m_n++; goto loop; } // . get the next url // . if m_xd is set, create the url from the ad id char *p = m_urlPtrs[m_n]; // get the length long plen = gbstrlen(p); // . grab a slot // . m_msg8as[i], m_msgCs[i], m_msg50s[i], m_msg20s[i] long i; // make this 0 since "maxOut" now changes!! for ( i = 0 /*starti*/ ; i < MAX_OUTSTANDING_MSGE0 ; i++ ) if ( ! m_used[i] ) break; // sanity check if ( i >= MAX_OUTSTANDING_MSGE0 ) { char *xx = NULL; *xx = 0; } // normalize the url m_urls[i].set ( p , plen ); // save the url number, "n" m_ns [i] = m_n; // claim it m_used[i] = true; // note it //if ( g_conf.m_logDebugSpider ) // log(LOG_DEBUG,"spider: msge0: processing url %s", // m_urls[i].getUrl()); // . start it off // . 
this will start the pipeline for this url // . it will set m_used[i] to true if we use it and block // . it will increment m_numRequests and NOT m_numReplies if it blocked sendMsg8a ( i ); // consider it launched m_numRequests++; // inc the url count m_n++; // try to do another goto loop; }
// . THIS Msg0 class must be alloc'd, i.e. not on the stack, etc.
// . if list is stored locally this tries to get it locally
// . otherwise tries to get the list from the network
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . NOTE: i was having problems with queries being cached too long, you
//   see the cache here is a NETWORK cache, so when the machines that owns
//   the list updates it on disk it can't flush our cache... so use a small
//   maxCacheAge of like , 30 seconds or so...
// . overview of the three paths taken below:
//   1) local:  group is ours (or there is only one host) and no explicit
//              hostId was given -> read via Msg5::getList()
//   2) single: an explicit, live hostId was given -> one udp request
//   3) group:  otherwise -> multicast to the group owning "startKey"
// . "ip", "port" and "doMerge" are not referenced by this function body
//   ("port" is shadowed by a local in the single-host path)
// . "startKey"/"endKey" point to keys of the size returned by
//   getKeySizeFromRdbId(rdbId); startKey > endKey cores
// . "minRecSizes" must be >= 0; a negative value sets g_errno to
//   EBADENGINEER, logs and cores
// . "coll" is logged if NULL but is still dereferenced further down
//   NOTE(review): a NULL coll looks like it will crash in strcpy()/gbstrlen()
//   when the request is serialized -- confirm callers never pass NULL
bool Msg0::getList ( long long hostId      , // host to ask (-1 if none)
		     long      ip          , // info on hostId
		     short     port        ,
		     long      maxCacheAge , // max cached age in seconds
		     bool      addToCache  , // add net recv'd list to cache?
		     char      rdbId       , // specifies the rdb
		     char     *coll        ,
		     RdbList  *list        ,
		     //key_t     startKey    , 
		     //key_t     endKey      , 
		     char     *startKey    ,
		     char     *endKey      ,
		     long      minRecSizes ,  // must be >= 0, see check below
		     void     *state       ,
		     void    (* callback)(void *state ),//, RdbList *list ) ,
		     long      niceness    ,
		     bool      doErrorCorrection ,
		     bool      includeTree ,
		     bool      doMerge     ,
		     long      firstHostId   ,
		     long      startFileNum  ,
		     long      numFiles      ,
		     long      timeout       ,
		     long long syncPoint     ,
		     long      preferLocalReads ,
		     Msg5     *msg5             ,
		     Msg5     *msg5b            ,
		     bool      isRealMerge      ,
//#ifdef SPLIT_INDEXDB
		     bool      allowPageCache    ,
		     bool      forceLocalIndexdb ,
		     bool      noSplit , // doIndexdbSplit    ,
		     long      forceParitySplit  ) {
//#else
//		     bool      allowPageCache ) {
//#endif

	// this is obsolete! mostly, but we need it for PageIndexdb.cpp to 
	// show a "termlist" for a given query term in its entirety so you 
	// don't have to check each machine in the network. if this is true it
	// means to query each split and merge the results together into a
	// single unified termlist. only applies to indexdb/datedb.
	//if ( doIndexdbSplit ) { char *xx = NULL; *xx = 0; }
	// note this because if caller is wrong it hurts performance major!!
	//if ( doIndexdbSplit ) 
	//	logf(LOG_DEBUG,"net: doing msg0 with indexdb split true");

	// warn only; execution continues with the NULL collection
	if ( ! coll ) log(LOG_LOGIC,"net: NULL collection. msg0.");

	//if ( doIndexdbSplit ) { char *xx=NULL;*xx=0; }

	// reset the list they passed us
	list->reset();
	// get keySize of rdb
	m_ks = getKeySizeFromRdbId ( rdbId );
	// if startKey > endKey, core (this used to just return true)
	//if ( startKey > endKey ) return true;
	if ( KEYCMP(startKey,endKey,m_ks)>0 ) { char *xx=NULL;*xx=0; }//rettrue
	// . reset hostid if it is dead
	// . this is causing UOR queries to take forever when we have a dead
	if ( hostId >= 0 && g_hostdb.isDead ( hostId ) ) hostId = -1;
	// no longer accept negative minrecsize; the null write below cores
	// so the "return true" is only reached if that write does not trap
	if ( minRecSizes < 0 ) {
		g_errno = EBADENGINEER;
		log(LOG_LOGIC,
		    "net: msg0: Negative minRecSizes no longer supported.");
		char *xx=NULL;*xx=0;
		return true;
	}

	// debug msg
	//if ( niceness != 0 ) log("HEY start");

	// ensure startKey last bit clear, endKey last bit set
	//if ( (startKey.n0 & 0x01) == 0x01 ) 
	//	log("Msg0::getList: warning startKey lastbit set"); 
	//if ( (endKey.n0   & 0x01) == 0x00 ) 
	//	log("Msg0::getList: warning endKey lastbit clear"); 

	// remember these; the reply wrappers are handed "this" as state
	m_state         = state;
	m_callback      = callback;
	m_list          = list;
	m_hostId        = hostId;
	m_niceness      = niceness;
	//m_ip            = ip;
	//m_port          = port;
	m_addToCache    = addToCache;
	// . these define our request 100%
	//m_startKey      = startKey;
	//m_endKey        = endKey;
	// copy m_ks bytes of each key into our own storage
	KEYSET(m_startKey,startKey,m_ks);
	KEYSET(m_endKey,endKey,m_ks);
	m_minRecSizes   = minRecSizes;
	m_rdbId         = rdbId;
	m_coll          = coll;
	m_isRealMerge   = isRealMerge;
	m_allowPageCache = allowPageCache;

	// . group to ask is based on the first key 
	// . we only do 1 group per call right now
	// . groupMask must turn on higher bits first (count downwards kinda)
	// . titledb and spiderdb use special masks to get groupId

	// did they force it? core until i figure out what this is
	if ( forceParitySplit >= 0 ) 
		m_groupId =  g_hostdb.getGroupId ( forceParitySplit );
	else
		m_groupId = getGroupId ( m_rdbId , startKey , ! noSplit );

	// how is this used?
	// forcing local indexdb overrides whatever group was picked above
	if ( forceLocalIndexdb ) m_groupId = g_hostdb.m_groupId;

	// . store these parameters
	// . get a handle to the rdb in case we can satisfy locally
	// . returns NULL and sets g_errno on error
	QUICKPOLL((m_niceness));
	Rdb *rdb = getRdbFromId ( m_rdbId );
	if ( ! rdb ) return true;
	// we need the fixedDataSize
	m_fixedDataSize = rdb->getFixedDataSize();
	m_useHalfKeys   = rdb->useHalfKeys();
	// . debug msg
	// . Msg2 does this when checking for a cached compound list.
	//   compound lists do not actually exist, they are merges of smaller
	//   UOR'd lists.
	if ( maxCacheAge != 0 && ! addToCache && (numFiles > 0 || includeTree))
		log(LOG_LOGIC,"net: msg0: "
		    "Weird. check but don't add... rdbid=%li.",(long)m_rdbId);
	// set this here since we may not call msg5 if list not local
	//m_list->setFixedDataSize ( m_fixedDataSize );

	// . now that we do load balancing we don't want to do a disk lookup
	//   even if local if we are merging or dumping
	// . UNLESS g_conf.m_preferLocalReads is true
	if ( preferLocalReads == -1 ) 
		preferLocalReads = g_conf.m_preferLocalReads;

	// . always prefer local for full split clusterdb
	// . and keep the tfndb/titledb lookups in the same stripe
	// . so basically we can't do biased caches if fully split
	//if ( g_conf.m_fullSplit ) preferLocalReads = true;
	// unconditionally overrides the caller/config value computed above
	preferLocalReads = true;

	// it it stored locally?
	bool isLocal = ( m_hostId == -1 && g_hostdb.m_groupId == m_groupId );
	// only do local lookups if this is true
	// (never taken at present since preferLocalReads was forced to true)
	if ( ! preferLocalReads ) isLocal = false;

	/*
	m_numSplit = 1;
	if ( g_hostdb.m_indexSplits > 1 &&
	     ( rdbId == RDB_POSDB || rdbId==RDB_DATEDB)&&
	     ! forceLocalIndexdb && doIndexdbSplit ) {
		isLocal  = false;
		//m_numSplit = INDEXDB_SPLIT;
		m_numSplit = g_hostdb.m_indexSplits;
		char *xx=NULL;*xx=0;
	}
	*/

	/*
	long long singleDocIdQuery = 0LL;
	if ( rdbId == RDB_POSDB ) {
		long long d1 = g_posdb.getDocId(m_startKey);
		long long d2 = g_posdb.getDocId(m_endKey);
		if ( d1+1 == d2 ) singleDocIdQuery = d1;
	}

	// . try the LOCAL termlist cache
	// . so when msg2 is evaluating a gbdocid:| query and it has to
	//   use msg0 to go across the network to get the same damn termlist
	//   over and over again for the same docid, this will help alot.
	// . ideally it'd be nice if the seo pipe in xmldoc.cpp can try to
	//   send the same gbdocid:xxxx docids to the same hosts. maybe hash
	//   based on docid into the list of hosts and if that host is busy
	//   just chain until we find someone not busy.
	if ( singleDocIdQuery &&
	     getListFromTermListCache ( coll,
					m_startKey,
					m_endKey,
					maxCacheAge,
					list ) )
		// found!
		return true;
	*/

	// but always local if only one host
	if ( g_hostdb.getNumHosts() == 1 ) isLocal = true;

	// force a msg0 if doing a docid restrictive query like
	// gbdocid:xxxx|<query> so we call cacheTermLists() 
	//if ( singleDocIdQuery ) isLocal = false;

	// . if the group is local then do it locally
	// . Msg5::getList() returns false if blocked, true otherwise
	// . Msg5::getList() sets g_errno on error
	// . don't do this if m_hostId was specified
	if ( isLocal ) { // && !g_conf.m_interfaceMachine ) {
		// use the caller's Msg5 if given, otherwise allocate our own
		// and flag it so we know it is ours
		if ( msg5 ) {
			m_msg5 = msg5;
			m_deleteMsg5 = false;
		}
		else {
			try { m_msg5 = new ( Msg5 ); } 
			catch ( ... ) {
				// note: g_errno stays ENOMEM while we fall
				// through to the network path at "skip"
				g_errno = ENOMEM;
				log("net: Local alloc for disk read failed "
				    "while tring to read data for %s. "
				    "Trying remote request.",
				    getDbnameFromId(m_rdbId));
				goto skip;
			}
			mnew ( m_msg5 , sizeof(Msg5) , "Msg0" );
			m_deleteMsg5 = true;
		}

		QUICKPOLL(m_niceness);
		// same for msg5b, but we only allocate one for titledb
		if ( msg5b ) {
			m_msg5b = msg5b;
			m_deleteMsg5b = false;
		}
		else if ( m_rdbId == RDB_TITLEDB ) {
			try { m_msg5b = new ( Msg5 ); } 
			catch ( ... ) {
				g_errno = ENOMEM;
				log("net: Local alloc for disk read failed "
				    "while tring to read data for %s. "
				    "Trying remote request. 2.",
				    getDbnameFromId(m_rdbId));
				goto skip;
			}
			mnew ( m_msg5b , sizeof(Msg5) , "Msg0b" );
			m_deleteMsg5b = true;
		}

		QUICKPOLL(m_niceness);
		// false means Msg5 blocked; gotListWrapper2 is passed as its
		// callback with "this" as state
		if ( ! m_msg5->getList ( rdbId,
					 coll ,
					 m_list ,
					 m_startKey ,
					 m_endKey   ,
					 m_minRecSizes ,
					 includeTree   , // include Tree?
					 addToCache    , // addToCache?
					 maxCacheAge   ,
					 startFileNum  , 
					 numFiles      ,
					 this ,
					 gotListWrapper2   ,
					 niceness          ,
					 doErrorCorrection ,
					 NULL , // cacheKeyPtr
					 0    , // retryNum
					 -1   , // maxRetries
					 true , // compensateForMerge
					 syncPoint ,
					 m_msg5b   ,
					 m_isRealMerge ,
					 m_allowPageCache ) ) return false;
		// did not block: nuke it
		reset();
		return true;
	}
skip:
	// debug msg
	if ( g_conf.m_logDebugQuery )
		log(LOG_DEBUG,"net: msg0: Sending request for data to "
		    "group=%li listPtr=%li minRecSizes=%li termId=%llu "
		    //"startKey.n1=%lx,n0=%llx (niceness=%li)",
		    "startKey.n1=%llx,n0=%llx (niceness=%li)",
		    g_hostdb.makeHostId ( m_groupId ) ,(long)m_list,
		    m_minRecSizes, g_posdb.getTermId(m_startKey) , 
		    //m_startKey.n1,m_startKey.n0 , (long)m_niceness);
		    KEY1(m_startKey,m_ks),KEY0(m_startKey),
		    (long)m_niceness);

	char *replyBuf = NULL;
	long  replyBufMaxSize = 0;
	bool  freeReply = true;

	// adjust niceness for net transmission
	// "realtime" is hard-wired to false, so the pre-allocation branch
	// below is never taken and replyBuf stays NULL
	bool realtime = false;
	//if ( minRecSizes + 32 < TMPBUFSIZE ) realtime = true;

	// if we're niceness 0 we need to pre-allocate for reply since it
	// might be received within the asynchronous signal handler which
	// cannot call mmalloc()
	if ( realtime ) { // niceness <= 0 || netnice == 0 ) {
		// . we should not get back more than minRecSizes bytes since 
		//   we are now performing merges
		// . it should not slow things down too much since the hashing
		//   is 10 times slower than merging anyhow...
		// . CAUTION: if rdb is not fixed-datasize then this will
		//   not work for us! it can exceed m_minRecSizes.
		replyBufMaxSize = m_minRecSizes ;
		// . get a little extra to fix the error where we ask for 64 
		//   but get 72
		// . where is that coming from?
		// . when getting titleRecs we often exceed the minRecSizes 
		// . ?Msg8? was having trouble. was short 32 bytes sometimes.
		replyBufMaxSize += 36;
		// why add ten percent?
		//replyBufMaxSize *= 110 ;
		//replyBufMaxSize /= 100 ;
		// make a buffer to hold the reply
//#ifdef SPLIT_INDEXDB
		/*
		if ( m_numSplit > 1 ) {
			m_replyBufSize = replyBufMaxSize * m_numSplit;
			replyBuf = (char *) mmalloc(m_replyBufSize, "Msg0");
			m_replyBuf  = replyBuf;
			freeReply = false;
		}
		else
		*/
//#endif
			replyBuf = (char *) mmalloc(replyBufMaxSize , "Msg0");
		// g_errno is set and we return true if it failed
		if ( ! replyBuf ) {
			log("net: Failed to pre-allocate %li bytes to hold "
			    "data read remotely from %s: %s.",
			    replyBufMaxSize,getDbnameFromId(m_rdbId),
			    mstrerror(g_errno));
			return true;
		}
	}

	// . make a request with the info above (note: not in network order)
	// . IMPORTANT!!!!! if you change this change 
	//   Multicast.cpp::sleepWrapper1 too!!!!!!!!!!!!
	//   no, not anymore, we commented out that request peeking code
	// . layout as written here: syncPoint(8) minRecSizes(4)
	//   startFileNum(4) numFiles(4) maxCacheAge(4) rdbId(1)
	//   addToCache(1) doErrorCorrection(1) includeTree(1) niceness(1)
	//   allowPageCache(1) startKey(m_ks) endKey(m_ks) coll + '\0'
	// . NOTE(review): the "p += 4" steps assume sizeof(long) is 4;
	//   confirm before building for a 64-bit long
	char *p = m_request;
	*(long long *) p = syncPoint        ; p += 8;
	//*(key_t     *) p = m_startKey       ; p += sizeof(key_t);
	//*(key_t     *) p = m_endKey         ; p += sizeof(key_t);
	*(long      *) p = m_minRecSizes    ; p += 4;
	*(long      *) p = startFileNum     ; p += 4;
	*(long      *) p = numFiles         ; p += 4;
	*(long      *) p = maxCacheAge      ; p += 4;
	*p               = m_rdbId          ; p++;
	*p               = addToCache       ; p++;
	*p               = doErrorCorrection; p++;
	*p               = includeTree      ; p++;
	*p               = (char)niceness   ; p++;
	*p               = (char)m_allowPageCache; p++;
	KEYSET(p,m_startKey,m_ks); ; p+=m_ks;
	KEYSET(p,m_endKey,m_ks); ; p+=m_ks;
	// NULL terminated collection name
	// NOTE(review): no length check against the size of m_request here;
	// presumably the buffer is sized for the longest collection name
	strcpy ( p , coll ); p += gbstrlen ( coll ); *p++ = '\0';
	m_requestSize    = p - m_request;
	// ask an individual host for this list if hostId is NOT -1
	if ( m_hostId != -1 ) {
		// get Host
		Host *h = g_hostdb.getHost ( m_hostId );
		if ( ! h ) { 
			g_errno = EBADHOSTID; 
			log(LOG_LOGIC,"net: msg0: Bad hostId of %lli.",
			    m_hostId);
			return true;
		}
		// if niceness is 0, use the higher priority udpServer
		// (that selection is commented out; g_udpServer and the
		// host's m_port are always used now)
		UdpServer *us ;
		unsigned short port;
		QUICKPOLL(m_niceness);
		//if ( niceness <= 0 || netnice == 0 ) { 
		//if ( realtime ) {
		//	us = &g_udpServer2; port = h->m_port2; }
		//else                 { 
		us = &g_udpServer ; port = h->m_port ; 
		// . returns false on error and sets g_errno, true otherwise
		// . calls callback when reply is received (or error)
		// . we return true if it returns false
		if ( ! us->sendRequest ( m_request     ,
					 m_requestSize ,
					 0x00          , // msgType
					 h->m_ip       ,
					 port          ,
					 m_hostId      ,
					 NULL          , // the slotPtr
					 this          ,
					 gotSingleReplyWrapper ,
					 timeout       ,
					 -1            , // backoff
					 -1            , // maxwait
					 replyBuf      ,
					 replyBufMaxSize ,
					 m_niceness     ) ) // cback niceness
			return true;
		// return false cuz it blocked
		return false;
	}
	// timing debug
	if ( g_conf.m_logTimingNet )
		m_startTime = gettimeofdayInMilliseconds();
	else
		m_startTime = 0;
	//if ( m_rdbId == RDB_INDEXDB ) log("Msg0:: getting remote indexlist. "
	//			"termId=%llu, "
	//			"groupNum=%lu",
	//			g_indexdb.getTermId(m_startKey) ,
	//			g_hostdb.makeHostId ( m_groupId ) );

	/*
	// make the cache key so we can see what remote host cached it, if any
	char cacheKey[MAX_KEY_BYTES];
	//key_t cacheKey = makeCacheKey ( startKey     ,
	makeCacheKey ( startKey     ,
		       endKey       ,
		       includeTree  ,
		       minRecSizes  ,
		       startFileNum ,
		       numFiles     ,
		       cacheKey     ,
		       m_ks         );
	*/

	// . get the top long of the key
	// . i guess this will work for 128 bit keys... hmmmmm
	// . hashed from the caller's startKey; handed to the multicast
	//   below as its key argument
	long keyTop = hash32 ( (char *)startKey , m_ks );

	/*
	// allocate space
	if ( m_numSplit > 1 ) {
		long  need = m_numSplit * sizeof(Multicast) ;
		char *buf  = (char *)mmalloc ( need,"msg0mcast" );
		if ( ! buf ) return true;
		m_mcasts = (Multicast *)buf;
		for ( long i = 0; i < m_numSplit ; i++ )
			m_mcasts[i].constructor();
	}
	*/

	// . otherwise, multicast to a host in group "groupId"
	// . returns false and sets g_errno on error
	// . calls callback on completion
	// . select first host to send to in group based on upper 32 bits
	//   of termId (m_startKey.n1)
//#ifdef SPLIT_INDEXDB
	// . need to send out to all the indexdb split hosts
	// . the split loop is commented out, so exactly one request is sent
	m_numRequests = 0;
	m_numReplies  = 0;
	//for ( long i = 0; i < m_numSplit; i++ ) {

	QUICKPOLL(m_niceness);
	long  gr;
	char *buf;
	/*
	if ( m_numSplit > 1 ) {
		gr  = g_indexdb.getSplitGroupId ( baseGroupId, i );
		buf = &replyBuf[i*replyBufMaxSize];
	}
	else {
	*/
	gr  = m_groupId;
	buf = replyBuf;
	//}

	// get the multicast
	Multicast *m = &m_mcast;
	//if ( m_numSplit > 1 ) m = &m_mcasts[i];

	if ( ! m->send ( m_request    , 
//#else
//	if ( ! m_mcast.send ( m_request    , 
//#endif
			      m_requestSize, 
			      0x00         , // msgType 0x00
			      false        , // does multicast own request?
//#ifdef SPLIT_INDEXDB
			      gr , // group + offset
//#else
//			      m_groupId    , // group to send to (groupKey)
//#endif
			      false        , // send to whole group?
			      //m_startKey.n1, // key is passed on startKey
			      keyTop       , // key is passed on startKey
			      this         , // state data
			      NULL         , // state data
			      gotMulticastReplyWrapper0 ,
			      timeout      , // timeout in seconds (was 30)
			      niceness     ,
			      realtime     ,
			      firstHostId  ,
//#ifdef SPLIT_INDEXDB
//			      &replyBuf[i*replyBufMaxSize] ,
//#else
//			      replyBuf        ,
//#endif
			      buf             ,
			      replyBufMaxSize ,
			      freeReply       , // free reply buf?
			      true            , // do disk load balancing?
			      maxCacheAge     ,
			      //(key_t *)cacheKey        ,
			      // multicast uses it for determining the best
			      // host to send the request to when doing 
			      // disk load balancing. if the host has our 
			      // data cached, then it will probably get to
			      // handle the request. for now let's just assume
			      // this is a 96-bit key. TODO: fix...
			      0 , // *(key_t *)cacheKey        ,
			      rdbId           ,
			      minRecSizes     ) ) {
		log("net: Failed to send request for data from %s in group "
		    "#%li over network: %s.",
		    getDbnameFromId(m_rdbId),m_groupId, mstrerror(g_errno));
		// no, multicast will free this when it is destroyed
		//if (replyBuf) mfree ( replyBuf , replyBufMaxSize , "Msg22" );
		// but speed it up
//#ifdef SPLIT_INDEXDB
		// save the error, then reset the multicast
		m_errno = g_errno;
		m->reset();
		// m_numRequests was zeroed above and is only incremented
		// after a successful send, so this is a leftover from the
		// split loop
		if ( m_numRequests > 0 )
			return false;
//#else
//		m_mcast.reset();
//#endif
		return true;
	}
//#ifdef SPLIT_INDEXDB
	m_numRequests++;
//#endif
	// we blocked
	return false;
}
char *getMatches2 ( Needle *needles , int32_t numNeedles , char *haystack , int32_t haystackSize , char *linkPos , int32_t *needleNum , bool stopAtFirstMatch , bool *hadPreMatch , bool saveQuickTables , int32_t niceness ) { // assume not if ( hadPreMatch ) *hadPreMatch = false; // empty haystack? then no matches if ( ! haystack || haystackSize <= 0 ) return NULL; // JAB: no needles? then no matches if ( ! needles || numNeedles <= 0 ) return NULL; //char tmp[8192]; //char *t = tmp; //char *tend = tmp + 8192; // reset counts to 0 //if ( ! stopAtFirstMatch ) // for ( int32_t i=0 ; i < numNeedles ; i++ ) // needles[i].m_count = 0; // are we responsible for init'ing string lengths? this is much // faster than having to specify lengths manually. for ( int32_t i=0 ; i < numNeedles; i++ ) { // breathe QUICKPOLL(niceness); // clear needles[i].m_count = 0; needles[i].m_firstMatch = NULL; // set the string size in bytes if not provided if ( needles[i].m_stringSize == 0 ) needles[i].m_stringSize = gbstrlen(needles[i].m_string); } // . set up the quick tables. // . utf16 is not as effective here because half the bytes are zeroes! // . TODO: use a static cache of like 4 of these tables where the key // is the Needles ptr ... done int32_t numNeedlesToInit = numNeedles; char space[256 * 4 * sizeof(BITVEC)]; char *buf = NULL; BITVEC *s0; BITVEC *s1; BITVEC *s2; BITVEC *s3; /* static bool s_quickTableInit = false; static char s_qtbuf[128*(12+1)*2]; int32_t slot = -1; if(saveQuickTables) { if ( ! 
s_quickTableInit ) { s_quickTableInit = true; s_quickTables.set(8,4,128,s_qtbuf,256*13,false,0,"qx"); } uint64_t key = (uint32_t)needles; slot = s_quickTables.getSlot(&key); if ( slot >= 0 ) { buf = s_quickTables.getValueFromSlot(slot); numNeedlesToInit = 0; } } */ if(!buf) { buf = space; memset ( buf , 0 , sizeof(BITVEC)*256*4); } /* if( useQuickTables && slot == -1 ) { //buf = (char*)mcalloc(sizeof(uint32_t)*256*5, // "matches"); if(buf) s_quickTables.addKey(&key, &buf); //sanity check, no reason why there needs to be a //limit, I just don't expect there to be this many //static needles at this point. if(s_quickTables.getNumSlotsUsed() > 32){ char *xx=NULL; *xx = 0; } } */ // try 64 bit bit vectors now since we doubled # of needles int32_t offset = 0; s0 = (BITVEC *)(buf + offset); offset += sizeof(BITVEC)*256; s1 = (BITVEC *)(buf + offset); offset += sizeof(BITVEC)*256; s2 = (BITVEC *)(buf + offset); offset += sizeof(BITVEC)*256; s3 = (BITVEC *)(buf + offset); offset += sizeof(BITVEC)*256; BITVEC mask; // set the letter tables, s0[] through sN[], for each needle for ( int32_t i = 0 ; i < numNeedlesToInit ; i++ ) { // breathe QUICKPOLL(niceness); unsigned char *w = (unsigned char *)needles[i].m_string; unsigned char *wend = w + needles[i].m_stringSize; // BITVEC is now 64 bits mask = (1<<(i&0x3f)); // (1<<(i%64)); // if the needle is small, fill up the remaining letter tables // with its mask... so it matches any character in haystack. 
s0[(unsigned char)to_lower_a(*w)] |= mask; s0[(unsigned char)to_upper_a(*w)] |= mask; w += 1;//step; if ( w >= wend ) { for ( int32_t j = 0 ; j < 256 ; j++ ) { s1[j] |= mask; s2[j] |= mask; s3[j] |= mask; } continue; } s1[(unsigned char)to_lower_a(*w)] |= mask; s1[(unsigned char)to_upper_a(*w)] |= mask; w += 1;//step; if ( w >= wend ) { for ( int32_t j = 0 ; j < 256 ; j++ ) { s2[j] |= mask; s3[j] |= mask; } continue; } s2[(unsigned char)to_lower_a(*w)] |= mask; s2[(unsigned char)to_upper_a(*w)] |= mask; w += 1;//step; if ( w >= wend ) { for ( int32_t j = 0 ; j < 256 ; j++ ) { s3[j] |= mask; } continue; } s3[(unsigned char)to_lower_a(*w)] |= mask; s3[(unsigned char)to_upper_a(*w)] |= mask; w += 1;//step; } // return a ptr to the first match if we should, this is it char *retVal = NULL; // debug vars //int32_t debugCount = 0; //int32_t pp = 0; // now find the first needle in the haystack unsigned char *p = (unsigned char *)haystack; unsigned char *pend = (unsigned char *)haystack + haystackSize; char *dend = (char *)pend; // do not breach! pend -= 4; for ( ; p < pend ; p++ ) { // breathe QUICKPOLL(niceness); //if ( (char *)p - (char *)haystack >= 12508 ) // log("hey"); // analytics... // is this a possible match? (this should be VERY fast) mask = s0[*(p+0)]; if ( ! mask ) continue; mask &= s1[*(p+1)]; if ( ! mask ) continue; mask &= s2[*(p+2)]; if ( ! mask ) continue; mask &= s3[*(p+3)]; if ( ! mask ) continue; //debugCount++; /* // display char oo[148]; char *xx ; xx = oo; //gbmemcpy ( xx , p , 8 ); for ( int32_t k = 0 ; k < 5 ; k++ ) { *xx++ = p[k]; } gbmemcpy ( xx , "..." , 3 ); xx += 3; */ // // XXX: do a hashtable lookup here so we have the candidate // matches in a chain... // XXX: for small needles which match frequently let's have // a single char hash table, a 2 byte char hash table, // etc. so if we have small needles we check the hash // in those tables first, but only if mask & SMALL_NEEDLE // is true! 
the single byte needle hash table can just // be a lookup table. just XOR the bytes together for // the hash. // XXX: just hash the mask into a table to get candidate // matches in a chain? but there's 4B hashes!! // we got a good candidate, loop through all the needles for ( int32_t j = 0 ; j < numNeedles ; j++ ) { // skip if does not match mask, will save time if ( ! ((1<<(j&0x3f)) & mask) ) continue; if( needles[j].m_stringSize > 3) { // ensure first 4 bytes matches this needle's if (needles[j].m_string[0]!=to_lower_a(*(p+0))) continue; if (needles[j].m_string[1]!=to_lower_a(*(p+1))) continue; if (needles[j].m_string[2]!=to_lower_a(*(p+2))) continue; if (needles[j].m_string[3]!=to_lower_a(*(p+3))) continue; } // get needle size int32_t msize = needles[j].m_stringSize; // can p possibly be big enough? if ( pend - p < msize ) continue; // needle is "m" now char *m = needles[j].m_string; char *mend = needles[j].m_stringSize + m; // use a tmp ptr for ptr into haystack char *d = (char *)p; // skip first 4 bytes since we know they match if(msize > 3) { d += 4; m += 4; } // loop over each char in "m" //for ( ; *m ; m++ ) { for ( ; m < mend ; m++ ) { //while ( ! *d && d < dend ) d++; //while ( ! *m && m < mend ) m++; // if we are a non alnum, that will match // any string of non-alnums, like a space // for instance. the 0 byte does not count // because it is used in utf16 a lot. this // may trigger some false matches in utf16 // but, oh well... this way "link partner" // will match "link - partner" in the haystk if ( is_wspace_a(*m) && m < mend ) { // skip all in "d" then. while (d<dend&&is_wspace_a(*d)) d++; // advance m then continue; } // make sure we match otherwise if ( *m != to_lower_a(*d) ) break; // ok, we matched, go to next d++; } // if not null, keep going if ( m < mend ) continue; // if this needle is "special" AND it occurs AFTER // linkPos, then do not consider it a match. 
this is // if we have a comment section indicator, like // "div id=\"comment" AND it occurs AFTER linkPos // (the char ptr to our link in the haystack) then // the match does not count. if ( linkPos && needles[j].m_isSection && (char *)p>linkPos ) { // record this for LinkText.cpp if ( hadPreMatch ) *hadPreMatch = true; continue; } // store ptr if NULL if ( ! needles[j].m_firstMatch ) needles[j].m_firstMatch = (char *)p; // return ptr to needle in "haystack" if ( stopAtFirstMatch ) { // ok, we got a match if ( needleNum ) *needleNum = j; //return (char *)p; retVal = (char *)p; p = pend; break; } // otherwise, just count it needles[j].m_count++; // see if we match another needle, fixes bug // of matching "anal" but not "analy[tics]" continue; // advance to next char in the haystack break; } // ok, we did not match any needles, advance p and try again } // // HACK: // // repeat above loop but for the last 4 characters in haystack!! // this fixes a electric fence mem breach core // // it is slower because we check for \0 // pend += 4; for ( ; p < pend ; p++ ) { // breathe QUICKPOLL(niceness); //if ( (char *)p - (char *)haystack >= 12508 ) // log("hey"); // is this a possible match? (this should be VERY fast) mask = s0[*(p+0)]; if ( ! mask ) continue; if ( p+1 < pend ) { mask &= s1[*(p+1)]; if ( ! mask ) continue; } if ( p+2 < pend ) { mask &= s2[*(p+2)]; if ( ! mask ) continue; } if ( p+3 < pend ) { mask &= s3[*(p+3)]; if ( ! mask ) continue; } //debugCount++; /* // display char oo[148]; char *xx ; xx = oo; //gbmemcpy ( xx , p , 8 ); for ( int32_t k = 0 ; k < 5 ; k++ ) { *xx++ = p[k]; } gbmemcpy ( xx , "..." , 3 ); xx += 3; */ // // XXX: do a hashtable lookup here so we have the candidate // matches in a chain... // XXX: for small needles which match frequently let's have // a single char hash table, a 2 byte char hash table, // etc. so if we have small needles we check the hash // in those tables first, but only if mask & SMALL_NEEDLE // is true! 
the single byte needle hash table can just // be a lookup table. just XOR the bytes together for // the hash. // XXX: just hash the mask into a table to get candidate // matches in a chain? but there's 4B hashes!! // we got a good candidate, loop through all the needles for ( int32_t j = 0 ; j < numNeedles ; j++ ) { // skip if does not match mask, will save time if ( ! ((1<<(j&0x3f)) & mask) ) continue; if( needles[j].m_stringSize > 3) { // ensure first 4 bytes matches this needle's if (needles[j].m_string[0]!=to_lower_a(*(p+0))) continue; if (!p[1] || needles[j].m_string[1]!=to_lower_a(*(p+1))) continue; if (!p[2] || needles[j].m_string[2]!=to_lower_a(*(p+2))) continue; if (!p[3] || needles[j].m_string[3]!=to_lower_a(*(p+3))) continue; } // get needle size int32_t msize = needles[j].m_stringSize; // can p possibly be big enough? if ( pend - p < msize ) continue; // needle is "m" now char *m = needles[j].m_string; char *mend = needles[j].m_stringSize + m; // use a tmp ptr for ptr into haystack char *d = (char *)p; // skip first 4 bytes since we know they match if(msize > 3) { d += 4; m += 4; } // loop over each char in "m" //for ( ; *m ; m++ ) { for ( ; m < mend ; m++ ) { //while ( ! *d && d < dend ) d++; //while ( ! *m && m < mend ) m++; // if we are a non alnum, that will match // any string of non-alnums, like a space // for instance. the 0 byte does not count // because it is used in utf16 a lot. this // may trigger some false matches in utf16 // but, oh well... this way "link partner" // will match "link - partner" in the haystk if ( is_wspace_a(*m) && m < mend ) { // skip all in "d" then. while (d<dend&&is_wspace_a(*d)) d++; // advance m then continue; } // make sure we match otherwise if ( *m != to_lower_a(*d) ) break; // ok, we matched, go to next d++; } // if not null, keep going if ( m < mend ) continue; // if this needle is "special" AND it occurs AFTER // linkPos, then do not consider it a match. 
this is // if we have a comment section indicator, like // "div id=\"comment" AND it occurs AFTER linkPos // (the char ptr to our link in the haystack) then // the match does not count. if ( linkPos && needles[j].m_isSection && (char *)p>linkPos ) { // record this for LinkText.cpp if ( hadPreMatch ) *hadPreMatch = true; continue; } // store ptr if NULL if ( ! needles[j].m_firstMatch ) needles[j].m_firstMatch = (char *)p; // return ptr to needle in "haystack" if ( stopAtFirstMatch ) { // ok, we got a match if ( needleNum ) *needleNum = j; //return (char *)p; retVal = (char *)p; p = pend; break; } // otherwise, just count it needles[j].m_count++; // advance to next char in the haystack break; } // ok, we did not match any needles, advance p and try again } //if ( debugCount > 0 ) pp = haystackSize / debugCount; //log("build: debug count = %"INT32" uc=%"INT32" hsize=%"INT32" " // "1 in %"INT32" chars matches.", // debugCount,(int32_t)isHaystackUtf16,haystackSize,pp); // before we exit, clean up return retVal; }
int main ( int argc , char *argv[] ) { bool addWWW = true; bool stripSession = true; // check for arguments for (int32_t i = 1; i < argc; i++) { if (strcmp(argv[i], "-w") == 0) addWWW = false; else if (strcmp(argv[i], "-s") == 0) stripSession = false; } // initialize //g_mem.init(100*1024); hashinit(); //g_conf.m_tfndbExtBits = 23; loop: // read a url from stddin char sbuf[1024]; if ( ! fgets ( sbuf , 1024 , stdin ) ) exit(1); char *s = sbuf; char fbuf[1024]; // decode if we should if ( strncmp(s,"http%3A%2F%2F",13) == 0 || strncmp(s,"https%3A%2F%2F",13) == 0 ) { urlDecode(fbuf,s,gbstrlen(s)); s = fbuf; } // old url printf("###############\n"); printf("old: %s",s); int32_t slen = gbstrlen(s); // remove any www. if !addWWW if (!addWWW) { if (slen >= 4 && strncasecmp(s, "www.", 4) == 0) { slen -= 4; memmove(s, &s[4], slen); } else { // get past a :// int32_t si = 0; while (si < slen && ( s[si] != ':' || s[si+1] != '/' || s[si+2] != '/' ) ) si++; // remove the www. if (si + 7 < slen) { si += 3; if (strncasecmp(&s[si], "www.", 4) == 0) { slen -= 4; memmove(&s[si], &s[si+4], slen-si); } } } } // set it Url u; u.set ( s , slen , addWWW , /*add www?*/ stripSession ); /*strip session ids?*/ // print it char out[1024*4]; char *p = out; p += sprintf(p,"tld: "); gbmemcpy ( p, u.getTLD(),u.getTLDLen()); p += u.getTLDLen(); char c = *p; *p = '\0'; printf("%s\n",out); *p = c; // dom p = out; sprintf ( p , "dom: "); p += gbstrlen ( p ); gbmemcpy ( p , u.getDomain() , u.getDomainLen() ); p += u.getDomainLen(); c = *p; *p = '\0'; printf("%s\n",out); *p = c; // host p = out; sprintf ( p , "host: "); p += gbstrlen ( p ); gbmemcpy ( p , u.getHost() , u.getHostLen() ); p += u.getHostLen(); c = *p; *p = '\0'; printf("%s\n",out); *p = c; // then the whole url printf("url: %s\n", u.getUrl() ); /* int32_t siteLen; char *site = u.getSite ( &siteLen , NULL , false ); if ( site ) { c = site[siteLen]; site[siteLen] = '\0'; } printf("site: %s\n", site ); if ( site ) site[siteLen] = c; */ 
SiteGetter sg; sg.getSite ( u.getUrl() , NULL , // tagrec 0 , // timestamp NULL, // coll 0 , // niceness //false , // addtags NULL , // state NULL ); // callback if ( sg.m_siteLen ) printf("site: %s\n",sg.m_site); printf("isRoot: %"INT32"\n",(int32_t)u.isRoot()); /* bool perm = ::isPermalink ( NULL , // coll NULL , // Links ptr &u , // the url CT_HTML , // contentType NULL , // LinkInfo ptr false );// isRSS? printf ("isPermalink: %"INT32"\n",(int32_t)perm); */ // print the path too p = out; p += sprintf ( p , "path: " ); gbmemcpy ( p , u.getPath(), u.getPathLen() ); p += u.getPathLen(); if ( u.getFilename() ) { p += sprintf ( p , "\nfilename: " ); gbmemcpy ( p , u.getFilename(), u.getFilenameLen() ); p += u.getFilenameLen(); *p = '\0'; printf("%s\n", out ); } // encoded char dst[MAX_URL_LEN+200]; urlEncode ( dst,MAX_URL_LEN+100, u.getUrl(), u.getUrlLen(), false ); // are we encoding a request path? printf("encoded: %s\n",dst); // the probable docid int64_t pd = g_titledb.getProbableDocId(&u); printf("pdocid: %"UINT64"\n", pd ); printf("dom8: 0x%"XINT32"\n", (int32_t)g_titledb.getDomHash8FromDocId(pd) ); //printf("ext23: 0x%"XINT32"\n",g_tfndb.makeExt(&u)); if ( u.isLinkLoop() ) printf("islinkloop: yes\n"); else printf("islinkloop: no\n"); int64_t hh64 = u.getHostHash64(); printf("hosthash64: 0x%016"XINT64"\n",hh64); uint32_t hh32 = u.getHostHash32(); printf("hosthash32: 0x%08"XINT32" (%"UINT32")\n",hh32,hh32); int64_t dh64 = u.getDomainHash64(); printf("domhash64: 0x%016"XINT64"\n",dh64); int64_t uh64 = u.getUrlHash64(); printf("urlhash64: 0x%016"XINT64"\n",uh64); //if(isUrlUnregulated(NULL ,0,&u)) printf("unregulated: yes\n"); //else printf("unregulated: no\n"); goto loop; }
// returns false on bad mime bool HttpMime::parse ( char *mime , long mimeLen , Url *url ) { // reset locUrl to 0 m_locUrl.reset(); // return if we have no valid complete mime if ( mimeLen == 0 ) return false; // status is on first line m_status = -1; // skip HTTP/x.x till we hit a space char *p = mime; char *pend = mime + mimeLen; while ( p < pend && !is_wspace_a(*p) ) p++; // then skip over spaces while ( p < pend && is_wspace_a(*p) ) p++; // return false on a problem if ( p == pend ) return false; // then read in the http status m_status = atol2 ( p , pend - p ); // if no Content-Type: mime field was provided, assume html m_contentType = CT_HTML; // assume default charset m_charset = NULL; m_charsetLen = 0; // set contentLen, lastModifiedDate, m_cookie p = mime; while ( p < pend ) { // compute the length of the string starting at p and ending // at a \n or \r long len = 0; while ( &p[len] < pend && p[len]!='\n' && p[len]!='\r' ) len++; // . if we could not find a \n or \r there was an error // . MIMEs must always end in \n or \r if ( &p[len] >= pend ) return false; // . stick a NULL at the end of the line // . 
overwrites \n or \r TEMPORARILY char c = p [ len ]; p [ len ] = '\0'; // parse out some meaningful data if ( strncasecmp ( p , "Content-Length:" ,15) == 0 ) { m_contentLengthPos = p + 15; m_contentLen = atol( m_contentLengthPos); } else if ( strncasecmp ( p , "Last-Modified:" ,14) == 0 ) { m_lastModifiedDate=atotime(p+14); // do not let them exceed current time for purposes // of sorting by date using datedb (see Msg16.cpp) time_t now = time(NULL); if (m_lastModifiedDate > now) m_lastModifiedDate = now; } else if ( strncasecmp ( p , "Content-Type:" ,13) == 0 ) m_contentType = getContentTypePrivate ( p + 13 ); else if ( strncasecmp ( p , "Set-Cookie: " ,11) == 0 ) { m_cookie = p + 11; m_cookieLen = gbstrlen ( p + 11 ); } else if ( strncasecmp ( p , "Location:" , 9) == 0 ) { // point to it char *tt = p + 9; // skip if space if ( *tt == ' ' ) tt++; if ( *tt == ' ' ) tt++; // at least set this for Msg13.cpp to use m_locationField = tt; m_locationFieldLen = gbstrlen(tt); // . we don't add the "www." because of slashdot.com // . we skip initial spaces in this Url::set() routine if(url) m_locUrl.set ( url, p + 9, len - 9, false/*addWWW?*/); } else if ( strncasecmp ( p , "Content-Encoding:", 17) == 0 ) { //only support gzip now, it doesn't seem like servers //implement the other types much m_contentEncodingPos = p+17; if(strstr(m_contentEncodingPos, "gzip")) { m_contentEncoding = ET_GZIP; } else if(strstr(m_contentEncodingPos, "deflate")) { //zlib's compression m_contentEncoding = ET_DEFLATE; } } //else if ( strncasecmp ( p, "Cookie:", 7) == 0 ) // log (LOG_INFO, "mime: Got Cookie = %s", (p+7)); // re-insert the character that we replaced with a '\0' p [ len ] = c; // go to next line p += len; // skip over the cruft at the end of this line while ( p < pend && ( *p=='\r' || *p=='\n' ) ) p++; } return true; }
// this should be called when all docs have finished spidering void Test::stopIt ( ) { // sanity if ( m_isAdding ) { char *xx=NULL;*xx=0; } // flag that we are done m_isRunning = false; // print time log("test: took %lli ms to complete injections.", gettimeofdayInMilliseconds() - m_testStartTime ); // get this before setting testParserEnabled to false char *testDir = g_test.getTestDir(); // turn this off now too g_conf.m_testParserEnabled = false; g_conf.m_testSpiderEnabled = false; // save all! bool disabled = g_threads.m_disabled; g_threads.disableThreads(); // save it blocking style g_process.save(); if ( ! disabled ) g_threads.enableThreads(); // save ips.txt saveTestBuf ( testDir ); log("test: test completed. making qa.html"); // // // NOW MAKE THE qa.html FILE // // // only analyze up to last 7 runs long start = m_runId - 7; if ( start < 0 ) start = 0; SafeBuf sb; sb.safePrintf("<table border=1>\n"); sb.safePrintf("<tr>" "<td><b><nobr>run id</nobr></b></td>" "<td><b><nobr>conf diff</nobr></b></td>" "<td><b><nobr>coll diff</nobr></b></td>" "<td><b><nobr>run info</nobr></b></td>" "</tr>\n"); // take diffs between this run and the last run for confparms for ( long i = m_runId ; i > start ; i-- ) { // shortcut char *dir = g_hostdb.m_dir; // make diff filename char diff1[200]; sprintf(diff1,"%s/%s/run.%li.confparms.txt.diff",dir, testDir,i); File f1; f1.set(diff1); if ( ! f1.doesExist() ) { char df1[200]; char df2[200]; sprintf(df1,"%s/%s/run.%li.confparms.txt",dir, testDir,i); sprintf(df2,"%s/%s/run.%li.confparms.txt",dir, testDir,i-1); // do the diff char cmd[600]; sprintf(cmd,"diff %s %s > %s",df1,df2,diff1); log("test: system(\"%s\")",cmd); system (cmd); } long fs1 = f1.getFileSize(); sb.safePrintf("<tr><td>%li</td><td>%li</td>", i,fs1); // make diff filename char diff2[200]; sprintf(diff2,"%s/%s/run.%li.collparms.txt.diff",dir, testDir,i); File f2; f2.set(diff2); if ( ! 
f2.doesExist() ) { char df1[200]; char df2[200]; sprintf(df1,"%s/%s/run.%li.collparms.txt",dir, testDir,i); sprintf(df2,"%s/%s/run.%li.collparms.txt",dir, testDir,i-1); // do the diff char cmd[600]; sprintf(cmd,"diff %s %s > %s",df1,df2,diff2); log("test: system(\"%s\")",cmd); system (cmd); } long fs2 = f2.getFileSize(); sb.safePrintf("<td>%li</td>", fs2); // the version char vf[200]; sprintf(vf,"%s/%s/run.%li.version.txt",dir,testDir,i); File f3; f3.set ( vf ); long fs3 = f3.getFileSize(); char vbuf[1000]; vbuf[0] = 0; if ( fs3 > 0 ) { f3.open(O_RDONLY); long rs = f3.read(vbuf,fs3,0); vbuf[fs3] = '\0'; if ( rs <= 0 ) continue; f3.close(); } // show it sb.safePrintf("<td><pre>%s</pre></td></tr>\n", vbuf); } sb.safePrintf("</table>\n"); sb.safePrintf("<br>\n"); // // now diff each parser output file for each url in urls.txt // // // loop over url buf first so we can print one table per url // char *next = NULL; // reset the url buf ptr m_urlPtr = m_urlBuf; // count em long count = 0; // ptrs to each url table long un = 0; long uptr [5000]; // offsets now, not char ptr since buf gets reallocd char udiff[5000]; long ulen [5000]; long uhits[5000]; // critical errors! validateOutput() choked! long uunchecked[5000]; // events/addresses found but were not validatd long umiss[5000]; long usort[5000]; long uevents[5000]; SafeBuf tmp; long niceness = MAX_NICENESS; // advance to next url for ( ; m_urlPtr < m_urlEnd ; m_urlPtr = next ) { // breathe QUICKPOLL(niceness); // we converted all non-url chars into \0's so skip those! 
for ( ; m_urlPtr<m_urlEnd && !*m_urlPtr ; m_urlPtr++ ); // breach check if ( m_urlPtr >= m_urlEnd ) break; // set this up next = m_urlPtr; // compute next url ptr for ( ; next < m_urlEnd && *next ; next++ ); // point to this url char *u = m_urlPtr; // get hash long long h = hash64 ( u , gbstrlen(u) ); // shortcut char *dir = g_hostdb.m_dir; // print into a secondary safe buf with a ptr to // it so we can sort that and transfer into the // primary safebuf later uptr[un] = tmp.length(); // assume no diff udiff[un] = 0; // print number tmp.safePrintf("%li) ",count++); // . link to our stored http server reply // . TODO: link it to our [cached] copy in the test coll!!! char local[1200]; sprintf(local,"/%s/doc.%llu.html",testDir,h); tmp.safePrintf("<a href=\"%s\"><b>%s</b></a> ",local,u); // link to live page tmp.safePrintf(" <a href=\"%s\">live</a> ",u); // link to page parser char ubuf[2000]; urlEncode(ubuf,2000,u,gbstrlen(u),true); tmp.safePrintf(" <a href=\"/master/parser?c=test&" "u=%s\">parser</a> ",ubuf); //tmp.safePrintf(" (%llu)",h); tmp.safePrintf("<br>\n"); //tmp.safePrintf("<br>\n"); tmp.safePrintf("<table border=1>\n"); tmp.safePrintf("<tr>" "<td><b><nobr>run id</nobr></b></td>" "<td><b><nobr>crit hits</nobr></b></td>" "<td><b><nobr>crit errors</nobr></b></td>" "<td><b><nobr># e</nobr></b></td>" "<td><b><nobr>unchecked</nobr></b></td>" "<td><b><nobr>diff chars</nobr></b></td>" "<td><b><nobr>diff file</nobr></b></td>" "<td><b><nobr>full output</nobr></b></td>" "</tr>\n"); //SafeBuf sd; // loop over all the runs now, starting with latest run first for ( long ri = m_runId ; ri >= start ; ri-- ) { QUICKPOLL(niceness); // the diff filename char pdiff[200]; sprintf(pdiff,"%s/%s/parse.%llu.%li.html.diff",dir, testDir,h,ri); File f; f.set(pdiff); long fs = f.getFileSize(); if ( ! 
f.doesExist() && ri > 0 ) { // make the parse filename char pbuf1[200]; char pbuf2[200]; sprintf(pbuf1,"%s/%s/parse.%llu.%li.html", dir,testDir,h,ri); sprintf(pbuf2,"%s/%s/parse.%llu.%li.html", dir,testDir,h,ri-1); // sanity check //File tf; tf.set(pbuf1); //if ( ! tf.doesExist()) {char *xx=NULL;*xx=0;} // tmp file name char tmp1[200]; char tmp2[200]; sprintf(tmp1,"%s/%s/t1.html",dir,testDir); sprintf(tmp2,"%s/%s/t2.html",dir,testDir); // filter first char cmd[600]; sprintf(cmd, "cat %s | " "grep -v \"<!--ignore-->\" " " > %s", pbuf1,tmp1); system(cmd); sprintf(cmd, "cat %s | " "grep -v \"<!--ignore-->\" " " > %s", pbuf2,tmp2); system(cmd); // make the system cmd to do the diff sprintf(cmd, "echo \"<pre>\" > %s ; " "diff -w --text %s %s " // ignore this table header row //" | grep -v \"R#4\"" " >> %s", pdiff, tmp1,tmp2,pdiff); log("test: system(\"%s\")",cmd); system(cmd); // try again f.set(pdiff); fs = f.getFileSize(); } QUICKPOLL(niceness); // this means 0 . it just has the <pre> tag in it! if ( fs < 0 || fs == 6 ) fs = 0; // . if no diff and NOT current run, do not print it // . print it if the run right before the current // now always too if ( ri != m_runId && ri != m_runId-1 && fs == 0 ) continue; // relative filename char rel[200]; sprintf(rel,"/%s/parse.%llu.%li.html.diff", testDir,h,ri); char full[200]; sprintf(full,"/%s/parse.%llu.%li.html", testDir,h,ri); char validate[200]; sprintf(validate, "/%s/parse-shortdisplay.%llu.%li.html", testDir,h,ri); // use red font for current run that has a diff! char *t1 = ""; char *t2 = ""; if ( ri == m_runId && fs != 0 ) { t1 = "<font color=pink><b>"; t2 = "</b></font>"; // a diff udiff[un] = 1; } // . get critical errors // . i.e. XmlDoc::validateOutput() could not validate // a particular event or address that was in the // url's "validated.uh64.txt" file since the admin // clicked on the checkbox in the page parser output // . 
if we do not find such a tag in the parser output // any more then Spider.cpp creates this file! if ( ri == m_runId ) { char cfile[256]; sprintf(cfile,"%s/%s/critical.%llu.%li.txt", g_hostdb.m_dir,testDir,h,ri); SafeBuf ttt; ttt.fillFromFile(cfile); // first long is misses, then hits then events umiss[un] = 0; uhits[un] = 0; uevents[un] = 0; uunchecked[un] = 0; if ( ttt.length() >= 3 ) sscanf(ttt.getBufStart(), "%li %li %li %li", &umiss[un], &uhits[un], &uevents[un], &uunchecked[un]); usort[un] = umiss[un] + uunchecked[un]; //File cf; //cf.set(cfile); //if ( cf.doesExist()) ucrit[un] = 1; //else ucrit[un] = 0; } // more critical? if ( ri == m_runId && umiss[un] != 0 ) { t1 = "<font color=red><b>"; t2 = "</b></font>"; } // . these are good to have // . if you don't have 1+ critical hits then you // probably need to be validate by the qa guy char *uhb1 = ""; char *uhb2 = ""; if ( ri == m_runId && uhits[un] != 0 ) { uhb1 = "<font color=green><b>**"; uhb2 = "**</b></font>"; } QUICKPOLL(niceness); char *e1 = "<td>"; char *e2 = "</td>"; long ne = uevents[un]; if ( ne ) { e1="<td bgcolor=orange><b><font color=brown>"; e2="</font></b></td>"; } char *u1 = "<td>"; char *u2 = "</td>"; if ( uunchecked[un] ) { u1="<td bgcolor=purple><b><font color=white>"; u2="</font></b></td>"; } // print the row! 
tmp.safePrintf("<tr>" "<td>%s%li%s</td>" "<td>%s%li%s</td>" // critical hits "<td>%s%li%s</td>" // critical misses "%s%li%s" // # events "%s%li%s" // unchecked "<td>%s%li%s</td>" // filesize of diff // diff filename "<td><a href=\"%s\">%s%s%s</a></td>" // full parser output "<td>" "<a href=\"%s\">full</a> | " "<a href=\"%s\">validate</a> " "</td>" "</tr>\n", t1,ri,t2, uhb1,uhits[un],uhb2, t1,umiss[un],t2, e1,ne,e2, u1,uunchecked[un],u2, t1,fs,t2, rel,t1,rel,t2, full, validate); // only fill "sd" for the most recent guy if ( ri != m_runId ) continue; // now concatenate the parse-shortdisplay file // to this little table so qa admin can check/uncheck // validation checkboxes for addresses and events //sprintf(cfile, // "%s/test/parse-shortdisplay.%llu.%li.html", // g_hostdb.m_dir,h,ri); //sd.fillFromFile ( cfile ); } // end table tmp.safePrintf("</table>\n"); // . and a separate little section for the checkboxes // . should already be in tables, etc. // . each checkbox should provide its own uh64 when it // calls senddiv() when clicked now //tmp.cat ( sd ); tmp.safePrintf("<br>\n"); tmp.safePrintf("<br>\n"); // set this ulen[un] = tmp.length() - uptr[un] ; // sanity check if ( ulen[un] > 10000000 ) { char *xx=NULL;*xx=0; } // inc it un++; // increase the 5000!! 
if ( un >= 5000 ) { char *xx=NULL; *xx=0; } } char flag ; bubble: flag = 0; // sort the url tables for ( long i = 0 ; i < un - 1 ; i++ ) { QUICKPOLL(niceness); if ( usort[i] > usort[i+1] ) continue; if ( usort[i] == usort[i+1] ) if ( udiff[i] >= udiff[i+1] ) continue; // swap em long tp = uptr[i]; long td = udiff[i]; long um = umiss[i]; long us = usort[i]; long uh = uhits[i]; long tl = ulen [i]; uptr[i] = uptr[i+1]; umiss[i] = umiss[i+1]; usort[i] = usort[i+1]; uhits[i] = uhits[i+1]; udiff[i] = udiff[i+1]; ulen[i] = ulen[i+1]; uptr[i+1] = tp; umiss[i+1] = um; usort[i+1] = us; uhits[i+1] = uh; udiff[i+1] = td; ulen [i+1] = tl; flag = 1; } if ( flag ) goto bubble; // transfer into primary safe buf now for ( long i = 0 ; i < un ; i++ ) sb.safeMemcpy(tmp.getBufStart() + uptr[i],ulen[i]); sb.safePrintf("</html>\n"); char dfile[200]; sprintf(dfile,"%s/%s/qa.html",g_hostdb.m_dir,testDir); sb.dumpToFile ( dfile ); // free the buffer of urls reset(); // turn off spiders g_conf.m_spideringEnabled = 0; // all done return; }
void Blaster::startBlastering(){ int64_t now=gettimeofdayInMilliseconds(); if(m_print && m_totalDone>0 && (m_totalDone % 20)==0){ log("blaster: Processed %"INT32" urls in %"INT32" ms",m_totalDone, (int32_t) (now-m_startTime)); m_print=false; } //Launch the maximum number of threads that are allowed while ( m_p1 < m_p1end && m_launched < m_maxNumThreads && m_totalUrls){ // clear any error g_errno = 0; // make a new state StateBD *st; try { st = new (StateBD); } catch ( ... ) { g_errno = ENOMEM; log("blaster: Failed. " "Could not allocate %"INT32" bytes for query. " "Returning HTTP status of 500.", (int32_t)sizeof(StateBD)); return; } mnew ( st , sizeof(StateBD) , "BlasterDiff3" ); st->m_buf1=NULL; m_totalUrls--; // make into a url class. Set both u1 and u2 here. //st->m_u1.set ( m_p1 , gbstrlen(m_p1) ); st->m_u1 = m_p1; // is it an injection url if ( m_doInjection || m_doInjectionWithLinks ) { // get host #0 i guess Host *h0 = g_hostdb.getHost(0); if ( ! h0 ) { char *xx=NULL;*xx=0; } static bool s_flag = true; if ( s_flag ) { s_flag = false; log("blaster: injecting to host #0 at %s on " "http/tcp port %"INT32"", iptoa(h0->m_ip), (int32_t)h0->m_httpPort); } // use spiderlinks=1 so we add the outlinks to spiderdb // but that will slow the spider rate down since it // will have to do a dns lookup on the domain of every // outlink. 
st->m_injectUrl.safePrintf("http://127.0.0.1:8000/" "admin/inject?"); if ( m_doInjectionWithLinks ) st->m_injectUrl.safePrintf("spiderlinks=1&"); else st->m_injectUrl.safePrintf("spiderlinks=0&"); st->m_injectUrl.safePrintf("u="); st->m_injectUrl.urlEncode(m_p1); st->m_injectUrl.pushChar('\0'); st->m_u1 = st->m_injectUrl.getBufStart(); } // skip to next url m_p1 += gbstrlen ( m_p1 ) + 1; if (m_blasterDiff){ //st->m_u2.set ( m_p2 , gbstrlen(m_p2) ); st->m_u2 = m_p2; m_p2 += gbstrlen ( m_p2 ) + 1; } // log(LOG_WARN,"\n"); log(LOG_WARN,"blaster: Downloading %s",st->m_u1); // set port if port switch is true //if ( m_portSwitch ) { // int32_t r = rand() % 32; // u.setPort ( 8000 + r ); //} // count it m_launched++; int32_t ip=0; int32_t port=0; if (m_useProxy){ ip=atoip("66.154.102.20",13); port=3128; } // get it bool status = g_httpServer.getDoc ( st->m_u1 , // url 0, // ip 0 , // offset -1 , // size 0 , // ifModifiedSince st , // state gotDocWrapper1, // callback 60*1000, // timeout ip, port, 30*1024*1024, //maxLen 30*1024*1024); // continue if it blocked if ( ! status ) continue; // If not blocked, there is an error. m_launched--; // log msg log("From file1, got doc1 %s: %s", st->m_u1 , mstrerror(g_errno) ); // we gotta wait break; } // bail if not done yet //if ( m_launched > 0 ) return; if (m_totalUrls) return; //otherwise return if launched have not come back if (m_launched) return; // exit now // g_conf.save(); // closeALL(NULL,NULL); exit ( 0 ); }
bool CatRec::set ( Url *url , char *data , long dataSize , bool gotByIp ) { //char rdbId ) { // assume url does not have a rec in tagdb m_hadRec = false; // set our collection //if ( coll ) memcpy ( m_coll , coll , collLen ); //m_collLen = collLen; // . if "data" is i guess the rec did not exist... so make a dummy rec // . MDW: why? if ( ! data || dataSize <= 0 ) { // default m_site to the hostname m_site.set (url->getHost(),url->getHostLen(),false/*addwww?*/); // steal ip from url m_site.setIp ( url->getIp() ); // default xml for this collection //m_xml = g_tagdb.getSiteXml ( 0,/*filenum*/ // coll, collLen); //, NULL , 0 ); m_filenum = 0 ; //if ( m_xml ) return true; //g_errno = ENODATA; //return log("db: Could not find the ruleset file " // "%stagdb0.xml.",g_hostdb.m_dir); return true; } // return false and set g_errno if buf too small if ( dataSize >= CATREC_BUF_SIZE ) { g_errno = EBUFTOOSMALL; return false; } // copy the raw data memcpy(m_data, data, dataSize); m_dataSize = dataSize; // set up a parsing ptr into "data" //char *p = data; char *p = m_data; // get the catids if using catdb //if (rdbId == RDB_CATDB) { m_numCatids = *(unsigned char*)p; p++; m_catids = (long*)p; p += 4*m_numCatids; //} // point to the filenum so we can mod it! 
//m_filenumPtr = p; // get the filenum (0 is default) //m_filenum = *(long *) p ; p += 4; m_filenum = *(long *) p ; p += 3; // get the version if ( m_filenum == -1 ) { m_version = 0; p++; } else { m_filenum &= 0x00FFFFFF; m_version = *p; p++; } // calc site url length if ( m_version == 0 ) { m_urlLen = dataSize - 4; //if (rdbId == RDB_CATDB) m_urlLen -= (4*m_numCatids) + 1; } else m_urlLen = gbstrlen(p); // set our site url m_url = p; m_site.set ( p , m_urlLen , false/*addwww?*/); // move p to end of url p += m_urlLen; if ( m_version >= 1 ) p++; // add time stamp, comment, username /* if ( m_version >= 2 && rdbId != RDB_CATDB ) { // time stamp m_timeStamp = *(long*)p; p += 4; // comment m_comment = p; p += gbstrlen(m_comment) + 1; // username m_username = p; p += gbstrlen(m_username) + 1; } unsigned char siteFlags = 0; m_spamBits = 0; m_adultLevel = 0; if ( m_version >= 3 && rdbId != RDB_CATDB ) { siteFlags = *p++; m_spamBits = siteFlags & 0xc0; } //we've added a 1 byte quality and 2 bits for adult content level. 
if ( m_version >= 4 && rdbId != RDB_CATDB ) { m_siteQuality = *p++; m_adultLevel = (siteFlags & 0x30); } m_incHere = NULL; m_addHere = NULL; if ( m_version >= 5 && rdbId != RDB_CATDB ) { // a marker for addSiteType() function below m_incHere = (long *)p; m_numTypes = *(uint8_t*)p; p += sizeof(uint8_t); for(long i = 0; i < m_numTypes; i++) { m_siteTypes[i].m_type = *(uint8_t*)p; p += sizeof(uint8_t); // version 6 adds 32-bit scores to site type if (m_version >= 6 && SiteType::isType4Bytes(m_siteTypes[i].m_type)) { m_siteTypes[i].m_score = *(uint32_t*)p; p += sizeof(uint32_t); } else { m_siteTypes[i].m_score = (uint32_t)*(uint8_t*)p; p += sizeof(uint8_t); } } // save ptr for addSiteTypes() m_addHere = p; //now for the languages m_numLangs = *(uint8_t*)p; p += sizeof(uint8_t); for(long i = 0; i < m_numLangs; i++) { m_siteLangs[i].m_type = *(uint8_t*)p; p += sizeof(uint8_t); m_siteLangs[i].m_score = (uint32_t)*(uint8_t*)p; p += sizeof(uint8_t); } } */ // sanity check if ( p - m_data != m_dataSize ) { log ( "tagdb: Deserialized datasize %i != %li for url %s so " "ignoring tagdb record.", p - m_data, m_dataSize , url->getUrl() ); return false; char *xx = NULL; *xx = 0; } // if hostname is same as url we can use the ip from url if ( url && m_site.getHostLen() == url->getHostLen() ) m_site.setIp ( url->getIp() ); // . this url had it's own rec in the db // . Msg16 needs to know this so it won't auto-detect p**n/spam in // this url itself and delete it from tfndb m_hadRec = true; // if rec was in tagdb, data will be non-null.. did we get the rec // from tagdb by matching an IP? (as oppossed to canonical name) m_gotByIp = gotByIp; // get the xml for this filenum //m_xml = g_tagdb.getSiteXml ( m_filenum , coll , collLen ); //if ( m_xml ) return true; // should NEVER be NULL //g_errno = ENODATA; //return log("db: Could not find the ruleset file %stagdb%li.xml.", // g_hostdb.m_dir,m_filenum); return true; }
void Blaster::runBlaster(char *file1,char *file2, int32_t maxNumThreads, int32_t wait, bool isLogFile, bool verbose,bool justDisplay, bool useProxy , bool injectUrlWithLinks , bool injectUrl ) { if (!init()) return; m_blasterDiff=true; if (!file2) m_blasterDiff=false; // set File class File f1; f1.set ( file1 ); // open files if ( ! f1.open ( O_RDONLY ) ) { log("blaster:open: %s %s",file1,mstrerror(g_errno)); return; } // get file size int32_t fileSize1 = f1.getFileSize() ; // store a \0 at the end int32_t m_bufSize1 = fileSize1 + 1; m_doInjectionWithLinks = injectUrlWithLinks; m_doInjection = injectUrl; // make buffers to hold all m_buf1 = (char *) mmalloc ( m_bufSize1 , "blaster1" ); if ( ! m_buf1) { log("blaster:mmalloc: %s",mstrerror(errno)); return; } //char *bufEnd = buf + bufSize; // set m_p1 m_p1 = m_buf1; m_p1end = m_buf1 + m_bufSize1 - 1; // read em all in if ( ! f1.read ( m_buf1 , fileSize1 , 0 ) ) { log("blaster:read: %s %s",file1,mstrerror(g_errno)); return; } // change \n to \0 //char *p = buf; int32_t n = 0; for ( int32_t i = 0 ; i < m_bufSize1 ; i++ ) { if ( m_buf1[i] != '\n' ) continue; m_buf1[i] = '\0'; n++; } if (m_blasterDiff){ File f2; f2.set ( file2 ); if ( ! f2.open ( O_RDONLY ) ) { log("blaster:open: %s %s",file2,mstrerror(g_errno)); return; } int32_t fileSize2 = f2.getFileSize() ; int32_t m_bufSize2 = fileSize2 + 1; m_buf2 = (char *) mmalloc ( m_bufSize2 , "blaster2" ); if ( ! m_buf2) { log("blaster:mmalloc: %s",mstrerror(errno)); return; } // set m_p2 m_p2 = m_buf2; m_p2end = m_buf2 + m_bufSize2 - 1; if ( ! f2.read ( m_buf2 , fileSize2 , 0 ) ) { log("blaster:read: %s %s",file2,mstrerror(g_errno)); return; } int32_t m=0; for ( int32_t i = 0 ; i < m_bufSize2 ; i++ ) { if ( m_buf2[i] != '\n' ) continue; m_buf2[i] = '\0'; m++; } // Working on only the least number of urls from both files, //because we need to work in pairs if (m<n) n=m; else m=n; m_totalUrls=n; // should we print out all the logs? 
m_verbose=verbose; // Should we use the proxy for getting the first Doc m_useProxy=useProxy; // Should we just display the not present links and not fetch // the page to see if they are actually present ? m_justDisplay=justDisplay; } else{ m_isLogFile=isLogFile; /*if reading a gigablast log file, find the lines that have GET and POST commands for search, and register a sleep callback for those lines with sleepWrapperLog*/ if(!isLogFile) m_totalUrls=n; else { m_totalUrls=0; char *p=m_buf1; char *pend=p+m_bufSize1; // start is the time in milliseconds of the first log // message int64_t start=atoll(m_buf1); while(p<pend) { char *lineStart=p; char *urlStart=strstr(p," GET /search"); if (!urlStart) urlStart=strstr(p," POST /search"); if(!urlStart){ p+=gbstrlen(p)+1; //goto next line continue; } urlStart++; m_wait=atoll(lineStart)-start; // register it here g_loop.registerSleepCallback(m_wait , urlStart, sleepWrapperLog); m_totalUrls++; p+=gbstrlen(p)+1; } } } log(LOG_INIT,"blaster: read %"INT32" urls into memory", m_totalUrls ); if(!isLogFile){ // get min time bewteen each spider in milliseconds m_wait = wait; // # of threads m_maxNumThreads = maxNumThreads; m_launched=0; m_portSwitch = 0; //if ( argc == 4 ) m_portSwitch = 1; //else m_portSwitch = 0; // start our spider loop //startSpidering( ); // wakeup wrapper every X ms g_loop.registerSleepCallback ( m_wait , NULL , sleepWrapper ); } // this print to print how many docs have been processed m_print=false; m_startTime=gettimeofdayInMilliseconds(); m_totalDone=0; // . now start g_loops main interrupt handling loop // . it should block forever // . when it gets a signal it dispatches to a server or db to handle it if ( ! g_loop.runLoop() ) { log("blaster::runLoop failed" ); return; } // dummy return (0-->normal exit status for the shell) return; }
void Blaster::gotDoc4 ( void *state, TcpSocket *s){ StateBD *st=(StateBD *)state; st->m_numUrlDocsReceived++; if (!s) { //Shouldn't happen, but still putting a checkpoint log (LOG_WARN,"blaster: Got a null s in gotDoc4." "Happened because ip could not be found for gigablast" "server"); if (st->m_numUrlDocsReceived==st->m_numUrlDocsSent){ m_launched--; // Free stateBD freeStateBD(st); } return; } // bail if got cut off if ( s->m_readOffset == 0 ) { log("blasterDiff : lost the Request in gotDoc4"); if (st->m_numUrlDocsReceived==st->m_numUrlDocsSent){ m_launched--; freeStateBD(st); } return; } char *reply = s->m_readBuf ; int32_t size = s->m_readOffset; HttpMime mime; mime.set ( reply , size , NULL ); char *content = reply + mime.getMimeLen(); int32_t contentLen = size - mime.getMimeLen(); //int16_t csEnum = get_iana_charset(mime.getCharset(), // mime.getCharsetLen()); /* if (csEnum == csUnknown) log(LOG_DEBUG, "blaster: Unknown charset : %s", mime.getCharset());*/ Xml xml; if (!xml.set( content, contentLen, false, 0, false, TITLEREC_CURRENT_VERSION, true, // setparents 0, // niceness CT_XML )){ log(LOG_WARN,"blaster: Couldn't set XML Class in gotDoc4"); } Links links; Url *url=mime.getLocationUrl(); if (!links.set(0,//siterec xml &xml, url, false, NULL, TITLEREC_CURRENT_VERSION, 0, false, NULL)){ log(LOG_WARN, "blaster: Coudn't set Links class in gotDoc4"); } for (int32_t i=0;i<links.getNumLinks();i++){ char *ss=links.getLink(i); char *p; // This page *should* always be a gigablast page. So not adding // checks for msn or yahoo or google page. 
p=strstr(ss,"google."); if(p) continue; p=strstr(ss,"cache:"); //googles cache page if(p) continue; p= strstr(ss,"gigablast."); if(p) continue; p= strstr(ss,"web.archive.org");//older copies on gigablast if(p) continue; p= strstr(ss,"search.yahoo.com");//from gigablast search if(p) continue; p= strstr(ss,"search.msn.com");//from gigablast search if(p) continue; p= strstr(ss,"s.teoma.com");//from gigablast search if(p) continue; p= strstr(ss,"search.dmoz.org");//from gigablast search if(p) continue; p= strstr(ss,"www.answers.com");//from gigablast search if(p) continue; if (m_verbose) log(LOG_WARN,"blaster: Link Present on server2=%s",ss); } // So if one of the links that is returned is the exact url, // then we know that the url is present.So get the url from the // mime, search for it in the links that are returned. char tmp[1024]; char *sendBuf=s->m_sendBuf; char *p1,*p2; // First get the Host, which is the domain. Since socket s is going to // be useless after this function, changing m_sendBuf instead of using // more space p1=strstr(sendBuf,"%3A"); if(p1){ p1+=3; p2=strstr(p1," HTTP"); if (p2){ //Since I do not care about the sendbuf anymore *p2='\0'; } } if (!p1 || !p2){ log(LOG_WARN,"blasterdiff: Could not find search link" "from m_sendBuf in gotdoc4"); } else{ sprintf(tmp,"%s",p1); //log(LOG_WARN,"blaster: tmp in gotDoc4 = %s",tmp); bool isFound=false; // So now we search for tmp in the links for (int32_t i=0;i<links.getNumLinks();i++){ if(strstr(links.getLink(i),tmp) && links.getLinkLen(i)==(int)gbstrlen(tmp)){ isFound=true; log(LOG_WARN,"blaster: %s in results1 but not" " in results2 for query %s but does exist" " in server2",tmp,st->m_u1);//->getQuery() } } if (!isFound) log(LOG_WARN,"blaster: %s in results1 but not" " in results2 for query %s and does NOT exist" " in server2",tmp,st->m_u1); // ->getQuery() } if (st->m_numUrlDocsReceived==st->m_numUrlDocsSent){ m_launched--; // Free stateBD freeStateBD(st); } return; }
// . returns false if blocked, true otherwise // . sets g_errno on error // . make a web page displaying the config of this host // . call g_httpServer.sendDynamicPage() to send it bool sendPageIndexdb ( TcpSocket *s , HttpRequest *r ) { // . get fields from cgi field of the requested url // . get the search query long queryLen = 0; char *query = r->getString ( "q" , &queryLen , NULL /*default*/); // ensure query not too big if ( queryLen >= MAX_QUERY_LEN ) { g_errno = EQUERYTOOBIG; return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno)); } // get the collection long collLen = 0; char *coll = r->getString("c",&collLen); if ( ! coll || ! coll[0] ) { //coll = g_conf.m_defaultColl; coll = g_conf.getDefaultColl( r->getHost(), r->getHostLen() ); collLen = gbstrlen(coll); } // ensure collection not too big if ( collLen >= MAX_COLL_LEN ) { g_errno = ECOLLTOOBIG; return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno)); } CollectionRec *cr = g_collectiondb.getRec(coll); if ( ! cr ) { return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno)); } // make a state State10 *st ; try { st = new (State10); } catch ( ... ) { g_errno = ENOMEM; log("PageIndexdb: new(%i): %s", sizeof(State10),mstrerror(g_errno)); return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));} mnew ( st , sizeof(State10) , "PageIndexdb" ); // password, too long pwdLen = 0 ; char *pwd = r->getString ( "pwd" , &pwdLen ); if ( pwdLen > 31 ) pwdLen = 31; if ( pwdLen > 0 ) strncpy ( st->m_pwd , pwd , pwdLen ); st->m_pwd[pwdLen]='\0'; // get # of records to retreive from IndexList st->m_numRecs = r->getLong ( "numRecs" , 100 ); // use disk, tree, or cache? 
st->m_useDisk = r->getLong ("ud" , 0 ); st->m_useTree = r->getLong ("ut" , 0 ); st->m_useCache = r->getLong ("uc" , 0 ); st->m_useDatedb= r->getLong ("ub" , 0 ); st->m_add = r->getLong ("add", 0 ); st->m_del = r->getLong ("del", 0 ); // get the termId, if any, from the cgi vars st->m_termId = r->getLongLong ("t", 0LL ) ; // get docid and score st->m_docId = r->getLongLong ("d", 0LL ); st->m_score = r->getLong ("score", 0 ); // copy query/collection memcpy ( st->m_query , query , queryLen ); st->m_queryLen = queryLen; st->m_query [ queryLen ] ='\0'; //memcpy ( st->m_coll , coll , collLen ); //st->m_collLen = collLen; //st->m_coll [ collLen ] ='\0'; st->m_coll = coll; st->m_collnum = cr->m_collnum; // save the TcpSocket st->m_socket = s; // and if the request is local/internal or not st->m_isAdmin = g_conf.isCollAdmin ( s , r ); st->m_isLocal = r->isLocal(); st->m_r.copy ( r ); // . check for add/delete request if ( st->m_add || st->m_del ) { key_t startKey = g_indexdb.makeStartKey ( st->m_termId ); key_t endKey = g_indexdb.makeEndKey ( st->m_termId ); // construct the key to add/delete st->m_key = g_indexdb.makeKey ( st->m_termId, st->m_score , st->m_docId , st->m_del ); // make an RdbList out of the key st->m_keyList.set ( (char*)&st->m_key, sizeof(key_t), (char*)&st->m_key, sizeof(key_t), startKey, endKey, 0, false, true ); log ( LOG_INFO, "build: adding indexdb key to indexdb: " "%lx %llx", st->m_key.n1, st->m_key.n0 ); // call msg1 to add/delete key if ( ! st->m_msg1.addList ( &st->m_keyList, RDB_INDEXDB, st->m_collnum, st, addedKeyWrapper, false, MAX_NICENESS ) ) return false; // continue to page if no block return gotIndexList ( st ); } if ( ! st->m_query[0] ) return gotIndexList(st); // . set query class // . a boolFlag of 0 means query is not boolean Query q; q.set2 ( query , langUnknown , true ); // 0 = boolFlag, not boolean! 
// reset st->m_msg36.m_termFreq = 0LL; // if query was provided, use that, otherwise use termId if ( q.getNumTerms() > 0 ) st->m_termId = q.getTermId(0); // skip if nothing else return gotTermFreq ( st ); // get the termfreq of this term! if ( ! st->m_msg36.getTermFreq ( st->m_collnum , 0 , st->m_termId, st , gotTermFreqWrapper ) ) return false; // otherwise, we didn't block return gotTermFreq ( st ); }
int32_t getContentTypeFromStr ( const char *s ) { int32_t slen = gbstrlen(s); // trim off spaces at the end char tmp[64]; if ( s[slen-1] == ' ' ) { strncpy(tmp,s,63); tmp[63] = '\0'; int32_t newLen = gbstrlen(tmp); s = tmp; char *send = tmp + newLen; for ( ; send>s && send[-1] == ' '; send-- ); *send = '\0'; } int32_t ct = CT_UNKNOWN; if ( !strncasecmp(s, "text/", 5) ) { if ( !strcasecmp(s,"text/html") ) { ct = CT_HTML; } else if ( !strcasecmp(s,"text/plain" ) ) { ct = CT_TEXT; } else if ( !strcasecmp(s,"text/xml" ) ) { ct = CT_XML; } else if ( !strcasecmp(s,"text/txt" ) ) { ct = CT_TEXT; } else if ( !strcasecmp(s,"text/javascript" ) ) { ct = CT_JS; } else if ( !strcasecmp(s,"text/x-js" ) ) { ct = CT_JS; } else if ( !strcasecmp(s,"text/js" ) ) { ct = CT_JS; } else if ( !strcasecmp(s,"text/css" ) ) { ct = CT_CSS; } else if ( !strcasecmp(s,"text/x-vcard" ) ) { // . semicolon separated list of info, sometimes an element is html // . these might have an address in them... ct = CT_HTML; } else { ct = CT_TEXT; } } else if (!strcasecmp(s,"text" ) ) ct = CT_TEXT; else if (!strcasecmp(s,"txt" ) ) ct = CT_TEXT; else if (!strcasecmp(s,"application/xml" ) ) ct = CT_XML; // we were not able to spider links on an xhtml doc because // this was set to CT_XML, so try CT_HTML else if (!strcasecmp(s,"application/xhtml+xml" ) ) ct = CT_HTML; else if (!strcasecmp(s,"application/rss+xml" ) ) ct = CT_XML; else if (!strcasecmp(s,"rss" ) ) ct = CT_XML; else if (!strcasecmp(s,"application/rdf+xml" ) ) ct = CT_XML; else if (!strcasecmp(s,"application/atom+xml" ) ) ct = CT_XML; else if (!strcasecmp(s,"atom+xml" ) ) ct = CT_XML; else if (!strcasecmp(s,"application/pdf" ) ) ct = CT_PDF; else if (!strcasecmp(s,"application/msword" ) ) ct = CT_DOC; else if (!strcasecmp(s,"application/vnd.ms-excel") ) ct = CT_XLS; else if (!strcasecmp(s,"application/vnd.ms-powerpoint")) ct = CT_PPT; else if (!strcasecmp(s,"application/mspowerpoint") ) ct = CT_PPT; else if (!strcasecmp(s,"application/postscript" ) 
) ct = CT_PS; else if (!strcasecmp(s,"application/warc" ) ) ct = CT_WARC; else if (!strcasecmp(s,"application/arc" ) ) ct = CT_ARC; else if (!strcasecmp(s,"image/gif" ) ) ct = CT_GIF; else if (!strcasecmp(s,"image/jpeg" ) ) ct = CT_JPG; else if (!strcasecmp(s,"image/png" ) ) ct = CT_PNG; else if (!strcasecmp(s,"image/tiff" ) ) ct = CT_TIFF; else if (!strncasecmp(s,"image/",6 ) ) ct = CT_IMAGE; else if (!strcasecmp(s,"application/javascript" ) ) ct = CT_JS; else if (!strcasecmp(s,"application/x-javascript") ) ct = CT_JS; else if (!strcasecmp(s,"application/x-gzip" ) ) ct = CT_GZ; else if (!strcasecmp(s,"application/json" ) ) ct = CT_JSON; // facebook.com: else if (!strcasecmp(s,"application/vnd.wap.xhtml+xml") ) ct =CT_HTML; else if (!strcasecmp(s,"binary/octet-stream") ) ct = CT_UNKNOWN; else if (!strcasecmp(s,"application/octet-stream") ) ct = CT_UNKNOWN; else if (!strcasecmp(s,"application/binary" ) ) ct = CT_UNKNOWN; else if (!strcasecmp(s,"application/x-tar" ) ) ct = CT_UNKNOWN; else if ( !strncmp ( s , "audio/",6) ) ct = CT_UNKNOWN; return ct; }
bool Collectiondb::load ( bool isDump ) { char dname[1024]; // MDW: sprintf ( dname , "%s/collections/" , g_hostdb.m_dir ); sprintf ( dname , "%s" , g_hostdb.m_dir ); Dir d; d.set ( dname ); if ( ! d.open ()) return log("admin: Could not load collection config " "files."); // note it log(LOG_INIT,"admin: Loading collection config files."); // . scan through all subdirs in the collections dir // . they should be like, "coll.main/" and "coll.mycollection/" char *f; while ( ( f = d.getNextFilename ( "*" ) ) ) { // skip if first char not "coll." if ( strncmp ( f , "coll." , 5 ) != 0 ) continue; // must end on a digit (i.e. coll.main.0) if ( ! is_digit (f[gbstrlen(f)-1]) ) continue; // point to collection char *coll = f + 5; // NULL terminate at . char *pp = strchr ( coll , '.' ); if ( ! pp ) continue; *pp = '\0'; // get collnum collnum_t collnum = atol ( pp + 1 ); // add it if ( !addRec ( coll , NULL , 0 , false , collnum , isDump , true ) ) return false; } // note it log(LOG_INIT,"admin: Loaded data for %li collections. Ranging from " "collection #0 to #%li.",m_numRecsUsed,m_numRecs-1); // update the time updateTime(); // don't clean the tree if just dumpin if ( isDump ) return true; // remove any nodes with illegal collnums Rdb *r; //r = g_indexdb.getRdb(); //r->m_tree.cleanTree ((char **)r->m_bases); r = g_posdb.getRdb(); r->m_tree.cleanTree ((char **)r->m_bases); //r = g_datedb.getRdb(); //r->m_tree.cleanTree ((char **)r->m_bases); r = g_titledb.getRdb(); r->m_tree.cleanTree ((char **)r->m_bases); //r = g_revdb.getRdb(); //r->m_tree.cleanTree ((char **)r->m_bases); //r = g_sectiondb.getRdb(); //r->m_tree.cleanTree ((char **)r->m_bases); //r = g_checksumdb.getRdb(); //r->m_tree.cleanTree ((char **)r->m_bases); //r = g_tfndb.getRdb(); //r->m_tree.cleanTree ((char **)r->m_bases); r = g_spiderdb.getRdb(); r->m_tree.cleanTree ((char **)r->m_bases); r = g_doledb.getRdb(); r->m_tree.cleanTree ((char **)r->m_bases); // success return true; }
void gotDocWrapper ( void *state , TcpSocket *s ) { // no longer launched s_launched--; char* url = (char*)state; // bail if got cut off if ( s->m_readOffset == 0 ) { log("lost %s",(char *) state); if(s_server) mfree(url, gbstrlen(url)+1, "saved url"); return; } // got one more result page s_total++; // allow printing s_printIt = true; // get time now long long now = gettimeofdayInMilliseconds(); // get hash char *reply = s->m_readBuf ; long size = s->m_readOffset; HttpMime mime; mime.set ( reply , size , NULL ); char *content = reply + mime.getMimeLen(); long contentLen = size - mime.getMimeLen(); long status = mime.getHttpStatus(); unsigned long h = hash32 ( content , contentLen ); char *p = mime.getMime(); char *pend = p + mime.getMimeLen(); char message[256]; long mlen = 0; // parse status message out of response // HTTP/1.0 while ( p < pend && !is_space(*p) ) p++; // skip space while ( p < pend && is_space(*p) ) p++; // copy to end of line while (p < pend && mlen < 255 && *p != '\r' && *p != '\n'){ message[mlen++] = *p; } message[mlen] = '\0'; // log msg if ( g_errno ) logf(LOG_INFO,"blaster: got doc (status=%li) (%li) (%lims) %s : " "%s", status, s->m_readOffset , (long)(now - s->m_startTime) , (char *)state , mstrerror(g_errno) ); else logf(LOG_INFO,"blaster: got doc (status=%li) (%li) (%lims) " "(hash=%lx) %s", status, s->m_readOffset , (long)(now - s->m_startTime) , h , (char *)state ); if(s_server) mfree(url, gbstrlen(url)+1, "saved url"); // try to launch another startSpidering(); }
// . return false if blocked, true otherwise // . sets g_errno on error bool Msg1::sendData ( unsigned long shardNum, char *listData , long listSize) { // debug msg //log("sendData: mcast=%lu listSize=%li", // (long)&m_mcast,(long)listSize); // bail if this is an interface machine, don't write to the main if ( g_conf.m_interfaceMachine ) return true; // return true if no data if ( listSize == 0 ) return true; // how many hosts in this group //long numHosts = g_hostdb.getNumHostsPerShard(); // . NOTE: for now i'm removing this until I handle ETRYAGAIN errors // properly... by waiting and retrying... // . if this is local data just for us just do an addList to OUR rdb /* if ( groupId == g_hostdb.m_groupId && numHosts == 1 ) { // this sets g_errno on error Msg0 msg0; Rdb *rdb = msg0.getRdb ( (char) m_rdbId ); if ( ! rdb ) return true; // make a list from this data RdbList list; list.set (listData,listSize,listSize,rdb->getFixedDataSize(), false) ; // ownData? // this returns false and sets g_errno on error rdb->addList ( &list ); // . if we got a ETRYAGAIN cuz the buffer we add to was full // then we should sleep and try again! // . return false cuz this blocks for a period of time // before trying again if ( g_errno == ETRYAGAIN ) { // try adding again in 1 second registerSleepCallback ( 1000, slot, tryAgainWrapper1 ); // return now return false; } // . always return true cuz we did not block // . g_errno may be set return true; } */ // if the data is being added to our group, don't send ourselves // a msg1, if we can add it right now bool sendToSelf = true; if ( shardNum == getMyShardNum() && ! g_conf.m_interfaceMachine ) { // get the rdb to which it belongs, use Msg0::getRdb() Rdb *rdb = getRdbFromId ( (char) m_rdbId ); if ( ! rdb ) goto skip; // key size long ks = getKeySizeFromRdbId ( m_rdbId ); // reset g_errno g_errno = 0; // . make a list from this data // . skip over the first 4 bytes which is the rdbId // . 
TODO: embed the rdbId in the msgtype or something... RdbList list; // set the list list.set ( listData , listSize , listData , listSize , rdb->getFixedDataSize() , false , // ownData? rdb->useHalfKeys() , ks ); // note that //log("msg1: local addlist niceness=%li",m_niceness); // this returns false and sets g_errno on error rdb->addList ( m_coll , &list , m_niceness ); // if titledb, add tfndb recs to map the title recs //if ( ! g_errno && rdb == g_titledb.getRdb() && m_injecting ) // // this returns false and sets g_errno on error // updateTfndb ( m_coll , &list , true , m_niceness); // if no error, no need to use a Msg1 UdpSlot for ourselves if ( ! g_errno ) sendToSelf = false; else { log("rdb: msg1 had error: %s",mstrerror(g_errno)); // this is messing up generate catdb's huge rdblist add // why did we put it in there??? from msg9b.cpp //return true; } QUICKPOLL(m_niceness); // if we're the only one in the group, bail, we're done if ( ! sendToSelf && g_hostdb.getNumHostsPerShard() == 1 ) return true; } skip: // . make an add record request to multicast to a bunch of machines // . this will alloc new space, returns NULL on failure //char *request = makeRequest ( listData, listSize, groupId , //m_rdbId , &requestLen ); long collLen = gbstrlen ( m_coll ); // . returns NULL and sets g_errno on error // . calculate total size of the record // . 1 byte for rdbId, 1 byte for flags, // then collection NULL terminated, then list long requestLen = 1 + 1 + collLen + 1 + listSize ; // make the request char *request = (char *) mmalloc ( requestLen ,"Msg1" ); if ( ! 
request ) return true; char *p = request; // store the rdbId at top of request *p++ = m_rdbId; // then the flags *p = 0; if ( m_injecting ) *p |= 0x80; p++; // then collection name memcpy ( p , m_coll , collLen ); p += collLen; *p++ = '\0'; // sanity check if ( collLen <= 0 ) { log(LOG_LOGIC,"net: No collection specified for list add."); //char *xx = NULL; *xx = 0; g_errno = ENOCOLLREC; return true; } //if ( m_deleteRecs ) request[1] |= 0x80; //if ( m_overwriteRecs ) request[1] |= 0x40; // store the list after coll memcpy ( p , listData , listSize ); QUICKPOLL(m_niceness); // debug msg //if ( ! m_waitForReply ) // (m_rdbId == RDB_SPIDERDB || //m_rdbId == RDB_TFNDB) ) // // if we don't get here we lose it!!!!!!!!!!!!!!!!!!!!! // log("using mcast=%lu rdbId=%li listData=%lu listSize=%lu " // "gid=%lu", // (long)&m_mcast,(long)m_rdbId,(long)listData,(long)listSize, // groupId); // for small packets //long niceness = 2; //if ( requestLen < TMPBUFSIZE - 32 ) niceness = 0; //log("msg1: sending mcast niceness=%li",m_niceness); // . multicast to all hosts in group "groupId" // . multicast::send() returns false and sets g_errno on error // . we return false if we block, true otherwise // . will loop indefinitely if a host in this group is down key_t k; k.setMin(); if ( m_mcast.send ( request , // sets mcast->m_msg to this requestLen , // sets mcast->m_msgLen to this 0x01 , // msgType for add rdb record true , // does multicast own msg? shardNum , // group to send to (groupKey) true , // send to whole group? 
0 , // key is useless for us this , // state data NULL , // state data gotReplyWrapper1 , 60 , // timeout in secs m_niceness , // niceness false , // realtime -1 , // first host to try NULL , // replyBuf = NULL , 0 , // replyBufMaxSize = 0 , true , // freeReplyBuf = true , false , // doDiskLoadBalancing = false , -1 , // no max cache age limit //(key_t)0 , // cache key k , // cache key RDB_NONE , // bogus rdbId -1 , // unknown minRecSizes read size sendToSelf )) return false; QUICKPOLL(m_niceness); // g_errno should be set log("net: Had error when sending request to add data to %s in shard " "#%lu: %s.", getDbnameFromId(m_rdbId),shardNum,mstrerror(g_errno)); return true; }
int main ( int argc , char *argv[] ) { // let's ensure our core file can dump struct rlimit lim; lim.rlim_cur = lim.rlim_max = RLIM_INFINITY; if ( setrlimit(RLIMIT_CORE,&lim) ) log("blaster::setrlimit: %s", mstrerror(errno) ); g_conf.m_maxMem = 500000000; // init our table for doing zobrist hashing if ( ! hashinit() ) { log("blaster::hashinit failed" ); return 1; } // init the memory class after conf since it gets maxMem from Conf //if ( ! g_mem.init ( 20000000 ) ) { // log("blaster::Mem init failed" ); return 1; } g_mem.m_maxMem = 200000000; // start up log file if ( ! g_log.init( "/tmp/blasterLog" ) ) { log("blaster::Log open /tmp/blasterLog failed" ); return 1; } // get dns ip from /etc/resolv.conf g_conf.m_dnsIps[0] = 0; FILE *fd = fopen ( "/etc/resolv.conf" , "r" ); if ( ! fd ) { log("blaster::fopen: /etc/resolve.conf %s", mstrerror(errno)); return 1; } char tmp[1024]; while ( fgets ( tmp , 1024 , fd ) ) { // tmp buf ptr char *p = tmp; // skip comments if ( *p == '#' ) continue; // skip nameserver name if ( ! isdigit(*p) ) while ( ! isspace ( *p ) ) p++ ; // skip spaces while ( isspace ( *p ) ) p++; // if this is not a digit, continue if ( ! 
isdigit(*p) ) continue; // get ip g_conf.m_dnsIps[0] = atoip ( p , gbstrlen(p) ); // done break; } fclose ( fd ); // if no dns server found, bail if ( g_conf.m_dnsIps[0] == 0 ) { log("blaster:: no dns ip found in /etc/resolv.conf");return 1;} // hack # of dns servers g_conf.m_numDns = 1; g_conf.m_dnsPorts[0] = 53; //g_conf.m_dnsIps [0] = atoip ( "192.168.0.1", 11 ); //g_conf.m_dnsClientPort = 9909; g_conf.m_dnsMaxCacheMem = 1024*10; // hack http server port to -1 (none) //g_conf.m_httpPort = 0; g_conf.m_httpMaxSockets = 200; //g_conf.m_httpMaxReadBufSize = 102*1024*1024; g_conf.m_httpMaxSendBufSize = 16*1024; //g_conf.m_httpMaxDownloadSockets = 200; if ( argc != 4 && argc != 5 && argc !=6 ) { printUsage: log("USAGE: blaster [fileOfUrls | -r<num random words><server>] [maxNumThreads] [wait in ms] " "<lines to skip> <string to append>"); log("USAGE: examples:"); log("USAGE: ./blaster queries.fromlog 10 1"); log("USAGE: ./blaster -r3http://www.gigablast.com/index.php?q= 1 100\n"); return 1; } // init the loop if ( ! g_loop.init() ) { log("blaster::Loop init failed" ); return 1; } // . then dns client // . server should listen to a socket and register with g_loop if ( ! g_dns.init(6000) ) { log("blaster::Dns client init failed" ); return 1; } // . then webserver // . server should listen to a socket and register with g_loop for(long i = 0; i < 50; i++) { if ( ! g_httpServer.init( 8333 + i, 9334+i ) ) { log("blaster::HttpServer init failed" ); //return 1; } else break; } // set File class char *fname = argv[1]; long fnameLen = gbstrlen(fname); long fileSize = 0; long bufSize = 0; char *buf = NULL; long n = 0; //should we generate random queries? 
if(fnameLen > 2 && fname[0] == '-' && fname[1] == 'r') { char *p = fname + 2; s_numRandWords = atoi( p ); while(is_digit(*p)) p++; getWords(); if(*p == '\0') goto printUsage; s_server = p; log("blaster server is %s", s_server); // char x[1024]; // while(1) { // long l = getRandomWords(x, x + 1024, s_numRandWords); // *(x + l) = '\0'; // log("blaster: %s", x); // } // exit(1); } else { //it is a real file File f; f.set ( fname ); // open file if ( ! f.open ( O_RDONLY ) ) { log("blaster::open: %s %s",fname,mstrerror(g_errno)); return 1; } // get file size fileSize = f.getFileSize() ; // store a \0 at the end bufSize = fileSize + 1; // make buffer to hold all buf = (char *) mmalloc ( bufSize , "blaster" ); if ( ! buf) {log("blaster::mmalloc: %s",mstrerror(errno));return 1;} //char *bufEnd = buf + bufSize; // set s_p s_p = buf; s_pend = buf + bufSize - 1; // read em all in if ( ! f.read ( buf , fileSize , 0 ) ) { log("blaster::read: %s %s",fname,mstrerror(g_errno));return 1;} // change \n to \0 //char *p = buf; for ( long i = 0 ; i < bufSize ; i++ ) { if ( buf[i] != '\n' ) continue; buf[i] = '\0'; n++; } f.close(); } // log a msg log(LOG_INIT,"blaster: read %li urls into memory", n ); long linesToSkip = 0; if ( argc >= 5 ) { linesToSkip = atoi ( argv[4] ); log (LOG_INIT,"blaster: skipping %li urls",linesToSkip); } for ( long i = 0; i < linesToSkip && s_p < s_pend; i++ ) s_p += gbstrlen(s_p) + 1; if ( argc == 6 ) { long len = gbstrlen ( argv[5] ); if ( len > 512 ) len = 512; strncpy ( s_append , argv[5] , gbstrlen (argv[5]) ); } else s_append[0] = '\0'; // get min time bewteen each spider in milliseconds s_wait = atoi( argv[3] ); // # of threads s_maxNumThreads = 1; s_maxNumThreads = atoi ( argv[2] ); s_portSwitch = 0; //if ( argc == 4 ) s_portSwitch = 1; //else s_portSwitch = 0; // start our spider loop //startSpidering( ); // wakeup wrapper every X ms g_loop.registerSleepCallback ( s_wait , NULL , sleepWrapper ); //msg10.addUrls ( uu , gbstrlen(uu)+1, 
NULL,0,time(0),4,true,NULL,NULL); // . now start g_loops main interrupt handling loop // . it should block forever // . when it gets a signal it dispatches to a server or db to handle it if ( ! g_loop.runLoop() ) { log("blaster::runLoop failed" ); return 1; } // dummy return (0-->normal exit status for the shell) return 0; }
// . destroys the slot if false is returned // . this is registered in Msg1::set() to handle add rdb record msgs // . seems like we should always send back a reply so we don't leave the // requester's slot hanging, unless he can kill it after transmit success??? // . TODO: need we send a reply back on success???? // . NOTE: Must always call g_udpServer::sendReply or sendErrorReply() so // read/send bufs can be freed void handleRequest1 ( UdpSlot *slot , long netnice ) { // extract what we read char *readBuf = slot->m_readBuf; long readBufSize = slot->m_readBufSize; long niceness = slot->m_niceness; // select udp server based on niceness UdpServer *us = &g_udpServer; // must at least have an rdbId if ( readBufSize <= 4 ) { g_errno = EREQUESTTOOSHORT; us->sendErrorReply ( slot , g_errno ); return; } char *p = readBuf; char *pend = readBuf + readBufSize; // extract rdbId char rdbId = *p++; // get the rdb to which it belongs, use Msg0::getRdb() Rdb *rdb = getRdbFromId ( (char) rdbId ); if ( ! rdb ) { us->sendErrorReply ( slot, EBADRDBID ); return;} // keep track of stats rdb->readRequestAdd ( readBufSize ); // reset g_errno g_errno = 0; // are we injecting some title recs? bool injecting; if ( *p & 0x80 ) injecting = true; else injecting = false; p++; // then collection char *coll = p; p += gbstrlen (p) + 1; // . make a list from this data // . skip over the first 4 bytes which is the rdbId // . TODO: embed the rdbId in the msgtype or something... RdbList list; // set the list list.set ( p , // readBuf + 4 , pend - p , // readBufSize - 4 , p , // readBuf + 4 , pend - p , // readBufSize - 4 , rdb->getFixedDataSize() , false , // ownData? rdb->useHalfKeys() , rdb->getKeySize () ); // note it //log("msg1: handlerequest1 calling addlist niceness=%li",niceness); //log("msg1: handleRequest1 niceness=%li",niceness); // this returns false and sets g_errno on error rdb->addList ( coll , &list , niceness); // if titledb, add tfndb recs to map the title recs //if ( ! 
g_errno && rdb == g_titledb.getRdb() && injecting ) // updateTfndb ( coll , &list , true, 0); // but if deleting a "new" and unforced record from spiderdb // then only delete tfndb record if it was tfn=255 //if ( ! g_errno && rdb == g_spiderdb.getRdb() ) // updateTfndb2 ( coll , &list , false ); // retry on some errors addedList ( slot , rdb ); }
void Scraper::gotPhrase ( ) { // error getting random phrase? bail! if ( g_errno ) log("scraper: got error getting random phrase: %s", mstrerror(g_errno)); CollectionRec *cr = g_collectiondb.getRec ( m_coll ); loop: // what type of query should we do? m_qtype = rand() % 3; // make sure web, news, blog is enabled if ( m_qtype == 0 && ! cr->m_scrapingEnabledWeb ) goto loop; if ( m_qtype == 1 && ! cr->m_scrapingEnabledNews ) goto loop; if ( m_qtype == 2 && ! cr->m_scrapingEnabledBlogs ) goto loop; // scraping is off when repairing obviously if ( g_repairMode ) return; // get it char *s = g_wiki.m_randPhrase; // convert _'s to spaces for ( char *p = s ; *p ; p++ ) if ( *p == '_' ) *p = ' '; // . url encode the random phrase // . truncate it to 200 bytes to keep things sane // . Wiki::doneReadingWiki() keeps it below 128 i think anyway char qe[400]; urlEncode(qe, 200, s , gbstrlen(s) ); char *end = qe + 390; // half the time append a random word from dictionary so that we // discovery those tail-end sites better if ( m_qtype == 0 && (rand() % 2) ) { // point into it for appending char *p = qe + gbstrlen(qe); // add a space, url encoded *p++ = '+'; // append a random word to it from dictionary char *rw = g_speller.getRandomWord(); // append that in urlEncode( p , end - p - 1 , rw , gbstrlen(rw) ); } // make a query to scrape char buf[2048]; char *uf ; if ( m_qtype == 0 ) uf="http://www.google.com/search?num=50&q=%s&scoring=d" "&filter=0"; // google news query? sort by date. else if ( m_qtype == 1 ) uf="http://news.google.com/news?num=50&q=%s&sort=n" "&filter=0"; // google blog query? else if ( m_qtype == 2 ) uf="http://www.google.com/blogsearch?num=50&q=%s&scoring=d" "&filter=0"; // sanity check else { char *xx=NULL;*xx=0; } // make the url we will download sprintf ( buf , uf , qe ); SpiderRequest sreq; // set the SpiderRequest strcpy(sreq.m_url, uf); // . tell it to only add the hosts of each outlink for now! // . 
that will be passed on to when XmlDoc calls Links::set() i guess // . xd will not reschedule the scraped url into spiderdb either sreq.m_isScraping = 1; sreq.m_fakeFirstIp = 1; int32_t firstIp = hash32n(uf); if ( firstIp == 0 || firstIp == -1 ) firstIp = 1; sreq.m_firstIp = firstIp; // parent docid is 0 sreq.setKey(firstIp,0LL,false); // forceDEl = false, niceness = 0 m_xd.set4 ( &sreq , NULL , m_coll , NULL , 0 ); //m_xd.m_isScraping = true; // download without throttling //m_xd.m_throttleDownload = false; // disregard this m_xd.m_useRobotsTxt = false; // call this when index completes m_xd.setCallback ( NULL , indexedDocWrapper ); // assume it blocked m_numSent++; // scraper is special m_xd.m_usePosdb = false; //m_xd.m_useDatedb = false; m_xd.m_useClusterdb = false; m_xd.m_useLinkdb = false; m_xd.m_useSpiderdb = true; // only this one i guess m_xd.m_useTitledb = false; m_xd.m_useTagdb = false; m_xd.m_usePlacedb = false; //m_xd.m_useTimedb = false; //m_xd.m_useSectiondb = false; //m_xd.m_useRevdb = false; // . return false if this blocks // . will add the spider recs to spiderdb of the outlinks // . will add "ingoogle", etc. tags for each outlink if ( ! m_xd.indexDoc ( ) ) return ; // we didn't block indexedDoc ( ); }
// . "uf" is printf url format to scrape with a %s for the query // . example: uf="http://www.google.com/search?num=50&q=%s&scoring=d&filter=0"; bool Msg7::scrapeQuery ( ) { // advance round now in case we return early m_round++; InjectionRequest *ir = &m_injectionRequest; // error? char *qts = ir->ptr_queryToScrape; if ( ! qts ) { char *xx=NULL;*xx=0; } if ( gbstrlen(qts) > 500 ) { g_errno = EQUERYTOOBIG; return true; } // first encode the query SafeBuf ebuf; ebuf.urlEncode ( qts ); // queryUNEncoded ); ebuf.nullTerm(); char *uf; if ( m_round == 1 ) // set to 1 for debugging uf="http://www.google.com/search?num=20&" "q=%s&scoring=d&filter=0"; //uf = "https://startpage.com/do/search?q=%s"; //uf = "http://www.google.com/" // "/cse?cx=013269018370076798483%3A8eec3papwpi&" // "ie=UTF-8&q=%s&" // "num=20"; else uf="http://www.bing.com/search?q=%s"; // skip bing for now //if ( m_round == 2 ) // return true; //if ( m_round == 1 ) // return true; // make the url we will download char ubuf[2048]; sprintf ( ubuf , uf , ebuf.getBufStart() ); // log it log("inject: SCRAPING %s",ubuf); SpiderRequest sreq; sreq.reset(); // set the SpiderRequest strcpy(sreq.m_url, ubuf); // . tell it to only add the hosts of each outlink for now! // . that will be passed on to when XmlDoc calls Links::set() i guess // . xd will not reschedule the scraped url into spiderdb either sreq.m_isScraping = 1; sreq.m_fakeFirstIp = 1; int32_t firstIp = hash32n(ubuf); if ( firstIp == 0 || firstIp == -1 ) firstIp = 1; sreq.m_firstIp = firstIp; // parent docid is 0 sreq.setKey(firstIp,0LL,false); //char *coll2 = ir->m_coll; CollectionRec *cr = g_collectiondb.getRec ( ir->m_collnum );//coll2 ); // need to make a new one now XmlDoc *xd; try { xd = new (XmlDoc); } catch ( ... 
) { g_errno = ENOMEM; log("PageInject: scrape failed: new(%i): %s", (int)sizeof(XmlDoc),mstrerror(g_errno)); return true; } mnew ( xd, sizeof(XmlDoc) , "PageInject" ); // save it m_xd = xd; // forceDEl = false, niceness = 0 m_xd->set4 ( &sreq , NULL , cr->m_coll , NULL , 0 ); //m_xd.m_isScraping = true; // download without throttling //m_xd.m_throttleDownload = false; // disregard this m_xd->m_useRobotsTxt = false; // this will tell it to index ahrefs first before indexing // the doc. but do NOT do this if we are from ahrefs.com // ourselves to avoid recursive explosion!! if ( m_useAhrefs ) m_xd->m_useAhrefs = true; m_xd->m_reallyInjectLinks = true;//ir->m_injectLinks; // // rather than just add the links of the page to spiderdb, // let's inject them! // m_xd->setCallback ( this , doneInjectingLinksWrapper ); // niceness is 0 m_linkDedupTable.set(4,0,512,NULL,0,false,0,"ldtab2"); // do we actually inject the links, or just scrape? if ( ! m_xd->injectLinks ( &m_linkDedupTable , NULL, this , doneInjectingLinksWrapper ) ) return false; // otherwise, just download the google/bing search results so we // can display them in xml //else if ( m_xd.getUtf8Content() == (char **)-1 ) // return false; // print reply.. //printReply(); return true; }
bool sendPageThesaurus( TcpSocket *s, HttpRequest *r ) { SafeBuf p; char getBuf[64]; // holds extra values for GET method char formBuf[256]; // holds extra values for forms snprintf(getBuf, 64, "c=%s", r->getString("c", 0, "")); snprintf(formBuf, 256, "<input type=hidden name=\"c\" value=\"%s\">", //"<input type=hidden name=\"pwd\" value=\"%s\">", r->getString("c", 0, "")); g_pages.printAdminTop( &p, s, r); if (r->getLong("cancel", 0) != 0) { g_thesaurus.cancelRebuild(); p.safePrintf("<br><br>\n"); p.safePrintf( "<center><b><font color=#ff0000>" "rebuild canceled" "</font></b></center>"); } if (r->getLong("rebuild", 0) != 0) { bool full = r->getLong("full", 0); p.safePrintf("<br><br>\n"); if (g_thesaurus.rebuild(0, full)) { p.safePrintf( "<center><b><font color=#ff0000>" "error starting rebuild, check log for details" "</font></b></center>"); } else { p.safePrintf( "<center><b><font color=#ff0000>" "rebuild started" "</font></b></center>"); } } if (r->getLong("rebuildaff", 0) != 0) { bool full = r->getLong("full", 0); p.safePrintf("<br><br>\n"); if (g_thesaurus.rebuildAffinity(0, full)) { p.safePrintf( "<center><b><font color=#ff0000>" "error starting rebuild, check log for details" "</font></b></center>"); } else { p.safePrintf( "<center><b><font color=#ff0000>" "rebuild started" "</font></b></center>"); } } if (r->getLong("distribute", 0) != 0) { char cmd[1024]; p.safePrintf("<br><br>\n"); if (g_thesaurus.m_affinityState) { p.safePrintf( "<center><b><font color=#ff0000>" "cannot distribute during rebuild" "</font></b></center>"); } else { for ( long i = 0; i < g_hostdb.getNumHosts() ; i++ ) { Host *h = g_hostdb.getHost(i); snprintf(cmd, 512, "rcp -r " "./dict/thesaurus.* " "%s:%s/dict/ &", iptoa(h->m_ip), h->m_dir); log(LOG_INFO, "admin: %s", cmd); system( cmd ); } p.safePrintf( "<center><b><font color=#ff0000>" "data distributed" "</font></b></center>"); } } if (r->getLong("reload", 0) != 0) { p.safePrintf("<br><br>\n"); if (r->getLong("cast", 0) != 0) { 
p.safePrintf( "<center><b><font color=#ff0000>" "reload command broadcast" "</font></b></center>"); } else if (g_thesaurus.init()) { p.safePrintf( "<center><b><font color=#ff0000>" "thesaurus data reloaded" "</font></b></center>"); } else { p.safePrintf( "<center><b><font color=#ff0000>" "error reloading thesaurus data" "</font></b></center>"); } } long manualAddLen = 0; char *manualAdd = NULL; SafeBuf manualAddBuf; if ((manualAdd = r->getString("manualadd", &manualAddLen))) { trimWhite(manualAdd); manualAddLen = gbstrlen(manualAdd); File manualFile; manualFile.set(g_hostdb.m_dir, "dict/thesaurus-manual.txt"); if (manualFile.open(O_WRONLY | O_CREAT | O_TRUNC) && (manualFile.write(manualAdd, manualAddLen, 0) == manualAddLen)) { char newl = '\n'; // for write() if (manualAdd[manualAddLen-1] != '\n') manualFile.write(&newl, 1, manualAddLen); p.safePrintf( "<center><b><font color=#ff0000>" "updated manual add file sucessfully" "</font></b></center>"); } else { p.safePrintf( "<center><b><font color=#ff0000>" "error writing manual add file" "</font></b></center>"); } } else { char ff[PATH_MAX]; snprintf(ff, PATH_MAX, "%sdict/thesaurus-manual.txt", g_hostdb.m_dir); if (manualAddBuf.fillFromFile(ff)) { if (*(manualAddBuf.getBuf()-1) != '\n') manualAddBuf.pushChar('\n'); manualAdd = manualAddBuf.getBufStart(); manualAddLen = manualAddBuf.length(); } } long affinityAddLen = 0; char *affinityAdd = NULL; SafeBuf affinityAddBuf; if ((affinityAdd = r->getString("affinityadd", &affinityAddLen))) { trimWhite(affinityAdd); affinityAddLen = gbstrlen(affinityAdd); File affinityFile; affinityFile.set(g_hostdb.m_dir, "dict/thesaurus-affinity.txt"); if (affinityFile.open(O_WRONLY | O_CREAT | O_TRUNC) && (affinityFile.write(affinityAdd, affinityAddLen, 0) == affinityAddLen)) { char newl = '\n'; // for write() if (affinityAdd[affinityAddLen-1] != '\n') affinityFile.write(&newl, 1, affinityAddLen); p.safePrintf( "<center><b><font color=#ff0000>" "updated affinity add file sucessfully" 
"</font></b></center>"); } else { p.safePrintf( "<center><b><font color=#ff0000>" "error writing affinity add file" "</font></b></center>"); } } else { char ff[PATH_MAX]; snprintf(ff, PATH_MAX, "%sdict/thesaurus-affinity.txt", g_hostdb.m_dir); if (affinityAddBuf.fillFromFile(ff)) { if (*(affinityAddBuf.getBuf()-1) != '\n') affinityAddBuf.pushChar('\n'); affinityAdd = affinityAddBuf.getBufStart(); affinityAddLen = affinityAddBuf.length(); } } char *syn = r->getString("synonym"); long len = 0; if (syn) len = gbstrlen(syn); if (len) { SynonymInfo info; bool r = g_thesaurus.getAllInfo(syn, &info, len, SYNBIT_ALL); p.safePrintf("<br><br>\n"); p.safePrintf ( "<table cellpadding=4 width=100%% bgcolor=#%s border=1>" "<tr>" "<td colspan=2 bgcolor=#%s>" "<center><b>Synonym List (%ld)</b></center>" "</td>" "</tr>\n", LIGHT_BLUE, DARK_BLUE, info.m_numSyns); if (r) { p.safePrintf("<tr>" "<td align=right><tt>%s</tt></td>" "<td align=left>" "<tt>1.000/%08lX (1.000/%08lX)</tt>" "</td>" "</tr>\n", syn, MAX_AFFINITY, MAX_AFFINITY); for (long i = 0; i < info.m_numSyns; i++) { // get the reverse affinity as well long aff = g_thesaurus.getAffinity( info.m_syn[i], syn, info.m_len[i], len); p.safePrintf( "<tr>" "<td width=40%% align=right>" "<tt>"); p.safeMemcpy(info.m_syn[i], info.m_len[i]); p.safePrintf("</tt>" "</td>" "<td width=60%% align=left>" "<tt>"); if (info.m_affinity[i] >= 0) { p.safePrintf("%0.3f/%08lX ", (float)info.m_affinity[i] / MAX_AFFINITY, info.m_affinity[i]); } else { p.safePrintf("u "); } if (aff >= 0) { p.safePrintf("(%0.3f/%08lX) ", (float)aff / MAX_AFFINITY, aff); } else { p.safePrintf("(u) "); } p.safePrintf("(%ld) (%ld) (%ld) (%ld) " "(%lld) (%lld)", (long)info.m_type[i], (long)info.m_sort[i], info.m_firstId[i], info.m_lastId[i], info.m_leftSynHash[i], info.m_rightSynHash[i]); for (int j = info.m_firstId[i]; j <= info.m_lastId[i]; j++) { p.safePrintf(" (%lld)", info.m_termId[j]); } p.safePrintf( "</tt>" "</td>" "</tr>\n"); } p.safePrintf("</table>"); } else { 
p.safePrintf("<tr>" "<td align=center><font color=#FF0000>" "synonym not found: %s" "</font></td>" "</tr>\n", syn); } } p.safePrintf ( "<br><br>\n" ); p.safePrintf ( "<table cellpadding=4 width=100%% bgcolor=#%s border=1>" "<tr>" "<td colspan=2 bgcolor=#%s>" "<center><b>Thesaurus Controls" "</b></center></td>" "</tr>\n", LIGHT_BLUE, DARK_BLUE); p.safePrintf ( "<tr>" "<td width=37%%><b>rebuild all data</b><br>" "<font size=1>" "rebuilds synonyms and then begins the rebuild process for " "affinity data; this should only be run on one host, as the " "data is copied when the process is finished; full rebuild " "does not use existing affinity data" "</font>" "</td>" "<td width=12%% bgcolor=#0000ff>" "<center><b><a href=\"/master/thesaurus?rebuild=1&%s\">" "rebuild all data</a> <a href=\"/master/thesaurus?" "rebuild=1&full=1&%s\">(full)</a></b></center>" "</td>" "</tr>\n", getBuf, getBuf); p.safePrintf ( "<tr>" "<td width=37%%><b>distribute data</b><br>" "<font size=1>" "distributes all thesaurus data to all hosts, this is " "normally done automatically but if there was a problem " "with the copy, this lets you do it manually" "</font>" "</td>" "<td width=12%% bgcolor=#0000ff>" "<center><b><a href=\"/master/thesaurus?distribute=1&%s\">" "distribute data</a></b></center>" "</td>" "</tr>\n", getBuf); p.safePrintf ( "<tr>" "<td width=37%%><b>reload data</b><br>" "<font size=1>" "reloads the synonyms and affinity table on this host only" "</font>" "</td>" "<td width=12%% bgcolor=#0000ff>" "<center><b>" "<a href=\"/master/thesaurus?reload=1&cast=0&%s\">" "reload data</a></b></center>" "</td>" "</tr>\n", getBuf); p.safePrintf ( "<tr>" "<td width=37%%><b>reload data (all hosts)</b><br>" "<font size=1>" "reloads the synonyms and affinity table on all hosts" "</font>" "</td>" "<td width=12%% bgcolor=#0000ff>" "<center><b>" "<a href=\"/master/thesaurus?reload=1&cast=1&%s\">" "reload data (all hosts)</a></b></center>" "</td>" "</tr>\n", getBuf); p.safePrintf ( "<tr>" "<td 
width=37%%><b>list synonyms</b><br>" "<font size=1>" "enter a word here to list all synonym entries and their " "affinities" "</font>" "</td>" "<td width=12%%>" "<form action=\"/master/thesaurus>\">" "<input type=text name=synonym size=20>" "<input type=submit value=Submit>" "%s" "</form></td>" "</tr>\n", formBuf); p.safePrintf ( "<tr>" "<td colspan=2 bgcolor=#%s>" "<center><b>Affinity Controls" "</b></center></td>" "</tr>\n", DARK_BLUE); p.safePrintf ( "<tr>" "<td width=37%%><b>cancel running rebuild</b><br>" "<font size=1>" "cancels the rebuild and throws all intermediate data away" "</font>" "</td>" "<td width=12%% bgcolor=#0000ff>" "<center><b><a href=\"/master/thesaurus?cancel=1&%s\">" "cancel running rebuild</a></b></center>" "</td>" "</tr>\n", getBuf); p.safePrintf ( "<tr>" "<td width=37%%><b>rebuild affinity only</b><br>" "<font size=1>" "begins the rebuild process for affinity data, has no " "effect if a rebuild is already in progress; full rebuild " "does not reuse existing affinity data" "</font>" "</td>" "<td width=12%% bgcolor=#0000ff>" "<center><b><a href=\"/master/thesaurus?rebuildaff=1&%s\">" "rebuild affinity</a> <a href=\"/master/thesaurus?" 
"rebuildaff=1&full=1&%s\">(full)</a></b></center>" "</td>" "</tr>\n", getBuf, getBuf); p.safePrintf ( "<tr>" "<td colspan=2 bgcolor=#%s>" "<center><b>Manual File Controls" "</b></td>" "</tr>\n", DARK_BLUE); p.safePrintf ( "<tr>" "<td align=center colspan=2>"); p.safePrintf( "<b>manually added pairs</b><br>\n" "<font size=1>place word pairs here that should be linked " "as synonyms, one pair per line, seperated by a pipe '|' " "character, optionally followed by another pipe and a type " "designation; any badly formatted lines will be silently " "ignored</font><br>\n" "<form action=\"/master/thesaurus\" method=post>" "<textarea name=\"manualadd\" rows=20 cols=80>"); if (manualAdd && manualAddLen) { p.htmlEncode(manualAdd, manualAddLen, true); } p.safePrintf ( "</textarea><br>" "<input type=submit value=Submit>" "<input type=reset value=Reset>" "%s" "</form></td>" "</tr>\n", formBuf); p.safePrintf ( "<tr>" "<td align=center colspan=2>" "<b>affinity value overrides</b><br>\n" "<font size=1>place word/phrase pairs here that should have " "there affinity values overridden, format is " "\"word1|word2|value\", where value is a floating point, " "integer (either decimal or hex), or the word \"max\"; " "any badly formatted lines will be silently ignored; note " "that these pairs will only work if the thesaurus otherwise " "has an entry for them, so add them to the manual add file " "above if need be</font><br>\n" "<form action=\"/master/thesaurus\" method=post>" "<textarea name=\"affinityadd\" rows=20 cols=80>"); if (affinityAdd && affinityAddLen) { p.htmlEncode(affinityAdd, affinityAddLen, true); } p.safePrintf ( "</textarea><br>" "<input type=submit value=Submit>" "<input type=reset value=Reset>" "%s" "</form></td>" "</tr>\n", formBuf); p.safePrintf ( "</table>\n" ); p.safePrintf ( "<br><br>\n" ); p.safePrintf ( "<table cellpadding=4 width=100%% bgcolor=#%s border=1>" "<tr>" "<td colspan=2 bgcolor=#%s>" "<center><b>Affinity Builder Status" "</b></td>" "</tr>\n", 
LIGHT_BLUE, DARK_BLUE); long long a, b, c, d, e, f, g, h, i, j, k; StateAffinity *aff = g_thesaurus.m_affinityState; if (!aff) { p.safePrintf ( "<tr><td colspan=2>" "<center><b>Not running</b></center>" "</td></tr>\n"); a = b = c = d = e = f = g = h = i = j = k = 0; } else { a = aff->m_oldTable->getNumSlotsUsed(); b = aff->m_oldTable->getNumSlotsUsed() - aff->m_n; c = aff->m_n; d = (gettimeofdayInMilliseconds() - aff->m_time) / 1000; if (!d || !(c / d)) { e = 0; } else { e = b / (c / d); } f = aff->m_sent; g = aff->m_recv; h = aff->m_errors; i = aff->m_old; j = aff->m_cache; k = aff->m_hitsTable.getNumSlotsUsed(); } p.safePrintf ( "<tr><td><b># of total pairs</b></td>" "<td>%lli</td></tr>\n" "<tr><td><b># of pairs remaining</b></td>" "<td>%lli</td></tr>\n" "<tr><td><b># of pairs processed</b></td>" "<td>%lli</td></tr>\n" "<tr><td><b>elapsed time in seconds</b></td>" "<td>%lli</td></tr>\n" "<tr><td><b>estimated remaining time in seconds</b></td>" "<td>%lli</td></tr>\n" "<tr><td><b># of requests sent</b></td>" "<td>%lli</td></tr>\n" "<tr><td><b># of requests received</b></td>" "<td>%lli</td></tr>\n" "<tr><td><b># of request errors</b></td>" "<td>%lli</td></tr>\n" "<tr><td><b># of old values reused</b></td>" "<td>%lli</td></tr>\n" "<tr><td><b># of cache hits</b></td>" "<td>%lli</td></tr>\n" "<tr><td><b>cache size</b></td>" "<td>%lli</td></tr>\n", a, b, c, d, e, f, g, h, i, j, k); p.safePrintf ( "</table>\n" ); return g_httpServer.sendDynamicPage ( s, p.getBufStart(), p.length() ); }
// . sets m_fileOffset and m_bf // . returns false and sets g_errno on error // . returns false if nothing to read too... but does not set g_errno bool ImportState::setCurrentTitleFileAndOffset ( ) { // leave m_bf and m_fileOffset alone if there is more to read if ( m_fileOffset < m_bfFileSize ) return true; CollectionRec *cr = g_collectiondb.getRec ( m_collnum ); if ( ! cr ) return false; log("import: import finding next file"); // if ( m_offIsValid ) { // //*off = m_fileOffset; // return &m_bf; // } //m_offIsValid = true; // look for titledb0001.dat etc. files in the // workingDir/inject/ subdir SafeBuf ddd; ddd.safePrintf("%sinject",cr->m_importDir.getBufStart()); // now use the one provided. we should also provide the # of threads if ( cr->m_importDir.getBufStart() && cr->m_importDir.getBufStart()[0] ) { ddd.reset(); ddd.safeStrcpy ( cr->m_importDir.getBufStart() ); } // // assume we are the first filename // set s_fileId to the minimum // Dir dir; dir.set(ddd.getBufStart()); if ( ! dir.open() ) return false; // assume none int32_t minFileId = -1; // getNextFilename() writes into this char pattern[64]; strcpy ( pattern , "titledb*" ); char *filename; while ( ( filename = dir.getNextFilename ( pattern ) ) ) { // filename must be a certain length int32_t filenameLen = gbstrlen(filename); // we need at least "titledb0001.dat" if ( filenameLen < 15 ) continue; // ensure filename starts w/ our m_dbname if ( strncmp ( filename , "titledb", 7 ) != 0 ) continue; // skip if not .dat file if ( ! strstr ( filename , ".dat" ) ) continue; // then a 4 digit number should follow char *s = filename + 7; if ( ! isdigit(*(s+0)) ) continue; if ( ! isdigit(*(s+1)) ) continue; if ( ! isdigit(*(s+2)) ) continue; if ( ! isdigit(*(s+3)) ) continue; // convert digit to id int32_t id = atol(s); // . do not accept files we've already processed // . 
-1 means we haven't processed any yet if ( m_bfFileId >= 0 && id <= m_bfFileId ) continue; // the min of those we haven't yet processed/injected if ( id < minFileId || minFileId < 0 ) minFileId = id; } // get where we left off if ( ! m_loadedPlaceHolder ) { // read where we left off from file if possible char fname[256]; sprintf(fname,"%slasttitledbinjectinfo.dat",g_hostdb.m_dir); SafeBuf ff; ff.fillFromFile(fname); if ( ff.length() > 1 ) { m_loadedPlaceHolder = true; // get the placeholder sscanf ( ff.getBufStart() , "%"UINT64",%"INT32"" , &m_fileOffset , &minFileId ); } } // if no files! return false to indicate we are done if ( minFileId == -1 ) return false; // set up s_bf then //if ( m_bfFileId != minFileId ) { SafeBuf tmp; tmp.safePrintf("titledb%04"INT32"-000.dat" //,dir.getDirname() ,minFileId); m_bf.set ( dir.getDirname() ,tmp.getBufStart() ); if ( ! m_bf.open( O_RDONLY ) ) { log("inject: import: could not open %s%s for reading", dir.getDirname(),tmp.getBufStart()); return false; } m_bfFileId = minFileId; // reset ptr into file //*off = 0; // and set this m_bfFileSize = m_bf.getFileSize(); m_fileOffset = 0; //} log("import: importing from file %s",m_bf.getFilename()); return true;//&m_bf; }
// . returns false if blocked, true otherwise // . returns true on error and sets g_errno bool SiteGetter::getSiteList ( ) { top: // . setSite() will return TRUE and set g_errno on error, and returns // false if it blocked adding a tag, which will call callback once // tag is added // . stop at this point if ( m_pathDepth >= 3 ) return setSite(); // or if no more if ( m_pathDepth >= m_maxPathDepth ) return setSite(); // . make the termid // . but here we get are based on "m_pathDepth" which ranges // from 1 to N // . if m_pathDepth==0 use "www.xyz.com" as site // . if m_pathDepth==1 use "www.xyz.com/foo/" as site ... char *pend = getPathEnd ( m_url , m_pathDepth ); // hash up to that //char *host = m_u.getHost(); char *host = getHostFast ( m_url , NULL ); // hash the prefix first to match XmlDoc::hashNoSplit() char *prefix = "siteterm"; // hash that and we will incorporate it to match XmlDoc::hashNoSplit() int64_t ph = hash64 ( prefix , gbstrlen(prefix) ); // . this should match basically what is in XmlDoc.cpp::hash() // . and this now does not include pages that have no outlinks // "underneath" them. int64_t termId = hash64 ( host , pend - host , ph ) & TERMID_MASK; // get all pages that have this as their termid! key144_t start ; key144_t end ; g_posdb.makeStartKey ( &start, termId ); g_posdb.makeEndKey ( &end , termId ); // . now see how many urls art at this path depth from this hostname // . if it is a huge # then we know they are all subsites! // because it is too bushy to be anything else // . i'd say 100 nodes is good enough to qualify as a homestead site int32_t minRecSizes = 5000000; // get the group this list is in //uint32_t gid ; //gid = getGroupId ( RDB_POSDB , (char *)&start , false ); //split? //uint32_t shardNum ; //shardNum = getShardNum( RDB_POSDB , (char *)&start , false ); //split? // i guess this is split by termid and not docid???? int32_t shardNum = g_hostdb.getShardNumByTermId ( &start ); // we need a group #. the column #. 
//int32_t split = g_hostdb.getGroupNum ( gid ); // int16_tcut Msg0 *m = &m_msg0; // get the list. returns false if blocked. if ( ! m->getList ( -1 , // hostId 0 , // ip 0 , // port 0 , // maxCacheAge false , // addToCache RDB_POSDB , m_collnum , &m_list , (char *)&start , (char *)&end , minRecSizes , this , gotSiteListWrapper , m_niceness , // MAX_NICENESS // default parms follow true , // doErrorCorrection? true , // includeTree? true , // doMerge? -1 , // firstHostId 0 , // startFileNum -1 , // numFiles 999999, // timeout -1 , // syncPoint -1 , // preferLocalReads NULL , // msg5 NULL , // msg5b false , // isrealmerge? true , // allowpagecache? false , // forceLocalIndexdb? false , // doIndexdbSplit? nosplit shardNum ) )//split )) return false; // return false if this blocked if ( ! gotSiteList() ) return false; // error? if ( g_errno ) return true; // or all done if ( m_allDone ) return true; // otherwise, try the next path component! goto top; }
// . "sir" is the serialized injectionrequest // . this is called from the http interface, as well as from // XmlDoc::indexWarcOrArc() to inject individual recs/docs from the warc/arc // . returns false and sets g_errno on error, true on success bool Msg7::sendInjectionRequestToHost ( InjectionRequest *ir , void *state , void (* callback)(void *) ) { // ensure it is our own if ( &m_injectionRequest != ir ) { char *xx=NULL;*xx=0; } //if ( strcmp ( ir->ptr_url , "http://www.indyweek.com/durham/current/news.html" ) == 0 ) // fprintf(stderr,"ey\n"); // ensure url not beyond limit if ( ir->ptr_url && gbstrlen(ir->ptr_url) > MAX_URL_LEN ) { g_errno = EURLTOOBIG; return log("inject: url too big."); } // hack fix core if ( ir->size_metadata == 0 ) ir->ptr_metadata = NULL; int32_t sirSize = 0; char *sir = serializeMsg2 ( ir , sizeof(InjectionRequest), &ir->ptr_url, &ir->size_url , &sirSize ); // oom? if ( ! sir ) return log("inject: failed to serialize request"); // free any old one if we are being reused if ( m_sir ) { mfree ( m_sir , m_sirSize , "m7ir" ); m_sir = NULL; } m_state = state; m_callback = callback; // save it for freeing later m_sir = sir; m_sirSize = sirSize; // forward it to another shard? Host *host = getHostToHandleInjection ( ir->ptr_url ); log("inject: sending injection request of url %s reqsize=%i " "to host #%"INT32"", ir->ptr_url,(int)sirSize,host->m_hostId); // . ok, forward it to another host now // . and call got gotForwardedReplyWrapper when reply comes in // . returns false and sets g_errno on error // . returns true on success if ( g_udpServer.sendRequest ( sir , // req , sirSize, 0x07 , // msgtype host->m_ip , // ip host->m_port , // port host->m_hostId, NULL, // retslot this,//state, gotUdpReplyWrapper,//acallback, 99999999 , // timeout -1 , // backoff -1 , // maxwait NULL, // replybuf 0, // replybufmaxsize MAX_NICENESS // niceness ) ) // we also return true on success, false on error return true; if ( ! 
g_errno ) { char *xx=NULL;*xx=0; } // there was an error, g_errno should be set return false; }
bool AutoBan::printTable( TcpSocket *s , HttpRequest *r ) { SafeBuf sb(512 * 512,"autobbuf"); //read in all of the possible cgi parms off the bat: //long user = g_pages.getUserType( s , r ); //char *username = g_users.getUsername(r); //char *pwd = r->getString ("pwd"); char *coll = r->getString ("c"); long banIpsLen; char *banIps = r->getString ("banIps" , &banIpsLen , NULL); long allowIpsLen; char *allowIps = r->getString ("allowIps" , &allowIpsLen , NULL); long clearLen; char *clear = r->getString ("clear" , &clearLen , NULL); bool changed = false; long validCodesLen; char *validCodes = r->getString ("validCodes", &validCodesLen, NULL); long showAllIps = r->getLong("showAllIps", 0); long showLongView = r->getLong("longview", 0); // do it all from parm now //long banRegexLen; //char *banRegex = r->getString("banRegex", &banRegexLen, NULL); // char *ss = sb.getBuf(); // char *ssend = sb.getBufEnd(); g_pages.printAdminTop ( &sb, s , r ); //sb.incrementLength(sss - ss); // MDW: moved to here long now = getTime(); long days; long hours; long minutes; long secs; long msecs; if(r->getLong("resetcodes", 0)) { setCodesFromConf(); } sb.safePrintf("\n<br><br><table %s>\n",TABLE_STYLE); getCalendarFromMs((now - m_codeResetTime) * 1000, &days, &hours, &minutes, &secs, &msecs); sb.safePrintf("<tr><td colspan=18 bgcolor=#%s>" "<center><b>Code Usage " "(<a href=\"/admin/" "autoban?c=%s&resetcodes=1\">reset</a> " "%li days %li hours %li " "minutes %li sec ago)" "</b></center></td></tr>", DARK_BLUE, coll, days, hours, minutes, secs); sb.safePrintf("<tr bgcolor=#%s>" "<td><center><b>Code</b></center></td>" "<td><center><b>IP</b></center></td>" "<td><center><b>Query Count</b></center></td>" "<td><center><b>Bytes Read</b></center></td>" "<td><center><b>Bytes Sent</b></center></td>" "<td><center><b>Outstanding Count</b></center></td>" "<td><center><b>Most Ever Outstanding</b></center></td>" "<td><center><b>Max Outstanding</b></center></td>" "</tr>", LIGHT_BLUE); for(long i = 0; i < 
m_ht.getNumSlots(); i++) { if ( m_ht.getKey ( i ) == 0 ) continue; CodeVal *cv = m_ht.getValuePointerFromSlot ( i ); if ( ! cv ) continue; sb.safePrintf("<tr>"); sb.safePrintf("<td>"); sb.copyToken(cv->m_code);//m_codeVals[i].m_code); sb.safePrintf("</td>"); sb.safePrintf("<td><center>%s</center> </td>", iptoa(cv->m_ip)); sb.safePrintf("<td><center>%lli</center></td>", cv->m_count); sb.safePrintf("<td><center>%lli</center></td>", cv->m_bytesRead); sb.safePrintf("<td><center>%lli</center></td>", cv->m_bytesSent); sb.safePrintf("<td><center>%li</center></td>", cv->m_outstanding); sb.safePrintf("<td><center>%li</center></td>", cv->m_maxEver); if ( cv->m_maxOutstanding != 50 ) sb.safePrintf("<td><center><b>%li</b></center></td>", cv->m_maxOutstanding); else sb.safePrintf("<td><center>%li</center></td>", cv->m_maxOutstanding); sb.safePrintf("</tr>"); } sb.safePrintf ("</table><br><br>\n" ); if(clear && clearLen < 64) { long ip = atoip(clear, clearLen); if(ip) { removeIp(ip); char *beginning; char ipbuf[64];//gotta NULL terminate for strstr memcpy(ipbuf, clear, clearLen); ipbuf[clearLen] = '\0'; beginning = findToken(g_conf.m_banIps, ipbuf, clearLen); if(beginning) { char *to = beginning; char *from = beginning + clearLen; while(*to) *to++ = *from++; } beginning = findToken(g_conf.m_allowIps, ipbuf, clearLen); if(beginning) { char *to = beginning; char *from = beginning + clearLen; while(*to) *to++ = *from++; } changed = true; } } long allowLen; char *allow = r->getString ( "allow" , &allowLen , NULL ); if(allow && allowLen < 64) { long ip = atoip(allow, allowLen); if(ip) { char *beginning; char ipbuf[64];//gotta NULL terminate for strstr memcpy(ipbuf, allow, allowLen); ipbuf[allowLen] = '\0'; beginning = findToken(g_conf.m_allowIps, ipbuf, allowLen); if(!beginning) { //its not present, so add it. 
char *p = g_conf.m_allowIps; while(*p) p++; if(p - g_conf.m_allowIps + allowLen + 2 < AUTOBAN_TEXT_SIZE) { *p++ = '\n'; memcpy(p, ipbuf,allowLen); *(p + allowLen) = '\0'; } else { sb.safePrintf("<font color=red>" "Not enough stack space " "to fit allowIps. " "Increase " "AUTOBAN_TEXT_SIZE in " "Conf.h. " "Had %i need %li." "</font>", AUTOBAN_TEXT_SIZE, p - g_conf.m_allowIps + allowLen + 2); goto dontRemove1; } } beginning = findToken(g_conf.m_banIps, ipbuf, allowLen); if(beginning) { //remove it from banned if present. char *to = beginning; char *from = beginning + allowLen; while(*to) *to++ = *from++; } changed = true; } } dontRemove1: long denyLen; char *deny = r->getString ( "deny" , &denyLen , NULL ); if(deny && denyLen < 64) { long ip = atoip(deny, denyLen); if(ip) { char *beginning; char ipbuf[64];//gotta NULL terminate for strstr memcpy(ipbuf, deny, denyLen); ipbuf[denyLen] = '\0'; beginning = findToken(g_conf.m_banIps, ipbuf, denyLen); if(!beginning) { //its not present, so add it. char *p =g_conf.m_banIps; while(*p) p++; if(p - g_conf.m_banIps + denyLen + 2 < AUTOBAN_TEXT_SIZE) { *p++ = '\n'; memcpy(p, ipbuf,denyLen); *(p + denyLen) = '\0'; } else { sb.safePrintf("<font color=red>Not " "enough stack space " "to fit bannedIPs. " "Increase " "AUTOBAN_TEXT_SIZE in " "Conf.h. " "Had %i need %li." "</font>", AUTOBAN_TEXT_SIZE, p - g_conf.m_banIps + denyLen + 2); goto dontRemove2; } } beginning = findToken(g_conf.m_allowIps, ipbuf, denyLen); if(beginning) { //remove it from allowed list if present. char *to = beginning; char *from = beginning + denyLen; while(*to) *to++ = *from++; } changed = true; } } dontRemove2: if(!g_conf.m_doAutoBan) { sb.safePrintf("<center><font color=red><b>Autoban is disabled, " "turn it on in Master Controls.</b></font></center><br>"); } if(validCodes) { if(validCodesLen >= AUTOBAN_TEXT_SIZE) { sb.safePrintf("<font color=red>Not enough stack space " "to fit codes. " "Increase AUTOBAN_TEXT_SIZE in Conf.h. 
" "Had %i need %li.</font>", AUTOBAN_TEXT_SIZE, validCodesLen); validCodes = NULL; validCodesLen = 0; } else { memcpy(g_conf.m_validCodes, validCodes, validCodesLen); g_conf.m_validCodes[validCodesLen] = '\0'; trimWhite(g_conf.m_validCodes); setCodesFromConf(); } } //first remove all of the ips in the conf, then add the passed in // ones to the conf parm; if (banIps) { //ack, the browser puts in crlf when this comes back, so //we will have a longer string here than the one we sent //out. trim back all extrainious whitespace before we do //bounds checking. trimWhite(banIps); banIpsLen = gbstrlen(banIps); if(banIpsLen >= AUTOBAN_TEXT_SIZE) { sb.safePrintf("<font color=red>Not enough stack space " "to fit bannedIps. " "Increase AUTOBAN_TEXT_SIZE in Conf.h. " "Had %i need %li.</font>", AUTOBAN_TEXT_SIZE, banIpsLen); banIpsLen = AUTOBAN_TEXT_SIZE - 1; } for(long i = 0; i < m_tableSize; i++) { if(m_detectKeys[i] == 0) continue; //check the 'set from conf' bit, and clear those. if(m_detectVals[i].m_flags & FROMCONF) { removeIp(m_detectKeys[i]); } } memcpy(g_conf.m_banIps, banIps, banIpsLen); g_conf.m_banIps[banIpsLen] = '\0'; changed = true; } if (allowIps) { trimWhite(allowIps); allowIpsLen = gbstrlen(allowIps); if(allowIpsLen >= AUTOBAN_TEXT_SIZE) { sb.safePrintf("<font color=red>Not enough stack space " "to fit allowIps. " "Increase AUTOBAN_TEXT_SIZE in Conf.h. " "Had %i need %li.</font>", AUTOBAN_TEXT_SIZE, allowIpsLen); allowIpsLen = AUTOBAN_TEXT_SIZE - 1; } for(long i = 0; i < m_tableSize; i++) { if(m_detectKeys[i] == 0) continue; //check the 'set from conf' bit, and clear those. 
if(m_detectVals[i].m_flags & FROMCONF) { removeIp(m_detectKeys[i]); } } memcpy(g_conf.m_allowIps, allowIps, allowIpsLen); g_conf.m_allowIps[allowIpsLen] = '\0'; changed = true; } if(changed) { trimWhite(g_conf.m_allowIps); trimWhite(g_conf.m_banIps); setFromConf(); } sb.safePrintf("\n<table %s>\n",TABLE_STYLE); sb.safePrintf("<tr><td colspan=2 bgcolor=#%s>" "<center><b>Add IPs</b></center></td></tr>", DARK_BLUE); // ss = sb.getBuf(); // ssend = sb.getBufEnd(); g_parms.printParms (&sb, s, r); // sb.incrementLength(sss - ss); sb.safePrintf ("<tr><td>" "<center>" "<input type=submit value=\"Update\" " "method=\"POST\" border=0>" "</center></td></tr>"); sb.safePrintf ("</table><br><br>\n" ); if(!showLongView) { sb.safePrintf("<b><a href=\"autoban" "?c=%s" "&showAllIps=%li" "&longview=1\">Show watched ips table...</a></b>", coll, showAllIps); return g_httpServer.sendDynamicPage ( s , sb.getBufStart() , sb.length() , -1 , false); } ///////////////////////////////////////////////////////////////////// sb.safePrintf("\n<table %s>\n",TABLE_STYLE); sb.safePrintf("<tr><td colspan=3 bgcolor=#%s>" "<center><b>Watched Ips</b></center></td></tr>", DARK_BLUE); sb.safePrintf("<tr bgcolor=#%s>" "<td><center><b>IP</b></center></td>" "<td><center><b>Description</b></center></td>" // "<td><center><b>Time Added</b></center></td>" "<td><center><b>Allow/Deny/Clear</b></center></td>" "</tr>", LIGHT_BLUE); long *sortedIndices = (long*)mmalloc(m_tableSize * sizeof(long), "AutoBanH"); if(!sortedIndices) { return g_httpServer.sendErrorReply(s,500,mstrerror(ENOMEM)); } long numEntries = 0; for(long i = 0; i < m_tableSize; i++) { if(m_detectKeys[i] == 0) continue; sortedIndices[numEntries++] = i; } SorterTable = m_detectKeys; gbsort(sortedIndices, numEntries, sizeof(long), ip_cmp); //lets put each class of watched ip in its own safebuf then cat //them together at the end. 
SafeBuf allowed; SafeBuf banned; SafeBuf feedLeachers; SafeBuf cowBots; SafeBuf *e; for(long j = 0; j < numEntries; j++) { long i = sortedIndices[j]; if(m_detectKeys[i] == 0) continue; //if(!(m_detectVals[i].m_flags & FROMCONF)) continue; bool allow = m_detectVals[i].m_flags & ALLOW && m_detectVals[i].m_flags & FROMCONF; bool deny = m_detectVals[i].m_flags & DENY && m_detectVals[i].m_flags & FROMCONF; bool explicitban = deny && m_detectVals[i].m_flags & FROMCONF; unsigned short dayCount = m_detectVals[i].m_dayCount; unsigned char minuteCount = m_detectVals[i].m_minuteCount; bool day = dayCount >= g_conf.m_numFreeQueriesPerDay; bool minute = minuteCount >= g_conf.m_numFreeQueriesPerMinute; char *description; char *color; if(allow) { color = GREEN; description = "Allowed"; e = &allowed; } else if(explicitban) { color = RED; description = "Banned"; e = &banned; } else if(minute) { color = RED; description = "Cow Bot"; e = &cowBots; } else if(day) { color = RED; description = "Feed Leacher"; e = &feedLeachers; } else { //this can happen when someone was banned due to //exceeding the quota, then the quota was lowered. 
m_detectVals[i].m_flags &= ~DENY; //log("autoban: ohshit-banning %s",iptoa(s->m_ip)); continue; } e->safePrintf("<tr>"); e->safePrintf("<td bgcolor=#%s><center>%s</center></td><td>" "<center>%s</center></td>" // "<td><center>" // "%li days %li hrs %li min ago" // "</center></td>" "<td><center><a href=\"/admin/" "autoban?c=%s&allow=%s&showAllIps=%li\">" "allow/</a>" "<a href=\"/admin/" "autoban?c=%s&deny=%s&showAllIps=%li\">" "deny/</a>" "<a href=\"/admin/" "autoban?c=%s&clear=%s&showAllIps=%li\">" "clear</a></center>" "</td>",color, iptoa(m_detectKeys[i]), description, // days,hours,minutes, coll, iptoa(m_detectKeys[i]), showAllIps, coll, iptoa(m_detectKeys[i]), showAllIps, coll, iptoa(m_detectKeys[i]), showAllIps); e->safePrintf("</tr>"); } sb.cat(allowed); sb.cat(banned); sb.cat(feedLeachers); sb.cat(cowBots); sb.safePrintf ("</table><br><br>\n" ); // MDW moved from here sb.safePrintf("\n<br><br><table %s>\n",TABLE_STYLE); sb.safePrintf("<tr><td colspan=5 bgcolor=#%s>" "<center><b>Control Panel</b></center></td></tr>", DARK_BLUE); sb.safePrintf("<tr>" "<td bgcolor=#%s><center><b>Show Ips by Number of Queries" "</b></center></td>", LIGHT_BLUE); sb.safePrintf("<td><center><font color=red><b><a href=\"/admin/" "autoban?c=%s&showAllIps=0\">" "0 Queries</a></b>" "</font></center></td>", coll); sb.safePrintf("<td><center><font color=red><b><a href=\"/admin/" "autoban?c=%s&showAllIps=1\">" "1 Query</a></b>" "</font></center></td>", coll); sb.safePrintf("<td><center><font color=red><b><a href=\"/admin/" "autoban?c=%s&showAllIps=10\">" "10 Queries</a></b>" "</font></center></td>", coll); sb.safePrintf("<td><center><font color=red><b><a href=\"/admin/" "autoban?c=%s&showAllIps=100\">" "100 Queries</a></b>" "</font></center></td></tr>", coll); sb.safePrintf ("</table><br><br>\n"); if(!showAllIps) { char* ss = (char*) sb.getBufStart(); long sslen = sb.length(); mfree(sortedIndices, m_tableSize * sizeof(long),"AutoBanH"); return g_httpServer.sendDynamicPage ( s , ss , sslen , 
-1 , false); } sb.safePrintf("\n<br><br><table %s>\n",TABLE_STYLE); sb.safePrintf("<tr><td colspan=6 bgcolor=#%s>" "<center><b>Queries Today</b></center></td></tr>", DARK_BLUE); sb.safePrintf("<tr bgcolor=#%s>" "<td><center><b>IP</b></center></td>" "<td><center><b>Minute count</b></center></td>" "<td><center><b>Day count</b></center></td>" "<td><center><b>Time Until Reset</b></center></td>" "<td><center><b>Times Banned</b></center></td>" "<td><center><b>Allow/Deny</b></center></td>" "</tr>", LIGHT_BLUE); char minBuf[128]; char dayBuf[128]; unsigned long lastIpGroup = 0; for(long j = 0; j < numEntries; j++) { long i = sortedIndices[j]; long dayCount = m_detectVals[i].m_dayCount; unsigned char minuteCount = m_detectVals[i].m_minuteCount; if(!(m_detectVals[i].m_flags & FROMCONF)) { if(m_detectVals[i].m_minuteExpires < now) minuteCount = 0; if(!(m_detectVals[i].m_flags & DENY) && m_detectVals[i].m_dayExpires < now) dayCount = 0; } //a hack: if( dayCount < showAllIps) continue; char *color = YELLOW; if(m_detectVals[i].m_flags & ALLOW) { color = GREEN; snprintf(minBuf, 128, "--"); snprintf(dayBuf, 128, "%li", dayCount); } else if(m_detectVals[i].m_flags & DENY) { color = RED; snprintf(minBuf, 128, "--"); snprintf(dayBuf, 128, "%li", dayCount); } else { snprintf(minBuf, 128, "%li", (long)minuteCount); snprintf(dayBuf, 128, "%li", (long)dayCount); } unsigned long thisIpGroup = (unsigned long)m_detectKeys[i] & 0x00ffffff; sb.safePrintf("<tr><center>"); if(m_detectVals[i].m_flags & FROMCONF) { sb.safePrintf("<td bgcolor=#%s><center>%s%s%s</center></td>" "<td><center>%s</center> </td>" "<td><center>%s</center></td>" "<td><center><font color=red>" "<b>NEVER</b>" "</font></center></td>" "<td><center>--</center></td>", color, (thisIpGroup == lastIpGroup)?"<b>":"", iptoa(m_detectKeys[i]), (thisIpGroup == lastIpGroup)?"</b>":"", minBuf, dayBuf); } else { //they haven't done a query since being unbanned, //unban them now so we don't get negative resets displayed. 
/* no, don't unban the bots!!! MDW yippy project if(m_detectVals[i].m_dayExpires < now) { m_detectVals[i].m_flags &= ~DENY; //log("autoban: dayexpire-unbanning %s", // iptoa(ip)); m_detectVals[i].m_dayExpires = now + ONE_DAY; m_detectVals[i].m_minuteExpires = now + 60; m_detectVals[i].m_dayCount = 0; m_detectVals[i].m_minuteCount = 0; sb.safePrintf("</center></tr>"); continue; } */ getCalendarFromMs((m_detectVals[i].m_dayExpires - now)* 1000, &days, &hours, &minutes, &secs, &msecs); sb.safePrintf("<td bgcolor=#%s><center>%s%s%s</center></td>" "<td><center>%s</center> </td>" "<td><center>%s</center></td>" "<td><center><font color=red>" "<b>%li days %li hrs %li min %li sec</b>" "</font></center></td>" "<td><center>%i</center></td>", color, (thisIpGroup == lastIpGroup)?"<b>":"", iptoa(m_detectKeys[i]), (thisIpGroup == lastIpGroup)?"</b>":"", minBuf, dayBuf, days, hours, minutes, secs, m_detectVals[i].m_timesBanned); } sb.safePrintf("<td><center>" "<a href=\"/admin/" "autoban?c=%s&allow=%s&showAllIps=%li\">" "allow/</a>" "<a href=\"/admin/" "autoban?c=%s&deny=%s&showAllIps=%li\">" "deny</a></center>" "</td>", coll, iptoa(m_detectKeys[i]), showAllIps, coll, iptoa(m_detectKeys[i]), showAllIps); sb.safePrintf("</center></tr>"); lastIpGroup = thisIpGroup; } sb.safePrintf ("</table><br><br>\n" ); char* ss = (char*) sb.getBufStart(); long sslen = sb.length(); mfree(sortedIndices, m_tableSize * sizeof(long),"AutoBanH"); return g_httpServer.sendDynamicPage ( s , ss , sslen , -1 , false); }
bool sendHttpReply ( void *state ) { // get the state properly Msg7 *msg7= (Msg7 *) state; InjectionRequest *ir = &msg7->m_injectionRequest; // extract info from state TcpSocket *sock = msg7->m_socket; //XmlDoc *xd = msg7->m_xd; int64_t docId = msg7->m_replyDocId; // xd->m_docId; // might already be EURLTOOBIG set from above if ( ! g_errno ) g_errno = msg7->m_replyIndexCode; int32_t hostId = 0;//msg7->m_msg7.m_hostId; // set g_errno to index code //if ( xd->m_indexCodeValid && xd->m_indexCode && ! g_errno ) // g_errno = xd->m_indexCode; char format = msg7->m_format; // no url parm? if ( ! g_errno && ! ir->ptr_url && format != FORMAT_HTML ) g_errno = EMISSINGINPUT; if ( g_errno && g_errno != EDOCUNCHANGED ) { int32_t save = g_errno; mdelete ( msg7, sizeof(Msg7) , "PageInject" ); delete (msg7); g_errno = save; char *msg = mstrerror(g_errno); return g_httpServer.sendErrorReply(sock,save,msg,NULL); } char abuf[320]; SafeBuf am(abuf,320,0,false); am.setLabel("injbuf"); char *ct = NULL; // a success reply, include docid and url i guess if ( format == FORMAT_XML ) { am.safePrintf("<response>\n"); am.safePrintf("\t<statusCode>%"INT32"</statusCode>\n", (int32_t)g_errno); am.safePrintf("\t<statusMsg><![CDATA["); am.cdataEncode(mstrerror(g_errno)); am.safePrintf("]]></statusMsg>\n"); // if xmldoc was a container of subdocs that XmlDoc::indexDoc() // call indexWarcOrArc() on then docid is not valid since // we do not index container docs. //int64_t docId = xd->m_docId; //if ( ! xd->m_docIdValid ) docId = 0; am.safePrintf("\t<docId>%"INT64"</docId>\n",docId); // this will have to be re-tooled if we deem necessary. // was being use to do section voting for diffbot // upon a url being injected. 
/* if ( ir->m_getSections ) { SafeBuf *secBuf = xd->getInlineSectionVotingBuf(); am.safePrintf("\t<htmlSrc><![CDATA["); if ( secBuf->length() ) am.cdataEncode(secBuf->getBufStart()); am.safePrintf("]]></htmlSrc>\n"); } */ am.safePrintf("</response>\n"); ct = "text/xml"; } if ( format == FORMAT_JSON ) { am.safePrintf("{\"response\":{\n"); am.safePrintf("\t\"statusCode\":%"INT32",\n",(int32_t)g_errno); am.safePrintf("\t\"statusMsg\":\""); am.jsonEncode(mstrerror(g_errno)); am.safePrintf("\",\n"); am.safePrintf("\t\"docId\":%"INT64",\n",docId);//xd->m_docId); // this will have to be re-tooled if we deem necessary. // was being use to do section voting for diffbot // upon a url being injected. /* if ( ir->m_getSections ) { SafeBuf *secBuf = xd->getInlineSectionVotingBuf(); am.safePrintf("\t\"htmlSrc\":\""); if ( secBuf->length() ) am.jsonEncode(secBuf->getBufStart()); am.safePrintf("\",\n"); } */ // subtract ",\n" am.m_length -= 2; am.safePrintf("\n}\n}\n"); ct = "application/json"; } if ( format == FORMAT_XML || format == FORMAT_JSON ) { mdelete ( msg7, sizeof(Msg7) , "PageInject" ); delete (msg7); return g_httpServer.sendDynamicPage(sock, am.getBufStart(), am.length(), 0, false, ct ); } // // debug // /* // now get the meta list, in the process it will print out a // bunch of junk into msg7->m_pbuf if ( xd->m_docId ) { char *metalist = xd->getMetaList ( 1,1,1,1,1,1 ); if ( ! metalist || metalist==(void *)-1){char *xx=NULL;*xx=0;} // print it out SafeBuf *pbuf = &msg7->m_sbuf; xd->printDoc( pbuf ); bool status = g_httpServer.sendDynamicPage( msg7->m_socket , pbuf->getBufStart(), pbuf->length() , -1, //cachtime false ,//postreply? NULL, //ctype -1 , //httpstatus NULL,//cookie "utf-8"); // delete the state now mdelete ( st , sizeof(Msg7) , "PageInject" ); delete (st); // return the status return status; } */ // // end debug // char *url = ir->ptr_url; // . if we're talking w/ a robot he doesn't care about this crap // . 
send him back the error code (0 means success) if ( url && ir->m_shortReply ) { char buf[1024*32]; char *p = buf; // return docid and hostid if ( ! g_errno ) p += sprintf ( p , "0,docId=%"INT64"," "hostId=%"INT32"," , docId , hostId ); // print error number here else p += sprintf ( p , "%"INT32",0,0,", (int32_t)g_errno ); // print error msg out, too or "Success" p += sprintf ( p , "%s", mstrerror(g_errno)); mdelete ( msg7, sizeof(Msg7) , "PageInject" ); delete (msg7); return g_httpServer.sendDynamicPage ( sock,buf, gbstrlen(buf) , -1/*cachetime*/); } SafeBuf sb; // print admin bar g_pages.printAdminTop ( &sb, sock , &msg7->m_hr ); // print a response msg if rendering the page after a submission if ( g_errno ) sb.safePrintf ( "<center>Error injecting url: <b>%s[%i]</b>" "</center>", mstrerror(g_errno) , g_errno); else if ( (ir->ptr_url && ir->ptr_url[0]) || (ir->ptr_queryToScrape&&ir->ptr_queryToScrape[0]) ) sb.safePrintf ( "<center><b>Sucessfully injected %s" "</center><br>" , ir->ptr_url //, xd->m_firstUrl.m_url ); // print the table of injection parms g_parms.printParmTable ( &sb , sock , &msg7->m_hr ); // clear g_errno, if any, so our reply send goes through g_errno = 0; // calculate buffer length //int32_t bufLen = p - buf; // nuke state mdelete ( msg7, sizeof(Msg7) , "PageInject" ); delete (msg7); // . send this page // . encapsulates in html header and tail // . make a Mime // . i thought we need -2 for cacheTime, but i guess not return g_httpServer.sendDynamicPage (sock, sb.getBufStart(), sb.length(), -1/*cachetime*/); }
static bool isTLD ( char *tld , int32_t tldLen ) { int32_t pcount = 0; // now they are random! for ( int32_t i = 0 ; i < tldLen ; i++ ) { // period count if ( tld[i] == '.' ) { pcount++; continue; } if ( ! is_alnum_a(tld[i]) && tld[i] != '-' ) return false; } if ( pcount == 0 ) return true; if ( pcount >= 2 ) return false; // otherwise, if one period, check table to see if qualified // we use this as our hashtable static bool s_isInitialized = false; // . i shrunk this list a lot // . see backups for the hold list static const char * const s_tlds[] = { // From: https://data.iana.org/TLD/tlds-alpha-by-domain.txt "AAA", "AARP", "ABB", "ABBOTT", "ABBVIE", "ABOGADO", "ABUDHABI", "AC", "ACADEMY", "ACCENTURE", "ACCOUNTANT", "ACCOUNTANTS", "ACO", "ACTIVE", "ACTOR", "AD", "ADAC", "ADS", "ADULT", "AE", "AEG", "AERO", "AF", "AFL", "AG", "AGAKHAN", "AGENCY", "AI", "AIG", "AIRFORCE", "AIRTEL", "AKDN", "AL", "ALIBABA", "ALIPAY", "ALLFINANZ", "ALLY", "ALSACE", "AM", "AMICA", "AMSTERDAM", "ANALYTICS", "ANDROID", "ANQUAN", "AO", "APARTMENTS", "APP", "APPLE", "AQ", "AQUARELLE", "AR", "ARAMCO", "ARCHI", "ARMY", "ARPA", "ARTE", "AS", "ASIA", "ASSOCIATES", "AT", "ATTORNEY", "AU", "AUCTION", "AUDI", "AUDIO", "AUTHOR", "AUTO", "AUTOS", "AVIANCA", "AW", "AWS", "AX", "AXA", "AZ", "AZURE", "BA", "BABY", "BAIDU", "BAND", "BANK", "BAR", "BARCELONA", "BARCLAYCARD", "BARCLAYS", "BAREFOOT", "BARGAINS", "BAUHAUS", "BAYERN", "BB", "BBC", "BBVA", "BCG", "BCN", "BD", "BE", "BEATS", "BEER", "BENTLEY", "BERLIN", "BEST", "BET", "BF", "BG", "BH", "BHARTI", "BI", "BIBLE", "BID", "BIKE", "BING", "BINGO", "BIO", "BIZ", "BJ", "BLACK", "BLACKFRIDAY", "BLOOMBERG", "BLUE", "BM", "BMS", "BMW", "BN", "BNL", "BNPPARIBAS", "BO", "BOATS", "BOEHRINGER", "BOM", "BOND", "BOO", "BOOK", "BOOTS", "BOSCH", "BOSTIK", "BOT", "BOUTIQUE", "BR", "BRADESCO", "BRIDGESTONE", "BROADWAY", "BROKER", "BROTHER", "BRUSSELS", "BS", "BT", "BUDAPEST", "BUGATTI", "BUILD", "BUILDERS", "BUSINESS", "BUY", "BUZZ", "BV", "BW", "BY", "BZ", 
"BZH", "CA", "CAB", "CAFE", "CAL", "CALL", "CAMERA", "CAMP", "CANCERRESEARCH", "CANON", "CAPETOWN", "CAPITAL", "CAR", "CARAVAN", "CARDS", "CARE", "CAREER", "CAREERS", "CARS", "CARTIER", "CASA", "CASH", "CASINO", "CAT", "CATERING", "CBA", "CBN", "CC", "CD", "CEB", "CENTER", "CEO", "CERN", "CF", "CFA", "CFD", "CG", "CH", "CHANEL", "CHANNEL", "CHASE", "CHAT", "CHEAP", "CHLOE", "CHRISTMAS", "CHROME", "CHURCH", "CI", "CIPRIANI", "CIRCLE", "CISCO", "CITIC", "CITY", "CITYEATS", "CK", "CL", "CLAIMS", "CLEANING", "CLICK", "CLINIC", "CLINIQUE", "CLOTHING", "CLOUD", "CLUB", "CLUBMED", "CM", "CN", "CO", "COACH", "CODES", "COFFEE", "COLLEGE", "COLOGNE", "COM", "COMMBANK", "COMMUNITY", "COMPANY", "COMPARE", "COMPUTER", "COMSEC", "CONDOS", "CONSTRUCTION", "CONSULTING", "CONTACT", "CONTRACTORS", "COOKING", "COOL", "COOP", "CORSICA", "COUNTRY", "COUPON", "COUPONS", "COURSES", "CR", "CREDIT", "CREDITCARD", "CREDITUNION", "CRICKET", "CROWN", "CRS", "CRUISES", "CSC", "CU", "CUISINELLA", "CV", "CW", "CX", "CY", "CYMRU", "CYOU", "CZ", "DABUR", "DAD", "DANCE", "DATE", "DATING", "DATSUN", "DAY", "DCLK", "DE", "DEALER", "DEALS", "DEGREE", "DELIVERY", "DELL", "DELOITTE", "DELTA", "DEMOCRAT", "DENTAL", "DENTIST", "DESI", "DESIGN", "DEV", "DIAMONDS", "DIET", "DIGITAL", "DIRECT", "DIRECTORY", "DISCOUNT", "DJ", "DK", "DM", "DNP", "DO", "DOCS", "DOG", "DOHA", "DOMAINS", "DOWNLOAD", "DRIVE", "DUBAI", "DURBAN", "DVAG", "DZ", "EARTH", "EAT", "EC", "EDEKA", "EDU", "EDUCATION", "EE", "EG", "EMAIL", "EMERCK", "ENERGY", "ENGINEER", "ENGINEERING", "ENTERPRISES", "EPSON", "EQUIPMENT", "ER", "ERNI", "ES", "ESQ", "ESTATE", "ET", "EU", "EUROVISION", "EUS", "EVENTS", "EVERBANK", "EXCHANGE", "EXPERT", "EXPOSED", "EXPRESS", "EXTRASPACE", "FAGE", "FAIL", "FAIRWINDS", "FAITH", "FAMILY", "FAN", "FANS", "FARM", "FASHION", "FAST", "FEEDBACK", "FERRERO", "FI", "FILM", "FINAL", "FINANCE", "FINANCIAL", "FIRESTONE", "FIRMDALE", "FISH", "FISHING", "FIT", "FITNESS", "FJ", "FK", "FLICKR", "FLIGHTS", "FLORIST", "FLOWERS", 
"FLSMIDTH", "FLY", "FM", "FO", "FOO", "FOOTBALL", "FORD", "FOREX", "FORSALE", "FORUM", "FOUNDATION", "FOX", "FR", "FRESENIUS", "FRL", "FROGANS", "FRONTIER", "FTR", "FUND", "FURNITURE", "FUTBOL", "FYI", "GA", "GAL", "GALLERY", "GALLO", "GALLUP", "GAME", "GARDEN", "GB", "GBIZ", "GD", "GDN", "GE", "GEA", "GENT", "GENTING", "GF", "GG", "GGEE", "GH", "GI", "GIFT", "GIFTS", "GIVES", "GIVING", "GL", "GLASS", "GLE", "GLOBAL", "GLOBO", "GM", "GMAIL", "GMBH", "GMO", "GMX", "GN", "GOLD", "GOLDPOINT", "GOLF", "GOO", "GOOG", "GOOGLE", "GOP", "GOT", "GOV", "GP", "GQ", "GR", "GRAINGER", "GRAPHICS", "GRATIS", "GREEN", "GRIPE", "GROUP", "GS", "GT", "GU", "GUCCI", "GUGE", "GUIDE", "GUITARS", "GURU", "GW", "GY", "HAMBURG", "HANGOUT", "HAUS", "HDFCBANK", "HEALTH", "HEALTHCARE", "HELP", "HELSINKI", "HERE", "HERMES", "HIPHOP", "HITACHI", "HIV", "HK", "HM", "HN", "HOCKEY", "HOLDINGS", "HOLIDAY", "HOMEDEPOT", "HOMES", "HONDA", "HORSE", "HOST", "HOSTING", "HOTELES", "HOTMAIL", "HOUSE", "HOW", "HR", "HSBC", "HT", "HTC", "HU", "HYUNDAI", "IBM", "ICBC", "ICE", "ICU", "ID", "IE", "IFM", "IINET", "IL", "IM", "IMAMAT", "IMMO", "IMMOBILIEN", "IN", "INDUSTRIES", "INFINITI", "INFO", "ING", "INK", "INSTITUTE", "INSURANCE", "INSURE", "INT", "INTERNATIONAL", "INVESTMENTS", "IO", "IPIRANGA", "IQ", "IR", "IRISH", "IS", "ISELECT", "ISMAILI", "IST", "ISTANBUL", "IT", "ITAU", "IWC", "JAGUAR", "JAVA", "JCB", "JCP", "JE", "JETZT", "JEWELRY", "JLC", "JLL", "JM", "JMP", "JNJ", "JO", "JOBS", "JOBURG", "JOT", "JOY", "JP", "JPMORGAN", "JPRS", "JUEGOS", "KAUFEN", "KDDI", "KE", "KERRYHOTELS", "KERRYLOGISTICS", "KERRYPROPERTIES", "KFH", "KG", "KH", "KI", "KIA", "KIM", "KINDER", "KITCHEN", "KIWI", "KM", "KN", "KOELN", "KOMATSU", "KP", "KPMG", "KPN", "KR", "KRD", "KRED", "KUOKGROUP", "KW", "KY", "KYOTO", "KZ", "LA", "LACAIXA", "LAMBORGHINI", "LAMER", "LANCASTER", "LAND", "LANDROVER", "LANXESS", "LASALLE", "LAT", "LATROBE", "LAW", "LAWYER", "LB", "LC", "LDS", "LEASE", "LECLERC", "LEGAL", "LEXUS", "LGBT", "LI", 
"LIAISON", "LIDL", "LIFE", "LIFEINSURANCE", "LIFESTYLE", "LIGHTING", "LIKE", "LIMITED", "LIMO", "LINCOLN", "LINDE", "LINK", "LIPSY", "LIVE", "LIVING", "LIXIL", "LK", "LOAN", "LOANS", "LOCUS", "LOL", "LONDON", "LOTTE", "LOTTO", "LOVE", "LR", "LS", "LT", "LTD", "LTDA", "LU", "LUPIN", "LUXE", "LUXURY", "LV", "LY", "MA", "MADRID", "MAIF", "MAISON", "MAKEUP", "MAN", "MANAGEMENT", "MANGO", "MARKET", "MARKETING", "MARKETS", "MARRIOTT", "MBA", "MC", "MD", "ME", "MED", "MEDIA", "MEET", "MELBOURNE", "MEME", "MEMORIAL", "MEN", "MENU", "MEO", "MG", "MH", "MIAMI", "MICROSOFT", "MIL", "MINI", "MK", "ML", "MLS", "MM", "MMA", "MN", "MO", "MOBI", "MOBILY", "MODA", "MOE", "MOI", "MOM", "MONASH", "MONEY", "MONTBLANC", "MORMON", "MORTGAGE", "MOSCOW", "MOTORCYCLES", "MOV", "MOVIE", "MOVISTAR", "MP", "MQ", "MR", "MS", "MT", "MTN", "MTPC", "MTR", "MU", "MUSEUM", "MUTUAL", "MUTUELLE", "MV", "MW", "MX", "MY", "MZ", "NA", "NADEX", "NAGOYA", "NAME", "NATURA", "NAVY", "NC", "NE", "NEC", "NET", "NETBANK", "NETWORK", "NEUSTAR", "NEW", "NEWS", "NEXT", "NEXTDIRECT", "NEXUS", "NF", "NG", "NGO", "NHK", "NI", "NICO", "NIKON", "NINJA", "NISSAN", "NISSAY", "NL", "NO", "NOKIA", "NORTHWESTERNMUTUAL", "NORTON", "NOWRUZ", "NP", "NR", "NRA", "NRW", "NTT", "NU", "NYC", "NZ", "OBI", "OFFICE", "OKINAWA", "OLAYAN", "OM", "OMEGA", "ONE", "ONG", "ONL", "ONLINE", "OOO", "ORACLE", "ORANGE", "ORG", "ORGANIC", "ORIGINS", "OSAKA", "OTSUKA", "OVH", "PA", "PAGE", "PAMPEREDCHEF", "PANERAI", "PARIS", "PARS", "PARTNERS", "PARTS", "PARTY", "PASSAGENS", "PE", "PET", "PF", "PG", "PH", "PHARMACY", "PHILIPS", "PHOTO", "PHOTOGRAPHY", "PHOTOS", "PHYSIO", "PIAGET", "PICS", "PICTET", "PICTURES", "PID", "PIN", "PING", "PINK", "PIZZA", "PK", "PL", "PLACE", "PLAY", "PLAYSTATION", "PLUMBING", "PLUS", "PM", "PN", "POHL", "POKER", "P**N", "POST", "PR", "PRAXI", "PRESS", "PRO", "PROD", "PRODUCTIONS", "PROF", "PROGRESSIVE", "PROMO", "PROPERTIES", "PROPERTY", "PROTECTION", "PS", "PT", "PUB", "PW", "PWC", "PY", "QA", "QPON", "QUEBEC", 
"QUEST", "RACING", "RE", "READ", "REALTOR", "REALTY", "RECIPES", "RED", "REDSTONE", "REDUMBRELLA", "REHAB", "REISE", "REISEN", "REIT", "REN", "RENT", "RENTALS", "REPAIR", "REPORT", "REPUBLICAN", "REST", "RESTAURANT", "REVIEW", "REVIEWS", "REXROTH", "RICH", "RICOH", "RIO", "RIP", "RO", "ROCHER", "ROCKS", "RODEO", "ROOM", "RS", "RSVP", "RU", "RUHR", "RUN", "RW", "RWE", "RYUKYU", "SA", "SAARLAND", "SAFE", "SAFETY", "SAKURA", "SALE", "SALON", "SAMSUNG", "SANDVIK", "SANDVIKCOROMANT", "SANOFI", "SAP", "SAPO", "SARL", "SAS", "SAXO", "SB", "SBI", "SBS", "SC", "SCA", "SCB", "SCHAEFFLER", "SCHMIDT", "SCHOLARSHIPS", "SCHOOL", "SCHULE", "SCHWARZ", "SCIENCE", "SCOR", "SCOT", "SD", "SE", "SEAT", "SECURITY", "SEEK", "SELECT", "SENER", "SERVICES", "SEVEN", "SEW", "SEX", "SEXY", "SFR", "SG", "SH", "SHARP", "SHAW", "SHELL", "SHIA", "SHIKSHA", "SHOES", "SHOUJI", "SHOW", "SHRIRAM", "SI", "SINA", "SINGLES", "SITE", "SJ", "SK", "SKI", "SKIN", "SKY", "SKYPE", "SL", "SM", "SMILE", "SN", "SNCF", "SO", "SOCCER", "SOCIAL", "SOFTBANK", "SOFTWARE", "SOHU", "SOLAR", "SOLUTIONS", "SONG", "SONY", "SOY", "SPACE", "SPIEGEL", "SPOT", "SPREADBETTING", "SR", "SRL", "ST", "STADA", "STAR", "STARHUB", "STATEBANK", "STATEFARM", "STATOIL", "STC", "STCGROUP", "STOCKHOLM", "STORAGE", "STORE", "STREAM", "STUDIO", "STUDY", "STYLE", "SU", "SUCKS", "SUPPLIES", "SUPPLY", "SUPPORT", "SURF", "SURGERY", "SUZUKI", "SV", "SWATCH", "SWISS", "SX", "SY", "SYDNEY", "SYMANTEC", "SYSTEMS", "SZ", "TAB", "TAIPEI", "TALK", "TAOBAO", "TATAMOTORS", "TATAR", "TATTOO", "TAX", "TAXI", "TC", "TCI", "TD", "TEAM", "TECH", "TECHNOLOGY", "TEL", "TELECITY", "TELEFONICA", "TEMASEK", "TENNIS", "TEVA", "TF", "TG", "TH", "THD", "THEATER", "THEATRE", "TICKETS", "TIENDA", "TIFFANY", "TIPS", "TIRES", "TIROL", "TJ", "TK", "TL", "TM", "TMALL", "TN", "TO", "TODAY", "TOKYO", "TOOLS", "TOP", "TORAY", "TOSHIBA", "TOTAL", "TOURS", "TOWN", "TOYOTA", "TOYS", "TR", "TRADE", "TRADING", "TRAINING", "TRAVEL", "TRAVELERS", "TRAVELERSINSURANCE", "TRUST", 
"TRV", "TT", "TUBE", "TUI", "TUNES", "TUSHU", "TV", "TVS", "TW", "TZ", "UA", "UBS", "UG", "UK", "UNICOM", "UNIVERSITY", "UNO", "UOL", "US", "UY", "UZ", "VA", "VACATIONS", "VANA", "VC", "VE", "VEGAS", "VENTURES", "VERISIGN", "VERSICHERUNG", "VET", "VG", "VI", "VIAJES", "VIDEO", "VIG", "VIKING", "VILLAS", "VIN", "VIP", "VIRGIN", "VISION", "VISTA", "VISTAPRINT", "VIVA", "VLAANDEREN", "VN", "VODKA", "VOLKSWAGEN", "VOTE", "VOTING", "VOTO", "VOYAGE", "VU", "VUELOS", "WALES", "WALTER", "WANG", "WANGGOU", "WARMAN", "WATCH", "WATCHES", "WEATHER", "WEATHERCHANNEL", "WEBCAM", "WEBER", "WEBSITE", "WED", "WEDDING", "WEIBO", "WEIR", "WF", "WHOSWHO", "WIEN", "WIKI", "WILLIAMHILL", "WIN", "WINDOWS", "WINE", "WME", "WOLTERSKLUWER", "WORK", "WORKS", "WORLD", "WS", "WTC", "WTF", "XBOX", "XEROX", "XIHUAN", "XIN", "XN--11B4C3D", "XN--1CK2E1B", "XN--1QQW23A", "XN--30RR7Y", "XN--3BST00M", "XN--3DS443G", "XN--3E0B707E", "XN--3PXU8K", "XN--42C2D9A", "XN--45BRJ9C", "XN--45Q11C", "XN--4GBRIM", "XN--55QW42G", "XN--55QX5D", "XN--5TZM5G", "XN--6FRZ82G", "XN--6QQ986B3XL", "XN--80ADXHKS", "XN--80AO21A", "XN--80ASEHDB", "XN--80ASWG", "XN--8Y0A063A", "XN--90A3AC", "XN--90AIS", "XN--9DBQ2A", "XN--9ET52U", "XN--9KRT00A", "XN--B4W605FERD", "XN--BCK1B9A5DRE4C", "XN--C1AVG", "XN--C2BR7G", "XN--CCK2B3B", "XN--CG4BKI", "XN--CLCHC0EA0B2G2A9GCD", "XN--CZR694B", "XN--CZRS0T", "XN--CZRU2D", "XN--D1ACJ3B", "XN--D1ALF", "XN--E1A4C", "XN--ECKVDTC9D", "XN--EFVY88H", "XN--ESTV75G", "XN--FCT429K", "XN--FHBEI", "XN--FIQ228C5HS", "XN--FIQ64B", "XN--FIQS8S", "XN--FIQZ9S", "XN--FJQ720A", "XN--FLW351E", "XN--FPCRJ9C3D", "XN--FZC2C9E2C", "XN--G2XX48C", "XN--GCKR3F0F", "XN--GECRJ9C", "XN--H2BRJ9C", "XN--HXT814E", "XN--I1B6B1A6A2E", "XN--IMR513N", "XN--IO0A7I", "XN--J1AEF", "XN--J1AMH", "XN--J6W193G", "XN--JLQ61U9W7B", "XN--JVR189M", "XN--KCRX77D1X4A", "XN--KPRW13D", "XN--KPRY57D", "XN--KPU716F", "XN--KPUT3I", "XN--L1ACC", "XN--LGBBAT1AD8J", "XN--MGB9AWBF", "XN--MGBA3A3EJT", "XN--MGBA3A4F16A", "XN--MGBA7C0BBN0A", 
"XN--MGBAAM7A8H", "XN--MGBAB2BD", "XN--MGBAYH7GPA", "XN--MGBB9FBPOB", "XN--MGBBH1A71E", "XN--MGBC0A9AZCG", "XN--MGBCA7DZDO", "XN--MGBERP4A5D4AR", "XN--MGBPL2FH", "XN--MGBT3DHD", "XN--MGBTX2B", "XN--MGBX4CD0AB", "XN--MIX891F", "XN--MK1BU44C", "XN--MXTQ1M", "XN--NGBC5AZD", "XN--NGBE9E0A", "XN--NODE", "XN--NQV7F", "XN--NQV7FS00EMA", "XN--NYQY26A", "XN--O3CW4H", "XN--OGBPF8FL", "XN--P1ACF", "XN--P1AI", "XN--PBT977C", "XN--PGBS0DH", "XN--PSSY2U", "XN--Q9JYB4C", "XN--QCKA1PMC", "XN--QXAM", "XN--RHQV96G", "XN--ROVU88B", "XN--S9BRJ9C", "XN--SES554G", "XN--T60B56A", "XN--TCKWE", "XN--UNUP4Y", "XN--VERMGENSBERATER-CTB", "XN--VERMGENSBERATUNG-PWB", "XN--VHQUV", "XN--VUQ861B", "XN--W4R85EL8FHU5DNRA", "XN--WGBH1C", "XN--WGBL6A", "XN--XHQ521B", "XN--XKC2AL3HYE2A", "XN--XKC2DL3A5EE0H", "XN--Y9A3AQ", "XN--YFRO4I67O", "XN--YGBI2AMMX", "XN--ZFR164B", "XPERIA", "XXX", "XYZ", "YACHTS", "YAHOO", "YAMAXUN", "YANDEX", "YE", "YODOBASHI", "YOGA", "YOKOHAMA", "YOU", "YOUTUBE", "YT", "YUN", "ZA", "ZARA", "ZERO", "ZIP", "ZM", "ZONE", "ZUERICH", "ZW", "AB.CA", "AC.AE", "AC.AT", "AC.CN", "AC.CR", "AC.CY", "AC.FJ", "AC.GG", "AC.ID", "AC.IL", "AC.IM", "AC.IN", "AC.JE", "AC.JP", "AC.KR", "AC.NZ", "AC.PA", "AC.TH", "AC.UG", "AC.UK", "AC.YU", "AC.ZA", "AD.JP", "AH.CN", "ALDERNEY.GG", "ALT.ZA", "ART.BR", "ART.DO", "ARTS.CO", "ARTS.VE", "ASN.AU", "ASN.LV", "BBS.TR", "BC.CA", "BIB.VE", "BJ.CN", "CO.AT", "CO.AO", "CO.CK", "CO.CR", "CO.GG", "CO.HU", "CO.ID", "CO.IL", "CO.IM", "CO.IN", "CO.JE", "CO.JP", "CO.KR", "COM.AR", "COM.AU", "COM.AZ", "COM.BB", "COM.BM", "COM.BR", "COM.BS", "COM.CN", "COM.CO", "COM.CU", "COM.CY", "COM.DO", "COM.EC", "COM.EG", "COM.FJ", "COM.GE", "COM.GU", "COM.HK", "COM.JO", "COM.KH", "COM.LA", "COM.LB", "COM.LC", "COM.LV", "COM.LY", "COM.MM", "COM.MO", "COM.MT", "COM.MX", "COM.MY", "COM.NA", "COM.NC", "COM.NI", "COM.NP", "COM.PA", "COM.PE", "COM.PH", "COM.PL", "COM.PY", "COM.RU", "COM.SG", "COM.SH", "COM.SY", "COM.TN", "COM.TR", "COM.TW", "COM.UA", "COM.UY", "COM.VE", "CONF.AU", 
"CONF.LV", "CO.NZ", "COOP", "CO.AE", "CO.SV", "CO.TH", "CO.UG", "CO.UK", "CO.VE", "CO.VI", "CO.YU", "CO.ZA", "CQ.CN", "CSIRO.AU", "ED.CR", "EDU.BM", "EDU.AR", "EDU.CN", "EDU.CO", "EDU.DO", "EDU.EC", "EDU.EG", "EDU.GE", "EDU.GU", "EDU.JO", "EDU.LC", "EDU.LV", "EDU.MM", "EDU.MO", "EDU.MY", "EDUNET.TN", "EDU.PA", "EDU.PY", "EDU.SG", "EDU.SH", "EDU.TR", "EDU.TW", "EDU.UY", "EDU.VE", "EDU.YU", "EDU.ZA", "ENS.TN", "ERNET.IN", "ESP.BR", "ETC.BR", "EUN.EG", "FI.CR", "FIN.EC", "FIN.TN", "FIRM.CO", "FIRM.VE", "G12.BR", "GD.CN", "GEN.NZ", "GOB.PA", "GO.CR", "GO.ID", "GO.KR", "GO.TH", "GO.UG", "GOV.AE", "GOV.AR", "GOV.AU", "GOV.BM", "GOV.BR", "GOV.CN", "GOV.CO", "GOV.CY", "GOV.DO", "GOV.EC", "GOV.EG", "GOVE.TW", "GOV.FJ", "GOV.GE", "GOV.GG", "GOV.GU", "GOV.IL", "GOV.IM", "GOV.IN", "GOV.JE", "GOV.JO", "GOV.JP", "GOV.LB", "GOV.LC", "GOV.LV", "GOV.MM", "GOV.MO", "GOV.MY", "GOV.SG", "GOV.SH", "GOV.TN", "GOVT.NZ", "GOV.TR", "GOV.UA", "GOV.UK", "GOV.VE", "GOV.ZA", "GS.CN", "GUERNSEY.GG", "GX.CN", "GZ.CN", "HB.CN", "HE.CN", "HI.CN", "HK.CN", "HL.CN", "HN.CN", "ID.AU", "ID.FJ", "ID.LV", "IND.BR", "IND.GG", "IND.JE", "IND.TN", "INF.BR", "INFO.AU", "INFO.CO", "INFO.HU", "INFO.TN", "INFO.VE", "INT.CO", "INTL.TN", "INT.VE", "JERSEY.JE", "JL.CN", "JS.CN", "K12.EC", "K12.IL", "K12.TR", "LKD.CO.IM", "LN.CN", "LTD.GG", "LTD.JE", "LTD.UK", "MB.CA", "MED.EC", "MIL.BR", "MIL.CO", "MIL.DO", "MIL.EC", "MIL.GE", "MIL.GU", "MIL.ID", "MIL.LB", "MIL.LV", "MIL.PH", "MIL.SH", "MIL.TR", "MIL.VE", "MIL.ZA", "MO.CN", "MOD.UK", "MUNI.IL", "MUSEUM", "NAME", "NAT.TN", "NB.CA", "NET.AR", "NET.AU", "NET.AZ", "NET.BB", "NET.BM", "NET.BR", "NET.BS", "NET.CN", "NET.CU", "NET.CY", "NET.DO", "NET.EC", "NET.EG", "NET.GE", "NET.GG", "NET.GU", "NET.HK", "NET.ID", "NET.IL", "NET.IM", "NET.IN", "NET.JE", "NET.JO", "NET.JP", "NET.KH", "NET.LA", "NET.LB", "NET.LC", "NET.LV", "NET.LY", "NET.MM", "NET.MO", "NET.MT", "NET.MX", "NET.MY", "NET.NA", "NET.NC", "NET.NP", "NET.NZ", "NET.PA", "NET.PE", "NET.PH", "NET.PL", "NET.PY", 
"NET.RU", "NET.SG", "NET.SH", "NET.SY", "NET.TH", "NET.TN", "NET.TR", "NET.TW", "NET.UA", "NET.UK", "NET.UY", "NET.VE", "NET.VI", "NET.ZA", "NF.CA", "NGO.PH", "NGO.ZA", "NHS.UK", "NIC.IM", "NIC.IN", "NM.CN", "NM.KR", "NOM.CO", "NOM.VE", "NOM.ZA", "NS.CA", "NSK.SU", "NT.CA", "NUI.HU", "NX.CN", "ON.CA", "OR.CR", "ORG.AE", "ORG.AR", "ORG.AU", "ORG.AZ", "ORG.BB", "ORG.BM", "ORG.BR", "ORG.BS", "ORG.CN", "ORG.CO", "ORG.CU", "ORG.CY", "ORG.DO", "ORG.EC", "ORG.EG", "ORG.FJ", "ORG.GE", "ORG.GG", "ORG.GU", "ORG.HK", "ORG.HU", "ORG.IL", "ORG.IM", "ORG.JE", "ORG.JP", "ORG.KH", "ORG.LA", "ORG.LB", "ORG.LC", "ORG.LV", "ORG.LY", "ORG.MM", "ORG.MO", "ORG.MT", "ORG.MX", "ORG.MY", "ORG.NA", "ORG.NC", "ORG.NZ", "ORG.PA", "ORG.PE", "ORG.PH", "ORG.PL", "ORG.PY", "ORG.RU", "ORG.SG", "ORG.SH", "ORG.SY", "ORG.TN", "ORG.TR", "ORG.TW", "ORG.UK", "ORG.UY", "ORG.VE", "ORG.VI", "ORG.YU", "ORG.ZA", "OR.ID", "OR.KR", "OR.TH", "ORT.NP", "OR.UG", "OZ.AU", "PE.CA", "PLC.CO.IM", "PLC.UK", "POLICE.UK", "PRIV.HU", "PSI.BR", "PVT.GE", "QC.CA", "QH.CN", "REC.BR", "REC.CO", "REC.VE", "RE.KR", "RES.IN", "RNRT.TN", "RNS.TN", "RNU.TN", "SA.CR", "SARK.GG", "SC.CN", "SCH.GG", "SCH.JE", "SCHOOL.FJ", "SCHOOL.ZA", "SCH.UK", "SCI.EG", "SH.CN", "SK.CA", "SLD.PA", "SN.CN", "STORE.CO", "STORE.VE", "SX.CN", "TEC.VE", "TELEMEMO.AU", "TJ.CN", "TM.HU", "TMP.BR", "TM.ZA", "TOURISM.TN", "TW.CN", "WEB.CO", "WEB.DO", "WEB.VE", "WEB.ZA", "XJ.CN", "XZ.CN", "YK.CA", "YN.CN", "ZJ.CN" }; if ( ! s_isInitialized ) { // set up the hash table if ( ! s_table.set ( 8 , 0, sizeof(s_tlds)*2,NULL,0,false,0, "tldtbl") ) return log("build: Could not init table of TLDs."); // now add in all the stop words int32_t n = (int32_t)sizeof(s_tlds)/ sizeof(char *); for ( int32_t i = 0 ; i < n ; i++ ) { const char *d = s_tlds[i]; int32_t dlen = gbstrlen ( d ); int64_t dh = hash64Lower_a ( d , dlen ); if ( ! 
s_table.addKey (&dh,NULL) ) return log("build: dom table failed"); } s_isInitialized = true; } int64_t h = hash64Lower_a ( tld , tldLen ); // gbstrlen(tld)); return s_table.isInTable ( &h );//getScoreFromTermId ( h ); }
void startSpidering ( ) { // url class for parsing/normalizing url Url u; // count total urls done static long long s_startTime = 0; // set startTime if ( s_startTime == 0 ) s_startTime = gettimeofdayInMilliseconds(); // get time now long long now = gettimeofdayInMilliseconds(); // elapsed time to do all urls double took = (double)(now - s_startTime) / 1000.0 ; // log this every 20 urls if ( s_printIt && s_total > 0 && ( s_total % 20 ) == 0 ) { logf(LOG_INFO,"did %li urls in %f seconds. %f urls per second." " threads now = %li.", s_total , took , ((double)s_total) / took, s_launched); s_printIt = false; } // did we wait long enough? if ( now - s_lastTime < s_wait ) return; s_lastTime = now; // . use HttpServer.getDoc() to fetch it // . fetch X at a time while ( (s_server || s_p < s_pend) && s_launched < s_maxNumThreads ) { // clear any error g_errno = 0; //append s_append to the url char url[MAX_URL_LEN]; char *p = url; char *pend = url + MAX_URL_LEN; char *t = NULL; if(s_server) { long len = gbstrlen(s_server); memcpy ( p, s_server, len); p += len; p += getRandomWords(p, pend, s_numRandWords); long appendLen = gbstrlen(s_append); if ( p + appendLen < pend ) { memcpy ( p, s_append, gbstrlen(s_append) ); p += gbstrlen(s_append); } *p++ = '\0'; u.set ( url , p - url); t = g_mem.strdup(url, "saved url"); } else { memcpy ( p, s_p, gbstrlen(s_p)); p += gbstrlen ( s_p ); if ( gbstrlen(s_p) + gbstrlen(s_append) < MAX_URL_LEN ) memcpy ( p, s_append, gbstrlen(s_append) ); p += gbstrlen(s_append); //null end *p ='\0'; // make into a url class u.set ( url , gbstrlen(url) ); // set port if port switch is true //if ( s_portSwitch ) { // long r = rand() % 32; // u.setPort ( 8000 + r ); //} // save s_p t = s_p; // skip to next url s_p += gbstrlen ( s_p ) + 1; } // count it s_launched++; // get it bool status = g_httpServer.getDoc ( &u , // url 0 , // offset -1 , // size 0 , // ifModifiedSince (void *)t , // state gotDocWrapper, // callback 20*1000, // timeout 0, // proxy ip 0, // 
proxy port 30*1024*1024, //maxLen 30*1024*1024);//maxOtherLen // continue if it blocked if ( ! status ) continue; // otherwise, got it right away s_launched--; // log msg log("got doc1 %s: %s", u.getUrl() , mstrerror(g_errno) ); // we gotta wait break; } // bail if not done yet //if ( s_launched > 0 ) return; if ( s_server || s_p < s_pend ) return; // otherwise, we're all done logf(LOG_INFO,"blaster: did %li urls in %f seconds. %f urls per " "second.", s_total , took , ((double)s_total) / took ); // exit now exit ( 0 ); }
bool sendReply ( void *state ) { // get the state properly Msg7 *msg7= (Msg7 *) state; GigablastRequest *gr = &msg7->m_gr; // extract info from state TcpSocket *sock = gr->m_socket; XmlDoc *xd = &msg7->m_xd; // log it //if ( msg7->m_url[0] ) xd->logIt(); // msg7 has the docid for what we injected, iff g_errno is not set //long long docId = msg7->m_msg7.m_docId; //long hostId = msg7->m_msg7.m_hostId; long long docId = xd->m_docId; long hostId = 0;//msg7->m_msg7.m_hostId; // set g_errno to index code if ( xd->m_indexCodeValid && xd->m_indexCode && ! g_errno ) g_errno = xd->m_indexCode; char format = gr->m_hr.getReplyFormat(); // no url parm? if ( ! g_errno && ! gr->m_url && format != FORMAT_HTML ) g_errno = EMISSINGINPUT; if ( g_errno && g_errno != EDOCUNCHANGED ) { long save = g_errno; mdelete ( msg7, sizeof(Msg7) , "PageInject" ); delete (msg7); g_errno = save; char *msg = mstrerror(g_errno); return g_httpServer.sendErrorReply(sock,save,msg,NULL); } char abuf[320]; SafeBuf am(abuf,320,0,false); am.setLabel("injbuf"); char *ct = NULL; // a success reply, include docid and url i guess if ( format == FORMAT_XML ) { am.safePrintf("<response>\n"); am.safePrintf("\t<statusCode>%li</statusCode>\n", (long)g_errno); am.safePrintf("\t<statusMsg><![CDATA["); am.cdataEncode(mstrerror(g_errno)); am.safePrintf("]]></statusMsg>\n"); am.safePrintf("\t<docId>%lli</docId>\n",xd->m_docId); if ( gr->m_getSections ) { SafeBuf *secBuf = xd->getInlineSectionVotingBuf(); am.safePrintf("\t<htmlSrc><![CDATA["); if ( secBuf->length() ) am.cdataEncode(secBuf->getBufStart()); am.safePrintf("]]></htmlSrc>\n"); } am.safePrintf("</response>\n"); ct = "text/xml"; } if ( format == FORMAT_JSON ) { am.safePrintf("{\"response\":{\n"); am.safePrintf("\t\"statusCode\":%li,\n",(long)g_errno); am.safePrintf("\t\"statusMsg\":\""); am.jsonEncode(mstrerror(g_errno)); am.safePrintf("\",\n"); am.safePrintf("\t\"docId\":%lli,\n",xd->m_docId); if ( gr->m_getSections ) { SafeBuf *secBuf = 
xd->getInlineSectionVotingBuf(); am.safePrintf("\t\"htmlSrc\":\""); if ( secBuf->length() ) am.jsonEncode(secBuf->getBufStart()); am.safePrintf("\",\n"); } // subtract ",\n" am.m_length -= 2; am.safePrintf("\n}\n}\n"); ct = "application/json"; } if ( format == FORMAT_XML || format == FORMAT_JSON ) { mdelete ( msg7, sizeof(Msg7) , "PageInject" ); delete (msg7); return g_httpServer.sendDynamicPage(sock, am.getBufStart(), am.length(), 0, false, ct ); } // // debug // /* // now get the meta list, in the process it will print out a // bunch of junk into msg7->m_pbuf if ( xd->m_docId ) { char *metalist = xd->getMetaList ( 1,1,1,1,1,1 ); if ( ! metalist || metalist==(void *)-1){char *xx=NULL;*xx=0;} // print it out SafeBuf *pbuf = &msg7->m_sbuf; xd->printDoc( pbuf ); bool status = g_httpServer.sendDynamicPage( msg7->m_socket , pbuf->getBufStart(), pbuf->length() , -1, //cachtime false ,//postreply? NULL, //ctype -1 , //httpstatus NULL,//cookie "utf-8"); // delete the state now mdelete ( st , sizeof(Msg7) , "PageInject" ); delete (st); // return the status return status; } */ // // end debug // char *url = gr->m_url; // . if we're talking w/ a robot he doesn't care about this crap // . send him back the error code (0 means success) if ( url && gr->m_shortReply ) { char buf[1024*32]; char *p = buf; // return docid and hostid if ( ! 
g_errno ) p += sprintf ( p , "0,docId=%lli,hostId=%li," , docId , hostId ); // print error number here else p += sprintf ( p , "%li,0,0,", (long)g_errno ); // print error msg out, too or "Success" p += sprintf ( p , "%s", mstrerror(g_errno)); mdelete ( msg7, sizeof(Msg7) , "PageInject" ); delete (msg7); return g_httpServer.sendDynamicPage ( sock,buf, gbstrlen(buf) , -1/*cachetime*/); } SafeBuf sb; // print admin bar g_pages.printAdminTop ( &sb, sock , &gr->m_hr ); // print a response msg if rendering the page after a submission if ( g_errno ) sb.safePrintf ( "<center>Error injecting url: <b>%s[%i]</b>" "</center>", mstrerror(g_errno) , g_errno); else if ( (gr->m_url&&gr->m_url[0]) || (gr->m_queryToScrape&&gr->m_queryToScrape[0]) ) sb.safePrintf ( "<center><b>Sucessfully injected %s" "</center><br>" , xd->m_firstUrl.m_url ); // print the table of injection parms g_parms.printParmTable ( &sb , sock , &gr->m_hr ); // clear g_errno, if any, so our reply send goes through g_errno = 0; // calculate buffer length //long bufLen = p - buf; // nuke state mdelete ( msg7, sizeof(Msg7) , "PageInject" ); delete (msg7); // . send this page // . encapsulates in html header and tail // . make a Mime // . i thought we need -2 for cacheTime, but i guess not return g_httpServer.sendDynamicPage (sock, sb.getBufStart(), sb.length(), -1/*cachetime*/); }
// . MDW: TODO: bring this back when we have a subdir for each collection // . add a new rec // . returns false and sets g_errno on error // . use a collnum_t of -1 if it is new bool Collectiondb::addRec ( char *coll , char *cpc , long cpclen , bool isNew , collnum_t collnum , bool isDump , bool saveIt ) { // sanity check if ( ( isNew && collnum >= 0) || (!isNew && collnum < 0) ) { log(LOG_LOGIC,"admin: Bad parms passed to addRec."); char *xx = NULL; *xx = 0; } // ensure coll name is legit char *p = coll; for ( ; *p ; p++ ) { if ( is_alnum_a(*p) ) continue; if ( *p == '-' ) continue; break; } if ( *p ) { g_errno = EBADENGINEER; log("admin: \"%s\" is a malformed collection name because it " "contains the '%c' character.",coll,*p); return false; } // . scan for holes // . i is also known as the collection id long i ; if ( collnum >= 0 ) i = (long)collnum; else for ( i = 0 ; i < m_numRecs ; i++ ) if ( ! m_recs[i] ) break; // ceiling? if ( i >= MAX_COLLS ) { g_errno = ENOBUFS; return log("admin: Limit of %li collection reached. " "Collection not created.",(long)MAX_COLLS); } // if empty... bail, no longer accepted, use "main" if ( ! coll || !coll[0] ) { g_errno = EBADENGINEER; return log("admin: Trying to create a new collection " "but no collection name provided. 
Use the \"c\" " "cgi parameter to specify it."); } // or if too big if ( gbstrlen(coll) > MAX_COLL_LEN ) { g_errno = ENOBUFS; return log("admin: Trying to create a new collection " "whose name \"%s\" of %i chars is longer than the " "max of %li chars.",coll,gbstrlen(coll), (long)MAX_COLL_LEN); } // ensure does not already exist in memory if ( getCollnum ( coll ) >= 0 ) { g_errno = EEXIST; return log("admin: Trying to create collection \"%s\" but " "already exists in memory.",coll); } // MDW: ensure not created on disk since time of last load char dname[512]; sprintf(dname, "%scoll.%s.%li/",g_hostdb.m_dir,coll,i); if ( isNew && opendir ( dname ) ) { g_errno = EEXIST; return log("admin: Trying to create collection %s but " "directory %s already exists on disk.",coll,dname); } //char fname[512]; // ending '/' is ALWAYS included in g_hostdb.m_dir //sprintf ( fname , "%s%li.%s.conf",g_hostdb.m_dir,i,coll); //File f; //f.set ( fname ); //if ( f.doesExist() ) { // g_errno = EEXIST; // return log("admin: Trying to create collection \"%s\" but " // "file %s already exists on disk.",coll,fname); //} // create the record in memory m_recs[i] = new (CollectionRec); if ( ! m_recs[i] ) return log("admin: Failed to allocated %li bytes for new " "collection record for \"%s\".", (long)sizeof(CollectionRec),coll); mnew ( m_recs[i] , sizeof(CollectionRec) , "CollectionRec" ); // get copy collection CollectionRec *cpcrec = NULL; if ( cpc && cpc[0] ) cpcrec = getRec ( cpc , cpclen ); if ( cpc && cpc[0] && ! cpcrec ) log("admin: Collection \"%s\" to copy config from does not " "exist.",cpc); // get the default.conf from working dir if there g_parms.setToDefault( (char *)m_recs[i] ); if ( isNew ) { // the default conf file char tmp1[1024]; sprintf ( tmp1 , "%sdefault.conf" , g_hostdb.m_dir ); // . set our parms from the file. // . 
accepts OBJ_COLLECTIONREC or OBJ_CONF g_parms.setFromFile ( m_recs[i] , NULL , tmp1 ); } // this will override all if ( cpcrec ) { // copy it, but not the timedb hashtable, etc. long size = (char *)&(cpcrec->m_END_COPY) - (char *)cpcrec; // JAB: bad memcpy - no donut! // this is not how objects are supposed to be copied!!! memcpy ( m_recs[i] , cpcrec , size);//sizeof(CollectionRec) ); // perform the cleanup that a copy constructor might do... //for (int rx = 0; rx < MAX_FILTERS; rx++) // m_recs[i]->m_pRegExParser[rx] = NULL; // don't NUKE the filters! // m_recs[i]->m_numRegExs = 0; // OK - done with cleaning up... // but never copy over the collection hostname, that is // problematic m_recs[i]->m_collectionHostname [0] = '\0'; m_recs[i]->m_collectionHostname1[0] = '\0'; m_recs[i]->m_collectionHostname2[0] = '\0'; } // set coll id and coll name for coll id #i strcpy ( m_recs[i]->m_coll , coll ); m_recs[i]->m_collLen = gbstrlen ( coll ); m_recs[i]->m_collnum = i; // point to this, so Rdb and RdbBase can reference it coll = m_recs[i]->m_coll; // . if has no password or ip add the default password, footbar // . no, just don't have any password, just use the 127.0.0.1 ip // that is the loopback /* if ( m_recs[i]->m_numAdminIps == 0 && m_recs[i]->m_numAdminPwds == 0 ) { m_recs[i]->m_numAdminIps = 1; m_recs[i]->m_adminIps[0] = atoip("0.0.0.0",7); //strcpy ( m_recs[i]->m_adminPwds[0] , "footbar23" ); //m_recs[i]->m_numAdminPwds = 1; //log("admin: Using default password for new collection of " // "'footbar23'."); } */ // collection name HACK for backwards compatibility //if ( strcmp ( coll , "main" ) == 0 ) { // m_recs[i]->m_coll[0] = '\0'; // m_recs[i]->m_collLen = 0; // //coll[0] = '\0'; //} // MDW: create the new directory if ( isNew ) { retry22: if ( ::mkdir ( dname , S_IRUSR | S_IWUSR | S_IXUSR | S_IRGRP | S_IWGRP | S_IXGRP | S_IROTH | S_IXOTH ) ) { // valgrind? 
if ( errno == EINTR ) goto retry22; g_errno = errno; mdelete ( m_recs[i] , sizeof(CollectionRec) , "CollectionRec" ); delete ( m_recs[i]); m_recs[i] = NULL; return log("admin: Creating directory %s had error: " "%s.", dname,mstrerror(g_errno)); } // save it into this dir... might fail! if ( ! m_recs[i]->save() ) { mdelete ( m_recs[i] , sizeof(CollectionRec) , "CollectionRec" ); delete ( m_recs[i]); m_recs[i] = NULL; return log("admin: Failed to save file %s: %s", dname,mstrerror(g_errno)); } } // load if not new if ( ! isNew && ! m_recs[i]->load ( coll , i ) ) { mdelete ( m_recs[i], sizeof(CollectionRec), "CollectionRec" ); delete ( m_recs[i]); m_recs[i] = NULL; return log("admin: Failed to load conf for collection " "\"%s\".",coll); } // mark it as needing to be saved instead m_recs[i]->m_needsSave = false; // force this to off for now //m_recs[i]->m_queryExpansion = false; // reserve it if ( i >= m_numRecs ) m_numRecs = i + 1; // count it m_numRecsUsed++; // update the time updateTime(); // if we are doing a dump from the command line, skip this stuff if ( isDump ) return true; bool verify = true; if(isNew) verify = false; // tell rdbs to add one, too //if ( ! g_indexdb.addColl ( coll, verify ) ) goto hadError; if ( ! g_posdb.addColl ( coll, verify ) ) goto hadError; //if ( ! g_datedb.addColl ( coll, verify ) ) goto hadError; if ( ! g_titledb.addColl ( coll, verify ) ) goto hadError; //if ( ! g_revdb.addColl ( coll, verify ) ) goto hadError; //if ( ! g_sectiondb.addColl ( coll, verify ) ) goto hadError; if ( ! g_tagdb.addColl ( coll, verify ) ) goto hadError; //if ( ! g_catdb.addColl ( coll, verify ) ) goto hadError; //if ( ! g_checksumdb.addColl ( coll, verify ) ) goto hadError; if ( ! g_spiderdb.addColl ( coll, verify ) ) goto hadError; if ( ! g_doledb.addColl ( coll, verify ) ) goto hadError; //if ( ! g_tfndb.addColl ( coll, verify ) ) goto hadError; if ( ! g_clusterdb.addColl ( coll, verify ) ) goto hadError; if ( ! 
g_linkdb.addColl ( coll, verify ) ) goto hadError; // debug message log ( LOG_INFO, "admin: added collection \"%s\" (%li).",coll,(long)i); // tell SpiderCache about this collection, it will create a // SpiderCollection class for it. //g_spiderCache.reset1(); // . make it set is CollectionRec::m_sortByDateTable now // . everyone else uses setTimeOfDayInMilliseconds() in fctypes.cpp // to call this function once their clock is synced with host #0 //if ( g_hostdb.m_initialized && g_hostdb.m_hostId == 0 ) // initSortByDateTable(coll); //else if ( g_hostdb.m_initialized && isClockInSync() ) // initSortByDateTable(coll); // . do it for all regard-less // . once clock is in sync with host #0 we may do it again! //if ( g_hostdb.m_initialized ) // initSortByDateTable(coll); // success return true; hadError: log("admin: Had error adding new collection: %s.",mstrerror(g_errno)); // do not delete it, might have failed to add because not enough // memory to read in the tree *-saved.dat file on disk!! and if // you delete in then core the *-saved.dat file gets overwritten!!! return false; /* g_indexdb.getRdb()->delColl ( coll ); g_datedb.getRdb()->delColl ( coll ); g_timedb.getRdb()->delColl ( coll ); g_titledb.getRdb()->delColl ( coll ); g_revdb.getRdb()->delColl ( coll ); g_sectiondb.getRdb()->delColl ( coll ); g_placedb.getRdb()->delColl ( coll ); g_tagdb.getRdb()->delColl ( coll ); //g_catdb.getRdb()->delColl ( coll ); //g_checksumdb.getRdb()->delColl ( coll ); g_spiderdb.getRdb()->delColl ( coll ); g_doledb.getRdb()->delColl ( coll ); g_tfndb.getRdb()->delColl ( coll ); g_clusterdb.getRdb()->delColl ( coll ); g_linkdb.getRdb()->delColl ( coll ); deleteRec ( coll ); return false; */ }