// . list of types is on: http://www.duke.edu/websrv/file-extensions.html
// . i copied it to the bottom of this file though
const char *HttpMime::getContentTypeFromExtension ( const char *ext ) {
	// assume text/html if no extension provided
	if ( ! ext || ! ext[0] ) return "text/html";
	// get hash for table look up
	int32_t key = hash32n ( ext );
	char **pp = (char **)s_mimeTable.getValue ( &key );
	// if not found in table, assume text/html
	if ( ! pp ) return "text/html";
	return *pp;
}
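// Usage sketch (hypothetical helper, not part of the original source):
// shows how the lookup above is typically consumed when building an HTTP
// response header. Assumes s_mimeTable was populated at startup, that
// <string.h> is included for strrchr(), and that a SafeBuf collects the
// header.
static void appendContentTypeHeader ( SafeBuf *sb , const char *filename ) {
	// find the last '.' in the filename; no dot means no extension
	const char *dot = strrchr ( filename , '.' );
	HttpMime mime;
	// falls back to "text/html" for a missing or unknown extension
	const char *ct = mime.getContentTypeFromExtension(dot ? dot+1 : NULL);
	sb->safePrintf ( "Content-Type: %s\r\n" , ct );
}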
const char *extensionToContentTypeStr2 ( const char *ext , int32_t elen ) {
	// return NULL if no extension provided
	if ( ! ext || ! ext[0] ) return NULL;
	if ( elen <= 0 ) return NULL;
	// get hash for table look up
	int32_t key = hash32 ( ext , elen );
	char **pp = (char **)s_mimeTable.getValue ( &key );
	if ( ! pp ) return NULL;
	return *pp;
}
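// Usage sketch (hypothetical): the length-delimited variant above lets a
// caller classify an extension that sits inside a larger buffer, like a
// URL path, without copying or NUL-terminating it first.
static const char *contentTypeFromPath ( const char *path , int32_t plen ) {
	// scan back from the end for the final '.' in the last component
	int32_t i = plen - 1;
	while ( i >= 0 && path[i] != '.' && path[i] != '/' ) i--;
	if ( i < 0 || path[i] != '.' ) return NULL;
	// the extension is the (unterminated) run after the '.'
	return extensionToContentTypeStr2 ( path + i + 1 , plen - (i+1) );
}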
iconv_t gbiconv_open ( char *tocode , char *fromcode ) {
	// hash the to/from charset names together to key the cache
	uint32_t hash1 = hash32Lower_a ( tocode , gbstrlen(tocode) , 0 );
	uint32_t hash2 = hash32Lower_a ( fromcode , gbstrlen(fromcode) , 0 );
	uint32_t hash  = hash32h ( hash1 , hash2 );
	g_errno = 0;
	iconv_t *convp = (iconv_t *)s_convTable.getValue ( &hash );
	iconv_t  conv  = NULL;
	if ( convp ) conv = *convp;
	if ( ! conv ) {
		conv = iconv_open ( tocode , fromcode );
		if ( conv == (iconv_t)-1 ) {
			log(LOG_WARN, "uni: failed to open converter for "
			    "%s to %s: %s (%d)",
			    fromcode, tocode, strerror(errno), errno);
			g_errno = errno;
			if ( errno == EINVAL ) g_errno = EBADCHARSET;
			return conv;
		}
		// add mem to the table to keep track of the allocation
		g_mem.addMem ( (void *)conv , 52 , "iconv" , 1 );
		// cache the converter
		s_convTable.addKey ( &hash , &conv );
	}
	else {
		// reset the cached converter to its initial shift state
		char  *dummy  = NULL;
		size_t dummy2 = 0; // JAB: warning abatement
		iconv ( conv , NULL , NULL , &dummy , &dummy2 );
	}
	return conv;
}
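// Usage sketch (hypothetical caller): converts a GBK buffer to UTF-8
// through the converter cache above. gbiconv_open() hands back the same
// iconv_t for a repeated to/from pair, so callers must not iconv_close()
// it themselves; the cache owns it. Partial-conversion and E2BIG handling
// are omitted for brevity.
static int32_t gbkToUtf8 ( char *dst , size_t dstSize ,
			   char *src , size_t srcSize ) {
	iconv_t cd = gbiconv_open ( (char *)"UTF-8" , (char *)"GBK" );
	if ( cd == (iconv_t)-1 || ! cd ) return -1;
	char   *in      = src;
	size_t  inLeft  = srcSize;
	char   *out     = dst;
	size_t  outLeft = dstSize;
	// iconv() advances the pointers and decrements the "left" counts
	if ( iconv ( cd , &in , &inLeft , &out , &outLeft ) == (size_t)-1 )
		return -1;
	// return the number of bytes written to dst
	return (int32_t)( dstSize - outLeft );
}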
nodeid_t getTagId ( char *s , NodeType **retp ) {
	// init table?
	static bool s_init = false;
	static HashTableX s_ht;
	static char s_buf[10000];
	if ( ! s_init ) {
		s_init = true;
		// niceness = 0
		s_ht.set ( 4 , 4 , 1024 , s_buf , 10000 , false , 0 ,
			   "tagids" );
		// how many NodeTypes do we have in g_nodes?
		static int32_t nn = sizeof(g_nodes) / sizeof(NodeType);
		// set the hash table
		for ( int32_t i = 0 ; i < nn ; i++ ) {
			char *name = g_nodes[i].m_nodeName;
			int32_t nlen = gbstrlen(name);
			int64_t h = hash64Upper_a ( name , nlen , 0LL );
			NodeType *nt = &g_nodes[i];
			if ( ! s_ht.addKey ( &h , &nt ) ) {
				char *xx=NULL;*xx=0; }
		}
		// sanity
		if ( s_ht.m_numSlots != 1024 ) { char *xx=NULL;*xx=0; }
		// sanity test
		nodeid_t tt = getTagId ( "br" );
		if ( tt != TAG_BR ) { char *xx=NULL;*xx=0; }
	}
	// find the end of the tag name. hyphens are ok to be in the name.
	// facebook uses underscores like <start_time>
	char *e = s;
	for ( ; *e && (is_alnum_a(*e) || *e=='-' || *e=='_') ; e++ );
	// hash it for lookup
	int64_t h = hash64Upper_a ( s , e - s , 0 );
	// look it up
	NodeType **ntp = (NodeType **)s_ht.getValue ( &h );
	// assume none
	if ( retp ) *retp = NULL;
	// none?
	if ( ! ntp ) return 0;
	// got one
	if ( retp ) *retp = *ntp;
	// get the id otherwise
	return (*ntp)->m_nodeId;
}
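// Usage sketch (hypothetical caller): classify the tag whose name starts
// at "s", which should point just past the '<' (and any '/'). Only fields
// already referenced above (m_nodeId via the return value, the retp out
// parameter) are used here.
static bool isBrTag ( char *s ) {
	NodeType *nt = NULL;
	nodeid_t id = getTagId ( s , &nt );
	// unknown tags come back as id 0 with nt left NULL
	if ( ! nt ) return false;
	return ( id == TAG_BR );
}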
// a host is asking us (host #0) what proxy to use?
static void handleRequest54 ( UdpSlot *udpSlot , int32_t niceness ) {

	char *request       = udpSlot->m_readBuf;
	int32_t requestSize = udpSlot->m_readBufSize;

	// we now use the top part of the Msg13Request as the ProxyRequest
	Msg13Request *preq = (Msg13Request *)request;

	// sanity check
	if ( requestSize != preq->getProxyRequestSize() ) {
		log("db: Got bad request 0x54 size of %" PRId32" bytes. bad",
		    requestSize );
		g_udpServer.sendErrorReply ( udpSlot , EBADREQUESTSIZE );
		return;
	}

	// is the request telling us it is done downloading through a proxy?
	if ( preq->m_opCode == OP_RETPROXY ) {
		returnProxy ( preq , udpSlot );
		return;
	}

	// if the sender is asking for a new proxy and wants us to ban
	// the previous proxy we sent for this urlIp...
	if ( preq->m_banProxyIp ) {
		// don't core if this misses the sanity check. it seems we
		// don't always NULLify these or something. these must match:
		if ( preq->m_banProxyIp   != preq->m_proxyIp ||
		     preq->m_banProxyPort != preq->m_proxyPort ) {
			log("db: proxy: banproxyip != proxyip. mismatch!");
			g_udpServer.sendErrorReply ( udpSlot , EBADENGINEER );
			return;
		}
		// this will "return" the banned proxy
		returnProxy ( preq , NULL );
		// now add it to the banned table
		int64_t uip = preq->m_urlIp;
		int64_t pip = preq->m_banProxyIp;
		int64_t h64 = hash64h ( uip , pip );
		if ( ! s_proxyBannedTable.isInTable ( &h64 ) ) {
			s_proxyBannedTable.addKey ( &h64 );
			// for stats counting. each proxy ip maps to the #
			// of unique website IPs that have banned it.
			s_banCountTable.addTerm32 ( (uint32_t)pip );
		}
	}

	// shortcut
	int32_t urlIp = preq->m_urlIp;

	// send to a proxy that is up and has the least amount of
	// LoadBuckets with this urlIp. if tied, go to the least loaded.

	// clear counts for this url ip for scoring the best proxy to use
	for ( int32_t i = 0 ; i < s_iptab.getNumSlots() ; i++ ) {
		// skip empty slots
		if ( ! s_iptab.m_flags[i] ) continue;
		SpiderProxy *sp = (SpiderProxy *)s_iptab.getValueFromSlot(i);
		sp->m_countForThisIp        = 0;
		sp->m_lastTimeUsedForThisIp = 0LL;
	}

	// this table maps a url's current IP to possibly MULTIPLE slots,
	// which tell us what proxy is downloading a page from that IP.
	// so we can try to find a proxy that is not downloading a url from
	// this IP currently, or hasn't been for the longest time...
	int32_t hslot = s_loadTable.getSlot ( &urlIp );
	// scan all proxies that have this urlIp outstanding
	for ( int32_t i = hslot ; i >= 0 ;
	      i = s_loadTable.getNextSlot(i,&urlIp) ) {
		// get the bucket
		LoadBucket *lb;
		lb = (LoadBucket *)s_loadTable.getValueFromSlot(i);
		// get the spider proxy this load point was for
		uint64_t key = (uint32_t)lb->m_proxyIp;
		key <<= 16;
		key |= (uint16_t)lb->m_proxyPort;
		SpiderProxy *sp = (SpiderProxy *)s_iptab.getValue(&key);
		// must be there unless the user removed it from the list
		if ( ! sp ) continue;
		// count it up
		if ( lb->m_downloadEndTimeMS == 0LL ) sp->m_countForThisIp++;
		// set the last time used to the most recent time this
		// proxy finished a download from this ip
		if ( lb->m_downloadEndTimeMS &&
		     lb->m_downloadEndTimeMS > sp->m_lastTimeUsedForThisIp )
			sp->m_lastTimeUsedForThisIp = lb->m_downloadEndTimeMS;
	}

	// first try to get a spider proxy that is not "dead"
	bool skipDead = true;

	int32_t numBannedProxies     = 0;
	int32_t aliveProxyCandidates = 0;

 redo:
	// get the min of the counts
	int32_t minCount = 999999;
	for ( int32_t i = 0 ; i < s_iptab.getNumSlots() ; i++ ) {
		// skip empty slots
		if ( ! s_iptab.m_flags[i] ) continue;
		// get the spider proxy
		SpiderProxy *sp = (SpiderProxy *)s_iptab.getValueFromSlot(i);
		// if this proxy was banned by the url's ip... skip it. it
		// is not a candidate...
		if ( skipDead ) {
			int64_t uip = preq->m_urlIp;
			int64_t pip = sp->m_ip;
			int64_t h64 = hash64h ( uip , pip );
			if ( s_proxyBannedTable.isInTable ( &h64 ) ) {
				numBannedProxies++;
				continue;
			}
		}
		// if it failed the last test, skip it
		if ( skipDead && sp->m_lastDownloadError ) continue;
		if ( skipDead ) aliveProxyCandidates++;
		if ( sp->m_countForThisIp >= minCount ) continue;
		minCount = sp->m_countForThisIp;
	}

	// all dead? then get the best dead one
	if ( minCount == 999999 ) {
		skipDead = false;
		goto redo;
	}

	// . we only use one proxy if none are banned by this IP
	// . when that gets banned, we will use the next 2 proxies with a
	//   higher backoff/crawlDelay, etc. each time a whole tier gets
	//   banned we double the number of proxies we draw from.
	int32_t threshold;
	if      ( numBannedProxies <= 0     ) threshold = 1;
	// if the first proxy gets banned, try the next 2 until both are
	// banned
	else if ( numBannedProxies == 1     ) threshold = 2;
	else if ( numBannedProxies <  1+2   ) threshold = 3-numBannedProxies;
	// if those two got banned, try the next 4 until they get banned
	else if ( numBannedProxies == 3     ) threshold = 4;
	else if ( numBannedProxies <  3+4   ) threshold = 7-numBannedProxies;
	// if those 4 got banned, try the next 8 until they get banned
	else if ( numBannedProxies == 7     ) threshold = 8;
	else if ( numBannedProxies <  7+8   ) threshold = 15-numBannedProxies;
	else if ( numBannedProxies == 15    ) threshold = 16;
	else if ( numBannedProxies <  15+16 ) threshold = 31-numBannedProxies;
	else if ( numBannedProxies == 31    ) threshold = 32;
	else if ( numBannedProxies <  31+32 ) threshold = 63-numBannedProxies;
	else if ( numBannedProxies == 63    ) threshold = 64;
	else if ( numBannedProxies <  63+64 ) threshold =127-numBannedProxies;
	else if ( numBannedProxies == 127   ) threshold = 128;
	else if ( numBannedProxies < 127+128) threshold =255-numBannedProxies;
	else if ( numBannedProxies == 255   ) threshold = 256;
	else if ( numBannedProxies < 255+256) threshold =512-numBannedProxies;
	else if ( numBannedProxies == 511   ) threshold = 512;
	else if ( numBannedProxies < 511+512) threshold=1024-numBannedProxies;
	else                                  threshold = 1024;

	if ( threshold <= 0 ) {
		log("proxy: spiderproxy error in threshold of %" PRId32" "
		    "for banned=%" PRId32,threshold,numBannedProxies);
		threshold = 1;
	}

	// reset minCount so we can take the min over those we check here
	minCount = -1;
	int64_t oldest = 0x7fffffffffffffffLL;
	SpiderProxy *winnersp = NULL;
	int32_t count = 0;
	// start at a random slot based on the url's IP so we don't
	// overload the first proxy
	int32_t start = ((uint32_t)urlIp) % s_iptab.getNumSlots();
	int32_t slotCount = s_iptab.getNumSlots();
	// now find the best proxy with the minCount
	for ( int32_t i = start ; ; i++ ) {
		// scan all slots in the hash table, then stop
		if ( slotCount-- <= 0 ) break;
		// wrap around to zero if we hit the end
		if ( i == s_iptab.getNumSlots() ) i = 0;
		// skip empty slots
		if ( ! s_iptab.m_flags[i] ) continue;
		// get the spider proxy
		SpiderProxy *sp = (SpiderProxy *)s_iptab.getValueFromSlot(i);
		// if it failed the last test, skip it
		if ( skipDead && sp->m_lastDownloadError ) continue;
		// if this proxy was banned by the url's ip... skip it. it
		// is not a candidate...
		if ( skipDead ) {
			int64_t uip = preq->m_urlIp;
			int64_t pip = sp->m_ip;
			int64_t h64 = hash64h ( uip , pip );
			if ( s_proxyBannedTable.isInTable ( &h64 ) ) continue;
		}
		// if some proxies are "alive" then only pick from the
		// first "threshold" proxies that are alive (i.e. still
		// work). that way, when one of those goes dead we will inc
		// the backoff (crawldelay) and a new proxy that we haven't
		// used for this url's IP will take its place. and such new
		// proxies will only have the new backoff count used through
		// them. that way, we don't get ALL of our proxies banned at
		// about the same time since we do somewhat uniform load
		// balancing over them.
		if ( skipDead && count >= threshold ) continue;
		// count the alive/non-banned candidates
		count++;
		// if all hosts were "dead" because they all had
		// m_lastDownloadError set then minCount will be 999999
		// and nobody should continue from this statement:
		if ( sp->m_countForThisIp > minCount && minCount >= 0 )
			continue;
		// then go by the last download time for this ip
		if ( sp->m_countForThisIp == minCount && minCount >= 0 &&
		     sp->m_lastTimeUsedForThisIp >= oldest )
			continue;
		// pick the spider proxy used longest ago
		oldest   = sp->m_lastTimeUsedForThisIp;
		minCount = sp->m_countForThisIp;
		// got a new winner
		winnersp = sp;
	}

	// we must have a winner
	if ( ! winnersp ) { g_process.shutdownAbort(true); }

	int64_t nowms = gettimeofdayInMillisecondsLocal();

	// add a new load bucket then!
	LoadBucket bb;
	bb.m_urlIp = urlIp;
	// the time it started
	bb.m_downloadStartTimeMS = nowms;
	// download has not ended yet
	bb.m_downloadEndTimeMS = 0LL;
	// the host using the proxy
	bb.m_hostId = udpSlot->getHostId();
	// key is this for m_prTable
	bb.m_proxyIp   = winnersp->m_ip;
	bb.m_proxyPort = winnersp->m_port;
	// a new id. we use this to update the downloadEndTime when done.
	static int32_t s_lbid = 0;
	// add it now
	bb.m_id = s_lbid++;
	s_loadTable.addKey ( &urlIp , &bb );

	// winner count update
	winnersp->m_timesUsed++;

	// sanity
	if ( (int32_t)sizeof(ProxyReply) > TMPBUFSIZE ) {
		g_process.shutdownAbort(true); }

	// give the proxy ip/port back to the requester so they can use
	// that to download their url
	ProxyReply *prep = (ProxyReply *)udpSlot->m_tmpBuf;
	prep->m_proxyIp   = winnersp->m_ip;
	prep->m_proxyPort = winnersp->m_port;

	// this is just '\0' if none
	strcpy ( prep->m_usernamePwd , winnersp->m_usernamePwd );

	// do not count the proxy we are returning as "more"
	prep->m_hasMoreProxiesToTry = ( aliveProxyCandidates > 1 );

	// and the loadbucket id, so the requester can tell us it is done
	// downloading through the proxy and we can update the LoadBucket
	// for this transaction (m_lbId)
	prep->m_lbId = bb.m_id;

	// the requester wants to know how many proxies have been banned by
	// the urlIp so it can increase a self-imposed crawl-delay to be
	// more sensitive to the spider policy.
	prep->m_numBannedProxies = numBannedProxies;

	// with dup keys we end up with long chains of crap and lookups take
	// forever. so just flush the whole thing once 2+ minutes have
	// elapsed AND the table has grown past 10000 slots.
	static time_t s_lastTime = 0;
	time_t now = nowms / 1000;
	if ( s_lastTime == 0 ) s_lastTime = now;
	time_t elapsed = now - s_lastTime;
	if ( elapsed > 120 && s_loadTable.getNumSlots() > 10000 ) {
		log("sproxy: flushing %i entries from proxy loadtable that "
		    "have accumulated since %i seconds ago",
		    (int)s_loadTable.m_numSlotsUsed,(int)elapsed);
		s_loadTable.clear();
		// reset the flush timer
		s_lastTime = now;
	}

	int32_t sanityCount = 0;
	// now remove old entries from the load table: entries that have
	// completed and have a download end time more than
	// LOADPOINT_EXPIRE_MS ago. (was 10 mins, lowered to prevent
	// clogging.)
	for ( int32_t i = s_loadTable.getNumSlots() - 1 ; i >= 0 ; i-- ) {
		// skip if empty
		if ( ! s_loadTable.m_flags[i] ) continue;
		// get the bucket
		LoadBucket *pp =
			(LoadBucket *)s_loadTable.getValueFromSlot(i);
		// skip if still active
		if ( pp->m_downloadEndTimeMS == 0LL ) continue;
		// delta t
		int64_t took = nowms - pp->m_downloadEndTimeMS;
		// skip if not expired yet
		if ( took < LOADPOINT_EXPIRE_MS ) continue;
		// 100 at a time so we don't slam the cpu
		if ( sanityCount++ > 100 ) break;
		// ok, it's too old, nuke it to save memory. removeSlot()
		// may bury keys so we could miss analyzing a few entries
		// here, but that's not a big deal.
		s_loadTable.removeSlot(i);
	}

	// send the proxy ip/port/LBid back to the requester
	g_udpServer.sendReply(udpSlot->m_tmpBuf, sizeof(ProxyReply),
			      udpSlot->m_tmpBuf, sizeof(ProxyReply), udpSlot);
}
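// The threshold ladder in handleRequest54() above follows a doubling
// schedule: with 0 banned proxies draw from 1 proxy; once it is banned,
// draw from the next 2; once those are banned, the next 4; and so on up
// to 1024, with the count decaying inside each tier as more proxies get
// banned. Below is a minimal loop-based sketch of that schedule,
// illustrative only and NOT a drop-in replacement: it matches the chain
// exactly through the 128 tier, but the hand-written 256+ tiers are off
// by one from this pattern (512-n and 1024-n where the earlier tiers use
// 2^k-1-n).
static int32_t computeProxyThreshold ( int32_t numBannedProxies ) {
	// the ladder above pins at 1024 once all tiers are exhausted
	if ( numBannedProxies >= 1023 ) return 1024;
	int32_t tierSize = 1; // proxies available in the current tier
	int32_t tierEnd  = 1; // ban count at which the next tier begins
	// tier boundaries fall at 1, 3, 7, 15, ... (2^k - 1)
	while ( tierEnd <= numBannedProxies ) {
		tierSize <<= 1;       // 2, 4, 8, ...
		tierEnd  += tierSize; // 3, 7, 15, ...
	}
	// a full tier is available at the tier start, then the count
	// decays as bans accumulate within the tier
	int32_t t = tierEnd - numBannedProxies;
	if ( t < 1 ) t = 1;
	return t;
}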
void processReply ( char *reply , long replyLen ) {

	// store our current reply
	SafeBuf fb2;
	fb2.safeMemcpy ( reply , replyLen );
	fb2.nullTerm();

	// log that we got the reply
	log("qa: got reply(len=%li)(errno=%s)=%s",
	    replyLen,mstrerror(g_errno),reply);

	char *content    = NULL;
	long  contentLen = 0;

	// get the mime
	if ( reply ) {
		HttpMime mime;
		mime.set ( reply , replyLen , NULL );
		// only hash the content since the mime has a timestamp in it
		content    = mime.getContent();
		contentLen = mime.getContentLen();
		// sanity: content must be NULL-terminated
		if ( content && contentLen > 0 && content[contentLen] ) {
			char *xx=NULL;*xx=0; }
	}

	if ( ! content ) {
		content    = "";
		contentLen = 0;
	}

	s_content = content;

	// mark out the volatile fields so the checksum is stable.
	// take out <currentTimeUTC> and <responseTimeMS>
	markOut ( content , "<currentTimeUTC>");
	markOut ( content , "<responseTimeMS>");
	// until i figure this one out, take it out
	markOut ( content , "<docsInCollection>");
	// until i figure this one out, take it out
	markOut ( content , "<hits>");
	// for those links in the html pages
	markOut ( content , "rand64=");
	// for json
	markOut ( content , "\"currentTimeUTC\":" );
	markOut ( content , "\"responseTimeMS\":");
	markOut ( content , "\"docsInCollection\":");
	// for xml
	markOut ( content , "<currentTimeUTC>" );
	markOut ( content , "<responseTimeMS>");
	markOut ( content , "<docsInCollection>");
	// indexed 1 day ago
	markOut ( content , "indexed:");
	// modified 1 day ago
	markOut ( content , "modified:");
	// s_gigabitCount... it is a perpetually incrementing static
	// counter in PageResults.cpp
	markOut ( content , "ccc(");
	markOut ( content , "id=fd");
	markOut ( content , "id=sd");
	// for some reason the term freq seems to change a little in
	// the scoring table
	markOut ( content , "id=tf");

	// make the checksum. we ignore back-to-back spaces so this
	// hash works for <docsInCollection>10 vs <docsInCollection>9
	long contentCRC = 0;
	if ( content ) contentCRC = qa_hash32 ( content );

	// note it
	log("qa: got contentCRC of %lu",contentCRC);

	// this means the caller does not care about the response
	if ( ! s_checkCRC ) return;

	// hash the url
	long urlHash32 = hash32n ( s_url.getUrl() );
	// combine with the test name since two tests may use the same url
	long nameHash = hash32n ( s_qt->m_testName );
	// combine together
	urlHash32 = hash32h ( nameHash , urlHash32 );

	static bool s_init = false;
	if ( ! s_init ) {
		s_init = true;
		s_ht.set(4,4,1024,NULL,0,false,0,"qaht");
		// make the qa dir if it doesn't exist yet
		char dir[1024];
		snprintf(dir,1000,"%sqa",g_hostdb.m_dir);
		long status = ::mkdir ( dir ,
					S_IRUSR | S_IWUSR | S_IXUSR |
					S_IRGRP | S_IWGRP | S_IXGRP |
					S_IROTH | S_IXOTH );
		if ( status == -1 && errno != EEXIST && errno )
			log("qa: Failed to make directory %s: %s.",
			    dir,mstrerror(errno));
		// try to load the crc table from disk
		SafeBuf fn;
		fn.safePrintf("%s/qa/",g_hostdb.m_dir);
		log("qa: loading crctable.dat");
		s_ht.load ( fn.getBufStart() , "crctable.dat" );
	}

	// save the reply contents under its crc
	char fn2[1024];
	sprintf(fn2,"%sqa/content.%lu",g_hostdb.m_dir,contentCRC);
	fb2.save ( fn2 );

	// look up in the hashtable to see what the reply crc should be
	long *val = (long *)s_ht.getValue ( &urlHash32 );

	// just return if the same
	if ( val && contentCRC == *val ) {
		g_qaOutput.safePrintf("<b style=color:green;>"
				      "passed test</b><br>%s : "
				      "<a href=%s>%s</a> (urlhash=%lu "
				      "crc=<a href=/qa/content.%lu>"
				      "%lu</a>)<br>"
				      "<hr>",
				      s_qt->m_testName,
				      s_url.getUrl(),
				      s_url.getUrl(),
				      urlHash32,
				      contentCRC,
				      contentCRC);
		return;
	}

	if ( ! val ) {
		// first time seeing this test/url. add it so we know.
		s_ht.addKey ( &urlHash32 , &contentCRC );
		g_qaOutput.safePrintf("<b style=color:blue;>"
				      "first time testing</b><br>%s : "
				      "<a href=%s>%s</a> "
				      "(urlhash=%lu "
				      "crc=<a href=/qa/content.%lu>%lu"
				      "</a>)<br>"
				      "<hr>",
				      s_qt->m_testName,
				      s_url.getUrl(),
				      s_url.getUrl(),
				      urlHash32,
				      contentCRC,
				      contentCRC);
		return;
	}

	log("qa: crc changed for url %s from %li to %li",
	    s_url.getUrl(),*val,contentCRC);

	// get the response on file
	SafeBuf fb1;
	char fn1[1024];
	sprintf(fn1,"%sqa/content.%lu",g_hostdb.m_dir, *val);
	fb1.load(fn1);
	fb1.nullTerm();

	// diff the two replies so we can see what changed
	char cmd[1024];
	sprintf(cmd,"diff %s %s > /tmp/diffout",fn1,fn2);
	log("qa: %s\n",cmd);
	system(cmd);

	g_numErrors++;

	g_qaOutput.safePrintf("<b style=color:red;>FAILED TEST</b><br>%s : "
			      "<a href=%s>%s</a> (urlhash=%lu)<br>"
			      "<input type=checkbox name=urlhash%lu value=1 "
			      // use ajax to update the test crc. if you
			      // undo your check then it should put the old
			      // val back. when you first click the checkbox
			      // it should gray out the diff i guess.
			      "onclick=submitchanges(%lu,%lu);> "
			      "Accept changes"
			      "<br>"
			      "original on left, new on right. "
			      "oldcrc = <a href=/qa/content.%lu>%lu</a>"
			      " != <a href=/qa/content.%lu>%lu</a> = newcrc"
			      "<br>diff output follows:<br>"
			      "<pre id=%lu style=background-color:0xffffff;>",
			      s_qt->m_testName,
			      s_url.getUrl(),
			      s_url.getUrl(),
			      urlHash32,
			      // input checkbox name field
			      urlHash32,
			      // submitchanges() parms
			      urlHash32,
			      contentCRC,
			      // original/old content.%lu
			      *val,
			      *val,
			      // new content.%lu
			      contentCRC,
			      contentCRC,
			      // for the pre tag id:
			      urlHash32);

	// store the diff in the output
	SafeBuf sb;
	sb.load("/tmp/diffout");
	g_qaOutput.htmlEncode ( sb.getBufStart() );
	g_qaOutput.safePrintf("</pre><br><hr>");
}
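// Sketch of the presumed markOut() contract (its real definition lives
// elsewhere in this file and may differ): it blanks the volatile value
// following each occurrence of the needle so fields like response times
// and timestamps don't perturb the content CRC computed above. A minimal
// version under that assumption; is_digit() is the ctype helper already
// used in this codebase.
static void markOutSketch ( char *content , const char *needle ) {
	if ( ! content ) return;
	for ( char *s = strstr ( content , needle ) ; s ;
	      s = strstr ( s , needle ) ) {
		s += strlen ( needle );
		// skip ahead to the value...
		while ( *s && ! is_digit(*s) ) s++;
		// ...and space it out so the checksum ignores it. qa_hash32
		// collapses runs of spaces, so the digit count won't matter.
		while ( is_digit(*s) ) *s++ = ' ';
	}
}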