// . init s_mimeTable in this call
// . called from HttpServer::init
// . returns false and sets g_errno on error
bool HttpMime::init ( ) {
	// only need to call once
	if ( s_init ) return true;
	// make sure only called once
	s_init = true;
	//s_mimeTable.set ( 256 );
	//s_mimeTable.setLabel("mimetbl");
	if ( ! s_mimeTable.set(4,sizeof(char *),256,NULL,0,false,1,"mimetbl"))
		return false;
	// set table from internal list
	for ( uint32_t i = 0 ; i < sizeof(s_ext)/sizeof(char *) ; i+=2 ) {
		int32_t key = hash32n ( s_ext[i] );
		if ( ! s_mimeTable.addKey ( &key , &s_ext[i+1] ) )
			return log("HttpMime::init: failed to set table.");
	}
	// quick test
	const char *tt = getContentTypeFromExtension ( "zip" );
	if ( strcmp(tt,"application/zip") != 0 ) {
		g_errno = EBADENGINEER;
		return log("http: Failed to init mime table correctly.");
	}
	// a more thorough test
	for ( uint32_t i = 0 ; i < sizeof(s_ext)/sizeof(char *) ; i+=2) {
		tt = getContentTypeFromExtension ( s_ext[i] );
		if ( strcmp(tt,s_ext[i+1]) == 0 ) continue;
		g_errno = EBADENGINEER;
		return log("http: Failed to do mime table correctly. i=%" PRId32,i);
	}
	// TODO: set it from a user supplied file here
	return true;
}
iconv_t gbiconv_open( char *tocode, char *fromcode) {
	// get hash for to/from
	uint32_t hash1 = hash32Lower_a(tocode, gbstrlen(tocode), 0);
	uint32_t hash2 = hash32Lower_a(fromcode, gbstrlen(fromcode),0);
	uint32_t hash = hash32h(hash1, hash2);
	g_errno = 0;
	iconv_t *convp = (iconv_t *)s_convTable.getValue(&hash);
	iconv_t conv = NULL;
	if ( convp ) conv = *convp;
	//log(LOG_DEBUG, "uni: converter %s -> %s from hash 0x%"XINT32": 0x%"XINT32"",
	//    fromcode, tocode, hash, conv);
	if (!conv){
		//log(LOG_DEBUG, "uni: Allocating new converter for "
		//    "%s to %s (hash: 0x%"XINT32")",
		//    fromcode, tocode,hash);
		conv = iconv_open(tocode, fromcode);
		if (conv == (iconv_t) -1) {
			log(LOG_WARN, "uni: failed to open converter for "
			    "%s to %s: %s (%d)",
			    fromcode, tocode,
			    strerror(errno), errno);
			// need to stop if necessary converters don't open
			//char *xx=NULL; *xx = 0;
			g_errno = errno;
			if (errno == EINVAL) g_errno = EBADCHARSET;
			return conv;
		}
		// add mem to table to keep track
		g_mem.addMem((void*)conv, 52, "iconv", 1);
		// cache converter
		s_convTable.addKey(&hash, &conv);
		//log(LOG_DEBUG, "uni: Saved converter 0x%"INT32" under hash 0x%"XINT32"",
		//    conv, hash);
	}
	else{
		// reset converter
		char *dummy = NULL;
		size_t dummy2 = 0;
		// JAB: warning abatement
		//size_t res = iconv(conv,NULL,NULL,&dummy,&dummy2);
		iconv(conv,NULL,NULL,&dummy,&dummy2);
	}
	return conv;
}
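// Illustrative usage sketch (not part of the original file; the charset names
// and buffer contents are made up): grab a cached converter from
// gbiconv_open() and run it over a short latin-1 buffer with the standard
// iconv() call.
static void gbiconv_example ( ) {
	iconv_t cd = gbiconv_open ( (char *)"UTF-8" , (char *)"ISO-8859-1" );
	if ( cd == (iconv_t)-1 ) return;
	char    in[]    = "caf\xe9";          // "café" in latin-1
	char    out[16];
	char   *inp     = in;
	char   *outp    = out;
	size_t  inLeft  = 4;                  // bytes of input
	size_t  outLeft = sizeof(out);        // room for output
	// advances the cursors; out now holds the UTF-8 form of "café"
	iconv ( cd , &inp , &inLeft , &outp , &outLeft );
}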
nodeid_t getTagId ( char *s , NodeType **retp ) {
	// init table?
	static bool s_init = false;
	static HashTableX s_ht;
	static char s_buf[10000];
	if ( ! s_init ) {
		s_init = true;
		s_ht.set ( 4 ,4,1024,s_buf,10000,false,0,"tagids");//niceness=0
		// how many NodeTypes do we have in g_nodes?
		static int32_t nn = sizeof(g_nodes) / sizeof(NodeType);
		// set the hash table
		for ( int32_t i = 0 ; i < nn ; i++ ) {
			char *name = g_nodes[i].m_nodeName;
			int32_t nlen = gbstrlen(name);
			int64_t h = hash64Upper_a ( name,nlen,0LL );
			NodeType *nt = &g_nodes[i];
			if ( ! s_ht.addKey(&h,&nt) ) { char *xx=NULL;*xx=0; }
		}
		// sanity
		if ( s_ht.m_numSlots != 1024 ) { char *xx=NULL;*xx=0; }
		// sanity test
		nodeid_t tt = getTagId ( "br" );
		if ( tt != TAG_BR ) { char *xx=NULL;*xx=0; }
	}
	// find end of tag name. hyphens are ok to be in name.
	// facebook uses underscores like <start_time>
	char *e = s;
	for ( ; *e && (is_alnum_a(*e) || *e=='-'|| *e=='_'); e++);
	// hash it for lookup
	int64_t h = hash64Upper_a ( s , e - s , 0 );
	// look it up
	NodeType **ntp = (NodeType **)s_ht.getValue(&h);
	// assume none
	if ( retp ) *retp = NULL;
	// none?
	if ( ! ntp ) return 0;
	// got one
	if ( retp ) *retp = *ntp;
	// get id otherwise
	return (*ntp)->m_nodeId;
}
// get the id from a 2 character country code
uint8_t getCountryId ( char *cc ) {
	static bool s_init = false;
	static char buf[2000];
	static HashTableX ht;
	char tmp[4];
	if ( ! s_init ) {
		s_init = true;
		// hash them up
		ht.set ( 4 , 1 , -1,buf,2000,false,MAX_NICENESS,"ctryids");
		// now add in all the country codes
		long n = (long) sizeof(s_countryCode) / sizeof(char *);
		for ( long i = 0 ; i < n ; i++ ) {
			char *s = (char *)s_countryCode[i];
			//long slen = gbstrlen ( s );
			// sanity check
			if ( !s[0] || !s[1] || s[2]) { char *xx=NULL;*xx=0; }
			// map it to a 4 byte key
			tmp[0]=s[0]; tmp[1]=s[1]; tmp[2]=0; tmp[3]=0;
			// a val of 0 does not mean empty in HashTableX,
			// that is an artifact of HashTableT
			uint8_t val = i; // +1; // add 1 cuz 0 means lang unknown
			if ( ! ht.addKey ( tmp , &val ) ) { char *xx=NULL;*xx=0; }
		}
	}
	// lookup
	tmp[0]=to_lower_a(cc[0]);
	tmp[1]=to_lower_a(cc[1]);
	tmp[2]=0;
	tmp[3]=0;
	long slot = ht.getSlot ( tmp );
	if ( slot < 0 ) return 0;
	void *val = ht.getValueFromSlot ( slot );
	return *(uint8_t *)val ;
}
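// Illustrative usage sketch (not part of the original file): look up the
// numeric id for a two-letter code. Note that because the "+1" above is
// commented out, an unknown code and the first table entry both map to 0.
static void getCountryIdExample ( ) {
	uint8_t id = getCountryId ( (char *)"us" );
	if ( id == 0 ) log("example: country code not found (or slot 0)");
}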
// . how many keys are dups
// . returns -1 on error
long HashTableX::getNumDups() {
	if ( ! m_allowDups ) return 0;
	HashTableX tmp;
	if ( ! tmp.set ( m_ks, 0, m_numSlots, NULL , 0 , false ,
			 m_niceness, "htxtmp") )
		return -1;
	// put into that table
	for ( long i = 0 ; i < m_numSlots ; i++ ) {
		// skip empty bucket
		if ( ! m_flags[i] ) continue;
		// get the key
		char *kp = (char *)getKeyFromSlot(i);
		// add to new table
		if ( ! tmp.addKey ( kp ) ) return -1;
	}
	// the uniques
	long uniques = tmp.m_numSlotsUsed;
	// the dups
	long dups = m_numSlotsUsed - uniques;
	// that's it
	return dups;
}
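// Hypothetical usage sketch (not part of the original file): with dups
// allowed, inserting the same 4-byte key twice leaves exactly one duplicate.
static void getNumDupsExample ( ) {
	HashTableX t;
	t.set ( 4 , 0 , 16 , NULL , 0 , true/*allowDups*/ , 0 , "duptest" );
	int32_t k = 123;
	t.addKey ( &k );
	t.addKey ( &k );
	long dups = t.getNumDups(); // 1: two slots used, one unique key
	log("example: %li dups", dups);
}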
// . when the Conf::m_proxyIps parm is updated we call this to rebuild
//   s_iptab, our table of SpiderProxy instances, which has the proxies and
//   their performance statistics.
// . we try to maintain stats of ip/ports that did NOT change when rebuilding.
bool buildProxyTable ( ) {

	// scan the NEW list of proxy ip/port pairs in g_conf
	char *p = g_conf.m_proxyIps.getBufStart();

	HashTableX tmptab;
	tmptab.set(8,0,16,NULL,0,false,"tmptab");

	// scan the user inputted space-separated list of ip:ports
	// (optional username:password@ip:port)
	for ( ; *p ; ) {
		// skip white space
		if ( is_wspace_a(*p) ) { p++; continue; }
		// skip http://
		if ( strncasecmp(p,"http://",7) == 0 ) { p += 7; continue; }
		// scan in an ip:port
		char *s = p;
		char *portStr = NULL;
		int32_t dc = 0, pc = 0, gc = 0, bc = 0;
		const char *msg;
		char *usernamePwd = NULL;
		int32_t usernamePwdLen = 0;
		char *ipStart = p;
		// scan all characters until we hit \0 or another whitespace
		for ( ; *s && !is_wspace_a(*s); s++) {
			if ( *s == '@' ) {
				// must be username:pwd
				if ( pc != 1 ) {
					msg = "bad username:password";
					goto hadError;
				}
				usernamePwd = p;
				usernamePwdLen = s - p;
				if ( usernamePwdLen >= MAXUSERNAMEPWD-2 ) {
					msg = "username:password too long";
					goto hadError;
				}
				dc = 0; gc = 0; bc = 0; pc = 0;
				portStr = NULL;
				ipStart = s+1;
				continue;
			}
			if ( *s == '.' ) { dc++; continue; }
			if ( *s == ':' ) { portStr=s; pc++; continue; }
			if ( is_digit(*s) ) { gc++; continue; }
			bc++;
			continue;
		}
		// ensure it is a legit ip:port combo
		msg = NULL;
		if ( gc < 4 ) msg = "not enough digits for an ip";
		if ( pc > 1 ) msg = "too many colons";
		if ( dc != 3 ) msg = "need 3 dots for an ip address";
		if ( bc ) msg = "got illegal char in ip:port listing";
		if ( msg ) {
		hadError:
			char c = *s;
			*s = '\0';
			log("buf: %s for %s",msg,p);
			*s = c;
			return false;
		}
		// convert it
		int32_t iplen = s - ipStart;
		if ( portStr ) iplen = portStr - ipStart;
		int32_t ip = atoip(ipStart,iplen);
		// another sanity check
		if ( ip == 0 || ip == -1 ) {
			log("spider: got bad proxy ip for %s",p);
			return false;
		}
		// and the port default is 80
		int32_t port = 80;
		if ( portStr ) port = atol2(portStr+1,s-portStr-1);
		if ( port < 0 || port > 65535 ) {
			log("spider: got bad proxy port for %s",p);
			return false;
		}
		// . we got a legit ip:port
		// . see if already in our table
		uint64_t ipKey = (uint32_t)ip;
		ipKey <<= 16;
		ipKey |= (uint16_t)(port & 0xffff);
		// also store into tmptab to see what we need to remove
		tmptab.addKey(&ipKey);
		// see if in table
		int32_t islot = s_iptab.getSlot( &ipKey);
		// advance p
		p = s;
		// if in there, keep it as is
		if ( islot >= 0 ) continue;
		// otherwise add new entry
		SpiderProxy newThing;
		memset ( &newThing , 0 , sizeof(SpiderProxy));
		newThing.m_ip = ip;
		newThing.m_port = port;
		newThing.m_lastDownloadTookMS = -1;
		newThing.m_lastSuccessfulTestMS = -1;
		gbmemcpy(newThing.m_usernamePwd,usernamePwd,usernamePwdLen);
		// ensure it is NULL terminated
		newThing.m_usernamePwd[usernamePwdLen] = '\0';
		if ( ! s_iptab.addKey ( &ipKey, &newThing ) )
			return false;
	}

 redo:
	int32_t removed = 0;
	// scan all SpiderProxies in s_iptab
	for ( int32_t i = 0 ; i < s_iptab.getNumSlots() ; i++ ) {
		// skip empty buckets in hashtable s_iptab
		if ( ! s_iptab.m_flags[i] ) continue;
		// get the key
		int64_t key = *(int64_t *)s_iptab.getKeyFromSlot(i);
		// must also exist in tmptab, otherwise it got removed by user
		if ( tmptab.isInTable ( &key ) ) continue;
		// skip if not in table
		if ( s_iptab.getSlot ( &key ) < 0 ) {
			log("sproxy: iptable hashing messed up");
			continue;
		}
		// shoot, it got removed. not in the new list of ip:ports
		s_iptab.removeKey ( &key );
		removed++;
		// hashtable is messed up now, start over
		//goto redo;
	}
	if ( removed ) goto redo;
	return true;
}
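// Worked example of the 64-bit ip:port key built in buildProxyTable() above
// (illustrative values only): the 32-bit ip is shifted left 16 bits and the
// port is OR'd into the low 16 bits, so every ip:port pair gets a unique key.
//
//	ip   = 0x0A000001            // example ip value
//	port = 8080                  // 0x1F90
//	key  = ((uint64_t)(uint32_t)ip << 16) | (port & 0xffff)
//	     = 0x0A0000011F90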
// a host is asking us (host #0) what proxy to use? static void handleRequest54(UdpSlot *udpSlot, int32_t niceness) { char *request = udpSlot->m_readBuf; int32_t requestSize = udpSlot->m_readBufSize; // we now use the top part of the Msg13Request as the ProxyRequest Msg13Request *preq = (Msg13Request *)request; // sanity check if ( requestSize != preq->getProxyRequestSize() ) { log("db: Got bad request 0x54 size of %" PRId32" bytes. bad", requestSize ); g_udpServer.sendErrorReply ( udpSlot , EBADREQUESTSIZE ); return; } // is the request telling us it is done downloading through a proxy? if ( preq->m_opCode == OP_RETPROXY ) { returnProxy ( preq , udpSlot ); return; } // if sender is asking for a new proxy and wants us to ban // the previous proxy we sent for this urlIp... if ( preq->m_banProxyIp ) { // don't core if misses sanity. it seems we don't always // NULLify these or something. // these must match if(preq->m_banProxyIp != preq->m_proxyIp || preq->m_banProxyPort != preq->m_proxyPort){ log("db: proxy: banproxyip != proxyip. mismatch!"); g_udpServer.sendErrorReply ( udpSlot , EBADENGINEER); return; } // this will "return" the banned proxy returnProxy ( preq , NULL ); // now add it to the banned table int64_t uip = preq->m_urlIp; int64_t pip = preq->m_banProxyIp; int64_t h64 = hash64h ( uip , pip ); if ( ! s_proxyBannedTable.isInTable ( &h64 ) ) { s_proxyBannedTable.addKey ( &h64 ); // for stats counting. each proxy ip maps to # // of unique website IPs that have banned it. s_banCountTable.addTerm32((uint32_t)pip); } } // shortcut int32_t urlIp = preq->m_urlIp; // send to a proxy that is up and has the least amount // of LoadBuckets with this urlIp, if tied, go to least loaded. // clear counts for this url ip for scoring the best proxy to use for ( int32_t i = 0 ; i < s_iptab.getNumSlots() ; i++ ) { // skip empty slots if ( ! s_iptab.m_flags[i] ) continue; SpiderProxy *sp = (SpiderProxy *)s_iptab.getValueFromSlot(i); sp->m_countForThisIp = 0; sp->m_lastTimeUsedForThisIp = 0LL; } // this table maps a url's current IP to a possibly MULTIPLE slots // which tell us what proxy is downloading a page from that IP. // so we can try to find a proxy that is not download a url from // this IP currently, or hasn't been for the longest time... int32_t hslot = s_loadTable.getSlot ( &urlIp ); // scan all proxies that have this urlip outstanding for ( int32_t i = hslot ; i >= 0 ; i = s_loadTable.getNextSlot(i,&urlIp)){ // get the bucket LoadBucket *lb; lb = (LoadBucket *)s_loadTable.getValueFromSlot(i); // get the spider proxy this load point was for uint64_t key = (uint32_t)lb->m_proxyIp; key <<= 16; key |= (uint16_t)lb->m_proxyPort; SpiderProxy *sp = (SpiderProxy *)s_iptab.getValue(&key); // must be there unless user remove it from the list if ( ! sp ) continue; // count it up if ( lb->m_downloadEndTimeMS == 0LL ) sp->m_countForThisIp++; // set the last time used to the most recently downloaded time // that this proxy has downloaded from this ip if ( lb->m_downloadEndTimeMS && lb->m_downloadEndTimeMS > sp->m_lastTimeUsedForThisIp ) sp->m_lastTimeUsedForThisIp = lb->m_downloadEndTimeMS; } // first try to get a spider proxy that is not "dead" bool skipDead = true; int32_t numBannedProxies = 0; int32_t aliveProxyCandidates = 0; redo: // get the min of the counts int32_t minCount = 999999; for ( int32_t i = 0 ; i < s_iptab.getNumSlots() ; i++ ) { // skip empty slots if ( ! 
s_iptab.m_flags[i] ) continue; // get the spider proxy SpiderProxy *sp = (SpiderProxy *)s_iptab.getValueFromSlot(i); // if this proxy was banned by the url's ip... skip it. it is // not a candidate... if ( skipDead ) { int64_t uip = preq->m_urlIp; int64_t pip = sp->m_ip; int64_t h64 = hash64h ( uip , pip ); if ( s_proxyBannedTable.isInTable ( &h64 ) ) { numBannedProxies++; continue; } } // if it failed the last test, skip it if ( skipDead && sp->m_lastDownloadError ) continue; if ( skipDead ) aliveProxyCandidates++; if ( sp->m_countForThisIp >= minCount ) continue; minCount = sp->m_countForThisIp; } // all dead? then get the best dead one if ( minCount == 999999 ) { skipDead = false; goto redo; } // . we only use one proxy if none are banned by this IP // . when that gets banned, we will use the next 2 proxies with // a higher backoff/crawlDelay, etc. int32_t threshHold; if ( numBannedProxies <= 0 ) threshHold = 1; // if first proxy gets banned, try next 2 proxies until both get ban'd else if ( numBannedProxies == 1 ) threshHold = 2; else if ( numBannedProxies < 1+2) threshHold = 3 - numBannedProxies; // if next two proxies got banned, try next 4 proxies until banned else if ( numBannedProxies == 3 ) threshHold = 4; else if ( numBannedProxies < 3+4) threshHold = 7 - numBannedProxies; // if next 4 proxies got banned, try next 8 proxies until they get band else if ( numBannedProxies == 7 ) threshHold = 8; else if ( numBannedProxies < 7+8) threshHold = 15 - numBannedProxies; else if ( numBannedProxies == 15) threshHold = 16; else if ( numBannedProxies < 15+16 ) threshHold = 31-numBannedProxies; else if ( numBannedProxies == 31 ) threshHold = 32; else if ( numBannedProxies < 31+32)threshHold=63-numBannedProxies; else if ( numBannedProxies == 63 ) threshHold = 64; else if ( numBannedProxies < 63+64)threshHold=127-numBannedProxies; else if ( numBannedProxies == 127 ) threshHold = 128; else if ( numBannedProxies < 127+128)threshHold=255-numBannedProxies; else if ( numBannedProxies == 255 ) threshHold = 256; else if ( numBannedProxies < 255+256)threshHold=512-numBannedProxies; else if ( numBannedProxies == 511 ) threshHold = 512; else if ( numBannedProxies < 511+512)threshHold=1024-numBannedProxies; else threshHold = 1024; if ( threshHold <= 0 ) { log("proxy: spiderproxy error in threshold of %" PRId32" " "for banned=%" PRId32,threshHold,numBannedProxies); threshHold = 1; } // reset minCount so we can take the min over those we check here minCount = -1; int64_t oldest = 0x7fffffffffffffffLL; SpiderProxy *winnersp = NULL; int32_t count = 0; // start at a random slot based on url's IP so we don't // overload the first proxy int32_t start = ((uint32_t)urlIp) % s_iptab.getNumSlots(); int32_t slotCount = s_iptab.getNumSlots(); // . now find the best proxy wih the minCount for ( int32_t i = start ; ; i++ ) { // scan all slots in hash table, then stop if ( slotCount-- <= 0 ) break; // wrap around to zero if we hit the end if ( i == s_iptab.getNumSlots() ) i = 0; // skip empty slots if ( ! s_iptab.m_flags[i] ) continue; // get the spider proxy SpiderProxy *sp = (SpiderProxy *)s_iptab.getValueFromSlot(i); // if it failed the last test, skip it... not here... if ( skipDead && sp->m_lastDownloadError ) continue; // if this proxy was banned by the url's ip... skip it. it is // not a candidate... 
if ( skipDead ) { int64_t uip = preq->m_urlIp; int64_t pip = sp->m_ip; int64_t h64 = hash64h ( uip , pip ); if ( s_proxyBannedTable.isInTable ( &h64 ) ) continue; } // if some proxies are "alive" then only pick from // the first half of the proxies that are alive (i.e. still // work). that way, when one of those goes dead we will inc // the backoff (crawldelay) and a new proxy that we haven't // used for this url's IP will take it's place. and such // new proxies will only have the new backoff count used // through them. that way, we don't get ALL of our proxies // banned at about the same time since we do somewhat uniform // load balancing over them. if ( skipDead && count >= threshHold)//aliveProxyCandidates/2 ) continue; // count the alive/non-banned candidates count++; // if all hosts were "dead" because they all had // m_lastDownloadError set then minCount will be 999999 // and nobody should continue from this statement: if ( sp->m_countForThisIp > minCount && minCount>=0 ) continue; // then go by last download time for this ip if ( sp->m_countForThisIp == minCount && minCount>=0 && sp->m_lastTimeUsedForThisIp >= oldest ) continue; // pick the spider proxy used longest ago oldest = sp->m_lastTimeUsedForThisIp; minCount = sp->m_countForThisIp; // got a new winner winnersp = sp; } // we must have a winner if ( ! winnersp ) { g_process.shutdownAbort(true); } int64_t nowms = gettimeofdayInMillisecondsLocal(); // add a new load bucket then! LoadBucket bb; bb.m_urlIp = urlIp; // the time it started bb.m_downloadStartTimeMS = nowms; // download has not ended yet bb.m_downloadEndTimeMS = 0LL; // the host using the proxy bb.m_hostId = udpSlot->getHostId(); // key is this for m_prTable bb.m_proxyIp = winnersp->m_ip; bb.m_proxyPort = winnersp->m_port; // a new id. we use this to update the downloadEndTime when done static int32_t s_lbid = 0; // add it now bb.m_id = s_lbid++; s_loadTable.addKey ( &urlIp , &bb ); // winner count update winnersp->m_timesUsed++; // sanity if ( (int32_t)sizeof(ProxyReply) > TMPBUFSIZE ){g_process.shutdownAbort(true);} // and give proxy ip/port back to the requester so they can // use that to download their url ProxyReply *prep = (ProxyReply *)udpSlot->m_tmpBuf; prep->m_proxyIp = winnersp->m_ip; prep->m_proxyPort = winnersp->m_port; // this is just '\0' if none strcpy(prep->m_usernamePwd,winnersp->m_usernamePwd); // do not count the proxy we are returning as "more" prep->m_hasMoreProxiesToTry = ( aliveProxyCandidates > 1 ); // and the loadbucket id, so requester can tell us it is done // downloading through the proxy and we can update the LoadBucket // for this transaction (m_lbId) prep->m_lbId = bb.m_id; // requester wants to know how many proxies have been banned by the // urlIp so it can increase a self-imposed crawl-delay to be more // sensitive to the spider policy. prep->m_numBannedProxies = numBannedProxies; //char *p = udpSlot->m_tmpBuf; //*(int32_t *)p = winnersp->m_ip ; p += 4; //*(int16_t *)p = winnersp->m_port; p += 2; // and the loadbucket id //*(int32_t *)p = bb.m_id; p += 4; // with dup keys we end up with long chains of crap and this // takes forever. 
so just flush the whole thing every 2 minutes AND // when 20000+ entries are in there static time_t s_lastTime = 0; time_t now = nowms / 1000; if ( s_lastTime == 0 ) s_lastTime = now; time_t elapsed = now - s_lastTime; if ( elapsed > 120 && s_loadTable.getNumSlots() > 10000 ) { log("sproxy: flushing %i entries from proxy loadtable that " "have accumulated since %i seconds ago", (int)s_loadTable.m_numSlotsUsed,(int)elapsed); s_loadTable.clear(); // only do this one per minute s_lastTime = now; } int32_t sanityCount = 0;//s_loadTable.getNumSlots(); // top: // now remove old entries from the load table. entries that // have completed and have a download end time more than 10 mins ago. for ( int32_t i = s_loadTable.getNumSlots() - 1 ; i >= 0 ; i-- ) { // skip if empty if ( ! s_loadTable.m_flags[i] ) continue; // get the bucket LoadBucket *pp =(LoadBucket *)s_loadTable.getValueFromSlot(i); // skip if still active if ( pp->m_downloadEndTimeMS == 0LL ) continue; // delta t int64_t took = nowms - pp->m_downloadEndTimeMS; // < 10 mins? now it's < 15 seconds to prevent clogging. if ( took < LOADPOINT_EXPIRE_MS ) continue; // 100 at a time so we don't slam cpu if ( sanityCount++ > 100 ) break; // ok, its too old, nuke it to save memory s_loadTable.removeSlot(i); // the keys might have buried us but we really should not // mis out on analyzing any keys if we just keep looping here // should we? TODO: figure it out. if we miss a few it's not // a big deal. //i--; //goto top; } // send the proxy ip/port/LBid back to user g_udpServer.sendReply(udpSlot->m_tmpBuf, sizeof(ProxyReply), udpSlot->m_tmpBuf, sizeof(ProxyReply), udpSlot); }
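// A compact restatement of the backoff ladder built by the if/else chain on
// threshHold in handleRequest54() above (illustrative sketch, not called by
// the code): proxies are tried in doubling groups of 1, 2, 4, 8, ... and the
// threshold is the number of proxies remaining before the next "everything
// banned" milestone (1, 3, 7, 15, ...). The last rungs of the original chain
// use 512 and 1024 instead of 511 and 1023 and cap the value at 1024, so this
// helper only approximates those rungs.
static int32_t exampleThreshold ( int32_t numBannedProxies ) {
	if ( numBannedProxies <= 0 ) return 1;
	int32_t milestone = 1;                 // 1, 3, 7, 15, 31, ...
	while ( milestone <= numBannedProxies )
		milestone = milestone * 2 + 1;
	// e.g. 1 banned -> 2, 2 banned -> 1, 3 banned -> 4, 7 banned -> 8
	return milestone - numBannedProxies;
}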
// . this returns false if blocks, true otherwise
// . sets g_errno on failure
bool Msg1c::gotList ( ) {

	if ( g_errno ) return true;

	int64_t *tmpDocIds = m_msg3a.getDocIds();
	int32_t  numDocIds = m_msg3a.getNumDocIds();

	if ( m_startNum > 0) {
		numDocIds -= m_startNum;
		tmpDocIds = &tmpDocIds[m_startNum];
	}

	m_numDocIds = numDocIds; // save for reporting
	// log it
	log(LOG_INFO,"admin: Got %" PRId32" docIds for query reindex.",
	    numDocIds);
	// bail if no need
	if ( numDocIds <= 0 ) return true;

	// force spiders on for the entire network. they will propagate from
	// host #0...
	g_conf.m_spideringEnabled = true;

	int32_t nowGlobal = getTimeGlobal();

	HashTableX dt;
	char dbuf[1024];
	dt.set(8,0,64,dbuf,1024,false,0,"ddocids");

	m_sb.setLabel("reiadd");

	State13 *st = (State13 *)m_state;
	GigablastRequest *gr = &st->m_gr;

	m_numDocIdsAdded = 0;

	// list consists of docIds, loop through each one
	for(int32_t i = 0; i < numDocIds; i++) {
		int64_t docId = tmpDocIds[i];
		// when searching events we get multiple docids that are the same
		if ( dt.isInTable ( &docId ) ) continue;
		// add it
		if ( ! dt.addKey ( &docId ) ) return true;
		SpiderRequest sr;
		sr.reset();
		// url is a docid!
		sprintf ( sr.m_url , "%" PRIu64 , docId );
		// make a fake first ip
		// use only 64k values so we don't stress doledb/waittrees/etc.
		// for large #'s of docids
		int32_t firstIp = (docId & 0x0000ffff);
		// bits 6-13 of the docid are the domain hash so use those
		// when doing a REINDEX (not delete!) to ensure that requests
		// on the same domain go to the same shard, at least when
		// we have up to 256 shards. if we have more than 256 shards
		// at this point some shards will not participate in the
		// query reindex/delete process because of this, so
		// we'll want to allow more bits in that case perhaps.
		// check out Hostdb::getShardNum(RDB_SPIDERDB) in Hostdb.cpp
		// to see what shard is responsible for storing and indexing
		// this SpiderRequest based on the firstIp.
		if ( ! m_forceDel ) {
			// if we are a REINDEX not a delete because
			// deletes don't need to spider/redownload the doc
			// so the distribution can be more random
			firstIp >>= 6;
			firstIp &= 0xff;
		}
		// 0 is not a legit val. it'll core below.
		if ( firstIp == 0 ) {
			firstIp = 1;
		}
		// use a fake ip
		sr.m_firstIp = firstIp;
		// we are not really injecting...
		sr.m_isInjecting = false;//true;
		sr.m_hopCount = -1;
		sr.m_isPageReindex = 1;
		sr.m_urlIsDocId = 1;
		sr.m_fakeFirstIp = 1;
		// now you can recycle content instead of re-downloading it
		// for every docid
		sr.m_recycleContent = gr->m_recycleContent;
		// if this is zero we end up getting deduped in
		// dedupSpiderList() if there was a SpiderReply whose
		// spider time was > 0
		sr.m_addedTime = nowGlobal;
		sr.m_forceDelete = m_forceDel ? 1 : 0;
		// . complete its m_key member
		// . parentDocId is used to make the key, but only allow one
		//   page reindex spider request per url... so use "0"
		// . this will set "uh48" to hash64b(m_url) which is the docid
		sr.setKey( firstIp, 0LL , false );
		// how big to serialize
		int32_t recSize = sr.getRecSize();
		m_numDocIdsAdded++;
		// store it
		if ( ! m_sb.safeMemcpy ( (char *)&sr , recSize ) ) {
			// g_errno must be set
			if ( ! g_errno ) { g_process.shutdownAbort(true); }
			log(LOG_LOGIC,
			    "admin: Query reindex size of %" PRId32" "
			    "too big. Aborting. Bad engineer." ,
			    (int32_t)0);//m_list.getListSize() );
			return true;
		}
	}
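// Worked example of the fake firstIp computed in Msg1c::gotList() above
// (illustrative docid): for a reindex (not a delete) the low 16 bits of the
// docid are taken, then shifted and masked so that bits 6-13 (the domain
// hash) survive, which keeps requests for the same domain on the same shard.
//
//	docId          = 0x1234567890
//	docId & 0xffff = 0x7890
//	>> 6, & 0xff   = 0xE2  (226)   -> sr.m_firstIp = 226
//
// a result of 0 is bumped to 1 because 0 is not a legit firstIp.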
void handleRequest12 ( UdpSlot *udpSlot , int32_t niceness ) { // get request char *request = udpSlot->m_readBuf; int32_t reqSize = udpSlot->m_readBufSize; // shortcut UdpServer *us = &g_udpServer; // breathe QUICKPOLL ( niceness ); // shortcut char *reply = udpSlot->m_tmpBuf; // // . is it confirming that he got all the locks? // . if so, remove the doledb record and dock the doleiptable count // before adding a waiting tree entry to re-pop the doledb record // if ( reqSize == sizeof(ConfirmRequest) ) { char *msg = NULL; ConfirmRequest *cq = (ConfirmRequest *)request; // confirm the lock HashTableX *ht = &g_spiderLoop.m_lockTable; int32_t slot = ht->getSlot ( &cq->m_lockKeyUh48 ); if ( slot < 0 ) { log("spider: got a confirm request for a key not " "in the table! coll must have been deleted " " or reset " "while lock request was outstanding."); g_errno = EBADENGINEER; log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__); us->sendErrorReply ( udpSlot , g_errno ); return; //char *xx=NULL;*xx=0; } } UrlLock *lock = (UrlLock *)ht->getValueFromSlot ( slot ); lock->m_confirmed = true; // note that if ( g_conf.m_logDebugSpider ) // Wait ) log("spider: got confirm lock request for ip=%s", iptoa(lock->m_firstIp)); // get it SpiderColl *sc = g_spiderCache.getSpiderColl(cq->m_collnum); // make it negative cq->m_doledbKey.n0 &= 0xfffffffffffffffeLL; // and add the negative rec to doledb (deletion operation) Rdb *rdb = &g_doledb.m_rdb; if ( ! rdb->addRecord ( cq->m_collnum, (char *)&cq->m_doledbKey, NULL , // data 0 , //dataSize 1 )){ // niceness // tree is dumping or something, probably ETRYAGAIN if ( g_errno != ETRYAGAIN ) {msg = "error adding neg rec to doledb"; log("spider: %s %s",msg,mstrerror(g_errno)); } //char *xx=NULL;*xx=0; log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__); us->sendErrorReply ( udpSlot , g_errno ); return; } // now remove from doleiptable since we removed from doledb if ( sc ) sc->removeFromDoledbTable ( cq->m_firstIp ); // how many spiders outstanding for this coll and IP? //int32_t out=g_spiderLoop.getNumSpidersOutPerIp ( cq->m_firstIp); // DO NOT add back to waiting tree if max spiders // out per ip was 1 OR there was a crawldelay. but better // yet, take care of that in the winReq code above. // . now add to waiting tree so we add another spiderdb // record for this firstip to doledb // . true = callForScan // . do not add to waiting tree if we have enough outstanding // spiders for this ip. we will add to waiting tree when // we receive a SpiderReply in addSpiderReply() if ( sc && //out < cq->m_maxSpidersOutPerIp && // this will just return true if we are not the // responsible host for this firstip // DO NOT populate from this!!! say "false" here... ! sc->addToWaitingTree ( 0 , cq->m_firstIp, false ) && // must be an error... g_errno ) { msg = "FAILED TO ADD TO WAITING TREE"; log("spider: %s %s",msg,mstrerror(g_errno)); log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__); us->sendErrorReply ( udpSlot , g_errno ); return; } // success!! reply[0] = 1; us->sendReply_ass ( reply , 1 , reply , 1 , udpSlot ); return; } // sanity check if ( reqSize != sizeof(LockRequest) ) { log("spider: bad msg12 request size of %" PRId32,reqSize); log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__); us->sendErrorReply ( udpSlot , EBADREQUEST ); return; } // deny it if we are not synced yet! otherwise we core in // getTimeGlobal() below if ( ! 
isClockInSync() ) { // log it so we can debug it //log("spider: clock not in sync with host #0. so " // "returning etryagain for lock reply"); // let admin know why we are not spidering log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__); us->sendErrorReply ( udpSlot , ETRYAGAIN ); return; } LockRequest *lr = (LockRequest *)request; //uint64_t lockKey = *(int64_t *)request; //int32_t lockSequence = *(int32_t *)(request+8); // is this a remove operation? assume not //bool remove = false; // get top bit //if ( lockKey & 0x8000000000000000LL ) remove = true; // mask it out //lockKey &= 0x7fffffffffffffffLL; // sanity check, just 6 bytes! (48 bits) if ( lr->m_lockKeyUh48 &0xffff000000000000LL ) { char *xx=NULL;*xx=0; } // note it if ( g_conf.m_logDebugSpider ) log("spider: got msg12 request uh48=%" PRId64" remove=%" PRId32, lr->m_lockKeyUh48, (int32_t)lr->m_removeLock); // get time int32_t nowGlobal = getTimeGlobal(); // shortcut HashTableX *ht = &g_spiderLoop.m_lockTable; int32_t hostId = g_hostdb.getHostId ( udpSlot->m_ip , udpSlot->m_port ); // this must be legit - sanity check if ( hostId < 0 ) { char *xx=NULL;*xx=0; } // remove expired locks from locktable removeExpiredLocks ( hostId ); int64_t lockKey = lr->m_lockKeyUh48; // check tree int32_t slot = ht->getSlot ( &lockKey ); // lr->m_lockKeyUh48 ); // put it here UrlLock *lock = NULL; // if there say no no if ( slot >= 0 ) lock = (UrlLock *)ht->getValueFromSlot ( slot ); // if doing a remove operation and that was our hostid then unlock it if ( lr->m_removeLock && lock && lock->m_hostId == hostId && lock->m_lockSequence == lr->m_lockSequence ) { // note it for now if ( g_conf.m_logDebugSpider ) log("spider: removing lock for lockkey=%" PRIu64" hid=%" PRId32, lr->m_lockKeyUh48,hostId); // unlock it ht->removeSlot ( slot ); // it is gone lock = NULL; } // ok, at this point all remove ops return if ( lr->m_removeLock ) { reply[0] = 1; us->sendReply_ass ( reply , 1 , reply , 1 , udpSlot ); return; } ///////// // // add new lock // ///////// // if lock > 1 hour old then remove it automatically!! if ( lock && nowGlobal - lock->m_timestamp > MAX_LOCK_AGE ) { // note it for now log("spider: removing lock after %" PRId32" seconds " "for lockKey=%" PRIu64" hid=%" PRId32, (nowGlobal - lock->m_timestamp), lr->m_lockKeyUh48,hostId); // unlock it ht->removeSlot ( slot ); // it is gone lock = NULL; } // if lock still there, do not grant another lock if ( lock ) { // note it for now if ( g_conf.m_logDebugSpider ) log("spider: refusing lock for lockkey=%" PRIu64" hid=%" PRId32, lr->m_lockKeyUh48,hostId); reply[0] = 0; us->sendReply_ass ( reply , 1 , reply , 1 , udpSlot ); return; } // make the new lock UrlLock tmp; tmp.m_hostId = hostId; tmp.m_lockSequence = lr->m_lockSequence; tmp.m_timestamp = nowGlobal; tmp.m_expires = 0; tmp.m_firstIp = lr->m_firstIp; tmp.m_collnum = lr->m_collnum; // when the spider returns we remove its lock on reception of the // spiderReply, however, we actually just set the m_expires time // to 5 seconds into the future in case there is a current request // to get a lock for that url in progress. but, we do need to // indicate that the spider has indeed completed by setting // m_spiderOutstanding to true. this way, addToWaitingTree() will // not count it towards a "max spiders per IP" quota when deciding // on if it should add a new entry for this IP. 
tmp.m_spiderOutstanding = true; // this is set when all hosts in the group (shard) have granted the // lock and the host sends out a confirmLockAcquisition() request. // until then we do not know if the lock will be granted by all hosts // in the group (shard) tmp.m_confirmed = false; // put it into the table if ( ! ht->addKey ( &lockKey , &tmp ) ) { // return error if that failed! log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__); us->sendErrorReply ( udpSlot , g_errno ); return; } // note it for now if ( g_conf.m_logDebugSpider ) log("spider: granting lock for lockKey=%" PRIu64" hid=%" PRId32, lr->m_lockKeyUh48,hostId); // grant the lock reply[0] = 1; us->sendReply_ass ( reply , 1 , reply , 1 , udpSlot ); return; }
// langId is language of the query long long getSynBaseHash64 ( char *qstr , uint8_t langId ) { Words ww; ww.set3 ( qstr ); long nw = ww.getNumWords(); long long *wids = ww.getWordIds(); //char **wptrs = ww.getWords(); //long *wlens = ww.getWordLens(); long long baseHash64 = 0LL; Synonyms syn; // assume english if unknown to fix 'pandora's tower' // vs 'pandoras tower' where both words are in both // english and german so langid is unknown if ( langId == langUnknown ) langId = langEnglish; // . store re-written query into here then hash that string // . this way we can get rid of spaces //char rebuf[1024]; //char *p = rebuf; //if ( strstr(qstr,"cheatcodes") ) // log("hey"); // for deduping HashTableX dups; if ( ! dups.set ( 8,0,1024,NULL,0,false,0,"qhddup") ) return false; // scan the words for ( long i = 0 ; i < nw ; i++ ) { // skip if not alnum if ( ! wids[i] ) continue; // get its synonyms into tmpBuf char tmpBuf[TMPSYNBUFSIZE]; // . assume niceness of 0 for now // . make sure to get all synsets!! ('love' has two synsets) long naids = syn.getSynonyms (&ww,i,langId,tmpBuf,0); // term freq algo //long pop = g_speller.getPhrasePopularity(NULL, // wids[i], // true, // langId); // is it a queryStopWord like "the" or "and"? bool isQueryStop = ::isQueryStopWord(NULL,0,wids[i]); // a more restrictive list bool isStop = ::isStopWord(NULL,0,wids[i]); if ( ::isCommonQueryWordInEnglish(wids[i]) ) isStop = true; // find the smallest one unsigned long long min = wids[i]; //char *minWordPtr = wptrs[i]; //long minWordLen = wlens[i]; // declare up here since we have a goto below long j; // add to table too if ( dups.isInTable ( &min ) ) goto gotdup; // add to it if ( ! dups.addKey ( &min ) ) return false; // now scan the synonyms, they do not include "min" in them for ( j = 0 ; j < naids ; j++ ) { // get it unsigned long long aid64; aid64 = (unsigned long long)syn.m_aids[j]; // if any syn already hashed then skip it and count // as a repeated term. we have to do it this way // rather than just getting the minimum synonym // word id, because 'love' has two synsets and // 'like', a synonym of 'love' only has one synset // and they end up having different minimum synonym // word ids!!! if ( dups.isInTable ( &aid64 ) ) break; // add it. this could fail! if ( ! dups.addKey ( &aid64 ) ) return false; // set it? if ( aid64 >= min ) continue; // got a new min min = aid64; //minWordPtr = syn.m_termPtrs[j]; //minWordLen = syn.m_termLens[j]; // get largest term freq of all synonyms //long pop2 = g_speller.getPhrasePopularity(NULL,aid64, // true,langId); //if ( pop2 > pop ) pop = pop2; } // early break out means a hit in dups table if ( j < naids ) { gotdup: // do not count as repeat if query stop word // because they often repeat if ( isQueryStop ) continue; // count # of repeated word forms //nrwf++; continue; } // hash that now // do not include stop words in synbasehash so // 'search the web' != 'search web' if ( ! isStop ) { // no! make it order independent so 'search the web' // equals 'web the search' and 'engine search' // equals 'search engine' //baseHash64 <<= 1LL; baseHash64 ^= min; } // count it, but only if not a query stop word like "and" // or "the" or "a". # of unique word forms. //if ( ! isQueryStop ) nuwf++; // get term freq //if ( pop > maxPop ) maxPop = pop; // control word? //if ( wids[i] == cw1 ) ncwf++; } return baseHash64; }
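// Illustrative property of getSynBaseHash64() above (hypothetical queries):
// each query word contributes the minimum of its word/synonym ids via XOR,
// and XOR is commutative, so word order does not change the hash:
//
//	getSynBaseHash64 ( "engine search" , langEnglish ) ==
//	getSynBaseHash64 ( "search engine" , langEnglish )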
// . so now this adds a list of Synonyms to the m_pools[] and returns a ptr // to the first one. // . then the parent caller can store that ptr in the m_wordToSyn[] array // which we pre-alloc upon calling the set() function based on the # of // words we got // . returns # of synonyms stored into "tmpBuf" long Synonyms::getSynonyms ( Words *words , long wordNum , uint8_t langId , char *tmpBuf , long niceness ) { // punct words have no synoyms if ( ! words->m_wordIds[wordNum] ) return 0; // store these m_words = words; m_docLangId = langId; m_niceness = niceness; // sanity check if ( wordNum > m_words->m_numWords ) { char *xx=NULL;*xx=0; } // init the dedup table to dedup wordIds HashTableX dt; char dbuf[512]; dt.set(8,0,12,dbuf,512,false,m_niceness,"altwrds"); long maxSyns = (long)MAX_SYNS; char *bufPtr = tmpBuf; // point into buffer m_aids = (long long *)bufPtr; bufPtr += maxSyns * 8; // then the word ids m_wids0 = (long long *)bufPtr; bufPtr += maxSyns * 8; // second word ids, for multi alnum word synonyms, i.e. "New Jersey" m_wids1 = (long long *)bufPtr; bufPtr += maxSyns * 8; m_termPtrs = (char **)bufPtr; bufPtr += maxSyns * 4; m_termLens = (long *)bufPtr; bufPtr += maxSyns * 4; m_numAlnumWords = (long *)bufPtr; bufPtr += maxSyns * 4; m_numAlnumWordsInBase = (long *)bufPtr; bufPtr += maxSyns * 4; // source m_src = bufPtr; bufPtr += maxSyns; // cursors m_aidsPtr = m_aids; m_wids0Ptr = m_wids0; m_wids1Ptr = m_wids1; m_srcPtr = m_src; m_termPtrsPtr = m_termPtrs; m_termLensPtr = m_termLens; m_numAlnumWordsPtr = m_numAlnumWords; m_numAlnumWordsInBasePtr = m_numAlnumWordsInBase; char *w = m_words->m_words [wordNum]; long wlen = m_words->m_wordLens[wordNum]; // // NOW hit wiktionary // Trust this less then our s_exceptions above, but more than // our morph computations below // char sourceId = SOURCE_WIKTIONARY; char *ss = NULL; long long bwid; char wikiLangId = m_docLangId; bool hadSpace ; long klen ; long baseNumAlnumWords; tryOtherLang: /* // if word only exists in one language, assume that language for word // even if m_docLangId is langUnknown (0) if ( ! ss && ! m_docLangId && ! wikiLangId ) { // get raw word id bwid = m_words->m_wordIds[wordNum]; // each lang has its own bit long long bits = g_speller.getLangBits64 ( &bwid ); // skip if not unique char count = getNumBitsOn64 ( bits ) ; // if we only got one lang we could be, assume that if ( count == 1 ) // get it. bit #0 is english, so add 1 wikiLangId = getBitPosLL((uint8_t *)&bits) + 1; // try setting based on script. greek. russian. etc. // if the word was not in the wiktionary. // this will be langUnknown if not definitive. else wikiLangId = getCharacterLanguage(w); } */ // try looking up bigram so "new jersey" gets "nj" as synonym if ( wikiLangId && wordNum+2< m_words->m_numWords && m_words->m_wordIds[wordNum+2]) { // get phrase id bigram then long conti = 0; bwid = hash64Lower_utf8_cont(w,wlen,0,&conti); // then the next word char *wp2 = m_words->m_words[wordNum+2]; long wlen2 = m_words->m_wordLens[wordNum+2]; bwid = hash64Lower_utf8_cont(wp2,wlen2,bwid,&conti); baseNumAlnumWords = 2; ss = g_wiktionary.getSynSet( bwid, wikiLangId ); } // need a language for wiktionary to work with if ( wikiLangId && ! ss ) { // get raw word id bwid = m_words->m_wordIds[wordNum]; baseNumAlnumWords = 1; //if ( bwid == 1424622907102375150LL) // log("a"); ss = g_wiktionary.getSynSet( bwid, wikiLangId ); // if that failed try removing 's from word if there if ( ! 
ss && wlen >= 3 && w[wlen-2]=='\'' && w[wlen-1]=='s' ) { long long cwid = hash64Lower_utf8(w,wlen-2); ss = g_wiktionary.getSynSet( cwid, wikiLangId ); } } // even though a document may be in german it often has some // english words "pdf download" "copyright" etc. so if the word // has no synset in german, try it in english if ( //numPresets == 0 && ! ss && m_docLangId != langEnglish && wikiLangId != langEnglish && m_docLangId && g_speller.getSynsInEnglish(w,wlen,m_docLangId,langEnglish) ) { // try english wikiLangId = langEnglish; sourceId = SOURCE_WIKTIONARY_EN; goto tryOtherLang; } // if it was in wiktionary, just use that synset if ( ss ) { // prepare th HashTableX dedup; HashTableX *dd = NULL; char dbuf[512]; long count = 0; addSynSet: // do we have another set following this char *next = g_wiktionary.getNextSynSet(bwid,m_docLangId,ss); // if so, init the dedup table then if ( next && ! dd ) { dd = &dedup; dd->set ( 8,0,8,dbuf,512,false,m_niceness,"sddbuf"); } // skip over the pipe i guess char *pipe = ss + 2; // zh_ch? if ( *pipe == '_' ) pipe += 3; // sanity if ( *pipe != '|' ) { char *xx=NULL;*xx=0; } // point to word list char *p = pipe + 1; // hash up the list of words, they are in utf8 and char *e = p + 1; // save count in case we need to undo //long saved = m_numAlts[wordNum]; hashLoop: // skip synonyms that are anagrams because its to ambiguous // the are mappings like // "PC" -> "PC,Personal Computer" // "PC" -> "PC,Probable Cause" ... (lots more!) //bool isAnagram = true; for ( ; *e !='\n' && *e != ',' ; e++ ) ; // if ( ! is_upper_a(*e) ) isAnagram = false; // get it long long h = hash64Lower_utf8_nospaces ( p , e - p ); // skip if same as base word if ( h == bwid ) goto getNextSyn; // should we check for dups? if ( dd ) { // skip dups if ( dd->isInTable(&h) ) goto getNextSyn; // dedup. return false with g_errno set on error if ( ! dd->addKey(&h) ) return m_aidsPtr - m_aids; } // store it *m_aidsPtr++ = h; // store source *m_srcPtr++ = sourceId; hadSpace = false; klen = e - p; for ( long k = 0 ; k < klen ; k++ ) if ( is_wspace_a(p[k]) ) hadSpace = true; *m_termPtrsPtr++ = p; *m_termLensPtr++ = e-p; // only for multi-word synonyms like "New Jersey"... *m_wids0Ptr = 0LL; *m_wids1Ptr = 0LL; *m_numAlnumWordsPtr = 1; // and for multi alnum word synonyms if ( hadSpace ) { Words sw; sw.setx ( p , e - p , m_niceness ); *(long long *)m_wids0Ptr = sw.m_wordIds[0]; *(long long *)m_wids1Ptr = sw.m_wordIds[2]; *(long *)m_numAlnumWordsPtr = sw.getNumAlnumWords(); } m_wids0Ptr++; m_wids1Ptr++; m_numAlnumWordsPtr++; // how many words did we have to hash to find a synset? // i.e. "new jersey" would be 2, to get "nj" *m_numAlnumWordsInBasePtr++ = baseNumAlnumWords; // do not breach if ( ++count >= maxSyns ) goto done; getNextSyn: // loop for more if ( *e == ',' ) { e++; p = e; goto hashLoop; } // add in the next syn set, deduped if ( next ) { ss = next; goto addSynSet; } // wrap it up done: // all done return m_aidsPtr - m_aids; } // strip marks from THIS word, return -1 w/ g_errno set on error if ( ! addStripped ( w , wlen,&dt ) ) return m_aidsPtr - m_aids; // returns false with g_errno set if ( ! addAmpPhrase ( wordNum, &dt ) ) return m_aidsPtr - m_aids; // if we end in apostrophe, strip and add if ( wlen>= 3 && w[wlen-1] == 's' && w[wlen-2]=='\'' && ! addWithoutApostrophe ( wordNum, &dt ) ) return m_aidsPtr - m_aids; return m_aidsPtr - m_aids; }
// returns false if blocked, true otherwise bool Statsdb::gifLoop ( ) { // shortcut Msg5 *m = &m_msg5; //#ifndef _USEPLOTTER_ //return true; //#endif // loop over all the lists in the time range, [m_t1,m_t2] for ( ; ! m_done ; ) { if ( ! m->getList ( (char)RDB_STATSDB , "statsdb" , // coll &m_list , (char *)&m_startKey , (char *)&m_endKey , 32000 , // requested scan size true , // include tree? false , // add to cache? 0 , // max cache age 0 , // start file number -1 , // number of files NULL , // state gotListWrapper, // callback m_niceness , // niceness false , // do error correction? NULL , // cache key pointer 0 , // # retries -1 , // max # retries true , // compensate for merge? -1 , // sync point NULL ) ) // msg5b return false; // . process list // . returns false with g_errno set on error if ( ! processList() ) return true; } // define time delta - commented out because it's currently not used. long dt = m_t2 - m_t1; //#ifdef _USEPLOTTER_ // gif size //char tmp[64]; // dimensions of the gif //sprintf ( tmp , "%lix%li", (long)DX+m_bx*2 , (long)DY+m_by*2 ); //GIFPlotter::parampl ( "BITMAPSIZE" , (void *)tmp ); // create one //GIFPlotter plotter ( NULL , m_fd , NULL ); // open it //plotter.openpl ( ); // define the space with boundaries 100 unit wide boundaries //plotter.space ( 0 , 0 , DX + m_bx * 2 , DY + m_by * 2 ); // line thickness in user coordinates (pixels for us) //plotter.linewidth ( 1 ); // set bg color to gray (r/g/b) //plotter.bgcolor ( 0xd600 , 0xce00 , 0xd600 ); // erase Plotter's graphics display //plotter.erase (); // draw axises in black //plotter.pencolorname ("black"); // // main graphing window // m_gw.safePrintf("<div style=\"position:relative;" "background-color:#c0c0c0;" //"overflow-y:hidden;" "overflow-x:hidden;" "z-index:-10;" // the tick marks we print below are based on it // being a window of the last 20 seconds... and using // DX pixels "min-width:%lipx;" "min-height:%lipx;" //"width:100%%;" //"min-height:600px;" "margin-top:10px;" "margin-bottom:10px;" "margin-right:10px;" "margin-left:10px;\">" ,(long)DX + 2 *m_bx ,(long)DY + 2*m_by); // draw the x-axis //plotter.line ( m_bx , m_by , DX + m_bx , m_by ); // 10 x-axis tick marks for ( int x = DX/20 ; x <= DX ; x += DX/20 ) { // tick mark //plotter.line ( x , -20 , x , 20 ); m_gw.safePrintf("<div style=\"position:absolute;" "left:%li;" "bottom:0;" "background-color:#000000;" "z-index:110;" "min-height:20px;" "min-width:3px;\"></div>\n" , m_bx + (long)x-1 ); long xv = (long)(dt * (long long)x/(long long)DX)-(long)dt; // LABEL m_gw.safePrintf("<div style=\"position:absolute;" "left:%li;" "bottom:20;" //"background-color:#000000;" "z-index:110;" "min-height:20px;" "min-width:3px;\">%lis</div>\n" , (long)x-10 + m_bx // the label: , xv ); } HashTableX tmpht; tmpht.set(4,0,0,NULL,0,false,m_niceness,"statsparms"); long col = 0; m_sb2->safePrintf("<table border=1 width=100%%>\n"); // label offset to prevent collisions of superimposing multiple // graph calbrations long zoff = 0; // // point to the triplets in m_sb1's buffer (x,y,c) // char *p = m_sb1.getBufStart(); char *pend = p + m_sb1.length(); for ( ; p < pend ; p += 12 ) { // breathe QUICKPOLL ( m_niceness ); // get graph hash of this point long gh = *(long *)(p +8); // if we already did this graph, skip it if ( tmpht.isInTable ( &gh ) ) continue; // . graph this single graph of this color // . returns ptr to first point of different color! 
plotGraph ( p , pend , gh , m_gw , zoff ); // prevent collisions zoff += 20; // get the label based on graphHash Label *bb = getLabel ( gh ); // add to key if ( col == 0 ) m_sb2->safePrintf("<tr>"); m_sb2->safePrintf("<td bgcolor=#%06lx> </td>" "<td>%s</td>\n", bb->m_color , bb->m_keyDesc ); if ( col == 1 ) m_sb2->safePrintf("</tr>\n"); // inc column and wrap if ( ++col >= 2 ) col = 0; // . do not re-display // . TODO: deal with error tmpht.addKey ( &gh ); } // clear that up m_sb1.reset(); // now plot the events, horizontal line segments like the performance // graph uses for ( long i = 0 ; i < m_ht3.m_numSlots ; i++ ) { // breathe QUICKPOLL ( m_niceness ); // skip if slot empty if ( ! m_ht3.m_flags[i] ) continue; // get the offset into m_sb3 long offset = *(long *)m_ht3.getValueFromSlot(i); // get buf start char *bufStart = m_sb3.getBufStart(); // get the ptr EventPoint *pp = (EventPoint *)(bufStart + offset); // get name of parm Parm *m = g_parms.getParmFromParmHash ( pp->m_parmHash ); // make sure we got it if ( ! m ) { log("statsdb: unrecognized parm hash = %li", pp->m_parmHash); continue; //char *xx=NULL;*xx=0; } } // set the line width //plotter.linewidth ( pp->m_thickness ); // get parm hash long colorHash = pp->m_parmHash; // add in old/new values to make it different colorHash = hash32h ( (long)pp->m_oldVal , colorHash ); colorHash = hash32h ( (long)pp->m_newVal , colorHash ); // . get color // . is really the parm hash in disguise long c1 = colorHash & 0x00ffffff; // use the color specified from addStat_r() for this line/pt //plotter.pencolor ( ((c1 >> 16) & 0xff) << 8 , // ((c1 >> 8) & 0xff) << 8 , // ((c1 >> 0) & 0xff) << 8 ); long x1 = pp->m_a; long x2 = pp->m_b; long y1 = *(long *)m_ht3.getKey(i); // i value // ensure at least 3 units wide for visibility if ( x2 < x1 + 10 ) x2 = x1 + 10; // . flip the y so we don't have to scroll the browser down // . DY does not include the axis and tick marks //long fy1 = DY - y1 + m_by ; // plot it //plotter.line ( x1 , fy1 , x2 , fy1 ); drawLine3 ( m_gw , x1 , x2 , y1 , c1 , pp->m_thickness ); // add to map key? only if we haven't already if ( tmpht.isInTable ( &colorHash ) ) continue; // add it if ( col == 0 ) m_sb2->safePrintf("<tr>"); char *title = "unknown parm"; if ( m ) title = m->m_title; m_sb2->safePrintf("<td bgcolor=#%06lx> </td>",c1); // print the parm name and old/new values m_sb2->safePrintf("<td><b>%s</b>",title); if ( pp->m_oldVal != pp->m_newVal ) m_sb2->safePrintf(" (%.02f -> %.02f)", pp->m_oldVal,pp->m_newVal); m_sb2->safePrintf("</td>"); if ( col == 1 ) m_sb2->safePrintf("</tr>\n"); // inc column and wrap if ( ++col >= 2 ) col = 0; // . do not re-display // . TODO: deal with error tmpht.addKey ( &colorHash ) ; } m_sb2->safePrintf("</table>\n"); // clear that up m_ht3.reset(); m_sb3.reset(); // and stat states m_ht0.reset(); m_sb0.reset(); // all done free some mem m_sb1.reset(); //m_sb2.reset(); // // but not m_sb2 cuz that has the html in it!! // // all done //if ( plotter.closepl () < 0 ) // log("admin: Could not close performance graph object."); // close the file //fclose ( m_fd ); //#endif // close main graphing window m_gw.safePrintf("</div>\n"); return true; }
static bool isTLD ( char *tld , int32_t tldLen ) { int32_t pcount = 0; // now they are random! for ( int32_t i = 0 ; i < tldLen ; i++ ) { // period count if ( tld[i] == '.' ) { pcount++; continue; } if ( ! is_alnum_a(tld[i]) && tld[i] != '-' ) return false; } if ( pcount == 0 ) return true; if ( pcount >= 2 ) return false; // otherwise, if one period, check table to see if qualified // we use this as our hashtable static bool s_isInitialized = false; // . i shrunk this list a lot // . see backups for the hold list static const char * const s_tlds[] = { // From: https://data.iana.org/TLD/tlds-alpha-by-domain.txt "AAA", "AARP", "ABB", "ABBOTT", "ABBVIE", "ABOGADO", "ABUDHABI", "AC", "ACADEMY", "ACCENTURE", "ACCOUNTANT", "ACCOUNTANTS", "ACO", "ACTIVE", "ACTOR", "AD", "ADAC", "ADS", "ADULT", "AE", "AEG", "AERO", "AF", "AFL", "AG", "AGAKHAN", "AGENCY", "AI", "AIG", "AIRFORCE", "AIRTEL", "AKDN", "AL", "ALIBABA", "ALIPAY", "ALLFINANZ", "ALLY", "ALSACE", "AM", "AMICA", "AMSTERDAM", "ANALYTICS", "ANDROID", "ANQUAN", "AO", "APARTMENTS", "APP", "APPLE", "AQ", "AQUARELLE", "AR", "ARAMCO", "ARCHI", "ARMY", "ARPA", "ARTE", "AS", "ASIA", "ASSOCIATES", "AT", "ATTORNEY", "AU", "AUCTION", "AUDI", "AUDIO", "AUTHOR", "AUTO", "AUTOS", "AVIANCA", "AW", "AWS", "AX", "AXA", "AZ", "AZURE", "BA", "BABY", "BAIDU", "BAND", "BANK", "BAR", "BARCELONA", "BARCLAYCARD", "BARCLAYS", "BAREFOOT", "BARGAINS", "BAUHAUS", "BAYERN", "BB", "BBC", "BBVA", "BCG", "BCN", "BD", "BE", "BEATS", "BEER", "BENTLEY", "BERLIN", "BEST", "BET", "BF", "BG", "BH", "BHARTI", "BI", "BIBLE", "BID", "BIKE", "BING", "BINGO", "BIO", "BIZ", "BJ", "BLACK", "BLACKFRIDAY", "BLOOMBERG", "BLUE", "BM", "BMS", "BMW", "BN", "BNL", "BNPPARIBAS", "BO", "BOATS", "BOEHRINGER", "BOM", "BOND", "BOO", "BOOK", "BOOTS", "BOSCH", "BOSTIK", "BOT", "BOUTIQUE", "BR", "BRADESCO", "BRIDGESTONE", "BROADWAY", "BROKER", "BROTHER", "BRUSSELS", "BS", "BT", "BUDAPEST", "BUGATTI", "BUILD", "BUILDERS", "BUSINESS", "BUY", "BUZZ", "BV", "BW", "BY", "BZ", "BZH", "CA", "CAB", "CAFE", "CAL", "CALL", "CAMERA", "CAMP", "CANCERRESEARCH", "CANON", "CAPETOWN", "CAPITAL", "CAR", "CARAVAN", "CARDS", "CARE", "CAREER", "CAREERS", "CARS", "CARTIER", "CASA", "CASH", "CASINO", "CAT", "CATERING", "CBA", "CBN", "CC", "CD", "CEB", "CENTER", "CEO", "CERN", "CF", "CFA", "CFD", "CG", "CH", "CHANEL", "CHANNEL", "CHASE", "CHAT", "CHEAP", "CHLOE", "CHRISTMAS", "CHROME", "CHURCH", "CI", "CIPRIANI", "CIRCLE", "CISCO", "CITIC", "CITY", "CITYEATS", "CK", "CL", "CLAIMS", "CLEANING", "CLICK", "CLINIC", "CLINIQUE", "CLOTHING", "CLOUD", "CLUB", "CLUBMED", "CM", "CN", "CO", "COACH", "CODES", "COFFEE", "COLLEGE", "COLOGNE", "COM", "COMMBANK", "COMMUNITY", "COMPANY", "COMPARE", "COMPUTER", "COMSEC", "CONDOS", "CONSTRUCTION", "CONSULTING", "CONTACT", "CONTRACTORS", "COOKING", "COOL", "COOP", "CORSICA", "COUNTRY", "COUPON", "COUPONS", "COURSES", "CR", "CREDIT", "CREDITCARD", "CREDITUNION", "CRICKET", "CROWN", "CRS", "CRUISES", "CSC", "CU", "CUISINELLA", "CV", "CW", "CX", "CY", "CYMRU", "CYOU", "CZ", "DABUR", "DAD", "DANCE", "DATE", "DATING", "DATSUN", "DAY", "DCLK", "DE", "DEALER", "DEALS", "DEGREE", "DELIVERY", "DELL", "DELOITTE", "DELTA", "DEMOCRAT", "DENTAL", "DENTIST", "DESI", "DESIGN", "DEV", "DIAMONDS", "DIET", "DIGITAL", "DIRECT", "DIRECTORY", "DISCOUNT", "DJ", "DK", "DM", "DNP", "DO", "DOCS", "DOG", "DOHA", "DOMAINS", "DOWNLOAD", "DRIVE", "DUBAI", "DURBAN", "DVAG", "DZ", "EARTH", "EAT", "EC", "EDEKA", "EDU", "EDUCATION", "EE", "EG", "EMAIL", "EMERCK", "ENERGY", "ENGINEER", "ENGINEERING", "ENTERPRISES", "EPSON", 
"EQUIPMENT", "ER", "ERNI", "ES", "ESQ", "ESTATE", "ET", "EU", "EUROVISION", "EUS", "EVENTS", "EVERBANK", "EXCHANGE", "EXPERT", "EXPOSED", "EXPRESS", "EXTRASPACE", "FAGE", "FAIL", "FAIRWINDS", "FAITH", "FAMILY", "FAN", "FANS", "FARM", "FASHION", "FAST", "FEEDBACK", "FERRERO", "FI", "FILM", "FINAL", "FINANCE", "FINANCIAL", "FIRESTONE", "FIRMDALE", "FISH", "FISHING", "FIT", "FITNESS", "FJ", "FK", "FLICKR", "FLIGHTS", "FLORIST", "FLOWERS", "FLSMIDTH", "FLY", "FM", "FO", "FOO", "FOOTBALL", "FORD", "FOREX", "FORSALE", "FORUM", "FOUNDATION", "FOX", "FR", "FRESENIUS", "FRL", "FROGANS", "FRONTIER", "FTR", "FUND", "FURNITURE", "FUTBOL", "FYI", "GA", "GAL", "GALLERY", "GALLO", "GALLUP", "GAME", "GARDEN", "GB", "GBIZ", "GD", "GDN", "GE", "GEA", "GENT", "GENTING", "GF", "GG", "GGEE", "GH", "GI", "GIFT", "GIFTS", "GIVES", "GIVING", "GL", "GLASS", "GLE", "GLOBAL", "GLOBO", "GM", "GMAIL", "GMBH", "GMO", "GMX", "GN", "GOLD", "GOLDPOINT", "GOLF", "GOO", "GOOG", "GOOGLE", "GOP", "GOT", "GOV", "GP", "GQ", "GR", "GRAINGER", "GRAPHICS", "GRATIS", "GREEN", "GRIPE", "GROUP", "GS", "GT", "GU", "GUCCI", "GUGE", "GUIDE", "GUITARS", "GURU", "GW", "GY", "HAMBURG", "HANGOUT", "HAUS", "HDFCBANK", "HEALTH", "HEALTHCARE", "HELP", "HELSINKI", "HERE", "HERMES", "HIPHOP", "HITACHI", "HIV", "HK", "HM", "HN", "HOCKEY", "HOLDINGS", "HOLIDAY", "HOMEDEPOT", "HOMES", "HONDA", "HORSE", "HOST", "HOSTING", "HOTELES", "HOTMAIL", "HOUSE", "HOW", "HR", "HSBC", "HT", "HTC", "HU", "HYUNDAI", "IBM", "ICBC", "ICE", "ICU", "ID", "IE", "IFM", "IINET", "IL", "IM", "IMAMAT", "IMMO", "IMMOBILIEN", "IN", "INDUSTRIES", "INFINITI", "INFO", "ING", "INK", "INSTITUTE", "INSURANCE", "INSURE", "INT", "INTERNATIONAL", "INVESTMENTS", "IO", "IPIRANGA", "IQ", "IR", "IRISH", "IS", "ISELECT", "ISMAILI", "IST", "ISTANBUL", "IT", "ITAU", "IWC", "JAGUAR", "JAVA", "JCB", "JCP", "JE", "JETZT", "JEWELRY", "JLC", "JLL", "JM", "JMP", "JNJ", "JO", "JOBS", "JOBURG", "JOT", "JOY", "JP", "JPMORGAN", "JPRS", "JUEGOS", "KAUFEN", "KDDI", "KE", "KERRYHOTELS", "KERRYLOGISTICS", "KERRYPROPERTIES", "KFH", "KG", "KH", "KI", "KIA", "KIM", "KINDER", "KITCHEN", "KIWI", "KM", "KN", "KOELN", "KOMATSU", "KP", "KPMG", "KPN", "KR", "KRD", "KRED", "KUOKGROUP", "KW", "KY", "KYOTO", "KZ", "LA", "LACAIXA", "LAMBORGHINI", "LAMER", "LANCASTER", "LAND", "LANDROVER", "LANXESS", "LASALLE", "LAT", "LATROBE", "LAW", "LAWYER", "LB", "LC", "LDS", "LEASE", "LECLERC", "LEGAL", "LEXUS", "LGBT", "LI", "LIAISON", "LIDL", "LIFE", "LIFEINSURANCE", "LIFESTYLE", "LIGHTING", "LIKE", "LIMITED", "LIMO", "LINCOLN", "LINDE", "LINK", "LIPSY", "LIVE", "LIVING", "LIXIL", "LK", "LOAN", "LOANS", "LOCUS", "LOL", "LONDON", "LOTTE", "LOTTO", "LOVE", "LR", "LS", "LT", "LTD", "LTDA", "LU", "LUPIN", "LUXE", "LUXURY", "LV", "LY", "MA", "MADRID", "MAIF", "MAISON", "MAKEUP", "MAN", "MANAGEMENT", "MANGO", "MARKET", "MARKETING", "MARKETS", "MARRIOTT", "MBA", "MC", "MD", "ME", "MED", "MEDIA", "MEET", "MELBOURNE", "MEME", "MEMORIAL", "MEN", "MENU", "MEO", "MG", "MH", "MIAMI", "MICROSOFT", "MIL", "MINI", "MK", "ML", "MLS", "MM", "MMA", "MN", "MO", "MOBI", "MOBILY", "MODA", "MOE", "MOI", "MOM", "MONASH", "MONEY", "MONTBLANC", "MORMON", "MORTGAGE", "MOSCOW", "MOTORCYCLES", "MOV", "MOVIE", "MOVISTAR", "MP", "MQ", "MR", "MS", "MT", "MTN", "MTPC", "MTR", "MU", "MUSEUM", "MUTUAL", "MUTUELLE", "MV", "MW", "MX", "MY", "MZ", "NA", "NADEX", "NAGOYA", "NAME", "NATURA", "NAVY", "NC", "NE", "NEC", "NET", "NETBANK", "NETWORK", "NEUSTAR", "NEW", "NEWS", "NEXT", "NEXTDIRECT", "NEXUS", "NF", "NG", "NGO", "NHK", "NI", "NICO", "NIKON", "NINJA", 
"NISSAN", "NISSAY", "NL", "NO", "NOKIA", "NORTHWESTERNMUTUAL", "NORTON", "NOWRUZ", "NP", "NR", "NRA", "NRW", "NTT", "NU", "NYC", "NZ", "OBI", "OFFICE", "OKINAWA", "OLAYAN", "OM", "OMEGA", "ONE", "ONG", "ONL", "ONLINE", "OOO", "ORACLE", "ORANGE", "ORG", "ORGANIC", "ORIGINS", "OSAKA", "OTSUKA", "OVH", "PA", "PAGE", "PAMPEREDCHEF", "PANERAI", "PARIS", "PARS", "PARTNERS", "PARTS", "PARTY", "PASSAGENS", "PE", "PET", "PF", "PG", "PH", "PHARMACY", "PHILIPS", "PHOTO", "PHOTOGRAPHY", "PHOTOS", "PHYSIO", "PIAGET", "PICS", "PICTET", "PICTURES", "PID", "PIN", "PING", "PINK", "PIZZA", "PK", "PL", "PLACE", "PLAY", "PLAYSTATION", "PLUMBING", "PLUS", "PM", "PN", "POHL", "POKER", "P**N", "POST", "PR", "PRAXI", "PRESS", "PRO", "PROD", "PRODUCTIONS", "PROF", "PROGRESSIVE", "PROMO", "PROPERTIES", "PROPERTY", "PROTECTION", "PS", "PT", "PUB", "PW", "PWC", "PY", "QA", "QPON", "QUEBEC", "QUEST", "RACING", "RE", "READ", "REALTOR", "REALTY", "RECIPES", "RED", "REDSTONE", "REDUMBRELLA", "REHAB", "REISE", "REISEN", "REIT", "REN", "RENT", "RENTALS", "REPAIR", "REPORT", "REPUBLICAN", "REST", "RESTAURANT", "REVIEW", "REVIEWS", "REXROTH", "RICH", "RICOH", "RIO", "RIP", "RO", "ROCHER", "ROCKS", "RODEO", "ROOM", "RS", "RSVP", "RU", "RUHR", "RUN", "RW", "RWE", "RYUKYU", "SA", "SAARLAND", "SAFE", "SAFETY", "SAKURA", "SALE", "SALON", "SAMSUNG", "SANDVIK", "SANDVIKCOROMANT", "SANOFI", "SAP", "SAPO", "SARL", "SAS", "SAXO", "SB", "SBI", "SBS", "SC", "SCA", "SCB", "SCHAEFFLER", "SCHMIDT", "SCHOLARSHIPS", "SCHOOL", "SCHULE", "SCHWARZ", "SCIENCE", "SCOR", "SCOT", "SD", "SE", "SEAT", "SECURITY", "SEEK", "SELECT", "SENER", "SERVICES", "SEVEN", "SEW", "SEX", "SEXY", "SFR", "SG", "SH", "SHARP", "SHAW", "SHELL", "SHIA", "SHIKSHA", "SHOES", "SHOUJI", "SHOW", "SHRIRAM", "SI", "SINA", "SINGLES", "SITE", "SJ", "SK", "SKI", "SKIN", "SKY", "SKYPE", "SL", "SM", "SMILE", "SN", "SNCF", "SO", "SOCCER", "SOCIAL", "SOFTBANK", "SOFTWARE", "SOHU", "SOLAR", "SOLUTIONS", "SONG", "SONY", "SOY", "SPACE", "SPIEGEL", "SPOT", "SPREADBETTING", "SR", "SRL", "ST", "STADA", "STAR", "STARHUB", "STATEBANK", "STATEFARM", "STATOIL", "STC", "STCGROUP", "STOCKHOLM", "STORAGE", "STORE", "STREAM", "STUDIO", "STUDY", "STYLE", "SU", "SUCKS", "SUPPLIES", "SUPPLY", "SUPPORT", "SURF", "SURGERY", "SUZUKI", "SV", "SWATCH", "SWISS", "SX", "SY", "SYDNEY", "SYMANTEC", "SYSTEMS", "SZ", "TAB", "TAIPEI", "TALK", "TAOBAO", "TATAMOTORS", "TATAR", "TATTOO", "TAX", "TAXI", "TC", "TCI", "TD", "TEAM", "TECH", "TECHNOLOGY", "TEL", "TELECITY", "TELEFONICA", "TEMASEK", "TENNIS", "TEVA", "TF", "TG", "TH", "THD", "THEATER", "THEATRE", "TICKETS", "TIENDA", "TIFFANY", "TIPS", "TIRES", "TIROL", "TJ", "TK", "TL", "TM", "TMALL", "TN", "TO", "TODAY", "TOKYO", "TOOLS", "TOP", "TORAY", "TOSHIBA", "TOTAL", "TOURS", "TOWN", "TOYOTA", "TOYS", "TR", "TRADE", "TRADING", "TRAINING", "TRAVEL", "TRAVELERS", "TRAVELERSINSURANCE", "TRUST", "TRV", "TT", "TUBE", "TUI", "TUNES", "TUSHU", "TV", "TVS", "TW", "TZ", "UA", "UBS", "UG", "UK", "UNICOM", "UNIVERSITY", "UNO", "UOL", "US", "UY", "UZ", "VA", "VACATIONS", "VANA", "VC", "VE", "VEGAS", "VENTURES", "VERISIGN", "VERSICHERUNG", "VET", "VG", "VI", "VIAJES", "VIDEO", "VIG", "VIKING", "VILLAS", "VIN", "VIP", "VIRGIN", "VISION", "VISTA", "VISTAPRINT", "VIVA", "VLAANDEREN", "VN", "VODKA", "VOLKSWAGEN", "VOTE", "VOTING", "VOTO", "VOYAGE", "VU", "VUELOS", "WALES", "WALTER", "WANG", "WANGGOU", "WARMAN", "WATCH", "WATCHES", "WEATHER", "WEATHERCHANNEL", "WEBCAM", "WEBER", "WEBSITE", "WED", "WEDDING", "WEIBO", "WEIR", "WF", "WHOSWHO", "WIEN", "WIKI", "WILLIAMHILL", "WIN", 
"WINDOWS", "WINE", "WME", "WOLTERSKLUWER", "WORK", "WORKS", "WORLD", "WS", "WTC", "WTF", "XBOX", "XEROX", "XIHUAN", "XIN", "XN--11B4C3D", "XN--1CK2E1B", "XN--1QQW23A", "XN--30RR7Y", "XN--3BST00M", "XN--3DS443G", "XN--3E0B707E", "XN--3PXU8K", "XN--42C2D9A", "XN--45BRJ9C", "XN--45Q11C", "XN--4GBRIM", "XN--55QW42G", "XN--55QX5D", "XN--5TZM5G", "XN--6FRZ82G", "XN--6QQ986B3XL", "XN--80ADXHKS", "XN--80AO21A", "XN--80ASEHDB", "XN--80ASWG", "XN--8Y0A063A", "XN--90A3AC", "XN--90AIS", "XN--9DBQ2A", "XN--9ET52U", "XN--9KRT00A", "XN--B4W605FERD", "XN--BCK1B9A5DRE4C", "XN--C1AVG", "XN--C2BR7G", "XN--CCK2B3B", "XN--CG4BKI", "XN--CLCHC0EA0B2G2A9GCD", "XN--CZR694B", "XN--CZRS0T", "XN--CZRU2D", "XN--D1ACJ3B", "XN--D1ALF", "XN--E1A4C", "XN--ECKVDTC9D", "XN--EFVY88H", "XN--ESTV75G", "XN--FCT429K", "XN--FHBEI", "XN--FIQ228C5HS", "XN--FIQ64B", "XN--FIQS8S", "XN--FIQZ9S", "XN--FJQ720A", "XN--FLW351E", "XN--FPCRJ9C3D", "XN--FZC2C9E2C", "XN--G2XX48C", "XN--GCKR3F0F", "XN--GECRJ9C", "XN--H2BRJ9C", "XN--HXT814E", "XN--I1B6B1A6A2E", "XN--IMR513N", "XN--IO0A7I", "XN--J1AEF", "XN--J1AMH", "XN--J6W193G", "XN--JLQ61U9W7B", "XN--JVR189M", "XN--KCRX77D1X4A", "XN--KPRW13D", "XN--KPRY57D", "XN--KPU716F", "XN--KPUT3I", "XN--L1ACC", "XN--LGBBAT1AD8J", "XN--MGB9AWBF", "XN--MGBA3A3EJT", "XN--MGBA3A4F16A", "XN--MGBA7C0BBN0A", "XN--MGBAAM7A8H", "XN--MGBAB2BD", "XN--MGBAYH7GPA", "XN--MGBB9FBPOB", "XN--MGBBH1A71E", "XN--MGBC0A9AZCG", "XN--MGBCA7DZDO", "XN--MGBERP4A5D4AR", "XN--MGBPL2FH", "XN--MGBT3DHD", "XN--MGBTX2B", "XN--MGBX4CD0AB", "XN--MIX891F", "XN--MK1BU44C", "XN--MXTQ1M", "XN--NGBC5AZD", "XN--NGBE9E0A", "XN--NODE", "XN--NQV7F", "XN--NQV7FS00EMA", "XN--NYQY26A", "XN--O3CW4H", "XN--OGBPF8FL", "XN--P1ACF", "XN--P1AI", "XN--PBT977C", "XN--PGBS0DH", "XN--PSSY2U", "XN--Q9JYB4C", "XN--QCKA1PMC", "XN--QXAM", "XN--RHQV96G", "XN--ROVU88B", "XN--S9BRJ9C", "XN--SES554G", "XN--T60B56A", "XN--TCKWE", "XN--UNUP4Y", "XN--VERMGENSBERATER-CTB", "XN--VERMGENSBERATUNG-PWB", "XN--VHQUV", "XN--VUQ861B", "XN--W4R85EL8FHU5DNRA", "XN--WGBH1C", "XN--WGBL6A", "XN--XHQ521B", "XN--XKC2AL3HYE2A", "XN--XKC2DL3A5EE0H", "XN--Y9A3AQ", "XN--YFRO4I67O", "XN--YGBI2AMMX", "XN--ZFR164B", "XPERIA", "XXX", "XYZ", "YACHTS", "YAHOO", "YAMAXUN", "YANDEX", "YE", "YODOBASHI", "YOGA", "YOKOHAMA", "YOU", "YOUTUBE", "YT", "YUN", "ZA", "ZARA", "ZERO", "ZIP", "ZM", "ZONE", "ZUERICH", "ZW", "AB.CA", "AC.AE", "AC.AT", "AC.CN", "AC.CR", "AC.CY", "AC.FJ", "AC.GG", "AC.ID", "AC.IL", "AC.IM", "AC.IN", "AC.JE", "AC.JP", "AC.KR", "AC.NZ", "AC.PA", "AC.TH", "AC.UG", "AC.UK", "AC.YU", "AC.ZA", "AD.JP", "AH.CN", "ALDERNEY.GG", "ALT.ZA", "ART.BR", "ART.DO", "ARTS.CO", "ARTS.VE", "ASN.AU", "ASN.LV", "BBS.TR", "BC.CA", "BIB.VE", "BJ.CN", "CO.AT", "CO.AO", "CO.CK", "CO.CR", "CO.GG", "CO.HU", "CO.ID", "CO.IL", "CO.IM", "CO.IN", "CO.JE", "CO.JP", "CO.KR", "COM.AR", "COM.AU", "COM.AZ", "COM.BB", "COM.BM", "COM.BR", "COM.BS", "COM.CN", "COM.CO", "COM.CU", "COM.CY", "COM.DO", "COM.EC", "COM.EG", "COM.FJ", "COM.GE", "COM.GU", "COM.HK", "COM.JO", "COM.KH", "COM.LA", "COM.LB", "COM.LC", "COM.LV", "COM.LY", "COM.MM", "COM.MO", "COM.MT", "COM.MX", "COM.MY", "COM.NA", "COM.NC", "COM.NI", "COM.NP", "COM.PA", "COM.PE", "COM.PH", "COM.PL", "COM.PY", "COM.RU", "COM.SG", "COM.SH", "COM.SY", "COM.TN", "COM.TR", "COM.TW", "COM.UA", "COM.UY", "COM.VE", "CONF.AU", "CONF.LV", "CO.NZ", "COOP", "CO.AE", "CO.SV", "CO.TH", "CO.UG", "CO.UK", "CO.VE", "CO.VI", "CO.YU", "CO.ZA", "CQ.CN", "CSIRO.AU", "ED.CR", "EDU.BM", "EDU.AR", "EDU.CN", "EDU.CO", "EDU.DO", "EDU.EC", "EDU.EG", "EDU.GE", "EDU.GU", "EDU.JO", "EDU.LC", 
"EDU.LV", "EDU.MM", "EDU.MO", "EDU.MY", "EDUNET.TN", "EDU.PA", "EDU.PY", "EDU.SG", "EDU.SH", "EDU.TR", "EDU.TW", "EDU.UY", "EDU.VE", "EDU.YU", "EDU.ZA", "ENS.TN", "ERNET.IN", "ESP.BR", "ETC.BR", "EUN.EG", "FI.CR", "FIN.EC", "FIN.TN", "FIRM.CO", "FIRM.VE", "G12.BR", "GD.CN", "GEN.NZ", "GOB.PA", "GO.CR", "GO.ID", "GO.KR", "GO.TH", "GO.UG", "GOV.AE", "GOV.AR", "GOV.AU", "GOV.BM", "GOV.BR", "GOV.CN", "GOV.CO", "GOV.CY", "GOV.DO", "GOV.EC", "GOV.EG", "GOVE.TW", "GOV.FJ", "GOV.GE", "GOV.GG", "GOV.GU", "GOV.IL", "GOV.IM", "GOV.IN", "GOV.JE", "GOV.JO", "GOV.JP", "GOV.LB", "GOV.LC", "GOV.LV", "GOV.MM", "GOV.MO", "GOV.MY", "GOV.SG", "GOV.SH", "GOV.TN", "GOVT.NZ", "GOV.TR", "GOV.UA", "GOV.UK", "GOV.VE", "GOV.ZA", "GS.CN", "GUERNSEY.GG", "GX.CN", "GZ.CN", "HB.CN", "HE.CN", "HI.CN", "HK.CN", "HL.CN", "HN.CN", "ID.AU", "ID.FJ", "ID.LV", "IND.BR", "IND.GG", "IND.JE", "IND.TN", "INF.BR", "INFO.AU", "INFO.CO", "INFO.HU", "INFO.TN", "INFO.VE", "INT.CO", "INTL.TN", "INT.VE", "JERSEY.JE", "JL.CN", "JS.CN", "K12.EC", "K12.IL", "K12.TR", "LKD.CO.IM", "LN.CN", "LTD.GG", "LTD.JE", "LTD.UK", "MB.CA", "MED.EC", "MIL.BR", "MIL.CO", "MIL.DO", "MIL.EC", "MIL.GE", "MIL.GU", "MIL.ID", "MIL.LB", "MIL.LV", "MIL.PH", "MIL.SH", "MIL.TR", "MIL.VE", "MIL.ZA", "MO.CN", "MOD.UK", "MUNI.IL", "MUSEUM", "NAME", "NAT.TN", "NB.CA", "NET.AR", "NET.AU", "NET.AZ", "NET.BB", "NET.BM", "NET.BR", "NET.BS", "NET.CN", "NET.CU", "NET.CY", "NET.DO", "NET.EC", "NET.EG", "NET.GE", "NET.GG", "NET.GU", "NET.HK", "NET.ID", "NET.IL", "NET.IM", "NET.IN", "NET.JE", "NET.JO", "NET.JP", "NET.KH", "NET.LA", "NET.LB", "NET.LC", "NET.LV", "NET.LY", "NET.MM", "NET.MO", "NET.MT", "NET.MX", "NET.MY", "NET.NA", "NET.NC", "NET.NP", "NET.NZ", "NET.PA", "NET.PE", "NET.PH", "NET.PL", "NET.PY", "NET.RU", "NET.SG", "NET.SH", "NET.SY", "NET.TH", "NET.TN", "NET.TR", "NET.TW", "NET.UA", "NET.UK", "NET.UY", "NET.VE", "NET.VI", "NET.ZA", "NF.CA", "NGO.PH", "NGO.ZA", "NHS.UK", "NIC.IM", "NIC.IN", "NM.CN", "NM.KR", "NOM.CO", "NOM.VE", "NOM.ZA", "NS.CA", "NSK.SU", "NT.CA", "NUI.HU", "NX.CN", "ON.CA", "OR.CR", "ORG.AE", "ORG.AR", "ORG.AU", "ORG.AZ", "ORG.BB", "ORG.BM", "ORG.BR", "ORG.BS", "ORG.CN", "ORG.CO", "ORG.CU", "ORG.CY", "ORG.DO", "ORG.EC", "ORG.EG", "ORG.FJ", "ORG.GE", "ORG.GG", "ORG.GU", "ORG.HK", "ORG.HU", "ORG.IL", "ORG.IM", "ORG.JE", "ORG.JP", "ORG.KH", "ORG.LA", "ORG.LB", "ORG.LC", "ORG.LV", "ORG.LY", "ORG.MM", "ORG.MO", "ORG.MT", "ORG.MX", "ORG.MY", "ORG.NA", "ORG.NC", "ORG.NZ", "ORG.PA", "ORG.PE", "ORG.PH", "ORG.PL", "ORG.PY", "ORG.RU", "ORG.SG", "ORG.SH", "ORG.SY", "ORG.TN", "ORG.TR", "ORG.TW", "ORG.UK", "ORG.UY", "ORG.VE", "ORG.VI", "ORG.YU", "ORG.ZA", "OR.ID", "OR.KR", "OR.TH", "ORT.NP", "OR.UG", "OZ.AU", "PE.CA", "PLC.CO.IM", "PLC.UK", "POLICE.UK", "PRIV.HU", "PSI.BR", "PVT.GE", "QC.CA", "QH.CN", "REC.BR", "REC.CO", "REC.VE", "RE.KR", "RES.IN", "RNRT.TN", "RNS.TN", "RNU.TN", "SA.CR", "SARK.GG", "SC.CN", "SCH.GG", "SCH.JE", "SCHOOL.FJ", "SCHOOL.ZA", "SCH.UK", "SCI.EG", "SH.CN", "SK.CA", "SLD.PA", "SN.CN", "STORE.CO", "STORE.VE", "SX.CN", "TEC.VE", "TELEMEMO.AU", "TJ.CN", "TM.HU", "TMP.BR", "TM.ZA", "TOURISM.TN", "TW.CN", "WEB.CO", "WEB.DO", "WEB.VE", "WEB.ZA", "XJ.CN", "XZ.CN", "YK.CA", "YN.CN", "ZJ.CN" }; if ( ! s_isInitialized ) { // set up the hash table if ( ! 
s_table.set ( 8 , 0, sizeof(s_tlds)*2,NULL,0,false,0, "tldtbl") ) return log("build: Could not init table of TLDs."); // now add in all the TLDs from the static list above int32_t n = (int32_t)sizeof(s_tlds)/ sizeof(char *); for ( int32_t i = 0 ; i < n ; i++ ) { const char *d = s_tlds[i]; int32_t dlen = gbstrlen ( d ); int64_t dh = hash64Lower_a ( d , dlen ); if ( ! s_table.addKey (&dh,NULL) ) return log("build: tld table failed"); } s_isInitialized = true; } int64_t h = hash64Lower_a ( tld , tldLen ); // gbstrlen(tld)); return s_table.isInTable ( &h );//getScoreFromTermId ( h ); }
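// ---------------------------------------------------------------------------
// The routine above only answers whether an exact string is a known TLD.
// Below is a minimal illustrative sketch, not part of the original source, of
// how a caller holding a full hostname might use that table to find the
// longest recognized suffix. It assumes the wrapper above is exposed as
// isTLD ( char *tld , int32_t tldLen ); that name is a guess based on the
// parameters used in the lookup.
static char *findKnownTld ( char *host , int32_t hostLen ) {
	for ( char *p = host ; p < host + hostLen ; p++ ) {
		// candidate suffixes start right after each '.'
		if ( *p != '.' ) continue;
		char   *cand    = p + 1;
		int32_t candLen = (int32_t)(host + hostLen - cand);
		if ( candLen <= 0 ) break;
		// scanning left to right tries the longest suffix first, so
		// "co.uk" in "www.bbc.co.uk" is matched before "uk"
		if ( isTLD ( cand , candLen ) ) return cand;
	}
	return NULL;
}
// e.g. findKnownTld("www.bbc.co.uk",13) would return a pointer to "co.uk"
// because "CO.UK" is in the s_tlds[] list above.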
// returns -1 and sets g_errno on error, because 0 means langUnknown long Words::getLanguage( Sections *sections , long maxSamples, long niceness, long *langScore) { // calculate scores if not given //Scores calcdScores; //if ( ! scores ) { // if ( ! calcdScores.set( this,m_version,false ) ) // return -1; // scores = &calcdScores; //} // . take a random sample of words and look them up in the // language dictionary //HashTableT<long long, char> ht; HashTableX ht; long long langCount[MAX_LANGUAGES]; long long langWorkArea[MAX_LANGUAGES]; long numWords = m_numWords; //long skip = numWords/maxSamples; //if ( skip == 0 ) skip = 1; // reset the language count memset(langCount, 0, sizeof(long long)*MAX_LANGUAGES); // sample the words //long wordBase = 0; long wordi = 0; //if ( ! ht.set(maxSamples*1.5) ) return -1; if ( ! ht.set(8,1,(long)(maxSamples*8.0),NULL,0,false, niceness,"wordslang")) return -1; // . avoid words in these bad sections // . google seems to index SEC_MARQUEE so i took that out of badFlags long badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT; // shortcuts long long *wids = m_wordIds; long *wlens = m_wordLens; char **wptrs = m_words; //long langTotal = 0; // log ( LOG_WARN, "xmldoc: Picking language from %li words with %li skip", // numWords, skip ); char numOne = 1; Section **sp = NULL; if ( sections ) sp = sections->m_sectionPtrs; // this means null too if ( sections && sections->m_numSections == 0 ) sp = NULL; long maxCount = 1000; while ( wordi < numWords ) { // breathe QUICKPOLL( niceness ); // move to the next valid word if ( ! wids [wordi] ) { wordi++; continue; } if ( wlens[wordi] < 2 ) { wordi++; continue; } // skip if in a bad section //long flags = sections->m_sectionPtrs[i]->m_flags; // meaning script section ,etc if ( sp && ( sp[wordi]->m_flags & badFlags ) ) { wordi++; continue; } // check the language //unsigned char lang = 0; // Skip if word is capitalized and not preceded by a tag //if(s_isWordCap(getWord(wordi), getWordLen(wordi)) && // wordi > 0 && !getTagId(wordi - 1)) { // wordi++; // continue; //} // Skip word if bounded by '/' or '?' might be in a URL if(isBounded(wordi)) { wordi++; continue; } // is it arabic? sometimes they are spammy pages and repeat // a few arabic words over and over again, so don't do deduping // with "ht" before checking this. char cl = getCharacterLanguage ( wptrs[wordi] ); if ( cl ) { langCount[(unsigned char)cl]++; wordi++; continue; } //if(ht.getSlot(m_wordIds[wordi]) !=-1) { if(!ht.isEmpty(&m_wordIds[wordi]) ) { wordi++; continue; } // If we can't add the word, it's not that bad. // Just gripe about it in the log. if(!ht.addKey(&m_wordIds[wordi], &numOne)) { log(LOG_WARN, "build: Could not add word to temporary " "table, memory error?\n"); g_errno = ENOMEM; return -1; } if ( maxCount-- <= 0 ) break; // No lang from charset, got a phrase, and 0 language does not have // a score Order is very important! int foundone = 0; if ( // lang == 0 && // we seem to be missing hungarian and thai g_speller.getPhraseLanguages(getWord(wordi), getWordLen(wordi), langWorkArea) && // why must it have an "unknown score" of 0? // allow -1... i don't know what that means!! langWorkArea[0] <= 0) { int lasty = -1; for(int y = 1; y < MAX_LANGUAGES; y++) { if(langWorkArea[y] == 0) continue; langCount[y]++; long pop = langWorkArea[y]; // negative means in an official dictionary if ( pop < 0 ) { pop *= -1; langCount[y] += 1; } // extra? if ( pop > 1000 ) langCount[y] += 2; if ( pop > 10000 ) langCount[y] += 2; lasty = y; foundone++; } // . 
if it can only belong to one language // . helps fix the fact that our unifiedDict is crummy // and identifies some words as being in a lot of languages // like "Pronto" as being in english and not giving // the popularities correctly. if ( foundone == 1 ) // give massive boost langCount[lasty] += 10; } // . try to skip unknown words without killing sample size // . we lack russian, hungarian and arabic in the unified // dict, so try to do character detection for those langs. // . should prevent them from being detected as unknown // langs and coming up for english search 'gigablast' if ( ! foundone ) { langCount[langUnknown]++; // do not count towards sample size maxCount++; } // skip to the next word //wordBase += skip; //if ( wordi < wordBase ) // wordi = wordBase; //else wordi++; } // punish unknown count in case a doc has a lot of proper names // or something //langCount[langUnknown] /= 2; // then get the lang with the max score int l = s_findMaxIndex(langCount, MAX_LANGUAGES); // if(langCount[l] < 15) return(langUnknown); if(langScore) *langScore = langCount[l]; // return if known now return l; }
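// ---------------------------------------------------------------------------
// s_findMaxIndex() is called above but its body is not in this excerpt. A
// minimal sketch of what it presumably does -- return the index of the
// largest entry in the count array -- follows; the exact tie-breaking rule of
// the original is an assumption (this version keeps the earliest index, so a
// lower-numbered language id would win a tie).
static int s_findMaxIndex ( long long *counts , int n ) {
	int best = 0;
	for ( int i = 1 ; i < n ; i++ )
		if ( counts[i] > counts[best] ) best = i;
	return best;
}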
// . merge all the replies together // . put final merged docids into m_docIds[],m_bitScores[],m_scores[],... // . this calls Msg51 to get cluster levels when done merging // . Msg51 remembers clusterRecs from previous call to avoid repeating lookups // . returns false if blocked, true otherwise // . sets g_errno and returns true on error bool Msg3a::mergeLists ( ) { // time how long the merge takes if ( m_debug ) { logf( LOG_DEBUG, "query: msg3a: --- Final DocIds --- " ); m_startTime = gettimeofdayInMilliseconds(); } // reset our final docids count here in case we are a re-call m_numDocIds = 0; // a secondary count, how many unique docids we scanned, and not // necessarily added to the m_docIds[] array //m_totalDocCount = 0; // long docCount = 0; m_moreDocIdsAvail = true; // shortcut //long numSplits = m_numHosts;//indexdbSplit; // . point to the various docids, etc. in each split reply // . tcPtr = term count. how many required query terms does the doc // have? formerly called topExplicits in IndexTable2.cpp long long *diPtr [MAX_INDEXDB_SPLIT]; float *rsPtr [MAX_INDEXDB_SPLIT]; key_t *ksPtr [MAX_INDEXDB_SPLIT]; long long *diEnd [MAX_INDEXDB_SPLIT]; for ( long j = 0; j < m_numHosts ; j++ ) { Msg39Reply *mr =m_reply[j]; // if we have gbdocid:| in query this could be NULL if ( ! mr ) { diPtr[j] = NULL; diEnd[j] = NULL; rsPtr[j] = NULL; ksPtr[j] = NULL; continue; } diPtr [j] = (long long *)mr->ptr_docIds; rsPtr [j] = (float *)mr->ptr_scores; ksPtr [j] = (key_t *)mr->ptr_clusterRecs; diEnd [j] = (long long *)(mr->ptr_docIds + mr->m_numDocIds * 8); } // clear if we had it if ( m_finalBuf ) { mfree ( m_finalBuf, m_finalBufSize, "Msg3aF" ); m_finalBuf = NULL; m_finalBufSize = 0; } // // HACK: START section stats merge // m_sectionStats.reset(); long sneed = 0; for ( long j = 0; j < m_numHosts ; j++ ) { Msg39Reply *mr = m_reply[j]; if ( ! mr ) continue; sneed += mr->size_siteHashList/4; } HashTableX dt; //char tmpBuf[5000]; if (sneed&&!dt.set(4,0,sneed,NULL,0,false, m_r->m_niceness,"uniqsit")) return true; for ( long j = 0; sneed && j < m_numHosts ; j++ ) { Msg39Reply *mr =m_reply[j]; if ( ! mr ) continue; SectionStats *src = &mr->m_sectionStats; SectionStats *dst = &m_sectionStats; dst->m_onSiteDocIds += src->m_onSiteDocIds; dst->m_offSiteDocIds += src->m_offSiteDocIds; // now the list should be the unique site hashes that // had the section hash. we need to uniquify them again // here. long *p = (long *)mr->ptr_siteHashList; long np = mr->size_siteHashList / 4; for ( long k = 0 ; k < np ; k++ ) // hash it up, no dups! dt.addKey(&p[k]); // update our count based on that dst->m_numUniqueSites = dt.getNumSlotsUsed(); } if ( m_r->m_getSectionStats ) return true; // // HACK: END section stats merge // if ( m_docsToGet <= 0 ) { char *xx=NULL; *xx=0; } // . how much do we need to store final merged docids, etc.? // . docid=8 score=4 bitScore=1 clusterRecs=key_t clusterLevls=1 long need = m_docsToGet * (8+4+sizeof(key_t)+sizeof(DocIdScore *)+1); // allocate it m_finalBuf = (char *)mmalloc ( need , "finalBuf" ); m_finalBufSize = need; // g_errno should be set if this fails if ( ! 
m_finalBuf ) return true; // hook into it char *p = m_finalBuf; m_docIds = (long long *)p; p += m_docsToGet * 8; m_scores = (float *)p; p += m_docsToGet * sizeof(float); m_clusterRecs = (key_t *)p; p += m_docsToGet * sizeof(key_t); m_clusterLevels = (char *)p; p += m_docsToGet * 1; m_scoreInfos = (DocIdScore **)p; p+=m_docsToGet*sizeof(DocIdScore *); // sanity check char *pend = m_finalBuf + need; if ( p != pend ) { char *xx = NULL; *xx =0; } // . now allocate for hash table // . get at least twice as many slots as docids HashTableT<long long,char> htable; // returns false and sets g_errno on error if ( ! htable.set ( m_docsToGet * 2 ) ) return true; // hash table for doing site clustering, provided we // are fully split and we got the site recs now HashTableT<long long,long> htable2; if ( m_r->m_doSiteClustering && ! htable2.set ( m_docsToGet * 2 ) ) return true; // // ***MERGE ALL SPLITS INTO m_docIds[], etc.*** // // . merge all lists in m_replyDocIds[splitNum] // . we may be re-called later after m_docsToGet is increased // if too many docids were clustered/filtered out after the call // to Msg51. mergeLoop: // the winning docid will be diPtr[maxj] long maxj = -1; //Msg39Reply *mr; long hslot; // get the next highest-scoring docids from all split lists for ( long j = 0; j < m_numHosts; j++ ) { // . skip exhausted lists // . these both should be NULL if reply was skipped because // we did a gbdocid:| query if ( diPtr[j] >= diEnd[j] ) continue; // compare the score if ( maxj == -1 ) { maxj = j; continue; } if ( *rsPtr[j] < *rsPtr[maxj] ) continue; if ( *rsPtr[j] > *rsPtr[maxj] ) { maxj = j; continue; } // prefer lower docids on top if ( *diPtr[j] < *diPtr[maxj] ) { maxj = j; continue; } } if ( maxj == -1 ) { m_moreDocIdsAvail = false; goto doneMerge; } // only do this logic if we have clusterdb recs included if ( m_r->m_doSiteClustering && // if the clusterLevel was set to CR_*errorCode* then this key // will be 0, so in that case, it might have been a not found // or whatever, so let it through regardless ksPtr[maxj]->n0 != 0LL && ksPtr[maxj]->n1 != 0 ) { // get the hostname hash, a long long long sh = g_clusterdb.getSiteHash26 ((char *)ksPtr[maxj]); // do we have enough from this hostname already? long slot = htable2.getSlot ( sh ); // if this hostname already visible, do not over-display it... if ( slot >= 0 ) { // get the count long val = htable2.getValueFromSlot ( slot ); // . if already 2 or more, give up // . if the site hash is 0, that usually means a // "not found" in clusterdb, and the accompanying // cluster level would be set as such, but since we // did not copy the cluster levels over in the merge // algo above, we don't know for sure... cluster recs // are set to 0 in the Msg39.cpp clustering. if ( sh && val >= 2 ) goto skip; // inc the count val++; // store it htable2.setValue ( slot , val ); } // . add it, this should be pre-allocated! // . returns false and sets g_errno on error else if ( ! htable2.addKey(sh,1) ) return true; } hslot = htable.getSlot ( *diPtr[maxj] ); // . only add it to the final list if the docid is "unique" // . BUT since different event ids share the same docid, exception! 
if ( hslot < 0 ) { // always inc this //m_totalDocCount++; // only do this if we need more if ( m_numDocIds < m_docsToGet ) { // get DocIdScore class for this docid Msg39Reply *mr = m_reply[maxj]; // point to the array of DocIdScores DocIdScore *ds = (DocIdScore *)mr->ptr_scoreInfo; long nds = mr->size_scoreInfo/sizeof(DocIdScore); DocIdScore *dp = NULL; for ( long i = 0 ; i < nds ; i++ ) { if ( ds[i].m_docId != *diPtr[maxj] ) continue; dp = &ds[i]; break; } // add the max to the final merged lists m_docIds [m_numDocIds] = *diPtr[maxj]; // wtf? if ( ! dp ) { // this is empty if no scoring info // supplied! if ( m_r->m_getDocIdScoringInfo ) log("msg3a: CRAP! got empty score " "info for " "d=%lli", m_docIds[m_numDocIds]); //char *xx=NULL; *xx=0; 261561804684 // qry = www.yahoo } // point to the single DocIdScore for this docid m_scoreInfos[m_numDocIds] = dp; // reset this just in case if ( dp ) { dp->m_singleScores = NULL; dp->m_pairScores = NULL; } // now fix DocIdScore::m_pairScores and m_singleScores // ptrs so they reference into the // Msg39Reply::ptr_pairScoreBuf and ptr_singleSingleBuf // like they should. it seems we do not free the // Msg39Replies so we should be ok referencing them. if ( dp && dp->m_singlesOffset >= 0 ) dp->m_singleScores = (SingleScore *)(mr->ptr_singleScoreBuf+ dp->m_singlesOffset) ; if ( dp && dp->m_pairsOffset >= 0 ) dp->m_pairScores = (PairScore *)(mr->ptr_pairScoreBuf + dp->m_pairsOffset ); // turn it into a float, that is what rscore_t is. // we do this to make it easier for PostQueryRerank.cpp m_scores [m_numDocIds]=(float)*rsPtr[maxj]; if ( m_r->m_doSiteClustering ) m_clusterRecs[m_numDocIds]= *ksPtr[maxj]; // clear this out //m_eventIdBits[m_numDocIds].clear(); // set this for use below hslot = m_numDocIds; // point to next available slot to add to m_numDocIds++; } // if it has ALL the required query terms, count it //if ( *bsPtr[maxj] & 0x60 ) m_numAbove++; // . add it, this should be pre-allocated! // . returns false and sets g_errno on error if ( ! htable.addKey(*diPtr[maxj],1) ) return true; } skip: // increment the split pointers from which we took the max rsPtr[maxj]++; diPtr[maxj]++; ksPtr[maxj]++; // get the next highest docid and add it in if ( m_numDocIds < m_docsToGet ) goto mergeLoop; doneMerge: if ( m_debug ) { // show how long it took logf( LOG_DEBUG,"query: msg3a: [%lu] merged %li docs from %li " "splits in %llu ms. " , (unsigned long)this, m_numDocIds, (long)m_numHosts, gettimeofdayInMilliseconds() - m_startTime ); // show the final merged docids for ( long i = 0 ; i < m_numDocIds ; i++ ) { long sh = 0; if ( m_r->m_doSiteClustering ) sh=g_clusterdb.getSiteHash26((char *) &m_clusterRecs[i]); // print out score_t logf(LOG_DEBUG,"query: msg3a: [%lu] " "%03li) merged docId=%012llu " "score=%.01f hosthash=0x%lx", (unsigned long)this, i, m_docIds [i] , (float)m_scores [i] , sh ); } } // if we had a full split, we should have gotten the cluster recs // from each split already memset ( m_clusterLevels , CR_OK , m_numDocIds ); return true; }
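// ---------------------------------------------------------------------------
// The heart of mergeLists() is the selection step inside mergeLoop: a k-way
// merge that repeatedly picks, across all host replies, the entry with the
// highest score and breaks ties in favor of the lower docid. The sketch below
// restates just that comparison on plain arrays so the rule is easy to see;
// HostCursor and its fields are hypothetical names, not types from the
// original source.
struct HostCursor {
	float     *scores;  // parallel to docIds
	long long *docIds;
	long       pos;     // current position in this host's list
	long       num;     // number of entries in this host's list
};

// return the index of the host whose current entry should be merged next,
// or -1 if every list is exhausted
static long pickBest ( HostCursor *lists , long numHosts ) {
	long maxj = -1;
	for ( long j = 0 ; j < numHosts ; j++ ) {
		if ( lists[j].pos >= lists[j].num ) continue; // exhausted
		if ( maxj == -1 ) { maxj = j; continue; }
		float sj = lists[j   ].scores[lists[j   ].pos];
		float sm = lists[maxj].scores[lists[maxj].pos];
		if ( sj < sm ) continue;
		if ( sj > sm ) { maxj = j; continue; }
		// equal scores: prefer the lower docid, as mergeLists() does
		if ( lists[j   ].docIds[lists[j   ].pos] <
		     lists[maxj].docIds[lists[maxj].pos] ) maxj = j;
	}
	return maxj;
}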
bool sendPageQA ( TcpSocket *sock , HttpRequest *hr ) { char pbuf[32768]; SafeBuf sb(pbuf, 32768); //char format = hr->getReplyFormat(); // set this. also sets gr->m_hr GigablastRequest gr; // this will fill in GigablastRequest so all the parms we need are set g_parms.setGigablastRequest ( sock , hr , &gr ); // // . handle a request to update the crc for this test // . test id identified by "ajaxUrlHash" which is the hash of the test's url // and the test name, QATest::m_testName long ajax = hr->getLong("ajax",0); unsigned long ajaxUrlHash ; ajaxUrlHash = (unsigned long long)hr->getLongLong("uh",0LL); unsigned long ajaxCrc ; ajaxCrc = (unsigned long long)hr->getLongLong("crc",0LL); if ( ajax ) { // make sure it is initialized if ( s_ht.m_ks ) { // overwrite current value with provided one because // the user click on an override checkbox to update // the crc s_ht.addKey ( &ajaxUrlHash , &ajaxCrc ); saveHashTable(); } // send back the urlhash so the checkbox can turn the // bg color of the "diff" gray SafeBuf sb3; sb3.safePrintf("%lu",ajaxUrlHash); g_httpServer.sendDynamicPage(sock, sb3.getBufStart(), sb3.length(), -1/*cachetime*/); return true; } // if they hit the submit button, begin the tests long submit = hr->hasField("action"); long n = sizeof(s_qatests)/sizeof(QATest); if ( submit && g_qaInProgress ) { g_errno = EINPROGRESS; g_httpServer.sendErrorReply(sock,g_errno,mstrerror(g_errno)); return true; } // set m_doTest for ( long i = 0 ; submit && i < n ; i++ ) { QATest *qt = &s_qatests[i]; char tmp[10]; sprintf(tmp,"test%li",i); qt->m_doTest = hr->getLong(tmp,0); } if ( submit ) { // reset all the static thingies resetFlags(); // save socket g_qaSock = sock; g_numErrors = 0; g_qaOutput.reset(); g_qaOutput.safePrintf("<html><body>" "<title>QA Test Results</title>\n"); g_qaOutput.safePrintf("<SCRIPT LANGUAGE=\"javascript\">\n" // update s_ht with the new crc for this test "function submitchanges(urlhash,crc) " "{\n " "var client=new XMLHttpRequest();\n" "client.onreadystatechange=gotsubmitreplyhandler;" "var " "u='/admin/qa?ajax=1&uh='+urlhash+'&crc='+crc;\n" "client.open('GET',u);\n" "client.send();\n" // use that to fix background to gray "var w=document.getElementById(urlhash);\n" // set background color "w.style.backgroundColor = '0xe0e0e0';\n" // gear spinning after checkbox "}\n\n " // call this when we got the reply that the // checkbox went through "function gotsubmitreplyhandler() {\n" // return if reply is not fully ready "if(this.readyState != 4 )return;\n" // if error or empty reply then do nothing "if(!this.responseText)return;\n" // response text is the urlhash32, unsigned long "var id=this.responseText;\n" // use that to fix background to gray "var w=document.getElementById(id);\n" // set background color "w.style.backgroundColor = '0xe0e0e0';\n" "}\n\n" "</SCRIPT> "); // and run the qa test loop if ( ! qatest( ) ) return false; // what happened? 
log("qa: qatest completed without blocking"); } // show tests, all checked by default, to perform g_pages.printAdminTop ( &sb , sock , hr ); sb.safePrintf("<SCRIPT LANGUAGE=\"javascript\">\n" "function checkAll(name, num)\n " "{ " " for (var i = 0; i < num; i++) {\n" " var e = document.getElementById(name + i);\n" //"alert(name+i);" " e.checked = !e.checked ;\n " " }\n" "}\n\n " "</SCRIPT> "); //sb.safePrintf("<form name=\"fo\">"); sb.safePrintf("\n<table %s>\n",TABLE_STYLE); sb.safePrintf("<tr class=hdrow><td colspan=2>" "<center><b>QA Tests</b></center>" "</td></tr>"); // header row sb.safePrintf("<tr><td><b>Do Test?</b> <a style=cursor:hand;" "cursor:pointer; " "onclick=\"checkAll('test', %li);\">(toggle)</a>",n); sb.safePrintf("</td><td><b>Test Name</b></td></tr>\n"); // . we keep the ptr to each test in an array // . print out each qa function for ( long i = 0 ; i < n ; i++ ) { QATest *qt = &s_qatests[i]; char *bg; if ( i % 2 == 0 ) bg = LIGHT_BLUE; else bg = DARK_BLUE; sb.safePrintf("<tr bgcolor=#%s>" "<td><input type=checkbox value=1 name=test%li " "id=test%li></td>" "<td>%s" "<br>" "<font color=gray size=-1>%s</font>" "</td>" "</tr>\n" , bg , i , i , qt->m_testName , qt->m_testDesc ); } sb.safePrintf("</table>\n<br>\n"); // "</form>\n"); g_pages.printAdminBottom ( &sb , hr ); g_httpServer.sendDynamicPage(sock, sb.getBufStart(), sb.length(), -1/*cachetime*/); return true; }
void processReply ( char *reply , long replyLen ) { // store our current reply SafeBuf fb2; fb2.safeMemcpy(reply,replyLen ); fb2.nullTerm(); // log that we got the reply log("qa: got reply(len=%li)(errno=%s)=%s", replyLen,mstrerror(g_errno),reply); char *content = NULL; long contentLen = 0; // get mime if ( reply ) { HttpMime mime; mime.set ( reply, replyLen , NULL ); // only hash content since mime has a timestamp in it content = mime.getContent(); contentLen = mime.getContentLen(); if ( content && contentLen>0 && content[contentLen] ) { char *xx=NULL;*xx=0; } } if ( ! content ) { content = ""; contentLen = 0; } s_content = content; // take out <responseTimeMS> markOut ( content , "<currentTimeUTC>"); markOut ( content , "<responseTimeMS>"); // until i figure this one out, take it out markOut ( content , "<docsInCollection>"); // until i figure this one out, take it out markOut ( content , "<hits>"); // for those links in the html pages markOut ( content, "rand64="); // for json markOut ( content , "\"currentTimeUTC\":" ); markOut ( content , "\"responseTimeMS\":"); markOut ( content , "\"docsInCollection\":"); // for xml markOut ( content , "<currentTimeUTC>" ); markOut ( content , "<responseTimeMS>"); markOut ( content , "<docsInCollection>"); // indexed 1 day ago markOut ( content,"indexed:"); // modified 1 day ago markOut ( content,"modified:"); // s_gigabitCount... it is perpetually incrementing static counter // in PageResults.cpp markOut(content,"ccc("); markOut(content,"id=fd"); markOut(content,"id=sd"); // for some reason the term freq seems to change a little in // the scoring table markOut(content,"id=tf"); // make checksum. we ignore back to back spaces so this // hash works for <docsInCollection>10 vs <docsInCollection>9 long contentCRC = 0; if ( content ) contentCRC = qa_hash32 ( content ); // note it log("qa: got contentCRC of %lu",contentCRC); // if what we expected, save to disk if not there yet, then // call s_callback() to resume the qa pipeline /* if ( contentCRC == s_expectedCRC ) { // save content if good char fn3[1024]; sprintf(fn3,"%sqa/content.%lu",g_hostdb.m_dir,contentCRC); File ff; ff.set ( fn3 ); if ( ! ff.doesExist() ) { // if not there yet then save it fb2.save(fn3); } // . continue on with the qa process // . which qa function that may be //s_callback(); return; } */ // // if crc of content does not match what was expected then do a diff // so we can see why not // // this means caller does not care about the response if ( ! s_checkCRC ) { //s_callback(); return; } //const char *emsg = "qa: bad contentCRC of %li should be %li " // "\n";//"phase=%li\n"; //fprintf(stderr,emsg,contentCRC,s_expectedCRC);//,s_phase-1); // hash url long urlHash32 = hash32n ( s_url.getUrl() ); // combine test function too since two tests may use the same url long nameHash = hash32n ( s_qt->m_testName ); // combine together urlHash32 = hash32h ( nameHash , urlHash32 ); static bool s_init = false; if ( ! 
s_init ) { s_init = true; s_ht.set(4,4,1024,NULL,0,false,0,"qaht"); // make symlink //char cmd[512]; //snprintf(cmd,"cd %s/html ;ln -s ../qa ./qa", g_hostdb.m_dir); //system(cmd); char dir[1024]; snprintf(dir,1000,"%sqa",g_hostdb.m_dir); long status = ::mkdir ( dir , S_IRUSR | S_IWUSR | S_IXUSR | S_IRGRP | S_IWGRP | S_IXGRP | S_IROTH | S_IXOTH ); if ( status == -1 && errno != EEXIST && errno ) log("qa: Failed to make directory %s: %s.", dir,mstrerror(errno)); // try to load from disk SafeBuf fn; fn.safePrintf("%s/qa/",g_hostdb.m_dir); log("qa: loading crctable.dat"); s_ht.load ( fn.getBufStart() , "crctable.dat" ); } // break up into lines char fn2[1024]; sprintf(fn2,"%sqa/content.%lu",g_hostdb.m_dir,contentCRC); fb2.save ( fn2 ); // look up in hashtable to see what reply crc should be long *val = (long *)s_ht.getValue ( &urlHash32 ); // just return if the same if ( val && contentCRC == *val ) { g_qaOutput.safePrintf("<b style=color:green;>" "passed test</b><br>%s : " "<a href=%s>%s</a> (urlhash=%lu " "crc=<a href=/qa/content.%lu>" "%lu</a>)<br>" "<hr>", s_qt->m_testName, s_url.getUrl(), s_url.getUrl(), urlHash32, contentCRC, contentCRC); return; } if ( ! val ) { // add it so we know s_ht.addKey ( &urlHash32 , &contentCRC ); g_qaOutput.safePrintf("<b style=color:blue;>" "first time testing</b><br>%s : " "<a href=%s>%s</a> " "(urlhash=%lu " "crc=<a href=/qa/content.%lu>%lu" "</a>)<br>" "<hr>", s_qt->m_testName, s_url.getUrl(), s_url.getUrl(), urlHash32, contentCRC, contentCRC); return; } log("qa: crc changed for url %s from %li to %li", s_url.getUrl(),*val,contentCRC); // get response on file SafeBuf fb1; char fn1[1024]; sprintf(fn1,"%sqa/content.%lu",g_hostdb.m_dir, *val); fb1.load(fn1); fb1.nullTerm(); // do the diff between the two replies so we can see what changed char cmd[1024]; sprintf(cmd,"diff %s %s > /tmp/diffout",fn1,fn2); log("qa: %s\n",cmd); system(cmd); g_numErrors++; g_qaOutput.safePrintf("<b style=color:red;>FAILED TEST</b><br>%s : " "<a href=%s>%s</a> (urlhash=%lu)<br>" "<input type=checkbox name=urlhash%lu value=1 " // use ajax to update test crc. if you undo your // check then it should put the old val back. // when you first click the checkbox it should // gray out the diff i guess. "onclick=submitchanges(%lu,%lu);> " "Accept changes" "<br>" "original on left, new on right. " "oldcrc = <a href=/qa/content.%lu>%lu</a>" " != <a href=/qa/content.%lu>%lu</a> = newcrc" "<br>diff output follows:<br>" "<pre id=%lu style=background-color:0xffffff;>", s_qt->m_testName, s_url.getUrl(), s_url.getUrl(), urlHash32, // input checkbox name field urlHash32, // submitchanges() parms urlHash32, contentCRC, // original/old content.%lu *val, *val, // new content.%lu contentCRC, contentCRC, // for the pre tag id: urlHash32); // store in output SafeBuf sb; sb.load("/tmp/diffout"); g_qaOutput.htmlEncode ( sb.getBufStart() ); g_qaOutput.safePrintf("</pre><br><hr>"); // if this is zero allow it to slide by. it is learning mode i guess. // so we can learn what crc we need to use. // otherwise, stop right there for debugging //if ( s_expectedCRC != 0 ) exit(1); // keep on going //s_callback(); }
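// ---------------------------------------------------------------------------
// markOut() is used above to blank out fields whose values legitimately
// differ between runs (timestamps, response times, rotating counters) before
// the content CRC is computed, but its body is not part of this excerpt.
// Below is a minimal sketch of that idea under the assumption that it simply
// overwrites whatever follows each marker up to the next delimiter; the real
// helper may behave differently.
#include <string.h>
#include <ctype.h>

static void markOutSketch ( char *content , const char *marker ) {
	if ( ! content || ! marker ) return;
	size_t mlen = strlen ( marker );
	for ( char *p = strstr ( content , marker ) ; p ;
	      p = strstr ( p , marker ) ) {
		// skip past the marker itself, then blank the value after it
		p += mlen;
		while ( *p && *p != '<' && *p != '"' && *p != ',' &&
		        ! isspace ( (unsigned char)*p ) )
			*p++ = ' ';
	}
}
// e.g. markOutSketch ( content , "<responseTimeMS>" ) leaves the tag in
// place but spaces out the number after it, so the CRC stays stable across
// runs with different response times.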