// a quickie
// this url gives a m_preCount that is too low. why?
// http://go.tfol.com/163/speed.asp
long countWords ( char *p , long plen , long niceness ) {
	char *pend  = p + plen;
	long  count = 1;
 loop:
	// sequence of punct
	for ( ; p < pend && ! is_alnum_utf8 (p) ; p += getUtf8CharSize(p) ) {
		// breathe
		QUICKPOLL ( niceness );
		// in case being set from xml tags, count as words now
		if ( *p == '<' ) count++;
	}
	count++;
	// sequence of alnum
	for ( ; p < pend && is_alnum_utf8 (p) ; p += getUtf8CharSize(p) )
		// breathe
		QUICKPOLL ( niceness );
	count++;
	if ( p < pend ) goto loop;
	// some extra for good measure
	return count + 10;
}
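// A minimal usage sketch (not part of the original file): countWords()
// alternates punct/alnum scans, bumping "count" once per run plus once per
// '<' seen inside a punctuation run, so tag-heavy buffers count higher.
// The buffer and call below are hypothetical; is_alnum_utf8(),
// getUtf8CharSize() and QUICKPOLL() are assumed from this codebase.
//
//	char buf[] = "foo, bar <b>baz";
//	long n = countWords ( buf , sizeof(buf)-1 , 0 /*niceness*/ );
//	// n includes the "+10 for good measure" padding added at the end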
// . ***** META LIST DELETE LOOP *****
// . scan for meta lists to remove from syncdb
// . check every D KEY
// . must NOT have any "need to send request" keys (a bit set)
// . must NOT have any "need to recv request" keys (b bit set)
// . must NOT have our "need to add" key (c bit set)
void Syncdb::loop3 ( ) {
	// . loop over the meta lists we need to delete
	// . these are "d" keys
	// . use a "tid" of 0
	key128_t sk = makeKey ( 0,0,0,1,0,0,0,0 );
	key128_t ek = makeKey ( 0,0,0,1,0,0xffffffff,0xffffffffffffffffLL,1 );
	// get the first node in sequence, if any
	long nn = m_qt.getNextNode ( 0 , (char *)&sk );
	// do the loop
	for ( ; nn >= 0 ; nn = m_qt.getNextNode ( nn ) ) {
		// breathe
		QUICKPOLL ( MAX_NICENESS );
		// get key
		key128_t k = *(key128_t *)m_qt.getKey ( nn );
		// stop when we hit the end
		if ( k > ek ) break;
		// get zid
		uint64_t zid = getZid ( &k );
		// get sid
		uint32_t sid = getSid ( &k );
		// have we sent/recvd all checkoff requests required? have
		// we added the meta list? if so, we can nuke it from syncdb
		if ( ! canDeleteMetaList ( sid, zid ) ) {
			// no use banging away at this sid any more since we
			// are missing another action for this one sid
			sid++;
			// find the key of the FIRST meta list we need to add
			// for this new senderId, "sid"
			key128_t nk = makeKey ( 0,0,0,1,0,sid,0,0 );
			// undo the m_qt.getNextNode(nn) we call in for loop
			nn = m_qt.getPrevNode ( 0 , (char *)&nk );
			// sanity check
			if ( nn < 0 ) { char *xx=NULL;*xx=0; }
			// get next key from this new sid
			continue;
		}
		// . make the negative key for syncdb
		// . it just uses a negative "c" key, with a tid of 0
		key128_t dk = makeKey ( 0,0,1,0,0,sid,zid,0);
		// . add it to syncdb to signify a delete
		// . this returns false and sets g_errno on error
		if ( ! m_rdb.addRecord ( (collnum_t)0,(char *)&dk,NULL,0,
					 MAX_NICENESS ) )
			return;
		// delete it from quick tree now that we added the negative
		// key successfully to syncdb
		long dn = m_qt.getNode ( 0, (char *)&k );
		// must be there! (getNode() returns a negative node number
		// when the key is missing; node 0 is a valid node, so test
		// for < 0 rather than ! dn)
		if ( dn < 0 ) { char *xx=NULL;*xx=0; }
		// nuke it
		m_qt.deleteNode ( dn , true );
	}
	// . success
	// . do not recall until big loop completes a round
	m_calledLoop3 = true;
}
// . returns false and sets g_errno on error
// . we are responsible for freeing reply/replySize
void Msg0::gotReply ( char *reply , int32_t replySize ,
		      int32_t replyMaxSize ) {
	logTrace( g_conf.m_logTraceMsg0, "BEGIN" );

	// timing debug
	if ( g_conf.m_logTimingNet && m_rdbId==RDB_POSDB && m_startTime > 0 )
		log(LOG_TIMING,
		    "net: msg0: Got termlist, termId=%" PRIu64". "
		    "Took %" PRId64" ms, replySize=%" PRId32
		    " (niceness=%" PRId32").",
		    g_posdb.getTermId ( m_startKey ) ,
		    gettimeofdayInMilliseconds()-m_startTime,
		    replySize,m_niceness);

	// TODO: insert some seals for security, may have to alloc
	// separate space for the list then

	// set the list w/ the remaining data
	QUICKPOLL(m_niceness);
	m_list->set ( reply ,
		      replySize ,
		      reply , // alloc buf begins here, too
		      replyMaxSize ,
		      m_startKey ,
		      m_endKey ,
		      m_fixedDataSize ,
		      true , // ownData?
		      m_useHalfKeys ,
		      m_ks );

	// return now if we don't add to cache
	//if ( ! m_addToCache ) return;
	//
	// add posdb list to termlist cache
	//
	//if ( m_rdbId != RDB_POSDB ) return;
	// add to LOCAL termlist cache
	//addToTermListCache(m_coll,m_startKey,m_endKey,m_list);
	// ignore any error adding to cache
	//g_errno = 0;

	// . NO! no more network caching, we got gigabit... save space
	//   for our disk, no replication, man, mem is expensive
	// . throw just the list into the net cache
	// . addToNetCache() will copy it for its own
	// . our current copy should be freed by the user's callback somewhere
	// . grab our corresponding rdb's local cache
	// . we'll use it to store this list since there's no collision chance
	//RdbCache *cache = m_rdb->getCache ();
	// . add the list to this cache
	// . returns false and sets g_errno on error
	// . will not be added if cannot copy the data
	//cache->addList ( m_startKey , m_list ) ;
	// reset g_errno -- we don't care if cache couldn't add it
	//g_errno = 0;

	logTrace( g_conf.m_logTraceMsg0, "END" );
}
// after you read/write from/to disk, copy into the page cache
void DiskPageCache::addPages ( long vfd,
			       char *buf,
			       long numBytes,
			       long long offset ,
			       long niceness ) {
	// check for override function
	//if ( m_isOverriden ) {
	//	m_addPages2 ( this,
	//		      vfd,
	//		      buf,
	//		      numBytes,
	//		      offset );
	//	return;
	//}
	// if vfd is -1, then we were not able to add a map for this file
	if ( vfd < 0 ) return;
	// no NULL ptrs
	if ( ! buf ) return;
	// return if no pages allowed in page cache
	if ( m_maxMemOff == 0 ) return;
	// or disabled
	if ( ! m_enabled ) return;
	// disabled at the master controls?
	if ( m_switch && ! *m_switch ) return;
	// sometimes the file got unlinked on us
	if ( ! m_memOff[vfd] ) return;
	// what is the page range?
	long long sp = offset / m_pageSize ;
	// point to it
	char *bufPtr = buf;
	char *bufEnd = buf + numBytes;
	// . do not add first page unless right on the boundary
	// . how much did we exceed the boundary by?
	long skip = offset - sp * m_pageSize ;
	long size = m_pageSize - skip;
	// now add the remaining pages
	while ( bufPtr < bufEnd ) {
		// breathe
		QUICKPOLL(niceness);
		// ensure "size" is not too big
		if ( bufPtr + size > bufEnd ) size = bufEnd - bufPtr;
		// add the page to memory
		addPage ( vfd , sp , bufPtr , size , skip );
		// advance
		bufPtr += size;
		sp++;
		size = m_pageSize;
		skip = 0;
	}
}
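// A worked example of the boundary arithmetic above (illustrative numbers
// only; m_pageSize is whatever the cache was configured with). Assuming a
// page size of 32768 and a read at offset=40000:
//
//	sp   = 40000 / 32768   = 1       (first page touched)
//	skip = 40000 - 1*32768 = 7232    (offset into that page)
//	size = 32768 - 7232    = 25536   (bytes left on that page)
//
// so the first addPage() call covers only the partial tail of page 1, and
// every later iteration resets skip=0 and size=m_pageSize to add whole
// pages until bufEnd truncates the final one.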
// . returns true if document is adult, false otherwise
bool AdultBit::getBit ( char *s , int32_t niceness ) {
	// rudimentary adult detection algorithm
	int32_t i = 0;
	int32_t dirties = 0;
	int32_t j;
	int32_t slen;
 loop:
	// skip until we hit an alpha
	while ( s[i] && ! is_alpha_a(s[i]) ) i++;
	// return if done
	if ( ! s[i] ) return false;
	// . point to char after this alpha
	// . return if none
	j = i + 1;
	// find end of the alpha char sequence
	while ( s[j] && is_alpha_a(s[j]) ) j++;
	// skip over 1 or 2 letter words
	slen = j - i;
	if ( slen <= 2 ) { i = j; goto loop; }
	// it's adult content if it has just 1 obscene word
	if ( isObscene ( (char *) s+i , slen ) ) return true;
	// W = non-dirty word
	// D = dirty word
	// . = sequence of punctuation/num and/or 1 to 2 letter words
	// dirty sequences:
	// . D . D . D .     (dirties=6)
	// . D . W . D . D . (dirties=5)
	// . basically, if 3 out of 4 words in a subsequence are
	//   "dirty" then the whole document is "adult" content
	if ( isDirty ( (char *) s+i , slen ) ) {
		dirties += 2;
		if ( dirties >= 5 ) return true;
		i = j;
		goto loop;
	}
	dirties--;
	if ( dirties < 0 ) dirties = 0;
	QUICKPOLL((niceness));
	i = j;
	goto loop;
}
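// A quick trace of the scoring above (hypothetical input, assuming the
// isDirty()/isObscene() word tables of this codebase): each dirty word of
// 3+ letters adds 2 to "dirties", each clean one subtracts 1 (floored at
// 0), and the document is flagged once dirties reaches 5:
//
//	words: D D D     -> 2, 4, 6    : flagged on the 3rd dirty word
//	words: D W D D   -> 2, 1, 3, 5 : flagged, matching the
//	                                 ". D . W . D . D ." comment above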
// . return false with g_errno set on error, true otherwise
// . looking at the number of points per second
// . average query latency for last 20 queries
// . average disk bytes read for last 20 accesses
// . val is the State::m_value measurement, a float
// . also each point may represent a number of bytes transferred in which
//   case we use that number rather than "1", which is the default
bool Statsdb::addPointsFromList ( Label *label ) {
	StatState *ss = getStatState ( label->m_graphHash );
	// return false with g_errno set
	if ( ! ss ) return false;
	m_list.resetListPtr();
	// scan the list for our junk
	for ( ; ! m_list.isExhausted() ; m_list.skipCurrentRecord() ) {
		// breathe
		QUICKPOLL(m_niceness);
		// get that
		StatKey *sk = (StatKey *)m_list.getCurrentRec();
		// and data
		StatData *sd = (StatData *)m_list.getCurrentData();
		// must be a "query" stat
		if ( sk->m_labelHash != label->m_labelHash ) continue;
		// add that
		addPoint ( sk , sd , ss , label );
	}
	return true;
}
//
// . add EventPoints to m_sb3/m_ht3
// . these basically represent binary events or parm state changes
// . i.e. "a merge operation"
// . i.e. "changing a parm value"
//
bool Statsdb::addEventPointsFromList ( ) {
	m_list.resetListPtr();
	// scan the list for our junk
	for ( ; ! m_list.isExhausted() ; m_list.skipCurrentRecord() ) {
		// breathe
		QUICKPOLL(m_niceness);
		// get that
		StatKey *sk = (StatKey *)m_list.getCurrentRec();
		// and data
		StatData *sd = (StatData *)m_list.getCurrentData();
		// must be an "event" stat... i.e. a status change
		if ( ! sd->isEvent() ) continue;
		// make sure to stack lines so they do not touch
		// each other...
		if ( ! addEventPoint ( sk->m_time1      ,
				       sk->m_labelHash  , // parmHash
				       sd->getOldVal () ,
				       sd->getNewVal () ,
				       10               ) ) // thickness
			return false;
	}
	return true;
}
// . THIS Msg0 class must be alloc'd, i.e. not on the stack, etc.
// . if list is stored locally this tries to get it locally
// . otherwise tries to get the list from the network
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . NOTE: i was having problems with queries being cached too long, you
//   see the cache here is a NETWORK cache, so when the machine that owns
//   the list updates it on disk it can't flush our cache... so use a
//   small maxCacheAge of like 30 seconds or so
bool Msg0::getList ( int64_t hostId      , // host to ask (-1 if none)
		     int32_t ip          , // info on hostId
		     int16_t port        ,
		     int32_t maxCacheAge , // max cached age in seconds
		     bool addToCache     , // add net recv'd list to cache?
		     char rdbId          , // specifies the rdb
		     collnum_t collnum   ,
		     RdbList *list       ,
		     const char *startKey ,
		     const char *endKey   ,
		     int32_t minRecSizes  , // use -1 for no max
		     void *state          ,
		     void (* callback)(void *state ),//, RdbList *list ) ,
		     int32_t niceness     ,
		     bool doErrorCorrection ,
		     bool includeTree     ,
		     bool doMerge         ,
		     int32_t firstHostId  ,
		     int32_t startFileNum ,
		     int32_t numFiles     ,
		     int64_t timeout      ,
		     int64_t syncPoint    ,
		     int32_t preferLocalReads ,
		     Msg5 *msg5           ,
		     bool isRealMerge     ,
		     bool allowPageCache  ,
		     bool forceLocalIndexdb ,
		     bool noSplit         ,
		     int32_t forceParitySplit ) {
	logTrace( g_conf.m_logTraceMsg0,
		  "BEGIN. hostId: %" PRId64", rdbId: %d",
		  hostId, (int)rdbId );

	// warning
	if ( collnum < 0 ) log(LOG_LOGIC,"net: NULL collection. msg0.");

	// reset the list they passed us
	list->reset();

	// get keySize of rdb
	m_ks = getKeySizeFromRdbId ( rdbId );

	// if( g_conf.m_logTraceMsg0 )
	// {
	//	log("%s:%s:%d: rdbId. [%d]", __FILE__,__func__,__LINE__,
	//	    (int)rdbId);
	//	log("%s:%s:%d: m_ks.. [%d]", __FILE__,__func__,__LINE__,
	//	    (int)m_ks);
	//	log("%s:%s:%d: hostId [%" PRId64"]", __FILE__,__func__,
	//	    __LINE__, hostId);
	// }

	// if startKey > endKey, don't read anything
	//if ( startKey > endKey ) return true;
	if ( KEYCMP(startKey,endKey,m_ks)>0 ) { char *xx=NULL;*xx=0; }//rettrue

	// . reset hostid if it is dead
	// . this is causing UOR queries to take forever when we have a dead
	if ( hostId >= 0 && g_hostdb.isDead ( hostId ) ) hostId = -1;

	// no longer accept negative minrecsize
	if ( minRecSizes < 0 ) {
		g_errno = EBADENGINEER;
		logTrace( g_conf.m_logTraceMsg0, "END" );
		log(LOG_LOGIC,
		    "net: msg0: Negative minRecSizes no longer supported.");
		char *xx=NULL;*xx=0;
	}

	// remember these
	m_state          = state;
	m_callback       = callback;
	m_list           = list;
	m_hostId         = hostId;
	m_niceness       = niceness;
	m_addToCache     = addToCache;
	// . these define our request 100%
	KEYSET(m_startKey,startKey,m_ks);
	KEYSET(m_endKey,endKey,m_ks);
	m_minRecSizes    = minRecSizes;
	m_rdbId          = rdbId;
	m_collnum        = collnum; // = coll;
	m_isRealMerge    = isRealMerge;
	m_allowPageCache = allowPageCache;

	// . group to ask is based on the first key
	// . we only do 1 group per call right now
	// . groupMask must turn on higher bits first (count downwards kinda)
	// . titledb and spiderdb use special masks to get groupId

	// if diffbot.cpp is reading spiderdb from each shard we have to
	// get groupid from hostid here lest we core in getGroupId() below.
	// it does that for dumping spiderdb to the client browser. they
	// can download the whole enchilada.
	if ( hostId >= 0 && m_rdbId == RDB_SPIDERDB )
		m_shardNum = 0;
	// did they force it? core until i figure out what this is
	else if ( forceParitySplit >= 0 )
		//m_groupId =  g_hostdb.getGroupId ( forceParitySplit );
		m_shardNum = forceParitySplit;
	else
		//m_groupId = getGroupId ( m_rdbId , startKey , ! noSplit );
		m_shardNum = getShardNum ( m_rdbId , startKey );

	// if we are looking up a termlist in posdb that is split by termid
	// and not the usual docid then we have to set this posdb key bit
	// that tells us that ...
	if ( noSplit && m_rdbId == RDB_POSDB )
		m_shardNum = g_hostdb.getShardNumByTermId ( startKey );

	// how is this used?
	if ( forceLocalIndexdb ) m_shardNum = getMyShardNum();

	// if( g_conf.m_logTraceMsg0 )
	//	log("%s:%s:%d: shardNum [%" PRId32"]", __FILE__,__func__,
	//	    __LINE__, m_shardNum);

	// . store these parameters
	// . get a handle to the rdb in case we can satisfy locally
	// . returns NULL and sets g_errno on error
	QUICKPOLL((m_niceness));
	Rdb *rdb = getRdbFromId ( m_rdbId );
	if ( ! rdb ) return true;

	// we need the fixedDataSize
	m_fixedDataSize = rdb->getFixedDataSize();
	m_useHalfKeys   = rdb->useHalfKeys();

	// . debug msg
	// . Msg2 does this when checking for a cached compound list.
	//   compound lists do not actually exist, they are merges of
	//   smaller UOR'd lists.
	if ( maxCacheAge != 0 && ! addToCache &&
	     (numFiles > 0 || includeTree) )
		log( LOG_LOGIC, "net: msg0: Weird. check but don't add... "
		     "rdbid=%" PRId32".", (int32_t)m_rdbId );

	// set this here since we may not call msg5 if list not local
	//m_list->setFixedDataSize ( m_fixedDataSize );

	// . now that we do load balancing we don't want to do a disk lookup
	//   even if local if we are merging or dumping
	// . UNLESS g_conf.m_preferLocalReads is true
	if ( preferLocalReads == -1 )
		preferLocalReads = g_conf.m_preferLocalReads;

	// . always prefer local for full split clusterdb
	// . and keep the tfndb/titledb lookups in the same stripe
	// . so basically we can't do biased caches if fully split
	//if ( g_conf.m_fullSplit ) preferLocalReads = true;
	preferLocalReads = true;

	// is it stored locally?
	bool isLocal = ( m_hostId == -1 &&
			 //g_hostdb.m_groupId == m_groupId );
			 m_shardNum == getMyShardNum() );

	// only do local lookups if this is true
	if ( ! preferLocalReads ) isLocal = false;

	/*
	int64_t singleDocIdQuery = 0LL;
	if ( rdbId == RDB_POSDB ) {
		int64_t d1 = g_posdb.getDocId(m_startKey);
		int64_t d2 = g_posdb.getDocId(m_endKey);
		if ( d1+1 == d2 ) singleDocIdQuery = d1;
	}

	// . try the LOCAL termlist cache
	// . so when msg2 is evaluating a gbdocid:| query and it has to
	//   use msg0 to go across the network to get the same damn termlist
	//   over and over again for the same docid, this will help alot.
	// . ideally it'd be nice if the seo pipe in xmldoc.cpp can try to
	//   send the same gbdocid:xxxx docids to the same hosts. maybe hash
	//   based on docid into the list of hosts and if that host is busy
	//   just chain until we find someone not busy.
	if ( singleDocIdQuery &&
	     getListFromTermListCache ( coll,
					m_startKey,
					m_endKey,
					maxCacheAge,
					list ) )
		// found!
		return true;
	*/

	// but always local if only one host
	if ( g_hostdb.getNumHosts() == 1 ) isLocal = true;

	// . if the group is local then do it locally
	// . Msg5::getList() returns false if blocked, true otherwise
	// . Msg5::getList() sets g_errno on error
	// . don't do this if m_hostId was specified
	if ( isLocal ) {
		logTrace( g_conf.m_logTraceMsg0, "isLocal" );
		if ( msg5 ) {
			m_msg5 = msg5;
			m_deleteMsg5 = false;
		}
		else {
			try { m_msg5 = new ( Msg5 ); }
			catch ( ... ) {
				g_errno = ENOMEM;
				log("net: Local alloc for disk read failed "
				    "while trying to read data for %s. "
				    "Trying remote request.",
				    getDbnameFromId(m_rdbId));
				goto skip;
			}
			mnew ( m_msg5 , sizeof(Msg5) , "Msg0::Msg5" );
			m_deleteMsg5 = true;
		}

		QUICKPOLL(m_niceness);
		if ( ! m_msg5->getList ( rdbId,
					 m_collnum ,
					 m_list ,
					 m_startKey ,
					 m_endKey ,
					 m_minRecSizes ,
					 includeTree , // include Tree?
					 addToCache  , // addToCache?
					 maxCacheAge ,
					 startFileNum ,
					 numFiles ,
					 this ,
					 gotListWrapper2 ,
					 niceness ,
					 doErrorCorrection ,
					 NULL , // cacheKeyPtr
					 0    , // retryNum
					 -1   , // maxRetries
					 true , // compensateForMerge
					 syncPoint ,
					 m_isRealMerge ,
					 m_allowPageCache ) ) {
			logTrace( g_conf.m_logTraceMsg0,
				  "END, return false" );
			return false;
		}
		// nuke it
		reset();
		logTrace( g_conf.m_logTraceMsg0, "END, return true" );
		return true;
	}
 skip:
	// debug msg
	if ( g_conf.m_logDebugQuery )
		log(LOG_DEBUG,"net: msg0: Sending request for data to "
		    "shard=%" PRIu32" "
		    "listPtr=%" PTRFMT" minRecSizes=%" PRId32
		    " termId=%" PRIu64" "
		    //"startKey.n1=%" PRIx32",n0=%" PRIx64
		    //" (niceness=%" PRId32")",
		    "startKey.n1=%" PRIx64",n0=%" PRIx64
		    " (niceness=%" PRId32")",
		    //g_hostdb.makeHostId ( m_groupId ) ,
		    m_shardNum,
		    (PTRTYPE)m_list,
		    m_minRecSizes,
		    g_posdb.getTermId(m_startKey) ,
		    //m_startKey.n1,m_startKey.n0 , (int32_t)m_niceness);
		    KEY1(m_startKey,m_ks),KEY0(m_startKey),
		    (int32_t)m_niceness);

	char *replyBuf = NULL;
	int32_t replyBufMaxSize = 0;
	bool freeReply = true;

	// . make a request with the info above (note: not in network order)
	// . IMPORTANT!!!!! if you change this change
	//   Multicast.cpp::sleepWrapper1 too!!!!!!!!!!!!
	//   no, not anymore, we commented out that request peeking code
	char *p = m_request;
	*(int64_t *) p = syncPoint     ; p += 8;
	//*(key_t *) p = m_startKey    ; p += sizeof(key_t);
	//*(key_t *) p = m_endKey      ; p += sizeof(key_t);
	*(int32_t *) p = m_minRecSizes ; p += 4;
	*(int32_t *) p = startFileNum  ; p += 4;
	*(int32_t *) p = numFiles      ; p += 4;
	*(int32_t *) p = maxCacheAge   ; p += 4;
	if ( p - m_request != RDBIDOFFSET ) { char *xx=NULL;*xx=0; }
	*p = m_rdbId                ; p++;
	*p = addToCache             ; p++;
	*p = doErrorCorrection      ; p++;
	*p = includeTree            ; p++;
	*p = (char)niceness         ; p++;
	*p = (char)m_allowPageCache ; p++;
	KEYSET(p,m_startKey,m_ks)   ; p += m_ks;
	KEYSET(p,m_endKey,m_ks)     ; p += m_ks;
	// NULL terminated collection name
	//strcpy ( p , coll ); p += gbstrlen ( coll ); *p++ = '\0';
	*(collnum_t *)p = m_collnum; p += sizeof(collnum_t);
	m_requestSize = p - m_request;

	// ask an individual host for this list if hostId is NOT -1
	if ( m_hostId != -1 ) {
		// get Host
		Host *h = g_hostdb.getHost ( m_hostId );
		if ( ! h ) {
			g_errno = EBADHOSTID;
			log(LOG_LOGIC,"net: msg0: Bad hostId of %" PRId64".",
			    m_hostId);
			logTrace( g_conf.m_logTraceMsg0,
				  "END, return true. Bad hostId" );
			return true;
		}
		// if niceness is 0, use the higher priority udpServer
		UdpServer *us ;
		uint16_t   port;
		QUICKPOLL(m_niceness);
		us   = &g_udpServer ;
		port = h->m_port ;
		// . returns false on error and sets g_errno, true otherwise
		// . calls callback when reply is received (or error)
		// . we return true if it returns false
		if ( ! us->sendRequest ( m_request ,
					 m_requestSize ,
					 0x00 , // msgType
					 h->m_ip ,
					 port ,
					 m_hostId ,
					 NULL , // the slotPtr
					 this ,
					 gotSingleReplyWrapper ,
					 timeout ,
					 -1 , // backoff
					 -1 , // maxwait
					 replyBuf ,
					 replyBufMaxSize ,
					 m_niceness ) ) { // cback niceness
			logTrace( g_conf.m_logTraceMsg0,
				  "END, return true. Request sent" );
			return true;
		}
		// return false cuz it blocked
		logTrace( g_conf.m_logTraceMsg0,
			  "END, return false. sendRequest blocked" );
		return false;
	}

	// timing debug
	if ( g_conf.m_logTimingNet )
		m_startTime = gettimeofdayInMilliseconds();
	else
		m_startTime = 0;

	// . get the top int32_t of the key
	// . i guess this will work for 128 bit keys... hmmmmm
	int32_t keyTop = hash32 ( (char *)startKey , m_ks );

	// . otherwise, multicast to a host in group "groupId"
	// . returns false and sets g_errno on error
	// . calls callback on completion
	// . select first host to send to in group based on upper 32 bits
	//   of termId (m_startKey.n1)
	// . need to send out to all the indexdb split hosts
	m_numRequests = 0;
	m_numReplies  = 0;
	//for ( int32_t i = 0; i < m_numSplit; i++ ) {

	QUICKPOLL(m_niceness);
	//int32_t gr;
	char *buf;
	buf = replyBuf;

	// get the multicast
	Multicast *m = &m_mcast;

	if ( ! m->send ( m_request ,
			 m_requestSize,
			 0x00 ,  // msgType 0x00
			 false , // does multicast own request?
			 m_shardNum ,
			 false , // send to whole group?
			 //m_startKey.n1, // key is passed on startKey
			 keyTop , // key is passed on startKey
			 this ,  // state data
			 NULL ,  // state data
			 gotMulticastReplyWrapper0 ,
			 timeout*1000 , // timeout
			 niceness ,
			 firstHostId ,
			 buf ,
			 replyBufMaxSize ,
			 freeReply , // free reply buf?
			 true ,      // do disk load balancing?
			 maxCacheAge ,
			 //(key_t *)cacheKey ,
			 // multicast uses it for determining the best
			 // host to send the request to when doing
			 // disk load balancing. if the host has our
			 // data cached, then it will probably get to
			 // handle the request. for now let's just assume
			 // this is a 96-bit key. TODO: fix...
			 0 , // *(key_t *)cacheKey ,
			 rdbId ,
			 minRecSizes ) ) {
		log(LOG_ERROR,
		    "net: Failed to send request for data from %s in shard "
		    "#%" PRIu32" over network: %s.",
		    getDbnameFromId(m_rdbId), m_shardNum,
		    mstrerror(g_errno));
		// but speed it up
		m_errno = g_errno;
		m->reset();
		if ( m_numRequests > 0 ) {
			logTrace( g_conf.m_logTraceMsg0,
				  "END - returning false" );
			return false;
		}
		logTrace( g_conf.m_logTraceMsg0, "END - returning true" );
		return true;
	}

	m_numRequests++;

	// we blocked
	logTrace( g_conf.m_logTraceMsg0, "END - returning false, blocked" );
	return false;
}
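// For reference, a sketch of the m_request wire layout built above
// (offsets inferred from the serialization code itself; RDBIDOFFSET is
// defined elsewhere in this codebase and must equal 24 for the sanity
// check to pass):
//
//	bytes  0-7  : syncPoint     (int64_t)
//	bytes  8-11 : m_minRecSizes (int32_t)
//	bytes 12-15 : startFileNum  (int32_t)
//	bytes 16-19 : numFiles      (int32_t)
//	bytes 20-23 : maxCacheAge   (int32_t)
//	bytes 24-29 : six 1-byte fields: rdbId, addToCache,
//	              doErrorCorrection, includeTree, niceness,
//	              allowPageCache
//	then        : startKey (m_ks bytes), endKey (m_ks bytes),
//	              collnum (sizeof(collnum_t) bytes)
//
// note the comment in the code: fields are written in host byte order,
// not network order.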
// . slot should be auto-nuked upon transmission or error
// . TODO: ensure if this sendReply() fails does it really nuke the slot?
void gotListWrapper ( void *state , RdbList *listb , Msg5 *msg5xx ) {
	// get the state
	State00 *st0 = (State00 *)state;
	// extract the udp slot and list and msg5
	UdpSlot   *slot =  st0->m_slot;
	RdbList   *list = &st0->m_list;
	Msg5      *msg5 = &st0->m_msg5;
	UdpServer *us   =  st0->m_us;
	// sanity check -- ensure they match
	//if ( niceness != st0->m_niceness )
	//	log("Msg0: niceness mismatch");
	// debug msg
	//if ( niceness != 0 )
	//	log("HEY! niceness is not 0");
	// timing debug
	if ( g_conf.m_logTimingNet || g_conf.m_logDebugNet ) {
		//log("Msg0:hndled request %"UINT64"",
		//    gettimeofdayInMilliseconds());
		int32_t size = -1;
		if ( list ) size = list->getListSize();
		log(LOG_TIMING|LOG_DEBUG,
		    "net: msg0: Handled request for data. "
		    "Now sending data termId=%"UINT64" size=%"INT32""
		    " transId=%"INT32" ip=%s port=%i took=%"INT64" "
		    "(niceness=%"INT32").",
		    g_posdb.getTermId(msg5->m_startKey),
		    size, slot->m_transId,
		    iptoa(slot->m_ip), slot->m_port,
		    gettimeofdayInMilliseconds() - st0->m_startTime ,
		    st0->m_niceness );
	}
	// debug
	//if ( ! msg5->m_includeTree )
	//	log("hotit\n");
	// on error nuke the list and its data
	if ( g_errno ) {
		mdelete ( st0 , sizeof(State00) , "Msg0" );
		delete (st0);
		// TODO: free "slot" if this send fails
		us->sendErrorReply ( slot , g_errno );
		return;
	}

	QUICKPOLL(st0->m_niceness);

	// point to the serialized list in "list"
	char    *data      = list->getList();
	int32_t  dataSize  = list->getListSize();
	char    *alloc     = list->getAlloc();
	int32_t  allocSize = list->getAllocSize();
	// tell list not to free the data since it is a reply so UdpServer
	// will free it when it destroys the slot
	list->setOwnData ( false );
	// keep track of stats
	Rdb *rdb = getRdbFromId ( st0->m_rdbId );
	if ( rdb ) rdb->sentReplyGet ( dataSize );
	// TODO: can we free any memory here???

	// keep track of how long it takes to complete the send
	st0->m_startTime = gettimeofdayInMilliseconds();
	// debug point
	int32_t oldSize = msg5->m_minRecSizes;
	int32_t newSize = msg5->m_minRecSizes + 20;
	// watch for wrap around
	if ( newSize < oldSize ) newSize = 0x7fffffff;
	if ( dataSize > newSize && list->getFixedDataSize() == 0 &&
	     // do not annoy me with these linkdb msgs
	     dataSize > newSize+100 )
		log(LOG_LOGIC,"net: msg0: Sending more data than what was "
		    "requested. Inefficient. Bad engineer. dataSize=%"INT32" "
		    "minRecSizes=%"INT32".", dataSize, oldSize);

	/*
	// always compress these lists
	if ( st0->m_rdbId == RDB_SECTIONDB ) { // && 1 == 3) {
		// get sh48, the sitehash
		key128_t *startKey = (key128_t *)msg5->m_startKey ;
		int64_t sh48 = g_datedb.getTermId(startKey);
		// debug
		//log("msg0: got sectiondblist from disk listsize=%"INT32"",
		//    list->getListSize());
		if ( dataSize > 50000 )
			log("msg0: sending back list rdb=%"INT32" "
			    "listsize=%"INT32" sh48=0x%"XINT64"",
			    (int32_t)st0->m_rdbId, dataSize, sh48);
		// save it
		int32_t origDataSize = dataSize;
		// store compressed list on itself
		char *dst = list->m_list;
		// warn if niceness is 0!
		if ( st0->m_niceness == 0 )
			log("msg0: compressing sectiondb list at "
			    "niceness 0!");
		// compress the list
		uint32_t lastVoteHash32 = 0LL;
		SectionVote *lastVote = NULL;
		for ( ; ! list->isExhausted() ;
		      list->skipCurrentRecord() ) {
			// breathe
			QUICKPOLL ( st0->m_niceness );
			// get rec
			char *rec = list->getCurrentRec();
			// for here
			key128_t *key = (key128_t *)rec;
			// the score is the bit which was set in
			// Section::m_flags for that docid
			int32_t secType = g_indexdb.getScore ( (char *)key );
			// 0 means it probably used to count # of voters
			// from this site, so i don't think xmldoc uses
			// that any more
			if ( secType == SV_SITE_VOTER ) continue;
			// treat key like a datedb key and get the taghash
			uint32_t h32 = g_datedb.getDate ( key );
			// get data/vote from the current record in the
			// sectiondb list
			SectionVote *sv =
				(SectionVote *)list->getCurrentData ();
			// get the average score for this doc
			float avg = sv->m_score ;
			if ( sv->m_numSampled > 0.0 )
				avg /= sv->m_numSampled;
			// if same as last guy, add to it
			if ( lastVoteHash32 == h32 && lastVote ) {
				// turn possible multi-vote into single
				// docid into a single vote, with the
				// score averaged.
				lastVote->m_score += avg;
				lastVote->m_numSampled++;
				continue;
			}
			// otherwise, add in a new guy!
			*(key128_t *)dst = *key;
			dst += sizeof(key128_t);
			// the new vote
			SectionVote *dsv = (SectionVote *)dst;
			dsv->m_score = avg;
			dsv->m_numSampled = 1;
			// set this
			lastVote = dsv;
			lastVoteHash32 = h32;
			// skip over
			dst += sizeof(SectionVote);
		}
		// update the list size now for sending back
		dataSize = dst - data;
		// if the list was over the requested minrecsizes we need
		// to set a flag so that the caller will do a re-call.
		// so making the entire size odd will be the flag.
		if ( origDataSize > msg5->m_minRecSizes &&
		     dataSize < origDataSize ) {
			*dst++ = '\0';
			dataSize++;
		}
		// debug
		//log("msg0: compressed sectiondblist from disk "
		//    "newlistsize=%"INT32"", dataSize);
		// use this timestamp
		int32_t now = getTimeLocal();//Global();
		// finally, cache this sucker
		s_sectiondbCache.addRecord ( msg5->m_coll,
					     (char *)startKey,//(char *)&sh48
					     data,
					     dataSize ,
					     now );
		// ignore errors
		g_errno = 0;
	}
	*/

	//
	// for linkdb lists, remove all the keys that have the same IP32
	// and store a count of what we removed somewhere
	//
	if ( st0->m_rdbId == RDB_LINKDB ) {
		// store compressed list on itself
		char *dst = list->m_list;
		// keep stats
		int32_t totalOrigLinks = 0;
		int32_t ipDups         = 0;
		int32_t lastIp32       = 0;
		char *listEnd = list->getListEnd();
		// compress the list
		for ( ; ! list->isExhausted() ;
		      list->skipCurrentRecord() ) {
			// breathe
			QUICKPOLL ( st0->m_niceness );
			// count it
			totalOrigLinks++;
			// get rec
			char *rec = list->getCurrentRec();
			int32_t ip32 =
				g_linkdb.getLinkerIp_uk((key224_t *)rec );
			// same as one before?
			if ( ip32 == lastIp32 &&
			     // are we the last rec? include that for
			     // advancing the m_nextKey in Linkdb more
			     // efficiently.
			     rec + LDBKS < listEnd ) {
				ipDups++;
				continue;
			}
			// store it
			gbmemcpy (dst , rec , LDBKS );
			dst += LDBKS;
			// update it
			lastIp32 = ip32;
		}
		// . if we removed one key, store the stats
		// . caller should recognize reply is not a multiple of
		//   the linkdb key size LDBKS and know it's there!
		if ( ipDups ) {
			//*(int32_t *)dst = totalOrigLinks;
			//dst += 4;
			//*(int32_t *)dst = ipDups;
			//dst += 4;
		}
		// update list parms
		list->m_listSize = dst - list->m_list;
		list->m_listEnd  = list->m_list + list->m_listSize;
		data     = list->getList();
		dataSize = list->getListSize();
	}

	//log("sending replySize=%"INT32" min=%"INT32"",
	//    dataSize,msg5->m_minRecSizes);

	// . TODO: dataSize may not equal list->getListMaxSize() so
	//   Mem class may show an imbalance
	// . now g_udpServer is responsible for freeing data/dataSize
	// . the "true" means to call doneSending_ass() from the signal
	//   handler if need be
	st0->m_us->sendReply_ass ( data ,
				   dataSize ,
				   alloc ,     // alloc
				   allocSize , // alloc size
				   slot ,
				   60 ,
				   st0 ,
				   doneSending_ass ,
				   -1 ,
				   -1 ,
				   true );
}
// hostId is the remote hostid sending us the lock request
void removeExpiredLocks ( int32_t hostId ) {
	// when we last cleaned them out
	static time_t s_lastTime = 0;

	int32_t nowGlobal = getTimeGlobalNoCore();

	// only do this once per second at the most
	if ( nowGlobal <= s_lastTime ) return;

	// shortcut
	HashTableX *ht = &g_spiderLoop.m_lockTable;

 restart:

	// scan the slots
	int32_t ns = ht->m_numSlots;

	// . clean out expired locks...
	// . if lock was there and m_expired is up, then nuke it!
	// . when Rdb.cpp receives the "fake" title rec it removes the
	//   lock, only it just sets the m_expired to a few seconds in the
	//   future to give the negative doledb key time to be absorbed.
	//   that way we don't repeat the same url we just got done
	//   spidering.
	// . this happens when we launch our lock request on a url that we
	//   or a twin is spidering or has just finished spidering, and
	//   we get the lock, but we avoided the negative doledb key.
	for ( int32_t i = 0 ; i < ns ; i++ ) {
		// breathe
		QUICKPOLL(MAX_NICENESS);
		// skip if empty
		if ( ! ht->m_flags[i] ) continue;
		// cast lock
		UrlLock *lock = (UrlLock *)ht->getValueFromSlot(i);
		int64_t lockKey = *(int64_t *)ht->getKeyFromSlot(i);
		// if collnum got deleted or reset
		collnum_t collnum = lock->m_collnum;
		if ( collnum >= g_collectiondb.m_numRecs ||
		     ! g_collectiondb.m_recs[collnum] ) {
			log("spider: removing lock from missing collnum "
			    "%" PRId32,(int32_t)collnum);
			goto nuke;
		}
		// skip if not yet expired
		if ( lock->m_expires == 0 ) continue;
		if ( lock->m_expires >= nowGlobal ) continue;
		// note it for now
		if ( g_conf.m_logDebugSpider )
			log("spider: removing lock after waiting. "
			    "elapsed=%" PRId32"."
			    " lockKey=%" PRIu64" hid=%" PRId32
			    " expires=%" PRIu32" "
			    "nowGlobal=%" PRIu32,
			    (nowGlobal - lock->m_timestamp),
			    lockKey,hostId,
			    (uint32_t)lock->m_expires,
			    (uint32_t)nowGlobal);
	nuke:
		// nuke the slot and possibly re-chain
		ht->removeSlot ( i );
		// gotta restart from the top since table may have shrunk
		goto restart;
	}

	// store it
	s_lastTime = nowGlobal;
}
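// A note on the goto-restart pattern above: HashTableX::removeSlot() can
// re-chain or shrink the table, so the scan index "i" may be stale after
// a removal, and the loop restarts from slot 0. A minimal generic sketch
// of the same idea (hypothetical table type, for illustration only):
//
//	restart:
//	for ( int32_t i = 0 ; i < table->numSlots() ; i++ ) {
//		if ( ! shouldRemove ( table , i ) ) continue;
//		table->removeSlot ( i ); // may rearrange slots
//		goto restart;            // indexes are stale now
//	}
//
// worst case this is O(n^2), but it is fine here since expired locks are
// cleaned at most once per second.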
// . now come here when we got the necessary index lists
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool Msg39::intersectLists ( ) { // bool updateReadInfo ) {
	// bail on error
	if ( g_errno ) {
	hadError:
		log("msg39: Had error getting termlists: %s.",
		    mstrerror(g_errno));
		if ( ! g_errno ) { char *xx=NULL;*xx=0; }
		//sendReply (m_slot,this,NULL,0,0,true);
		return true;
	}

	// timestamp log
	if ( m_debug ) {
		log(LOG_DEBUG,"query: msg39: [%"PTRFMT"] "
		    "Got %"INT32" lists in %"INT64" ms" ,
		    (PTRTYPE)this,m_tmpq.getNumTerms(),
		    gettimeofdayInMilliseconds() - m_startTime);
		m_startTime = gettimeofdayInMilliseconds();
	}

	// breathe
	QUICKPOLL ( m_r->m_niceness );

	// ensure collection not deleted from under us
	CollectionRec *cr = g_collectiondb.getRec ( m_r->m_collnum );
	if ( ! cr ) {
		g_errno = ENOCOLLREC;
		goto hadError;
	}

	// . set the IndexTable so it can set its score weights from the
	//   termFreqs of each termId in the query
	// . this now takes into account the special termIds used for
	//   sorting by date (0xdadadada and 0xdadadad2 & TERMID_MASK)
	// . it should weight them so much so that the summation of scores
	//   from other query terms cannot make up for a lower date score
	// . this will actually calculate the top
	// . this might also change m_tmpq.m_termSigns
	// . this won't do anything if it was already called
	m_posdbTable.init ( &m_tmpq ,
			    m_debug ,
			    this ,
			    &m_tt ,
			    m_r->m_collnum,//ptr_coll ,
			    &m_msg2 , // m_lists ,
			    //m_tmpq.m_numTerms , // m_numLists
			    m_r );

	// breathe
	QUICKPOLL ( m_r->m_niceness );

	// . we have to do this here now too
	// . but if we are getting weights, we don't need m_tt!
	// . actually we were using it before for rat=0/bool queries but
	//   i got rid of NO_RAT_SLOTS
	if ( ! m_allocedTree && ! m_posdbTable.allocTopTree() ) {
		if ( ! g_errno ) { char *xx=NULL;*xx=0; }
		//sendReply ( m_slot , this , NULL , 0 , 0 , true);
		return true;
	}

	// if msg2 had ALL empty lists we can cut it short
	if ( m_posdbTable.m_topTree->m_numNodes == 0 ) {
		//estimateHitsAndSendReply ( );
		return true;
	}

	// we have to allocate this with each call because each call can
	// be a different docid range from doDocIdSplitLoop.
	if ( ! m_posdbTable.allocWhiteListTable() ) {
		log("msg39: Had error allocating white list table: %s.",
		    mstrerror(g_errno));
		if ( ! g_errno ) { char *xx=NULL;*xx=0; }
		//sendReply (m_slot,this,NULL,0,0,true);
		return true;
	}

	// do not re do it if doing docid range splitting
	m_allocedTree = true;

	// . now we must call this separately here, not in allocTopTree()
	// . we have to re-set the QueryTermInfos with each docid range
	//   split since it will set the list ptrs from the msg2 lists
	if ( ! m_posdbTable.setQueryTermInfo () ) return true;

	// print query term bit numbers here
	for ( int32_t i = 0 ;
	      m_debug && i < m_tmpq.getNumTerms() ; i++ ) {
		QueryTerm *qt = &m_tmpq.m_qterms[i];
		//utf16ToUtf8(bb, 256, qt->m_term, qt->m_termLen);
		char *tpc = qt->m_term + qt->m_termLen;
		char  tmp = *tpc;
		*tpc = '\0';
		SafeBuf sb;
		sb.safePrintf("query: msg39: BITNUM query term #%"INT32" "
			      "\"%s\" bitnum=%"INT32" ",
			      i , qt->m_term, qt->m_bitNum );
		// put it back
		*tpc = tmp;
		logf(LOG_DEBUG,"%s",sb.getBufStart());
	}

	// timestamp log
	if ( m_debug ) {
		log(LOG_DEBUG,"query: msg39: [%"PTRFMT"] "
		    "Preparing to intersect took %"INT64" ms",
		    (PTRTYPE)this,
		    gettimeofdayInMilliseconds() - m_startTime );
		m_startTime = gettimeofdayInMilliseconds();
	}

	// time it
	int64_t start = gettimeofdayInMilliseconds();
	int64_t diff;

	// . don't bother making a thread if lists are small
	// . look at STAGE? in IndexReadInfo.cpp to see how we read in
	//   stages
	// . it's always saying msg39 handler is hogging cpu...could this
	//   be it
	//if ( m_msg2.getTotalRead() < 2000*8 ) goto skipThread;

	// debug
	//goto skipThread;

	// . NOW! let's do this in a thread so we can continue to service
	//   incoming requests
	// . don't launch more than 1 thread at a time for this
	// . set callback when thread done

	// breathe
	QUICKPOLL ( m_r->m_niceness );

	// . create the thread
	// . only one of these type of threads should be launched at a time
	if ( ! m_debug &&
	     g_threads.call ( INTERSECT_THREAD , // threadType
			      m_r->m_niceness  ,
			      this             , // top 4 bytes must be cback
			      controlLoopWrapper2,//threadDoneWrapper ,
			      addListsWrapper  ) ) {
		m_blocked = true;
		return false;
	}
	// if it failed
	//log(LOG_INFO,"query: Intersect thread creation failed. Doing "
	//    "blocking. Hurts performance.");

	// check tree
	if ( m_tt.m_nodes == NULL ) {
		log(LOG_LOGIC,"query: msg39: Badness.");
		char *xx = NULL; *xx = 0;
	}

	// sometimes we skip the thread
	//skipThread:

	// . addLists() should never have a problem
	// . g_errno should be set by prepareToAddLists() above if there is
	//   going to be a problem
	//if ( m_r->m_useNewAlgo )
	m_posdbTable.intersectLists10_r ( );
	//else
	//	m_posdbTable.intersectLists9_r ( );

	// time it
	diff = gettimeofdayInMilliseconds() - start;
	if ( diff > 10 )
		log("query: Took %"INT64" ms for intersection",diff);

	// returns false if blocked, true otherwise
	//return addedLists ();
	return true;
}
// . the main function to get the docids for the provided query in "req"
// . it always blocks i guess
void Msg39::getDocIds2 ( Msg39Request *req ) {
	// flag it as in use
	m_inUse = true;

	// store it, might be redundant if called from getDocIds() above
	m_r = req;

	// a handy thing
	m_debug = false;
	if ( m_r->m_debug            ) m_debug = true;
	if ( g_conf.m_logDebugQuery  ) m_debug = true;
	if ( g_conf.m_logTimingQuery ) m_debug = true;

	// ensure its size is ok
	/*
	if ( m_r->size_whiteList <= 0 ) {
		g_errno = ENOCOLLREC;
		log(LOG_LOGIC,"query: msg39: getDocIds: %s." ,
		    mstrerror(g_errno) );
		sendReply ( m_slot , this , NULL , 0 , 0 , true );
		return ;
	}
	*/

	CollectionRec *cr = g_collectiondb.getRec ( m_r->m_collnum );
	if ( ! cr ) {
		g_errno = ENOCOLLREC;
		log(LOG_LOGIC,"query: msg39: getDocIds: %s." ,
		    mstrerror(g_errno) );
		sendReply ( m_slot , this , NULL , 0 , 0 , true );
		return ;
	}

	// . set our m_q class
	// . m_boolFlag is either 1 or 0 in this case, the caller did the
	//   auto-detect (boolFlag of 2) before calling us
	// . this now calls Query::addCompoundTerms() for us
	if ( ! m_tmpq.set2 ( m_r->ptr_query ,
			     m_r->m_language ,
			     m_r->m_queryExpansion ,
			     m_r->m_useQueryStopWords ) ) {
		log("query: msg39: setQuery: %s." ,
		    mstrerror(g_errno) );
		sendReply ( m_slot , this , NULL , 0 , 0 , true );
		return ;
	}

	// wtf?
	if ( g_errno ) { char *xx=NULL;*xx=0; }

	QUICKPOLL ( m_r->m_niceness );

	// set m_errno
	if ( m_tmpq.m_truncated ) m_errno = EQUERYTRUNCATED;

	// ensure matches with the msg3a sending us this request
	if ( m_tmpq.getNumTerms() != m_r->m_nqt ) {
		g_errno = EBADENGINEER;
		log("query: Query parsing inconsistency for q=%s. "
		    "langid=%"INT32". Check langids and m_queryExpansion "
		    "parms which are the only parms that could be "
		    "different in Query::set2(). You probably have "
		    "different mysynonyms.txt files on two different "
		    "hosts! check that!!",
		    m_tmpq.m_orig,
		    (int32_t)m_r->m_language );
		sendReply ( m_slot , this , NULL , 0 , 0 , true );
		return ;
	}

	// debug
	if ( m_debug )
		logf(LOG_DEBUG,"query: msg39: [%"PTRFMT"] Got request "
		     "for q=%s", (PTRTYPE) this,m_tmpq.m_orig);

	// reset this
	m_tt.reset();

	QUICKPOLL ( m_r->m_niceness );

	// . if caller already specified a docid range, then be loyal to
	//   that!
	// . or if we do not have enough query terms to warrant splitting
	//if ( m_numDocIdSplits == 1 ) {
	//	getLists();
	//	return;
	//}

	// . set up docid range cursor
	// . do twin splitting
	// . we do not do it this way any more... we subsplit each split
	//   into two halves...!!! see logic in getLists() below!!!
	//if ( m_r->m_stripe == 1 ) {
	//	m_ddd    = MAX_DOCID / 2LL;
	//	m_dddEnd = MAX_DOCID + 1LL;
	//}
	//else if ( m_r->m_stripe == 0 ) {
	//	m_ddd    = 0;
	//	m_dddEnd = MAX_DOCID / 2LL;
	//}
	// support triplets, etc. later
	//else {
	//	char *xx=NULL;*xx=0;
	//}

	// do not do twin splitting if only one host per group
	//if ( g_hostdb.getNumStripes() == 1 ) {
	m_ddd    = 0;
	m_dddEnd = MAX_DOCID;
	//}

	m_phase = 0;

	// if ( m_r->m_docsToGet <= 0 ) {
	//	estimateHitsAndSendReply ( );
	//	return;
	// }
	// if ( m_tmpq.m_numTerms <= 0 ) {
	//	estimateHitsAndSendReply ( );
	//	return;
	// }

	// . otherwise, to prevent oom, split up docids into ranges
	//   and get winners of each range.
	//if ( ! doDocIdSplitLoop() ) return;

	// . return false if it blocks, true otherwise
	// . it will send a reply when done
	if ( ! controlLoop() ) return;

	// error?
	// if ( g_errno ) {
	//	log(LOG_LOGIC,"query: msg39: doDocIdSplitLoop: %s." ,
	//	    mstrerror(g_errno) );
	//	sendReply ( m_slot , this , NULL , 0 , 0 , true );
	//	return ;
	// }

	// it might not have blocked! if all lists in tree and used no
	// thread it will come here after sending the reply and destroying
	// "this"
	return;
}
// returns false if blocked, true otherwise
bool Statsdb::gifLoop ( ) {
	// shortcut
	Msg5 *m = &m_msg5;

	//#ifndef _USEPLOTTER_
	//return true;
	//#endif

	// loop over all the lists in the time range, [m_t1,m_t2]
	for ( ; ! m_done ; ) {
		if ( ! m->getList ( (char)RDB_STATSDB ,
				    "statsdb"         , // coll
				    &m_list           ,
				    (char *)&m_startKey ,
				    (char *)&m_endKey   ,
				    32000 , // requested scan size
				    true  , // include tree?
				    false , // add to cache?
				    0     , // max cache age
				    0     , // start file number
				    -1    , // number of files
				    NULL  , // state
				    gotListWrapper , // callback
				    m_niceness     , // niceness
				    false , // do error correction?
				    NULL  , // cache key pointer
				    0     , // # retries
				    -1    , // max # retries
				    true  , // compensate for merge?
				    -1    , // sync point
				    NULL  ) ) // msg5b
			return false;
		// . process list
		// . returns false with g_errno set on error
		if ( ! processList() ) return true;
	}

	// time delta over the graph's x-axis, used for tick labels below
	long dt = m_t2 - m_t1;

	//#ifdef _USEPLOTTER_
	// gif size
	//char tmp[64];
	// dimensions of the gif
	//sprintf ( tmp , "%lix%li", (long)DX+m_bx*2 , (long)DY+m_by*2 );
	//GIFPlotter::parampl ( "BITMAPSIZE" , (void *)tmp );
	// create one
	//GIFPlotter plotter ( NULL , m_fd , NULL );
	// open it
	//plotter.openpl ( );
	// define the space with boundaries 100 unit wide boundaries
	//plotter.space ( 0 , 0 , DX + m_bx * 2 , DY + m_by * 2 );
	// line thickness in user coordinates (pixels for us)
	//plotter.linewidth ( 1 );
	// set bg color to gray (r/g/b)
	//plotter.bgcolor ( 0xd600 , 0xce00 , 0xd600 );
	// erase Plotter's graphics display
	//plotter.erase ();
	// draw axes in black
	//plotter.pencolorname ("black");

	//
	// main graphing window
	//
	m_gw.safePrintf("<div style=\"position:relative;"
			"background-color:#c0c0c0;"
			//"overflow-y:hidden;"
			"overflow-x:hidden;"
			"z-index:-10;"
			// the tick marks we print below are based on it
			// being a window of the last 20 seconds... and
			// using DX pixels
			"min-width:%lipx;"
			"min-height:%lipx;"
			//"width:100%%;"
			//"min-height:600px;"
			"margin-top:10px;"
			"margin-bottom:10px;"
			"margin-right:10px;"
			"margin-left:10px;\">"
			,(long)DX + 2 *m_bx
			,(long)DY + 2*m_by);

	// draw the x-axis
	//plotter.line ( m_bx , m_by , DX + m_bx , m_by );

	// 10 x-axis tick marks
	for ( int x = DX/20 ; x <= DX ; x += DX/20 ) {
		// tick mark
		//plotter.line ( x , -20 , x , 20 );
		m_gw.safePrintf("<div style=\"position:absolute;"
				"left:%li;"
				"bottom:0;"
				"background-color:#000000;"
				"z-index:110;"
				"min-height:20px;"
				"min-width:3px;\"></div>\n"
				, m_bx + (long)x-1
				);
		long xv = (long)(dt * (long long)x/(long long)DX)-(long)dt;
		// LABEL
		m_gw.safePrintf("<div style=\"position:absolute;"
				"left:%li;"
				"bottom:20;"
				//"background-color:#000000;"
				"z-index:110;"
				"min-height:20px;"
				"min-width:3px;\">%lis</div>\n"
				, (long)x-10 + m_bx
				// the label:
				, xv
				);
	}

	HashTableX tmpht;
	tmpht.set(4,0,0,NULL,0,false,m_niceness,"statsparms");

	long col = 0;

	m_sb2->safePrintf("<table border=1 width=100%%>\n");

	// label offset to prevent collisions of superimposing multiple
	// graph calibrations
	long zoff = 0;

	//
	// point to the triplets in m_sb1's buffer (x,y,c)
	//
	char *p    = m_sb1.getBufStart();
	char *pend = p + m_sb1.length();
	for ( ; p < pend ; p += 12 ) {
		// breathe
		QUICKPOLL ( m_niceness );
		// get graph hash of this point
		long gh = *(long *)(p +8);
		// if we already did this graph, skip it
		if ( tmpht.isInTable ( &gh ) ) continue;
		// . graph this single graph of this color
		// . returns ptr to first point of different color!
		plotGraph ( p , pend , gh , m_gw , zoff );
		// prevent collisions
		zoff += 20;
		// get the label based on graphHash
		Label *bb = getLabel ( gh );
		// add to key
		if ( col == 0 )
			m_sb2->safePrintf("<tr>");
		m_sb2->safePrintf("<td bgcolor=#%06lx> </td>"
				  "<td>%s</td>\n",
				  bb->m_color ,
				  bb->m_keyDesc );
		if ( col == 1 )
			m_sb2->safePrintf("</tr>\n");
		// inc column and wrap
		if ( ++col >= 2 ) col = 0;
		// . do not re-display
		// . TODO: deal with error
		tmpht.addKey ( &gh );
	}

	// clear that up
	m_sb1.reset();

	// now plot the events, horizontal line segments like the
	// performance graph uses
	for ( long i = 0 ; i < m_ht3.m_numSlots ; i++ ) {
		// breathe
		QUICKPOLL ( m_niceness );
		// skip if slot empty
		if ( ! m_ht3.m_flags[i] ) continue;
		// get the offset into m_sb3
		long offset = *(long *)m_ht3.getValueFromSlot(i);
		// get buf start
		char *bufStart = m_sb3.getBufStart();
		// get the ptr
		EventPoint *pp = (EventPoint *)(bufStart + offset);
		// get name of parm
		Parm *m = g_parms.getParmFromParmHash ( pp->m_parmHash );
		// make sure we got it
		if ( ! m ) {
			log("statsdb: unrecognized parm hash = %li",
			    pp->m_parmHash);
			continue;
			//char *xx=NULL;*xx=0; }
		}
		// set the line width
		//plotter.linewidth ( pp->m_thickness );
		// get parm hash
		long colorHash = pp->m_parmHash;
		// add in old/new values to make it different
		colorHash = hash32h ( (long)pp->m_oldVal , colorHash );
		colorHash = hash32h ( (long)pp->m_newVal , colorHash );
		// . get color
		// . is really the parm hash in disguise
		long c1 = colorHash & 0x00ffffff;
		// use the color specified from addStat_r() for this line/pt
		//plotter.pencolor ( ((c1 >> 16) & 0xff) << 8 ,
		//		   ((c1 >>  8) & 0xff) << 8 ,
		//		   ((c1 >>  0) & 0xff) << 8 );
		long x1 = pp->m_a;
		long x2 = pp->m_b;
		long y1 = *(long *)m_ht3.getKey(i); // i value
		// ensure at least 3 units wide for visibility
		if ( x2 < x1 + 10 ) x2 = x1 + 10;
		// . flip the y so we don't have to scroll the browser down
		// . DY does not include the axis and tick marks
		//long fy1 = DY - y1 + m_by ;
		// plot it
		//plotter.line ( x1 , fy1 , x2 , fy1 );
		drawLine3 ( m_gw , x1 , x2 , y1 , c1 , pp->m_thickness );
		// add to map key? only if we haven't already
		if ( tmpht.isInTable ( &colorHash ) ) continue;
		// add it
		if ( col == 0 )
			m_sb2->safePrintf("<tr>");
		char *title = "unknown parm";
		if ( m ) title = m->m_title;
		m_sb2->safePrintf("<td bgcolor=#%06lx> </td>",c1);
		// print the parm name and old/new values
		m_sb2->safePrintf("<td><b>%s</b>",title);
		if ( pp->m_oldVal != pp->m_newVal )
			m_sb2->safePrintf(" (%.02f -> %.02f)",
					  pp->m_oldVal,pp->m_newVal);
		m_sb2->safePrintf("</td>");
		if ( col == 1 )
			m_sb2->safePrintf("</tr>\n");
		// inc column and wrap
		if ( ++col >= 2 ) col = 0;
		// . do not re-display
		// . TODO: deal with error
		tmpht.addKey ( &colorHash ) ;
	}

	m_sb2->safePrintf("</table>\n");

	// clear that up
	m_ht3.reset();
	m_sb3.reset();

	// and stat states
	m_ht0.reset();
	m_sb0.reset();

	// all done free some mem
	m_sb1.reset();
	//m_sb2.reset();

	//
	// but not m_sb2 cuz that has the html in it!!
	//

	// all done
	//if ( plotter.closepl () < 0 )
	//	log("admin: Could not close performance graph object.");
	// close the file
	//fclose ( m_fd );
	//#endif

	// close main graphing window
	m_gw.safePrintf("</div>\n");

	return true;
}
// returns false and sets g_errno on error
bool Title::setTitle ( Xml *xml, Words *words, int32_t maxTitleLen,
		       Query *query, LinkInfo *linkInfo, Url *firstUrl,
		       const char *filteredRootTitleBuf,
		       int32_t filteredRootTitleBufSize,
		       uint8_t contentType, uint8_t langId,
		       int32_t niceness ) {
	// make Msg20.cpp faster if it just has
	// Msg20Request::m_setForLinkInfo set to true, no need to extricate
	// a title.
	if ( maxTitleLen <= 0 ) {
		return true;
	}

	m_niceness = niceness;
	m_maxTitleLen = maxTitleLen;

	// if this is too big the "first line" algo can be huge!!!
	// and really slow everything way down with a huge title candidate
	int32_t maxTitleWords = 128;

	// assume no title
	reset();

	int32_t NW = words->getNumWords();

	//
	// now get all the candidates
	//

	// . allow up to 100 title CANDIDATES
	// . "as" is the word # of the first word in the candidate
	// . "bs" is the word # of the last word IN the candidate PLUS ONE
	int32_t  n = 0;
	int32_t  as[MAX_TIT_CANDIDATES];
	int32_t  bs[MAX_TIT_CANDIDATES];
	float    scores[MAX_TIT_CANDIDATES];
	Words   *cptrs[MAX_TIT_CANDIDATES];
	int32_t  types[MAX_TIT_CANDIDATES];
	int32_t  parent[MAX_TIT_CANDIDATES];

	// record the scoring algos' effects
	float  baseScore        [MAX_TIT_CANDIDATES];
	float  noCapsBoost      [MAX_TIT_CANDIDATES];
	float  qtermsBoost      [MAX_TIT_CANDIDATES];
	float  inCommonCandBoost[MAX_TIT_CANDIDATES];

	// reset these
	for ( int32_t i = 0 ; i < MAX_TIT_CANDIDATES ; i++ ) {
		// assume no parent
		parent[i] = -1;
	}

	// xml and words class for each link info, rss item
	Xml   tx[MAX_TIT_CANDIDATES];
	Words tw[MAX_TIT_CANDIDATES];
	int32_t ti = 0;

	// restrict how many link texts and rss blobs we check for titles
	// because title recs like www.google.com have hundreds and can
	// really slow things down to like 50ms for title generation
	int32_t kcount = 0;
	int32_t rcount = 0;

	//int64_t x = gettimeofdayInMilliseconds();

	// . get every link text
	// . TODO: repeat for linkInfo2, the imported link text
	for ( Inlink *k = NULL;
	      linkInfo && (k = linkInfo->getNextInlink(k)) ; ) {
		// breathe
		QUICKPOLL(m_niceness);
		// fast skip check for link text
		if ( k->size_linkText >= 3 && ++kcount >= 20 ) continue;
		// fast skip check for rss item
		if ( k->size_rssItem > 10 && ++rcount >= 20 ) continue;

		// set Url
		Url u;
		u.set( k->getUrl(), k->size_urlBuf );

		// is it the same host as us?
		bool sh = true;

		// skip if not from same host and should be
		if ( firstUrl->getHostLen() != u.getHostLen() ) {
			sh = false;
		}

		// skip if not from same host and should be
		if ( strncmp( firstUrl->getHost(), u.getHost(),
			      u.getHostLen() ) ) {
			sh = false;
		}

		// get the link text
		if ( k->size_linkText >= 3 ) {
			char *p = k->getLinkText();
			int32_t plen = k->size_linkText - 1;
			if ( ! verifyUtf8 ( p , plen ) ) {
				log("title: set4 bad link text from url=%s",
				    k->getUrl());
				continue;
			}

			// now the words.
			if ( !tw[ti].set( k->getLinkText(),
					  k->size_linkText - 1,
					  true, 0 ) ) {
				return false;
			}

			// set the bookends, it is the whole thing
			cptrs [n] = &tw[ti];
			as    [n] = 0;
			bs    [n] = tw[ti].getNumWords();
			// score higher if same host
			if ( sh ) scores[n] = 1.05;
			// do not count so high if remote!
			else      scores[n] = 0.80;
			// set the type
			if ( sh ) types [n] = TT_LINKTEXTLOCAL;
			else      types [n] = TT_LINKTEXTREMOTE;
			// another candidate
			n++;
			// use xml and words
			ti++;
			// break out if too many already. save some for
			// below.
			if ( n + 30 >= MAX_TIT_CANDIDATES ) break;
		}

		// get the rss item
		if ( k->size_rssItem <= 10 ) continue;
		// . returns false and sets g_errno on error
		// . use a 0 for niceness
		if ( ! k->setXmlFromRSS ( &tx[ti] , 0 ) ) return false;

		// get the word range
		int32_t tslen;
		bool isHtmlEnc;
		char *ts = tx[ti].getRSSTitle ( &tslen , &isHtmlEnc );

		// skip if not in the rss
		if ( ! ts ) continue;
		// skip if empty
		if ( tslen <= 0 ) continue;

		// now set words to that
		if ( !tw[ti].set( ts, tslen, true, 0 ) ) {
			return false;
		}

		// point to that
		cptrs [n] = &tw[ti];
		as    [n] = 0;
		bs    [n] = tw[ti].getNumWords();
		// increment since we are using it
		ti++;
		// base score for rss title
		if ( sh ) scores[n] = 5.0;
		// if not same host, treat like link text
		else      scores[n] = 2.0;
		// set the type
		if ( sh ) types [n] = TT_RSSITEMLOCAL;
		else      types [n] = TT_RSSITEMREMOTE;
		// advance
		n++;
		// break out if too many already. save some for below.
		if ( n + 30 >= MAX_TIT_CANDIDATES ) break;
	}

	//logf(LOG_DEBUG,"title: took1=%" PRId64,
	//     gettimeofdayInMilliseconds()-x);
	//x = gettimeofdayInMilliseconds();

	// . set the flags array
	// . indicates what words are in title candidates already, but
	//   that is set below
	// . up here we set words that are not allowed to be in candidates,
	//   like words that are in a link that is not a self link
	// . alloc for it
	char *flags = NULL;
	char localBuf[10000];

	int32_t need = words->getNumWords();
	if ( need <= 10000 ) {
		flags = (char *)localBuf;
	} else {
		flags = (char *)mmalloc(need,"TITLEflags");
	}

	if ( ! flags ) {
		return false;
	}

	// clear it
	memset ( flags , 0 , need );

	// check tags in body
	nodeid_t *tids = words->getTagIds();

	// scan to set link text flags
	// loop over all "words" in the html body
	char inLink   = false;
	char selfLink = false;
	for ( int32_t i = 0 ; i < NW ; i++ ) {
		// breathe
		QUICKPOLL(m_niceness);

		// if in a link that is not self link, cannot be in a
		// candidate
		if ( inLink && ! selfLink ) {
			flags[i] |= 0x02;
		}

		// out of a link
		if ( tids[i] == (TAG_A | BACKBIT) ) {
			inLink = false;
		}

		// if not start of <a> tag, skip it
		if ( tids[i] != TAG_A ) {
			continue;
		}

		// flag it
		inLink = true;

		// get the node in the xml
		int32_t xn = words->getNodes()[i];

		// is it a self link?
		int32_t len;
		char *link = xml->getString(xn,"href",&len);

		// . set the url class to this
		// . TODO: use the base url in the doc
		Url u;
		u.set( link, len, true, false );

		// compare
		selfLink = u.equals ( firstUrl );

		// skip if not selfLink
		if ( ! selfLink ) {
			continue;
		}

		// if it is a selflink , check for an "onClick" tag in the
		// anchor tag to fix that Mixx issue for:
		// http://www.npr.org/templates/story/story.php?storyId=5417137
		int32_t oclen;
		char *oc = xml->getString(xn,"onclick",&oclen);

		if ( ! oc ) {
			oc = xml->getString(xn,"onClick",&oclen);
		}

		// assume not a self link if we see that...
		if ( oc ) {
			selfLink = false;
		}

		// if this <a href> link has a "title" attribute, use that
		// instead! that thing is solid gold.
		int32_t atlen;
		char *atitle = xml->getString(xn,"title",&atlen);

		// stop and use that, this thing is gold!
		if ( ! atitle || atlen <= 0 ) {
			continue;
		}

		// craziness? ignore it...
		if ( atlen > 400 ) {
			continue;
		}

		// if it contains permanent, permalink or share, ignore it!
		if ( strncasestr ( atitle, "permalink", atlen ) ||
		     strncasestr ( atitle, "permanent", atlen ) ||
		     strncasestr ( atitle, "share",     atlen ) ) {
			continue;
		}

		// do not count the link text as viable
		selfLink = false;

		// aw, dammit
		if ( ti >= MAX_TIT_CANDIDATES ) {
			continue;
		}

		// other dammit
		if ( n >= MAX_TIT_CANDIDATES ) {
			break;
		}

		// ok, process it
		if ( ! tw[ti].set ( atitle, atlen, true, 0 ) ) {
			return false;
		}

		// set the bookends, it is the whole thing
		cptrs  [n] = &tw[ti];
		as     [n] = 0;
		bs     [n] = tw[ti].getNumWords();
		scores [n] = 3.0; // not ALWAYS solid gold!
		types  [n] = TT_TITLEATT;

		// we are using the words class
		ti++;

		// advance
		n++;

		// break out if too many already. save some for below.
		if ( n + 20 >= MAX_TIT_CANDIDATES ) {
			break;
		}
	}

	//logf(LOG_DEBUG,"title: took2=%" PRId64,
	//     gettimeofdayInMilliseconds()-x);
	//x = gettimeofdayInMilliseconds();

	//int64_t *wids = WW->getWordIds();
	// . find the last positive scoring guy
	// . do not consider title candidates after "r" if "r" is non-zero
	// . FIXES http://larvatusprodeo.net/2009/01/07/partisanship-politics-and-participation/

	// the candidate # of the title tag
	int32_t tti = -1;

	// allow up to 4 tags from each type
	char table[512];

	// sanity check
	if ( getNumXmlNodes() > 512 ) { char *xx=NULL;*xx=0; }

	// clear table counts
	memset ( table , 0 , 512 );

	// the first word
	char *wstart = NULL;
	if ( NW > 0 ) {
		wstart = words->getWord(0);
	}

	// loop over all "words" in the html body
	for ( int32_t i = 0 ; i < NW ; i++ ) {
		// come back up here if we encounter another "title-ish"
		// tag within our first alleged "title-ish" tag
	subloop:
		// stop after 30k of text
		if ( words->getWord(i) - wstart > 200000 ) {
			break; // 1106
		}

		// get the tag id minus the back tag bit
		nodeid_t tid = tids[i] & BACKBITCOMP;

		// pen up and pen down for these comment like tags
		if ( tid == TAG_SCRIPT || tid == TAG_STYLE ) {
			// ignore "titles" in script or style tags
			if ( ! (tids[i] & BACKBIT) ) {
				continue;
			}
		}

		/// @todo ALC we should allow more tags than just
		/// title/link

		// skip if not a good tag.
		if (tid != TAG_TITLE && tid != TAG_A) {
			continue;
		}

		// must NOT be a back tag
		if ( tids[i] & BACKBIT ) {
			continue;
		}

		// skip if we hit our limit
		if ( table[tid] >= 4 ) {
			continue;
		}

		// skip over tag/word #i
		i++;

		// no words in links, unless it is a self link
		if ( i < NW && (flags[i] & 0x02) ) {
			continue;
		}

		// the start should be here
		int32_t start = -1;

		// do not go too far
		int32_t max = i + 200;

		// find the corresponding back tag for it
		for ( ; i < NW && i < max ; i++ ) {
			// hey we got it, BUT we got no alnum word first
			// so the thing was empty, so loop back to subloop
			if ( (tids[i] & BACKBITCOMP) == tid &&
			     (tids[i] & BACKBIT    ) &&
			     start == -1 ) {
				goto subloop;
			}

			// if we hit another title-ish tag, loop back up
			if ( (tids[i] & BACKBITCOMP) == TAG_TITLE ||
			     (tids[i] & BACKBITCOMP) == TAG_A ) {
				// if no alnum text, restart at the top
				if ( start == -1 ) {
					goto subloop;
				}
				// otherwise, break out and see if title
				// works
				break;
			}

			// if we hit a breaking tag...
			if ( isBreakingTagId ( tids[i] & BACKBITCOMP ) &&
			     // do not consider <span> tags breaking for
			     // our purposes. i saw a <h1><span> setup
			     // before.
			     tids[i] != TAG_SPAN ) {
				break;
			}

			// skip if not alnum word
			if ( ! words->isAlnum(i) ) {
				continue;
			}

			// if we hit an alnum word, break out
			if ( start == -1 ) {
				start = i;
			}
		}

		// if no start was found, must have had a 0 score in there
		if ( start == -1 ) {
			continue;
		}

		// if we exhausted the doc, we are done
		if ( i >= NW ) {
			break;
		}

		// skip if way too big!
		if ( i >= max ) {
			continue;
		}

		// if was too long do not consider a title
		if ( i - start > 300 ) {
			continue;
		}

		// . skip if too many bytes
		// . this does not include the length of word #i, but
		//   #(i-1)
		if ( words->getStringSize ( start , i ) > 1000 ) {
			continue;
		}

		// when using pdftohtml, the title tag is the filename when
		// the PDF properties do not have a title
		if ( tid == TAG_TITLE && contentType == CT_PDF ) {
			// skip if title == '/in.[0-9]*'
			char* title_start = words->getWord(start);
			char* title_end   = words->getWord(i);
			size_t title_size = title_end - title_start;
			const char* result = strnstr( title_start, "/in.",
						      title_size );
			if (result != NULL) {
				char* endp = NULL;
				// do some further verification to avoid
				// screwing up the title
				if ((strtoll(result + 4, &endp, 10) > 0) &&
				    (endp == title_end)) {
					continue;
				}
			}
		}

		// count it
		table[tid]++;

		// max it out if we are positive scoring. stop after the
		// first positive scoring guy in a section. this might
		// hurt the "Hamlet" thing though...

		// store a point to the title tag guy. Msg20.cpp needs this
		// because the zak's proximity algo uses it in Summary.cpp
		// and in Msg20.cpp

		// only get the first one! often the 2nd one is in an
		// iframe!! which we now expand into here.
		if ( tid == TAG_TITLE && m_titleTagStart == -1 ) {
			m_titleTagStart = start;
			m_titleTagEnd   = i;

			// save the candidate # because we always use this
			// as the title if we are a root
			if ( tti < 0 ) {
				tti = n;
			}
		}

		// point to words class of the body that was passed in to
		// us
		cptrs[n] = words;
		as[n] = start;
		bs[n] = i;
		if ( tid == TAG_B ) {
			types[n]  = TT_BOLDTAG;
			scores[n] = 1.0;
		} else if ( tid == TAG_H1 ) {
			types[n]  = TT_HTAG;
			scores[n] = 1.8;
		} else if ( tid == TAG_H2 ) {
			types[n]  = TT_HTAG;
			scores[n] = 1.7;
		} else if ( tid == TAG_H3 ) {
			types[n]  = TT_HTAG;
			scores[n] = 1.6;
		} else if ( tid == TAG_TITLE ) {
			types[n]  = TT_TITLETAG;
			scores[n] = 3.0;
		} else if ( tid == TAG_DIV ) {
			types[n]  = TT_DIVTAG;
			scores[n] = 1.0;
		} else if ( tid == TAG_TD ) {
			types[n]  = TT_TDTAG;
			scores[n] = 1.0;
		} else if ( tid == TAG_P ) {
			types[n]  = TT_PTAG;
			scores[n] = 1.0;
		} else if ( tid == TAG_FONT ) {
			types[n]  = TT_FONTTAG;
			scores[n] = 1.0;
		} else if ( tid == TAG_A ) {
			types[n] = TT_ATAG;
			// . self link is very powerful BUT
			//   http://www.npr.org/templates/story/story.php?storyId=5417137
			//   doesn't use it right! so use 1.3 instead of
			//   3.0. that has an "onClick" thing in the <a>
			//   tag, so check for that!
			// this was bad for
			// http://www.spiritualwoman.net/?cat=191
			// so i am demoting from 3.0 to 1.5
			scores[n] = 1.5;
		}

		// count it
		n++;

		// start loop over at tag #i, for loop does an i++, so
		// negate that so this will work
		i--;

		// break out if too many already. save some for below.
		if ( n + 10 >= MAX_TIT_CANDIDATES ) {
			break;
		}
	}

	//logf(LOG_DEBUG,"title: took3=%" PRId64,
	//     gettimeofdayInMilliseconds()-x);
	//x = gettimeofdayInMilliseconds();

	// to handle text documents, throw in the first line of text
	// as a title candidate, just make the score really low
	bool textDoc = (contentType == CT_UNKNOWN ||
			contentType == CT_TEXT);

	if (textDoc) {
		// make "i" point to first alphabetical word in the
		// document
		int32_t i ;
		for ( i = 0 ; i < NW && !words->isAlpha(i) ; i++);

		// if we got a first alphabetical word, then assume that to
		// be the start of our title
		if ( i < NW && n < MAX_TIT_CANDIDATES ) {
			// first word in title is "t0"
			int32_t t0 = i;
			// find end of first line
			int32_t numWords = 0;

			// set i to the end now. we MUST find a \n to
			// terminate the title, otherwise we will not have
			// a valid title
			while (i < NW && numWords < maxTitleWords &&
			       (words->isAlnum(i) ||
				!words->hasChar(i, '\n'))) {
				if(words->isAlnum(i)) {
					numWords++;
				}
				++i;
			}

			// "t1" is the end
			int32_t t1 = -1;

			// we must have found our \n in order to set "t1"
			if (i <= NW && numWords < maxTitleWords ) {
				t1 = i;
			}

			// set the ptrs
			cptrs  [n] = words;
			// this is the last resort i guess...
			scores [n] = 0.5;
			types  [n] = TT_FIRSTLINE;
			as     [n] = t0;
			bs     [n] = t1;

			// add it as a candidate if t0 and t1 were valid
			if (t0 >= 0 && t1 > t0) {
				n++;
			}
		}
	}

	//logf(LOG_DEBUG,"title: took4=%" PRId64,
	//     gettimeofdayInMilliseconds()-x);
	//x = gettimeofdayInMilliseconds();

	{
		// now add the last url path to contain underscores or
		// hyphens
		char *pstart = firstUrl->getPath();

		// get first url
		Url *fu = firstUrl;

		// start at the end
		char *p = fu->getUrl() + fu->getUrlLen();

		// end pointer
		char *pend = NULL;

		// come up here for each path component
		while ( p >= pstart ) {
			// save end
			pend = p;

			// skip over /
			if ( *p == '/' ) {
				p--;
			}

			// now go back to next /
			int32_t count = 0;
			for ( ; p >= pstart && *p !='/' ; p-- ) {
				if ( *p == '_' || *p == '-' ) {
					count++;
				}
			}

			// did we get it?
			if ( count > 0 ) {
				break;
			}
		}

		// did we get any?
		if ( p > pstart && n < MAX_TIT_CANDIDATES ) {
			// now set words to that
			if ( ! tw[ti].set ( p, (pend - p), true, 0 ) ) {
				return false;
			}

			// point to that
			cptrs  [n] = &tw[ti];
			as     [n] = 0;
			bs     [n] = tw[ti].getNumWords();
			scores [n] = 1.0;
			types  [n] = TT_URLPATH;

			// increment since we are using it
			ti++;

			// advance
			n++;
		}
	}

	// save old n
	int32_t oldn = n;

	// . do not split titles if we are a root url. maps.yahoo.com was
	//   getting "Maps" for the title
	if ( firstUrl->isRoot() ) {
		oldn = -2;
	}

	// point to list of \0 separated titles
	const char *rootTitleBuf    = NULL;
	const char *rootTitleBufEnd = NULL;

	// get the root title if we are not root!
	if (filteredRootTitleBuf) {
#ifdef _VALGRIND_
		VALGRIND_CHECK_MEM_IS_DEFINED(filteredRootTitleBuf,
					      filteredRootTitleBufSize);
#endif
		// point to list of \0 separated titles
		rootTitleBuf    = filteredRootTitleBuf;
		rootTitleBufEnd = filteredRootTitleBuf +
				  filteredRootTitleBufSize;
	}

	{
		Matches m;
		if ( rootTitleBuf && query ) {
			m.setQuery ( query );
		}

		// convert into an array
		int32_t nr = 0;
		const char *pr = rootTitleBuf;
		const char *rootTitles[20];
		int32_t     rootTitleLens[20];

		// loop over each root title segment
		for ( ; pr && pr < rootTitleBufEnd ;
		      pr += strnlen(pr,rootTitleBufEnd-pr) + 1 ) {
			// if we had a query...
			if ( query ) {
				// reset it
				m.reset();

				// see if root title segment has query
				// terms in it
				m.addMatches ( const_cast<char*>(pr),
					       strnlen(pr,
						       rootTitleBufEnd-pr),
					       MF_TITLEGEN,
					       m_niceness );

				// if matches query, do NOT add it, we only
				// add it for removing from the title of
				// the page...
				if ( m.getNumMatches() ) {
					continue;
				}
			}
			// point to it. it should start with an alnum
			// already since it is the "filtered" list of root
			// titles... if not, fix it in xmldoc then.
			rootTitles   [nr] = pr;
			rootTitleLens[nr] = gbstrlen(pr);
			// advance
			nr++;
			// no breaching
			if ( nr >= 20 ) break;
		}

		// now split up candidates in children candidates by
		// tokenizing using :, | and - as delimiters.
		// the hyphen must have a space on at least one side, so
		// "cd-rom" does not create a pair of tokens...
		// FIX: for the title:
		// Best Careers 2009: Librarian - US News and World Report
		// we need to recognize "Best Careers 2009: Librarian" as a
		// subtitle otherwise we don't get it as the title. so my
		// question is are we going to have to do all the
		// permutations at some point? for now let's just add in
		// pairs...
		for ( int32_t i = 0 ;
		      i < oldn && n + 3 < MAX_TIT_CANDIDATES ; i++ ) {
			// stop if no root title segments
			if ( nr <= 0 ) break;

			// get the word info
			Words *w = cptrs[i];
			int32_t a = as[i];
			int32_t b = bs[i];

			// init
			int32_t lasta = a;
			char prev = false;

			// char length in bytes
			//int32_t charlen = 1;

			// see how many we add
			int32_t added = 0;

			char *skipTo = NULL;
			bool qualified = true;

			// . scan the words looking for a token
			// . sometimes the candidates end in ": " so put in
			//   "k < b-1"
			// . made this from k<b-1 to k<b to fix
			//   "Hot Tub Time Machine (2010) - IMDb" to strip
			//   IMDb
			for ( int32_t k = a ;
			      k < b && n + 3 < MAX_TIT_CANDIDATES ; k++ ) {
				// get word
				char *wp = w->getWord(k);

				// skip if not alnum
				if ( ! w->isAlnum(k) ) {
					// in order for next alnum word to
					// qualify for "clipping" if it
					// matches the root title, there
					// has to be more than just spaces
					// here, some punct. otherwise
					// title "T. D. Jakes: Biography
					// from Answers.com" becomes
					// "T. D. Jakes: Biography from"
					qualified = isWordQualified(
						wp, w->getWordLen(k));
					continue;
				}

				// gotta be qualified!
				if ( ! qualified ) continue;

				// skip if in root title
				if ( skipTo && wp < skipTo ) continue;

				// does this match any root page title
				// segments?
				int32_t j;
				for ( j = 0 ; j < nr ; j++ ) {
					// . compare to root title
					// . break out if we matched!
					if ( ! strncmp( wp,
							rootTitles[j],
							rootTitleLens[j]
							) ) {
						break;
					}
				}

				// if we did not match a root title
				// segment, keep on chugging
				if ( j >= nr ) continue;

				// . we got a root title match!
				// . skip over
				skipTo = wp + rootTitleLens[j];

				// must land on qualified punct then!!
				int32_t e = k+1;
				for ( ; e<b && w->getWord(e)<skipTo ; e++ );

				// ok, word #e must be a qualified punct
				if ( e<b &&
				     ! isWordQualified(w->getWord(e),
						       w->getWordLen(e)))
					// assume no match then!!
					continue;

				// if we had a previous guy, reset the end
				// of the previous candidate
				if ( prev ) {
					bs[n-2] = k;
					bs[n-1] = k;
				}

				// . ok, we got two more candidates
				// . well, only one more if this is not
				//   the 1st time
				if ( ! prev ) {
					cptrs  [n] = cptrs  [i];
					scores [n] = scores [i];
					types  [n] = types  [i];
					as     [n] = lasta;
					bs     [n] = k;
					parent [n] = i;
					n++;
					added++;
				}

				// the 2nd one
				cptrs  [n] = cptrs  [i];
				scores [n] = scores [i];
				types  [n] = types  [i];
				as     [n] = e + 1;
				bs     [n] = bs     [i];
				parent [n] = i;
				n++;
				added++;

				// now add in the last pair as a whole
				// token
				cptrs  [n] = cptrs  [i];
				scores [n] = scores [i];
				types  [n] = types  [i];
				as     [n] = lasta;
				bs     [n] = bs     [i];
				parent [n] = i;
				n++;
				added++;

				// nuke the current candidate then since
				// it got split up to not contain the root
				// title...
				//cptrs[i] = NULL;

				// update this
				lasta = k+1;

				// if we encounter another delimiter we
				// will have to revise bs[n-1], so note
				// that
				prev = true;
			}

			// nuke the current candidate then since it got
			// split up to not contain the root title...
			if ( added ) {
				scores[i] = 0.001;
				//cptrs[i] = NULL;
			}

			// erase the pair if there was only one token
			if ( added == 3 ) n--;
		}
	}

	for ( int32_t i = 0 ; i < n ; i++ ) baseScore[i] = scores[i];

	//
	// . now punish by 0.85 for every lower case non-stop word it has
	// . reward by 1.1 if has a non-stopword in the query
	//
	for ( int32_t i = 0 ; i < n ; i++ ) {
		// point to the words
		Words *w = cptrs[i];

		// skip if got nuked above
		if ( !
w ) { continue; } // the word ptrs char **wptrs = w->getWordPtrs(); // skip if empty if ( w->getNumWords() <= 0 ) { continue; } // get the word boundaries int32_t a = as[i]; int32_t b = bs[i]; // record the boosts float ncb = 1.0; float qtb = 1.0; // a flag char uncapped = false; // scan the words in this title candidate for ( int32_t j = a ; j < b ; j++ ) { // skip stop words if ( w->isQueryStopWord( j, langId ) ) { continue; } // punish if uncapitalized non-stopword if ( ! w->isCapitalized(j) ) { uncapped = true; } // skip if no query if ( ! query ) { continue; } int64_t wid = w->getWordId(j); // reward if in the query if ( query->getWordNum(wid) >= 0 ) { qtb *= 1.5; scores[i] *= 1.5; } } // . only punish once if missing a capitalized word hurts us for: // http://content-uk.cricinfo.com/ausvrsa2008_09/engine/current/match/351682.html if ( uncapped ) { ncb *= 1.00; scores[i] *= 1.00; } // punish if a http:// title thingy char *s = wptrs[a]; int32_t size = w->getStringSize(a,b); if ( size > 9 && memcmp("http://", s, 7) == 0 ) { ncb *= .10; } if ( size > 14 && memcmp("h\0t\0t\0p\0:\0/\0/", s, 14) == 0 ) { ncb *= .10; } // set these guys scores[i] *= ncb; noCapsBoost[i] = ncb; qtermsBoost[i] = qtb; } // . now compare each candidate to the other candidates // . give a boost if matches for ( int32_t i = 0 ; i < n ; i++ ) { // point to the words Words *w1 = cptrs[i]; // skip if got nuked above if ( ! w1 ) { continue; } int32_t a1 = as[i]; int32_t b1 = bs[i]; // reset some flags char localFlag1 = 0; char localFlag2 = 0; // record the boost float iccb = 1.0; // total boost float total = 1.0; // to each other candidate for ( int32_t j = 0 ; j < n ; j++ ) { // not to ourselves if ( j == i ) { continue; } // or our derivatives if ( parent[j] == i ) { continue; } // or derivates to their parent if ( parent[i] == j ) { continue; } // only check parents now. do not check kids. // this was only for when doing percent contained // not getSimilarity() per se //if ( parent[j] != -1 ) continue; // TODO: do not accumulate boosts from a parent // and its kids, subtitles... // // do not compare type X to type Y if ( types[i] == TT_TITLETAG ) { if ( types[j] == TT_TITLETAG ) { continue; } } // do not compare a div candidate to another div cand // http://friendfeed.com/foxiewire?start=30 // likewise, a TD to another TD // http://content-uk.cricinfo.com/ausvrsa2008_09/engine/match/351681.html // ... etc. if ( types[i] == TT_BOLDTAG || types[i] == TT_HTAG || types[i] == TT_DIVTAG || types[i] == TT_TDTAG || types[i] == TT_FONTTAG ) { if ( types[j] == types[i] ) continue; } // . do not compare one kid to another kid // . i.e. if we got "x | y" as a title and "x | z" // as a link text, it will emphasize "x" too much // http://content-uk.cricinfo.com/ausvrsa2008_09/engine/current/match/351682.html if ( parent[j] != -1 && parent[i] != -1 ) continue; // . body type tags are mostly mutually exclusive // . for the legacy.com url mentioned below, we have // good stuff in <td> tags, so this hurts us... // . 
but for the sake of // http://larvatusprodeo.net/2009/01/07/partisanship-politics-and-participation/ // i put bold tags back if ( types[i] == TT_LINKTEXTLOCAL ) { if ( types[j] == TT_LINKTEXTLOCAL ) continue; } if ( types[i] == TT_RSSITEMLOCAL ) { if ( types[j] == TT_RSSITEMLOCAL ) continue; } // only compare to one local link text for each i if ( types[j] == TT_LINKTEXTLOCAL && localFlag1 ) { continue; } if ( types[j] == TT_RSSITEMLOCAL && localFlag2 ) { continue; } if ( types[j] == TT_LINKTEXTLOCAL ) { localFlag1 = 1; } if ( types[j] == TT_RSSITEMLOCAL ) { localFlag2 = 1; } // not link title attr to link title attr either // fixes http://www.spiritualwoman.net/?cat=191 if ( types[i] == TT_TITLEATT && types[j] == TT_TITLEATT ) continue; // get our words Words *w2 = cptrs[j]; // skip if got nuked above if ( ! w2 ) continue; int32_t a2 = as [j]; int32_t b2 = bs [j]; // how similar is title #i to title #j ? float fp = getSimilarity ( w2 , a2 , b2 , w1 , a1 , b1 ); // error? if ( fp == -1.0 ) return false; // custom boosting... float boost = 1.0; if ( fp >= .95 ) boost = 3.0; else if ( fp >= .90 ) boost = 2.0; else if ( fp >= .85 ) boost = 1.5; else if ( fp >= .80 ) boost = 1.4; else if ( fp >= .75 ) boost = 1.3; else if ( fp >= .70 ) boost = 1.2; else if ( fp >= .60 ) boost = 1.1; else if ( fp >= .50 ) boost = 1.08; else if ( fp >= .40 ) boost = 1.04; // limit total total *= boost; if ( total > 100.0 ) break; // if you are matching the url path, that is pretty // good so give more! // actually, that would hurt: // http://michellemalkin.com/2008/12/29/gag-worthy/ // custom boosting! if ( fp > 0.0 && g_conf.m_logDebugTitle ) logf(LOG_DEBUG,"title: i=%" PRId32" j=%" PRId32" fp=%.02f " "b=%.02f", i,j,fp,boost); // apply it scores[i] *= boost; iccb *= boost; } inCommonCandBoost[i] = iccb; } //logf(LOG_DEBUG,"title: took7=%" PRId64,gettimeofdayInMilliseconds()-x); //x = gettimeofdayInMilliseconds(); // loop over all n candidates for ( int32_t i = 0 ; i < n ; i++ ) { // skip if not in the document body if ( cptrs[i] != words ) continue; // point to the words int32_t a1 = as [i]; int32_t b1 = bs [i]; // . loop through this candidates words // . TODO: use memset here? for ( int32_t j = a1 ; j <= b1 && j < NW ; j++ ) { // flag it flags[j] |= 0x01; } } // free our stuff if ( flags!=localBuf ) { mfree (flags, need, "TITLEflags"); } // now get the highest scoring candidate title float max = -1.0; int32_t winner = -1; for ( int32_t i = 0 ; i < n ; i++ ) { // skip if got nuked if ( ! cptrs[i] ) { continue; } if ( winner != -1 && scores[i] <= max ) { continue; } // url path's cannot be titles in and of themselves if ( types[i] == TT_URLPATH ) { continue; } // skip if empty basically, like if title was exact // copy of root, then the whole thing got nuked and // some empty string added, where a > b if ( as[i] >= bs[i] ) { continue; } // got one max = scores[i]; // save it winner = i; } // if we are a root, always pick the title tag as the title if ( oldn == -2 && tti >= 0 ) { winner = tti; } // if no winner, all done. no title if ( winner == -1 ) { // last resort use file name if ((contentType == CT_PDF) && (firstUrl->getFilenameLen() != 0)) { Words w; w.set(firstUrl->getFilename(), firstUrl->getFilenameLen(), true); if (!copyTitle(&w, 0, w.getNumWords())) { return false; } } return true; } // point to the words class of the winner Words *w = cptrs[winner]; // skip if got nuked above if ( ! 
w ) { char *xx=NULL;*xx=0; } // need to make our own Pos class if title not from body Pos tp; if ( w != words ) { // set "Scores" ptr to NULL. we assume all are positive scores if ( ! tp.set ( w ) ) { return false; } } // the string ranges from word #a up to and including word #b int32_t a = as[winner]; int32_t b = bs[winner]; // sanity check if ( a < 0 || b > w->getNumWords() ) { char*xx=NULL;*xx=0; } // save the title if ( ! copyTitle(w, a, b) ) { return false; } /* // debug logging SafeBuf sb; SafeBuf *pbuf = &sb; log("title: candidates for %s",xd->getFirstUrl()->getUrl() ); pbuf->safePrintf("<div stype=\"border:1px solid black\">"); pbuf->safePrintf("<b>***Finding Title***</b><br>\n"); pbuf->safePrintf("<table cellpadding=5 border=2><tr>" "<td colspan=20><center><b>Title Generation</b>" "</center></td>" "</tr>\n<tr>" "<td>#</td>" "<td>type</td>" "<td>parent</td>" "<td>base score</td>" "<td>format penalty</td>" "<td>query term boost</td>" "<td>candidate intersection boost</td>" "<td>FINAL SCORE</td>" "<td>title</td>" "</tr>\n" ); // print out all candidates for ( int32_t i = 0 ; i < n ; i++ ) { char *ts = "unknown"; if ( types[i] == TT_LINKTEXTLOCAL ) ts = "local inlink text"; if ( types[i] == TT_LINKTEXTREMOTE ) ts = "remote inlink text"; if ( types[i] == TT_RSSITEMLOCAL ) ts = "local rss title"; if ( types[i] == TT_RSSITEMREMOTE ) ts = "remote rss title"; if ( types[i] == TT_BOLDTAG ) ts = "bold tag"; if ( types[i] == TT_HTAG ) ts = "header tag"; if ( types[i] == TT_TITLETAG ) ts = "title tag"; if ( types[i] == TT_FIRSTLINE ) ts = "first line in text"; if ( types[i] == TT_FONTTAG ) ts = "font tag"; if ( types[i] == TT_ATAG ) ts = "anchor tag"; if ( types[i] == TT_DIVTAG ) ts = "div tag"; if ( types[i] == TT_TDTAG ) ts = "td tag"; if ( types[i] == TT_PTAG ) ts = "p tag"; if ( types[i] == TT_URLPATH ) ts = "url path"; if ( types[i] == TT_TITLEATT ) ts = "title attribute"; // get the title pbuf->safePrintf( "<tr>" "<td>#%" PRId32"</td>" "<td><nobr>%s</nobr></td>" "<td>%" PRId32"</td>" "<td>%0.2f</td>" // baseScore "<td>%0.2f</td>" "<td>%0.2f</td>" "<td>%0.2f</td>" "<td>%0.2f</td>" "<td>", i, ts , parent[i], baseScore[i], noCapsBoost[i], qtermsBoost[i], inCommonCandBoost[i], scores[i]); // ptrs Words *w = cptrs[i]; int32_t a = as[i]; int32_t b = bs[i]; // skip if no words if ( w->getNumWords() <= 0 ) continue; // the word ptrs char **wptrs = w->getWordPtrs(); // string ptrs char *ptr = wptrs[a];//w->getWord(a); int32_t size = w->getStringSize(a,b); // it is utf8 pbuf->safeMemcpy ( ptr , size ); // end the line pbuf->safePrintf("</td></tr>\n"); } pbuf->safePrintf("</table>\n<br>\n"); // log these for now log("title: %s",sb.getBufStart()); */ return true; }
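// . the candidate loop above hard-codes a base score per tag type:
//   title 3.0, h1 1.8, h2 1.7, h3 1.6, <a> 1.5 (demoted from 3.0), and
//   1.0 for b/div/td/p/font
// . a minimal standalone sketch of that mapping for reference; the name
//   getTitleBaseScore() is illustrative only, not a function in this file
static float getTitleBaseScore ( nodeid_t tid ) {
	switch ( tid ) {
	case TAG_TITLE: return 3.0; // strongest signal, but not always gold
	case TAG_H1:    return 1.8;
	case TAG_H2:    return 1.7;
	case TAG_H3:    return 1.6;
	case TAG_A:     return 1.5; // self links; see npr.org note above
	default:        return 1.0; // TAG_B, TAG_DIV, TAG_TD, TAG_P, TAG_FONT
	}
}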
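// . the candidate-vs-candidate pass above maps a getSimilarity() fraction
//   to a multiplicative boost with a fixed ladder, and the accumulated
//   per-candidate total is capped at 100.0
// . a minimal sketch of that ladder; getSimilarityBoost() is an
//   illustrative name only
static float getSimilarityBoost ( float fp ) {
	if ( fp >= .95 ) return 3.0;
	if ( fp >= .90 ) return 2.0;
	if ( fp >= .85 ) return 1.5;
	if ( fp >= .80 ) return 1.4;
	if ( fp >= .75 ) return 1.3;
	if ( fp >= .70 ) return 1.2;
	if ( fp >= .60 ) return 1.1;
	if ( fp >= .50 ) return 1.08;
	if ( fp >= .40 ) return 1.04;
	return 1.0;
}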
// . add the phrase that starts with the ith word
// . "read Of Mice and Men" should make 3 phrases:
// . read.ofmice
// . ofmice
// . mice.andmen
void Phrases::setPhrase ( int32_t i, int32_t niceness ) {
	// . if the ith word cannot start a phrase then we have no phrase
	// . we indicate NULL phraseIds with a spam of PSKIP
	// . we now index all regardless! we want to be able to search
	//   for "a thing" or something. so do it!
	//if ( ! m_bits->canStartPhrase ( i ) ) {
	//	m_phraseSpam[i] = PSKIP;
	//	m_phraseIds [i] = 0LL;
	//	return;
	//}
	// MDW: now Weights.cpp should encompass all this logic
	// or if score <= 0, set in Scores.cpp
	//if ( m_wordScores && m_wordScores[i] <= 0 ) {
	//	m_phraseSpam[i] = PSKIP;
	//	m_phraseIds [i] = 0LL;
	//	return;
	//}
	// hash of the phrase
	int64_t h = 0LL;
	// the hash of the two-word phrase (now we do 3,4 and 5 word phrases)
	int64_t h2 = 0LL;
	int64_t h3 = 0LL;
	//int64_t h4 = 0LL;
	//int64_t h5 = 0LL;
	// reset
	unsigned char pos = 0;
	// now look for other tokens that should follow the ith token
	int32_t nw = m_words->getNumWords();
	int32_t numWordsInPhrase = 1;
	// use the min spam from all words in the phrase as the spam for phrase
	char minSpam = -1;
	// we need to hash "1 / 8" differently from "1.8" from "1,000" etc.
	char isNum = is_digit(m_wptrs[i][0]);
	// min score
	//int32_t minScore ;
	//if ( m_wordScores ) minScore = m_wordScores[i];
	// if i is not a stop word, it can set the min spam initially
	//if ( ! m_bits->isStopWord(i) &&m_spam ) minSpam = m_spam->getSpam(i);
	// do not include punct/tag words in the m_numWordsTotal[j] count
	// of the total words in the phrase. these are just useless tails.
	int32_t lastWordj = -1;
	// loop over following words
	int32_t j;
	bool hasHyphen ;
	bool hasStopWord2 ;
	// . NOTE: a token can start a phrase but NOT be in it.
	// . like a large number for example.
	// . wordId is the lower ascii hash of the ith word
	// . NO... this is allowing the query operator PiiPe to start
	//   a phrase but not be in it, then the phrase id ends up just
	//   being the following word's id. causing the synonyms code to
	//   give a synonym which it should not in Synonyms::set()
	if ( ! m_bits->canBeInPhrase(i) )
		// so indeed, skip it then
		goto nophrase;
	//h = hash64 ( h, m_words->getWordId(i));
	h = m_wids[i];
	// set position
	pos = (unsigned char)m_wlens[i];
	//if (m_words->getStripWordId(i))
	//	h2 = hash64 ( h2, m_words->getStripWordId(i));
	//else h2 = h;
	hasHyphen = false;
	hasStopWord2 = m_bits->isStopWord(i);
	// this makes it true now too
	//if ( m_wlens[i] <= 2 ) hasStopWord = true;
	for ( j = i + 1 ; j < nw ; j++ ) {
		QUICKPOLL(niceness);
		// . do not allow more than 32 alnum/punct "words" in a phrase
		// . this prevents phrases with 100,000 words from slowing
		//   us down. would put us in a huge double-nested for loop
		if ( j > i + 32 ) goto nophrase;
		// deal with punct words
		if ( ! m_wids[j] ) {
			// if we cannot pair across word j then break
			if ( ! m_bits->canPairAcross (j) ) break;
			// does it have a hyphen?
			if (j==i+1 && m_words->hasChar(j,'-')) hasHyphen=true;
			/*
			// "D & B" --> dandb
			if (j==i+1 && m_words->hasChar(j,'&')) {
				// set this
				hasStopWord = true;
				// insert "and"
				int32_t conti=pos;
				h = hash64Lower_utf8_cont("and",3,h,&conti);
				pos=conti;
				// the two-word phrase, set it if we need to
				h2 = h;
				m_numWordsTotal2[i] = j-i+1;
			}
			*/
			continue;
		}
		// . if this word cannot be in a phrase then continue our
		//   search for a word that can
		// . no punctuation can be in a phrase currently (++?)
//if ( m_bits->canBeInPhrase (j) ) { //} // keep this set right //if (m_bits->isStopWord(j)||m_wlens[j]<=2) hasStopWord = true; //if ( m_bits->isStopWord(j) ) hasStopWord = true; // record lastWordj to indicate that word #j was a true word lastWordj = j; // . stop words should have a 0 spam value so don't count those // . added by mdw in march 2002 /* if ( ! m_bits->isStopWord(j) && m_spam ) { // maintain the min spam char spam = m_spam->getSpam ( j ); if ( minSpam == -1 || spam < minSpam ) minSpam = spam; // . min weight from score vector // . normal score here is 256, not 128, so shift // down 3 to normalize it relatively //if ( m_wordScores && (m_wordScores[j]>>3)<minScore) // minScore = m_wordScores[j]>>3; //if ( m_wordScores && m_wordScores[j] < minScore ) // minScore = m_wordScores[j]; } */ // if word #j can be in phrase then incorporate it's hash if ( m_bits->canBeInPhrase (j) ) { // continue the hash //unsigned char *p= (unsigned char *)m_wptrs[j]; //unsigned char *pend = p + m_wlens[j]; //for ( ; p < pend ; p++ ) // h ^= g_hashtab[pos++][*p]; int32_t conti = pos; // . get the punctuation mark separting two numbers // . use space if can't find one // . 1/234 1,234 1.234 10/11 "1 234" 1-5 //if (isNum && j==i + 2 && is_digit(m_wptrs[j][0]) ) { // // get punct mark // char c = m_wptrs[i+1][0]; // // if space try next // if(c==' '&&m_wlens[i+1]>1) c=m_wptrs[i+1][1]; // // treat comma as nothing // if ( c==',' ) c='\0'; // // treat / and . and - as they are, everything // // else should be treated as a space // else if(c!='/'&&c !='.'&& c!='-'&&c!=':')c=' '; // // incorporate into hash if c is there // if (c)h=hash64Lower_utf8_cont(&c,1,h,&conti); //} // hash the jth word into the hash h = hash64Lower_utf8_cont(m_wptrs[j], m_wlens[j], h, &conti ); pos = conti; //h = hash64 ( h , m_words->getWordId (j) ); //if (m_words->getStripWordId(j)) // h2 = hash64 ( h2, m_words->getStripWordId(j)); //else h2 = hash64(h2, m_words->getWordId(j)); numWordsInPhrase++; // N-word phrases? if ( numWordsInPhrase == 2 ) { // h != h2 ) { h2 = h; m_numWordsTotal2[i] = j-i+1; if ( m_bits->isStopWord(j) ) hasStopWord2 = true; continue; } if ( numWordsInPhrase == 3 ) { h3 = h; m_numWordsTotal3[i] = j-i+1; //continue; break; } /* if ( numWordsInPhrase == 4 ) { h4 = h; m_numWordsTotal4[i] = j-i+1; continue; } if ( numWordsInPhrase == 5 ) { h5 = h; m_numWordsTotal5[i] = j-i+1; continue; } */ } // if we cannot pair across word j then break if ( ! m_bits->canPairAcross (j) ) break; // keep chugging? if ( numWordsInPhrase >= 5 ) { // if we're not using stop words then break if ( ! m_useStopWords ) break; // if it's not a stop word then break if ( ! 
m_bits->isStopWord (j) ) break;
		}
		// otherwise, get the next word
	}
	// if we had no phrase then use 0 as id (need 2+ words to be a phrase)
	if ( numWordsInPhrase <= 1 ) {
	nophrase:
		m_phraseSpam[i] = PSKIP;
		//m_phraseIds [i] = 0LL;
		m_phraseIds2[i] = 0LL;
		m_phraseIds3[i] = 0LL;
		//m_stripPhraseIds [i] = 0LL;
		//m_numWordsTotal[i] = 0;
		m_numWordsTotal2[i] = 0;
		m_numWordsTotal3[i] = 0;
		return;
	}
	// don't jump the edge
	//if ( j >= nw ) j = nw - 1;
	// sanity check
	if ( lastWordj == -1 ) { char *xx = NULL; *xx = 0; }
	// set the phrase length (from word #i up to and including word #j)
	//m_numWordsTotal[i] = j - i + 1;
	//m_numWordsTotal [i] = lastWordj - i + 1;
	// sanity check
	if ( lastWordj - i + 1 > 255 ) { char *xx=NULL;*xx=0; }
	// set the phrase spam
	if ( minSpam == -1 ) minSpam = 0;
	m_phraseSpam[i] = minSpam;
	// return the phraseId
	//m_phraseIds [i] = h;
	// hyphen between numbers does not count (so 1-2 != 12)
	if ( isNum ) hasHyphen = false;
	// . the two word phrase id
	// . "cd rom"    -> cdrom
	// . "fly paper" -> flypaper
	// . "i-phone"   -> iphone
	// . "e-mail"    -> email
	if ( hasHyphen || ! hasStopWord2 ) {
		//m_phraseIds [i] = h;
		m_phraseIds2[i] = h2;
	}
	// . "st. and"    !-> stand
	// . "the rapist" !-> therapist
	else {
		//m_phraseIds [i] = h ^ 0x768867;
		m_phraseIds2[i] = h2 ^ 0x768867;
	}
	// forget hyphen logic for these
	m_phraseIds3[i] = h3;
	//m_phraseIds4[i] = h4;
	//m_phraseIds5[i] = h5;
	//if ( h != h2 ) m_stripPhraseIds[i] = h2;
	//else           m_stripPhraseIds[i] = 0LL;
	// the score weight, if any
	//if ( m_phraseScores ) m_phraseScores [i] = minScore;
	// sanity check
	//if(m_phraseScores && minScore == 0x7fffffff ) {char *xx =NULL;*xx=0;}
	// debug msg
	//char *w = m_words->getWord(i) ;
	//int32_t wlen = m_words->getWordLen(i) ;
	//for ( int32_t k = 0 ; k < wlen ; k++ )
	//	fprintf(stderr,"%c",w[k]);
	//fprintf(stderr,"--> hash=%"UINT64"\n",(uint64_t)h);
}
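// . a standalone sketch of the two-word phrase id rule above, assuming the
//   same hash64Lower_utf8_cont() continuation hashing that setPhrase()
//   uses; makePhraseId2() is an illustrative name and its ids are not
//   guaranteed byte-identical to the ones setPhrase() stores
static int64_t makePhraseId2 ( char *w1 , int32_t len1 ,
			       char *w2 , int32_t len2 ,
			       bool hasHyphen , bool hasStopWord ) {
	int32_t conti = 0;
	// the two words hash as one continued token: "cd"+"rom" -> cdrom
	int64_t h = hash64Lower_utf8_cont ( w1 , len1 , 0LL , &conti );
	h = hash64Lower_utf8_cont ( w2 , len2 , h , &conti );
	// "i-phone" -> iphone, "e-mail" -> email, "fly paper" -> flypaper
	if ( hasHyphen || ! hasStopWord ) return h;
	// perturb the id so "the rapist" does NOT collide with "therapist"
	return h ^ 0x768867;
}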
// . when a dump completes we free the primary mem space and make
//   the secondary mem space the new primary mem space
void RdbMem::freeDumpedMem( RdbTree *tree ) {
	// bail if we have no mem
	if ( m_memSize == 0 ) return;
	log("rdbmem: start freeing dumped mem");
	//char *memEnd = m_mem + m_memSize;
	// this should still be true so allocData() returns m_ptr2 ptrs
	if ( ! m_rdb->m_inDumpLoop ) { g_process.shutdownAbort(true); }
	// count how many data nodes we had to move to avoid corruption
	int32_t count = 0;
	int32_t scanned = 0;
	for ( int32_t i = 0 ; i < tree->m_minUnusedNode ; i++ ) {
		// give up control to handle search query stuff of niceness 0
		QUICKPOLL ( MAX_NICENESS );
		// skip node if parent is -2 (unoccupied)
		if ( tree->m_parents[i] == -2 ) continue;
		scanned++;
		// get the ptr
		char *data = tree->m_data[i];
		if ( ! data ) continue;
		// how could its data not be stored in here?
		// if ( data < m_mem ) {
		//	log("rdbmem: bad data1");
		//	continue;
		// }
		// if ( data >= memEnd ) {
		//	log("rdbmem: bad data2");
		//	continue;
		// }
		// is it in primary mem? m_ptr1 mem was just dumped
		// if growing upward
		bool needsMove = false;
		// if the primary mem (that was dumped) is
		// growing upwards
		if ( m_ptr1 < m_ptr2 ) {
			// and the node data is in it...
			if ( data < m_ptr1 ) needsMove = true;
		}
		// growing downward otherwise
		else if ( data >= m_ptr1 ) {
			needsMove = true;
		}
		if ( ! needsMove ) continue;
		// move it. m_inDumpLoop should still
		// be true so we will get added to
		// m_ptr2
		int32_t size;
		if ( tree->m_sizes ) size = tree->m_sizes[i];
		else                 size = tree->m_fixedDataSize;
		if ( size < 0 ) { g_process.shutdownAbort(true); }
		if ( size == 0 ) continue;
		// m_inDumpLoop is still true at this point
		// so allocData should return m_ptr2 guys
		char *newData = (char *)allocData(NULL,size,0);
		if ( ! newData ) {
			log("rdbmem: failed to alloc %i "
			    "bytes node %i",(int)size,(int)i);
			continue;
		}
		// debug test
		bool stillNeedsMove = false;
		if ( m_ptr1 < m_ptr2 ) {
			// and the node data is in it...
			if ( newData < m_ptr1 ) stillNeedsMove = true;
		}
		// growing downward otherwise
		else if ( newData >= m_ptr1 ) {
			stillNeedsMove = true;
		}
		if ( stillNeedsMove ) { // this should never happen!!
			log("rdbmem: olddata=0x%" PTRFMT" newdata=0x%" PTRFMT,
			    (PTRTYPE)data, (PTRTYPE)newData);
			log("rdbmem: still needs move!");
		}
		count++;
		gbmemcpy(newData,data,size);
		tree->m_data[i] = newData;
	}
	if ( count > 0 )
		log("rdbmem: moved %i tree nodes for %s",(int)count,
		    m_rdb->m_dbname);
	log("rdbmem: stop freeing dumped mem. scanned %i nodes.",(int)scanned);
	// save primary ptr
	char *tmp = m_ptr1;
	// debug
	//logf(LOG_DEBUG,
	//     "db: freeing dumped mem ptr1=%" PRIx32" ptr2=%" PRIx32".",m_ptr1,m_ptr2);
	// primary pointer, m_ptr1, becomes m_ptr2
	m_ptr1 = m_ptr2;
	// secondary ptr becomes primary
	m_ptr2 = tmp;
	// reset secondary (old primary mem was dumped out to disk)
	if ( m_ptr2 > m_ptr1 ) m_ptr2 = m_mem + m_memSize;
	else                   m_ptr2 = m_mem;
	// no longer 90% full
	m_is90PercentFull = false;
}
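// . freeDumpedMem() above depends on the two-heaps-in-one-buffer layout:
//   the primary heap bumps from one end of m_mem, the secondary from the
//   other, and each dump swaps their roles
// . a minimal sketch of just the swap/reset step under that assumption;
//   DualArenaSketch is illustrative, not a class in this codebase
struct DualArenaSketch {
	char    *m_mem;     // one contiguous allocation
	int32_t  m_memSize;
	char    *m_ptr1;    // primary bump pointer
	char    *m_ptr2;    // secondary bump pointer
	void swapAfterDump ( ) {
		// the secondary becomes the primary...
		char *tmp = m_ptr1;
		m_ptr1 = m_ptr2;
		m_ptr2 = tmp;
		// ...and the old primary, now safely on disk, is rewound
		// to its own end of the buffer so it can refill
		if ( m_ptr2 > m_ptr1 ) m_ptr2 = m_mem + m_memSize;
		else                   m_ptr2 = m_mem;
	}
};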
// returns -1 and sets g_errno on error, because 0 means langUnknown long Words::getLanguage( Sections *sections , long maxSamples, long niceness, long *langScore) { // calculate scores if not given //Scores calcdScores; //if ( ! scores ) { // if ( ! calcdScores.set( this,m_version,false ) ) // return -1; // scores = &calcdScores; //} // . take a random sample of words and look them up in the // language dictionary //HashTableT<long long, char> ht; HashTableX ht; long long langCount[MAX_LANGUAGES]; long long langWorkArea[MAX_LANGUAGES]; long numWords = m_numWords; //long skip = numWords/maxSamples; //if ( skip == 0 ) skip = 1; // reset the language count memset(langCount, 0, sizeof(long long)*MAX_LANGUAGES); // sample the words //long wordBase = 0; long wordi = 0; //if ( ! ht.set(maxSamples*1.5) ) return -1; if ( ! ht.set(8,1,(long)(maxSamples*8.0),NULL,0,false, niceness,"wordslang")) return -1; // . avoid words in these bad sections // . google seems to index SEC_MARQUEE so i took that out of badFlags long badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT; // shortcuts long long *wids = m_wordIds; long *wlens = m_wordLens; char **wptrs = m_words; //long langTotal = 0; // log ( LOG_WARN, "xmldoc: Picking language from %li words with %li skip", // numWords, skip ); char numOne = 1; Section **sp = NULL; if ( sections ) sp = sections->m_sectionPtrs; // this means null too if ( sections && sections->m_numSections == 0 ) sp = NULL; long maxCount = 1000; while ( wordi < numWords ) { // breathe QUICKPOLL( niceness ); // move to the next valid word if ( ! wids [wordi] ) { wordi++; continue; } if ( wlens[wordi] < 2 ) { wordi++; continue; } // skip if in a bad section //long flags = sections->m_sectionPtrs[i]->m_flags; // meaning script section ,etc if ( sp && ( sp[wordi]->m_flags & badFlags ) ) { wordi++; continue; } // check the language //unsigned char lang = 0; // Skip if word is capitalized and not preceded by a tag //if(s_isWordCap(getWord(wordi), getWordLen(wordi)) && // wordi > 0 && !getTagId(wordi - 1)) { // wordi++; // continue; //} // Skip word if bounded by '/' or '?' might be in a URL if(isBounded(wordi)) { wordi++; continue; } // is it arabic? sometimes they are spammy pages and repeat // a few arabic words over and over again, so don't do deduping // with "ht" before checking this. char cl = getCharacterLanguage ( wptrs[wordi] ); if ( cl ) { langCount[(unsigned char)cl]++; wordi++; continue; } //if(ht.getSlot(m_wordIds[wordi]) !=-1) { if(!ht.isEmpty(&m_wordIds[wordi]) ) { wordi++; continue; } // If we can't add the word, it's not that bad. // Just gripe about it in the log. if(!ht.addKey(&m_wordIds[wordi], &numOne)) { log(LOG_WARN, "build: Could not add word to temporary " "table, memory error?\n"); g_errno = ENOMEM; return -1; } if ( maxCount-- <= 0 ) break; // No lang from charset, got a phrase, and 0 language does not have // a score Order is very important! int foundone = 0; if ( // lang == 0 && // we seem to be missing hungarian and thai g_speller.getPhraseLanguages(getWord(wordi), getWordLen(wordi), langWorkArea) && // why must it have an "unknown score" of 0? // allow -1... i don't know what that means!! langWorkArea[0] <= 0) { int lasty = -1; for(int y = 1; y < MAX_LANGUAGES; y++) { if(langWorkArea[y] == 0) continue; langCount[y]++; long pop = langWorkArea[y]; // negative means in an official dictionary if ( pop < 0 ) { pop *= -1; langCount[y] += 1; } // extra? if ( pop > 1000 ) langCount[y] += 2; if ( pop > 10000 ) langCount[y] += 2; lasty = y; foundone++; } // . 
if it can only belong to one language
		// . helps fix the fact that our unifiedDict is crummy
		//   and identifies some words as being in a lot of languages
		//   like "Pronto" as being in english and not giving
		//   the popularities correctly.
		if ( foundone == 1 )
			// give massive boost
			langCount[lasty] += 10;
	}
	// . try to skip unknown words without killing sample size
	// . we lack russian, hungarian and arabic in the unified
	//   dict, so try to do character detection for those langs.
	// . should prevent them from being detected as unknown
	//   langs and coming up for english search 'gigablast'
	if ( ! foundone ) {
		langCount[langUnknown]++;
		// do not count towards sample size
		maxCount++;
	}
	// skip to the next word
	//wordBase += skip;
	//if ( wordi < wordBase )
	//	wordi = wordBase;
	//else
	wordi++;
	}
	// punish unknown count in case a doc has a lot of proper names
	// or something
	//langCount[langUnknown] /= 2;
	// get the lang with the max score then
	int l = s_findMaxIndex(langCount, MAX_LANGUAGES);
	// if(langCount[l] < 15) return(langUnknown);
	if(langScore) *langScore = langCount[l];
	// return if known now
	return l;
}
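// . the dictionary hit above converts a phrase-language popularity into
//   votes: 1 per language listed, +1 if the word is in an official
//   dictionary (negative pop), +2 more at pop > 1000 and again at
//   pop > 10000
// . a minimal sketch of that vote computation; votesForPop() is an
//   illustrative name only, not a function in this codebase
static int32_t votesForPop ( long pop ) {
	int32_t votes = 1;
	// negative means in an official dictionary
	if ( pop < 0 ) { pop *= -1; votes += 1; }
	if ( pop > 1000  ) votes += 2;
	if ( pop > 10000 ) votes += 2;
	return votes;
}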
// . THIS Msg0 class must be alloc'd, i.e. not on the stack, etc. // . if list is stored locally this tries to get it locally // . otherwise tries to get the list from the network // . returns false if blocked, true otherwise // . sets g_errno on error // . NOTE: i was having problems with queries being cached too long, you // see the cache here is a NETWORK cache, so when the machines that owns // the list updates it on disk it can't flush our cache... so use a small // maxCacheAge of like , 30 seconds or so... bool Msg0::getList ( int64_t hostId , // host to ask (-1 if none) int32_t ip , // info on hostId int16_t port , int32_t maxCacheAge , // max cached age in seconds bool addToCache , // add net recv'd list to cache? char rdbId , // specifies the rdb //char *coll , collnum_t collnum , RdbList *list , //key_t startKey , //key_t endKey , char *startKey , char *endKey , int32_t minRecSizes , // use -1 for no max void *state , void (* callback)(void *state ),//, RdbList *list ) , int32_t niceness , bool doErrorCorrection , bool includeTree , bool doMerge , int32_t firstHostId , int32_t startFileNum , int32_t numFiles , int32_t timeout , int64_t syncPoint , int32_t preferLocalReads , Msg5 *msg5 , Msg5 *msg5b , bool isRealMerge , //#ifdef SPLIT_INDEXDB bool allowPageCache , bool forceLocalIndexdb , bool noSplit , // doIndexdbSplit , int32_t forceParitySplit ) { //#else // bool allowPageCache ) { //#endif // this is obsolete! mostly, but we need it for PageIndexdb.cpp to // show a "termlist" for a given query term in its entirety so you // don't have to check each machine in the network. if this is true it // means to query each split and merge the results together into a // single unified termlist. only applies to indexdb/datedb. //if ( doIndexdbSplit ) { char *xx = NULL; *xx = 0; } // note this because if caller is wrong it hurts performance major!! //if ( doIndexdbSplit ) // logf(LOG_DEBUG,"net: doing msg0 with indexdb split true"); // warning if ( collnum < 0 ) log(LOG_LOGIC,"net: NULL collection. msg0."); //if ( doIndexdbSplit ) { char *xx=NULL;*xx=0; } // reset the list they passed us list->reset(); // get keySize of rdb m_ks = getKeySizeFromRdbId ( rdbId ); // if startKey > endKey, don't read anything //if ( startKey > endKey ) return true; if ( KEYCMP(startKey,endKey,m_ks)>0 ) { char *xx=NULL;*xx=0; }//rettrue // . reset hostid if it is dead // . this is causing UOR queries to take forever when we have a dead if ( hostId >= 0 && g_hostdb.isDead ( hostId ) ) hostId = -1; // no longer accept negative minrecsize if ( minRecSizes < 0 ) { g_errno = EBADENGINEER; log(LOG_LOGIC, "net: msg0: Negative minRecSizes no longer supported."); char *xx=NULL;*xx=0; return true; } // debug msg //if ( niceness != 0 ) log("HEY start"); // ensure startKey last bit clear, endKey last bit set //if ( (startKey.n0 & 0x01) == 0x01 ) // log("Msg0::getList: warning startKey lastbit set"); //if ( (endKey.n0 & 0x01) == 0x00 ) // log("Msg0::getList: warning endKey lastbit clear"); // remember these m_state = state; m_callback = callback; m_list = list; m_hostId = hostId; m_niceness = niceness; //m_ip = ip; //m_port = port; m_addToCache = addToCache; // . these define our request 100% //m_startKey = startKey; //m_endKey = endKey; KEYSET(m_startKey,startKey,m_ks); KEYSET(m_endKey,endKey,m_ks); m_minRecSizes = minRecSizes; m_rdbId = rdbId; m_collnum = collnum;// = coll; m_isRealMerge = isRealMerge; m_allowPageCache = allowPageCache; // . group to ask is based on the first key // . 
we only do 1 group per call right now // . groupMask must turn on higher bits first (count downwards kinda) // . titledb and spiderdb use special masks to get groupId // if diffbot.cpp is reading spiderdb from each shard we have to // get groupid from hostid here lest we core in getGroupId() below. // it does that for dumping spiderdb to the client browser. they // can download the whole enchilada. if ( hostId >= 0 && m_rdbId == RDB_SPIDERDB ) m_shardNum = 0; // did they force it? core until i figure out what this is else if ( forceParitySplit >= 0 ) //m_groupId = g_hostdb.getGroupId ( forceParitySplit ); m_shardNum = forceParitySplit; else //m_groupId = getGroupId ( m_rdbId , startKey , ! noSplit ); m_shardNum = getShardNum ( m_rdbId , startKey ); // if we are looking up a termlist in posdb that is split by termid and // not the usual docid then we have to set this posdb key bit that tells // us that ... if ( noSplit && m_rdbId == RDB_POSDB ) m_shardNum = g_hostdb.getShardNumByTermId ( startKey ); // how is this used? //if ( forceLocalIndexdb ) m_groupId = g_hostdb.m_groupId; if ( forceLocalIndexdb ) m_shardNum = getMyShardNum(); // . store these parameters // . get a handle to the rdb in case we can satisfy locally // . returns NULL and sets g_errno on error QUICKPOLL((m_niceness)); Rdb *rdb = getRdbFromId ( m_rdbId ); if ( ! rdb ) return true; // we need the fixedDataSize m_fixedDataSize = rdb->getFixedDataSize(); m_useHalfKeys = rdb->useHalfKeys(); // . debug msg // . Msg2 does this when checking for a cached compound list. // compound lists do not actually exist, they are merges of smaller // UOR'd lists. if ( maxCacheAge != 0 && ! addToCache && (numFiles > 0 || includeTree)) log(LOG_LOGIC,"net: msg0: " "Weird. check but don't add... rdbid=%"INT32".",(int32_t)m_rdbId); // set this here since we may not call msg5 if list not local //m_list->setFixedDataSize ( m_fixedDataSize ); // . now that we do load balancing we don't want to do a disk lookup // even if local if we are merging or dumping // . UNLESS g_conf.m_preferLocalReads is true if ( preferLocalReads == -1 ) preferLocalReads = g_conf.m_preferLocalReads; // . always prefer local for full split clusterdb // . and keep the tfndb/titledb lookups in the same stripe // . so basically we can't do biased caches if fully split //if ( g_conf.m_fullSplit ) preferLocalReads = true; preferLocalReads = true; // it it stored locally? bool isLocal = ( m_hostId == -1 && //g_hostdb.m_groupId == m_groupId ); m_shardNum == getMyShardNum() ); // only do local lookups if this is true if ( ! preferLocalReads ) isLocal = false; /* m_numSplit = 1; if ( g_hostdb.m_indexSplits > 1 && ( rdbId == RDB_POSDB || rdbId==RDB_DATEDB)&& ! forceLocalIndexdb && doIndexdbSplit ) { isLocal = false; //m_numSplit = INDEXDB_SPLIT; m_numSplit = g_hostdb.m_indexSplits; char *xx=NULL;*xx=0; } */ /* int64_t singleDocIdQuery = 0LL; if ( rdbId == RDB_POSDB ) { int64_t d1 = g_posdb.getDocId(m_startKey); int64_t d2 = g_posdb.getDocId(m_endKey); if ( d1+1 == d2 ) singleDocIdQuery = d1; } // . try the LOCAL termlist cache // . so when msg2 is evaluating a gbdocid:| query and it has to // use msg0 to go across the network to get the same damn termlist // over and over again for the same docid, this will help alot. // . ideally it'd be nice if the seo pipe in xmldoc.cpp can try to // send the same gbdocid:xxxx docids to the same hosts. maybe hash // based on docid into the list of hosts and if that host is busy // just chain until we find someone not busy. 
if ( singleDocIdQuery && getListFromTermListCache ( coll, m_startKey, m_endKey, maxCacheAge, list ) ) // found! return true; */ // but always local if only one host if ( g_hostdb.getNumHosts() == 1 ) isLocal = true; // force a msg0 if doing a docid restrictive query like // gbdocid:xxxx|<query> so we call cacheTermLists() //if ( singleDocIdQuery ) isLocal = false; // . if the group is local then do it locally // . Msg5::getList() returns false if blocked, true otherwise // . Msg5::getList() sets g_errno on error // . don't do this if m_hostId was specified if ( isLocal ) { // && !g_conf.m_interfaceMachine ) { if ( msg5 ) { m_msg5 = msg5; m_deleteMsg5 = false; } else { try { m_msg5 = new ( Msg5 ); } catch ( ... ) { g_errno = ENOMEM; log("net: Local alloc for disk read failed " "while tring to read data for %s. " "Trying remote request.", getDbnameFromId(m_rdbId)); goto skip; } mnew ( m_msg5 , sizeof(Msg5) , "Msg0" ); m_deleteMsg5 = true; } QUICKPOLL(m_niceness); // same for msg5b if ( msg5b ) { m_msg5b = msg5b; m_deleteMsg5b = false; } /* else if ( m_rdbId == RDB_TITLEDB ) { try { m_msg5b = new ( Msg5 ); } catch ( ... ) { g_errno = ENOMEM; log("net: Local alloc for disk read failed " "while tring to read data for %s. " "Trying remote request. 2.", getDbnameFromId(m_rdbId)); goto skip; } mnew ( m_msg5b , sizeof(Msg5) , "Msg0b" ); m_deleteMsg5b = true; } */ QUICKPOLL(m_niceness); if ( ! m_msg5->getList ( rdbId, m_collnum , m_list , m_startKey , m_endKey , m_minRecSizes , includeTree , // include Tree? addToCache , // addToCache? maxCacheAge , startFileNum , numFiles , this , gotListWrapper2 , niceness , doErrorCorrection , NULL , // cacheKeyPtr 0 , // retryNum -1 , // maxRetries true , // compensateForMerge syncPoint , NULL,//m_msg5b , m_isRealMerge , m_allowPageCache ) ) return false; // nuke it reset(); return true; } skip: // debug msg if ( g_conf.m_logDebugQuery ) log(LOG_DEBUG,"net: msg0: Sending request for data to " "shard=%"UINT32" " "listPtr=%"PTRFMT" minRecSizes=%"INT32" termId=%"UINT64" " //"startKey.n1=%"XINT32",n0=%"XINT64" (niceness=%"INT32")", "startKey.n1=%"XINT64",n0=%"XINT64" (niceness=%"INT32")", //g_hostdb.makeHostId ( m_groupId ) , m_shardNum, (PTRTYPE)m_list, m_minRecSizes, g_posdb.getTermId(m_startKey) , //m_startKey.n1,m_startKey.n0 , (int32_t)m_niceness); KEY1(m_startKey,m_ks),KEY0(m_startKey), (int32_t)m_niceness); char *replyBuf = NULL; int32_t replyBufMaxSize = 0; bool freeReply = true; // adjust niceness for net transmission bool realtime = false; //if ( minRecSizes + 32 < TMPBUFSIZE ) realtime = true; // if we're niceness 0 we need to pre-allocate for reply since it // might be received within the asynchronous signal handler which // cannot call mmalloc() if ( realtime ) { // niceness <= 0 || netnice == 0 ) { // . we should not get back more than minRecSizes bytes since // we are now performing merges // . it should not slow things down too much since the hashing // is 10 times slower than merging anyhow... // . CAUTION: if rdb is not fixed-datasize then this will // not work for us! it can exceed m_minRecSizes. replyBufMaxSize = m_minRecSizes ; // . get a little extra to fix the error where we ask for 64 // but get 72 // . where is that coming from? // . when getting titleRecs we often exceed the minRecSizes // . ?Msg8? was having trouble. was int16_t 32 bytes sometimes. replyBufMaxSize += 36; // why add ten percent? 
//replyBufMaxSize *= 110 ; //replyBufMaxSize /= 100 ; // make a buffer to hold the reply //#ifdef SPLIT_INDEXDB /* if ( m_numSplit > 1 ) { m_replyBufSize = replyBufMaxSize * m_numSplit; replyBuf = (char *) mmalloc(m_replyBufSize, "Msg0"); m_replyBuf = replyBuf; freeReply = false; } else */ //#endif replyBuf = (char *) mmalloc(replyBufMaxSize , "Msg0"); // g_errno is set and we return true if it failed if ( ! replyBuf ) { log("net: Failed to pre-allocate %"INT32" bytes to hold " "data read remotely from %s: %s.", replyBufMaxSize,getDbnameFromId(m_rdbId), mstrerror(g_errno)); return true; } } // . make a request with the info above (note: not in network order) // . IMPORTANT!!!!! if you change this change // Multicast.cpp::sleepWrapper1 too!!!!!!!!!!!! // no, not anymore, we commented out that request peeking code char *p = m_request; *(int64_t *) p = syncPoint ; p += 8; //*(key_t *) p = m_startKey ; p += sizeof(key_t); //*(key_t *) p = m_endKey ; p += sizeof(key_t); *(int32_t *) p = m_minRecSizes ; p += 4; *(int32_t *) p = startFileNum ; p += 4; *(int32_t *) p = numFiles ; p += 4; *(int32_t *) p = maxCacheAge ; p += 4; if ( p - m_request != RDBIDOFFSET ) { char *xx=NULL;*xx=0; } *p = m_rdbId ; p++; *p = addToCache ; p++; *p = doErrorCorrection; p++; *p = includeTree ; p++; *p = (char)niceness ; p++; *p = (char)m_allowPageCache; p++; KEYSET(p,m_startKey,m_ks); ; p+=m_ks; KEYSET(p,m_endKey,m_ks); ; p+=m_ks; // NULL terminated collection name //strcpy ( p , coll ); p += gbstrlen ( coll ); *p++ = '\0'; *(collnum_t *)p = m_collnum; p += sizeof(collnum_t); m_requestSize = p - m_request; // ask an individual host for this list if hostId is NOT -1 if ( m_hostId != -1 ) { // get Host Host *h = g_hostdb.getHost ( m_hostId ); if ( ! h ) { g_errno = EBADHOSTID; log(LOG_LOGIC,"net: msg0: Bad hostId of %"INT64".", m_hostId); return true; } // if niceness is 0, use the higher priority udpServer UdpServer *us ; uint16_t port; QUICKPOLL(m_niceness); //if ( niceness <= 0 || netnice == 0 ) { //if ( realtime ) { // us = &g_udpServer2; port = h->m_port2; } //else { us = &g_udpServer ; port = h->m_port ; // . returns false on error and sets g_errno, true otherwise // . calls callback when reply is received (or error) // . we return true if it returns false if ( ! us->sendRequest ( m_request , m_requestSize , 0x00 , // msgType h->m_ip , port , m_hostId , NULL , // the slotPtr this , gotSingleReplyWrapper , timeout , -1 , // backoff -1 , // maxwait replyBuf , replyBufMaxSize , m_niceness ) ) // cback niceness return true; // return false cuz it blocked return false; } // timing debug if ( g_conf.m_logTimingNet ) m_startTime = gettimeofdayInMilliseconds(); else m_startTime = 0; //if ( m_rdbId == RDB_INDEXDB ) log("Msg0:: getting remote indexlist. " // "termId=%"UINT64", " // "groupNum=%"UINT32"", // g_indexdb.getTermId(m_startKey) , // g_hostdb.makeHostId ( m_groupId ) ); /* // make the cache key so we can see what remote host cached it, if any char cacheKey[MAX_KEY_BYTES]; //key_t cacheKey = makeCacheKey ( startKey , makeCacheKey ( startKey , endKey , includeTree , minRecSizes , startFileNum , numFiles , cacheKey , m_ks ); */ // . get the top int32_t of the key // . i guess this will work for 128 bit keys... hmmmmm int32_t keyTop = hash32 ( (char *)startKey , m_ks ); /* // allocate space if ( m_numSplit > 1 ) { int32_t need = m_numSplit * sizeof(Multicast) ; char *buf = (char *)mmalloc ( need,"msg0mcast" ); if ( ! 
buf ) return true; m_mcasts = (Multicast *)buf; for ( int32_t i = 0; i < m_numSplit ; i++ ) m_mcasts[i].constructor(); } */ // . otherwise, multicast to a host in group "groupId" // . returns false and sets g_errno on error // . calls callback on completion // . select first host to send to in group based on upper 32 bits // of termId (m_startKey.n1) //#ifdef SPLIT_INDEXDB // . need to send out to all the indexdb split hosts m_numRequests = 0; m_numReplies = 0; //for ( int32_t i = 0; i < m_numSplit; i++ ) { QUICKPOLL(m_niceness); //int32_t gr; char *buf; /* if ( m_numSplit > 1 ) { gr = g_indexdb.getSplitGroupId ( baseGroupId, i ); buf = &replyBuf[i*replyBufMaxSize]; } else { */ //gr = m_groupId; buf = replyBuf; //} // get the multicast Multicast *m = &m_mcast; //if ( m_numSplit > 1 ) m = &m_mcasts[i]; if ( ! m->send ( m_request , //#else // if ( ! m_mcast.send ( m_request , //#endif m_requestSize, 0x00 , // msgType 0x00 false , // does multicast own request? m_shardNum , //#ifdef SPLIT_INDEXDB // gr , // group + offset //#else // m_groupId , // group to send to (groupKey) //#endif false , // send to whole group? //m_startKey.n1, // key is passed on startKey keyTop , // key is passed on startKey this , // state data NULL , // state data gotMulticastReplyWrapper0 , timeout , // timeout in seconds (was 30) niceness , realtime , firstHostId , //#ifdef SPLIT_INDEXDB // &replyBuf[i*replyBufMaxSize] , //#else // replyBuf , //#endif buf , replyBufMaxSize , freeReply , // free reply buf? true , // do disk load balancing? maxCacheAge , //(key_t *)cacheKey , // multicast uses it for determining the best // host to send the request to when doing // disk load balancing. if the host has our // data cached, then it will probably get to // handle the request. for now let's just assume // this is a 96-bit key. TODO: fix... 0 , // *(key_t *)cacheKey , rdbId , minRecSizes ) ) { log("net: Failed to send request for data from %s in shard " "#%"UINT32" over network: %s.", getDbnameFromId(m_rdbId),m_shardNum, mstrerror(g_errno)); // no, multicast will free this when it is destroyed //if (replyBuf) mfree ( replyBuf , replyBufMaxSize , "Msg22" ); // but speed it up //#ifdef SPLIT_INDEXDB m_errno = g_errno; m->reset(); if ( m_numRequests > 0 ) return false; //#else // m_mcast.reset(); //#endif return true; } //#ifdef SPLIT_INDEXDB m_numRequests++; //#endif // we blocked return false; }
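// . for reference, the request assembled above has this fixed layout,
//   which the parse in handleRequest0() below must mirror exactly; the
//   offsets are just the running sum of the sizes in the code, and this
//   assumes the commented-out NULL-terminated collection name is not
//   written (matching the parse side, which reads collnum right after
//   the keys)
//
//   offset     size                field
//   0          8                   syncPoint
//   8          4                   minRecSizes
//   12         4                   startFileNum
//   16         4                   numFiles
//   20         4                   maxCacheAge
//   24         1                   rdbId           (RDBIDOFFSET)
//   25         1                   addToCache
//   26         1                   doErrorCorrection
//   27         1                   includeTree
//   28         1                   niceness
//   29         1                   allowPageCache
//   30         m_ks                startKey
//   30+m_ks    m_ks                endKey
//   30+2*m_ks  sizeof(collnum_t)   collnum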
// . reply to a request for an RdbList // . MUST call g_udpServer::sendReply or sendErrorReply() so slot can // be destroyed void handleRequest0 ( UdpSlot *slot , int32_t netnice ) { // if niceness is 0, use the higher priority udpServer UdpServer *us = &g_udpServer; //if ( netnice == 0 ) us = &g_udpServer2; // get the request char *request = slot->m_readBuf; int32_t requestSize = slot->m_readBufSize; // collection is now stored in the request, so i commented this out //if ( requestSize != MSG0_REQ_SIZE ) { // log("net: Received bad data request size of %"INT32" bytes. " // "Should be %"INT32".", requestSize ,(int32_t)MSG0_REQ_SIZE); // us->sendErrorReply ( slot , EBADREQUESTSIZE ); // return; //} // parse the request char *p = request; int64_t syncPoint = *(int64_t *)p ; p += 8; //key_t startKey = *(key_t *)p ; p += sizeof(key_t); //key_t endKey = *(key_t *)p ; p += sizeof(key_t); int32_t minRecSizes = *(int32_t *)p ; p += 4; int32_t startFileNum = *(int32_t *)p ; p += 4; int32_t numFiles = *(int32_t *)p ; p += 4; int32_t maxCacheAge = *(int32_t *)p ; p += 4; char rdbId = *p++; char addToCache = *p++; char doErrorCorrection = *p++; char includeTree = *p++; // this was messing up our niceness conversion logic int32_t niceness = slot->m_niceness;//(int32_t)(*p++); // still need to skip it though! p++; bool allowPageCache = (bool)(*p++); char ks = getKeySizeFromRdbId ( rdbId ); char *startKey = p; p+=ks; char *endKey = p; p+=ks; // then null terminated collection //char *coll = p; collnum_t collnum = *(collnum_t *)p; p += sizeof(collnum_t); CollectionRec *xcr = g_collectiondb.getRec ( collnum ); if ( ! xcr ) g_errno = ENOCOLLREC; // error set from XmlDoc::cacheTermLists()? if ( g_errno ) { us->sendErrorReply ( slot , EBADRDBID ); return;} // is this being called from callWaitingHandlers() //bool isRecall = (netnice == 99); // . get the rdb we need to get the RdbList from // . returns NULL and sets g_errno on error //Msg0 msg0; //Rdb *rdb = msg0.getRdb ( rdbId ); Rdb *rdb = getRdbFromId ( rdbId ); if ( ! rdb ) { us->sendErrorReply ( slot , EBADRDBID ); return;} // keep track of stats rdb->readRequestGet ( requestSize ); /* // keep track of stats if ( ! isRecall ) rdb->readRequestGet ( requestSize ); int64_t singleDocId2 = 0LL; if ( rdbId == RDB_POSDB && maxCacheAge ) { int64_t d1 = g_posdb.getDocId(startKey); int64_t d2 = g_posdb.getDocId(endKey); if ( d1+1 == d2 ) singleDocId2 = d1; } // have we parsed this docid and cached its termlists? bool shouldBeCached2 = false; if ( singleDocId2 && isDocIdInTermListCache ( singleDocId2 , coll ) ) shouldBeCached2 = true; // if in the termlist cache, send it back right away char *trec; int32_t trecSize; if ( singleDocId2 && getRecFromTermListCache(coll, startKey, endKey, maxCacheAge, &trec, &trecSize) ) { // if in cache send it back! us->sendReply_ass(trec,trecSize,trec,trecSize,slot); return; } // if should be cached but was not found then it's probably a // synonym form not in the doc content. make an empty list then. if ( shouldBeCached2 ) { // send back an empty termlist us->sendReply_ass(NULL,0,NULL,0,slot); return; } // MUST be in termlist cache! if not in there it is a probably // a synonym term termlist of a word in the doc. if ( isRecall ) { // send back an empty termlist us->sendReply_ass(NULL,0,NULL,0,slot); return; } // init waiting table? static bool s_waitInit = false; if ( ! s_waitInit ) { // do not repeat s_waitInit = true; // niceness = 0 if ( ! 
g_waitingTable.set(8,4,2048,NULL,0,true,0,"m5wtbl")){ log("msg5: failed to init waiting table"); // error kills us! us->sendErrorReply ( slot , EBADRDBID ); return; } } // wait in waiting table? if ( singleDocId2 && g_waitingTable.isInTable ( &singleDocId2 ) ) { g_waitingTable.addKey ( &singleDocId2 , &slot ); return; } // if it's for a special gbdocid: query then cache ALL termlists // for this docid into g_termListCache right now if ( singleDocId2 ) { // have all further incoming requests for this docid // wait in the waiting table g_waitingTable.addKey ( &singleDocId2 , &slot ); // load the title rec and store its posdb termlists in cache XmlDoc *xd; try { xd = new ( XmlDoc ); } catch ( ... ) { g_errno = ENOMEM; us->sendErrorReply ( slot , g_errno ); return; } mnew ( xd, sizeof(XmlDoc),"msg0xd"); // always use niceness 1 now even though we use niceness 0 // to make the cache hits fast //niceness = 1; // . load the old title rec first and just recycle all // . typically there might be a few hundred related docids // each with 50,000 matching queries on average to evaluate // with the gbdocid:xxxx| restriction? if ( ! xd->set3 ( singleDocId2 , coll , niceness ) ) { us->sendErrorReply ( slot , g_errno ); return;} // init the new xmldoc xd->m_callback1 = callWaitingHandlers; xd->m_state = xd; // . if this blocks then return // . should call loadOldTitleRec() and get JUST the posdb recs // by setting m_useTitledb, etc. to false. then it should // make posdb termlists with the compression using // RdbList::addRecord() and add those lists to // g_termListCache if ( ! xd->cacheTermLists ( ) ) return; // otherwise, it completed right away! callWaitingHandlers ( xd ); return; } */ /* // init special sectiondb cache? if ( rdbId == RDB_SECTIONDB && ! s_initCache ) { // try to init cache if ( ! s_sectiondbCache.init ( 20000000 , // 20MB max mem -1 , // fixed data size false , // support lists? 20000 , // 20k max recs false , // use half keys? "secdbche", // dbname false, // load from disk? sizeof(key128_t), //cachekeysize 0 , // data key size 20000 )) // numPtrs max log("msg0: failed to init sectiondb cache: %s", mstrerror(g_errno)); else s_initCache = true; } // check the sectiondb cache if ( rdbId == RDB_SECTIONDB ) { //int64_t sh48 = g_datedb.getTermId((key128_t *)startKey); // use the start key now!!! char *data; int32_t dataSize; if (s_sectiondbCache.getRecord ( coll, startKey,//&sh48, &data, &dataSize, true, // docopy? 600, // maxage (10 mins) true, // inc counts? NULL, // cachedtime true // promoteRec? )){ // debug //log("msg0: got sectiondblist in cache datasize=%"INT32"", // dataSize); // send that back g_udpServer.sendReply_ass ( data , dataSize , data , dataSize , slot , 60 , NULL , doneSending_ass , -1 , -1 , true ); return; } } */ // . do a local get // . create a msg5 to get the list State00 *st0 ; try { st0 = new (State00); } catch ( ... ) { g_errno = ENOMEM; log("Msg0: new(%"INT32"): %s", (int32_t)sizeof(State00),mstrerror(g_errno)); us->sendErrorReply ( slot , g_errno ); return; } mnew ( st0 , sizeof(State00) , "State00" ); // timing debug if ( g_conf.m_logTimingNet ) st0->m_startTime = gettimeofdayInMilliseconds(); // save slot in state st0->m_slot = slot; // save udp server to send back reply on st0->m_us = us; // init this one st0->m_niceness = niceness; st0->m_rdbId = rdbId; QUICKPOLL(niceness); // debug msg if ( maxCacheAge != 0 && ! addToCache ) log(LOG_LOGIC,"net: msg0: check but don't add... rdbid=%"INT32".", (int32_t)rdbId); // . 
if this request came over on the high priority udp server
	//   make sure the priority gets passed along
	// . return if this blocks
	// . we'll call sendReply later
	if ( ! st0->m_msg5.getList ( rdbId ,
				     collnum ,
				     &st0->m_list ,
				     startKey ,
				     endKey ,
				     minRecSizes ,
				     includeTree , // include tree?
				     addToCache , // addToCache?
				     maxCacheAge ,
				     startFileNum ,
				     numFiles ,
				     st0 ,
				     gotListWrapper ,
				     niceness ,
				     doErrorCorrection ,
				     NULL , // cacheKeyPtr
				     0 , // retryNum
				     2 , // maxRetries
				     true , // compensateForMerge
				     syncPoint ,
				     NULL,//&st0->m_msg5b ,
				     false,
				     allowPageCache ) )
		return;
	// call the wrapper ourselves
	gotListWrapper ( st0 , NULL , NULL );
}
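// . Msg3::doneScanning() below retries failed disk reads with a linear
//   backoff: 10ms on the first retry, then 200ms times the retry number,
//   capped at 10 seconds between tries
// . a minimal sketch of that schedule; computeRetryWaitMs() is an
//   illustrative name, not a function in this codebase
static int32_t computeRetryWaitMs ( int32_t retryNum ) {
	int32_t wait;
	if ( retryNum == 1 ) wait = 10;
	else                 wait = 200 * retryNum;
	// . don't wait more than 10 secs between tries
	// . i've seen gf0 and gf16 get mega saturated
	if ( wait > 10000 ) wait = 10000;
	return wait;
}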
// . but now that we may get a list remotely to fix data corruption,
//   this may indeed block
bool Msg3::doneScanning ( ) {
	QUICKPOLL(m_niceness);
	// . did we have any error on any scan?
	// . if so, repeat ALL of the scans
	g_errno = m_errno;
	// 2 retries is the default
	int32_t max = 2;
	// see if explicitly provided by the caller
	if ( m_maxRetries >= 0 ) max = m_maxRetries;
	// now use -1 (no max) as the default no matter what
	max = -1;
	// ENOMEM is particularly contagious, so watch out with it...
	if ( g_errno == ENOMEM && m_maxRetries == -1 ) max = 0;
	// msg0 sets maxRetries to 2, don't let max stay set to -1
	if ( g_errno == ENOMEM && m_maxRetries != -1 ) max = m_maxRetries;
	// when thread cannot alloc enough read buf it keeps the read buf
	// set to NULL and BigFile.cpp sets g_errno to EBUFTOOSMALL
	if ( g_errno == EBUFTOOSMALL && m_maxRetries == -1 ) max = 0;
	// msg0 sets maxRetries to 2, don't let max stay set to -1
	if ( g_errno == EBUFTOOSMALL && m_maxRetries != -1 ) max = m_maxRetries;
	// . if no thread slots available, that hogs up serious memory.
	//   the size of Msg3 is 82k, so having just 5000 of them is 430MB.
	// . i just made Msg3 alloc mem when it needs more than about 2k
	//   so this problem is greatly reduced, therefore let's keep
	//   retrying... forever if no thread slots in thread queue since
	//   we become the thread queue in a way.
	if ( g_errno == ENOTHREADSLOTS ) max = -1;
	// this is set above if the map has the same consecutive key repeated
	// and the read is enormous
	if ( g_errno == ECORRUPTDATA ) max = 0;
	// usually bad disk failures, don't retry those forever
	//if ( g_errno == EIO ) max = 3;
	// no, now our hitachis return these even when they're good so
	// we have to keep retrying forever
	if ( g_errno == EIO ) max = -1;
	// count these so we do not take drives offline just because
	// kernel ring buffer complains...
	if ( g_errno == EIO ) g_numIOErrors++;
	// bail early on high priority reads for these errors
	if ( g_errno == EDISKSTUCK && m_niceness == 0 ) max = 0;
	if ( g_errno == EIO        && m_niceness == 0 ) max = 0;
	// how does this happen? we should never bail out on a low priority
	// disk read... we just wait for it to complete...
	if ( g_errno == EDISKSTUCK && m_niceness != 0 ) { char *xx=NULL;*xx=0;}
	// on I/O, give up and call it corrupt after a while. some hitachis
	// have I/O errors on little spots, like gk88, maybe we can fix him
	if ( g_errno == EIO && m_retryNum >= 5 ) {
		m_errno = ECORRUPTDATA;
		m_hadCorruption = true;
		// do not do any retries any more
		max = 0;
	}
	// convert m_errno to ECORRUPTDATA if it is EBUFTOOSMALL and the
	// max of the bytesToRead is over 500MB.
	// if bytesToRead was ludicrous, then assume that the data file
	// was corrupted, the map was regenerated and it patched
	// over the corrupted bits which were 500MB or more in size.
	// we cannot practically allocate that much, so let's just
	// give back an empty buffer. treat it like corruption...
	// the way it patches is to store the same key over all the corrupted
	// pages, which can get pretty big. so if you read a range with that
	// key you will be hurting!!
	// this may be the same scenario as when the rdbmap has consecutive
	// same keys. see above where we set m_errno to ECORRUPTDATA...
	if ( g_errno == EBUFTOOSMALL ) {
		int32_t biggest = 0;
		for ( int32_t i = 0 ; i < m_numFileNums ; i++ ) {
			if ( m_scans[i].m_bytesToRead < biggest ) continue;
			biggest = m_scans[i].m_bytesToRead;
		}
		if ( biggest > 500000000 ) {
			log("db: Max read size was %" PRId32" > 500000000. 
Assuming " "corrupt data in data file.",biggest); m_errno = ECORRUPTDATA; m_hadCorruption = true; // do not do any retries on this, the read was > 500MB max = 0; } } // if shutting down gb then do not retry, so we can shut down; it // can't shut down until all threads are out of the queue i think if ( g_process.m_mode == EXIT_MODE && max < 0 ) { //log("msg3: forcing retries to 0 because shutting down"); max = 0; } // get base, returns NULL and sets g_errno to ENOCOLLREC on error RdbBase *base = getRdbBase( m_rdbId, m_collnum ); if ( ! base ) { return true; } // this really slows things down because it blocks the cpu so // leave it out for now #ifdef GBSANITYCHECK // check for corruption here, do not do it again in Msg5 if we pass if ( ! g_errno ) { // && g_conf.m_doErrorCorrection ) { int32_t i; for ( i = 0 ; i < m_numFileNums ; i++ ) if ( ! m_lists[i].checkList_r ( false, false ) ) break; if ( i < m_numFileNums ) { g_errno = ECORRUPTDATA; m_errno = ECORRUPTDATA; max = g_conf.m_corruptRetries; // try 100 times log("db: Encountered corrupt list in file %s.", base->getFile(m_fileNums[i])->getFilename()); } else m_listsChecked = true; } #endif // try to fix this error i've seen if ( g_errno == EBADENGINEER && max == -1 ) max = 100; // . if we had an ETRYAGAIN error, then try again now // . it usually means the whole file or a part of it was deleted // before we could finish reading it, so we should re-read all now // . RdbMerge deletes BigFiles after it merges them and also chops // off file heads // . now that we have threads i'd imagine we'd get EBADFD or something // . i've also seen "illegal seek" as well if ( m_errno && (m_retryNum < max || max < 0) && // this will complete in due time, we can't call a sleep wrapper // on it because the read is really still pending... m_errno != EDISKSTUCK ) { // print the error static time_t s_time = 0; time_t now = getTime(); if ( now - s_time > 5 || g_errno != ENOTHREADSLOTS ) { log("net: Had error reading %s: %s. Retrying. " "(retry #%" PRId32")", base->m_dbname,mstrerror(m_errno) , m_retryNum ); s_time = now; } // send email alert if in an infinite loop, but don't send // more than once every 2 hours static int32_t s_lastSendTime = 0; if ( m_retryNum == 100 && getTime() - s_lastSendTime > 3600*2){ // remove this for now it is going off all the time //g_pingServer.sendEmail(NULL,//g_hostdb.getMyHost(), // "100 read retries",true); s_lastSendTime = getTime(); } // clear g_errno for the call to readList() g_errno = 0; // free the list buffer since if we have 1000 Msg3s retrying // it will totally use all of our memory for ( int32_t i = 0 ; i < m_numChunks ; i++ ) m_lists[i].destructor(); // count retries m_retryNum++; // backoff scheme, wait 200ms more each time int32_t wait ; if ( m_retryNum == 1 ) wait = 10; else wait = 200 * m_retryNum; // . don't wait more than 10 secs between tries // . i've seen gf0 and gf16 get mega saturated if ( wait > 10000 ) wait = 10000; // sleep "wait" ms then retry the read if ( g_loop.registerSleepCallback ( wait , // ms this , doneSleepingWrapper3, m_niceness)) return false; // otherwise, registration failed log( "net: Failed to register sleep callback for retry. " "Abandoning read. This is bad."); // return, g_errno should be set g_errno = EBUFTOOSMALL; m_errno = EBUFTOOSMALL; return true; } // if we got an error and should not retry any more then give up if ( g_errno ) { log( "net: Had error reading %s: %s. 
Giving up after %" PRId32" " "retries.", base->m_dbname,mstrerror(g_errno) , m_retryNum ); return true; } // note it if the retry finally worked if ( m_retryNum > 0 ) log(LOG_INFO,"disk: Read succeeded after retrying %" PRId32" times.", (int32_t)m_retryNum); // count total bytes for logging int32_t count = 0; // . constrain all lists to make merging easier // . if we have only one list, then that's nice cuz the constrain // will allow us to send it right away w/ zero copying // . if we have only 1 list, it won't be merged into a final list, // that is, we'll just set m_list = &m_lists[i] for ( int32_t i = 0 ; i < m_numFileNums ; i++ ) { QUICKPOLL(m_niceness); // count total bytes for logging count += m_lists[i].getListSize(); // . hint offset is relative to the offset of first key we read // . if that key was only 6 bytes RdbScan shifts the list buf // down 6 bytes to make the first key 12 bytes... a // requirement for all RdbLists // . don't inc it, though, if it was 0, pointing to the start // of the list because our shift won't affect that if ( m_scans[i].m_shifted == 6 && m_hintOffsets[i] > 0 ) m_hintOffsets[i] += 6; // posdb double compression if ( m_scans[i].m_shifted == 12 && m_hintOffsets[i] > 0 ) m_hintOffsets[i] += 12; // . don't constrain on minRecSizes here because it may // make our endKey smaller, which will cause problems // when Msg5 merges these lists. // . If all lists have different endKeys RdbList's merge // chooses the min and will merge in recs beyond that // causing a bad list BECAUSE we don't check to make // sure that recs we are adding are below the endKey // . if we only read from one file then constrain based // on minRecSizes so we can send the list back w/o merging // OR if just merging with RdbTree's list int32_t mrs ; // . constrain to m_minRecSizesOrig, not m_minRecSizes cuz // that could be adjusted by compensateForNegativeRecs() // . but, really, they should be the same if we only read from // the root file if ( m_numFileNums == 1 ) mrs = m_minRecSizesOrig; else mrs = -1; // . this returns false and sets g_errno on error // . like if data is corrupt BigFile *ff = base->getFile(m_fileNums[i]); // if we did a merge really quick and deleted one of the // files we were reading, i've seen 'ff' be NULL char *filename = "lostfilename"; if ( ff ) filename = ff->getFilename(); // compute cache info RdbCache *rpc = getDiskPageCache ( m_rdbId ); if ( ! m_allowPageCache ) rpc = NULL; int64_t vfd ; if ( ff ) vfd = ff->getVfd(); key192_t ck ; if ( ff ) ck = makeCacheKey ( vfd , m_scans[i].m_offset , m_scans[i].m_bytesToRead ); if ( m_validateCache && ff && rpc && vfd != -1 ) { bool inCache; char *rec; int32_t recSize; inCache = rpc->getRecord ( (collnum_t)0 , // collnum (char *)&ck , &rec , &recSize , true , // copy? -1 , // maxAge, none true ); // inc counts? if ( inCache && // 1st byte is RdbScan::m_shifted ( m_lists[i].m_listSize != recSize-1 || memcmp ( m_lists[i].m_list , rec+1,recSize-1) || *rec != m_scans[i].m_shifted ) ) { log("msg3: cache did not validate"); char *xx=NULL;*xx=0; } mfree ( rec , recSize , "vca" ); } /////// // // STORE IN PAGE CACHE // /////// // store what we read in the cache. don't bother storing // if it was a retry, just in case something strange happened. // storing before the constrain call is more efficient. if ( m_retryNum<=0 && ff && rpc && vfd != -1 && ! 
m_scans[i].m_inPageCache ) rpc->addRecord ( (collnum_t)0 , // collnum (char *)&ck , // rec1 is this little thingy &m_scans[i].m_shifted, 1, // rec2 m_lists[i].getList() , m_lists[i].getListSize() , 0 ); // timestamp. 0 = now QUICKPOLL(m_niceness); // if from our 'page' cache, no need to constrain if ( ! m_lists[i].constrain ( m_startKey , m_constrainKey , // m_endKey mrs , // m_minRecSizes m_hintOffsets[i] , //m_hintKeys [i] , &m_hintKeys [i*m_ks] , filename,//ff->getFilename() , m_niceness ) ) { log("net: Had error while constraining list read from " "%s: %s/%s. vfd=%" PRId32" parts=%" PRId32". " "This is likely caused by corrupted " "data on disk.", mstrerror(g_errno), ff->getDir(), ff->getFilename(), ff->m_vfd , (int32_t)ff->m_numParts ); continue; } } // print the time if ( g_conf.m_logTimingDb ) { int64_t now = gettimeofdayInMilliseconds(); int64_t took = now - m_startTime; log(LOG_TIMING, "net: Took %" PRId64" ms to read %" PRId32" lists of %" PRId32" bytes total" " from %s (niceness=%" PRId32").", took,m_numFileNums,count,base->m_dbname,m_niceness); } return true; }
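// NOTE: a minimal standalone sketch (not gb code) of the retry backoff used
// in Msg3::doneScanning() above: the first retry waits 10ms, each later retry
// n waits 200*n ms, capped at 10 seconds. computeRetryWaitMs() is a
// hypothetical helper name used only for illustration.
#include <cstdio>
#include <cstdint>

static int32_t computeRetryWaitMs ( int32_t retryNum ) {
	// first retry is nearly immediate
	if ( retryNum == 1 ) return 10;
	// back off linearly, 200ms more per retry
	int32_t wait = 200 * retryNum;
	// don't wait more than 10 secs between tries
	if ( wait > 10000 ) wait = 10000;
	return wait;
}

int main ( ) {
	// shows the ramp: 10, 2200, 4200, 6200, 8200, 10000 (capped)
	for ( int32_t n = 1 ; n <= 51 ; n += 10 )
		printf ( "retry #%d waits %dms\n" , n , computeRetryWaitMs(n) );
	return 0;
}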
bool Statsdb::addEventPoint ( long t1 , long parmHash , float oldVal , float newVal , long thickness ) { // convert t1 into pixel position float af = (float)DX * (float)(t1 - m_t1) / (float)(m_t2 - m_t1); // round it to nearest pixel long a = (long)(af + .5) ;//+ m_bx; // convert t2 into pixel position //float bf = (float)DX * (float)(t2 - m_t1) / (float)(m_t2 - m_t1); // round it to nearest pixel //long b = (long)(bf + .5) + m_bx; //if ( a > b ) { char *xx=NULL;*xx=0; } // 5 pixel width when rendering the square, 2 pixel boundary long b = a + 7; // make sure we got it Parm *m = g_parms.getParmFromParmHash ( parmHash ); if ( ! m ) { log("statsdb: unrecognized parm hash = %li",parmHash); return true; //char *xx=NULL;*xx=0; } } // go down each line of points for ( long i = 0 ; i < MAX_LINES ; i++ ) { // breathe QUICKPOLL ( m_niceness ); // . is there room for us in this line? // . see what other lines/events are on this line long slot = m_ht3.getSlot ( &i ); // loop over all events on this line for ( ; slot >= 0 ; slot = m_ht3.getNextSlot ( slot , &i ) ) { // breathe QUICKPOLL ( m_niceness ); // get the offset long offset = *(long *)m_ht3.getValueFromSlot ( slot ); // get buffer char *buf = m_sb3.getBufStart(); // get its value EventPoint *p = (EventPoint *)(buf + offset); // check its boundaries, require 2 pixel spacing if ( a < p->m_a && b >= p->m_a ) break; if ( b > p->m_b && a <= p->m_b ) break; if ( a >= p->m_a && b <= p->m_b ) break; } // we collided with another event on this line, try next line if ( slot >= 0 ) continue; // make sure we got room if ( ! m_sb3.reserve2x ( sizeof(EventPoint) ) ) return false; // add it in EventPoint *pp = (EventPoint *)m_sb3.getBuf(); // set it pp->m_a = a; pp->m_b = b; //pp->m_colorRGB = colorRGB; pp->m_parmHash = parmHash; pp->m_oldVal = oldVal; pp->m_newVal = newVal; pp->m_thickness = thickness; // store the offset in case m_sb3 reallocates long length = m_sb3.length(); // tell safebuf to skip over it now m_sb3.incrementLength ( sizeof(EventPoint) ); // add line to hashtable if ( ! m_ht3.addKey ( (void *)&i , &length ) ) return false; // all done now return true; } // crap no room! log("stats: no room in graph for event"); return true; }
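// A standalone sketch (hypothetical helper, not gb code) of the
// interval-collision test addEventPoint() uses above to pick a line for an
// event box: the box [a,b] collides with an existing box [pa,pb] when the two
// intervals overlap or one contains the other, mirroring the three boundary
// checks in the loop.
#include <cstdio>

static bool collides ( long a , long b , long pa , long pb ) {
	if ( a <  pa && b >= pa ) return true; // we straddle its left edge
	if ( b >  pb && a <= pb ) return true; // we straddle its right edge
	if ( a >= pa && b <= pb ) return true; // we sit fully inside it
	return false;
}

int main ( ) {
	printf ( "%d\n" , collides ( 0 , 7 ,  5 , 12 ) ); // 1: overlaps
	printf ( "%d\n" , collides ( 0 , 7 , 20 , 27 ) ); // 0: fits on the line
	return 0;
}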
bool sendTurkPageReply ( State60 *st ) { XmlDoc *xd = &st->m_xd; //char *content = xd->ptr_utf8Content; //int32_t contentLen = xd->size_utf8Content - 1; // count the total number of EventDesc classes for all evids //char *evd = xd->ptr_eventData; //EventDisplay *ed = (EventDisplay *)evd; //char *addr = evd + (int32_t)ed->m_addr; //char timeZoneOffset = getTimeZoneFromAddr ( addr ); // in case getSections() blocks, come right back in xd->setCallback ( st , xdcallback ); // . set niceness to 1 so all this processing doesn't slow queries down // . however, g_niceness should still be zero... hmmm... xd->m_niceness = 1; // default to 1 niceness st->m_niceness = 1; // now set the sections class Sections *ss = xd->getSections(); // now for each section with alnum text, telescope up as far as // possible without containing any more alnum text than what it // contained. set SEC_CONTROL bit. such sections will have the // 2 green/blue dots, that are used for turning on/off title/desc. // but really the indians will only turn off sections that should // not have a title/desc. for ( Section *si = ss->m_rootSection ; si ; si = si->m_next ) { // breathe QUICKPOLL(st->m_niceness); // skip if does not have text if ( si->m_firstWordPos < 0 ) continue; // otherwise, find biggest parent that contains just that text Section *p = si->m_parent; Section *last = si; for ( ; p ; p = p->m_parent ) { if ( p->m_firstWordPos != si->m_firstWordPos ) break; if ( p->m_lastWordPos != si->m_lastWordPos ) break; last = p; } // set that bit then last->m_flags |= SEC_CONTROL; // and speed up the loop si = last; } // * now each SEC_CONTROL section has a fence activated by a turker // * an event title or description cannot span a fence. it must be // confined within a fence. however, it is allowed to include // title or description from a "title section". // * hold shift down to designate as title section when clicking it // * show the raw text of each event changing as you fence // sections in or out. show in a right frame. // * show list of events on page in the top frame. can toggle them // all individually. // * and remove no-display from all tags so we can see everything. // * highlight addresses, not just dates. // * each section hash has its own unique bg color when activated // * with a single click, completely reject an event because: // contains bad time, address, title or desc. specify which so // we can improve our algo. // * when selecting an individual event, scroll to its tod... // * remove all color from webpage that we can so our colors show up // * remove all imgs. just src them to dev null. // * allow for entering a custom title for an event or all events // that are or will ever appear on the page. // * when displaying the text of the events, use hyphens to // delineate the section topology. strike out text as a section // fence is activated. // * when a section is activated is it easier to just redownload // the whole text of the page? maybe just the text frame? // * clicking on an individual sentence section should just remove // that sentence. that is kinda a special content hash removal // tag. like "Click here for video." // * when an event id is selected i guess activate its bgcolor to // be light blue for all sentences currently in the event that // are not in activated sections. (make exception for designated // title sections). so we need multiple tags for each event's // sentence div section. if sentence is split use multiple div tags // then to keep the order. 
so each event sentence would have // <div ev1=1 ev2=1 ev10=1>...</div> if it is in event ids 1,2 and // 10. that way we can activate it when one of those event ids is // activated. SafeBuf sb; // shortcuts if ( ! xd->m_wordsValid ) { char *xx=NULL;*xx=0; } Words *words = &xd->m_words; int32_t nw = words->getNumWords(); char **wptrs = words->getWords(); int32_t *wlens = words->getWordLens(); nodeid_t *tids = words->getTagIds(); // a special array for printing </div> tags char *endCounts = (char *)mcalloc ( nw ,"endcounts"); if ( ! endCounts ) return sendErrorReply ( st , g_errno ); // // now loop over all the words. if word starts a section that has the // SEC_CONTROL bit set, print out the section hash and a color // tag to be activated if the turkey activates us. // CAUTION: word may start multiple sections. // for ( int32_t i = 0 ; i < nw ; i++ ) { // get section ptr Section *sj = ss->m_sectionPtrs[i]; // sanity check. sj must be first section ptr that starts @ i if ( sj && sj->m_a==i && sj->m_prev && sj->m_prev->m_a==i ) { char *xx=NULL;*xx=0; } // . does word #i start a section? // . if section is control, print out the control while ( sj && sj->m_a == i ) { // print this section's hash if ( sj->m_flags & SEC_CONTROL) { // after the turkeys have made all the edits // they need to submit the changes they made. // how can we get that data sent back to the // back end? we need to send back the colors // of the sections that have been activated // i guess. just do a loop over them. sb.safePrintf("<div nobreak gbsecid=%"UINT32" " "bgcolor=#%"XINT32" " "onclick=gbtogglecolor()>", (uint32_t)sj->m_tagHash, (uint32_t)sj->m_tagHash); // sanity check if ( sj->m_b < 0 ) { char *xx=NULL;*xx=0; } if ( sj->m_b > nw ) { char *xx=NULL;*xx=0; } // and inc the /div count for that word endCounts[sj->m_b-1]++; } // try next section too sj = sj->m_next; } // if this is a tag, remove any coloring if ( tids[i] ) { } // print the word, be it a tag, alnum, punct sb.safeMemcpy ( wptrs[i] , wlens[i] ); // end a div tag? if ( ! endCounts[i] ) continue; // might be many so loop it for ( int32_t j = 0 ; j < endCounts[i] ; j++ ) sb.safePrintf("</div>"); } return false; }
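// Toy sketch (not gb code) of the endCounts technique in sendTurkPageReply()
// above: several <div> sections can close after the same word, so one counter
// per word records how many </div> tags to print after emitting that word.
#include <cstdio>
#include <cstring>

int main ( ) {
	// four "words": a tag, two alnum words and the punct between them
	const char *words[4] = { "<div x=1>" , "Hello" , " " , "world" };
	char endCounts[4];
	memset ( endCounts , 0 , 4 );
	// pretend two nested control sections both end after word #3
	endCounts[3] = 2;
	for ( int i = 0 ; i < 4 ; i++ ) {
		printf ( "%s" , words[i] );
		for ( int j = 0 ; j < endCounts[i] ; j++ )
			printf ( "</div>" );
	}
	// prints: <div x=1>Hello world</div></div>
	printf ( "\n" );
	return 0;
}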
char *Statsdb::plotGraph ( char *pstart , char *pend , long graphHash , //GIFPlotter *plotter , SafeBuf &gw , long zoff ) { // . use "graphHash" to map to unit display // . this is a disk read volume Label *label = getLabel ( graphHash ); if ( ! label ) { char *xx=NULL;*xx=0; } log("stats: plotting %s",label->m_keyDesc) ; // let's first scan m_sb1 to normalize the y values bool needMin = true; bool needMax = true; float ymin = 0.0; float ymax = 0.0; char *p = pstart; for ( ; p < pend ; p += 12 ) { // breathe QUICKPOLL ( m_niceness ); // get the y float y2 = *(float *)(p+4); // get color of this point long gh = *(long *)(p +8); // stop if not us if ( gh != graphHash ) continue; // put into scaled space right away y2 = y2 * label->m_yscalar; // . limit y to absolute max // . these units should be scaled as well! if ( y2 > label->m_absYMax && label->m_absYMax > 0.0 ) y2 = label->m_absYMax; // get min and max if ( y2 < ymin || needMin ) ymin = y2; if ( y2 > ymax || needMax ) ymax = y2; needMin = false; needMax = false; } // force to zero for now ymin = 0.0; // . and force to ymax for now as well // . -1 indicates dynamic though! if ( label->m_absYMax > 0.0 ) ymax = label->m_absYMax; // add a 20% ceiling else ymax *= 1.20; // return that! char *retp = p; // set the line width //plotter->linewidth ( 1 ); long color = label->m_color; // use the color specified from addStat_r() for this line/pt //plotter->pencolor ( ((color >> 16) & 0xff) << 8 , // ((color >> 8) & 0xff) << 8 , // ((color >> 0) & 0xff) << 8 ); // . the minimum difference between ymax and ymin is minDiff. // . this prevents us from zooming in too close! float minDiff = (float)DY * label->m_minRes ; // we are already scaled! float ourDiff = (ymax - ymin) ; // . pad y range if total range is small // . only do this for certain types of stats, like qps and disk i/o if ( ourDiff < minDiff ) { float pad = (minDiff - ourDiff) / 2; // pad it out ymin -= pad ; ymax += pad ; // fix it some if ( ymin < 0 ) { ymax += -1*ymin; ymin = 0; } // limit again just in case if ( ymax > label->m_absYMax && label->m_absYMax > 0.0 ) ymax = label->m_absYMax; } // set the line width //plotter->linewidth ( 2 ); // reset for 2nd scan p = pstart; long lastx = -1; float lasty ; bool firstPoint = true; // now the m_sb1 buffer consists of points to make lines with for ( ; p < pend ; ) { // breathe QUICKPOLL ( m_niceness ); // first is x pixel pos long x2 = *(long *)p; p += 4; // then y pos float y2 = *(float *)p; p += 4; // scale it right away y2 *= label->m_yscalar; // adjust if ( y2 > ymax ) y2 = ymax; // then graphHash long gh = *(long *)p; p += 4; // skip if wrong graph if ( gh != graphHash ) continue; // set first point for making the line long x1 = lastx; float y1 = lasty; // normalize y into pixel space y2 = ((float)DY * (y2 - ymin)) / (ymax-ymin); // set lasts for next iteration of this loop lastx = x2; lasty = y2; // . flip the y so we don't have to scroll the browser down // . DY does not include the axis and tick marks // . do not flip y any more for statsdb graphs long fy1 = (long)(y1+.5);// + m_by ; long fy2 = (long)(y2+.5);// + m_by ; // how are we getting -.469 for "query" point? 
if ( fy1 < 0 ) continue; if ( fy2 < 0 ) continue; // skip if can't make a line if ( firstPoint ) { //plotter->circle ( x2 , fy2 , 2 ); long width = POINTWIDTH; // draw a 4x4 box now: drawLine3(m_gw,x2-width/2,x2+width/2,fy2,color,width); firstPoint = false; continue; } // log it //logf(LOG_DEBUG,"plot: (%li,%.02f) - (%li,%.02f) [%s]", // x1 , y1 , x2 , y2 , label->m_label ); // ensure at least 3 units wide for visibility //if ( x2 < x1 + 10 ) x2 = x1 + 10; // plot it // BUT only iff not more than 5 seconds difference //float secondsPerPixel = (m_t2-m_t1)/(float)DX; // avoid this for now. mdw oct 14 2013. //float dt = (x2 - x1) * secondsPerPixel; //if ( dt <= 13 || x2 - x1 <= 10 ) // plotter->line ( x1 , fy1 , x2 , fy2 ); // circle second point //plotter->circle ( x1 , fy1 , 2 ); //plotter->circle ( x2 , fy2 , 2 ); // draw a 4x4 boxes now: long width = POINTWIDTH; drawLine3 ( m_gw,x1-width/2, x1+width/2, fy1,color, width); drawLine3 ( m_gw,x2-width/2, x2+width/2, fy2,color, width); } //plotter->linewidth ( 1 ); // plot unit lines float deltaz = (ymax-ymin) / 6; if ( strstr(label->m_keyDesc,"latency" ) ) { // draw it drawHR ( 400.0 - 111.0 , ymin,ymax,m_gw,label,zoff,0xff0000); drawHR ( 600.0-111.0,ymin,ymax,m_gw,label,zoff,color); } if ( strstr(label->m_keyDesc,"queries per sec" ) ) { // draw it //deltaz /= 2; //drawHR(120.0, ymin , ymax , plotter , label , zoff , color ); //drawHR(130.0, ymin , ymax , plotter , label , zoff , color ); drawHR ( 140.0 , ymin , ymax ,m_gw , label , zoff , color ); } for ( float z = ymin ; z < ymax ; z += deltaz ) { // breathe QUICKPOLL ( m_niceness ); // draw it drawHR ( z , ymin , ymax , m_gw , label , zoff , color ); } return retp; //#endif }
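// Standalone sketch (illustrative names, not the gb globals) of the y-axis
// normalization plotGraph() performs above: a sample is scaled, clamped to
// [ymin,ymax], then mapped into pixel space with DY*(y-ymin)/(ymax-ymin).
#include <cstdio>

static float normalizeY ( float y , float ymin , float ymax , float DY ) {
	// clamp into the displayed range first
	if ( y > ymax ) y = ymax;
	if ( y < ymin ) y = ymin;
	// linear map into [0,DY] pixel space
	return DY * ( y - ymin ) / ( ymax - ymin );
}

int main ( ) {
	// with ymin=0, ymax=200 and a 100px tall graph, 150 lands at 75px
	printf ( "%.1f\n" , normalizeY ( 150.0f , 0.0f , 200.0f , 100.0f ) );
	return 0;
}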
void gotTitleList ( void *state , RdbList *list , Msg5 *msg5 ) { State22 *st = (State22 *)state; // if niceness is 0, use the higher priority udpServer UdpServer *us = &g_udpServer; // shortcut Msg22Request *r = st->m_r; // breathe QUICKPOLL(r->m_niceness); // send error reply on error if ( g_errno ) { hadError: log("db: Had error getting title record from titledb: %s.", mstrerror(g_errno)); if ( ! g_errno ) { char *xx=NULL;*xx=0; } us->sendErrorReply ( st->m_slot , g_errno ); mdelete ( st , sizeof(State22) , "Msg22" ); delete ( st ); return ; } // convenience var RdbList *tlist = &st->m_tlist; // set probable docid long long pd = 0LL; if ( r->m_url[0] ) { pd = g_titledb.getProbableDocId(r->m_url); if ( pd != st->m_pd ) { log("db: crap probable docids do not match! u=%s", r->m_url); g_errno = EBADENGINEER; goto hadError; } // sanity //if ( pd != st->m_pd ) { char *xx=NULL;*xx=0; } } // the probable docid is the PREFERRED docid in this case if ( r->m_getAvailDocIdOnly ) pd = st->m_r->m_docId; // . these are both meant to be available docids // . if ad2 gets exhausted we use ad1 long long ad1 = st->m_docId1; long long ad2 = pd; bool docIdWasFound = false; // scan the titleRecs in the list for ( ; ! tlist->isExhausted() ; tlist->skipCurrentRecord ( ) ) { // breathe QUICKPOLL ( r->m_niceness ); // get the rec char *rec = tlist->getCurrentRec(); long recSize = tlist->getCurrentRecSize(); // get that key key_t *k = (key_t *)rec; // skip negative recs, first one should not be negative however if ( ( k->n0 & 0x01 ) == 0x00 ) continue; // get docid of that titlerec long long dd = g_titledb.getDocId(k); if ( r->m_getAvailDocIdOnly ) { // make sure our available docids are available! if ( dd == ad1 ) ad1++; if ( dd == ad2 ) ad2++; continue; } // if we had a url make sure uh48 matches else if ( r->m_url[0] ) { // get it long long uh48 = g_titledb.getUrlHash48(k); // sanity check if ( st->m_uh48 == 0 ) { char *xx=NULL;*xx=0; } // make sure our available docids are available! if ( dd == ad1 ) ad1++; if ( dd == ad2 ) ad2++; // we must match this exactly if ( uh48 != st->m_uh48 ) continue; } // otherwise, check docid else { // compare that if ( r->m_docId != dd ) continue; } // flag that we matched m_docId docIdWasFound = true; // do not set back titlerec if just want avail docid //if ( r->m_getAvailDocIdOnly ) continue; // ok, if just "checking tfndb" no need to go further if ( r->m_justCheckTfndb ) { // send back a good reply (empty means found!) us->sendReply_ass ( NULL,0,NULL,0,st->m_slot); // don't forget to free the state mdelete ( st , sizeof(State22) , "Msg22" ); delete ( st ); return; } // use rec as reply char *reply = rec; // . send this rec back, it's a match // . if only one rec in list, steal the list's memory if ( recSize != tlist->getAllocSize() ) { // otherwise, alloc space for the reply reply = (char *)mmalloc (recSize, "Msg22"); if ( ! reply ) goto hadError; memcpy ( reply , rec , recSize ); } // otherwise we send back the whole list! else { // we stole this from list tlist->m_ownData = false; } // off ya go us->sendReply_ass(reply,recSize,reply,recSize,st->m_slot); // don't forget to free the state mdelete ( st , sizeof(State22) , "Msg22" ); delete ( st ); // all done return; } // maybe no available docid if we breached our range if ( ad1 >= pd ) ad1 = 0LL; if ( ad2 > st->m_docId2 ) ad2 = 0LL; // get best long long ad = ad2; // but wrap around if we need to if ( ad == 0LL ) ad = ad1; // if "docId" was unmatched that should be the preferred available // docid then... //if(! 
docIdWasFound && r->m_getAvailDocIdOnly && ad != r->m_docId ) { // char *xx=NULL;*xx=0; } // remember it. this might be zero if none exist! st->m_availDocId = ad; // note it if ( ad == 0LL && (r->m_getAvailDocIdOnly || r->m_url[0]) ) log("msg22: avail docid is 0 for pd=%lli!",pd); // . ok, return an available docid if ( r->m_url[0] || r->m_justCheckTfndb || r->m_getAvailDocIdOnly ) { // store docid in reply char *p = st->m_slot->m_tmpBuf; // send back the available docid *(long long *)p = st->m_availDocId; // send it us->sendReply_ass ( p , 8 , p , 8 , st->m_slot ); // don't forget to free state mdelete ( st , sizeof(State22) , "Msg22" ); delete ( st ); return; } // not found! and it was a docid based request... log("msg22: could not find title rec for docid %llu",r->m_docId); g_errno = ENOTFOUND; goto hadError; }
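// Sketch (plain arrays stand in for the RdbList scan; not gb code) of the
// available-docid probe in gotTitleList() above: a candidate docid slides up
// past every docid already present in the ascending titlerec list, and a
// candidate that breaches its range becomes 0, meaning "none available".
// The real code runs two candidates (ad1 and ad2) in one pass.
#include <cstdio>

static long long findAvailDocId ( const long long *taken , int n ,
                                  long long start , long long maxDocId ) {
	long long ad = start;
	// the scan visits docids in ascending order
	for ( int i = 0 ; i < n ; i++ )
		if ( taken[i] == ad ) ad++;
	// breached our range? then no docid is available here
	if ( ad > maxDocId ) ad = 0LL;
	return ad;
}

int main ( ) {
	long long taken[3] = { 100LL , 101LL , 103LL };
	// 100 and 101 are used, so the first free docid at/after 100 is 102
	printf ( "%lld\n" , findAvailDocId ( taken , 3 , 100LL , 1000LL ) );
	return 0;
}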
// . returns false if blocked, true otherwise // . sets g_errno on error // . called either from // 1) doDocIdSplitLoop // 2) or getDocIds2() if only 1 docidsplit bool Msg39::getLists () { if ( m_debug ) m_startTime = gettimeofdayInMilliseconds(); // . ask Indexdb for the IndexLists we need for these termIds // . each rec in an IndexList is a termId/score/docId tuple // // restrict to docid range? // // . get the docid start and end // . do docid partitioning so we can send to all hosts // in the network, not just one stripe int64_t docIdStart = 0; int64_t docIdEnd = MAX_DOCID; // . restrict to this docid? // . will really make gbdocid:| searches much faster! int64_t dr = m_tmpq.m_docIdRestriction; if ( dr ) { docIdStart = dr; docIdEnd = dr + 1; } // . override // . this is set from Msg39::doDocIdSplitLoop() to compute // search results in stages, so that we do not load massive // termlists into memory and get OOM (out of memory) if ( m_r->m_minDocId != -1 ) docIdStart = m_r->m_minDocId; if ( m_r->m_maxDocId != -1 ) docIdEnd = m_r->m_maxDocId+1; // if we have twins, then make sure the twins read different // pieces of the same docid range to make things 2x faster //bool useTwins = false; //if ( g_hostdb.getNumStripes() == 2 ) useTwins = true; //if ( useTwins ) { // int64_t delta2 = ( docIdEnd - docIdStart ) / 2; // if ( m_r->m_stripe == 0 ) docIdEnd = docIdStart + delta2; // else docIdStart = docIdStart + delta2; //} // new striping logic: int32_t numStripes = g_hostdb.getNumStripes(); int64_t delta2 = ( docIdEnd - docIdStart ) / numStripes; int32_t stripe = g_hostdb.getMyHost()->m_stripe; docIdStart += delta2 * stripe; // is this right? docIdEnd = docIdStart + delta2; // add 1 to be safe so we don't lose a docid docIdEnd++; // TODO: add triplet support later for this to split the // read 3 ways. 4 ways for quads, etc. //if ( g_hostdb.getNumStripes() >= 3 ) { char *xx=NULL;*xx=0;} // do not go over MAX_DOCID because it gets masked and // ends up being 0!!! 
and we get empty lists if ( docIdEnd > MAX_DOCID ) docIdEnd = MAX_DOCID; // remember so Msg2.cpp can use them to restrict the termlists // from "whiteList" as well m_docIdStart = docIdStart; m_docIdEnd = docIdEnd; // // set startkey/endkey for each term/termlist // for ( int32_t i = 0 ; i < m_tmpq.getNumTerms() ; i++ ) { // breathe QUICKPOLL ( m_r->m_niceness ); // shortcuts QueryTerm *qterm = &m_tmpq.m_qterms[i]; char *sk = qterm->m_startKey; char *ek = qterm->m_endKey; // get the term id int64_t tid = m_tmpq.getTermId(i); // if only 1 stripe //if ( g_hostdb.getNumStripes() == 1 ) { // docIdStart = 0; // docIdEnd = MAX_DOCID; //} // debug if ( m_debug ) log("query: setting sk/ek for docids %"INT64"" " to %"INT64" for termid=%"INT64"" , docIdStart , docIdEnd , tid ); // store now in qterm g_posdb.makeStartKey ( sk , tid , docIdStart ); g_posdb.makeEndKey ( ek , tid , docIdEnd ); qterm->m_ks = sizeof(POSDBKEY);//key144_t); } // debug msg if ( m_debug || g_conf.m_logDebugQuery ) { for ( int32_t i = 0 ; i < m_tmpq.getNumTerms() ; i++ ) { // get the term in utf8 //char bb[256]; QueryTerm *qt = &m_tmpq.m_qterms[i]; //utf16ToUtf8(bb, 256, qt->m_term, qt->m_termLen); char *tpc = qt->m_term + qt->m_termLen; char tmp = *tpc; *tpc = '\0'; char sign = qt->m_termSign; if ( sign == 0 ) sign = '0'; QueryWord *qw = qt->m_qword; int32_t wikiPhrId = qw->m_wikiPhraseId; if ( m_tmpq.isPhrase(i) ) wikiPhrId = 0; char leftwikibigram = 0; char rightwikibigram = 0; if ( qt->m_leftPhraseTerm && qt->m_leftPhraseTerm->m_isWikiHalfStopBigram ) leftwikibigram = 1; if ( qt->m_rightPhraseTerm && qt->m_rightPhraseTerm->m_isWikiHalfStopBigram ) rightwikibigram = 1; /* char c = m_tmpq.getTermSign(i); char tt[512]; int32_t ttlen = m_tmpq.getTermLen(i); if ( ttlen > 254 ) ttlen = 254; if ( ttlen < 0 ) ttlen = 0; // old:painful: convert each term from unicode to ascii gbmemcpy ( tt , m_tmpq.getTerm(i) , ttlen ); */ int32_t isSynonym = 0; QueryTerm *st = qt->m_synonymOf; if ( st ) isSynonym = true; SafeBuf sb; // now we can display it //tt[ttlen]='\0'; //if ( c == '\0' ) c = ' '; sb.safePrintf( "query: msg39: [%"PTRFMT"] " "query term #%"INT32" \"%s\" " "phr=%"INT32" termId=%"UINT64" rawTermId=%"UINT64" " //"estimatedTermFreq=%"INT64" (+/- ~16000) " "tfweight=%.02f " "sign=%c " "numPlusses=%hhu " "required=%"INT32" " "fieldcode=%"INT32" " "ebit=0x%0"XINT64" " "impBits=0x%0"XINT64" " "wikiphrid=%"INT32" " "leftwikibigram=%"INT32" " "rightwikibigram=%"INT32" " //"range.startTermNum=%hhi range.endTermNum=%hhi " //"minRecSizes=%"INT32" " "readSizeInBytes=%"INT32" " //"ebit=0x%"XINT64" " //"impBits=0x%"XINT64" " "hc=%"INT32" " "component=%"INT32" " "otermLen=%"INT32" " "isSynonym=%"INT32" " "querylangid=%"INT32" " , (PTRTYPE)this , i , qt->m_term,//bb , (int32_t)m_tmpq.isPhrase (i) , m_tmpq.getTermId (i) , m_tmpq.getRawTermId (i) , ((float *)m_r->ptr_termFreqWeights)[i] , sign , //c , 0 , (int32_t)qt->m_isRequired, (int32_t)qt->m_fieldCode, (int64_t)qt->m_explicitBit , (int64_t)qt->m_implicitBits , wikiPhrId, (int32_t)leftwikibigram, (int32_t)rightwikibigram, ((int32_t *)m_r->ptr_readSizes)[i] , //(int64_t)m_tmpq.m_qterms[i].m_explicitBit , //(int64_t)m_tmpq.m_qterms[i].m_implicitBits , (int32_t)m_tmpq.m_qterms[i].m_hardCount , (int32_t)m_tmpq.m_componentCodes[i], (int32_t)m_tmpq.getTermLen(i) , isSynonym, (int32_t)m_tmpq.m_langId ); // ,tt // put it back *tpc = tmp; if ( st ) { int32_t stnum = st - m_tmpq.m_qterms; sb.safePrintf("synofterm#=%"INT32"",stnum); //sb.safeMemcpy(st->m_term,st->m_termLen); sb.pushChar(' '); 
sb.safePrintf("synwid0=%"INT64" ",qt->m_synWids0); sb.safePrintf("synwid1=%"INT64" ",qt->m_synWids1); sb.safePrintf("synalnumwords=%"INT32" ", qt->m_numAlnumWordsInSynonym); // like for synonym "nj" it's base, // "new jersey" has 2 alnum words! sb.safePrintf("synbasealnumwords=%"INT32" ", qt->m_numAlnumWordsInBase); } logf(LOG_DEBUG,"%s",sb.getBufStart()); } m_tmpq.printBooleanTree(); } // timestamp log if ( m_debug ) log(LOG_DEBUG,"query: msg39: [%"PTRFMT"] " "Getting %"INT32" index lists ", (PTRTYPE)this,m_tmpq.getNumTerms()); // . now get the index lists themselves // . return if it blocked // . not doing a merge (last parm) means that the lists we receive // will be an appending of a bunch of lists so keys won't be in order // . merging is uneccessary for us here because we hash the keys anyway // . and merging takes up valuable cpu time // . caution: the index lists returned from Msg2 are now compressed // . now i'm merging because it's 10 times faster than hashing anyway // and the reply buf should now always be <= minRecSizes so we can // pre-allocate one better, and, 3) this should fix the yahoo.com // reindex bug char rdbId = RDB_POSDB; // . TODO: MDW: fix // . partap says there is a bug in this??? we can't cache UOR'ed lists? bool checkCache = false; // split is us???? //int32_t split = g_hostdb.m_myHost->m_group; int32_t split = g_hostdb.m_myHost->m_shardNum; // call msg2 if ( ! m_msg2.getLists ( rdbId , m_r->m_collnum,//m_r->ptr_coll , m_r->m_maxAge , m_r->m_addToCache , //m_tmpq.m_qterms , &m_tmpq, m_r->ptr_whiteList, // we need to restrict docid range for // whitelist as well! this is from // doDocIdSplitLoop() m_docIdStart, m_docIdEnd, // how much of each termlist to read in bytes (int32_t *)m_r->ptr_readSizes , //m_tmpq.getNumTerms() , // numLists // 1-1 with query terms m_lists , this , controlLoopWrapper,//gotListsWrapper , m_r , m_r->m_niceness , true , // do merge? m_debug , NULL , // best hostids m_r->m_restrictPosdbForQuery , split , checkCache )) { m_blocked = true; return false; } // error? //if ( g_errno ) { // log("msg39: Had error getting termlists2: %s.", // mstrerror(g_errno)); // // don't bail out here because we are in docIdSplitLoop() // //sendReply (m_slot,this,NULL,0,0,true); // return true; //} //return gotLists ( true ); return true; }
// . reply to a request for an RdbList // . MUST call g_udpServer::sendReply or sendErrorReply() so slot can // be destroyed void handleRequest0 ( UdpSlot *slot , int32_t netnice ) { logTrace( g_conf.m_logTraceMsg0, "BEGIN. Got request for an RdbList" ); // if niceness is 0, use the higher priority udpServer UdpServer *us = &g_udpServer; //if ( netnice == 0 ) us = &g_udpServer2; // get the request char *request = slot->m_readBuf; int32_t requestSize = slot->m_readBufSize; // collection is now stored in the request, so i commented this out //if ( requestSize != MSG0_REQ_SIZE ) { // log("net: Received bad data request size of %" PRId32" bytes. " // "Should be %" PRId32".", requestSize ,(int32_t)MSG0_REQ_SIZE); // us->sendErrorReply ( slot , EBADREQUESTSIZE ); // return; //} // parse the request char *p = request; int64_t syncPoint = *(int64_t *)p ; p += 8; //key_t startKey = *(key_t *)p ; p += sizeof(key_t); //key_t endKey = *(key_t *)p ; p += sizeof(key_t); int32_t minRecSizes = *(int32_t *)p ; p += 4; int32_t startFileNum = *(int32_t *)p ; p += 4; int32_t numFiles = *(int32_t *)p ; p += 4; int32_t maxCacheAge = *(int32_t *)p ; p += 4; char rdbId = *p++; char addToCache = *p++; char doErrorCorrection = *p++; char includeTree = *p++; // this was messing up our niceness conversion logic int32_t niceness = slot->m_niceness;//(int32_t)(*p++); // still need to skip it though! p++; bool allowPageCache = (bool)(*p++); char ks = getKeySizeFromRdbId ( rdbId ); char *startKey = p; p+=ks; char *endKey = p; p+=ks; collnum_t collnum = *(collnum_t *)p; p += sizeof(collnum_t); CollectionRec *xcr = g_collectiondb.getRec ( collnum ); if ( ! xcr ) g_errno = ENOCOLLREC; if( g_conf.m_logTraceMsg0 ) { logTrace( g_conf.m_logTraceMsg0, "rdbId....... %d", (int)rdbId ); logTrace( g_conf.m_logTraceMsg0, "key size.... %d", (int)ks ); logTrace( g_conf.m_logTraceMsg0, "startFileNum %" PRId32, startFileNum ); logTrace( g_conf.m_logTraceMsg0, "numFiles.... %" PRId32, numFiles ); } // error set from XmlDoc::cacheTermLists()? if ( g_errno ) { logTrace( g_conf.m_logTraceMsg0, "END. Invalid collection" ); log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__); us->sendErrorReply ( slot , g_errno ); return; } // . get the rdb we need to get the RdbList from // . returns NULL and sets g_errno on error //Msg0 msg0; //Rdb *rdb = msg0.getRdb ( rdbId ); Rdb *rdb = getRdbFromId ( rdbId ); if ( ! rdb ) { logTrace( g_conf.m_logTraceMsg0, "END. Invalid rdbId" ); log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__); us->sendErrorReply ( slot , EBADRDBID ); return; } // keep track of stats rdb->readRequestGet ( requestSize ); // . do a local get // . create a msg5 to get the list State00 *st0 ; try { st0 = new (State00); } catch ( ... ) { g_errno = ENOMEM; log("Msg0: new(%" PRId32"): %s", (int32_t)sizeof(State00),mstrerror(g_errno)); log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__); us->sendErrorReply ( slot , g_errno ); return; } mnew ( st0 , sizeof(State00) , "State00" ); // timing debug if ( g_conf.m_logTimingNet ) st0->m_startTime = gettimeofdayInMilliseconds(); // save slot in state st0->m_slot = slot; // save udp server to send back reply on st0->m_us = us; // init this one st0->m_niceness = niceness; st0->m_rdbId = rdbId; QUICKPOLL(niceness); // debug msg if ( maxCacheAge != 0 && ! addToCache ) { log( LOG_LOGIC, "net: msg0: check but don't add... rdbid=%" PRId32".", ( int32_t ) rdbId ); } // . 
if this request came over on the high priority udp server // make sure the priority gets passed along // . return if this blocks // . we'll call sendReply later if ( ! st0->m_msg5.getList ( rdbId , collnum , &st0->m_list , startKey , endKey , minRecSizes , includeTree , // include tree? addToCache , // addToCache? maxCacheAge , startFileNum , numFiles , st0 , gotListWrapper , niceness , doErrorCorrection , NULL , // cacheKeyPtr 0 , // retryNum 2 , // maxRetries true , // compensateForMerge syncPoint , false, allowPageCache ) ) { logTrace( g_conf.m_logTraceMsg0, "END. m_msg5.getList returned false" ); return; } // call wrapper ourselves logTrace( g_conf.m_logTraceMsg0, "Calling gotListWrapper" ); gotListWrapper ( st0 , NULL , NULL ); logTrace( g_conf.m_logTraceMsg0, "END" ); }
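// Documentation sketch of the Msg0 request layout that handleRequest0()
// parses above, reconstructed from its pointer walk (this is not a gb header
// file, and expectedMsg0ReqSize() is a hypothetical helper): ks is the key
// size implied by rdbId.
//
//   offset   size               field
//   0        8                  syncPoint
//   8        4                  minRecSizes
//   12       4                  startFileNum
//   16       4                  numFiles
//   20       4                  maxCacheAge
//   24       1                  rdbId
//   25       1                  addToCache
//   26       1                  doErrorCorrection
//   27       1                  includeTree
//   28       1                  (unused niceness byte, still skipped)
//   29       1                  allowPageCache
//   30       ks                 startKey
//   30+ks    ks                 endKey
//   30+2*ks  sizeof(collnum_t)  collnum
#include <cstdio>

static int expectedMsg0ReqSize ( int ks , int collnumSize ) {
	// 24-byte fixed header, 6 one-byte flags, two keys, then collnum
	return 24 + 6 + 2 * ks + collnumSize;
}

int main ( ) {
	// e.g. with 18-byte posdb keys and a 2-byte collnum_t -> 68 bytes
	printf ( "%d\n" , expectedMsg0ReqSize ( 18 , 2 ) );
	return 0;
}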
void handleRequest12 ( UdpSlot *udpSlot , int32_t niceness ) { // get request char *request = udpSlot->m_readBuf; int32_t reqSize = udpSlot->m_readBufSize; // shortcut UdpServer *us = &g_udpServer; // breathe QUICKPOLL ( niceness ); // shortcut char *reply = udpSlot->m_tmpBuf; // // . is it confirming that he got all the locks? // . if so, remove the doledb record and dock the doleiptable count // before adding a waiting tree entry to re-pop the doledb record // if ( reqSize == sizeof(ConfirmRequest) ) { char *msg = NULL; ConfirmRequest *cq = (ConfirmRequest *)request; // confirm the lock HashTableX *ht = &g_spiderLoop.m_lockTable; int32_t slot = ht->getSlot ( &cq->m_lockKeyUh48 ); if ( slot < 0 ) { log("spider: got a confirm request for a key not " "in the table! coll must have been deleted " " or reset " "while lock request was outstanding."); g_errno = EBADENGINEER; log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__); us->sendErrorReply ( udpSlot , g_errno ); return; //char *xx=NULL;*xx=0; } } UrlLock *lock = (UrlLock *)ht->getValueFromSlot ( slot ); lock->m_confirmed = true; // note that if ( g_conf.m_logDebugSpider ) // Wait ) log("spider: got confirm lock request for ip=%s", iptoa(lock->m_firstIp)); // get it SpiderColl *sc = g_spiderCache.getSpiderColl(cq->m_collnum); // make it negative cq->m_doledbKey.n0 &= 0xfffffffffffffffeLL; // and add the negative rec to doledb (deletion operation) Rdb *rdb = &g_doledb.m_rdb; if ( ! rdb->addRecord ( cq->m_collnum, (char *)&cq->m_doledbKey, NULL , // data 0 , //dataSize 1 )){ // niceness // tree is dumping or something, probably ETRYAGAIN if ( g_errno != ETRYAGAIN ) {msg = "error adding neg rec to doledb"; log("spider: %s %s",msg,mstrerror(g_errno)); } //char *xx=NULL;*xx=0; log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__); us->sendErrorReply ( udpSlot , g_errno ); return; } // now remove from doleiptable since we removed from doledb if ( sc ) sc->removeFromDoledbTable ( cq->m_firstIp ); // how many spiders outstanding for this coll and IP? //int32_t out=g_spiderLoop.getNumSpidersOutPerIp ( cq->m_firstIp); // DO NOT add back to waiting tree if max spiders // out per ip was 1 OR there was a crawldelay. but better // yet, take care of that in the winReq code above. // . now add to waiting tree so we add another spiderdb // record for this firstip to doledb // . true = callForScan // . do not add to waiting tree if we have enough outstanding // spiders for this ip. we will add to waiting tree when // we receive a SpiderReply in addSpiderReply() if ( sc && //out < cq->m_maxSpidersOutPerIp && // this will just return true if we are not the // responsible host for this firstip // DO NOT populate from this!!! say "false" here... ! sc->addToWaitingTree ( 0 , cq->m_firstIp, false ) && // must be an error... g_errno ) { msg = "FAILED TO ADD TO WAITING TREE"; log("spider: %s %s",msg,mstrerror(g_errno)); log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__); us->sendErrorReply ( udpSlot , g_errno ); return; } // success!! reply[0] = 1; us->sendReply_ass ( reply , 1 , reply , 1 , udpSlot ); return; } // sanity check if ( reqSize != sizeof(LockRequest) ) { log("spider: bad msg12 request size of %" PRId32,reqSize); log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__); us->sendErrorReply ( udpSlot , EBADREQUEST ); return; } // deny it if we are not synced yet! otherwise we core in // getTimeGlobal() below if ( ! 
isClockInSync() ) { // log it so we can debug it //log("spider: clock not in sync with host #0. so " // "returning etryagain for lock reply"); // let admin know why we are not spidering log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__); us->sendErrorReply ( udpSlot , ETRYAGAIN ); return; } LockRequest *lr = (LockRequest *)request; //uint64_t lockKey = *(int64_t *)request; //int32_t lockSequence = *(int32_t *)(request+8); // is this a remove operation? assume not //bool remove = false; // get top bit //if ( lockKey & 0x8000000000000000LL ) remove = true; // mask it out //lockKey &= 0x7fffffffffffffffLL; // sanity check, just 6 bytes! (48 bits) if ( lr->m_lockKeyUh48 &0xffff000000000000LL ) { char *xx=NULL;*xx=0; } // note it if ( g_conf.m_logDebugSpider ) log("spider: got msg12 request uh48=%" PRId64" remove=%" PRId32, lr->m_lockKeyUh48, (int32_t)lr->m_removeLock); // get time int32_t nowGlobal = getTimeGlobal(); // shortcut HashTableX *ht = &g_spiderLoop.m_lockTable; int32_t hostId = g_hostdb.getHostId ( udpSlot->m_ip , udpSlot->m_port ); // this must be legit - sanity check if ( hostId < 0 ) { char *xx=NULL;*xx=0; } // remove expired locks from locktable removeExpiredLocks ( hostId ); int64_t lockKey = lr->m_lockKeyUh48; // check tree int32_t slot = ht->getSlot ( &lockKey ); // lr->m_lockKeyUh48 ); // put it here UrlLock *lock = NULL; // if there say no no if ( slot >= 0 ) lock = (UrlLock *)ht->getValueFromSlot ( slot ); // if doing a remove operation and that was our hostid then unlock it if ( lr->m_removeLock && lock && lock->m_hostId == hostId && lock->m_lockSequence == lr->m_lockSequence ) { // note it for now if ( g_conf.m_logDebugSpider ) log("spider: removing lock for lockkey=%" PRIu64" hid=%" PRId32, lr->m_lockKeyUh48,hostId); // unlock it ht->removeSlot ( slot ); // it is gone lock = NULL; } // ok, at this point all remove ops return if ( lr->m_removeLock ) { reply[0] = 1; us->sendReply_ass ( reply , 1 , reply , 1 , udpSlot ); return; } ///////// // // add new lock // ///////// // if lock > 1 hour old then remove it automatically!! if ( lock && nowGlobal - lock->m_timestamp > MAX_LOCK_AGE ) { // note it for now log("spider: removing lock after %" PRId32" seconds " "for lockKey=%" PRIu64" hid=%" PRId32, (nowGlobal - lock->m_timestamp), lr->m_lockKeyUh48,hostId); // unlock it ht->removeSlot ( slot ); // it is gone lock = NULL; } // if lock still there, do not grant another lock if ( lock ) { // note it for now if ( g_conf.m_logDebugSpider ) log("spider: refusing lock for lockkey=%" PRIu64" hid=%" PRId32, lr->m_lockKeyUh48,hostId); reply[0] = 0; us->sendReply_ass ( reply , 1 , reply , 1 , udpSlot ); return; } // make the new lock UrlLock tmp; tmp.m_hostId = hostId; tmp.m_lockSequence = lr->m_lockSequence; tmp.m_timestamp = nowGlobal; tmp.m_expires = 0; tmp.m_firstIp = lr->m_firstIp; tmp.m_collnum = lr->m_collnum; // when the spider returns we remove its lock on reception of the // spiderReply, however, we actually just set the m_expires time // to 5 seconds into the future in case there is a current request // to get a lock for that url in progress. but, we do need to // indicate that the spider has indeed completed by setting // m_spiderOutstanding to true. this way, addToWaitingTree() will // not count it towards a "max spiders per IP" quota when deciding // on if it should add a new entry for this IP. 
tmp.m_spiderOutstanding = true; // this is set when all hosts in the group (shard) have granted the // lock and the host sends out a confirmLockAcquisition() request. // until then we do not know if the lock will be granted by all hosts // in the group (shard) tmp.m_confirmed = false; // put it into the table if ( ! ht->addKey ( &lockKey , &tmp ) ) { // return error if that failed! log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__); us->sendErrorReply ( udpSlot , g_errno ); return; } // note it for now if ( g_conf.m_logDebugSpider ) log("spider: granting lock for lockKey=%" PRIu64" hid=%" PRId32, lr->m_lockKeyUh48,hostId); // grant the lock reply[0] = 1; us->sendReply_ass ( reply , 1 , reply , 1 , udpSlot ); return; }
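// Minimal sketch of the msg12 lock-grant rules above, with std::map standing
// in for g_spiderLoop.m_lockTable (not gb code): a lock older than
// MAX_LOCK_AGE is expired automatically, a live lock refuses the request, and
// otherwise the lock is granted and recorded.
#include <cstdio>
#include <cstdint>
#include <map>

struct Lock { int32_t hostId; int32_t timestamp; };
// stand-in value; "1 hour" per the comment in the real code above
static const int32_t MAX_LOCK_AGE = 3600;

static bool grantLock ( std::map<int64_t,Lock> &table ,
                        int64_t key , int32_t hostId , int32_t now ) {
	std::map<int64_t,Lock>::iterator it = table.find ( key );
	// expire a stale lock automatically
	if ( it != table.end() && now - it->second.timestamp > MAX_LOCK_AGE ) {
		table.erase ( it );
		it = table.end();
	}
	// still locked? refuse the request
	if ( it != table.end() ) return false;
	// grant and record the new lock
	Lock tmp;
	tmp.hostId = hostId;
	tmp.timestamp = now;
	table[key] = tmp;
	return true;
}

int main ( ) {
	std::map<int64_t,Lock> table;
	printf ( "%d\n" , grantLock ( table , 42 , 1 , 1000 ) ); // 1: granted
	printf ( "%d\n" , grantLock ( table , 42 , 2 , 1001 ) ); // 0: refused
	printf ( "%d\n" , grantLock ( table , 42 , 2 , 9999 ) ); // 1: expired
	return 0;
}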
// . slot should be auto-nuked upon transmission or error // . TODO: ensure if this sendReply() fails does it really nuke the slot? void gotListWrapper ( void *state , RdbList *listb , Msg5 *msg5xx ) { logTrace( g_conf.m_logTraceMsg0, "BEGIN" ); // get the state State00 *st0 = (State00 *)state; // extract the udp slot and list and msg5 UdpSlot *slot = st0->m_slot; RdbList *list = &st0->m_list; Msg5 *msg5 = &st0->m_msg5; UdpServer *us = st0->m_us; // timing debug if ( g_conf.m_logTimingNet || g_conf.m_logDebugNet ) { //log("Msg0:handled request %" PRIu64,gettimeofdayInMilliseconds()); int32_t size = -1; if ( list ) size = list->getListSize(); log(LOG_TIMING|LOG_DEBUG, "net: msg0: Handled request for data. " "Now sending data termId=%" PRIu64" size=%" PRId32 " transId=%" PRId32" ip=%s port=%i took=%" PRId64" " "(niceness=%" PRId32").", g_posdb.getTermId(msg5->m_startKey), size,slot->m_transId, iptoa(slot->m_ip),slot->m_port, gettimeofdayInMilliseconds() - st0->m_startTime , st0->m_niceness ); } // on error nuke the list and its data if ( g_errno ) { mdelete ( st0 , sizeof(State00) , "Msg0" ); delete (st0); // TODO: free "slot" if this send fails log(LOG_ERROR,"%s:%s:%d: call sendErrorReply.", __FILE__, __func__, __LINE__); us->sendErrorReply ( slot , g_errno ); return; } QUICKPOLL(st0->m_niceness); // point to the serialized list in "list" char *data = list->getList(); int32_t dataSize = list->getListSize(); char *alloc = list->getAlloc(); int32_t allocSize = list->getAllocSize(); // tell list not to free the data since it is a reply so UdpServer // will free it when it destroys the slot list->setOwnData ( false ); // keep track of stats Rdb *rdb = getRdbFromId ( st0->m_rdbId ); if ( rdb ) rdb->sentReplyGet ( dataSize ); // TODO: can we free any memory here??? // keep track of how long it takes to complete the send st0->m_startTime = gettimeofdayInMilliseconds(); // debug point int32_t oldSize = msg5->m_minRecSizes; int32_t newSize = msg5->m_minRecSizes + 20; // watch for wrap around if ( newSize < oldSize ) newSize = 0x7fffffff; if ( dataSize > newSize && list->getFixedDataSize() == 0 && // do not annoy me with these linkdb msgs dataSize > newSize+100 ) log(LOG_LOGIC,"net: msg0: Sending more data than what was " "requested. Inefficient. Bad engineer. dataSize=%" PRId32" " "minRecSizes=%" PRId32".",dataSize,oldSize); // // for linkdb lists, remove all the keys that have the same IP32 // and store a count of what we removed somewhere // if ( st0->m_rdbId == RDB_LINKDB ) { // store compressed list on itself char *dst = list->m_list; // keep stats int32_t totalOrigLinks = 0; int32_t ipDups = 0; int32_t lastIp32 = 0; char *listEnd = list->getListEnd(); // compress the list for ( ; ! list->isExhausted() ; list->skipCurrentRecord() ) { // breathe QUICKPOLL ( st0->m_niceness ); // count it totalOrigLinks++; // get rec char *rec = list->getCurrentRec(); int32_t ip32 = g_linkdb.getLinkerIp_uk((key224_t *)rec ); // same as one before? if ( ip32 == lastIp32 && // are we the last rec? include that for // advancing the m_nextKey in Linkdb more // efficiently. rec + LDBKS < listEnd ) { ipDups++; continue; } // store it gbmemcpy (dst , rec , LDBKS ); dst += LDBKS; // update it lastIp32 = ip32; } // . if we removed one key, store the stats // . caller should recognize reply is not a multiple of // the linkdb key size LDBKS and know it's there! 
if ( ipDups ) { //*(int32_t *)dst = totalOrigLinks; //dst += 4; //*(int32_t *)dst = ipDups; //dst += 4; } // update list parms list->m_listSize = dst - list->m_list; list->m_listEnd = list->m_list + list->m_listSize; data = list->getList(); dataSize = list->getListSize(); } //log("sending replySize=%" PRId32" min=%" PRId32,dataSize,msg5->m_minRecSizes); // . TODO: dataSize may not equal list->getListMaxSize() so // Mem class may show an imbalance // . now g_udpServer is responsible for freeing data/dataSize // . the "true" means to call doneSending_ass() from the signal handler // if need be st0->m_us->sendReply_ass( data, dataSize, alloc, allocSize, slot, st0, doneSending_ass, -1, -1, true ); logTrace( g_conf.m_logTraceMsg0, "END" ); }
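// Sketch of the in-place linkdb compression in gotListWrapper() above: keys
// arrive sorted by linker IP and are rewritten onto the list itself, keeping
// only the first key per distinct IP, except the final rec which is always
// kept so the caller can still advance its next-key cursor. Plain ints stand
// in for the fixed-size linkdb keys; not gb code.
#include <cstdio>

static int dedupByIp ( int *ip , int n ) {
	int dst = 0 , lastIp = -1;
	for ( int i = 0 ; i < n ; i++ ) {
		// same linker IP as the key before it? skip the dup,
		// unless it is the very last rec in the list
		if ( ip[i] == lastIp && i + 1 < n ) continue;
		ip[dst++] = ip[i];
		lastIp = ip[i];
	}
	return dst;
}

int main ( ) {
	int ips[6] = { 10 , 10 , 10 , 20 , 30 , 30 };
	int n = dedupByIp ( ips , 6 );
	for ( int i = 0 ; i < n ; i++ ) printf ( "%d " , ips[i] );
	printf ( "\n" ); // prints: 10 20 30 30
	return 0;
}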
// . set table size to "n" slots // . rehashes the termId/score pairs into new table // . returns false and sets errno on error bool HashTableX::setTableSize ( long oldn , char *buf , long bufSize ) { // don't change size if we do not need to if ( oldn == m_numSlots ) return true; long long n = (long long)oldn; // make it a power of 2 for speed if small n = getHighestLitBitValueLL((unsigned long long)oldn * 2LL -1); // sanity check, must be less than 1B if ( n > 1000000000 ) { char *xx=NULL;*xx=0; } // limit... //if ( n > m_maxSlots ) n = m_maxSlots; // do not go negative on me if ( oldn == 0 ) n = 0; // sanity check if ( n < oldn ) { char *xx = NULL; *xx = 0; } // do we have a buf? long need = (m_ks+m_ds+1) * n; // sanity check, buf should also meet what we need if ( buf && bufSize < need ) { char *xx = NULL; *xx = 0; } // we grow kinda slow, it slows things down, so note it long long startTime =0LL; long old = -1; if ( m_numSlots > 2000 ) { startTime = gettimeofdayInMilliseconds(); old = m_numSlots; } // if we should not free note that bool savedDoFree = m_doFree ; char *savedBuf = m_buf; long savedBufSize = m_bufSize; // use what they gave us if we can m_buf = buf; m_doFree = false; // alloc if we should if ( ! m_buf ) { m_buf = (char *)mmalloc ( need , m_allocName); m_bufSize = need; m_doFree = true; if ( ! m_buf ) return false; } // save the old junk char *oldFlags = m_flags; char *oldKeys = m_keys; char *oldVals = m_vals; // now point to the new bigger and empty table m_keys = m_buf; m_vals = m_buf + m_ks * n; m_flags = m_buf + m_ks * n + m_ds * n; // clear flags only //bzero ( m_flags , n ); memset ( m_flags , 0 , n ); // rehash the slots if we had some long ns = m_numSlots; if ( ! m_keys ) ns = 0; // update these for the new empty table m_numSlots = n; m_mask = n - 1; long oldUsed = m_numSlotsUsed; // reset this before re-adding all of them m_numSlotsUsed = 0; // loop over results in old table, if any for ( long i = 0 ; i < ns ; i++ ) { // breathe QUICKPOLL ( m_niceness ); // skip the empty slots if ( oldFlags [ i ] == 0 ) continue; // add old key/val into the empty table if ( m_ks == sizeof(key144_t) ) // use this special adder that hashes it up better! addTerm144 ( (key144_t *)(oldKeys + m_ks * i) , *(long *)(oldVals + m_ds * i) ); else addKey ( oldKeys + m_ks * i , oldVals + m_ds * i ); } if ( startTime ) { char *name =""; if ( m_allocName ) name = m_allocName; //if ( name && strcmp(name,"HashTableX")==0 ) // log("hey"); long long now = gettimeofdayInMilliseconds(); logf(LOG_DEBUG,"table: grewtable %s from %li to %li slots " "in %lli ms (this=0x%lx) (used=%li)", name,old,m_numSlots ,now - startTime,(long)this,oldUsed); } // free the old guys if ( ! savedDoFree ) return true; if ( ! savedBuf ) return true; // let the old table go mfree ( savedBuf , savedBufSize , m_allocName ); return true; }
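// Sketch of the power-of-two growth rule in setTableSize() above: the new
// slot count n is the highest power of two <= (2*requested - 1), i.e. the
// smallest power of two >= requested, which lets slot lookup use
// (hash & (n-1)) instead of a modulo. highestPow2() is a hypothetical
// stand-in for getHighestLitBitValueLL(); not gb code.
#include <cstdio>
#include <cstdint>

static uint64_t highestPow2 ( uint64_t x ) {
	// clear low set bits until only the highest one remains
	while ( x & ( x - 1 ) ) x &= x - 1;
	return x;
}

int main ( ) {
	uint64_t wants[3] = { 5 , 8 , 1000 };
	for ( int i = 0 ; i < 3 ; i++ ) {
		uint64_t n = highestPow2 ( wants[i] * 2 - 1 );
		// mask-based indexing is valid because n is a power of two
		printf ( "want=%llu slots=%llu mask=0x%llx\n" ,
		         (unsigned long long)wants[i] ,
		         (unsigned long long)n ,
		         (unsigned long long)( n - 1 ) );
	}
	return 0;
}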
bool Words::addWords(char *s,long nodeLen,bool computeWordIds, long niceness) { long i = 0; long j; //long k = 0; long wlen; //unsigned long e; //long skip; long badCount = 0; bool hadApostrophe = false; UCScript oldScript = ucScriptCommon; UCScript saved; UCProps props; uptop: // bad utf8 can cause a breach if ( i >= nodeLen ) goto done; if ( ! s[i] ) goto done; if ( ! is_alnum_utf8(s+i) ) { // && m_numWords < m_preCount ) { if ( m_numWords >= m_preCount ) goto done; // tag? if ( s[i]=='<' && m_hasTags && isTagStart(s+i) ) { // get the tag id if ( s[i+1]=='/' ) { // skip over / m_tagIds [m_numWords] = ::getTagId(s+i+2); m_tagIds [m_numWords] |= BACKBIT; } else m_tagIds [m_numWords] = ::getTagId(s+i+1); // word start m_words [m_numWords] = s + i; m_wordIds [m_numWords] = 0LL; // skip till end long tagLen = getTagLen(s+i); // ,niceness); m_wordLens [m_numWords] = tagLen; m_numWords++; // advance i += tagLen; goto uptop; } // it is a punct word, find end of it char *start = s+i; //for (;s[i] && ! is_alnum_utf8(s+i);i+=getUtf8CharSize(s+i)); for ( ; s[i] ; i += getUtf8CharSize(s+i)){ // stop on < if we got tags if ( s[i] == '<' && m_hasTags ) break; // breathe QUICKPOLL(niceness); // if we are simple ascii, skip quickly if ( is_ascii(s[i]) ) { // accumulate NON-alnum chars if ( ! is_alnum_a(s[i]) ) continue; // update oldScript = ucScriptCommon; // otherwise, stop we got alnum break; } // if we are utf8 we stop on special props UChar32 c = utf8Decode ( s+i ); // stop if word char if ( ! ucIsWordChar ( c ) ) continue; // update first though oldScript = ucGetScript ( c ); // then stop break; } m_words [ m_numWords ] = start; m_wordLens [ m_numWords ] = s+i - start; m_wordIds [ m_numWords ] = 0LL; if (m_tagIds) m_tagIds[m_numWords] = 0; m_numWords++; goto uptop; } // get an alnum word j = i; again: //for ( ; is_alnum_utf8 (&s[i] ) ; i += getUtf8CharSize(s+i) ); for ( ; s[i] ; i += getUtf8CharSize(s+i) ) { // breathe QUICKPOLL(niceness); // simple ascii? if ( is_ascii(s[i]) ) { // accumulate alnum chars if ( is_alnum_a(s[i]) ) continue; // update oldScript = ucScriptCommon; // otherwise, stop we got punct break; } // get the code point of the utf8 char UChar32 c = utf8Decode ( s+i ); // get props props = ucProperties ( c ); // good stuff? if ( props & (UC_IGNORABLE|UC_EXTEND) ) continue; // stop? if UC_WORDCHAR is set, that means it's an alnum if ( ! ( props & UC_WORDCHAR ) ) { // reset script between words oldScript = ucScriptCommon; break; } // save it saved = oldScript; // update here oldScript = ucGetScript(c); // treat ucScriptLatin (30) as common so we can have latin1 // like chars without breaking the word! if ( oldScript == ucScriptLatin ) oldScript = ucScriptCommon; // stop on this crap too i guess. like japanese chars? if ( props & ( UC_IDEOGRAPH | UC_HIRAGANA | UC_THAI ) ) { // include it i += getUtf8CharSize(s+i); // but stop break; } // script change? if ( saved != oldScript ) break; } // . java++, A++, C++ exception // . A+, C+, exception // . TODO: consider putting in Bits.cpp w/ D_CAN_BE_IN_PHRASE if ( s[i]=='+' ) { if ( s[i+1]=='+' && !is_alnum_utf8(&s[i+2]) ) i += 2; else if ( !is_alnum_utf8(&s[i+1]) ) i++; } // . c#, j#, ... 
if ( s[i]=='#' && !is_alnum_utf8(&s[i+1]) ) i++; // allow for words like we're dave's and i'm if(s[i]=='\''&&s[i+1]&&is_alnum_utf8(&s[i+1])&&!hadApostrophe){ i++; hadApostrophe = true; goto again; } hadApostrophe = false; // get word length wlen = i - j; if ( m_numWords >= m_preCount ) goto done; m_words [ m_numWords ] = &s[j]; m_wordLens[ m_numWords ] = wlen; // . Lars says it's better to leave the accented chars intact // . google agrees // . but what about "re'sume"? if ( computeWordIds ) { long long h = hash64Lower_utf8(&s[j],wlen); m_wordIds [m_numWords] = h; // until we get an accent removal algo, comment this // out and possibly use the query synonym pipeline // to search without accents. MDW //long long h2 = hash64AsciiLowerE(&s[j],wlen); //if ( h2 != h ) m_stripWordIds [m_numWords] = h2; //else m_stripWordIds [m_numWords] = 0LL; //m_stripWordIds[m_numWords] = 0; } if (m_tagIds) m_tagIds[m_numWords] = 0; m_numWords++; m_numAlnumWords++; // break on \0 or MAX_WORDS //if ( ! s[i] ) goto done; // get a punct word goto uptop; /* j = i; // delineate the "punctuation" word for ( ; s[i] && !is_alnum_utf8(&s[i]);i+=getUtf8CharSize(s+i)); // bad utf8 could cause us to breach the node, so watch out! if ( i > nodeLen ) { badCount++; i = nodeLen; } // get word length wlen = i - j; if ( m_numWords >= m_preCount ) goto done; m_words [m_numWords ] = &s[j]; m_wordLens [m_numWords ] = wlen; m_wordIds [m_numWords ] = 0LL; if (m_tagIds) m_tagIds[m_numWords] = 0; m_numWords++; */ done: // bad programming warning if ( m_numWords > m_preCount ) { log(LOG_LOGIC, "build: words: set: Fix counting routine."); char *xx = NULL; *xx = 0; } // compute total length if ( m_numWords <= 0 ) m_totalLen = 0; else m_totalLen = m_words[m_numWords-1] - s + m_wordLens[m_numWords-1]; if ( badCount ) log("words: had %li bad utf8 chars",badCount); return true; }
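// Toy sketch of the alternation addWords() implements above: the word stream
// strictly alternates punctuation words and alnum words (tags count as punct
// words). ASCII-only for brevity; the real code walks UTF-8, tracks unicode
// scripts, and special-cases apostrophes and things like "c++". Not gb code.
#include <cstdio>
#include <cstring>
#include <cctype>

int main ( ) {
	const char *s = "gigablast rocks, ok?";
	const char *p = s , *end = s + strlen ( s );
	while ( p < end ) {
		const char *start = p;
		bool alnum = isalnum ( (unsigned char)*p ) != 0;
		// consume a maximal run of the same character class
		while ( p < end &&
		        ( isalnum ( (unsigned char)*p ) != 0 ) == alnum )
			p++;
		printf ( "%s \"%.*s\"\n" , alnum ? "word " : "punct" ,
		         (int)( p - start ) , start );
	}
	return 0;
}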