void TestCppTools::hash_fnv_test() { // TODO: fill testDataVec with http://www.isthe.com/chongo/src/fnv/test_fnv.c const hash_fnv_test::TestData testDataVec[] = { { "", 0x811c9dc5UL, 0xcbf29ce484222325ULL }, { "a", 0xe40c292cUL, 0xaf63dc4c8601ec8cULL }, { "b", 0xe70c2de5UL, 0xaf63df4c8601f1a5ULL }, { "c", 0xe60c2c52UL, 0xaf63de4c8601eff2ULL }, { "d", 0xe10c2473UL, 0xaf63d94c8601e773ULL }, { "e", 0xe00c22e0UL, 0xaf63d84c8601e5c0ULL } }; for (const auto& testData : testDataVec) { cpp::hash32_fnv_1a hash32; cpp::hash64_fnv_1a hash64; const auto byteSeqLen = std::strlen(testData.byteSeq); QCOMPARE(hash32(testData.byteSeq), testData.hash32_fnv_1a); QCOMPARE(hash32(testData.byteSeq, byteSeqLen), testData.hash32_fnv_1a); QCOMPARE(hash32(testData.byteSeq, testData.byteSeq + byteSeqLen), testData.hash32_fnv_1a); QCOMPARE(hash64(testData.byteSeq), testData.hash64_fnv_1a); QCOMPARE(hash64(testData.byteSeq, byteSeqLen), testData.hash64_fnv_1a); QCOMPARE(hash64(testData.byteSeq, testData.byteSeq + byteSeqLen), testData.hash64_fnv_1a); } }
/// Hashes a map by summing hash32() over every element of the map.
/// Addition is commutative, so the result does not depend on iteration order.
uint32 operator()(const Map<K, V, C>& value) const
{
    uint32 total = 0;
    for (const auto& entry : value) {
        total += hash32(entry);
    }
    return total;
}
bool GDI2FT_RENDERER::fetch_glyph_run( bool is_glyph_index, bool is_pdy, LPCWSTR lpString, int c, CONST INT* lpDx, GDI2FT_GLPYH_RUN& glyph_run ) /* -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- */ { HASH_VALUE erased_trait; if( glyph_cache.glyph_run_lru.access( font_trait, erased_trait ) ) glyph_cache.erase_font_trait( erased_trait ); #ifdef _M_X64 const HASH_VALUE str_hash = hash64( lpString, c * sizeof( WCHAR ), is_glyph_index ); #else const HASH_VALUE str_hash = hash32( lpString, c * sizeof( WCHAR ), is_glyph_index ); #endif // _M_X64 if( !glyph_cache.lookup_glyph_run( font_trait, str_hash, glyph_run ) ) { GDI2FT_MUTEX mutex( GDI2FT_MUTEX::MUTEX_GLYPH_RUN_CACHE ); if( !glyph_cache.lookup_glyph_run( font_trait, str_hash, glyph_run ) ) { if( render( is_glyph_index, is_pdy, lpString, c, lpDx, glyph_run ) == 0 ) return false; glyph_cache.store_glyph_run( font_trait, str_hash, glyph_run ); } } return true; }
/*
 * search_cache_block - find the cache block for inode/block number @ino.
 *
 * Walks the hash chain sfs->blocks[hash32(ino)] and returns the first entry
 * whose ino matches, or NULL when the chain holds no such entry.
 */
struct cache_block *
search_cache_block(struct sfs_fs *sfs, uint32_t ino) {
    struct cache_block *node;
    for (node = sfs->blocks[hash32(ino)]; node != NULL; node = node->hash_next) {
        if (node->ino == ino) {
            break;
        }
    }
    return node;
}
/**
 * Constructs a component and registers it in both lookup pools.
 *
 * @param id    unique identifier of the component; key of the id pool
 * @param type  type name passed to the factory; key of the type pool
 * @param standardComponent  not used by this function
 * @return the newly constructed component
 *
 * The component is stored in idPool under hash32(id) and appended to the
 * per-type list stored in typePool under hash32(type). The list is created
 * on first use of a type.
 */
Component* GlobalFactory::addComponent(char* id, char* type, bool standardComponent)
{
	Component* obj = (Component*)g_factory.construct(type, id);

	hashTableInsert(&idPool, hash32(id, (int)strlen(id)), obj);

	// The type pool must be keyed by the hash of the type name: that is the
	// key getComponentsByType() uses for its lookup. Keying it by the id
	// (as before) made every per-type list unreachable.
	List *l = (List*)hashTableLookup(&typePool, hash32(type, (int)strlen(type)));
	if (l == NULL)
	{
		l = (List*)malloc(sizeof(List));
		listInit(l);
		hashTableInsert(&typePool, hash32(type, (int)strlen(type)), l);
	}
	listAddElement(l, obj);

	return obj;
}
/*
 * alloc_cache_block - allocate a zero-filled cache block and link it into
 * the hash table of @sfs.
 *
 * @ino: block/inode number to use; 0 means "allocate a fresh one" through
 *       sfs_alloc_ino().
 *
 * Returns the new cache block. Allocation goes through safe_malloc().
 */
static struct cache_block *
alloc_cache_block(struct sfs_fs *sfs, uint32_t ino) {
    struct cache_block *cb = safe_malloc(sizeof(struct cache_block));
    cb->ino = (ino != 0) ? ino : sfs_alloc_ino(sfs);
    cb->cache = memset(safe_malloc(SFS_BLKSIZE), 0, SFS_BLKSIZE);
    /*
     * Hash the number the block actually carries. Hashing the raw argument
     * put every block allocated with ino == 0 into bucket hash32(0), where
     * search_cache_block(sfs, cb->ino) would never look.
     */
    struct cache_block **head = sfs->blocks + hash32(cb->ino);
    cb->hash_next = *head, *head = cb;
    return cb;
}
/// Builds a Thermal operator on an nx x ny x nz lattice.
/// The base class receives the hash of the type name; the per-site scale
/// array is created with every entry set to 1.0, the temperature starts at 0
/// and no random number generator is attached.
Thermal::Thermal(int nx, int ny, int nz)
	: SpinOperation(nx, ny, nz, hash32(Thermal::typeName()))
{
	setSlotName("Thermal");

	myRNG = 0;
	temperature = 0;

	// luaT_inc presumably takes a reference on the new array - confirm
	dArray* field = new dArray(nx, ny, nz);
	scale = luaT_inc<dArray>(field);
	scale->setAll(1.0);
}
const char *extensionToContentTypeStr2 ( const char *ext , int32_t elen ) { // assume text/html if no extension provided if ( ! ext || ! ext[0] ) return NULL; if ( elen <= 0 ) return NULL; // get hash for table look up int32_t key = hash32 ( ext , elen ); char **pp = (char **)s_mimeTable.getValue ( &key ); if ( ! pp ) return NULL; return *pp; }
bool AutoBan::hasCode(char *code, long codeLen, long ip ) { if(codeLen == 0) return false; long h = hash32(code,codeLen); CodeVal *cv = m_ht.getValuePointer ( h ); if ( ! cv ) return log(LOG_INFO, "query: unrecognized code: %s", code); cv->m_ip = ip; cv->m_count++; return true; }
/* Prints hash32() of every command-line argument, one per line. */
int main(int argc, char **argv)
{
    int idx = 1;

    while (idx < argc) {
        printf("%u\n", hash32(argv[idx]));
        //printf("%ld\n", hash64(argv[idx]));
        //printf("%08lx\n", taketwo(argv[idx]));
        ++idx;
    }
    return 0;
}
const char *HttpMime::getContentTypeFromExtension ( const char *ext , int32_t elen) { // assume text/html if no extension provided if ( ! ext || ! ext[0] ) return "text/html"; if ( elen <= 0 ) return "text/html"; // get hash for table look up int32_t key = hash32 ( ext , elen ); char **pp = (char **)s_mimeTable.getValue ( &key ); // if not found in table, assume text/html if ( ! pp ) return "text/html"; return *pp; }
/// Builds a ShortRange operator on an nx x ny x nz lattice.
/// Starts with no pathways (num = 0, pathways = 0), size set to 32 and the
/// pbc flag set to 1 on all three axes.
ShortRange::ShortRange(int nx, int ny, int nz)
	: SpinOperation(nx, ny, nz, hash32(ShortRange::typeName()))
{
	setSlotName("ShortRange");

	for (int axis = 0; axis < 3; ++axis)
		pbc[axis] = 1;

	pathways = 0;
	num = 0;
	size = 32;
}
/* Hash code for a hash-table key, which is either a string or a number.
 * Strings cache their code in naStr.hashcode (0 means "not computed yet");
 * numbers are hashed from the two 32-bit halves of the double. */
static unsigned int refhash(naRef key)
{
    if(!IS_STR(key)) {
        /* must be a number */
        union { double d; unsigned int u[2]; } bits;
        bits.d = key.num == -0.0 ? 0.0 : key.num; /* remember negative zero! */
        return mix32(mix32(bits.u[0]) ^ bits.u[1]);
    } else {
        struct naStr* str = PTR(key).str;
        if(!str->hashcode)
            str->hashcode = hash32((void*)naStr_data(key), naStr_len(key));
        return str->hashcode;
    }
}
// Exercises h256::ref().copyTo() with a 32-byte destination bytesRef.
// NOTE(review): the destination is built over originalSequence's own storage,
// so presumably the copy overwrites that vector in place - confirm that the
// second expectation checks what was intended.
TEST(core, byteRef)
{
	dev::bytes originalSequence =
		dev::fromHex("0102030405060708091011121314151617181920212223242526272829303132");
	dev::bytesRef target(&originalSequence.at(0), 32);
	dev::h256 digest("1dcc4de8dec75d7aab85b567b6ccd41ad312451b948a7413f0a142fd40d49347");

	digest.ref().copyTo(target);

	EXPECT_EQ(target.size(), 32)
		<< "Error wrong result size when h256::ref().copyTo(dev::bytesRef out)";
	EXPECT_EQ(target.toBytes(), originalSequence)
		<< "Error when h256::ref().copyTo(dev::bytesRef out)";
}
/**
 * Looks up a component by its identifier.
 *
 * @param id  identifier the component was registered under
 * @return the component stored in idPool under hash32(id)
 * @throws char*  a message built with stringInsert() when no such component
 *                is registered
 */
Component* GlobalFactory::getComponentById(char* id)
{
	Component* component = (Component*)hashTableLookup(&idPool, hash32(id, (int)strlen(id)));
	if (component != NULL)
		return component;

	// not found: build "The component <id> doesn't exist" and throw it
	char* message = (char*)"The component ";
	message = stringInsert(message, id, (int)strlen(message));
	message = stringInsert(message, " doesn't exist", (int)strlen(message));
	throw message;
}
/**
 * Returns the list of components registered for a type.
 *
 * @param type  type name; the list is stored in typePool under hash32(type)
 * @return the list of components of that type
 * @throws char*  a message built with stringInsert() when no list exists
 *                for the type
 */
List* GlobalFactory::getComponentsByType(char* type)
{
	List* list = (List*)hashTableLookup(&typePool, hash32(type, (int)strlen(type)));
	if (list == NULL)
	{
		char* message = (char*)"No component typed ";
		message = stringInsert(message, type, (int)strlen(message));
		// leading space keeps the type name and the rest of the sentence
		// apart ("... typed Foo exists for now")
		message = stringInsert(message, " exists for now", (int)strlen(message));
		throw message;
	}
	return list;
}
/**
 * Finds the instruction handler that accepts the given payload.
 *
 * The first payload word selects a bucket of listInstr_ through hash32();
 * the handlers in that bucket are tried in order.
 *
 * @param rpayload  instruction words; rpayload[0] is used for hashing
 * @return the first handler whose parse() returns true, or NULL if none does
 */
IInstruction *CpuRiscV_Functional::decodeInstruction(uint32_t *rpayload) {
    const int bucket = hash32(rpayload[0]);
    for (unsigned n = 0; n < listInstr_[bucket].size(); n++) {
        IInstruction *candidate = static_cast<IInstruction *>(
                                    listInstr_[bucket][n].to_iface());
        if (candidate->parse(rpayload)) {
            return candidate;
        }
    }
    return NULL;
}
// . get startKey,endKey for all SiteRecs from "url"'s domain // . key has the following format: // . dddddddd dddddddd dddddddd dddddddd d = domain hash w/ collection // . uuuuuuuu uuuuuuuu uuuuuuuu uuuuuuuu u = url hash // . uuuuuuuu uuuuuuuu uuuuuuuu uuuuuuuu // . putting domain as first 32bits will cluster all SiteRecs from the // same domain together on the same machine void Catdb::getKeyRange ( bool useIp , Url *url, key_t *startKey , key_t *endKey ) { // log warning msg if we need to if ( useIp && ! url->hasIp() ) log(LOG_LOGIC,"db: tagdb: getKeyRange: useIp is true, " "but url has no ip"); // . the upper 32 bits of the key is basically hash of the domain // . mask out the low-order byte (hi byte in little endian order) unsigned long h; // . make sure we use htonl() on ip domain so top byte is not zero! // . this made all our ip-based sites stored in group #0 before // if ( useIp ) h = htonl ( url->getIpDomain() ) ; // . only hash first 3 bytes of ip domain to keep together w/ ip // . if rdbid is tagdb then use hostname as key else use domain if ( useIp ) { // do htonl so most significant byte is first long ipdom = htonl(url->getIpDomain()); h = hash32 ( (char *)&ipdom , 3 ) ; } else h = hash32 (url->getDomain(), url->getDomainLen()); // incorporate collection into "h" //h = hash32 ( coll , collLen , h ); // now make the keys key_t k; // top 4 bytes is always the domain hash (ip or canonical domain) k.n1 = h; // don't set the low del bit for startKey k.n0 = 0x0000000000000000LL; // assign the startKey if ( startKey ) *startKey = k; // set the low del bit for startKey k.n0 = 0xffffffffffffffffLL; // endkey is just as simple if ( endKey ) *endKey = k; }
bool AutoBan::setCodesFromConf() { static bool s_firstTime = true; m_codeResetTime = getTime(); char *p = g_conf.m_validCodes; while(*p) { if(!isspace(*p)) { long len = 0; while(p[len] && !isspace(p[len])) len++; //now p points to a code, with length len. //log(LOG_WARN, "autoban code is %s %li", p, len); long h = hash32(p,len); CodeVal cv; long max = len; if ( max > 30 ) max = 30; strncpy(cv.m_code,p,max); cv.m_code[max]='\0'; cv.m_ip = 0; cv.m_count = 0; cv.m_bytesSent = 0; cv.m_bytesRead = 0; // we might be doing an update, so only set this // count to 0 the first time we are called on startup if ( s_firstTime ) cv.m_outstanding = 0; cv.m_maxEver = 0; cv.m_maxOutstanding = 5000; //m_numCodes++; p += len; // skip spaces or tabs while ( *p == ' ' || *p == '\t' ) p++; // do we got a number? that is the max outstanding cnt if ( is_digit ( *p ) ) cv.m_maxOutstanding = atoi(p); // ensure no breach if ( cv.m_maxOutstanding < 10 ) log("gb: client code %s has LOW max " "outstanding limit of %li", cv.m_code,cv.m_maxOutstanding); // skip the digits, until we hit \r or \n while ( is_digit ( *p ) ) p++; // now add it if ( ! m_ht.addKey ( h , cv ) ) return false; } p++; } s_firstTime = false; return true; }
GDI2FT_RENDERER::GDI2FT_RENDERER( const GDI2FT_CONTEXT& _context )
/* --------------------------------------------------------------------------------
   Binds the renderer to a context and derives font_trait, the hash that
   identifies the context's logical font.

   Only the leading part of log_font is hashed: everything except the
   lfFaceName array, plus the characters of the face name actually in use
   (without the terminator).
-------------------------------------------------------------------------------- */
{
	context = &_context;
	render_mode = FT_RENDER_MODE_LCD;
	char_extra = GetTextCharacterExtra( context->hdc );

	// bytes taken by the face name characters in use
	const int facename_bytes = static_cast<const int>( ( wcslen( context->log_font.lfFaceName ) * sizeof( wchar_t ) ) );
	// bytes of log_font outside the face name array
	const int metric_bytes = sizeof( context->log_font ) - sizeof( context->log_font.lfFaceName );
	const int hashed_bytes = metric_bytes + facename_bytes;

#ifdef _M_X64
	font_trait = hash64( &context->log_font, hashed_bytes, 0 );
#else
	font_trait = hash32( &context->log_font, hashed_bytes, 0 );
#endif // _M_X64
}
int main(int argc, char **argv) { if (argc < 2) { print_usage(argv[0]); return 1; } if (strcmp(argv[1], "--h") == 0 || strcmp(argv[1], "--help") == 0 ) { print_usage(argv[0]); return 1; } // initialize library g_mem.init(); hashinit(); g_conf.init(NULL); g_log.m_logPrefix = false; const char *input = argv[1]; size_t inputLen = strlen(input); Url url; url.set(input, inputLen); url.print(); logf(LOG_TRACE, "\t"); SiteGetter sg; sg.getSite(input, NULL, 0, 0, 0); logf(LOG_TRACE, "Site info"); logf(LOG_TRACE, "\tsite : %.*s", sg.getSiteLen(), sg.getSite()); logf(LOG_TRACE, "\tsitehash32 : %" PRIx32, hash32(sg.getSite(), sg.getSiteLen(), 0)); logf(LOG_TRACE, "\t"); uint64_t probableDocId = Titledb::getProbableDocId(&url); logf(LOG_TRACE, "Document info"); logf(LOG_TRACE, "\tprobabledocid : %" PRIu64, probableDocId); logf(LOG_TRACE, "\tfirstprobabledocid : %" PRIu64, Titledb::getFirstProbableDocId(probableDocId)); logf(LOG_TRACE, "\tlastprobabledocid : %" PRIu64, Titledb::getLastProbableDocId(probableDocId)); return 0; }
static void packet_to_sig(struct packet_data* pk, struct tcp_sig* ts) { ts->opt_hash = hash32(pk->opt_layout, pk->opt_cnt, hash_seed); ts->quirks = pk->quirks; ts->opt_eol_pad = pk->opt_eol_pad; ts->ip_opt_len = pk->ip_opt_len; ts->ip_ver = pk->ip_ver; ts->ttl = pk->ttl; ts->mss = pk->mss; ts->win = pk->win; ts->win_type = WIN_TYPE_NORMAL; /* Keep as-is. */ ts->wscale = pk->wscale; ts->pay_class = !!pk->pay_len; ts->tot_hdr = pk->tot_hdr; ts->ts1 = pk->ts1; ts->recv_ms = get_unix_time_ms(); ts->matched = NULL; ts->fuzzy = 0; ts->dist = 0; };
// . THIS Msg0 class must be alloc'd, i.e. not on the stack, etc.
// . if list is stored locally this tries to get it locally
// . otherwise tries to get the list from the network
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . NOTE: i was having problems with queries being cached too long, you
//   see the cache here is a NETWORK cache, so when the machines that owns
//   the list updates it on disk it can't flush our cache... so use a small
//   maxCacheAge of like , 30 seconds or so...
// . flow of this function:
//   1. validate args (startKey > endKey or minRecSizes < 0 dereferences
//      NULL on purpose to dump core)
//   2. pick the shard that owns startKey
//   3. if that shard is ours and no hostId was given, read through Msg5
//   4. otherwise serialize the request into m_request and send it either
//      to the given host (UdpServer) or to the shard (Multicast)
// . the "ip", "port" and "doMerge" parameters are not referenced in this
//   function body ("port" is shadowed by a local in the single-host path)
bool Msg0::getList ( int64_t hostId      , // host to ask (-1 if none)
		     int32_t      ip          , // info on hostId
		     int16_t     port        ,
		     int32_t      maxCacheAge , // max cached age in seconds
		     bool      addToCache  , // add net recv'd list to cache?
		     char      rdbId       , // specifies the rdb
		     collnum_t collnum ,
		     RdbList  *list        ,
		     const char     *startKey    ,
		     const char     *endKey      ,
		     int32_t      minRecSizes , // must be >= 0, negative cores below
		     void     *state       ,
		     void    (* callback)(void *state ),//, RdbList *list ) ,
		     int32_t      niceness    ,
		     bool      doErrorCorrection ,
		     bool      includeTree ,
		     bool      doMerge     ,
		     int32_t      firstHostId   ,
		     int32_t      startFileNum  ,
		     int32_t      numFiles      ,
		     int64_t      timeout       ,
		     int64_t syncPoint     ,
		     int32_t      preferLocalReads ,
		     Msg5     *msg5             ,
		     bool      isRealMerge      ,
		     bool      allowPageCache    ,
		     bool      forceLocalIndexdb ,
		     bool      noSplit ,
		     int32_t      forceParitySplit  ) {
	logTrace( g_conf.m_logTraceMsg0, "BEGIN. hostId: %" PRId64", rdbId: %d", hostId, (int)rdbId );

	// warning
	if ( collnum < 0 ) log(LOG_LOGIC,"net: NULL collection. msg0.");

	// reset the list they passed us
	list->reset();
	// get keySize of rdb
	m_ks = getKeySizeFromRdbId ( rdbId );

	// if( g_conf.m_logTraceMsg0 )
	// {
	//	log("%s:%s:%d: rdbId. [%d]", __FILE__,__func__,__LINE__, (int)rdbId);
	//	log("%s:%s:%d: m_ks.. [%d]", __FILE__,__func__,__LINE__, (int)m_ks);
	//	log("%s:%s:%d: hostId [%" PRId64"]", __FILE__,__func__,__LINE__, hostId);
	// }

	// if startKey > endKey this is a caller bug: the NULL write below
	// crashes the process on purpose (it used to just return true)
	//if ( startKey > endKey ) return true;
	if ( KEYCMP(startKey,endKey,m_ks)>0 ) { char *xx=NULL;*xx=0; }//rettrue
	// . reset hostid if it is dead
	// . this is causing UOR queries to take forever when we have a dead
	if ( hostId >= 0 && g_hostdb.isDead ( hostId ) ) hostId = -1;
	// no longer accept negative minrecsize, crash on purpose
	if ( minRecSizes < 0 ) {
		g_errno = EBADENGINEER;
		logTrace( g_conf.m_logTraceMsg0, "END" );

		log(LOG_LOGIC, "net: msg0: Negative minRecSizes no longer supported.");
		char *xx=NULL;*xx=0;
	}

	// remember these
	m_state         = state;
	m_callback      = callback;
	m_list          = list;
	m_hostId        = hostId;
	m_niceness      = niceness;
	m_addToCache    = addToCache;
	// . these define our request 100%
	KEYSET(m_startKey,startKey,m_ks);
	KEYSET(m_endKey,endKey,m_ks);
	m_minRecSizes   = minRecSizes;
	m_rdbId         = rdbId;
	m_collnum = collnum;//          = coll;
	m_isRealMerge   = isRealMerge;
	m_allowPageCache = allowPageCache;

	// . group to ask is based on the first key
	// . we only do 1 group per call right now
	// . groupMask must turn on higher bits first (count downwards kinda)
	// . titledb and spiderdb use special masks to get groupId

	// if diffbot.cpp is reading spiderdb from each shard we have to
	// get groupid from hostid here lest we core in getGroupId() below.
	// it does that for dumping spiderdb to the client browser. they
	// can download the whole enchilada.
	if ( hostId >= 0 && m_rdbId == RDB_SPIDERDB )
		m_shardNum = 0;
	// did they force it? core until i figure out what this is
	else if ( forceParitySplit >= 0 )
		//m_groupId =  g_hostdb.getGroupId ( forceParitySplit );
		m_shardNum = forceParitySplit;
	else
		//m_groupId = getGroupId ( m_rdbId , startKey , ! noSplit );
		m_shardNum = getShardNum ( m_rdbId , startKey );

	// if we are looking up a termlist in posdb that is split by termid and
	// not the usual docid then we have to set this posdb key bit that tells
	// us that ...
	if ( noSplit && m_rdbId == RDB_POSDB )
		m_shardNum = g_hostdb.getShardNumByTermId ( startKey );

	// how is this used?
	if ( forceLocalIndexdb ) m_shardNum = getMyShardNum();

	// if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: shardNum [%" PRId32"]", __FILE__,__func__, __LINE__, m_shardNum);

	// . store these parameters
	// . get a handle to the rdb in case we can satisfy locally
	// . returns NULL and sets g_errno on error
	QUICKPOLL((m_niceness));
	Rdb *rdb = getRdbFromId ( m_rdbId );
	if ( ! rdb ) return true;
	// we need the fixedDataSize
	m_fixedDataSize = rdb->getFixedDataSize();
	m_useHalfKeys   = rdb->useHalfKeys();
	// . debug msg
	// . Msg2 does this when checking for a cached compound list.
	//   compound lists do not actually exist, they are merges of smaller
	//   UOR'd lists.
	if ( maxCacheAge != 0 && ! addToCache && (numFiles > 0 || includeTree)) {
		log( LOG_LOGIC, "net: msg0: Weird. check but don't add... rdbid=%" PRId32".", ( int32_t ) m_rdbId );
	}

	// set this here since we may not call msg5 if list not local
	//m_list->setFixedDataSize ( m_fixedDataSize );

	// . now that we do load balancing we don't want to do a disk lookup
	//   even if local if we are merging or dumping
	// . UNLESS g_conf.m_preferLocalReads is true
	if ( preferLocalReads == -1 )
		preferLocalReads = g_conf.m_preferLocalReads;

	// . always prefer local for full split clusterdb
	// . and keep the tfndb/titledb lookups in the same stripe
	// . so basically we can't do biased caches if fully split
	// . NOTE: the unconditional assignment below overrides both the
	//   parameter and the conf value picked just above
	//if ( g_conf.m_fullSplit ) preferLocalReads = true;
	preferLocalReads = true;

	// is it stored locally? only if no explicit host was requested and
	// the shard owning the key is our own
	bool isLocal = ( m_hostId == -1 && //g_hostdb.m_groupId == m_groupId );
			 m_shardNum == getMyShardNum() );
	// only do local lookups if this is true
	if ( ! preferLocalReads ) isLocal = false;

	/*
	int64_t singleDocIdQuery = 0LL;
	if ( rdbId == RDB_POSDB ) {
		int64_t d1 = g_posdb.getDocId(m_startKey);
		int64_t d2 = g_posdb.getDocId(m_endKey);
		if ( d1+1 == d2 ) singleDocIdQuery = d1;
	}

	// . try the LOCAL termlist cache
	// . so when msg2 is evaluating a gbdocid:| query and it has to
	//   use msg0 to go across the network to get the same damn termlist
	//   over and over again for the same docid, this will help alot.
	// . ideally it'd be nice if the seo pipe in xmldoc.cpp can try to
	//   send the same gbdocid:xxxx docids to the same hosts. maybe hash
	//   based on docid into the list of hosts and if that host is busy
	//   just chain until we find someone not busy.
	if ( singleDocIdQuery &&
	     getListFromTermListCache ( coll,
					m_startKey,
					m_endKey,
					maxCacheAge,
					list ) )
		// found!
		return true;
	*/

	// but always local if only one host
	if ( g_hostdb.getNumHosts() == 1 ) isLocal = true;

	// . if the group is local then do it locally
	// . Msg5::getList() returns false if blocked, true otherwise
	// . Msg5::getList() sets g_errno on error
	// . don't do this if m_hostId was specified
	if ( isLocal ) {
		logTrace( g_conf.m_logTraceMsg0, "isLocal" );

		// use the caller's Msg5 if given (we then must not delete
		// it), otherwise allocate our own
		if ( msg5 ) {
			m_msg5 = msg5;
			m_deleteMsg5 = false;
		}
		else {
			try { m_msg5 = new ( Msg5 ); }
			catch ( ... ) {
				// out of memory: fall through to the network
				// path below instead of failing
				g_errno = ENOMEM;
				log("net: Local alloc for disk read failed "
				    "while tring to read data for %s. "
				    "Trying remote request.",
				    getDbnameFromId(m_rdbId));
				goto skip;
			}
			mnew ( m_msg5 , sizeof(Msg5) , "Msg0::Msg5" );
			m_deleteMsg5 = true;
		}

		QUICKPOLL(m_niceness);
		if ( ! m_msg5->getList ( rdbId,
					 m_collnum ,
					 m_list ,
					 m_startKey ,
					 m_endKey   ,
					 m_minRecSizes ,
					 includeTree   , // include Tree?
					 addToCache    , // addToCache?
					 maxCacheAge   ,
					 startFileNum  ,
					 numFiles      ,
					 this ,
					 gotListWrapper2   ,
					 niceness          ,
					 doErrorCorrection ,
					 NULL , // cacheKeyPtr
					 0    , // retryNum
					 -1   , // maxRetries
					 true , // compensateForMerge
					 syncPoint ,
					 m_isRealMerge ,
					 m_allowPageCache ) ) {
			// Msg5 blocked, gotListWrapper2 will be called later
			logTrace( g_conf.m_logTraceMsg0, "END, return false" );
			return false;
		}
		// Msg5 completed without blocking. nuke it
		reset();
		logTrace( g_conf.m_logTraceMsg0, "END, return true" );
		return true;
	}
skip:
	// debug msg
	if ( g_conf.m_logDebugQuery )
		log(LOG_DEBUG,"net: msg0: Sending request for data to "
		    "shard=%" PRIu32" "
		    "listPtr=%" PTRFMT" minRecSizes=%" PRId32" termId=%" PRIu64" "
		    //"startKey.n1=%" PRIx32",n0=%" PRIx64" (niceness=%" PRId32")",
		    "startKey.n1=%" PRIx64",n0=%" PRIx64" (niceness=%" PRId32")",
		    //g_hostdb.makeHostId ( m_groupId ) ,
		    m_shardNum,
		    (PTRTYPE)m_list,
		    m_minRecSizes, g_posdb.getTermId(m_startKey) ,
		    //m_startKey.n1,m_startKey.n0 , (int32_t)m_niceness);
		    KEY1(m_startKey,m_ks),KEY0(m_startKey),
		    (int32_t)m_niceness);

	// no reply buffer is supplied to the senders below (NULL / size 0)
	char *replyBuf = NULL;
	int32_t  replyBufMaxSize = 0;
	bool  freeReply = true;

	// . make a request with the info above (note: not in network order)
	// . IMPORTANT!!!!! if you change this change
	//   Multicast.cpp::sleepWrapper1 too!!!!!!!!!!!!
	//   no, not anymore, we commented out that request peeking code
	// . layout: syncPoint(8) minRecSizes(4) startFileNum(4) numFiles(4)
	//   maxCacheAge(4) then six one-byte fields starting with rdbId,
	//   then startKey(m_ks) endKey(m_ks) collnum
	char *p = m_request;
	*(int64_t *) p = syncPoint        ; p += 8;
	//*(key_t     *) p = m_startKey       ; p += sizeof(key_t);
	//*(key_t     *) p = m_endKey         ; p += sizeof(key_t);
	*(int32_t      *) p = m_minRecSizes    ; p += 4;
	*(int32_t      *) p = startFileNum     ; p += 4;
	*(int32_t      *) p = numFiles         ; p += 4;
	*(int32_t      *) p = maxCacheAge      ; p += 4;
	// sanity check: rdbId must land at the offset RDBIDOFFSET names,
	// crash on purpose otherwise
	if ( p - m_request != RDBIDOFFSET ) { char *xx=NULL;*xx=0; }
	*p               = m_rdbId          ; p++;
	*p               = addToCache       ; p++;
	*p               = doErrorCorrection; p++;
	*p               = includeTree      ; p++;
	*p               = (char)niceness   ; p++;
	*p               = (char)m_allowPageCache; p++;
	KEYSET(p,m_startKey,m_ks); ; p+=m_ks;
	KEYSET(p,m_endKey,m_ks); ; p+=m_ks;
	// NULL terminated collection name
	//strcpy ( p , coll ); p += gbstrlen ( coll ); *p++ = '\0';
	*(collnum_t *)p = m_collnum; p += sizeof(collnum_t);
	m_requestSize    = p - m_request;
	// ask an individual host for this list if hostId is NOT -1
	if ( m_hostId != -1 ) {
		// get Host
		Host *h = g_hostdb.getHost ( m_hostId );
		if ( ! h ) {
			g_errno = EBADHOSTID;
			log(LOG_LOGIC,"net: msg0: Bad hostId of %" PRId64".", m_hostId);
			logTrace( g_conf.m_logTraceMsg0, "END, return true. Bad hostId" );
			return true;
		}

		// if niceness is 0, use the higher priority udpServer
		// (only g_udpServer is used here; this local "port" hides
		// the function parameter of the same name)
		UdpServer *us ;
		uint16_t port;
		QUICKPOLL(m_niceness);

		us = &g_udpServer ; port = h->m_port ;
		// . returns false on error and sets g_errno, true otherwise
		// . calls callback when reply is received (or error)
		// . we return true if it returns false
		// . NOTE(review): "timeout" is passed unscaled here but as
		//   timeout*1000 to Multicast::send() below - confirm units
		if ( ! us->sendRequest ( m_request     ,
					 m_requestSize ,
					 0x00          , // msgType
					 h->m_ip       ,
					 port          ,
					 m_hostId      ,
					 NULL          , // the slotPtr
					 this          ,
					 gotSingleReplyWrapper ,
					 timeout       ,
					 -1            , // backoff
					 -1            , // maxwait
					 replyBuf      ,
					 replyBufMaxSize ,
					 m_niceness     ) ) { // cback niceness
			// NOTE(review): per the comment above this is the
			// error path, although the trace text says "sent"
			logTrace( g_conf.m_logTraceMsg0, "END, return true. Request sent" );
			return true;
		}

		// return false cuz it blocked
		logTrace( g_conf.m_logTraceMsg0, "END, return false. sendRequest blocked" );
		return false;
	}
	// timing debug
	if ( g_conf.m_logTimingNet )
		m_startTime = gettimeofdayInMilliseconds();
	else
		m_startTime = 0;

	// . get the top int32_t of the key
	// . i guess this will work for 128 bit keys... hmmmmm
	// . this is a hash of the whole start key, handed to Multicast as
	//   its "key" argument
	int32_t keyTop = hash32 ( (char *)startKey , m_ks );

	// . otherwise, multicast to a host in group "groupId"
	// . returns false and sets g_errno on error
	// . calls callback on completion
	// . select first host to send to in group based on upper 32 bits
	//   of termId (m_startKey.n1)
	// . need to send out to all the indexdb split hosts
	m_numRequests = 0;
	m_numReplies  = 0;
	//for ( int32_t i = 0; i < m_numSplit; i++ ) {

	QUICKPOLL(m_niceness);
	//int32_t gr;
	char *buf;
	buf = replyBuf;

	// get the multicast
	Multicast *m = &m_mcast;

	if ( ! m->send ( m_request    ,
			 m_requestSize,
			 0x00         , // msgType 0x00
			 false        , // does multicast own request?
			 m_shardNum ,
			 false        , // send to whole group?
			 //m_startKey.n1, // key is passed on startKey
			 keyTop       , // key is passed on startKey
			 this         , // state data
			 NULL         , // state data
			 gotMulticastReplyWrapper0 ,
			 timeout*1000 , // timeout
			 niceness     ,
			 firstHostId  ,
			 buf             ,
			 replyBufMaxSize ,
			 freeReply       , // free reply buf?
			 true            , // do disk load balancing?
			 maxCacheAge     ,
			 //(key_t *)cacheKey        ,
			 // multicast uses it for determining the best
			 // host to send the request to when doing
			 // disk load balancing. if the host has our
			 // data cached, then it will probably get to
			 // handle the request. for now let's just assume
			 // this is a 96-bit key. TODO: fix...
			 0 , // *(key_t *)cacheKey        ,
			 rdbId           ,
			 minRecSizes     ) )
	{
		log(LOG_ERROR, "net: Failed to send request for data from %s in shard "
		    "#%" PRIu32" over network: %s.",
		    getDbnameFromId(m_rdbId),m_shardNum, mstrerror(g_errno));
		// but speed it up
		m_errno = g_errno;
		m->reset();
		// m_numRequests was zeroed above and is only bumped after a
		// successful send, so this branch is not taken as written
		if ( m_numRequests > 0 ) {
			logTrace( g_conf.m_logTraceMsg0, "END - returning false" );
			return false;
		}

		logTrace( g_conf.m_logTraceMsg0, "END - returning true" );
		return true;
	}

	m_numRequests++;

	// we blocked
	logTrace( g_conf.m_logTraceMsg0, "END - returning false, blocked" );
	return false;
}
// Convenience overload: hashes the characters of a string by forwarding
// its buffer and length to the (pointer, length) overload of hash32.
inline size_t hash32(const string& key)
{
    const char* bytes = key.c_str();
    return hash32(bytes, key.size());
}
/// Hashes the contents of a std::string by delegating to the
/// (pointer, length) overload of hash32.
uint32_t hash32(const std::string& string)
{
    const char* bytes = string.c_str();
    return hash32(bytes, string.size());
}
// . returns -1 on error, 0 on success // . reads HTTP reply from filename given as argument, filters it, // and then writes it to stdout // . originally, we read from stdin, but popen was causing problems when called // from a thread on linux 2.4.17 with the old linux threads int main ( int argc , char *argv[] ) { // should have one and only 1 arg (excluding filename) if ( argc != 2 ) { fprintf(stderr,"usage: fql <querylogfilename1>..." "<querylogfilenameN>\n"); return -1; } // each log file should be <= 2GB char *buf = (char *)malloc ( MAX_READ_SIZE ); if ( ! buf ) { fprintf(stderr,"fql:malloc:li: %s: %s\n", (int32_t)MAX_READ_SIZE,strerror(errno)); return -1; } // seed with same value so we get same rand sequence for all srand ( 1945687 ); for ( int32_t i = 0 ; i < 256 ; i++ ) for ( int32_t j = 0 ; j < 256 ; j++ ) { g_hashtab [i][j] = (uint64_t)rand(); // the top bit never gets set, so fix if ( rand() > (0x7fffffff / 2) ) g_hashtab[i][j] |= 0x80000000; g_hashtab [i][j] <<= 32; g_hashtab [i][j] |= (uint64_t)rand(); // the top bit never gets set, so fix if ( rand() > (0x7fffffff / 2) ) g_hashtab[i][j] |= 0x80000000; } if ( g_hashtab[0][0] != 6720717044602784129LL ) return false; fprintf(stderr,"fql: reading %s\n", argv[1]); // first and only arg is the input file to read from int fd = open ( argv[1] , O_RDONLY ); if ( fd < 0 ) { fprintf(stderr,"fql:open: %s: %s\n", argv[1],strerror(errno)); free ( buf ); return -1; } int n = read ( fd , buf , MAX_READ_SIZE ); close ( fd ); fprintf(stderr,"fql: done reading %s\n", argv[1]); // return -1 on read error if ( n < 0 ) { fprintf(stderr,"fql:fread: %s\n",strerror(errno)); free ( buf ); return -1; } // warn if the doc was bigger than expected if ( n >= (int32_t)MAX_READ_SIZE ) fprintf(stderr,"fql: WARNING: MAX_READ_SIZE " "needs boost\n"); // if nothing came in then nothing goes out, we're done if ( n == 0 ) { free ( buf ) ; return 0; } // store last 1000 hashes in a ring int32_t hashes[MAX_HASHES]; memset ( hashes, 0 , 
MAX_HASHES * 4 ); int32_t nh = 0; // parse out query from each url char *p = buf; for ( ; *p ; p++ ) { if ( p[0] != '?' && p[0] != '&' ) continue; if ( p[1] != 'q' ) continue; if ( p[2] != '=' ) continue; p += 3; // mark the end char *end = p; bool good = true; for ( ; *end && *end!='&' && *end!='\n' && *end!=' '; end++ ) { // double quote? if ( *end == '%' && end[1] == '2' && end[2] == '2' ) { good = false; break; } // colon or pipe operators, ignore if ( *end == '|') { good = false; break; } if ( *end == '%' && end[1] == '3' && end[2] == 'a' ) { good = false; break; } if ( *end == '%' && end[1] == '3' && end[2] == 'A' ) { good = false; break; } } // filter out? if ( ! good ) continue; // limit size. 150 is too big. if ( end - p > 150 ) continue; // scan backwards to get ip char *ips = p; for ( ; ips>buf && *ips != ' ' && *ips != '\t' ; ips-- ); if ( ips>buf ) ips--; for ( ; ips>buf && *ips != ' ' && *ips != '\t' ; ips-- ); char *ipend = ips; if ( ips>buf ) ips--; for ( ; ips>buf && *ips != ' ' && *ips != '\t' ; ips-- ); ips++; // should be ip now! int32_t iplen = ipend - ips; //int32_t uip = atoip(ips,ipend-ips); //if ( ! 
uip ) continue; // must be ip # if ( !isdigit(ips[0]) ) continue; // replace comma with space for ( char *r = p ; r < end ; r++ ) { if ( *r == ',' ) *r = '+'; } char *dst2 = p; for ( char *r = p ; r < end ; r++ ) { *dst2 = *r; if ( *r == '%' && r[1] == '2' && r[2] == '0' ) { *dst2 = '+'; r += 2; } dst2++; } end = dst2; // skip initial spaces char *x = p; for ( ; x < end ; x++ ) { if ( *x == '+' ) continue; break; } char *query = p; // filter out back to back spaces char *dst = p; bool lastWasSpace = false; for ( char *x = p ; x < end ; x++ ) { // skip back to back spaces if ( *x == '+' && lastWasSpace ) continue; // skip initial spaces if ( x == p && *x == '+' ) { lastWasSpace = true; continue; } // skip initial spaces *dst++ = *x; if ( *x == '+' ) lastWasSpace = true; else lastWasSpace = false; } // null term the overwritten buffer *dst = '\0'; // get the length of the query int32_t queryLen = dst - p; // skip that for the for loop p = dst; // skip empty queries if ( queryLen==0 ) continue; // hash it up int32_t h = hash32(query,queryLen); for ( int32_t i = 0 ; i < MAX_HASHES ; i++ ) { if ( hashes[i] == h ) { good = false; break; } } hashes[nh] = h; // inc and wrap if ( ++nh >= MAX_HASHES ) nh = 0; // filter out? if ( ! good ) continue; // cblock it char dotCount = 0; for ( int32_t k = 0 ; k < iplen ; k++ ) { if ( ips[k] != '.' ) continue; if ( ++dotCount < 3 ) continue; ips[k] = '\0'; break; } if ( dotCount != 3 ) continue; // print ip //ips[iplen] = '\0'; // write that out fprintf(stdout,"%s %s\n",ips,query); } return 0; }
void Blaster::gotDoc2 ( void *state, TcpSocket *s){ StateBD *st=(StateBD *)state; // bail if got cut off if ( s->m_readOffset == 0 ) { log("blaster: Lost the Request in gotDoc2"); m_launched--; //No need to point p2 // Free stateBD freeStateBD(st); return; } // . don't let TcpServer free m_buf when socket is recycled/closed // . we own it now and are responsible for freeing it // s->m_readBuf = NULL; long long now = gettimeofdayInMilliseconds(); // So now after getting both docIds, get their contents char *reply1 = st->m_buf1 ; long size1 = st->m_buf1Len; HttpMime mime1; mime1.set ( reply1 , size1 , NULL ); char *content1 = reply1 + mime1.getMimeLen(); long content1Len = size1 - mime1.getMimeLen(); unsigned long h = hash32 ( content1 , content1Len ); // log msg if ( g_errno ) logf(LOG_INFO,"blaster: got doc (%li) (%li ms) %s : %s", s->m_readOffset , (long)(now - s->m_startTime) , st->m_u2 , mstrerror(g_errno) ); else logf(LOG_INFO,"blaster: got doc (%li) (%li ms) " "(hash=%lx) %s", s->m_readOffset , (long)(now - s->m_startTime) , h , st->m_u2 ); if (m_verbose){ log(LOG_WARN,"blaster: content1len=%li, Content1 is =%s", content1Len,content1); log(LOG_WARN,"\n"); } char *reply2 = s->m_readBuf ; long size2 = s->m_readOffset; HttpMime mime2; mime2.set ( reply2 , size2 , NULL ); char *content2 = reply2 + mime2.getMimeLen(); long content2Len = size2 - mime2.getMimeLen(); if (m_verbose) log(LOG_WARN,"blaster: content2len=%li, Content2 is =%s", content2Len,content2); // Now that we've got the contents, lets get the url links out // of these pages.Passing them to function getSearchLinks should // get the first x links found out. 
/* st->m_links1=(char *) mmalloc(200*MAX_URL_LEN,"Blaster3"); st->m_links2=st->m_links1+100*MAX_URL_LEN; st->m_numLinks1=100; st->m_numLinks2=100;*/ /* long numLinks1=getSearchLinks(content1,content1Len, st->m_links1,st->m_numLinks1); long numLinks2=getSearchLinks(content2,content2Len, st->m_links2,st->m_numLinks2);*/ content1[content1Len]='\0'; //short csEnum1= get_iana_charset(mime1.getCharset(), // mime1.getCharsetLen()); /* if (csEnum1== csUnknown) log(LOG_DEBUG, "blaster: Unknown charset : %s", mime2.getCharset());*/ Xml xml1; // assume utf8 if (!xml1.set(content1, content1Len, false, 0, false, TITLEREC_CURRENT_VERSION)){ log(LOG_WARN,"blaster: Couldn't set XML1 Class in gotDoc2"); } Links links1; Url parent; parent.set ( st->m_u1); if (!links1.set(false , // userellnofollow &xml1, &parent,//mime1.getLocationUrl(), parent Url false, // setLinkHashes NULL , // baseUrl TITLEREC_CURRENT_VERSION, // version 0 , // niceness false , // parent is permalink? NULL )) { // oldLinks log(LOG_WARN,"blaster: Couldn't set Links Class in gotDoc2"); } content2[content2Len]='\0'; //short csEnum2= get_iana_charset(mime2.getCharset(), // mime2.getCharsetLen()); /* if (csEnum2== csUnknown) log(LOG_DEBUG, "blaster: Unknown charset : %s", mime2.getCharset());*/ Xml xml2; if (!xml2.set(content2, content2Len, false, 0, false, TITLEREC_CURRENT_VERSION)){ log(LOG_WARN,"blaster: Couldn't set XML2 Class in gotDoc2"); } Links links2; parent.set(st->m_u2); if (!links2.set(0,//siterec xml &xml2, &parent,//&st->m_u2,//mime2.getLocationUrl(), false, NULL, TITLEREC_CURRENT_VERSION, 0, false, NULL)){ log(LOG_WARN,"blaster: Couldn't set links2 Class in gotDoc2"); } // put the hash of the sites into a hashtable, since we have // about a 100 or so of them HashTableT<unsigned long, bool> urlHash; // put the urls from doc2 into the hastable, but first check if // they are links to google or gigablast (for now). For msn and // yahoo we have to add other checks. 
char domain2[256]; long dlen = 0; char *dom = getDomFast ( st->m_u2 , &dlen ); if ( dom ) strncpy(domain2,dom,dlen); domain2[dlen]='\0'; for (long i=0;i<links2.getNumLinks();i++){ // The dots check if exactly google or gigablast are present // in the link char *ss=links2.getLink(i); char *p; p=strstr(ss,domain2); if(p) continue; p=strstr(ss,"google."); if(p) continue; p=strstr(ss,"cache:"); //googles cache page if(p) continue; p= strstr(ss,"gigablast."); if(p) continue; p= strstr(ss,"web.archive.org");//older copies on gigablast if(p) continue; p= strstr(ss,"search.yahoo.com");//from gigablast search if(p) continue; p= strstr(ss,"search.msn.com");//from gigablast search if(p) continue; p= strstr(ss,"s.teoma.com");//from gigablast search if(p) continue; p= strstr(ss,"search.dmoz.org");//from gigablast search if(p) continue; p= strstr(ss,"www.answers.com");//from gigablast search if(p) continue; p= strstr(ss,"cc.msncache.com");//msn's cache page if(p) continue; if (m_verbose) log(LOG_WARN,"blaster: link in Doc2=%s" ,links2.getLink(i)); unsigned long h=hash32Lower_a(links2.getLink(i), links2.getLinkLen(i)); //should i check for conflict. no, because it doesn't matter urlHash.addKey(h,1); } // now check if the urls from doc1 are in doc2. save the // ones that are not // in there for later. 
/* long numUrlsToCheck=links2.getNumLinks();*/ long numUrlsNotFound=0; /*if (numLinks1<numUrlsToCheck) numUrlsToCheck=numLinks1;*/ char domain1[256]; dlen = 0; dom = getDomFast ( st->m_u1 ,&dlen ); if ( dom ) strncpy(domain1,dom,dlen); domain1[dlen]='\0'; for (long i=0;i<links1.getNumLinks();i++){ char *ss=links1.getLink(i); char *p; p=strstr(ss,domain1); if(p) continue; p=strstr(ss,"google."); if(p) continue; p=strstr(ss,"cache:"); //googles cache page if(p) continue; p= strstr(ss,"gigablast."); if(p) continue; p= strstr(ss,"web.archive.org");//older copies on gigablast if(p) continue; p= strstr(ss,"search.yahoo.com");//from gigablast search if(p) continue; p= strstr(ss,"search.msn.com");//from gigablast search if(p) continue; p= strstr(ss,"s.teoma.com");//from gigablast search if(p) continue; p= strstr(ss,"search.dmoz.org");//from gigablast search if(p) continue; p= strstr(ss,"www.answers.com");//from gigablast search if(p) continue; p= strstr(ss,"cc.msncache.com");//msn's cache page if(p) continue; if (m_verbose) log(LOG_WARN,"blaster: link in Doc1=%s" ,links1.getLink(i)); unsigned long h=hash32Lower_a(links1.getLink(i), links1.getLinkLen(i)); long slot= urlHash.getSlot(h); if(slot!=-1) continue; // if url is not present, get its doc. if (m_verbose || m_justDisplay) log(LOG_WARN,"blaster: NOT FOUND %s in %s" ,links1.getLink(i),domain2); numUrlsNotFound++; //Don't do anything else if just have to display the urls if (m_justDisplay) continue; //now get the doc of these urls //initialize st->m_numUrlDocsReceived=0; StateBD2 *st2; try { st2 = new (StateBD2); } catch ( ... ) { g_errno = ENOMEM; log("blaster: Failed. " "Could not allocate %li bytes for query. 
" "Returning HTTP status of 500.", (long)sizeof(StateBD2)); return; } mnew ( st2 , sizeof(StateBD2) , "Blaster4" ); //Point to the big state; st2->m_st=st; //Msg16 does 6 redirects, so I do 6 too st2->m_numRedirects=6; //st2->m_url.set(links1.getLink(i),links1.getLinkLen(i)); st2->m_url = links1.getLink(i); // No need for a proxy ip here, since we are fetching // doc's from different IPs. Faster this way bool status = g_httpServer.getDoc ( st2->m_url, // url 0,//ip 0 , // offset -1 , // size 0 , // ifModifiedSince st2, // state gotDocWrapper3, // callback 60*1000, // timeout 0, // proxy ip 0, // proxy port 30*1024*1024, //maxLen 30*1024*1024);//maxOtherLen // continue if it blocked if ( ! status ) continue; // If not blocked, there is an error. st->m_numUrlDocsReceived++; } st->m_numUrlDocsSent=numUrlsNotFound; //There might have been an error while sending the docs, so if there //has been put a check if ( st->m_numUrlDocsReceived > 0 && st->m_numUrlDocsReceived <= st->m_numUrlDocsSent ){ log(LOG_WARN,"blaster: %li docs could not be sent due to " "error",st->m_numUrlDocsReceived); m_launched--; freeStateBD(st); return; } if (numUrlsNotFound==0){ //job done for this pair log(LOG_WARN,"blaster: All urls from %s found in " "%s",domain1,domain2); m_launched--; // Free stateBD freeStateBD(st); return; } log(LOG_WARN,"blaster: %li urls from %s Not found in %s", numUrlsNotFound,domain1,domain2); if(m_justDisplay){ m_launched--; // Free stateBD freeStateBD(st); } return; }
// Builds a 96-bit hash of the slen bytes at s from two independent hashes:
// . n0 is hash64() of the bytes, seeded with startHash.n0
// . n1 is hash32() of the bytes, seeded with startHash.n1
u_int96_t hash96 ( char *s, int32_t slen, u_int96_t startHash ) {
	u_int96_t result;
	// 64-bit component first, then the 32-bit one
	result.n0 = hash64 ( s , slen , startHash.n0 );
	result.n1 = hash32 ( s , slen , startHash.n1 );
	return result;
}
void gotDocWrapper ( void *state , TcpSocket *s ) { // no longer launched s_launched--; char* url = (char*)state; // bail if got cut off if ( s->m_readOffset == 0 ) { log("lost %s",(char *) state); if(s_server) mfree(url, gbstrlen(url)+1, "saved url"); return; } // got one more result page s_total++; // allow printing s_printIt = true; // get time now int64_t now = gettimeofdayInMilliseconds(); // get hash char *reply = s->m_readBuf ; int32_t size = s->m_readOffset; HttpMime mime; mime.set ( reply , size , NULL ); char *content = reply + mime.getMimeLen(); int32_t contentLen = size - mime.getMimeLen(); int32_t status = mime.getHttpStatus(); uint32_t h = hash32 ( content , contentLen ); char *p = mime.getMime(); char *pend = p + mime.getMimeLen(); char message[256]; int32_t mlen = 0; // parse status message out of response // HTTP/1.0 while ( p < pend && !isspace(*p) ) p++; // skip space while ( p < pend && isspace(*p) ) p++; // copy to end of line while (p < pend && mlen < 255 && *p != '\r' && *p != '\n'){ message[mlen++] = *p; } message[mlen] = '\0'; // log msg if ( g_errno ) logf(LOG_INFO,"blaster: got doc (status=%"INT32") (%"INT32") (%"INT32"ms) %s : " "%s", status, s->m_readOffset , (int32_t)(now - s->m_startTime) , (char *)state , mstrerror(g_errno) ); else logf(LOG_INFO,"blaster: got doc (status=%"INT32") (%"INT32") (%"INT32"ms) " "(hash=%"XINT32") %s", status, s->m_readOffset , (int32_t)(now - s->m_startTime) , h , (char *)state ); if(s_server) mfree(url, gbstrlen(url)+1, "saved url"); // try to launch another startSpidering(); }
void Blaster::gotDoc1( void *state, TcpSocket *s){ StateBD *st=(StateBD *)state; // Even if we loose the request, still count it as done. m_totalDone++; m_print=true; // bail if got cut off if ( s->m_readOffset == 0 ) { log("blaster: lost the Request in gotDoc1"); m_launched--; freeStateBD(st); return; } //if we are not doing diff if (!m_blasterDiff){ m_launched--; } long long now = gettimeofdayInMilliseconds(); // get hash char *reply = s->m_readBuf ; long size = s->m_readOffset; HttpMime mime; mime.set ( reply , size , NULL ); char *content = reply + mime.getMimeLen(); long contentLen = size - mime.getMimeLen(); unsigned long h = hash32 ( content , contentLen ); // log msg if ( g_errno ) logf(LOG_INFO,"blaster: got doc (%li) (%li ms) %s : %s", s->m_readOffset , (long)(now - s->m_startTime) , st->m_u1 , mstrerror(g_errno) ); else logf(LOG_INFO,"blaster: got doc (%li) (%li ms) " "(hash=%lx) %s", s->m_readOffset , (long)(now - s->m_startTime) , h , st->m_u1 ); if (!m_blasterDiff){ // try to launch another if not using log file freeStateBD(st); if (!m_isLogFile){ startBlastering(); } if (m_isLogFile && --m_totalUrls==0) exit(0); return; } // Store the buffer from socket so that it does not get destroyed // at the end. Also, add another space because in gotDoc2 xml.set // demands the content to be null ended, so we need to store the // null character there. So as a precaution, just allocating the // max buf size. st->m_buf1=(char*) mcalloc(s->m_readBufSize,"Blaster5"); memcpy(st->m_buf1,s->m_readBuf,s->m_readOffset); //st->m_buf1=(char*) mdup(s->m_readBuf,s->m_readOffset,"Blaster5"); st->m_buf1Len=s->m_readOffset; st->m_buf1MaxLen=s->m_readBufSize; // . don't let TcpServer free m_buf when socket is recycled/closed // . we own it now and are responsible for freeing it. DON'T do this // because I believe this makes malloc crash, since TcpServer says // that it has freed the memory so malloc tries to allocate wrong // memory and gives a seg fault. 
// s->m_readBuf = NULL; log(LOG_WARN,"blaster: Downloading %s",st->m_u2); //char *ss="www.gigablast.com/search?q=hoopla&code=gbmonitor"; // st->m_u2.set(ss,gbstrlen(ss)); // get it bool status = g_httpServer.getDoc ( st->m_u2 , // url 0,//ip 0 , // offset -1 , // size 0 , // ifModifiedSince st , // state gotDocWrapper2, // callback 60*1000, // timeout 0,//atoip("66.154.102.20",13),//proxy ip 0,//3128,//80, // proxy port 30*1024*1024, //maxLen 30*1024*1024);//maxOtherLen // continue if it blocked if ( ! status ) return; // If not blocked, there is an error. m_launched--; // log msg log("From file2, gotdoc2 %s: %s", st->m_u2, mstrerror(g_errno) ); // No need to point p2 ahead because already been done // Free stateBD freeStateBD(st); return; }