C++ (Cpp) hash32 Beispiele

Beispiel #1

0

Datei anzeigen

Datei: test_cpptools.cpp Projekt: fougue/fougtools

void TestCppTools::hash_fnv_test()
{
    // TODO: fill testDataVec with http://www.isthe.com/chongo/src/fnv/test_fnv.c
    const hash_fnv_test::TestData testDataVec[] =
    {
        { "", 0x811c9dc5UL, 0xcbf29ce484222325ULL },
        { "a", 0xe40c292cUL, 0xaf63dc4c8601ec8cULL },
        { "b", 0xe70c2de5UL, 0xaf63df4c8601f1a5ULL },
        { "c", 0xe60c2c52UL, 0xaf63de4c8601eff2ULL },
        { "d", 0xe10c2473UL, 0xaf63d94c8601e773ULL },
        { "e", 0xe00c22e0UL, 0xaf63d84c8601e5c0ULL }
    };
    for (const auto& testData : testDataVec) {
        cpp::hash32_fnv_1a hash32;
        cpp::hash64_fnv_1a hash64;
        const auto byteSeqLen = std::strlen(testData.byteSeq);

        QCOMPARE(hash32(testData.byteSeq), testData.hash32_fnv_1a);
        QCOMPARE(hash32(testData.byteSeq, byteSeqLen), testData.hash32_fnv_1a);
        QCOMPARE(hash32(testData.byteSeq, testData.byteSeq + byteSeqLen), testData.hash32_fnv_1a);

        QCOMPARE(hash64(testData.byteSeq), testData.hash64_fnv_1a);
        QCOMPARE(hash64(testData.byteSeq, byteSeqLen), testData.hash64_fnv_1a);
        QCOMPARE(hash64(testData.byteSeq, testData.byteSeq + byteSeqLen), testData.hash64_fnv_1a);
    }
}

Beispiel #2

0

Datei anzeigen

Datei: map.hpp Projekt: crafn/clover

	/// Hash of a whole map: the sum of hash32() over every entry.
	/// Addition is commutative, so the result does not depend on the
	/// order in which the entries are visited.
	uint32 operator()(const Map<K, V, C>& value) const
	{
		uint32 combined= 0;
		for (const auto& entry : value) {
			combined += hash32(entry);
		}
		return combined;
	}

Beispiel #3

0

Datei anzeigen

Datei: renderer.cpp Projekt: oviano/gdi2ft

bool GDI2FT_RENDERER::fetch_glyph_run( bool is_glyph_index, bool is_pdy, LPCWSTR lpString, int c, CONST INT* lpDx, GDI2FT_GLPYH_RUN& glyph_run )
/* --------------------------------------------------------------------------------
-------------------------------------------------------------------------------- */
{
	HASH_VALUE erased_trait;
	if( glyph_cache.glyph_run_lru.access( font_trait, erased_trait ) )
		glyph_cache.erase_font_trait( erased_trait );

#ifdef _M_X64
	const HASH_VALUE str_hash = hash64( lpString, c * sizeof( WCHAR ), is_glyph_index );
#else
	const HASH_VALUE str_hash = hash32( lpString, c * sizeof( WCHAR ), is_glyph_index );
#endif // _M_X64

	if( !glyph_cache.lookup_glyph_run( font_trait, str_hash, glyph_run ) )
	{
		GDI2FT_MUTEX mutex( GDI2FT_MUTEX::MUTEX_GLYPH_RUN_CACHE );

		if( !glyph_cache.lookup_glyph_run( font_trait, str_hash, glyph_run ) )
		{
			if( render( is_glyph_index, is_pdy, lpString, c, lpDx, glyph_run ) == 0 )
				return false;

			glyph_cache.store_glyph_run( font_trait, str_hash, glyph_run );
		}
	}

	return true;
}

Beispiel #4

0

Datei anzeigen

Datei: mksfs.c Projekt: 151706061/ucore_lab

/*
 * Look up the cache block for inode number `ino`.
 * Walks the chain of the bucket selected by hash32(ino) and returns the first
 * block whose ino matches, or NULL when the chain holds no such block.
 */
struct cache_block *
search_cache_block(struct sfs_fs *sfs, uint32_t ino) {
    struct cache_block *cur;
    for (cur = sfs->blocks[hash32(ino)]; cur != NULL; cur = cur->hash_next) {
        if (cur->ino == ino) {
            return cur;
        }
    }
    return NULL;
}

Beispiel #5

0

Datei anzeigen

Datei: globalfactory.cpp Projekt: s-faychatelard/API

// Builds a component of the given type through g_factory and registers it in
// both lookup tables:
//   - idPool:   hash32(id)   -> the component itself
//   - typePool: hash32(type) -> List of all components sharing that type
// `standardComponent` is not read by this function.
// Returns the newly constructed component.
Component* GlobalFactory::addComponent(char* id, char* type, bool standardComponent)
{
    Component* obj = (Component*)g_factory.construct(type, id);
    
    // Register by id.
    hashTableInsert(&idPool, hash32(id, (int)strlen(id)), obj);
    
    // Register by type. The key has to be derived from `type` (it used to be
    // derived from `id`): getComponentsByType() probes typePool with
    // hash32(type, ...), so lists stored under the id hash were never found.
    List *l = (List*)hashTableLookup(&typePool, hash32(type, (int)strlen(type)));
    if (l == NULL)
    {
        // First component of this type: create its list and publish it.
        l = (List*)malloc(sizeof(List));
        listInit(l);
        hashTableInsert(&typePool, hash32(type, (int)strlen(type)), l);
    }
    listAddElement(l, obj);
		
    return obj;
}

Beispiel #6

0

Datei anzeigen

Datei: mksfs.c Projekt: 151706061/ucore_lab

/*
 * Allocate a zero-filled cache block and link it at the head of its hash chain.
 * When `ino` is 0 a fresh inode number is taken from sfs_alloc_ino(); otherwise
 * the given number is used. Returns the new block.
 */
static struct cache_block *
alloc_cache_block(struct sfs_fs *sfs, uint32_t ino) {
    struct cache_block *cb = safe_malloc(sizeof(struct cache_block));
    cb->ino = (ino != 0) ? ino : sfs_alloc_ino(sfs);
    cb->cache = memset(safe_malloc(SFS_BLKSIZE), 0, SFS_BLKSIZE);
    /*
     * Pick the bucket from the block's real inode number, not from the `ino`
     * argument: with ino == 0 the argument would always select bucket
     * hash32(0), while search_cache_block() probes hash32(<real ino>) and
     * would never find the block.
     */
    struct cache_block **head = sfs->blocks + hash32(cb->ino);
    cb->hash_next = *head;
    *head = cb;
    return cb;
}

Beispiel #7

0

Datei anzeigen

Datei: spinoperationthermal.cpp Projekt: jasonimercer/maglua

// Creates a Thermal operator for an nx*ny*nz lattice: scalar state is zeroed
// and the per-site scale array is allocated and filled with 1.0.
Thermal::Thermal(int nx, int ny, int nz)
	: SpinOperation(nx, ny, nz, hash32(Thermal::typeName()))
{
	setSlotName("Thermal");

	// scalar state
	temperature = 0;
	myRNG = 0;

	// per-site scale factors, all starting at 1.0
	dArray* const scaleArray = new dArray(nx,ny,nz);
	scale = luaT_inc<dArray>(scaleArray);
	scale->setAll(1.0);
}

Beispiel #8

0

Datei anzeigen

Datei: HttpMime.cpp Projekt: lemire/open-source-search-engine

// . map a file extension to the content type string stored in s_mimeTable
// . returns NULL if ext is NULL or empty, if elen <= 0, or if the extension
//   is not in the table
const char *extensionToContentTypeStr2 ( const char *ext , int32_t elen ) {
	// nothing to look up without a usable extension
	if ( ! ext || ! ext[0] || elen <= 0 ) return NULL;
	// the table is keyed by the 32-bit hash of the extension
	int32_t key = hash32 ( ext , elen );
	char **slot = (char **)s_mimeTable.getValue ( &key );
	return slot ? *slot : NULL;
}

Beispiel #9

0

Datei anzeigen

Datei: AutoBan.cpp Projekt: RevBooyah/open-source-search-engine

bool AutoBan::hasCode(char *code, long codeLen, long ip ) {
	if(codeLen == 0) return false;
	long h = hash32(code,codeLen);
	CodeVal *cv = m_ht.getValuePointer ( h );
	if ( ! cv ) 
		return log(LOG_INFO, "query: unrecognized code: %s", code);
	cv->m_ip = ip;
	cv->m_count++;
	return true;
}

Beispiel #10

0

Datei anzeigen

Datei: djb-hash.c Projekt: mct/junkdrawer

/* Print hash32() of every command line argument, one value per line. */
int main(int argc, char **argv)
{
    char **arg;

    /* argv[0] is the program name, so start at the second slot */
    for (arg = argv + 1; arg < argv + argc; arg++) {
        printf("%u\n",  hash32(*arg));
        //printf("%ld\n", hash64(*arg));
        //printf("%08lx\n", taketwo(*arg));
    }
    return 0;
}

Beispiel #11

0

Datei anzeigen

Datei: HttpMime.cpp Projekt: lemire/open-source-search-engine

// . map a file extension to the content type string stored in s_mimeTable
// . falls back to "text/html" if ext is NULL or empty, if elen <= 0, or if
//   the extension is not in the table
const char *HttpMime::getContentTypeFromExtension ( const char *ext , int32_t elen) {
	const char *fallback = "text/html";
	// no usable extension provided
	if ( ! ext || ! ext[0] || elen <= 0 ) return fallback;
	// the table is keyed by the 32-bit hash of the extension
	int32_t key = hash32 ( ext , elen );
	char **slot = (char **)s_mimeTable.getValue ( &key );
	return slot ? *slot : fallback;
}

Beispiel #12

0

Datei anzeigen

Datei: spinoperationshortrange.cpp Projekt: jasonimercer/maglua

// Creates a ShortRange operator for an nx*ny*nz lattice with size 32,
// no pathways, and all three pbc entries set to 1.
ShortRange::ShortRange(int nx, int ny, int nz)
    : SpinOperation(nx, ny, nz, hash32(ShortRange::typeName()))
{
    setSlotName("ShortRange");

    size     = 32;
    num      = 0;
    pathways = 0;

    // same flag value along each of the three axes
    for (int axis = 0; axis < 3; axis++)
        pbc[axis] = 1;
}

Beispiel #13

0

Datei anzeigen

Datei: hash.c Projekt: andyross/nasal

/*
 * Hash code for a table key.
 * Numbers are hashed by mixing the two 32-bit halves of the double.
 * Strings use the hashcode stored on the string object; it is computed with
 * hash32() and stored whenever the stored value is 0.
 */
static unsigned int refhash(naRef key)
{
    if(!IS_STR(key)) {
        /* must be a number */
        union { double d; unsigned int u[2]; } bits;
        /* remember negative zero: it compares equal to 0.0, so hash it as 0.0 */
        bits.d = key.num == -0.0 ? 0.0 : key.num;
        return mix32(mix32(bits.u[0]) ^ bits.u[1]);
    }
    {
        struct naStr* str = PTR(key).str;
        if(!str->hashcode)
            str->hashcode = hash32((void*)naStr_data(key), naStr_len(key));
        return str->hashcode;
    }
}

Beispiel #14

0

Datei anzeigen

Datei: core.cpp Projekt: beautifularea/aleth

// Copies an h256 through ref().copyTo() into a 32-byte bytesRef view over a
// byte vector, then checks the view's size and that the view's bytes equal
// the vector it was created over.
TEST(core, byteRef)
{
    dev::bytes buffer =
        dev::fromHex("0102030405060708091011121314151617181920212223242526272829303132");
    dev::bytesRef view(&buffer.at(0), 32);

    dev::h256 digest("1dcc4de8dec75d7aab85b567b6ccd41ad312451b948a7413f0a142fd40d49347");
    digest.ref().copyTo(view);

    EXPECT_EQ(view.size(), 32)
        << "Error wrong result size when h256::ref().copyTo(dev::bytesRef out)";
    EXPECT_EQ(view.toBytes(), buffer)
        << "Error when h256::ref().copyTo(dev::bytesRef out)";
}

Beispiel #15

0

Datei anzeigen

Datei: globalfactory.cpp Projekt: s-faychatelard/API

// Returns the component stored in idPool under hash32(id).
// Throws a char* message ("The component <id> doesn't exist") when the lookup
// yields NULL.
Component* GlobalFactory::getComponentById(char* id)
{
    Component* found = (Component*)hashTableLookup(&idPool, hash32(id, (int)strlen(id)));
    if (found != NULL)
        return found;

    // Not registered: assemble the error text piece by piece and throw it.
    char* text = (char*)"The component ";
    text = stringInsert(text, id, (int)strlen(text));
    text = stringInsert(text, " doesn't exist", (int)strlen(text));

    throw text;
}

Beispiel #16

0

Datei anzeigen

Datei: globalfactory.cpp Projekt: s-faychatelard/API

// Returns the list stored in typePool under hash32(type).
// Throws a char* message ("No component typed <type> exists for now") when
// the lookup yields NULL.
List* GlobalFactory::getComponentsByType(char* type)
{
    List* list = (List*)hashTableLookup(&typePool, hash32(type, (int)strlen(type)));
    if (list == NULL)
    {
        char* message = (char*)"No component typed ";
        message = stringInsert(message, type, (int)strlen(message));
        // The leading space keeps the type name from running straight into
        // "exists" (the message used to read "...typed fooexists for now").
        message = stringInsert(message, " exists for now", (int)strlen(message));
        
        throw message;
    }
    return list;
}

Beispiel #17

0

Datei anzeigen

Datei: cpu_riscv_func.cpp Projekt: sergeykhbr/riscv_vhdl

// Finds the instruction object that accepts the given payload.
// The first payload word selects a bucket of listInstr_ via hash32(); the
// bucket's candidates are tried in order and the first one whose parse()
// succeeds is returned. Returns NULL when no candidate accepts the payload.
IInstruction *CpuRiscV_Functional::decodeInstruction(uint32_t *rpayload) {
    int bucket = hash32(rpayload[0]);
    for (unsigned n = 0; n < listInstr_[bucket].size(); n++) {
        IInstruction *candidate = static_cast<IInstruction *>(
                        listInstr_[bucket][n].to_iface());
        if (candidate->parse(rpayload)) {
            return candidate;
        }
    }
    return NULL;
}

Beispiel #18

0

Datei anzeigen

Datei: Catdb.cpp Projekt: harkhuang/open-source-search-engine

// . compute the key range [startKey,endKey] that covers every SiteRec of
//   "url"'s domain
// . key layout:
// . dddddddd dddddddd dddddddd dddddddd  d = domain hash
// . uuuuuuuu uuuuuuuu uuuuuuuu uuuuuuuu  u = url hash
// . uuuuuuuu uuuuuuuu uuuuuuuu uuuuuuuu
// . having the domain hash as the top 32 bits clusters all SiteRecs of one
//   domain together on the same machine
// . either output pointer may be NULL, in which case it is not written
void Catdb::getKeyRange ( bool useIp , Url *url,
                          key_t *startKey , key_t *endKey ) {
    // complain if asked to key by ip for a url that has none
    if ( useIp && ! url->hasIp() )
        log(LOG_LOGIC,"db: tagdb: getKeyRange: useIp is true, "
            "but url has no ip");

    // . domHash becomes the upper 32 bits of both keys
    unsigned long domHash;
    if ( ! useIp )
        // canonical domain name
        domHash = hash32 (url->getDomain(), url->getDomainLen());
    else {
        // . htonl() so the most significant byte comes first
        // . only the first 3 bytes of the ip domain are hashed
        long ipdom = htonl(url->getIpDomain());
        domHash = hash32 ( (char *)&ipdom , 3 ) ;
    }

    // the collection is no longer folded into the hash:
    //domHash = hash32 ( coll , collLen , domHash );

    // both keys share the domain hash and differ only in the low 64 bits
    key_t k;
    k.n1 = domHash;
    // startKey: all low bits clear
    k.n0 = 0x0000000000000000LL;
    if ( startKey ) *startKey = k;
    // endKey: all low bits set
    k.n0 = 0xffffffffffffffffLL;
    if ( endKey   ) *endKey   = k;
}

Beispiel #19

0

Datei anzeigen

Datei: AutoBan.cpp Projekt: RevBooyah/open-source-search-engine

// . parse g_conf.m_validCodes and add one CodeVal per client code to m_ht,
//   keyed by hash32 of the code
// . a code is a run of non-whitespace chars; spaces/tabs after it may be
//   followed by a number, which becomes the code's max outstanding limit
//   (default 5000)
// . returns false as soon as a code cannot be added, true otherwise
bool AutoBan::setCodesFromConf() {

	static bool s_firstTime = true;
	m_codeResetTime = getTime();
	char *p = g_conf.m_validCodes;
	while(*p) {
		if(!isspace(*p)) {
			long len = 0;
			while(p[len] && !isspace(p[len])) len++;
			//now p points to a code, with length len.
			//log(LOG_WARN, "autoban code is %s %li", p, len);
			long h = hash32(p,len);
			CodeVal cv;
			// only the first 30 chars of the code are stored
			long max = len;	if ( max > 30 ) max = 30;
			strncpy(cv.m_code,p,max);
			cv.m_code[max]='\0';
			cv.m_ip             = 0;
			cv.m_count          = 0;
			cv.m_bytesSent      = 0;
			cv.m_bytesRead      = 0;
			// we might be doing an update, so only set this
			// count to 0 the first time we are called on startup
			// NOTE(review): cv is a fresh local, so on later calls
			// m_outstanding is whatever CodeVal's default
			// initialization leaves -- confirm how addKey treats it
			if ( s_firstTime )
				cv.m_outstanding = 0;
			cv.m_maxEver        = 0;
			cv.m_maxOutstanding = 5000;
			//m_numCodes++;
			p += len;
			// skip spaces or tabs
			while ( *p == ' ' || *p == '\t' ) p++;
			// do we got a number? that is the max outstanding cnt
			if ( is_digit ( *p ) )
				cv.m_maxOutstanding = atoi(p);
			// just warn about suspiciously low limits
			if ( cv.m_maxOutstanding < 10 ) 
				log("gb: client code %s has LOW max "
				    "outstanding limit of %li",
				    cv.m_code,cv.m_maxOutstanding);
			// skip the digits, until we hit \r or \n
			while ( is_digit ( *p ) ) p++;
			// now add it
			if ( ! m_ht.addKey ( h , cv ) ) return false;
		}
		// . step over the whitespace/delimiter char we are sitting on
		// . but never step over the terminating \0: if the string
		//   ends right after a code (or its number) p already points
		//   at the terminator here, and advancing past it would make
		//   the loop read beyond the end of the buffer
		if ( *p ) p++;
	}
	s_firstTime = false;
	return true;
}

Beispiel #20

0

Datei anzeigen

Datei: renderer.cpp Projekt: oviano/gdi2ft

GDI2FT_RENDERER::GDI2FT_RENDERER( const GDI2FT_CONTEXT& _context )
/* --------------------------------------------------------------------------------
	Binds the renderer to a context, picks LCD render mode, reads the DC's
	character extra spacing and derives font_trait as a hash of the context's
	log_font.
-------------------------------------------------------------------------------- */
{
	context = &_context;
	render_mode = FT_RENDER_MODE_LCD;
	char_extra = GetTextCharacterExtra( context->hdc );

	// number of log_font bytes fed to the hash: everything except the face name
	// buffer, plus the bytes of the face name actually in use (no terminator)
	const int bytes_before_facename = sizeof( context->log_font ) - sizeof( context->log_font.lfFaceName );
	const int facename_bytes = static_cast<int>( wcslen( context->log_font.lfFaceName ) * sizeof( wchar_t ) );
	const int hashed_bytes = bytes_before_facename + facename_bytes;

#ifdef _M_X64
	font_trait = hash64( &context->log_font, hashed_bytes, 0 );
#else
	font_trait = hash32( &context->log_font, hashed_bytes, 0 );
#endif // _M_X64
}

Beispiel #21

0

Datei anzeigen

Datei: print_urlinfo.cpp Projekt: privacore/open-source-search-engine

int main(int argc, char **argv) {
	if (argc < 2) {
		print_usage(argv[0]);
		return 1;
	}

	if (strcmp(argv[1], "--h") == 0 || strcmp(argv[1], "--help") == 0 ) {
		print_usage(argv[0]);
		return 1;
	}

	// initialize library
	g_mem.init();
	hashinit();

	g_conf.init(NULL);

	g_log.m_logPrefix = false;

	const char *input = argv[1];
	size_t inputLen = strlen(input);

	Url url;
	url.set(input, inputLen);
	url.print();
	logf(LOG_TRACE, "\t");

	SiteGetter sg;
	sg.getSite(input, NULL, 0, 0, 0);
	logf(LOG_TRACE, "Site info");
	logf(LOG_TRACE, "\tsite         : %.*s", sg.getSiteLen(), sg.getSite());
	logf(LOG_TRACE, "\tsitehash32   : %" PRIx32, hash32(sg.getSite(), sg.getSiteLen(), 0));
	logf(LOG_TRACE, "\t");

	uint64_t probableDocId = Titledb::getProbableDocId(&url);
	logf(LOG_TRACE, "Document info");
	logf(LOG_TRACE, "\tprobabledocid      : %" PRIu64, probableDocId);
	logf(LOG_TRACE, "\tfirstprobabledocid : %" PRIu64, Titledb::getFirstProbableDocId(probableDocId));
	logf(LOG_TRACE, "\tlastprobabledocid  : %" PRIu64, Titledb::getLastProbableDocId(probableDocId));

	return 0;
}

Beispiel #22

0

Datei anzeigen

Datei: fp_tcp.c Projekt: yersinia/p0f

/* Fill a TCP signature from parsed packet data: hash the option layout, copy
   the per-packet fields over, stamp the receive time and clear match state. */

static void packet_to_sig(struct packet_data* pk, struct tcp_sig* ts) {

  /* Seeded hash over the option layout bytes. */
  ts->opt_hash    = hash32(pk->opt_layout, pk->opt_cnt, hash_seed);

  /* IP-level fields. */
  ts->ip_ver      = pk->ip_ver;
  ts->ip_opt_len  = pk->ip_opt_len;
  ts->ttl         = pk->ttl;
  ts->tot_hdr     = pk->tot_hdr;

  /* TCP-level fields. */
  ts->quirks      = pk->quirks;
  ts->opt_eol_pad = pk->opt_eol_pad;
  ts->mss         = pk->mss;
  ts->win         = pk->win;
  ts->win_type    = WIN_TYPE_NORMAL; /* Keep as-is. */
  ts->wscale      = pk->wscale;
  ts->ts1         = pk->ts1;

  /* Payload class is just "has payload or not". */
  ts->pay_class   = (pk->pay_len != 0);

  ts->recv_ms     = get_unix_time_ms();

  /* Nothing matched yet. */
  ts->matched     = NULL;
  ts->fuzzy       = 0;
  ts->dist        = 0;

}

Beispiel #23

0

Datei anzeigen

Datei: Msg0.cpp Projekt: lemire/open-source-search-engine

// . THIS Msg0 class must be alloc'd, i.e. not on the stack, etc.
// . if list is stored locally this tries to get it locally
// . otherwise tries to get the list from the network
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . NOTE: i was having problems with queries being cached too long, you
//   see the cache here is a NETWORK cache, so when the machines that owns
//   the list updates it on disk it can't flush our cache... so use a small
//   maxCacheAge of like , 30 seconds or so...
// . overall flow of the body:
//   1. validate args and remember the request in members
//   2. pick the shard (m_shardNum) that owns startKey
//   3. if the data is local, read it through Msg5 and return
//   4. otherwise serialize the request into m_request and either send it to
//      the single host "hostId" (if not -1) or multicast it to the shard
// . NOTE: the "ip" and "doMerge" parameters are not read anywhere in this
//   body, and the "port" parameter is shadowed by a local before any use
bool Msg0::getList ( int64_t hostId      , // host to ask (-1 if none)
		     int32_t      ip          , // info on hostId
		     int16_t     port        ,
		     int32_t      maxCacheAge , // max cached age in seconds
		     bool      addToCache  , // add net recv'd list to cache?
		     char      rdbId       , // specifies the rdb
		     collnum_t collnum ,
		     RdbList  *list        ,
		     const char     *startKey    ,
		     const char     *endKey      ,
		     int32_t      minRecSizes ,  // use -1 for no max
		     void     *state       ,
		     void    (* callback)(void *state ),//, RdbList *list ) ,
		     int32_t      niceness    ,
		     bool      doErrorCorrection ,
		     bool      includeTree ,
		     bool      doMerge     ,
		     int32_t      firstHostId   ,
		     int32_t      startFileNum  ,
		     int32_t      numFiles      ,
		     int64_t      timeout       ,
		     int64_t syncPoint     ,
		     int32_t      preferLocalReads ,
		     Msg5     *msg5             ,
		     bool      isRealMerge      ,
		     bool      allowPageCache    ,
		     bool      forceLocalIndexdb ,
		     bool      noSplit ,
		     int32_t      forceParitySplit  ) {
	logTrace( g_conf.m_logTraceMsg0, "BEGIN. hostId: %" PRId64", rdbId: %d", hostId, (int)rdbId );

	// warning
	if ( collnum < 0 ) log(LOG_LOGIC,"net: NULL collection. msg0.");

	// reset the list they passed us
	list->reset();
	// get keySize of rdb
	m_ks = getKeySizeFromRdbId ( rdbId );
	
//	if( g_conf.m_logTraceMsg0 ) 
//	{
//		log("%s:%s:%d: rdbId. [%d]", __FILE__,__func__,__LINE__, (int)rdbId);
//		log("%s:%s:%d: m_ks.. [%d]", __FILE__,__func__,__LINE__, (int)m_ks);
//		log("%s:%s:%d: hostId [%" PRId64"]", __FILE__,__func__,__LINE__, hostId);
//	}

	// if startKey > endKey, don't read anything
	//if ( startKey > endKey ) return true;
	// an inverted key range is treated as a logic error: the write through
	// a NULL pointer below deliberately crashes the process
	if ( KEYCMP(startKey,endKey,m_ks)>0 ) { char *xx=NULL;*xx=0; }//rettrue
	// . reset hostid if it is dead
	// . this is causing UOR queries to take forever when we have a dead
	if ( hostId >= 0 && g_hostdb.isDead ( hostId ) ) hostId = -1;
	// no longer accept negative minrecsize
	// (despite the "use -1 for no max" note on the parameter: a negative
	// value sets g_errno, logs and then crashes on the NULL write)
	if ( minRecSizes < 0 ) {
		g_errno = EBADENGINEER;
		logTrace( g_conf.m_logTraceMsg0, "END" );

		log(LOG_LOGIC, "net: msg0: Negative minRecSizes no longer supported.");
		char *xx=NULL;*xx=0;
	}

	// remember these
	m_state         = state;
	m_callback      = callback;
	m_list          = list;
	m_hostId        = hostId;
	m_niceness      = niceness;
	m_addToCache    = addToCache;
	// . these define our request 100%
	KEYSET(m_startKey,startKey,m_ks);
	KEYSET(m_endKey,endKey,m_ks);
	m_minRecSizes   = minRecSizes;
	m_rdbId         = rdbId;
	m_collnum = collnum;//          = coll;
	m_isRealMerge   = isRealMerge;
	m_allowPageCache = allowPageCache;

	// . group to ask is based on the first key 
	// . we only do 1 group per call right now
	// . groupMask must turn on higher bits first (count downwards kinda)
	// . titledb and spiderdb use special masks to get groupId

	// if diffbot.cpp is reading spiderdb from each shard we have to
	// get groupid from hostid here lest we core in getGroupId() below.
	// it does that for dumping spiderdb to the client browser. they
	// can download the whole enchilada.
	if ( hostId >= 0 && m_rdbId == RDB_SPIDERDB )
		m_shardNum = 0;
	// did they force it? core until i figure out what this is
	else if ( forceParitySplit >= 0 ) 
		//m_groupId =  g_hostdb.getGroupId ( forceParitySplit );
		m_shardNum = forceParitySplit;
	else
		//m_groupId = getGroupId ( m_rdbId , startKey , ! noSplit );
		m_shardNum = getShardNum ( m_rdbId , startKey );

	// if we are looking up a termlist in posdb that is split by termid and
	// not the usual docid then we have to set this posdb key bit that tells
	// us that ...
	// (this overrides whatever shard the if/else chain above selected)
	if ( noSplit && m_rdbId == RDB_POSDB )
		m_shardNum = g_hostdb.getShardNumByTermId ( startKey );

	// how is this used?
	if ( forceLocalIndexdb ) m_shardNum = getMyShardNum();


//	if( g_conf.m_logTraceMsg0 ) log("%s:%s:%d: shardNum [%" PRId32"]", __FILE__,__func__, __LINE__, m_shardNum);


	// . store these parameters
	// . get a handle to the rdb in case we can satisfy locally
	// . returns NULL and sets g_errno on error
	QUICKPOLL((m_niceness));
	Rdb *rdb = getRdbFromId ( m_rdbId );
	if ( ! rdb ) return true;
	// we need the fixedDataSize
	m_fixedDataSize = rdb->getFixedDataSize();
	m_useHalfKeys   = rdb->useHalfKeys();
	// . debug msg
	// . Msg2 does this when checking for a cached compound list.
	//   compound lists do not actually exist, they are merges of smaller
	//   UOR'd lists.
	if ( maxCacheAge != 0 && ! addToCache && (numFiles > 0 || includeTree)) {
		log( LOG_LOGIC, "net: msg0: Weird. check but don't add... rdbid=%" PRId32".", ( int32_t ) m_rdbId );
	}

	// set this here since we may not call msg5 if list not local
	//m_list->setFixedDataSize ( m_fixedDataSize );

	// . now that we do load balancing we don't want to do a disk lookup
	//   even if local if we are merging or dumping
	// . UNLESS g_conf.m_preferLocalReads is true
	if ( preferLocalReads == -1 ) 
		preferLocalReads = g_conf.m_preferLocalReads;

	// . always prefer local for full split clusterdb
	// . and keep the tfndb/titledb lookups in the same stripe
	// . so basically we can't do biased caches if fully split
	//if ( g_conf.m_fullSplit ) preferLocalReads = true;
	// NOTE: this unconditional assignment overrides both the parameter and
	// the g_conf default picked just above, so the "! preferLocalReads"
	// check below can never clear isLocal
	preferLocalReads = true;

	// is it stored locally?
	bool isLocal = ( m_hostId == -1 && //g_hostdb.m_groupId == m_groupId );
			 m_shardNum == getMyShardNum() );
	// only do local lookups if this is true
	if ( ! preferLocalReads ) isLocal = false;

	/*
	int64_t singleDocIdQuery = 0LL;
	if ( rdbId == RDB_POSDB ) {
		int64_t d1 = g_posdb.getDocId(m_startKey);
		int64_t d2 = g_posdb.getDocId(m_endKey);
		if ( d1+1 == d2 ) singleDocIdQuery = d1;
	}

	// . try the LOCAL termlist cache
	// . so when msg2 is evaluating a gbdocid:| query and it has to
	//   use msg0 to go across the network to get the same damn termlist
	//   over and over again for the same docid, this will help alot.
	// . ideally it'd be nice if the seo pipe in xmldoc.cpp can try to
	//   send the same gbdocid:xxxx docids to the same hosts. maybe hash
	//   based on docid into the list of hosts and if that host is busy
	//   just chain until we find someone not busy.
	if ( singleDocIdQuery &&
	     getListFromTermListCache ( coll,
					m_startKey,
					m_endKey,
					maxCacheAge,
					list ) )
		// found!
		return true;
	*/

	// but always local if only one host
	if ( g_hostdb.getNumHosts() == 1 ) isLocal = true;

	// . if the group is local then do it locally
	// . Msg5::getList() returns false if blocked, true otherwise
	// . Msg5::getList() sets g_errno on error
	// . don't do this if m_hostId was specified
	if ( isLocal ) {
		logTrace( g_conf.m_logTraceMsg0, "isLocal" );

		// use the caller's Msg5 if one was supplied, otherwise
		// allocate our own and remember (m_deleteMsg5) that we own it
		if ( msg5 ) {
			m_msg5 = msg5;
			m_deleteMsg5 = false;
		}
		else {
			try { m_msg5 = new ( Msg5 ); } 
			catch ( ... ) {
				// allocation failed: fall through to the
				// network request path at "skip" below
				g_errno = ENOMEM;
				log("net: Local alloc for disk read failed "
				    "while tring to read data for %s. "
				    "Trying remote request.",
				    getDbnameFromId(m_rdbId));
				goto skip;
			}
			mnew ( m_msg5 , sizeof(Msg5) , "Msg0::Msg5" );
			m_deleteMsg5 = true;
		}

		QUICKPOLL(m_niceness);
		if ( ! m_msg5->getList ( rdbId,
					 m_collnum ,
					 m_list ,
					 m_startKey ,
					 m_endKey   ,
					 m_minRecSizes ,
					 includeTree   , // include Tree?
					 addToCache    , // addToCache?
					 maxCacheAge   ,
					 startFileNum  , 
					 numFiles      ,
					 this ,
					 gotListWrapper2   ,
					 niceness          ,
					 doErrorCorrection ,
					 NULL , // cacheKeyPtr
					 0    , // retryNum
					 -1   , // maxRetries
					 true , // compensateForMerge
					 syncPoint ,
					 m_isRealMerge ,
					 m_allowPageCache ) ) {
			logTrace( g_conf.m_logTraceMsg0, "END, return false" );
			return false;
		}

		// nuke it
		reset();
		logTrace( g_conf.m_logTraceMsg0, "END, return true" );
		return true;
	}
skip:
	// debug msg
	if ( g_conf.m_logDebugQuery )
		log(LOG_DEBUG,"net: msg0: Sending request for data to "
		    "shard=%" PRIu32" "
		    "listPtr=%" PTRFMT" minRecSizes=%" PRId32" termId=%" PRIu64" "
		    //"startKey.n1=%" PRIx32",n0=%" PRIx64" (niceness=%" PRId32")",
		    "startKey.n1=%" PRIx64",n0=%" PRIx64" (niceness=%" PRId32")",
		    //g_hostdb.makeHostId ( m_groupId ) ,
		    m_shardNum,
		    (PTRTYPE)m_list,
		    m_minRecSizes, g_posdb.getTermId(m_startKey) , 
		    //m_startKey.n1,m_startKey.n0 , (int32_t)m_niceness);
		    KEY1(m_startKey,m_ks),KEY0(m_startKey),
		    (int32_t)m_niceness);

	// no reply buffer is supplied to either send call below (NULL / size 0)
	char *replyBuf = NULL;
	int32_t  replyBufMaxSize = 0;
	bool  freeReply = true;

	// . make a request with the info above (note: not in network order)
	// . IMPORTANT!!!!! if you change this change 
	//   Multicast.cpp::sleepWrapper1 too!!!!!!!!!!!!
	//   no, not anymore, we commented out that request peeking code
	// . serialized layout, in order: syncPoint(8) minRecSizes(4)
	//   startFileNum(4) numFiles(4) maxCacheAge(4) rdbId(1) addToCache(1)
	//   doErrorCorrection(1) includeTree(1) niceness(1) allowPageCache(1)
	//   startKey(m_ks) endKey(m_ks) collnum(sizeof(collnum_t))
	char *p = m_request;
	*(int64_t *) p = syncPoint        ; p += 8;
	//*(key_t     *) p = m_startKey       ; p += sizeof(key_t);
	//*(key_t     *) p = m_endKey         ; p += sizeof(key_t);
	*(int32_t      *) p = m_minRecSizes    ; p += 4;
	*(int32_t      *) p = startFileNum     ; p += 4;
	*(int32_t      *) p = numFiles         ; p += 4;
	*(int32_t      *) p = maxCacheAge      ; p += 4;
	// sanity check: the rdbId byte must land at RDBIDOFFSET, else crash
	if ( p - m_request != RDBIDOFFSET ) { char *xx=NULL;*xx=0; }
	*p               = m_rdbId          ; p++;
	*p               = addToCache       ; p++;
	*p               = doErrorCorrection; p++;
	*p               = includeTree      ; p++;
	*p               = (char)niceness   ; p++;
	*p               = (char)m_allowPageCache; p++;
	KEYSET(p,m_startKey,m_ks);          ; p+=m_ks;
	KEYSET(p,m_endKey,m_ks);            ; p+=m_ks;
	// NULL terminated collection name
	//strcpy ( p , coll ); p += gbstrlen ( coll ); *p++ = '\0';
	*(collnum_t *)p = m_collnum; p += sizeof(collnum_t);
	m_requestSize    = p - m_request;
	// ask an individual host for this list if hostId is NOT -1
	if ( m_hostId != -1 ) {
		// get Host
		Host *h = g_hostdb.getHost ( m_hostId );
		if ( ! h ) { 
			g_errno = EBADHOSTID; 
			log(LOG_LOGIC,"net: msg0: Bad hostId of %" PRId64".", m_hostId);
			logTrace( g_conf.m_logTraceMsg0, "END, return true. Bad hostId" );
			return true;
		}
		
		// if niceness is 0, use the higher priority udpServer
		// NOTE: this local "port" shadows the "port" parameter; the
		// destination port always comes from the Host record
		UdpServer *us ;
		uint16_t port;
		QUICKPOLL(m_niceness);

		us = &g_udpServer ; port = h->m_port ; 
		// . returns false on error and sets g_errno, true otherwise
		// . calls callback when reply is received (or error)
		// . we return true if it returns false
		// . NOTE(review): "timeout" is passed unscaled here but as
		//   timeout*1000 to Multicast::send() further down -- confirm
		//   the units each callee expects
		if ( ! us->sendRequest ( m_request     ,
					 m_requestSize ,
					 0x00          , // msgType
					 h->m_ip       ,
					 port          ,
					 m_hostId      ,
					 NULL          , // the slotPtr
					 this          ,
					 gotSingleReplyWrapper ,
					 timeout       ,
					 -1            , // backoff
					 -1            , // maxwait
					 replyBuf      ,
					 replyBufMaxSize ,
					 m_niceness     ) ) { // cback niceness
			logTrace( g_conf.m_logTraceMsg0, "END, return true. Request sent" );
			return true;
		}
		
		// return false cuz it blocked
		logTrace( g_conf.m_logTraceMsg0, "END, return false. sendRequest blocked" );
		return false;
	}
	// timing debug
	if ( g_conf.m_logTimingNet )
		m_startTime = gettimeofdayInMilliseconds();
	else
		m_startTime = 0;

	// . get the top int32_t of the key
	// . i guess this will work for 128 bit keys... hmmmmm
	// . (what is computed here is a 32-bit hash over all m_ks bytes of
	//   startKey; it is handed to the multicast as its key below)
	int32_t keyTop = hash32 ( (char *)startKey , m_ks );

	// . otherwise, multicast to a host in group "groupId"
	// . returns false and sets g_errno on error
	// . calls callback on completion
	// . select first host to send to in group based on upper 32 bits
	//   of termId (m_startKey.n1)
	// . need to send out to all the indexdb split hosts
	m_numRequests = 0;
	m_numReplies  = 0;
	//for ( int32_t i = 0; i < m_numSplit; i++ ) {

	QUICKPOLL(m_niceness);
	//int32_t gr;
	char *buf;
	buf = replyBuf;

	// get the multicast
	Multicast *m = &m_mcast;

        if ( ! m->send ( m_request    , 
			      m_requestSize,
			      0x00         , // msgType 0x00
			      false        , // does multicast own request?
			 m_shardNum ,
			      false        , // send to whole group?
			      //m_startKey.n1, // key is passed on startKey
			      keyTop       , // key is passed on startKey
			      this         , // state data
			      NULL         , // state data
			      gotMulticastReplyWrapper0 ,
			      timeout*1000 , // timeout
			      niceness     ,
			      firstHostId  ,
			      buf             ,
			      replyBufMaxSize ,
			      freeReply       , // free reply buf?
			      true            , // do disk load balancing?
			      maxCacheAge     ,
			      //(key_t *)cacheKey        ,
			      // multicast uses it for determining the best
			      // host to send the request to when doing 
			      // disk load balancing. if the host has our 
			      // data cached, then it will probably get to
			      // handle the request. for now let's just assume
			      // this is a 96-bit key. TODO: fix...
			 0 , // *(key_t *)cacheKey        ,
			      rdbId           ,
			      minRecSizes     ) ) 
	{
		log(LOG_ERROR, "net: Failed to send request for data from %s in shard "
		    "#%" PRIu32" over network: %s.",
		    getDbnameFromId(m_rdbId),m_shardNum, mstrerror(g_errno));
		// but speed it up
		m_errno = g_errno;
		m->reset();
		// m_numRequests was zeroed above and is only incremented after
		// a successful send, so it is still 0 on this failure path
		if ( m_numRequests > 0 ) {
			logTrace( g_conf.m_logTraceMsg0, "END - returning false" );
			
			return false;
		}

		logTrace( g_conf.m_logTraceMsg0, "END - returning true" );
		return true;
	}

	m_numRequests++;

	// we blocked
	logTrace( g_conf.m_logTraceMsg0, "END - returning false, blocked" );
	return false;
}

Beispiel #24

0

Datei anzeigen

Datei: fnv1hash.hpp Projekt: tmacam/CrawlingBeast

	/// Convenience overload: hashes the characters of a string by handing
	/// its buffer and size to the (pointer, length) overload of hash32.
	inline size_t hash32(const string& key)
	{
		const char* const bytes = key.c_str();
		return hash32(bytes, key.size());
	}

Beispiel #25

0

Datei anzeigen

Datei: math_tools.cpp Projekt: mkuitune/glhack

/// Hashes the characters of a std::string by forwarding its buffer and size
/// to the (pointer, length) overload of hash32.
uint32_t hash32(const std::string& string)
{
    const std::string::size_type length = string.size();
    return hash32(string.c_str(), length);
}

Beispiel #26

0

Datei anzeigen

Datei: filterquerylogs.cpp Projekt: DeadNumbers/open-source-search-engine

// . returns -1 on error, 0 on success
// . reads the query log whose filename is given as the only argument,
//   pulls the "q=" query out of every logged url, normalizes it and writes
//   one "<ip truncated at its third dot> <query>" line per kept query to
//   stdout
// . queries containing a double quote (%22), a pipe or a colon (%3a/%3A),
//   queries longer than 150 chars and queries whose hash is still in the
//   ring of recent hashes are dropped
// . originally, we read from stdin, but popen was causing problems when called
//   from a thread on linux 2.4.17 with the old linux threads
int main ( int argc , char *argv[] ) {

	// should have one and only 1 arg (excluding filename)
	if ( argc != 2 ) {
		fprintf(stderr,"usage: fql <querylogfilename1>..."
			"<querylogfilenameN>\n");
		return -1;
	}

	// each log file should be <= 2GB. one extra byte is allocated so the
	// data can always be NUL-terminated for the scans below.
	char *buf = (char *)malloc ( (size_t)MAX_READ_SIZE + 1 );
	if ( ! buf ) {
		fprintf(stderr,"fql:malloc:%li: %s\n",
			(long)MAX_READ_SIZE,strerror(errno)); 
		return -1;
	}


	// seed with same value so we get same rand sequence for all
	srand ( 1945687 );
	for ( int32_t i = 0 ; i < 256 ; i++ )
		for ( int32_t j = 0 ; j < 256 ; j++ ) {
			g_hashtab [i][j]  = (uint64_t)rand();
			// the top bit never gets set, so fix
			if ( rand() > (0x7fffffff / 2) ) 
				g_hashtab[i][j] |= 0x80000000;
			g_hashtab [i][j] <<= 32;
			g_hashtab [i][j] |= (uint64_t)rand();
			// the top bit never gets set, so fix
			if ( rand() > (0x7fffffff / 2) ) 
				g_hashtab[i][j] |= 0x80000000;
		}
	// the seeded table must start with the expected value. this is an
	// error, so report it with -1 (a plain "return false" would have
	// made main() exit with 0, i.e. success) and release the buffer.
	if ( g_hashtab[0][0] != 6720717044602784129LL ) {
		fprintf(stderr,"fql: hash table did not seed as expected\n");
		free ( buf );
		return -1;
	}


	fprintf(stderr,"fql: reading %s\n", argv[1]);

	// first and only arg is the input file to read from
	int fd = open ( argv[1] , O_RDONLY );
	if ( fd < 0 ) {
		fprintf(stderr,"fql:open: %s: %s\n",
			argv[1],strerror(errno)); 
		free ( buf );
		return -1;
	}

	int n = read ( fd , buf , MAX_READ_SIZE );
	// remember why read() failed before close()/fprintf() can change errno
	int readErrno = errno;

	close ( fd );

	fprintf(stderr,"fql: done reading %s\n", argv[1]);

	// return -1 on read error
	if ( n < 0 ) {
		fprintf(stderr,"fql:fread: %s\n",strerror(readErrno)); 
		free ( buf );
		return -1;
	}

	// warn if the doc was bigger than expected
	if ( n >= (int32_t)MAX_READ_SIZE ) 
		fprintf(stderr,"fql: WARNING: MAX_READ_SIZE "
			"needs boost\n");
	// if nothing came in then nothing goes out, we're done
	if ( n == 0 ) { free ( buf ) ; return 0; }

	// terminate what we read: every scan below stops on a NUL byte, and
	// read() does not write one
	char *bufEnd = buf + n;
	*bufEnd = '\0';

	// ring of the most recently seen query hashes, used to drop repeats
	int32_t hashes[MAX_HASHES];
	memset ( hashes, 0 , sizeof(hashes) );
	int32_t nh = 0;

	// parse out query from each url. "p < bufEnd" is needed on top of
	// "*p" because the body can leave p ON the terminator (p += 3 or
	// p = dst), after which the loop's p++ would step over it.
	char *p = buf;
	for ( ; p < bufEnd && *p ; p++ ) {
		if ( p[0] != '?' && p[0] != '&' ) continue;
		if ( p[1] != 'q' ) continue;
		if ( p[2] != '=' ) continue;
		p += 3;
		// mark the end
		char *end = p;
		bool good = true;
		for ( ; *end && *end!='&' && *end!='\n' && *end!=' '; end++ ) {
			// double quote?
			if ( *end == '%' &&
			     end[1] == '2' &&
			     end[2] == '2' ) {
				good = false;
				break;
			}
			// colon or pipe operators, ignore
			if ( *end == '|') {
				good = false;
				break;
			}
			if ( *end == '%' &&
			     end[1] == '3' &&
			     end[2] == 'a' ) {
				good = false;
				break;
			}
			if ( *end == '%' &&
			     end[1] == '3' &&
			     end[2] == 'A' ) {
				good = false;
				break;
			}

		}
		// filter out?
		if ( ! good ) continue;
		// limit size. 150 is too big.
		if ( end - p > 150 ) continue;

		// scan backwards over three space/tab separated fields to
		// find the ip
		char *ips = p;
		for ( ; ips>buf && *ips != ' ' && *ips != '\t' ; ips-- ); 
		if ( ips>buf ) ips--;
		for ( ; ips>buf && *ips != ' ' && *ips != '\t' ; ips-- ); 
		char *ipend = ips;
		if ( ips>buf ) ips--;
		for ( ; ips>buf && *ips != ' ' && *ips != '\t' ; ips-- );
		ips++;
		// should be ip now!
		int32_t iplen = ipend - ips;
		//int32_t uip = atoip(ips,ipend-ips);
		//if ( ! uip ) continue;
		// must be ip #. cast first: isdigit() on a negative char is
		// undefined.
		if ( !isdigit((unsigned char)ips[0]) ) continue;

		// replace comma with '+' (the encoded space)
		for ( char *r = p ; r < end ; r++ ) {
			if ( *r == ',' ) *r = '+';
		}

		// decode "%20" to '+' in place, compacting the query
		char *dst2 = p;
		for ( char *r = p ; r < end ; r++ ) {
			*dst2 = *r;
			if ( *r == '%' &&
			     r[1] == '2' &&
			     r[2] == '0' ) {
				*dst2 = '+';
				r += 2;
			}
			dst2++;
		}
		end = dst2;

		char *query = p;
		// filter out leading and back to back spaces ('+')
		char *dst = p;
		bool lastWasSpace = false;
		for ( char *x = p ; x < end ; x++ ) {
			// skip back to back spaces
			if ( *x == '+' && lastWasSpace ) continue;
			// skip initial spaces
			if ( x == p && *x == '+' ) {
				lastWasSpace = true;
				continue;
			}
			*dst++ = *x;
			if      ( *x == '+' ) lastWasSpace = true;
			else                  lastWasSpace = false;
		}
		// null term the overwritten buffer
		*dst = '\0';
		// get the length of the query
		int32_t queryLen = dst - p;
		// skip that for the for loop
		p = dst;
		// skip empty queries
		if ( queryLen==0 ) continue;
		// hash it up
		int32_t h = hash32(query,queryLen);
		for ( int32_t i = 0 ; i < MAX_HASHES ; i++ ) {
			if ( hashes[i] == h ) { good = false; break; }
		}
		// remember it even if it was a repeat
		hashes[nh] = h;
		// inc and wrap
		if ( ++nh >= MAX_HASHES ) nh = 0;
		// filter out?
		if ( ! good ) continue;
		// cblock it: cut the ip off at its third dot
		char dotCount = 0;
		for ( int32_t k = 0 ; k < iplen ; k++ ) {
			if ( ips[k] != '.' ) continue;
			if ( ++dotCount < 3 ) continue;
			ips[k] = '\0';
			break;
		}
		if ( dotCount != 3 ) continue;
		// print ip 
		//ips[iplen] = '\0';
		// write that out
		fprintf(stdout,"%s %s\n",ips,query);
	}

	free ( buf );

	return 0;
}

Beispiel #27

0

Datei anzeigen

Datei: Blaster.cpp Projekt: alvinlai/open-source-search-engine

// Returns true when `link` must be left out of the doc1/doc2 comparison:
// it contains `domain` (the domain of the url the links were extracted for)
// or one of the search-engine / cache patterns listed below.
static bool isSkippedBlasterLink ( const char *link , const char *domain ) {
	static const char * const s_skipPatterns[] = {
		"google."          ,
		"cache:"           , // googles cache page
		"gigablast."       ,
		"web.archive.org"  , // older copies on gigablast
		"search.yahoo.com" , // from gigablast search
		"search.msn.com"   , // from gigablast search
		"s.teoma.com"      , // from gigablast search
		"search.dmoz.org"  , // from gigablast search
		"www.answers.com"  , // from gigablast search
		"cc.msncache.com"    // msn's cache page
	};
	if ( strstr ( link , domain ) ) return true;
	const long numPatterns =
		(long)(sizeof(s_skipPatterns) / sizeof(s_skipPatterns[0]));
	for ( long i = 0 ; i < numPatterns ; i++ )
		if ( strstr ( link , s_skipPatterns[i] ) ) return true;
	return false;
}

// . called with the reply for the second url (st->m_u2) of a StateBD; the
//   reply for the first url is expected in st->m_buf1/st->m_buf1Len
// . extracts the links of both replies, and for every link of doc1 that is
//   not among the links of doc2 either just logs it (m_justDisplay) or
//   starts a download of it with a StateBD2 pointing back at st
// . releases st and decrements m_launched itself when there is nothing
//   left to wait for
void Blaster::gotDoc2 ( void *state, TcpSocket *s){
	StateBD *st=(StateBD *)state;
	// bail if got cut off
	if ( s->m_readOffset == 0 ) {
		log("blaster: Lost the Request in gotDoc2");
		m_launched--;
		//No need to point p2
		// Free stateBD
		freeStateBD(st);
		return;
	}
	
	// . don't let TcpServer free m_buf when socket is recycled/closed
	// . we own it now and are responsible for freeing it
	//	s->m_readBuf = NULL;

	long long now = gettimeofdayInMilliseconds();
	// So now after getting both docIds, get their contents
	char *reply1 = st->m_buf1 ;
	long  size1  = st->m_buf1Len;
	HttpMime mime1;
	mime1.set ( reply1 , size1 , NULL );
	char *content1    = reply1 + mime1.getMimeLen();
	long  content1Len = size1  - mime1.getMimeLen();
	unsigned long h = hash32 ( content1 , content1Len );
	// log msg
	if ( g_errno ) 
		logf(LOG_INFO,"blaster: got doc (%li) (%li ms) %s : %s",
		     s->m_readOffset      , 
		     (long)(now - s->m_startTime) , 
		     st->m_u2   , 
		     mstrerror(g_errno)   );
	else
		logf(LOG_INFO,"blaster: got doc (%li) (%li ms) "
		     "(hash=%lx) %s",
		     s->m_readOffset      , 
		     (long)(now - s->m_startTime) , 
		     h ,
		     st->m_u2       );


	if (m_verbose){
		log(LOG_WARN,"blaster: content1len=%li, Content1 is =%s",
		    content1Len,content1);
		log(LOG_WARN,"\n");
	}
	char *reply2 = s->m_readBuf ;
	long  size2  = s->m_readOffset;
	HttpMime mime2;
	mime2.set ( reply2 , size2 , NULL );
	char *content2    = reply2 + mime2.getMimeLen();
	long  content2Len = size2  - mime2.getMimeLen();
	if (m_verbose)	
		log(LOG_WARN,"blaster: content2len=%li, Content2 is =%s",
		    content2Len,content2);

	// Now that we've got the contents, lets get the url links out 
	// of these pages.Passing them to function getSearchLinks should 
	// get the first x links found out.
	/*	st->m_links1=(char *) mmalloc(200*MAX_URL_LEN,"Blaster3");
	st->m_links2=st->m_links1+100*MAX_URL_LEN;
	st->m_numLinks1=100;
	st->m_numLinks2=100;*/

	/*	long numLinks1=getSearchLinks(content1,content1Len,
				      st->m_links1,st->m_numLinks1);
	long numLinks2=getSearchLinks(content2,content2Len,
	st->m_links2,st->m_numLinks2);*/


	// the xml parser is handed NUL-terminated content
	content1[content1Len]='\0';
	//short csEnum1= get_iana_charset(mime1.getCharset(), 
	//				mime1.getCharsetLen());
	/*	if (csEnum1== csUnknown)
		log(LOG_DEBUG, "blaster: Unknown charset : %s", mime2.getCharset());*/
	Xml xml1;
	// assume utf8
	if (!xml1.set(content1, 
		     content1Len,
		     false,
		     0,
		     false,
		     TITLEREC_CURRENT_VERSION)){
		log(LOG_WARN,"blaster: Couldn't set XML1 Class in gotDoc2");
	}
	Links links1;
	Url parent; parent.set ( st->m_u1);
	if (!links1.set(false , // userellnofollow
			&xml1,
			&parent,//mime1.getLocationUrl(), parent Url
			false, // setLinkHashes
			NULL  , // baseUrl
			TITLEREC_CURRENT_VERSION, // version
			0 , // niceness
			false , // parent is permalink?
			NULL )) { // oldLinks
		log(LOG_WARN,"blaster: Couldn't set Links Class in gotDoc2");
	}

	// NOTE(review): this writes one byte past the bytes read into the
	// socket buffer; presumably m_readBuf always has that spare byte.
	content2[content2Len]='\0';
	//short csEnum2= get_iana_charset(mime2.getCharset(), 
	//				mime2.getCharsetLen());
	/*	if (csEnum2== csUnknown)
		log(LOG_DEBUG, "blaster: Unknown charset : %s", mime2.getCharset());*/
	Xml xml2;
	if (!xml2.set(content2, 
		     content2Len,
		     false,
		     0,
		     false,
		     TITLEREC_CURRENT_VERSION)){
		log(LOG_WARN,"blaster: Couldn't set XML2 Class in gotDoc2");
	}
	Links links2;
	parent.set(st->m_u2);
	if (!links2.set(0,//siterec xml
			&xml2,
			&parent,//&st->m_u2,//mime2.getLocationUrl(),
			false,
			NULL,
			TITLEREC_CURRENT_VERSION,
			0,
			false,
			NULL)){
		log(LOG_WARN,"blaster: Couldn't set links2 Class in gotDoc2");
	}
	

	// put the hash of the sites into a hashtable, since we have
	// about a 100 or so of them
	HashTableT<unsigned long, bool> urlHash;
	// put the urls from doc2 into the hastable, but first check if
	// they are links to google or gigablast (for now). For msn and
	// yahoo we have to add other checks.
	char domain2[256];
	long dlen = 0;
	char *dom = getDomFast ( st->m_u2 , &dlen );
	// never copy more than domain2 can hold next to its terminator
	if ( ! dom || dlen < 0 ) dlen = 0;
	if ( dlen > (long)sizeof(domain2) - 1 )
		dlen = (long)sizeof(domain2) - 1;
	if ( dom ) strncpy(domain2,dom,dlen);
	domain2[dlen]='\0';
	for (long i=0;i<links2.getNumLinks();i++){
		// leave out links back to the engine itself and to the
		// other search engines / caches
		char *ss=links2.getLink(i);
		if ( isSkippedBlasterLink ( ss , domain2 ) ) continue;
		if (m_verbose)
			log(LOG_WARN,"blaster: link in Doc2=%s"
			    ,links2.getLink(i));
		unsigned long h=hash32Lower_a(links2.getLink(i),
					    links2.getLinkLen(i));
		//should i check for conflict. no, because it doesn't matter
		urlHash.addKey(h,1);
	}
	// now check if the urls from doc1 are in doc2. save the
	// ones that are not
	// in there for later.
	/*	long numUrlsToCheck=links2.getNumLinks();*/
	long numUrlsNotFound=0;
	/*if (numLinks1<numUrlsToCheck)
	numUrlsToCheck=numLinks1;*/
	char domain1[256];
	dlen = 0;
	dom = getDomFast ( st->m_u1 ,&dlen );
	// same bound as for domain2 above
	if ( ! dom || dlen < 0 ) dlen = 0;
	if ( dlen > (long)sizeof(domain1) - 1 )
		dlen = (long)sizeof(domain1) - 1;
	if ( dom ) strncpy(domain1,dom,dlen);
	domain1[dlen]='\0';
	for (long i=0;i<links1.getNumLinks();i++){
		char *ss=links1.getLink(i);
		if ( isSkippedBlasterLink ( ss , domain1 ) ) continue;
		if (m_verbose)
			log(LOG_WARN,"blaster: link in Doc1=%s"
			    ,links1.getLink(i));
		unsigned long h=hash32Lower_a(links1.getLink(i),
					    links1.getLinkLen(i));
		long slot= urlHash.getSlot(h);		
		if(slot!=-1) continue;

		// if url is not present, get its doc.
		if (m_verbose || m_justDisplay)
			log(LOG_WARN,"blaster: NOT FOUND %s in %s"
			    ,links1.getLink(i),domain2);
		numUrlsNotFound++;
		//Don't do anything else if just have to display the urls
		if (m_justDisplay) continue;
		//now get the doc of these urls
		//initialize
		// NOTE(review): this runs on every iteration, so a failure
		// counted below for an earlier url is wiped again here --
		// confirm whether the reset belongs in front of the loop.
		st->m_numUrlDocsReceived=0;

		StateBD2 *st2;
		try { st2 = new (StateBD2); }
		catch ( ... ) {
			g_errno = ENOMEM;
			log("blaster: Failed. "
			    "Could not allocate %li bytes for query. "
			    "Returning HTTP status of 500.",
			    (long)sizeof(StateBD2));
			// NOTE(review): st is not released here; downloads
			// started by earlier iterations still point at it.
			return;
		}
		mnew ( st2 , sizeof(StateBD2) , "Blaster4" );
		//Point to the big state;
		st2->m_st=st;
		//Msg16 does 6 redirects, so I do 6 too
		st2->m_numRedirects=6;
		//st2->m_url.set(links1.getLink(i),links1.getLinkLen(i));
		st2->m_url = links1.getLink(i);
		// No need for a proxy ip here, since we are fetching
		// doc's from different IPs. Faster this way
		bool status = g_httpServer.getDoc ( st2->m_url, // url
						    0,//ip
						    0 ,  // offset
						    -1 ,  // size
						    0 , // ifModifiedSince
						    st2,  // state
						    gotDocWrapper3, // callback
						    60*1000, // timeout
						    0, // proxy ip
						    0, // proxy port
						    30*1024*1024, //maxLen
						    30*1024*1024);//maxOtherLen
		// continue if it blocked
		if ( ! status ) continue;
		// If not blocked, there is an error.
		st->m_numUrlDocsReceived++;
	}
	st->m_numUrlDocsSent=numUrlsNotFound;

	//There might have been an error while sending the docs, so if there
	//has been put a check
	if ( st->m_numUrlDocsReceived > 0 && 
	     st->m_numUrlDocsReceived <= st->m_numUrlDocsSent ){
		log(LOG_WARN,"blaster: %li docs could not be sent due to "
		    "error",st->m_numUrlDocsReceived);
		m_launched--;
		freeStateBD(st);
		return;
	}
		
	if (numUrlsNotFound==0){
		//job done for this pair
		log(LOG_WARN,"blaster: All urls from %s found in "
		    "%s",domain1,domain2);
		m_launched--;
		// Free stateBD
		freeStateBD(st);
		return;
	}
	log(LOG_WARN,"blaster: %li urls from %s Not found in %s",
	    numUrlsNotFound,domain1,domain2);
	if(m_justDisplay){
		m_launched--;
		// Free stateBD
		freeStateBD(st);
	}
	return;
}

Beispiel #28

0

Datei anzeigen

Datei: hash.cpp Projekt: RogerTheGreat/open-source-search-engine

// Builds a 96-bit hash of the slen bytes at s: the n0 member is hash64() of
// the data seeded with startHash.n0, the n1 member is hash32() of the same
// data seeded with startHash.n1.
u_int96_t hash96 ( char *s, int32_t slen, u_int96_t startHash ) {
    u_int96_t combined;
    // 64-bit part first, then the 32-bit part, each with its own seed
    combined.n0 = hash64 ( s , slen , startHash.n0 );
    combined.n1 = hash32 ( s , slen , startHash.n1 );
    return combined;
}

Beispiel #29

0

Datei anzeigen

Datei: blaster2.cpp Projekt: exename/open-source-search-engine

void gotDocWrapper ( void *state , TcpSocket *s ) {
	// no longer launched
	s_launched--;
	char* url = (char*)state;
	// bail if got cut off
	if ( s->m_readOffset == 0 ) {
		log("lost %s",(char *) state);
		if(s_server) mfree(url, gbstrlen(url)+1, "saved url");
		return;
	}
	// got one more result page
	s_total++;
	// allow printing
	s_printIt = true;
	// get time now
	int64_t now = gettimeofdayInMilliseconds();
	// get hash
	char *reply = s->m_readBuf ;
	int32_t  size  = s->m_readOffset;
	HttpMime mime;
	mime.set ( reply , size , NULL );
	char *content    = reply + mime.getMimeLen();
	int32_t  contentLen = size  - mime.getMimeLen();
	int32_t status      = mime.getHttpStatus();
	uint32_t h = hash32 ( content , contentLen );
	char *p = mime.getMime();
	char *pend = p + mime.getMimeLen();
	char message[256];
	int32_t mlen = 0;

	// parse status message out of response

	// HTTP/1.0
	while ( p < pend && !isspace(*p) ) p++;
	// skip space
	while ( p < pend &&  isspace(*p) ) p++;
	// copy to end of line
	while (p < pend && mlen < 255 && *p != '\r' && *p != '\n'){
		message[mlen++] = *p;
	}
	message[mlen] = '\0';

	// log msg
	if ( g_errno ) 
		logf(LOG_INFO,"blaster: got doc (status=%"INT32") (%"INT32") (%"INT32"ms) %s : "
		     "%s", status,
		      s->m_readOffset      , 
		      (int32_t)(now - s->m_startTime) , 
		      (char *)state        , 
		      mstrerror(g_errno)   );
	else
		logf(LOG_INFO,"blaster: got doc (status=%"INT32") (%"INT32") (%"INT32"ms) "
		     "(hash=%"XINT32") %s", status,
		      s->m_readOffset      , 
		      (int32_t)(now - s->m_startTime) , 
		      h ,
		      (char *)state        );

	if(s_server) mfree(url, gbstrlen(url)+1, "saved url");
	// try to launch another
	startSpidering();
}

Beispiel #30

0

Datei anzeigen

Datei: Blaster.cpp Projekt: alvinlai/open-source-search-engine

void Blaster::gotDoc1( void *state, TcpSocket *s){
	StateBD *st=(StateBD *)state;
	// Even if we loose the request, still count it as done.
	m_totalDone++;
	m_print=true;
	// bail if got cut off
	if ( s->m_readOffset == 0 ) {
		log("blaster: lost the Request in gotDoc1");
		m_launched--;
		freeStateBD(st);
		return;
	}

	//if we are not doing diff
	if (!m_blasterDiff){
		m_launched--;
	}
	long long now = gettimeofdayInMilliseconds();
	// get hash
	char *reply = s->m_readBuf ;
	long  size  = s->m_readOffset;
	HttpMime mime;
	mime.set ( reply , size , NULL );
	char *content    = reply + mime.getMimeLen();
	long  contentLen = size  - mime.getMimeLen();
	unsigned long h = hash32 ( content , contentLen );
	// log msg
	if ( g_errno ) 
		logf(LOG_INFO,"blaster: got doc (%li) (%li ms) %s : %s",
		     s->m_readOffset      , 
		     (long)(now - s->m_startTime) , 
		     st->m_u1   , 
		     mstrerror(g_errno)   );
	else
		logf(LOG_INFO,"blaster: got doc (%li) (%li ms) "
		     "(hash=%lx) %s",
		     s->m_readOffset      , 
		     (long)(now - s->m_startTime) , 
		     h ,
		     st->m_u1       );
	if (!m_blasterDiff){
		// try to launch another if not using log file
		freeStateBD(st);
		if (!m_isLogFile){
			startBlastering();
		}
		if (m_isLogFile && --m_totalUrls==0) exit(0);
		return;
	}

	// Store the buffer from socket so that it does not get destroyed
	// at the end. Also, add another space because in gotDoc2 xml.set
	// demands the content to be null ended, so we need to store the
	// null character there. So as a precaution, just allocating the
	// max buf size.
	st->m_buf1=(char*) mcalloc(s->m_readBufSize,"Blaster5");
	memcpy(st->m_buf1,s->m_readBuf,s->m_readOffset);
	//st->m_buf1=(char*) mdup(s->m_readBuf,s->m_readOffset,"Blaster5");
	st->m_buf1Len=s->m_readOffset;
	st->m_buf1MaxLen=s->m_readBufSize;

	// . don't let TcpServer free m_buf when socket is recycled/closed
	// . we own it now and are responsible for freeing it. DON'T do this
	// because I believe this makes malloc crash, since TcpServer says
	// that it has freed the memory so malloc tries to allocate wrong
	// memory and gives a seg fault.
	//	s->m_readBuf = NULL;
	
	log(LOG_WARN,"blaster: Downloading %s",st->m_u2);
	//char *ss="www.gigablast.com/search?q=hoopla&code=gbmonitor";
	//	st->m_u2.set(ss,gbstrlen(ss));
	// get it
	bool status = g_httpServer.getDoc ( st->m_u2 , // url
					    0,//ip
					    0 ,  // offset
					    -1 ,  // size
					    0 , // ifModifiedSince
					    st ,  // state
					    gotDocWrapper2, // callback
					    60*1000, // timeout
					    0,//atoip("66.154.102.20",13),//proxy ip
					    0,//3128,//80, // proxy port
					    30*1024*1024, //maxLen
					    30*1024*1024);//maxOtherLen
	// continue if it blocked
	if ( ! status ) return;
	// If not blocked, there is an error.
	m_launched--;
	// log msg
	log("From file2, gotdoc2 %s: %s", st->m_u2,
	    mstrerror(g_errno) );
	// No need to point p2 ahead because already been done
	// Free stateBD
	freeStateBD(st);
	return;
	
}