// init our rdb
bool Titledb::init ( ) {
	// key sanity tests
	int64_t uh48  = 0x1234567887654321LL & 0x0000ffffffffffffLL;
	int64_t docId = 123456789;
	key96_t k = makeKey(docId,uh48,false);
	if ( getDocId(&k) != docId )     { g_process.shutdownAbort(true); }
	if ( getUrlHash48(&k) != uh48 )  { g_process.shutdownAbort(true); }

	const char *url = "http://.ezinemark.com/int32_t-island-child-custody-attorneys-new-york-visitation-lawyers-melville-legal-custody-law-firm-45f00bbed18.html";
	Url uu;
	uu.set(url);
	const char *d1 = uu.getDomain();
	int32_t dlen1 = uu.getDomainLen();
	int32_t dlen2 = 0;
	const char *d2 = getDomFast ( url , &dlen2 );
	if ( !d1 || !d2 )     { g_process.shutdownAbort(true); }
	if ( dlen1 != dlen2 ) { g_process.shutdownAbort(true); }

	// another one
	url = "http://ok/";
	uu.set(url);
	const char *d1a = uu.getDomain();
	dlen1 = uu.getDomainLen();
	dlen2 = 0;
	const char *d2a = getDomFast ( url , &dlen2 );
	if ( d1a || d2a )     { g_process.shutdownAbort(true); }
	if ( dlen1 != dlen2 ) { g_process.shutdownAbort(true); }

	// . what's max # of tree nodes?
	// . assume avg TitleRec size (compressed html doc) is about 1k we get:
	// . NOTE: overhead is about 32 bytes per node
	int32_t maxTreeNodes = g_conf.m_titledbMaxTreeMem / (1*1024);

	// initialize our own internal rdb
	return m_rdb.init ( "titledb" ,
			    -1 ,     // fixed record size
			    //g_conf.m_titledbMinFilesToMerge ,
			    // this should not really be changed...
			    -1 ,
			    g_conf.m_titledbMaxTreeMem ,
			    maxTreeNodes ,
			    false ,  // half keys?
			    12 ,     // key size
			    false ,  // isCollectionLess
			    false ); // useIndexFile

	// validate
	//return verify ( );
}
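// A minimal, self-contained sketch (NOT Titledb's real 96-bit key
// layout, which also carries a delete/positive bit) of the round-trip
// property the key sanity tests above depend on: the docId and the
// 48-bit url hash occupy disjoint bit ranges of the key, so the getters
// are exact inverses of makeKey(). SketchKey and its helpers are
// hypothetical names, not Titledb API.
#include <stdint.h>
struct SketchKey { uint64_t n1; uint64_t n0; };
static SketchKey sketchMakeKey ( int64_t docId , int64_t uh48 ) {
	SketchKey k;
	k.n1 = (uint64_t)docId;                        // docId in the high word
	k.n0 = (uint64_t)uh48 & 0x0000ffffffffffffULL; // 48-bit hash in the low word
	return k;
}
static int64_t sketchGetDocId ( const SketchKey *k ) {
	return (int64_t)k->n1;
}
static int64_t sketchGetUrlHash48 ( const SketchKey *k ) {
	return (int64_t)( k->n0 & 0x0000ffffffffffffULL );
}
// with these definitions the same sanity checks hold:
//   SketchKey k = sketchMakeKey ( 123456789 , uh48 );
//   sketchGetDocId(&k) == 123456789 && sketchGetUrlHash48(&k) == uh48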
void Images::setCandidates ( Url *pageUrl , Words *words , Xml *xml ,
			     Sections *sections , XmlDoc *xd ) {
	// not valid for now
	m_thumbnailValid = false;
	// reset our array of image node candidates
	m_numImages = 0;
	// flag it
	m_setCalled = true;
	// strange...
	if ( m_imgReply ) { char *xx=NULL;*xx=0; }
	// save this
	m_xml = xml;
	m_pageUrl = pageUrl;
	// if we are a diffbot json reply, trust that diffbot got the
	// best candidate, and just use that
	if ( xd->m_isDiffbotJSONObject ) return;
	//m_pageSite = pageSite;
	// scan the words
	long nw = words->getNumWords();
	nodeid_t *tids = words->getTagIds();
	long long *wids = words->getWordIds();
	//long *scores = scoresArg->m_scores;
	Section **sp = NULL;
	if ( sections ) sp = sections->m_sectionPtrs;
	// not if we don't have any identified sections
	if ( sections && sections->m_numSections <= 0 ) sp = NULL;
	// the positive scored window
	long firstPosScore = -1;
	long lastPosScore  = -1;
	long badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_MARQUEE;
	// find positive scoring window
	for ( long i = 0 ; i < nw ; i++ ) {
		// skip if in bad section
		if ( sp && (sp[i]->m_flags & badFlags) ) continue;
		if ( wids[i] != 0 ) continue;
		// set first positive scoring guy
		if ( firstPosScore == -1 ) firstPosScore = i;
		// keep track of last guy
		lastPosScore = i;
	}
	// sanity check
	if ( getNumXmlNodes() > 512 ) { char *xx=NULL;*xx=0; }
	// . pedal firstPosScore back until we hit a section boundary
	// . i.e. stop once we hit a front/back tag pair, like <div> and </div>
	char tc[512];
	memset ( tc , 0 , 512 );
	long a = firstPosScore;
	for ( ; a >= 0 ; a-- ) {
		// get the tid
		nodeid_t tid = tids[a];
		// remove back bit, if any
		tid &= BACKBITCOMP;
		// skip if not a tag, or a generic xml tag
		if ( tid <= 1 ) continue;
		// mark it
		if ( words->isBackTag(a) ) tc[tid] |= 0x02;
		else                       tc[tid] |= 0x01;
		// continue if not a full front/back pair
		if ( tc[tid] != 0x03 ) continue;
		// continue if not a "section" type tag (see Scores.cpp)
		if ( tid != TAG_DIV      &&
		     tid != TAG_TEXTAREA &&
		     tid != TAG_TR       &&
		     tid != TAG_TD       &&
		     tid != TAG_TABLE     ) continue;
		// ok we should stop now
		break;
	}
	// min is 0
	if ( a < 0 ) a = 0;
	// now look for the image urls within this window
	for ( long i = a ; i < lastPosScore ; i++ ) {
		// skip if not <img> tag
		if ( tids[i] != TAG_IMG ) continue;
		// get the node num into Xml.cpp::m_nodes[] array
		long nn = words->m_nodes[i];
		// check width to rule out small decorating imgs
		long width = xml->getLong(nn,nn+1,"width", -1 );
		if ( width != -1 && width < 50 ) continue;
		// same with height
		long height = xml->getLong(nn,nn+1,"height", -1 );
		if ( height != -1 && height < 50 ) continue;
		// get the url of the image
		long srcLen;
		char *src = xml->getString(nn,"src",&srcLen);
		// skip if none
		if ( srcLen <= 2 ) continue;
		// set it to the full url
		Url iu;
		// use "pageUrl" as the baseUrl
		iu.set ( pageUrl , src , srcLen );
		// skip if invalid domain or TLD
		if ( iu.getDomainLen() <= 0 ) continue;
		// skip if not from same domain as page url
		//long dlen = pageUrl->getDomainLen();
		//if ( iu.getDomainLen() != dlen ) continue;
		//if(strncmp(iu.getDomain(),pageUrl->getDomain(),dlen))continue
		// get the full url
		char *u = iu.getUrl();
		long ulen = iu.getUrlLen();
		// skip common crap
		if ( strncasestr(u,ulen,"logo"           ) ) continue;
		if ( strncasestr(u,ulen,"comment"        ) ) continue;
		if ( strncasestr(u,ulen,"print"          ) ) continue;
		if ( strncasestr(u,ulen,"subscribe"      ) ) continue;
		if ( strncasestr(u,ulen,"header"         ) ) continue;
		if ( strncasestr(u,ulen,"footer"         ) ) continue;
		if ( strncasestr(u,ulen,"menu"           ) ) continue;
		if ( strncasestr(u,ulen,"button"         ) ) continue;
		if ( strncasestr(u,ulen,"banner"         ) ) continue;
		if ( strncasestr(u,ulen,"ad.doubleclick.") ) continue;
		if ( strncasestr(u,ulen,"ads.webfeat."   ) ) continue;
		if ( strncasestr(u,ulen,"xads.zedo."     ) ) continue;
		// save it
		m_imageNodes[m_numImages] = nn;
		// before we lookup the image url to see if it is unique we
		// must first make sure that we have an adequate number of
		// permalinks from this same site with this same hop count.
		// we need at least 10 before we extract image thumbnails.
		char buf[2000];
		// set the query
		Query q;
		// if we do have 10 or more, then we lookup the image url to
		// make sure it is indeed unique
		sprintf ( buf , "gbimage:%s",u);
		// TODO: make sure this is a no-split termid storage thingy
		//       in Msg14.cpp
		// g_errno is set on error
		if ( ! q.set2 ( buf , langUnknown , false ) ) return;
		// store the termid
		m_termIds[m_numImages] = q.getTermId(0);
		// advance the counter
		m_numImages++;
		// break if full
		if ( m_numImages >= MAX_IMAGES ) break;
	}
}
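// A minimal, self-contained sketch of the backward boundary scan in
// setCandidates() above: bit 0x01 of tc[tid] records a front-tag
// sighting, 0x02 a back-tag sighting, and tc[tid]==0x03 means we have
// walked past a complete <tag>...</tag> pair; we stop at the first
// complete pair of a section-type tag. The tids/isBack/isSectionTag
// arrays are hypothetical stand-ins for Words::getTagIds(),
// Words::isBackTag() and the hard-coded TAG_* checks.
#include <string.h>
static int findSectionBoundary ( const short *tids , const bool *isBack ,
				 int start , const bool isSectionTag[512] ) {
	char tc[512];
	memset ( tc , 0 , 512 );
	int a = start;
	for ( ; a >= 0 ; a-- ) {
		short tid = tids[a];
		if ( tid <= 1 ) continue;             // not a tag, or generic xml
		tc[tid] |= isBack[a] ? 0x02 : 0x01;   // mark front/back sighting
		if ( tc[tid] != 0x03 ) continue;      // need both halves of a pair
		if ( ! isSectionTag[tid] ) continue;  // only section-type tags count
		break;                                // hit a boundary
	}
	return a < 0 ? 0 : a;                         // clamp to 0 like the original
}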
void Images::setCandidates ( Url *pageUrl , Words *words , Xml *xml ,
			     Sections *sections , XmlDoc *xd ) {
	// not valid for now
	m_thumbnailValid = false;
	// reset our array of image node candidates
	m_numImages = 0;
	// flag it
	m_setCalled = true;
	// strange...
	if ( m_imgReply ) { char *xx=NULL;*xx=0; }
	// save this
	m_xml = xml;
	m_pageUrl = pageUrl;

	//
	// first add any open graph candidate.
	// basically the page telling us the best image straight up.
	//
	int32_t node2 = -1;
	int32_t startNode = 0;

	// . field can be stuff like "summary","description","keywords",...
	// . if "convertHtmlEntites" is true we change < to &lt; and > to &gt;
	// . <meta property="og:image" content="http://example.com/rock2.jpg"/>
	// . <meta property="og:image" content="http://example.com/rock3.jpg"/>
 ogimgloop:
	char ubuf[2000];
	int32_t ulen = xml->getMetaContent( ubuf, 1999, "og:image", 8,
					    "property", startNode, &node2 );
	// update this in case goto ogimgloop is called
	startNode = node2 + 1;
	// see section below for explanation of what we are storing here...
	if ( node2 >= 0 ) {
		// save it
		m_imageNodes[m_numImages] = node2;
		Query q;
		if ( ulen > MAX_URL_LEN ) goto ogimgloop;
		// set it to the full url
		Url iu;
		// use "pageUrl" as the baseUrl
		iu.set( pageUrl, ubuf, ulen );
		// skip if invalid domain or TLD
		if ( iu.getDomainLen() <= 0 ) goto ogimgloop;
		// for looking it up on disk to see if unique or not
		char buf[2000];
		// if we don't put it in quotes it expands '|' into
		// the "PiiPe" operator in Query.cpp
		snprintf ( buf , 1999, "gbimage:\"%s\"",iu.getUrl());
		// TODO: make sure this is a no-split termid storage thingy
		//       in Msg14.cpp
		// g_errno is set on error
		if ( ! q.set2 ( buf , langUnknown , false ) ) return;
		// sanity test
		if ( q.getNumTerms() != 1 ) { char *xx=0;*xx=0; }
		// store the termid
		m_termIds[m_numImages] = q.getTermId(0);
		// advance the counter
		m_numImages++;
		// try to get more open graph images if we have some room
		if ( m_numImages + 2 < MAX_IMAGES ) goto ogimgloop;
	}

	//m_pageSite = pageSite;
	// scan the words
	int32_t nw = words->getNumWords();
	nodeid_t *tids = words->getTagIds();
	int64_t *wids = words->getWordIds();
	//int32_t *scores = scoresArg->m_scores;
	Section **sp = NULL;
	if ( sections ) sp = sections->m_sectionPtrs;
	// not if we don't have any identified sections
	if ( sections && sections->m_numSections <= 0 ) sp = NULL;
	// the positive scored window
	int32_t firstPosScore = -1;
	int32_t lastPosScore  = -1;
	int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT;
	// find positive scoring window
	for ( int32_t i = 0 ; i < nw ; i++ ) {
		// skip if in bad section
		if ( sp && (sp[i]->m_flags & badFlags) ) continue;
		if ( wids[i] != 0 ) continue;
		// set first positive scoring guy
		if ( firstPosScore == -1 ) firstPosScore = i;
		// keep track of last guy
		lastPosScore = i;
	}
	// sanity check
	if ( getNumXmlNodes() > 512 ) { char *xx=NULL;*xx=0; }
	// . pedal firstPosScore back until we hit a section boundary
	// . i.e. stop once we hit a front/back tag pair, like <div> and </div>
	char tc[512];
	memset ( tc , 0 , 512 );
	int32_t a = firstPosScore;
	for ( ; a >= 0 ; a-- ) {
		// get the tid
		nodeid_t tid = tids[a];
		// remove back bit, if any
		tid &= BACKBITCOMP;
		// skip if not a tag, or a generic xml tag
		if ( tid <= 1 ) continue;
		// mark it
		if ( words->isBackTag(a) ) tc[tid] |= 0x02;
		else                       tc[tid] |= 0x01;
		// continue if not a full front/back pair
		if ( tc[tid] != 0x03 ) continue;
		// continue if not a "section" type tag (see Scores.cpp)
		if ( tid != TAG_DIV      &&
		     tid != TAG_TEXTAREA &&
		     tid != TAG_TR       &&
		     tid != TAG_TD       &&
		     tid != TAG_TABLE     ) continue;
		// ok we should stop now
		break;
	}
	// min is 0
	if ( a < 0 ) a = 0;
	// now look for the image urls within this window
	for ( int32_t i = a ; i < lastPosScore ; i++ ) {
		// skip if not <img> tag
		if ( tids[i] != TAG_IMG ) continue;
		// get the node num into Xml.cpp::m_nodes[] array
		int32_t nn = words->getNodes()[i];
		// check width to rule out small decorating imgs
		int32_t width = xml->getLong(nn,nn+1,"width", -1 );
		if ( width != -1 && width < 50 ) continue;
		// same with height
		int32_t height = xml->getLong(nn,nn+1,"height", -1 );
		if ( height != -1 && height < 50 ) continue;
		// get the url of the image
		int32_t srcLen;
		char *src = xml->getString(nn,"src",&srcLen);
		// skip if none
		if ( srcLen <= 2 ) continue;
		// set it to the full url
		Url iu;
		// use "pageUrl" as the baseUrl
		iu.set( pageUrl, src, srcLen );
		// skip if invalid domain or TLD
		if ( iu.getDomainLen() <= 0 ) continue;
		// skip if not from same domain as page url
		//int32_t dlen = pageUrl->getDomainLen();
		//if ( iu.getDomainLen() != dlen ) continue;
		//if(strncmp(iu.getDomain(),pageUrl->getDomain(),dlen))continue
		// get the full url
		char *u = iu.getUrl();
		int32_t ulen = iu.getUrlLen();
		// skip common crap
		if ( strncasestr(u,ulen,"logo"           ) ) continue;
		if ( strncasestr(u,ulen,"comment"        ) ) continue;
		if ( strncasestr(u,ulen,"print"          ) ) continue;
		if ( strncasestr(u,ulen,"subscribe"      ) ) continue;
		if ( strncasestr(u,ulen,"header"         ) ) continue;
		if ( strncasestr(u,ulen,"footer"         ) ) continue;
		if ( strncasestr(u,ulen,"menu"           ) ) continue;
		if ( strncasestr(u,ulen,"button"         ) ) continue;
		if ( strncasestr(u,ulen,"banner"         ) ) continue;
		if ( strncasestr(u,ulen,"ad.doubleclick.") ) continue;
		if ( strncasestr(u,ulen,"ads.webfeat."   ) ) continue;
		if ( strncasestr(u,ulen,"xads.zedo."     ) ) continue;
		// save it
		m_imageNodes[m_numImages] = nn;
		// before we lookup the image url to see if it is unique we
		// must first make sure that we have an adequate number of
		// permalinks from this same site with this same hop count.
		// we need at least 10 before we extract image thumbnails.
		char buf[2000];
		// set the query
		Query q;
		// if we do have 10 or more, then we lookup the image url to
		// make sure it is indeed unique
		sprintf ( buf , "gbimage:\"%s\"",u);
		// TODO: make sure this is a no-split termid storage thingy
		//       in Msg14.cpp
		// g_errno is set on error
		if ( ! q.set2 ( buf , langUnknown , false ) ) return;
		// store the termid
		m_termIds[m_numImages] = q.getTermId(0);
		// advance the counter
		m_numImages++;
		// break if full
		if ( m_numImages >= MAX_IMAGES ) break;
	}
}
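// A minimal sketch of the control flow behind "ogimgloop" above,
// rewritten as a loop to make the termination argument explicit: each
// getMetaContent() call resumes the scan at startNode, and setting
// startNode = node2 + 1 guarantees forward progress, so the scan ends
// once no more og:image metas are found or the candidate array nears
// MAX_IMAGES. findNextOgImage() is a hypothetical stand-in for
// Xml::getMetaContent(), not the real API.
static int findNextOgImage ( const bool *hasOgImage , int numNodes , int start ) {
	for ( int i = start ; i < numNodes ; i++ )
		if ( hasOgImage[i] ) return i;  // toy match test
	return -1;                              // no more matches
}
static int countOgImages ( const bool *hasOgImage , int numNodes , int maxImages ) {
	int count = 0;
	int startNode = 0;
	int node2;
	while ( ( node2 = findNextOgImage(hasOgImage,numNodes,startNode) ) >= 0 ) {
		startNode = node2 + 1;                // forward progress: must terminate
		count++;
		if ( count + 2 >= maxImages ) break;  // leave room, as the code above does
	}
	return count;
}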
// init our rdb
bool Titledb::init ( ) {
	// key sanity tests
	int64_t uh48  = 0x1234567887654321LL & 0x0000ffffffffffffLL;
	int64_t docId = 123456789;
	key_t k = makeKey(docId,uh48,false);
	if ( getDocId(&k) != docId )    { char *xx=NULL;*xx=0; }
	if ( getUrlHash48(&k) != uh48 ) { char *xx=NULL;*xx=0; }

	char *url = "http://.ezinemark.com/int32_t-island-child-custody-attorneys-new-york-visitation-lawyers-melville-legal-custody-law-firm-45f00bbed18.html";
	Url uu;
	uu.set(url);
	char *d1 = uu.getDomain();
	int32_t dlen1 = uu.getDomainLen();
	int32_t dlen2 = 0;
	char *d2 = getDomFast ( url , &dlen2 );
	if ( dlen1 != dlen2 ) { char *xx=NULL;*xx=0; }

	// another one
	url = "http://ok/";
	uu.set(url);
	d1 = uu.getDomain();
	dlen1 = uu.getDomainLen();
	dlen2 = 0;
	d2 = getDomFast ( url , &dlen2 );
	if ( dlen1 != dlen2 ) { char *xx=NULL;*xx=0; }

	int64_t maxMem = 200000000; // 200MB

	// . what's max # of tree nodes?
	// . assume avg TitleRec size (compressed html doc) is about 1k we get:
	// . NOTE: overhead is about 32 bytes per node
	int32_t maxTreeNodes = maxMem / (1*1024);

	// . we now use a disk page cache for titledb as opposed to the
	//   old rec cache. i am trying to do away with the Rdb::m_cache rec
	//   cache in favor of cleverly used disk page caches, because
	//   the rec caches are not real-time and get stale.
	// . just hard-code 30MB for now
	int32_t pcmem = 30000000; // = g_conf.m_titledbMaxDiskPageCacheMem;
	// f**k that we need all the mem!
	//pcmem = 0;
	// do not use any page cache if doing tmp cluster in order to
	// prevent swapping
	if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
	int32_t pageSize = GB_INDEXDB_PAGE_SIZE;
	// init the page cache
	// . MDW: "minimize disk seeks" not working otherwise i'd enable it!
	if ( ! m_pc.init ( "titledb", RDB_TITLEDB, pcmem , pageSize ) )
		return log("db: Titledb init failed.");
	// each entry in the cache is usually just a single record, no lists
	//int32_t maxCacheNodes = g_conf.m_titledbMaxCacheMem / (10*1024);

	// initialize our own internal rdb
	if ( ! m_rdb.init ( g_hostdb.m_dir ,
			    "titledb" ,
			    true ,   // dedup same keys?
			    -1 ,     // fixed record size
			    //g_hostdb.m_groupMask ,
			    //g_hostdb.m_groupId ,
			    //g_conf.m_titledbMinFilesToMerge ,
			    // this should not really be changed...
			    -1 ,     //3,//230 minfilestomerge mintomerge
			    maxMem , // g_conf.m_titledbMaxTreeMem
			    maxTreeNodes ,
			    // now we balance so Sync.cpp can do ordered huge lists
			    true ,   // balance tree?
			    // turn off cache for now because the page cache
			    // is just as fast and does not get out of date
			    // so bad??
			    //0 ,
			    0 ,      // g_conf.m_titledbMaxCacheMem
			    0 ,      // maxCacheNodes
			    false ,  // half keys?
			    false ,  // g_conf.m_titledbSav
			    &m_pc ,  // page cache ptr
			    true ) ) // is titledb?
		return false;
	return true;
	// validate
	//return verify ( );
}
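// The tree-sizing arithmetic above, spelled out as a sketch: with an
// assumed ~1KB average compressed TitleRec (the ~32 bytes of per-node
// overhead is absorbed by that estimate), the node budget is just the
// memory budget divided by 1KB, so the hard-coded 200MB yields
// 200000000 / 1024 = 195312 tree nodes. sketchMaxTreeNodes is a
// hypothetical name for illustration only.
#include <stdint.h>
static int32_t sketchMaxTreeNodes ( int64_t maxMem ) {
	// one node per ~1KB of budget
	return (int32_t)( maxMem / (1*1024) );
}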
int main ( int argc , char *argv[] ) {
	bool addWWW = true;
	bool stripSession = true;
	// check for arguments
	for (int32_t i = 1; i < argc; i++) {
		if      (strcmp(argv[i], "-w") == 0) addWWW = false;
		else if (strcmp(argv[i], "-s") == 0) stripSession = false;
	}
	// initialize
	//g_mem.init(100*1024);
	hashinit();
	//g_conf.m_tfndbExtBits = 23;

 loop:
	// read a url from stdin
	char sbuf[1024];
	if ( ! fgets ( sbuf , 1024 , stdin ) ) exit(1);
	char *s = sbuf;
	char fbuf[1024];
	// decode if we should
	if ( strncmp(s,"http%3A%2F%2F" ,13) == 0 ||
	     strncmp(s,"https%3A%2F%2F",14) == 0 ) {
		urlDecode(fbuf,s,gbstrlen(s));
		s = fbuf;
	}
	// old url (s still has the trailing newline from fgets)
	printf("###############\n");
	printf("old: %s",s);
	int32_t slen = gbstrlen(s);
	// remove any www. if !addWWW
	if (!addWWW) {
		if (slen >= 4 && strncasecmp(s, "www.", 4) == 0) {
			slen -= 4;
			memmove(s, &s[4], slen);
		}
		else {
			// get past a ://
			int32_t si = 0;
			while (si < slen &&
			       ( s[si]   != ':' ||
			         s[si+1] != '/' ||
			         s[si+2] != '/' ) )
				si++;
			// remove the www.
			if (si + 7 < slen) {
				si += 3;
				if (strncasecmp(&s[si], "www.", 4) == 0) {
					slen -= 4;
					memmove(&s[si], &s[si+4], slen-si);
				}
			}
		}
	}
	// set it
	Url u;
	u.set ( s , slen ,
		addWWW ,        /* add www? */
		stripSession ); /* strip session ids? */
	// print it
	char out[1024*4];
	char *p = out;
	p += sprintf(p,"tld: ");
	gbmemcpy ( p, u.getTLD(),u.getTLDLen());
	p += u.getTLDLen();
	char c = *p;
	*p = '\0';
	printf("%s\n",out);
	*p = c;
	// dom
	p = out;
	sprintf ( p , "dom: ");
	p += gbstrlen ( p );
	gbmemcpy ( p , u.getDomain() , u.getDomainLen() );
	p += u.getDomainLen();
	c = *p;
	*p = '\0';
	printf("%s\n",out);
	*p = c;
	// host
	p = out;
	sprintf ( p , "host: ");
	p += gbstrlen ( p );
	gbmemcpy ( p , u.getHost() , u.getHostLen() );
	p += u.getHostLen();
	c = *p;
	*p = '\0';
	printf("%s\n",out);
	*p = c;
	// then the whole url
	printf("url: %s\n", u.getUrl() );
	/*
	int32_t siteLen;
	char *site = u.getSite ( &siteLen , NULL , false );
	if ( site ) { c = site[siteLen]; site[siteLen] = '\0'; }
	printf("site: %s\n", site );
	if ( site ) site[siteLen] = c;
	*/
	SiteGetter sg;
	sg.getSite ( u.getUrl() ,
		     NULL ,    // tagrec
		     0 ,       // timestamp
		     NULL ,    // coll
		     0 ,       // niceness
		     //false , // addtags
		     NULL ,    // state
		     NULL );   // callback
	if ( sg.m_siteLen ) printf("site: %s\n",sg.m_site);
	printf("isRoot: %"INT32"\n",(int32_t)u.isRoot());
	/*
	bool perm = ::isPermalink ( NULL ,    // coll
				    NULL ,    // Links ptr
				    &u ,      // the url
				    CT_HTML , // contentType
				    NULL ,    // LinkInfo ptr
				    false );  // isRSS?
	printf ("isPermalink: %"INT32"\n",(int32_t)perm);
	*/
	// print the path too
	p = out;
	p += sprintf ( p , "path: " );
	gbmemcpy ( p , u.getPath(), u.getPathLen() );
	p += u.getPathLen();
	if ( u.getFilename() ) {
		p += sprintf ( p , "\nfilename: " );
		gbmemcpy ( p , u.getFilename(), u.getFilenameLen() );
		p += u.getFilenameLen();
	}
	// terminate and print outside the if so the path prints even
	// when there is no filename
	*p = '\0';
	printf("%s\n", out );
	// encoded
	char dst[MAX_URL_LEN+200];
	urlEncode ( dst , MAX_URL_LEN+100 ,
		    u.getUrl() , u.getUrlLen() ,
		    false ); // are we encoding a request path?
	printf("encoded: %s\n",dst);
	// the probable docid
	int64_t pd = g_titledb.getProbableDocId(&u);
	printf("pdocid: %"UINT64"\n", pd );
	printf("dom8: 0x%"XINT32"\n",
	       (int32_t)g_titledb.getDomHash8FromDocId(pd) );
	//printf("ext23: 0x%"XINT32"\n",g_tfndb.makeExt(&u));
	if ( u.isLinkLoop() ) printf("islinkloop: yes\n");
	else                  printf("islinkloop: no\n");
	int64_t hh64 = u.getHostHash64();
	printf("hosthash64: 0x%016"XINT64"\n",hh64);
	uint32_t hh32 = u.getHostHash32();
	printf("hosthash32: 0x%08"XINT32" (%"UINT32")\n",hh32,hh32);
	int64_t dh64 = u.getDomainHash64();
	printf("domhash64: 0x%016"XINT64"\n",dh64);
	int64_t uh64 = u.getUrlHash64();
	printf("urlhash64: 0x%016"XINT64"\n",uh64);
	//if ( isUrlUnregulated(NULL,0,&u) ) printf("unregulated: yes\n");
	//else                               printf("unregulated: no\n");
	goto loop;
}