Exemplos de SiteGetter em C++ (Cpp), exemplos de SiteGetter em C++ (Cpp)

Exemplo n.º 1

0

Exibir arquivo

Arquivo: SiteGetter.cpp Projeto: Doken-Tokuyama/open-source-search-engine

void gotSiteListWrapper ( void *state ) {
	SiteGetter *THIS = (SiteGetter *)state;
	if ( ! THIS->gotSiteList() ) return;
	// try again?
	if ( THIS->m_tryAgain ) {
		// return if blocked
		if ( ! THIS->getSiteList() ) return;
		// otherwise, if did not block, we are really done because
		// it loops until it blocks
	}
	// call callback if all done now
	THIS->m_callback ( THIS->m_state );
}

Exemplo n.º 2

0

Exibir arquivo

Arquivo: print_urlinfo.cpp Projeto: privacore/open-source-search-engine

int main(int argc, char **argv) {
	if (argc < 2) {
		print_usage(argv[0]);
		return 1;
	}

	if (strcmp(argv[1], "--h") == 0 || strcmp(argv[1], "--help") == 0 ) {
		print_usage(argv[0]);
		return 1;
	}

	// initialize library
	g_mem.init();
	hashinit();

	g_conf.init(NULL);

	g_log.m_logPrefix = false;

	const char *input = argv[1];
	size_t inputLen = strlen(input);

	Url url;
	url.set(input, inputLen);
	url.print();
	logf(LOG_TRACE, "\t");

	SiteGetter sg;
	sg.getSite(input, NULL, 0, 0, 0);
	logf(LOG_TRACE, "Site info");
	logf(LOG_TRACE, "\tsite         : %.*s", sg.getSiteLen(), sg.getSite());
	logf(LOG_TRACE, "\tsitehash32   : %" PRIx32, hash32(sg.getSite(), sg.getSiteLen(), 0));
	logf(LOG_TRACE, "\t");

	uint64_t probableDocId = Titledb::getProbableDocId(&url);
	logf(LOG_TRACE, "Document info");
	logf(LOG_TRACE, "\tprobabledocid      : %" PRIu64, probableDocId);
	logf(LOG_TRACE, "\tfirstprobabledocid : %" PRIu64, Titledb::getFirstProbableDocId(probableDocId));
	logf(LOG_TRACE, "\tlastprobabledocid  : %" PRIu64, Titledb::getLastProbableDocId(probableDocId));

	return 0;
}

Exemplo n.º 3

0

Exibir arquivo

Arquivo: SiteGetterTest.cpp Projeto: privacore/open-source-search-engine

TEST(SiteGetterTest, GetSite) {
	SiteGetter sg;
	EXPECT_TRUE(sg.getSite("http://dr.dk/", NULL, 0, 0, 0));
	EXPECT_STREQ("www.dr.dk", sg.getSite());
}

Exemplo n.º 4

0

Exibir arquivo

Arquivo: urlinfo.cpp Projeto: DeadNumbers/open-source-search-engine

int main ( int argc , char *argv[] ) {
	bool addWWW = true;
	bool stripSession = true;
	// check for arguments
	for (int32_t i = 1; i < argc; i++) {
		if (strcmp(argv[i], "-w") == 0)
			addWWW = false;
		else if (strcmp(argv[i], "-s") == 0)
			stripSession = false;
	}
	// initialize
	//g_mem.init(100*1024);
	hashinit();
	//g_conf.m_tfndbExtBits = 23;
 loop:
	// read a url from stddin
	char sbuf[1024];
	if ( ! fgets ( sbuf , 1024 , stdin ) ) exit(1);
	char *s = sbuf;
	char fbuf[1024];
	// decode if we should
	if ( strncmp(s,"http%3A%2F%2F",13) == 0 ||
	     strncmp(s,"https%3A%2F%2F",13) == 0 ) {
		urlDecode(fbuf,s,gbstrlen(s));
		s = fbuf;
	}
	// old url
	printf("###############\n");
	printf("old: %s",s);
	int32_t slen = gbstrlen(s);
	// remove any www. if !addWWW
	if (!addWWW) {
		if (slen >= 4 &&
		    strncasecmp(s, "www.", 4) == 0) {
			slen -= 4;
			memmove(s, &s[4], slen);
		}
		else {
			// get past a ://
			int32_t si = 0;
			while (si < slen &&
			       ( s[si] != ':' ||
				 s[si+1] != '/' ||
				 s[si+2] != '/' ) )
				si++;
			// remove the www.
			if (si + 7 < slen) {
				si += 3;
				if (strncasecmp(&s[si], "www.", 4) == 0) {
					slen -= 4;
					memmove(&s[si], &s[si+4], slen-si);
				}
			}
		}
	}
	// set it
	Url u;
	u.set ( s , slen ,
		addWWW   ,      /*add www?*/
		stripSession ); /*strip session ids?*/
	// print it
	char out[1024*4];
	char *p = out;
	p += sprintf(p,"tld: ");
	gbmemcpy ( p, u.getTLD(),u.getTLDLen());
	p += u.getTLDLen();
	char c = *p;
	*p = '\0';
	printf("%s\n",out);
	*p = c;
	

	// dom
	p = out;
	sprintf ( p , "dom: ");
	p += gbstrlen ( p );
	gbmemcpy ( p , u.getDomain() , u.getDomainLen() );
	p += u.getDomainLen();
	c = *p;
	*p = '\0';
	printf("%s\n",out);
	*p = c;
	// host
	p = out;
	sprintf ( p , "host: ");
	p += gbstrlen ( p );
	gbmemcpy ( p , u.getHost() , u.getHostLen() );
	p += u.getHostLen();
	c = *p;
	*p = '\0';
	printf("%s\n",out);
	*p = c;
	// then the whole url
	printf("url: %s\n", u.getUrl() );

	/*
	int32_t  siteLen;
	char *site = u.getSite ( &siteLen , NULL , false );
	if ( site ) {
		c = site[siteLen];
		site[siteLen] = '\0';
	}
	printf("site: %s\n", site );
	if ( site ) site[siteLen] = c;
	*/
	SiteGetter sg;
	sg.getSite ( u.getUrl() ,
		     NULL , // tagrec
		     0 , // timestamp
		     NULL, // coll
		     0 , // niceness
		     //false , // addtags
		     NULL , // state
		     NULL ); // callback
	if ( sg.m_siteLen )
		printf("site: %s\n",sg.m_site);

	printf("isRoot: %"INT32"\n",(int32_t)u.isRoot());

	/*
	bool perm = ::isPermalink ( NULL        , // coll
				    NULL        , // Links ptr
				    &u          , // the url
				    CT_HTML     , // contentType
				    NULL        , // LinkInfo ptr
				    false       );// isRSS?
	printf ("isPermalink: %"INT32"\n",(int32_t)perm);
	*/

	// print the path too
	p = out;

	p += sprintf ( p , "path: " );
	gbmemcpy ( p , u.getPath(), u.getPathLen() );
	p += u.getPathLen();

	if ( u.getFilename() ) {
		p += sprintf ( p , "\nfilename: " );
		gbmemcpy ( p , u.getFilename(), u.getFilenameLen() );
		p += u.getFilenameLen();
		*p = '\0';
		printf("%s\n", out );
	}

	// encoded
	char dst[MAX_URL_LEN+200];
	urlEncode ( dst,MAX_URL_LEN+100,
				u.getUrl(), u.getUrlLen(), 
				false ); // are we encoding a request path?
	printf("encoded: %s\n",dst);

	// the probable docid
	int64_t pd = g_titledb.getProbableDocId(&u);
	printf("pdocid: %"UINT64"\n", pd );
	printf("dom8: 0x%"XINT32"\n", (int32_t)g_titledb.getDomHash8FromDocId(pd) );
	//printf("ext23: 0x%"XINT32"\n",g_tfndb.makeExt(&u));
	if ( u.isLinkLoop() ) printf("islinkloop: yes\n");
	else                  printf("islinkloop: no\n");
	int64_t hh64 = u.getHostHash64();
	printf("hosthash64: 0x%016"XINT64"\n",hh64);
	uint32_t hh32 = u.getHostHash32();
	printf("hosthash32: 0x%08"XINT32" (%"UINT32")\n",hh32,hh32);
	int64_t dh64 = u.getDomainHash64();
	printf("domhash64: 0x%016"XINT64"\n",dh64);
	int64_t uh64 = u.getUrlHash64();
	printf("urlhash64: 0x%016"XINT64"\n",uh64);
	//if(isUrlUnregulated(NULL ,0,&u)) printf("unregulated: yes\n");
	//else                            printf("unregulated: no\n");
	goto loop;
}