void gotSiteListWrapper ( void *state ) { SiteGetter *THIS = (SiteGetter *)state; if ( ! THIS->gotSiteList() ) return; // try again? if ( THIS->m_tryAgain ) { // return if blocked if ( ! THIS->getSiteList() ) return; // otherwise, if did not block, we are really done because // it loops until it blocks } // call callback if all done now THIS->m_callback ( THIS->m_state ); }
int main(int argc, char **argv) { if (argc < 2) { print_usage(argv[0]); return 1; } if (strcmp(argv[1], "--h") == 0 || strcmp(argv[1], "--help") == 0 ) { print_usage(argv[0]); return 1; } // initialize library g_mem.init(); hashinit(); g_conf.init(NULL); g_log.m_logPrefix = false; const char *input = argv[1]; size_t inputLen = strlen(input); Url url; url.set(input, inputLen); url.print(); logf(LOG_TRACE, "\t"); SiteGetter sg; sg.getSite(input, NULL, 0, 0, 0); logf(LOG_TRACE, "Site info"); logf(LOG_TRACE, "\tsite : %.*s", sg.getSiteLen(), sg.getSite()); logf(LOG_TRACE, "\tsitehash32 : %" PRIx32, hash32(sg.getSite(), sg.getSiteLen(), 0)); logf(LOG_TRACE, "\t"); uint64_t probableDocId = Titledb::getProbableDocId(&url); logf(LOG_TRACE, "Document info"); logf(LOG_TRACE, "\tprobabledocid : %" PRIu64, probableDocId); logf(LOG_TRACE, "\tfirstprobabledocid : %" PRIu64, Titledb::getFirstProbableDocId(probableDocId)); logf(LOG_TRACE, "\tlastprobabledocid : %" PRIu64, Titledb::getLastProbableDocId(probableDocId)); return 0; }
// Checks that looking up the site of a bare-domain URL succeeds and that
// the resulting site string carries the "www." prefix.
TEST(SiteGetterTest, GetSite) {
	const char *inputUrl     = "http://dr.dk/";
	const char *expectedSite = "www.dr.dk";

	SiteGetter getter;

	// the lookup itself is expected to report success (true)
	const bool ok = getter.getSite(inputUrl, NULL, 0, 0, 0);
	EXPECT_TRUE(ok);

	// the no-argument overload returns the computed site string
	EXPECT_STREQ(expectedSite, getter.getSite());
}
int main ( int argc , char *argv[] ) { bool addWWW = true; bool stripSession = true; // check for arguments for (int32_t i = 1; i < argc; i++) { if (strcmp(argv[i], "-w") == 0) addWWW = false; else if (strcmp(argv[i], "-s") == 0) stripSession = false; } // initialize //g_mem.init(100*1024); hashinit(); //g_conf.m_tfndbExtBits = 23; loop: // read a url from stddin char sbuf[1024]; if ( ! fgets ( sbuf , 1024 , stdin ) ) exit(1); char *s = sbuf; char fbuf[1024]; // decode if we should if ( strncmp(s,"http%3A%2F%2F",13) == 0 || strncmp(s,"https%3A%2F%2F",13) == 0 ) { urlDecode(fbuf,s,gbstrlen(s)); s = fbuf; } // old url printf("###############\n"); printf("old: %s",s); int32_t slen = gbstrlen(s); // remove any www. if !addWWW if (!addWWW) { if (slen >= 4 && strncasecmp(s, "www.", 4) == 0) { slen -= 4; memmove(s, &s[4], slen); } else { // get past a :// int32_t si = 0; while (si < slen && ( s[si] != ':' || s[si+1] != '/' || s[si+2] != '/' ) ) si++; // remove the www. if (si + 7 < slen) { si += 3; if (strncasecmp(&s[si], "www.", 4) == 0) { slen -= 4; memmove(&s[si], &s[si+4], slen-si); } } } } // set it Url u; u.set ( s , slen , addWWW , /*add www?*/ stripSession ); /*strip session ids?*/ // print it char out[1024*4]; char *p = out; p += sprintf(p,"tld: "); gbmemcpy ( p, u.getTLD(),u.getTLDLen()); p += u.getTLDLen(); char c = *p; *p = '\0'; printf("%s\n",out); *p = c; // dom p = out; sprintf ( p , "dom: "); p += gbstrlen ( p ); gbmemcpy ( p , u.getDomain() , u.getDomainLen() ); p += u.getDomainLen(); c = *p; *p = '\0'; printf("%s\n",out); *p = c; // host p = out; sprintf ( p , "host: "); p += gbstrlen ( p ); gbmemcpy ( p , u.getHost() , u.getHostLen() ); p += u.getHostLen(); c = *p; *p = '\0'; printf("%s\n",out); *p = c; // then the whole url printf("url: %s\n", u.getUrl() ); /* int32_t siteLen; char *site = u.getSite ( &siteLen , NULL , false ); if ( site ) { c = site[siteLen]; site[siteLen] = '\0'; } printf("site: %s\n", site ); if ( site ) site[siteLen] = c; */ 
SiteGetter sg; sg.getSite ( u.getUrl() , NULL , // tagrec 0 , // timestamp NULL, // coll 0 , // niceness //false , // addtags NULL , // state NULL ); // callback if ( sg.m_siteLen ) printf("site: %s\n",sg.m_site); printf("isRoot: %"INT32"\n",(int32_t)u.isRoot()); /* bool perm = ::isPermalink ( NULL , // coll NULL , // Links ptr &u , // the url CT_HTML , // contentType NULL , // LinkInfo ptr false );// isRSS? printf ("isPermalink: %"INT32"\n",(int32_t)perm); */ // print the path too p = out; p += sprintf ( p , "path: " ); gbmemcpy ( p , u.getPath(), u.getPathLen() ); p += u.getPathLen(); if ( u.getFilename() ) { p += sprintf ( p , "\nfilename: " ); gbmemcpy ( p , u.getFilename(), u.getFilenameLen() ); p += u.getFilenameLen(); *p = '\0'; printf("%s\n", out ); } // encoded char dst[MAX_URL_LEN+200]; urlEncode ( dst,MAX_URL_LEN+100, u.getUrl(), u.getUrlLen(), false ); // are we encoding a request path? printf("encoded: %s\n",dst); // the probable docid int64_t pd = g_titledb.getProbableDocId(&u); printf("pdocid: %"UINT64"\n", pd ); printf("dom8: 0x%"XINT32"\n", (int32_t)g_titledb.getDomHash8FromDocId(pd) ); //printf("ext23: 0x%"XINT32"\n",g_tfndb.makeExt(&u)); if ( u.isLinkLoop() ) printf("islinkloop: yes\n"); else printf("islinkloop: no\n"); int64_t hh64 = u.getHostHash64(); printf("hosthash64: 0x%016"XINT64"\n",hh64); uint32_t hh32 = u.getHostHash32(); printf("hosthash32: 0x%08"XINT32" (%"UINT32")\n",hh32,hh32); int64_t dh64 = u.getDomainHash64(); printf("domhash64: 0x%016"XINT64"\n",dh64); int64_t uh64 = u.getUrlHash64(); printf("urlhash64: 0x%016"XINT64"\n",uh64); //if(isUrlUnregulated(NULL ,0,&u)) printf("unregulated: yes\n"); //else printf("unregulated: no\n"); goto loop; }