// init our rdb
bool Titledb::init ( ) {
	// key sanity tests
	int64_t uh48  = 0x1234567887654321LL & 0x0000ffffffffffffLL;
	int64_t docId = 123456789;
	key96_t k = makeKey(docId,uh48,false);
	if ( getDocId(&k) != docId )     { g_process.shutdownAbort(true); }
	if ( getUrlHash48(&k) != uh48 )  { g_process.shutdownAbort(true); }

	const char *url = "http://.ezinemark.com/int32_t-island-child-custody-attorneys-new-york-visitation-lawyers-melville-legal-custody-law-firm-45f00bbed18.html";
	Url uu;
	uu.set(url);
	const char *d1 = uu.getDomain();
	int32_t dlen1 = uu.getDomainLen();
	int32_t dlen2 = 0;
	const char *d2 = getDomFast ( url , &dlen2 );
	if ( !d1 || !d2 )     { g_process.shutdownAbort(true); }
	if ( dlen1 != dlen2 ) { g_process.shutdownAbort(true); }

	// another one
	url = "http://ok/";
	uu.set(url);
	const char *d1a = uu.getDomain();
	dlen1 = uu.getDomainLen();
	dlen2 = 0;
	const char *d2a = getDomFast ( url , &dlen2 );
	if ( d1a || d2a )     { g_process.shutdownAbort(true); }
	if ( dlen1 != dlen2 ) { g_process.shutdownAbort(true); }

	// . what's max # of tree nodes?
	// . assume avg TitleRec size (compressed html doc) is about 1k we get:
	// . NOTE: overhead is about 32 bytes per node
	int32_t maxTreeNodes = g_conf.m_titledbMaxTreeMem / (1*1024);

	// initialize our own internal rdb
	return m_rdb.init ( "titledb" ,
			    -1 ,     // fixed record size
			    //g_conf.m_titledbMinFilesToMerge ,
			    // this should not really be changed...
			    -1 ,
			    g_conf.m_titledbMaxTreeMem ,
			    maxTreeNodes ,
			    false ,  // half keys?
			    12 ,     // key size
			    false ,  // isCollectionLess
			    false ); // useIndexFile

	// validate
	//return verify ( );
}
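// A minimal, self-contained sketch (NOT Titledb's real 96-bit key
// layout, which also carries a delete/positive bit) of the round-trip
// property the key sanity tests above depend on: the docId and the
// 48-bit url hash occupy disjoint bit ranges of the key, so the getters
// are exact inverses of makeKey(). SketchKey and its helpers are
// hypothetical names, not Titledb API.
#include <stdint.h>
struct SketchKey { uint64_t n1; uint64_t n0; };
static SketchKey sketchMakeKey ( int64_t docId , int64_t uh48 ) {
	SketchKey k;
	k.n1 = (uint64_t)docId;                        // docId in the high word
	k.n0 = (uint64_t)uh48 & 0x0000ffffffffffffULL; // 48-bit hash in the low word
	return k;
}
static int64_t sketchGetDocId ( const SketchKey *k ) {
	return (int64_t)k->n1;
}
static int64_t sketchGetUrlHash48 ( const SketchKey *k ) {
	return (int64_t)( k->n0 & 0x0000ffffffffffffULL );
}
// with these definitions the same sanity checks hold:
//   SketchKey k = sketchMakeKey ( 123456789 , uh48 );
//   sketchGetDocId(&k) == 123456789 && sketchGetUrlHash48(&k) == uh48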
void Images::setCandidates ( Url *pageUrl , Words *words , Xml *xml ,
			     Sections *sections , XmlDoc *xd ) {
	// not valid for now
	m_thumbnailValid = false;
	// reset our array of image node candidates
	m_numImages = 0;
	// flag it
	m_setCalled = true;
	// strange...
	if ( m_imgReply ) { char *xx=NULL;*xx=0; }
	// save this
	m_xml = xml;
	m_pageUrl = pageUrl;
	// if we are a diffbot json reply, trust that diffbot got the
	// best candidate, and just use that
	if ( xd->m_isDiffbotJSONObject ) return;
	//m_pageSite = pageSite;
	// scan the words
	long nw = words->getNumWords();
	nodeid_t *tids = words->getTagIds();
	long long *wids = words->getWordIds();
	//long *scores = scoresArg->m_scores;
	Section **sp = NULL;
	if ( sections ) sp = sections->m_sectionPtrs;
	// not if we don't have any identified sections
	if ( sections && sections->m_numSections <= 0 ) sp = NULL;
	// the positive scored window
	long firstPosScore = -1;
	long lastPosScore  = -1;
	long badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_MARQUEE;
	// find positive scoring window
	for ( long i = 0 ; i < nw ; i++ ) {
		// skip if in bad section
		if ( sp && (sp[i]->m_flags & badFlags) ) continue;
		if ( wids[i] != 0 ) continue;
		// set first positive scoring guy
		if ( firstPosScore == -1 ) firstPosScore = i;
		// keep track of last guy
		lastPosScore = i;
	}
	// sanity check
	if ( getNumXmlNodes() > 512 ) { char *xx=NULL;*xx=0; }
	// . pedal firstPosScore back until we hit a section boundary
	// . i.e. stop once we hit a front/back tag pair, like <div> and </div>
	char tc[512];
	memset ( tc , 0 , 512 );
	long a = firstPosScore;
	for ( ; a >= 0 ; a-- ) {
		// get the tid
		nodeid_t tid = tids[a];
		// remove back bit, if any
		tid &= BACKBITCOMP;
		// skip if not a tag, or a generic xml tag
		if ( tid <= 1 ) continue;
		// mark it
		if ( words->isBackTag(a) ) tc[tid] |= 0x02;
		else                       tc[tid] |= 0x01;
		// continue if not a full front/back pair
		if ( tc[tid] != 0x03 ) continue;
		// continue if not a "section" type tag (see Scores.cpp)
		if ( tid != TAG_DIV      &&
		     tid != TAG_TEXTAREA &&
		     tid != TAG_TR       &&
		     tid != TAG_TD       &&
		     tid != TAG_TABLE     ) continue;
		// ok we should stop now
		break;
	}
	// min is 0
	if ( a < 0 ) a = 0;
	// now look for the image urls within this window
	for ( long i = a ; i < lastPosScore ; i++ ) {
		// skip if not <img> tag
		if ( tids[i] != TAG_IMG ) continue;
		// get the node num into Xml.cpp::m_nodes[] array
		long nn = words->m_nodes[i];
		// check width to rule out small decorating imgs
		long width = xml->getLong(nn,nn+1,"width", -1 );
		if ( width != -1 && width < 50 ) continue;
		// same with height
		long height = xml->getLong(nn,nn+1,"height", -1 );
		if ( height != -1 && height < 50 ) continue;
		// get the url of the image
		long srcLen;
		char *src = xml->getString(nn,"src",&srcLen);
		// skip if none
		if ( srcLen <= 2 ) continue;
		// set it to the full url
		Url iu;
		// use "pageUrl" as the baseUrl
		iu.set ( pageUrl , src , srcLen );
		// skip if invalid domain or TLD
		if ( iu.getDomainLen() <= 0 ) continue;
		// skip if not from same domain as page url
		//long dlen = pageUrl->getDomainLen();
		//if ( iu.getDomainLen() != dlen ) continue;
		//if(strncmp(iu.getDomain(),pageUrl->getDomain(),dlen))continue
		// get the full url
		char *u = iu.getUrl();
		long ulen = iu.getUrlLen();
		// skip common crap
		if ( strncasestr(u,ulen,"logo"           ) ) continue;
		if ( strncasestr(u,ulen,"comment"        ) ) continue;
		if ( strncasestr(u,ulen,"print"          ) ) continue;
		if ( strncasestr(u,ulen,"subscribe"      ) ) continue;
		if ( strncasestr(u,ulen,"header"         ) ) continue;
		if ( strncasestr(u,ulen,"footer"         ) ) continue;
		if ( strncasestr(u,ulen,"menu"           ) ) continue;
		if ( strncasestr(u,ulen,"button"         ) ) continue;
		if ( strncasestr(u,ulen,"banner"         ) ) continue;
		if ( strncasestr(u,ulen,"ad.doubleclick.") ) continue;
		if ( strncasestr(u,ulen,"ads.webfeat."   ) ) continue;
		if ( strncasestr(u,ulen,"xads.zedo."     ) ) continue;
		// save it
		m_imageNodes[m_numImages] = nn;
		// before we lookup the image url to see if it is unique we
		// must first make sure that we have an adequate number of
		// permalinks from this same site with this same hop count.
		// we need at least 10 before we extract image thumbnails.
		char buf[2000];
		// set the query
		Query q;
		// if we do have 10 or more, then we lookup the image url to
		// make sure it is indeed unique
		sprintf ( buf , "gbimage:%s",u);
		// TODO: make sure this is a no-split termid storage thingy
		//       in Msg14.cpp
		// g_errno is set on error
		if ( ! q.set2 ( buf , langUnknown , false ) ) return;
		// store the termid
		m_termIds[m_numImages] = q.getTermId(0);
		// advance the counter
		m_numImages++;
		// break if full
		if ( m_numImages >= MAX_IMAGES ) break;
	}
}
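// A minimal, self-contained sketch of the backward boundary scan in
// setCandidates() above: bit 0x01 of tc[tid] records a front-tag
// sighting, 0x02 a back-tag sighting, and tc[tid]==0x03 means we have
// walked past a complete <tag>...</tag> pair; we stop at the first
// complete pair of a section-type tag. The tids/isBack/isSectionTag
// arrays are hypothetical stand-ins for Words::getTagIds(),
// Words::isBackTag() and the hard-coded TAG_* checks.
#include <string.h>
static int findSectionBoundary ( const short *tids , const bool *isBack ,
				 int start , const bool isSectionTag[512] ) {
	char tc[512];
	memset ( tc , 0 , 512 );
	int a = start;
	for ( ; a >= 0 ; a-- ) {
		short tid = tids[a];
		if ( tid <= 1 ) continue;             // not a tag, or generic xml
		tc[tid] |= isBack[a] ? 0x02 : 0x01;   // mark front/back sighting
		if ( tc[tid] != 0x03 ) continue;      // need both halves of a pair
		if ( ! isSectionTag[tid] ) continue;  // only section-type tags count
		break;                                // hit a boundary
	}
	return a < 0 ? 0 : a;                         // clamp to 0 like the original
}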
void Images::setCandidates ( Url *pageUrl , Words *words , Xml *xml ,
			     Sections *sections , XmlDoc *xd ) {
	// not valid for now
	m_thumbnailValid = false;
	// reset our array of image node candidates
	m_numImages = 0;
	// flag it
	m_setCalled = true;
	// strange...
	if ( m_imgReply ) { char *xx=NULL;*xx=0; }
	// save this
	m_xml = xml;
	m_pageUrl = pageUrl;

	//
	// first add any open graph candidate.
	// basically the page telling us the best image straight up.
	//
	int32_t node2 = -1;
	int32_t startNode = 0;

	// . field can be stuff like "summary","description","keywords",...
	// . if "convertHtmlEntites" is true we change < to &lt; and > to &gt;
	// . <meta property="og:image" content="http://example.com/rock2.jpg"/>
	// . <meta property="og:image" content="http://example.com/rock3.jpg"/>
 ogimgloop:
	char ubuf[2000];
	int32_t ulen = xml->getMetaContent( ubuf, 1999, "og:image", 8,
					    "property", startNode, &node2 );
	// update this in case goto ogimgloop is called
	startNode = node2 + 1;
	// see section below for explanation of what we are storing here...
	if ( node2 >= 0 ) {
		// save it
		m_imageNodes[m_numImages] = node2;
		Query q;
		if ( ulen > MAX_URL_LEN ) goto ogimgloop;
		// set it to the full url
		Url iu;
		// use "pageUrl" as the baseUrl
		iu.set( pageUrl, ubuf, ulen );
		// skip if invalid domain or TLD
		if ( iu.getDomainLen() <= 0 ) goto ogimgloop;
		// for looking it up on disk to see if unique or not
		char buf[2000];
		// if we don't put it in quotes it expands '|' into
		// the "PiiPe" operator in Query.cpp
		snprintf ( buf , 1999, "gbimage:\"%s\"",iu.getUrl());
		// TODO: make sure this is a no-split termid storage thingy
		//       in Msg14.cpp
		// g_errno is set on error
		if ( ! q.set2 ( buf , langUnknown , false ) ) return;
		// sanity test
		if ( q.getNumTerms() != 1 ) { char *xx=0;*xx=0; }
		// store the termid
		m_termIds[m_numImages] = q.getTermId(0);
		// advance the counter
		m_numImages++;
		// try to get more open graph images if we have some room
		if ( m_numImages + 2 < MAX_IMAGES ) goto ogimgloop;
	}

	//m_pageSite = pageSite;
	// scan the words
	int32_t nw = words->getNumWords();
	nodeid_t *tids = words->getTagIds();
	int64_t *wids = words->getWordIds();
	//int32_t *scores = scoresArg->m_scores;
	Section **sp = NULL;
	if ( sections ) sp = sections->m_sectionPtrs;
	// not if we don't have any identified sections
	if ( sections && sections->m_numSections <= 0 ) sp = NULL;
	// the positive scored window
	int32_t firstPosScore = -1;
	int32_t lastPosScore  = -1;
	int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT;
	// find positive scoring window
	for ( int32_t i = 0 ; i < nw ; i++ ) {
		// skip if in bad section
		if ( sp && (sp[i]->m_flags & badFlags) ) continue;
		if ( wids[i] != 0 ) continue;
		// set first positive scoring guy
		if ( firstPosScore == -1 ) firstPosScore = i;
		// keep track of last guy
		lastPosScore = i;
	}
	// sanity check
	if ( getNumXmlNodes() > 512 ) { char *xx=NULL;*xx=0; }
	// . pedal firstPosScore back until we hit a section boundary
	// . i.e. stop once we hit a front/back tag pair, like <div> and </div>
	char tc[512];
	memset ( tc , 0 , 512 );
	int32_t a = firstPosScore;
	for ( ; a >= 0 ; a-- ) {
		// get the tid
		nodeid_t tid = tids[a];
		// remove back bit, if any
		tid &= BACKBITCOMP;
		// skip if not a tag, or a generic xml tag
		if ( tid <= 1 ) continue;
		// mark it
		if ( words->isBackTag(a) ) tc[tid] |= 0x02;
		else                       tc[tid] |= 0x01;
		// continue if not a full front/back pair
		if ( tc[tid] != 0x03 ) continue;
		// continue if not a "section" type tag (see Scores.cpp)
		if ( tid != TAG_DIV      &&
		     tid != TAG_TEXTAREA &&
		     tid != TAG_TR       &&
		     tid != TAG_TD       &&
		     tid != TAG_TABLE     ) continue;
		// ok we should stop now
		break;
	}
	// min is 0
	if ( a < 0 ) a = 0;
	// now look for the image urls within this window
	for ( int32_t i = a ; i < lastPosScore ; i++ ) {
		// skip if not <img> tag
		if ( tids[i] != TAG_IMG ) continue;
		// get the node num into Xml.cpp::m_nodes[] array
		int32_t nn = words->getNodes()[i];
		// check width to rule out small decorating imgs
		int32_t width = xml->getLong(nn,nn+1,"width", -1 );
		if ( width != -1 && width < 50 ) continue;
		// same with height
		int32_t height = xml->getLong(nn,nn+1,"height", -1 );
		if ( height != -1 && height < 50 ) continue;
		// get the url of the image
		int32_t srcLen;
		char *src = xml->getString(nn,"src",&srcLen);
		// skip if none
		if ( srcLen <= 2 ) continue;
		// set it to the full url
		Url iu;
		// use "pageUrl" as the baseUrl
		iu.set( pageUrl, src, srcLen );
		// skip if invalid domain or TLD
		if ( iu.getDomainLen() <= 0 ) continue;
		// skip if not from same domain as page url
		//int32_t dlen = pageUrl->getDomainLen();
		//if ( iu.getDomainLen() != dlen ) continue;
		//if(strncmp(iu.getDomain(),pageUrl->getDomain(),dlen))continue
		// get the full url
		char *u = iu.getUrl();
		int32_t ulen = iu.getUrlLen();
		// skip common crap
		if ( strncasestr(u,ulen,"logo"           ) ) continue;
		if ( strncasestr(u,ulen,"comment"        ) ) continue;
		if ( strncasestr(u,ulen,"print"          ) ) continue;
		if ( strncasestr(u,ulen,"subscribe"      ) ) continue;
		if ( strncasestr(u,ulen,"header"         ) ) continue;
		if ( strncasestr(u,ulen,"footer"         ) ) continue;
		if ( strncasestr(u,ulen,"menu"           ) ) continue;
		if ( strncasestr(u,ulen,"button"         ) ) continue;
		if ( strncasestr(u,ulen,"banner"         ) ) continue;
		if ( strncasestr(u,ulen,"ad.doubleclick.") ) continue;
		if ( strncasestr(u,ulen,"ads.webfeat."   ) ) continue;
		if ( strncasestr(u,ulen,"xads.zedo."     ) ) continue;
		// save it
		m_imageNodes[m_numImages] = nn;
		// before we lookup the image url to see if it is unique we
		// must first make sure that we have an adequate number of
		// permalinks from this same site with this same hop count.
		// we need at least 10 before we extract image thumbnails.
		char buf[2000];
		// set the query
		Query q;
		// if we do have 10 or more, then we lookup the image url to
		// make sure it is indeed unique
		sprintf ( buf , "gbimage:\"%s\"",u);
		// TODO: make sure this is a no-split termid storage thingy
		//       in Msg14.cpp
		// g_errno is set on error
		if ( ! q.set2 ( buf , langUnknown , false ) ) return;
		// store the termid
		m_termIds[m_numImages] = q.getTermId(0);
		// advance the counter
		m_numImages++;
		// break if full
		if ( m_numImages >= MAX_IMAGES ) break;
	}
}
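// A minimal sketch of the control flow behind "ogimgloop" above,
// rewritten as a loop to make the termination argument explicit: each
// getMetaContent() call resumes the scan at startNode, and setting
// startNode = node2 + 1 guarantees forward progress, so the scan ends
// once no more og:image metas are found or the candidate array nears
// MAX_IMAGES. findNextOgImage() is a hypothetical stand-in for
// Xml::getMetaContent(), not the real API.
static int findNextOgImage ( const bool *hasOgImage , int numNodes , int start ) {
	for ( int i = start ; i < numNodes ; i++ )
		if ( hasOgImage[i] ) return i;  // toy match test
	return -1;                              // no more matches
}
static int countOgImages ( const bool *hasOgImage , int numNodes , int maxImages ) {
	int count = 0;
	int startNode = 0;
	int node2;
	while ( ( node2 = findNextOgImage(hasOgImage,numNodes,startNode) ) >= 0 ) {
		startNode = node2 + 1;                // forward progress: must terminate
		count++;
		if ( count + 2 >= maxImages ) break;  // leave room, as the code above does
	}
	return count;
}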
// init our rdb
bool Titledb::init ( ) {
	// key sanity tests
	int64_t uh48  = 0x1234567887654321LL & 0x0000ffffffffffffLL;
	int64_t docId = 123456789;
	key_t k = makeKey(docId,uh48,false);
	if ( getDocId(&k) != docId )    { char *xx=NULL;*xx=0; }
	if ( getUrlHash48(&k) != uh48 ) { char *xx=NULL;*xx=0; }

	char *url = "http://.ezinemark.com/int32_t-island-child-custody-attorneys-new-york-visitation-lawyers-melville-legal-custody-law-firm-45f00bbed18.html";
	Url uu;
	uu.set(url);
	char *d1 = uu.getDomain();
	int32_t dlen1 = uu.getDomainLen();
	int32_t dlen2 = 0;
	char *d2 = getDomFast ( url , &dlen2 );
	if ( dlen1 != dlen2 ) { char *xx=NULL;*xx=0; }

	// another one
	url = "http://ok/";
	uu.set(url);
	d1 = uu.getDomain();
	dlen1 = uu.getDomainLen();
	dlen2 = 0;
	d2 = getDomFast ( url , &dlen2 );
	if ( dlen1 != dlen2 ) { char *xx=NULL;*xx=0; }

	int64_t maxMem = 200000000; // 200MB

	// . what's max # of tree nodes?
	// . assume avg TitleRec size (compressed html doc) is about 1k we get:
	// . NOTE: overhead is about 32 bytes per node
	int32_t maxTreeNodes = maxMem / (1*1024);

	// . we now use a disk page cache for titledb as opposed to the
	//   old rec cache. i am trying to do away with the Rdb::m_cache rec
	//   cache in favor of cleverly used disk page caches, because
	//   the rec caches are not real-time and get stale.
	// . just hard-code 30MB for now
	int32_t pcmem = 30000000; // = g_conf.m_titledbMaxDiskPageCacheMem;
	// f**k that we need all the mem!
	//pcmem = 0;
	// do not use any page cache if doing tmp cluster in order to
	// prevent swapping
	if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
	int32_t pageSize = GB_INDEXDB_PAGE_SIZE;
	// init the page cache
	// . MDW: "minimize disk seeks" not working otherwise i'd enable it!
	if ( ! m_pc.init ( "titledb", RDB_TITLEDB, pcmem , pageSize ) )
		return log("db: Titledb init failed.");
	// each entry in the cache is usually just a single record, no lists
	//int32_t maxCacheNodes = g_conf.m_titledbMaxCacheMem / (10*1024);

	// initialize our own internal rdb
	if ( ! m_rdb.init ( g_hostdb.m_dir ,
			    "titledb" ,
			    true ,   // dedup same keys?
			    -1 ,     // fixed record size
			    //g_hostdb.m_groupMask ,
			    //g_hostdb.m_groupId ,
			    //g_conf.m_titledbMinFilesToMerge ,
			    // this should not really be changed...
			    -1 ,     //3,//230 minfilestomerge mintomerge
			    maxMem , // g_conf.m_titledbMaxTreeMem
			    maxTreeNodes ,
			    // now we balance so Sync.cpp can do ordered huge lists
			    true ,   // balance tree?
			    // turn off cache for now because the page cache
			    // is just as fast and does not get out of date
			    // so bad??
			    //0 ,
			    0 ,      // g_conf.m_titledbMaxCacheMem
			    0 ,      // maxCacheNodes
			    false ,  // half keys?
			    false ,  // g_conf.m_titledbSav
			    &m_pc ,  // page cache ptr
			    true ) ) // is titledb?
		return false;
	return true;
	// validate
	//return verify ( );
}
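// The tree-sizing arithmetic above, spelled out as a sketch: with an
// assumed ~1KB average compressed TitleRec (the ~32 bytes of per-node
// overhead is absorbed by that estimate), the node budget is just the
// memory budget divided by 1KB, so the hard-coded 200MB yields
// 200000000 / 1024 = 195312 tree nodes. sketchMaxTreeNodes is a
// hypothetical name for illustration only.
#include <stdint.h>
static int32_t sketchMaxTreeNodes ( int64_t maxMem ) {
	// one node per ~1KB of budget
	return (int32_t)( maxMem / (1*1024) );
}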
int main ( int argc , char *argv[] ) {
	bool addWWW = true;
	bool stripSession = true;
	// check for arguments
	for (int32_t i = 1; i < argc; i++) {
		if      (strcmp(argv[i], "-w") == 0) addWWW = false;
		else if (strcmp(argv[i], "-s") == 0) stripSession = false;
	}
	// initialize
	//g_mem.init(100*1024);
	hashinit();
	//g_conf.m_tfndbExtBits = 23;

 loop:
	// read a url from stdin
	char sbuf[1024];
	if ( ! fgets ( sbuf , 1024 , stdin ) ) exit(1);
	char *s = sbuf;
	char fbuf[1024];
	// decode if we should
	if ( strncmp(s,"http%3A%2F%2F" ,13) == 0 ||
	     strncmp(s,"https%3A%2F%2F",14) == 0 ) {
		urlDecode(fbuf,s,gbstrlen(s));
		s = fbuf;
	}
	// old url (s still has the trailing newline from fgets)
	printf("###############\n");
	printf("old: %s",s);
	int32_t slen = gbstrlen(s);
	// remove any www. if !addWWW
	if (!addWWW) {
		if (slen >= 4 && strncasecmp(s, "www.", 4) == 0) {
			slen -= 4;
			memmove(s, &s[4], slen);
		}
		else {
			// get past a ://
			int32_t si = 0;
			while (si < slen &&
			       ( s[si]   != ':' ||
			         s[si+1] != '/' ||
			         s[si+2] != '/' ) )
				si++;
			// remove the www.
			if (si + 7 < slen) {
				si += 3;
				if (strncasecmp(&s[si], "www.", 4) == 0) {
					slen -= 4;
					memmove(&s[si], &s[si+4], slen-si);
				}
			}
		}
	}
	// set it
	Url u;
	u.set ( s , slen ,
		addWWW ,        /* add www? */
		stripSession ); /* strip session ids? */
	// print it
	char out[1024*4];
	char *p = out;
	p += sprintf(p,"tld: ");
	gbmemcpy ( p, u.getTLD(),u.getTLDLen());
	p += u.getTLDLen();
	char c = *p;
	*p = '\0';
	printf("%s\n",out);
	*p = c;
	// dom
	p = out;
	sprintf ( p , "dom: ");
	p += gbstrlen ( p );
	gbmemcpy ( p , u.getDomain() , u.getDomainLen() );
	p += u.getDomainLen();
	c = *p;
	*p = '\0';
	printf("%s\n",out);
	*p = c;
	// host
	p = out;
	sprintf ( p , "host: ");
	p += gbstrlen ( p );
	gbmemcpy ( p , u.getHost() , u.getHostLen() );
	p += u.getHostLen();
	c = *p;
	*p = '\0';
	printf("%s\n",out);
	*p = c;
	// then the whole url
	printf("url: %s\n", u.getUrl() );
	/*
	int32_t siteLen;
	char *site = u.getSite ( &siteLen , NULL , false );
	if ( site ) { c = site[siteLen]; site[siteLen] = '\0'; }
	printf("site: %s\n", site );
	if ( site ) site[siteLen] = c;
	*/
	SiteGetter sg;
	sg.getSite ( u.getUrl() ,
		     NULL ,    // tagrec
		     0 ,       // timestamp
		     NULL ,    // coll
		     0 ,       // niceness
		     //false , // addtags
		     NULL ,    // state
		     NULL );   // callback
	if ( sg.m_siteLen ) printf("site: %s\n",sg.m_site);
	printf("isRoot: %"INT32"\n",(int32_t)u.isRoot());
	/*
	bool perm = ::isPermalink ( NULL ,    // coll
				    NULL ,    // Links ptr
				    &u ,      // the url
				    CT_HTML , // contentType
				    NULL ,    // LinkInfo ptr
				    false );  // isRSS?
	printf ("isPermalink: %"INT32"\n",(int32_t)perm);
	*/
	// print the path too
	p = out;
	p += sprintf ( p , "path: " );
	gbmemcpy ( p , u.getPath(), u.getPathLen() );
	p += u.getPathLen();
	if ( u.getFilename() ) {
		p += sprintf ( p , "\nfilename: " );
		gbmemcpy ( p , u.getFilename(), u.getFilenameLen() );
		p += u.getFilenameLen();
	}
	// terminate and print outside the if so the path prints even
	// when there is no filename
	*p = '\0';
	printf("%s\n", out );
	// encoded
	char dst[MAX_URL_LEN+200];
	urlEncode ( dst , MAX_URL_LEN+100 ,
		    u.getUrl() , u.getUrlLen() ,
		    false ); // are we encoding a request path?
	printf("encoded: %s\n",dst);
	// the probable docid
	int64_t pd = g_titledb.getProbableDocId(&u);
	printf("pdocid: %"UINT64"\n", pd );
	printf("dom8: 0x%"XINT32"\n",
	       (int32_t)g_titledb.getDomHash8FromDocId(pd) );
	//printf("ext23: 0x%"XINT32"\n",g_tfndb.makeExt(&u));
	if ( u.isLinkLoop() ) printf("islinkloop: yes\n");
	else                  printf("islinkloop: no\n");
	int64_t hh64 = u.getHostHash64();
	printf("hosthash64: 0x%016"XINT64"\n",hh64);
	uint32_t hh32 = u.getHostHash32();
	printf("hosthash32: 0x%08"XINT32" (%"UINT32")\n",hh32,hh32);
	int64_t dh64 = u.getDomainHash64();
	printf("domhash64: 0x%016"XINT64"\n",dh64);
	int64_t uh64 = u.getUrlHash64();
	printf("urlhash64: 0x%016"XINT64"\n",uh64);
	//if ( isUrlUnregulated(NULL,0,&u) ) printf("unregulated: yes\n");
	//else                               printf("unregulated: no\n");
	goto loop;
}