void Images::setCandidates ( Url *pageUrl , Words *words , Xml *xml , Sections *sections , XmlDoc *xd ) { // not valid for now m_thumbnailValid = false; // reset our array of image node candidates m_numImages = 0; // flag it m_setCalled = true; // strange... if ( m_imgReply ) { char *xx=NULL;*xx=0; } // save this m_xml = xml; m_pageUrl = pageUrl; // if we are a diffbot json reply, trust that diffbot got the // best candidate, and just use that if ( xd->m_isDiffbotJSONObject ) return; //m_pageSite = pageSite; // scan the words long nw = words->getNumWords(); nodeid_t *tids = words->getTagIds(); long long *wids = words->getWordIds(); //long *scores = scoresArg->m_scores; Section **sp = NULL; if ( sections ) sp = sections->m_sectionPtrs; // not if we don't have any identified sections if ( sections && sections->m_numSections <= 0 ) sp = NULL; // the positive scored window long firstPosScore = -1; long lastPosScore = -1; long badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_MARQUEE; // find positive scoring window for ( long i = 0 ; i < nw ; i++ ) { // skip if in bad section if ( sp && (sp[i]->m_flags & badFlags) ) continue; if ( wids[i] != 0 ) continue; // set first positive scoring guy if ( firstPosScore == -1 ) firstPosScore = i; // keep track of last guy lastPosScore = i; } // sanity check if ( getNumXmlNodes() > 512 ) { char *xx=NULL;*xx=0; } // . pedal firstPosScore back until we hit a section boundary // . i.e. stop once we hit a front/back tag pair, like <div> and </div> char tc[512]; memset ( tc , 0 , 512 ); long a = firstPosScore; for ( ; a >= 0 ; a-- ) { // get the tid nodeid_t tid = tids[a]; // remove back bit, if any tid &= BACKBITCOMP; // skip if not a tag, or a generic xml tag if ( tid <= 1 ) continue; // mark it if ( words->isBackTag(a) ) tc[tid] |= 0x02; else tc[tid] |= 0x01; // continue if not a full front/back pair if ( tc[tid] != 0x03 ) continue; // continue if not a "section" type tag (see Scores.cpp) if ( tid != TAG_DIV && tid != TAG_TEXTAREA && tid != TAG_TR && tid != TAG_TD && tid != TAG_TABLE ) continue; // ok we should stop now break; } // min is 0 if ( a < 0 ) a = 0; // now look for the image urls within this window for ( long i = a ; i < lastPosScore ; i++ ) { // skip if not <img> tag if (tids[i] != TAG_IMG ) continue; // get the node num into Xml.cpp::m_nodes[] array long nn = words->m_nodes[i]; // check width to rule out small decorating imgs long width = xml->getLong(nn,nn+1,"width", -1 ); if ( width != -1 && width < 50 ) continue; // same with height long height = xml->getLong(nn,nn+1, "height", -1 ); if ( height != -1 && height < 50 ) continue; // get the url of the image long srcLen; char *src = xml->getString(nn,"src",&srcLen); // skip if none if ( srcLen <= 2 ) continue; // set it to the full url Url iu; // use "pageUrl" as the baseUrl iu.set ( pageUrl , src , srcLen ); // skip if invalid domain or TLD if ( iu.getDomainLen() <= 0 ) continue; // skip if not from same domain as page url //long dlen = pageUrl->getDomainLen(); //if ( iu.getDomainLen() != dlen ) continue; //if(strncmp(iu.getDomain(),pageUrl->getDomain(),dlen))continue // get the full url char *u = iu.getUrl(); long ulen = iu.getUrlLen(); // skip common crap if ( strncasestr(u,ulen,"logo" ) ) continue; if ( strncasestr(u,ulen,"comment" ) ) continue; if ( strncasestr(u,ulen,"print" ) ) continue; if ( strncasestr(u,ulen,"subscribe" ) ) continue; if ( strncasestr(u,ulen,"header" ) ) continue; if ( strncasestr(u,ulen,"footer" ) ) continue; if ( strncasestr(u,ulen,"menu" ) ) continue; if ( strncasestr(u,ulen,"button" ) ) continue; if ( strncasestr(u,ulen,"banner" ) ) continue; if ( strncasestr(u,ulen,"ad.doubleclick.") ) continue; if ( strncasestr(u,ulen,"ads.webfeat." ) ) continue; if ( strncasestr(u,ulen,"xads.zedo." ) ) continue; // save it m_imageNodes[m_numImages] = nn; // before we lookup the image url to see if it is unique we // must first make sure that we have an adequate number of // permalinks from this same site with this same hop count. // we need at least 10 before we extract image thumbnails. char buf[2000]; // set the query Query q; // if we do have 10 or more, then we lookup the image url to // make sure it is indeed unique sprintf ( buf , "gbimage:%s",u); // TODO: make sure this is a no-split termid storage thingy // in Msg14.cpp if ( ! q.set2 ( buf , langUnknown , false ) ) // return true with g_errno set on error return; // store the termid m_termIds[m_numImages] = q.getTermId(0); // advance the counter m_numImages++; // break if full if ( m_numImages >= MAX_IMAGES ) break; } }
void Images::setCandidates ( Url *pageUrl , Words *words , Xml *xml , Sections *sections , XmlDoc *xd ) { // not valid for now m_thumbnailValid = false; // reset our array of image node candidates m_numImages = 0; // flag it m_setCalled = true; // strange... if ( m_imgReply ) { char *xx=NULL;*xx=0; } // save this m_xml = xml; m_pageUrl = pageUrl; // // first add any open graph candidate. // basically they page telling us the best image straight up. // int32_t node2 = -1; int32_t startNode = 0; // . field can be stuff like "summary","description","keywords",... // . if "convertHtmlEntites" is true we change < to < and > to > // . <meta property="og:image" content="http://example.com/rock2.jpg"/> // . <meta property="og:image" content="http://example.com/rock3.jpg"/> ogimgloop: char ubuf[2000]; int32_t ulen = xml->getMetaContent( ubuf, 1999, "og:image", 8, "property", startNode, &node2 ); // update this in case goto ogimgloop is called startNode = node2 + 1; // see section below for explanation of what we are storing here... if ( node2 >= 0 ) { // save it m_imageNodes[m_numImages] = node2; Query q; if ( ulen > MAX_URL_LEN ) goto ogimgloop; // set it to the full url Url iu; // use "pageUrl" as the baseUrl iu.set( pageUrl, ubuf, ulen ); // skip if invalid domain or TLD if ( iu.getDomainLen() <= 0 ) goto ogimgloop; // for looking it up on disk to see if unique or not char buf[2000]; // if we don't put in quotes it expands '|' into // the "PiiPe" operator in Query.cpp snprintf ( buf , 1999, "gbimage:\"%s\"",iu.getUrl()); // TODO: make sure this is a no-split termid storage thingy // in Msg14.cpp if ( ! q.set2 ( buf , langUnknown , false ) ) return; // sanity test if ( q.getNumTerms() != 1 ) { char *xx=0;*xx=0; } // store the termid m_termIds[m_numImages] = q.getTermId(0); // advance the counter m_numImages++; // try to get more graph images if we have some room if ( m_numImages + 2 < MAX_IMAGES ) goto ogimgloop; } //m_pageSite = pageSite; // scan the words int32_t nw = words->getNumWords(); nodeid_t *tids = words->getTagIds(); int64_t *wids = words->getWordIds(); //int32_t *scores = scoresArg->m_scores; Section **sp = NULL; if ( sections ) sp = sections->m_sectionPtrs; // not if we don't have any identified sections if ( sections && sections->m_numSections <= 0 ) sp = NULL; // the positive scored window int32_t firstPosScore = -1; int32_t lastPosScore = -1; int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT; // find positive scoring window for ( int32_t i = 0 ; i < nw ; i++ ) { // skip if in bad section if ( sp && (sp[i]->m_flags & badFlags) ) continue; if ( wids[i] != 0 ) continue; // set first positive scoring guy if ( firstPosScore == -1 ) firstPosScore = i; // keep track of last guy lastPosScore = i; } // sanity check if ( getNumXmlNodes() > 512 ) { char *xx=NULL;*xx=0; } // . pedal firstPosScore back until we hit a section boundary // . i.e. stop once we hit a front/back tag pair, like <div> and </div> char tc[512]; memset ( tc , 0 , 512 ); int32_t a = firstPosScore; for ( ; a >= 0 ; a-- ) { // get the tid nodeid_t tid = tids[a]; // remove back bit, if any tid &= BACKBITCOMP; // skip if not a tag, or a generic xml tag if ( tid <= 1 ) continue; // mark it if ( words->isBackTag(a) ) tc[tid] |= 0x02; else tc[tid] |= 0x01; // continue if not a full front/back pair if ( tc[tid] != 0x03 ) continue; // continue if not a "section" type tag (see Scores.cpp) if ( tid != TAG_DIV && tid != TAG_TEXTAREA && tid != TAG_TR && tid != TAG_TD && tid != TAG_TABLE ) continue; // ok we should stop now break; } // min is 0 if ( a < 0 ) a = 0; // now look for the image urls within this window for ( int32_t i = a ; i < lastPosScore ; i++ ) { // skip if not <img> tag if (tids[i] != TAG_IMG ) continue; // get the node num into Xml.cpp::m_nodes[] array int32_t nn = words->getNodes()[i]; // check width to rule out small decorating imgs int32_t width = xml->getLong(nn,nn+1,"width", -1 ); if ( width != -1 && width < 50 ) continue; // same with height int32_t height = xml->getLong(nn,nn+1, "height", -1 ); if ( height != -1 && height < 50 ) continue; // get the url of the image int32_t srcLen; char *src = xml->getString(nn,"src",&srcLen); // skip if none if ( srcLen <= 2 ) continue; // set it to the full url Url iu; // use "pageUrl" as the baseUrl iu.set( pageUrl, src, srcLen ); // skip if invalid domain or TLD if ( iu.getDomainLen() <= 0 ) continue; // skip if not from same domain as page url //int32_t dlen = pageUrl->getDomainLen(); //if ( iu.getDomainLen() != dlen ) continue; //if(strncmp(iu.getDomain(),pageUrl->getDomain(),dlen))continue // get the full url char *u = iu.getUrl(); int32_t ulen = iu.getUrlLen(); // skip common crap if ( strncasestr(u,ulen,"logo" ) ) continue; if ( strncasestr(u,ulen,"comment" ) ) continue; if ( strncasestr(u,ulen,"print" ) ) continue; if ( strncasestr(u,ulen,"subscribe" ) ) continue; if ( strncasestr(u,ulen,"header" ) ) continue; if ( strncasestr(u,ulen,"footer" ) ) continue; if ( strncasestr(u,ulen,"menu" ) ) continue; if ( strncasestr(u,ulen,"button" ) ) continue; if ( strncasestr(u,ulen,"banner" ) ) continue; if ( strncasestr(u,ulen,"ad.doubleclick.") ) continue; if ( strncasestr(u,ulen,"ads.webfeat." ) ) continue; if ( strncasestr(u,ulen,"xads.zedo." ) ) continue; // save it m_imageNodes[m_numImages] = nn; // before we lookup the image url to see if it is unique we // must first make sure that we have an adequate number of // permalinks from this same site with this same hop count. // we need at least 10 before we extract image thumbnails. char buf[2000]; // set the query Query q; // if we do have 10 or more, then we lookup the image url to // make sure it is indeed unique sprintf ( buf , "gbimage:\"%s\"",u); // TODO: make sure this is a no-split termid storage thingy // in Msg14.cpp if ( ! q.set2 ( buf , langUnknown , false ) ) // return true with g_errno set on error return; // store the termid m_termIds[m_numImages] = q.getTermId(0); // advance the counter m_numImages++; // break if full if ( m_numImages >= MAX_IMAGES ) break; } }
// . returns false if blocked, returns true otherwise // . sets g_errno on error bool Images::getThumbnail ( char *pageSite , long siteLen , long long docId , XmlDoc *xd , collnum_t collnum,//char *coll , //char **statusPtr , long hopCount, void *state , void (*callback)(void *state) ) { // sanity check if ( ! m_setCalled ) { char *xx=NULL;*xx=0; } // we haven't had any error m_hadError = 0; // no reason to stop yet m_stopDownloading = false; // reset here now m_i = 0; m_j = 0; m_phase = 0; // sanity check if ( ! m_pageUrl ) { char *xx=NULL;*xx=0; } // sanity check if ( ! pageSite ) { char *xx=NULL;*xx=0; } // we need to be a permalink //if ( ! isPermalink ) return true; // save these //m_statusPtr = statusPtr; // save this m_collnum = collnum; m_docId = docId; m_callback = callback; m_state = state; // if this doc is a json diffbot reply it already has the primary // image selected so just use that m_xd = xd; if ( m_xd->m_isDiffbotJSONObject ) return downloadImages(); // if no candidates, we are done, no error if ( m_numImages == 0 ) return true; //Vector *v = xd->getTagVector(); // this will at least have one component, the 0/NULL component uint32_t *tph = xd->getTagPairHash32(); // must not block or error on us if ( tph == (void *)-1 ) { char *xx=NULL;*xx=0; } // must not error on use? if ( ! tph ) { char *xx=NULL;*xx=0; } // . see DupDetector.cpp, very similar to this // . see how many pages we have from our same site with our same // html template (and that are permalinks) char buf[2000]; char c = pageSite[siteLen]; pageSite[siteLen]=0; // site MUST NOT start with "http://" if ( strncmp ( pageSite , "http://", 7)==0){char*xx=NULL;*xx=0;} // this must match what we hash in XmlDoc::hashNoSplit() sprintf ( buf , "gbsitetemplate:%lu%s", (unsigned long)*tph,pageSite ); pageSite[siteLen]=c; // TODO: make sure this is a no-split termid storage thingy // in Msg14.cpp Query q; if ( ! q.set2 ( buf , langUnknown , false ) ) // return true with g_errno set on error return true; // store the termid long long termId = q.getTermId(0); key144_t startKey ; key144_t endKey ; g_posdb.makeStartKey(&startKey,termId); g_posdb.makeEndKey (&endKey ,termId); // get shard of that (this termlist is sharded by termid - // see XmlDoc.cpp::hashNoSplit() where it hashes gbsitetemplate: term) long shardNum = g_hostdb.getShardNumByTermId ( &startKey ); // if ( ! m_msg36.getTermFreq ( m_collnum , // 0 , // maxAge // termId , // this , // gotTermFreqWrapper , // MAX_NICENESS , // true , // exact count? // false , // inc count? // false , // dec count? // false )) // is split? // return false; // just use msg0 and limit to like 1k or something if ( ! m_msg0.getList ( -1 , // hostid -1 , // ip -1 , // port 0 , // maxAge false , // addToCache? RDB_POSDB , m_collnum , &m_list , // RdbList ptr (char *)&startKey , (char *)&endKey , 1024 , // minRecSize this , gotTermListWrapper , MAX_NICENESS , false , // err correction? true , // inc tree? true , // domergeobsolete -1 , // firstHostId 0 , // start filenum -1 , // numFiles 30 , // timeout -1 , // syncpoint -1 , // preferlocalreads NULL , // msg5 NULL , // msg5b false , // isRealMerge? true , // allow pg cache false , // focelocalindexdb false , // doIndexdbSplit? shardNum ))// force paritysplit return false; // did not block return gotTermFreq(); }
// . returns false if blocked, true otherwise // . sets g_errno on error // . make a web page displaying the config of this host // . call g_httpServer.sendDynamicPage() to send it bool sendPageIndexdb ( TcpSocket *s , HttpRequest *r ) { // . get fields from cgi field of the requested url // . get the search query long queryLen = 0; char *query = r->getString ( "q" , &queryLen , NULL /*default*/); // ensure query not too big if ( queryLen >= MAX_QUERY_LEN ) { g_errno = EQUERYTOOBIG; return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno)); } // get the collection long collLen = 0; char *coll = r->getString("c",&collLen); if ( ! coll || ! coll[0] ) { //coll = g_conf.m_defaultColl; coll = g_conf.getDefaultColl( r->getHost(), r->getHostLen() ); collLen = gbstrlen(coll); } // ensure collection not too big if ( collLen >= MAX_COLL_LEN ) { g_errno = ECOLLTOOBIG; return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno)); } // make a state State10 *st ; try { st = new (State10); } catch ( ... ) { g_errno = ENOMEM; log("PageIndexdb: new(%i): %s", sizeof(State10),mstrerror(g_errno)); return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));} mnew ( st , sizeof(State10) , "PageIndexdb" ); // password, too long pwdLen = 0 ; char *pwd = r->getString ( "pwd" , &pwdLen ); if ( pwdLen > 31 ) pwdLen = 31; if ( pwdLen > 0 ) strncpy ( st->m_pwd , pwd , pwdLen ); st->m_pwd[pwdLen]='\0'; // get # of records to retreive from IndexList st->m_numRecs = r->getLong ( "numRecs" , 100 ); // use disk, tree, or cache? st->m_useDisk = r->getLong ("ud" , 0 ); st->m_useTree = r->getLong ("ut" , 0 ); st->m_useCache = r->getLong ("uc" , 0 ); st->m_useDatedb= r->getLong ("ub" , 0 ); st->m_add = r->getLong ("add", 0 ); st->m_del = r->getLong ("del", 0 ); // get the termId, if any, from the cgi vars st->m_termId = r->getLongLong ("t", 0LL ) ; // get docid and score st->m_docId = r->getLongLong ("d", 0LL ); st->m_score = r->getLong ("score", 0 ); // copy query/collection memcpy ( st->m_query , query , queryLen ); st->m_queryLen = queryLen; st->m_query [ queryLen ] ='\0'; //memcpy ( st->m_coll , coll , collLen ); //st->m_collLen = collLen; //st->m_coll [ collLen ] ='\0'; st->m_coll = coll; // save the TcpSocket st->m_socket = s; // and if the request is local/internal or not st->m_isAdmin = g_collectiondb.isAdmin ( r , s ); st->m_isLocal = r->isLocal(); st->m_r.copy ( r ); // . check for add/delete request if ( st->m_add || st->m_del ) { key_t startKey = g_indexdb.makeStartKey ( st->m_termId ); key_t endKey = g_indexdb.makeEndKey ( st->m_termId ); // construct the key to add/delete st->m_key = g_indexdb.makeKey ( st->m_termId, st->m_score , st->m_docId , st->m_del ); // make an RdbList out of the key st->m_keyList.set ( (char*)&st->m_key, sizeof(key_t), (char*)&st->m_key, sizeof(key_t), startKey, endKey, 0, false, true ); log ( LOG_INFO, "build: adding indexdb key to indexdb: " "%lx %llx", st->m_key.n1, st->m_key.n0 ); // call msg1 to add/delete key if ( ! st->m_msg1.addList ( &st->m_keyList, RDB_INDEXDB, st->m_coll, st, addedKeyWrapper, false, MAX_NICENESS ) ) return false; // continue to page if no block return gotIndexList ( st ); } if ( ! st->m_query[0] ) return gotIndexList(st); // . set query class // . a boolFlag of 0 means query is not boolean Query q; q.set2 ( query , langUnknown , true ); // 0 = boolFlag, not boolean! // reset st->m_msg36.m_termFreq = 0LL; // if query was provided, use that, otherwise use termId if ( q.getNumTerms() > 0 ) st->m_termId = q.getTermId(0); // skip if nothing else return gotTermFreq ( st ); // get the termfreq of this term! if ( ! st->m_msg36.getTermFreq ( coll , 0 , st->m_termId, st , gotTermFreqWrapper ) ) return false; // otherwise, we didn't block return gotTermFreq ( st ); }