// . returns false if blocked, true otherwise // . see if other pages we've indexed have this same image url bool Images::launchRequests ( ) { // loop over all images for ( long i = m_i ; i < m_numImages ; i++ ) { // advance m_i++; // assume no error m_errors[i] = 0; // make the keys. each term is a gbimage:<imageUrl> term // so we are searching for the image url to see how often // it is repeated on other pages. key144_t startKey ; key144_t endKey ; g_posdb.makeStartKey(&startKey,m_termIds[i]); g_posdb.makeEndKey (&endKey ,m_termIds[i]); // get our residing groupid //unsigned long gid = g_indexdb.getNoSplitGroupId(&startKey); // no split is true for this one, so we do not split by docid //uint32_t gid = getGroupId(RDB_INDEXDB,&startKey,false); unsigned long shardNum; shardNum = getShardNum(RDB_POSDB,&startKey); // get the termlist if ( ! m_msg0.getList ( -1 , // hostid -1 , // ip -1 , // port 0 , // maxAge false , // addToCache? RDB_POSDB, m_collnum , &m_list , // RdbList ptr (char *)&startKey , (char *)&endKey , 1024 , // minRecSize this , gotTermListWrapper , MAX_NICENESS , false , // err correction? true , // inc tree? true , // domergeobsolete -1 , // firstHostId 0 , // start filenum -1 , // numFiles 30 , // timeout -1 , // syncpoint -1 , // preferlocalreads NULL , // msg5 NULL , // msg5b false , // isRealMerge? true , // allow pg cache false , // focelocalindexdb false , // doIndexdbSplit? shardNum ))// force paritysplit return false; // process the msg36 response gotTermList (); } // i guess we didn't block return downloadImages(); }
/**
 * Handles a page whose images have been received.
 *
 * Logs and announces the page, then either starts the next load (while
 * pages are still pending) or gathers the images of every page, filters
 * them (blacklist, duplicates, maximum count) and passes the result on.
 *
 * @param page The page that was just received.
 */
void Downloader::finishedLoadingImages(Page *page)
{
	if (m_cancelled)
		return;

	log("Received page '"+page->url().toString()+"'");
	emit finishedImagesPage(page);

	// Pages are still pending: start the next one and come back later
	if (--m_waiting > 0)
	{
		loadNext();
		return;
	}

	// Gather the images of all pages, skipping the filtered ones
	QList<Image*> collected;
	for (int pageIndex = 0; pageIndex < m_pages->size(); ++pageIndex)
	{
		for (Image *candidate : m_pages->at(pageIndex)->images())
		{
			// The blacklist is only consulted while m_blacklist is unset
			if (!m_blacklist && !candidate->blacklisted(m_blacklistedTags).empty())
			{
				++m_ignored;
				continue;
			}

			// Skip images whose md5 matches one we already kept
			if (m_noduplicates)
			{
				bool duplicate = false;
				for (Image *kept : collected)
				{
					if (kept->md5() == candidate->md5())
						duplicate = true;
				}
				if (duplicate)
					continue;
			}

			collected.append(candidate);
		}
	}

	// Keep at most m_max images; a non-positive m_max means no limit
	QList<Image*> selected;
	int seen = 0;
	for (Image *candidate : collected)
	{
		if (m_max > 0 && seen++ >= m_max)
			continue;
		selected.append(candidate);
	}

	if (!m_quit)
	{
		emit finishedImages(selected);
		return;
	}
	downloadImages(selected);
}
// . returns false if blocked, returns true otherwise // . sets g_errno on error bool Images::getThumbnail ( char *pageSite , long siteLen , long long docId , XmlDoc *xd , collnum_t collnum,//char *coll , //char **statusPtr , long hopCount, void *state , void (*callback)(void *state) ) { // sanity check if ( ! m_setCalled ) { char *xx=NULL;*xx=0; } // we haven't had any error m_hadError = 0; // no reason to stop yet m_stopDownloading = false; // reset here now m_i = 0; m_j = 0; m_phase = 0; // sanity check if ( ! m_pageUrl ) { char *xx=NULL;*xx=0; } // sanity check if ( ! pageSite ) { char *xx=NULL;*xx=0; } // we need to be a permalink //if ( ! isPermalink ) return true; // save these //m_statusPtr = statusPtr; // save this m_collnum = collnum; m_docId = docId; m_callback = callback; m_state = state; // if this doc is a json diffbot reply it already has the primary // image selected so just use that m_xd = xd; if ( m_xd->m_isDiffbotJSONObject ) return downloadImages(); // if no candidates, we are done, no error if ( m_numImages == 0 ) return true; //Vector *v = xd->getTagVector(); // this will at least have one component, the 0/NULL component uint32_t *tph = xd->getTagPairHash32(); // must not block or error on us if ( tph == (void *)-1 ) { char *xx=NULL;*xx=0; } // must not error on use? if ( ! tph ) { char *xx=NULL;*xx=0; } // . see DupDetector.cpp, very similar to this // . see how many pages we have from our same site with our same // html template (and that are permalinks) char buf[2000]; char c = pageSite[siteLen]; pageSite[siteLen]=0; // site MUST NOT start with "http://" if ( strncmp ( pageSite , "http://", 7)==0){char*xx=NULL;*xx=0;} // this must match what we hash in XmlDoc::hashNoSplit() sprintf ( buf , "gbsitetemplate:%lu%s", (unsigned long)*tph,pageSite ); pageSite[siteLen]=c; // TODO: make sure this is a no-split termid storage thingy // in Msg14.cpp Query q; if ( ! 
q.set2 ( buf , langUnknown , false ) ) // return true with g_errno set on error return true; // store the termid long long termId = q.getTermId(0); key144_t startKey ; key144_t endKey ; g_posdb.makeStartKey(&startKey,termId); g_posdb.makeEndKey (&endKey ,termId); // get shard of that (this termlist is sharded by termid - // see XmlDoc.cpp::hashNoSplit() where it hashes gbsitetemplate: term) long shardNum = g_hostdb.getShardNumByTermId ( &startKey ); // if ( ! m_msg36.getTermFreq ( m_collnum , // 0 , // maxAge // termId , // this , // gotTermFreqWrapper , // MAX_NICENESS , // true , // exact count? // false , // inc count? // false , // dec count? // false )) // is split? // return false; // just use msg0 and limit to like 1k or something if ( ! m_msg0.getList ( -1 , // hostid -1 , // ip -1 , // port 0 , // maxAge false , // addToCache? RDB_POSDB , m_collnum , &m_list , // RdbList ptr (char *)&startKey , (char *)&endKey , 1024 , // minRecSize this , gotTermListWrapper , MAX_NICENESS , false , // err correction? true , // inc tree? true , // domergeobsolete -1 , // firstHostId 0 , // start filenum -1 , // numFiles 30 , // timeout -1 , // syncpoint -1 , // preferlocalreads NULL , // msg5 NULL , // msg5b false , // isRealMerge? true , // allow pg cache false , // focelocalindexdb false , // doIndexdbSplit? shardNum ))// force paritysplit return false; // did not block return gotTermFreq(); }
// . returns false if blocked, true otherwise // . see if other pages we've indexed have this same image url bool Images::launchRequests ( ) { // loop over all images for ( int32_t i = m_i ; i < m_numImages ; i++ ) { // advance m_i++; // assume no error m_errors[i] = 0; // make the keys. each term is a gbimage:<imageUrl> term // so we are searching for the image url to see how often // it is repeated on other pages. key144_t startKey ; key144_t endKey ; g_posdb.makeStartKey(&startKey,m_termIds[i]); g_posdb.makeEndKey (&endKey ,m_termIds[i]); uint32_t shardNum; // assume to be for posdb here shardNum = g_hostdb.getShardNumByTermId ( &startKey ); // debug msg if ( g_conf.m_logDebugImage ) log("image: image checking shardnum %" PRId32" (termid0=%" PRIu64")" " for image url #%" PRId32, shardNum ,m_termIds[i],i); // get the termlist if ( ! m_msg0.getList ( -1 , // hostid -1 , // ip -1 , // port 0 , // maxAge false , // addToCache? RDB_POSDB, m_collnum , &m_list , // RdbList ptr (char *)&startKey , (char *)&endKey , 1024 , // minRecSize this , gotTermListWrapper , MAX_NICENESS , false , // err correction? true , // inc tree? true , // domergeobsolete -1 , // firstHostId 0 , // start filenum -1 , // numFiles 30000 , // timeout -1 , // syncpoint -1 , // preferlocalreads NULL , // msg5 false , // isRealMerge? true , // allow pg cache false , // focelocalindexdb false , // doIndexdbSplit? shardNum ))// force paritysplit return false; // process the msg36 response gotTermList (); } // i guess we didn't block return downloadImages(); }