// . returns false if blocked, true otherwise
// . see if other pages we've indexed have this same image url
bool Images::launchRequests ( ) {
	// loop over all images
	for ( long i = m_i ; i < m_numImages ; i++ ) {
		// advance
		m_i++;
		// assume no error
		m_errors[i] = 0;
		// make the keys. each term is a gbimage:<imageUrl> term
		// so we are searching for the image url to see how often
		// it is repeated on other pages.
		key144_t startKey ; 
		key144_t endKey   ;
		g_posdb.makeStartKey(&startKey,m_termIds[i]);
		g_posdb.makeEndKey  (&endKey  ,m_termIds[i]);
		// get our residing groupid
		//unsigned long gid = g_indexdb.getNoSplitGroupId(&startKey);
		// no split is true for this one, so we do not split by docid
		//uint32_t gid = getGroupId(RDB_INDEXDB,&startKey,false);
		unsigned long shardNum;
		shardNum = getShardNum(RDB_POSDB,&startKey);
		// get the termlist
		if ( ! m_msg0.getList ( -1    , // hostid
					-1    , // ip
					-1    , // port
					0     , // maxAge
					false , // addToCache?
					RDB_POSDB,
					m_collnum      ,
					&m_list     , // RdbList ptr
					(char *)&startKey    ,
					(char *)&endKey      ,
					1024        , // minRecSize
					this        ,
					gotTermListWrapper ,
					MAX_NICENESS       ,
					false , // err correction?
					true  , // inc tree?
					true  , // domergeobsolete
					-1    , // firstHostId
					0     , // start filenum
					-1    , // numFiles
					30    , // timeout
					-1    , // syncpoint
					-1    , // preferlocalreads
					NULL  , // msg5
					NULL  , // msg5b
					false , // isRealMerge?
					true  , // allow pg cache
					false , // focelocalindexdb
					false , // doIndexdbSplit?
					shardNum ))// force paritysplit
			return false;
		// process the msg36 response
		gotTermList ();
	}
	// i guess we didn't block
	return downloadImages();
}
// Example #2
// 0
// Handles one page having finished loading its images.
//
// Does nothing if the download was cancelled. Otherwise logs the page url,
// emits finishedImagesPage(page) and decrements m_waiting. While m_waiting
// is still positive it calls loadNext() and returns. Once it is not, it
// gathers the images of every page in m_pages, filters them, caps the
// result at m_max (when m_max > 0) and either passes the list to
// downloadImages() (m_quit set) or emits finishedImages().
void Downloader::finishedLoadingImages(Page *page)
{
	if (m_cancelled)
		return;

	log("Received page '"+page->url().toString()+"'");
	emit finishedImagesPage(page);

	// More pages outstanding: start the next one and wait for it
	if (--m_waiting > 0)
	{
		loadNext();
		return;
	}

	// Collect the images of all pages, in page order
	QList<Image*> images;
	for (int i = 0; i < m_pages->size(); ++i)
		for (Image *img : m_pages->at(i)->images())
		{
			// When m_blacklist is false, any image for which
			// blacklisted(m_blacklistedTags) returns a non-empty
			// result is skipped and counted in m_ignored
			if (!m_blacklist)
			{
				if (!img->blacklisted(m_blacklistedTags).empty())
				{
					++m_ignored;
					continue;
				}
			}
			// When m_noduplicates is set, skip images whose md5 equals
			// that of an image already kept
			if (m_noduplicates)
			{
				bool found = false;
				for (Image *image : images)
					if (image->md5() == img->md5())
					{
						found = true;
						// One match is enough; no need to compare
						// against the remaining kept images
						break;
					}
				if (found)
					continue;
			}
			images.append(img);
		}

	// Keep at most m_max images; m_max <= 0 means no limit
	QList<Image*> imgs;
	int kept = 0;
	for (Image *img : images)
	{
		// Limit reached: nothing after this point would be appended
		if (m_max > 0 && kept >= m_max)
			break;
		imgs.append(img);
		++kept;
	}

	if (m_quit)
		downloadImages(imgs);
	else
		emit finishedImages(imgs);
}
// . returns false if blocked, returns true otherwise
// . sets g_errno on error
// . resets the per-request state (m_hadError, m_stopDownloading, m_i, m_j,
//   m_phase), saves collnum/docId/callback/state/xd into members, then
//   reads the posdb termlist of the "gbsitetemplate:<tagPairHash><site>"
//   term through m_msg0
// . pageSite/siteLen: the site string; pageSite[siteLen] is temporarily
//   overwritten with a NUL and restored, so the buffer must be writable
//   at that index. must not be NULL and must not start with "http://"
// . xd: must not be NULL (it is dereferenced unconditionally). if
//   xd->m_isDiffbotJSONObject is set we go straight to downloadImages()
// . hopCount: not read anywhere in this function
// . state/callback: stored in m_state/m_callback
bool Images::getThumbnail ( char *pageSite ,
			    long  siteLen  ,
			    long long docId ,
			    XmlDoc *xd ,
			    collnum_t collnum,//char *coll ,
			    //char **statusPtr ,
			    long hopCount,
			    void *state ,
			    void   (*callback)(void *state) ) {
	// sanity check (deliberate crash via NULL write, as elsewhere here)
	if ( ! m_setCalled ) { char *xx=NULL;*xx=0; }
	// we haven't had any error
	m_hadError  = 0;
	// no reason to stop yet
	m_stopDownloading = false;
	// reset here now
	m_i = 0;
	m_j = 0;
	m_phase = 0;

	// sanity check
	if ( ! m_pageUrl ) { char *xx=NULL;*xx=0; }
	// sanity check
	if ( ! pageSite ) { char *xx=NULL;*xx=0; }
	// we need to be a permalink
	//if ( ! isPermalink ) return true;

	// save these
	//m_statusPtr = statusPtr;
	// save this
	m_collnum = collnum;
	m_docId = docId;
	m_callback = callback;
	m_state = state;

	// if this doc is a json diffbot reply it already has the primary
	// image selected so just use that
	m_xd = xd;
	if ( m_xd->m_isDiffbotJSONObject ) 
		return downloadImages();

	// if no candidates, we are done, no error
	if ( m_numImages == 0 ) return true;

	//Vector *v = xd->getTagVector();
	// this will at least have one component, the 0/NULL component
	uint32_t *tph = xd->getTagPairHash32();
	// must not block or error on us
	if ( tph == (void *)-1 ) { char *xx=NULL;*xx=0; }
	// must not error on use?
	if ( ! tph ) { char *xx=NULL;*xx=0; }

	// . see DupDetector.cpp, very similar to this
	// . see how many pages we have from our same site with our same 
	//   html template (and that are permalinks)
	char buf[2000];
	// NUL-terminate the site in place for the string calls below, and
	// remember the byte we clobbered so it can be put back
	char c = pageSite[siteLen];
	pageSite[siteLen]=0;
	// site MUST NOT start with "http://"
	if ( strncmp ( pageSite , "http://", 7)==0){char*xx=NULL;*xx=0;}
	// . this must match what we hash in XmlDoc::hashNoSplit()
	// . bounded write: a site too long for buf is truncated rather than
	//   overrunning the stack buffer. a truncated term simply will not
	//   match what was hashed for the full site string
	snprintf ( buf , sizeof(buf) , "gbsitetemplate:%lu%s",
		   (unsigned long)*tph,pageSite );
	pageSite[siteLen]=c;
	// TODO: make sure this is a no-split termid storage thingy
	// in Msg14.cpp
	Query q;
	if ( ! q.set2 ( buf , langUnknown , false ) )
		// return true with g_errno set on error
		return true;
	// store the termid
	long long termId = q.getTermId(0);

	key144_t startKey ;
	key144_t endKey   ;
	g_posdb.makeStartKey(&startKey,termId);
	g_posdb.makeEndKey  (&endKey  ,termId);

	// get shard of that (this termlist is sharded by termid -
	// see XmlDoc.cpp::hashNoSplit() where it hashes gbsitetemplate: term)
	long shardNum = g_hostdb.getShardNumByTermId ( &startKey );

	// if ( ! m_msg36.getTermFreq ( m_collnum               ,
	// 			     0                  , // maxAge
	// 			     termId             ,
	// 			     this               ,
	// 			     gotTermFreqWrapper ,
	// 			     MAX_NICENESS       ,
	// 			     true               ,  // exact count?
	// 			     false              ,  // inc count?
	// 			     false              ,  // dec count?
	// 			     false              )) // is split?
	// 	return false;


	// just use msg0 and limit to like 1k or something
	if ( ! m_msg0.getList ( -1    , // hostid
				-1    , // ip
				-1    , // port
				0     , // maxAge
				false , // addToCache?
				RDB_POSDB ,
				m_collnum      ,
				&m_list     , // RdbList ptr
				(char *)&startKey    ,
				(char *)&endKey      ,
				1024        , // minRecSize
				this        ,
				gotTermListWrapper ,
				MAX_NICENESS       ,
				false , // err correction?
				true  , // inc tree?
				true  , // domergeobsolete
				-1    , // firstHostId
				0     , // start filenum
				-1    , // numFiles
				30    , // timeout
				-1    , // syncpoint
				-1    , // preferlocalreads
				NULL  , // msg5
				NULL  , // msg5b
				false , // isRealMerge?
				true  , // allow pg cache
				false , // focelocalindexdb
				false , // doIndexdbSplit?
				shardNum ))// force paritysplit
		// blocked; gotTermListWrapper was handed to m_msg0 as callback
		return false;


	// did not block
	return gotTermFreq();
}
// . returns false if blocked, true otherwise
// . see if other pages we've indexed have this same image url
// . resumes at m_i, so it can be re-entered after a blocked read
//   (m_i is advanced before each read is issued)
// . when no read blocks, returns whatever downloadImages() returns
bool Images::launchRequests ( ) {
	// loop over all images
	for ( int32_t i = m_i ; i < m_numImages ; i++ ) {
		// advance
		m_i++;
		// assume no error
		m_errors[i] = 0;
		// make the keys. each term is a gbimage:<imageUrl> term
		// so we are searching for the image url to see how often
		// it is repeated on other pages.
		key144_t startKey ; 
		key144_t endKey   ;
		g_posdb.makeStartKey(&startKey,m_termIds[i]);
		g_posdb.makeEndKey  (&endKey  ,m_termIds[i]);
		uint32_t shardNum;
		// assume to be for posdb here
		shardNum = g_hostdb.getShardNumByTermId ( &startKey );

		// debug msg. shardNum is unsigned, so it is printed with the
		// unsigned 32-bit specifier
		if ( g_conf.m_logDebugImage )
			log("image: image checking shardnum %" PRIu32" (termid0=%" PRIu64")"
			    " for image url #%" PRId32,
			    shardNum ,m_termIds[i],i);

		// get the termlist
		if ( ! m_msg0.getList ( -1    , // hostid
					-1    , // ip
					-1    , // port
					0     , // maxAge
					false , // addToCache?
					RDB_POSDB,
					m_collnum      ,
					&m_list     , // RdbList ptr
					(char *)&startKey    ,
					(char *)&endKey      ,
					1024        , // minRecSize
					this        ,
					gotTermListWrapper ,
					MAX_NICENESS       ,
					false , // err correction?
					true  , // inc tree?
					true  , // domergeobsolete
					-1    , // firstHostId
					0     , // start filenum
					-1    , // numFiles
					30000 , // timeout
					-1    , // syncpoint
					-1    , // preferlocalreads
					NULL  , // msg5
					false , // isRealMerge?
					true  , // allow pg cache
					false , // focelocalindexdb
					false , // doIndexdbSplit?
					shardNum ))// force paritysplit
			// blocked; gotTermListWrapper was passed as the callback
			return false;
		// the read did not block, so process the termlist now
		gotTermList ();
	}
	// i guess we didn't block
	return downloadImages();
}