Code Example #1
void Blaster::gotDoc2 ( void *state, TcpSocket *s){
	StateBD *st=(StateBD *)state;
	// bail if got cut off
	if ( s->m_readOffset == 0 ) {
		log("blaster: Lost the Request in gotDoc2");
		m_launched--;
		//No need to point p2
		// Free stateBD
		freeStateBD(st);
		return;
	}
	
	// . don't let TcpServer free m_buf when socket is recycled/closed
	// . we own it now and are responsible for freeing it
	//	s->m_readBuf = NULL;

	long long now = gettimeofdayInMilliseconds();
	// So now after getting both docIds, get their contents
	char *reply1 = st->m_buf1 ;
	long  size1  = st->m_buf1Len;
	HttpMime mime1;
	mime1.set ( reply1 , size1 , NULL );
	char *content1    = reply1 + mime1.getMimeLen();
	long  content1Len = size1  - mime1.getMimeLen();
	unsigned long h = hash32 ( content1 , content1Len );
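	// hash of the page body; only used in the log line below so two
	// fetches of the same url can be compared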
	// log msg
	if ( g_errno ) 
		logf(LOG_INFO,"blaster: got doc (%li) (%li ms) %s : %s",
		     s->m_readOffset      , 
		     (long)(now - s->m_startTime) , 
		     st->m_u2   , 
		     mstrerror(g_errno)   );
	else
		logf(LOG_INFO,"blaster: got doc (%li) (%li ms) "
		     "(hash=%lx) %s",
		     s->m_readOffset      , 
		     (long)(now - s->m_startTime) , 
		     h ,
		     st->m_u2       );


	if (m_verbose){
		log(LOG_WARN,"blaster: content1len=%li, Content1 is =%s",
		    content1Len,content1);
		log(LOG_WARN,"\n");
	}
	char *reply2 = s->m_readBuf ;
	long  size2  = s->m_readOffset;
	HttpMime mime2;
	mime2.set ( reply2 , size2 , NULL );
	char *content2    = reply2 + mime2.getMimeLen();
	long  content2Len = size2  - mime2.getMimeLen();
	if (m_verbose)	
		log(LOG_WARN,"blaster: content2len=%li, Content2 is =%s",
		    content2Len,content2);

	// Now that we've got the contents, let's get the url links out
	// of these pages. Passing them to getSearchLinks should
	// extract the first x links found.
	/*	st->m_links1=(char *) mmalloc(200*MAX_URL_LEN,"Blaster3");
	st->m_links2=st->m_links1+100*MAX_URL_LEN;
	st->m_numLinks1=100;
	st->m_numLinks2=100;*/

	/*	long numLinks1=getSearchLinks(content1,content1Len,
				      st->m_links1,st->m_numLinks1);
	long numLinks2=getSearchLinks(content2,content2Len,
	st->m_links2,st->m_numLinks2);*/


	content1[content1Len]='\0';
	//short csEnum1= get_iana_charset(mime1.getCharset(), 
	//				mime1.getCharsetLen());
	/*	if (csEnum1== csUnknown)
		log(LOG_DEBUG, "blaster: Unknown charset : %s", mime2.getCharset());*/
	Xml xml1;
	// assume utf8
	if (!xml1.set(content1, 
		     content1Len,
		     false,
		     0,
		     false,
		     TITLEREC_CURRENT_VERSION)){
		log(LOG_WARN,"blaster: Couldn't set XML1 Class in gotDoc2");
	}
	Links links1;
	Url parent; parent.set ( st->m_u1);
	if (!links1.set(false , // useRelNoFollow
			&xml1,
			&parent,//mime1.getLocationUrl(), parent Url
			false, // setLinkHashes
			NULL  , // baseUrl
			TITLEREC_CURRENT_VERSION, // version
			0 , // niceness
			false , // parent is permalink?
			NULL )) { // oldLinks
		log(LOG_WARN,"blaster: Couldn't set Links Class in gotDoc2");
	}

	content2[content2Len]='\0';
	//short csEnum2= get_iana_charset(mime2.getCharset(), 
	//				mime2.getCharsetLen());
	/*	if (csEnum2== csUnknown)
		log(LOG_DEBUG, "blaster: Unknown charset : %s", mime2.getCharset());*/
	Xml xml2;
	if (!xml2.set(content2, 
		     content2Len,
		     false,
		     0,
		     false,
		     TITLEREC_CURRENT_VERSION)){
		log(LOG_WARN,"blaster: Couldn't set XML2 Class in gotDoc2");
	}
	Links links2;
	parent.set(st->m_u2);
	if (!links2.set(0,//siterec xml
			&xml2,
			&parent,//&st->m_u2,//mime2.getLocationUrl(),
			false,
			NULL,
			TITLEREC_CURRENT_VERSION,
			0,
			false,
			NULL)){
		log(LOG_WARN,"blaster: Couldn't set links2 Class in gotDoc2");
	}
	

	// put the hash of the sites into a hashtable, since we have
	// about 100 or so of them
	HashTableT<unsigned long, bool> urlHash;
	// put the urls from doc2 into the hashtable, but first check if
	// they are links to google or gigablast (for now). For msn and
	// yahoo we have to add other checks.
	char domain2[256];
	long dlen = 0;
	char *dom = getDomFast ( st->m_u2 , &dlen );
	// don't overflow the 256-byte buffer on an oversized domain
	if ( dlen > 255 ) dlen = 255;
	if ( dom ) strncpy(domain2,dom,dlen);
	domain2[dlen]='\0';
	for (long i=0;i<links2.getNumLinks();i++){
		// The trailing dots make sure we match exactly google or
		// gigablast in the link
		char *ss=links2.getLink(i);
		char *p;
		p=strstr(ss,domain2);
		if(p) continue;
		p=strstr(ss,"google.");
		if(p) continue;
		p=strstr(ss,"cache:");  //googles cache page
		if(p) continue;
		p= strstr(ss,"gigablast.");
		if(p) continue;
		p= strstr(ss,"web.archive.org");//older copies on gigablast
		if(p) continue;
		p= strstr(ss,"search.yahoo.com");//from gigablast search
		if(p) continue;
		p= strstr(ss,"search.msn.com");//from gigablast search
		if(p) continue;
		p= strstr(ss,"s.teoma.com");//from gigablast search
		if(p) continue;
		p= strstr(ss,"search.dmoz.org");//from gigablast search
		if(p) continue;
		p= strstr(ss,"www.answers.com");//from gigablast search
		if(p) continue;
		p= strstr(ss,"cc.msncache.com");//msn's cache page
		if(p) continue;
		if (m_verbose)
			log(LOG_WARN,"blaster: link in Doc2=%s"
			    ,links2.getLink(i));
		unsigned long h=hash32Lower_a(links2.getLink(i),
					    links2.getLinkLen(i));
		//should I check for a conflict? no, because it doesn't matter
		urlHash.addKey(h,1);
	}
	// now check if the urls from doc1 are in doc2. save the ones
	// that are not in there for later.
	/*	long numUrlsToCheck=links2.getNumLinks();*/
	long numUrlsNotFound=0;
	/*if (numLinks1<numUrlsToCheck)
	numUrlsToCheck=numLinks1;*/
	char domain1[256];
	dlen = 0;
	dom = getDomFast ( st->m_u1 ,&dlen );
	// same clamp as above to keep strncpy() in bounds
	if ( dlen > 255 ) dlen = 255;
	if ( dom ) strncpy(domain1,dom,dlen);
	domain1[dlen]='\0';
	for (long i=0;i<links1.getNumLinks();i++){
		char *ss=links1.getLink(i);
		char *p;
		p=strstr(ss,domain1);
		if(p) continue;
		p=strstr(ss,"google.");
		if(p) continue;
		p=strstr(ss,"cache:");  //googles cache page
		if(p) continue;
		p= strstr(ss,"gigablast.");
		if(p) continue;
		p= strstr(ss,"web.archive.org");//older copies on gigablast
		if(p) continue;
		p= strstr(ss,"search.yahoo.com");//from gigablast search
		if(p) continue;
		p= strstr(ss,"search.msn.com");//from gigablast search
		if(p) continue;
		p= strstr(ss,"s.teoma.com");//from gigablast search
		if(p) continue;
		p= strstr(ss,"search.dmoz.org");//from gigablast search
		if(p) continue;
		p= strstr(ss,"www.answers.com");//from gigablast search
		if(p) continue;
		p= strstr(ss,"cc.msncache.com");//msn's cache page
		if(p) continue;
		if (m_verbose)
			log(LOG_WARN,"blaster: link in Doc1=%s"
			    ,links1.getLink(i));
		unsigned long h=hash32Lower_a(links1.getLink(i),
					    links1.getLinkLen(i));
		long slot= urlHash.getSlot(h);		
		if(slot!=-1) continue;

		// if url is not present, get its doc.
		if (m_verbose || m_justDisplay)
			log(LOG_WARN,"blaster: NOT FOUND %s in %s"
			    ,links1.getLink(i),domain2);
		numUrlsNotFound++;
		//Don't do anything else if we just have to display the urls
		if (m_justDisplay) continue;
		//now get the doc of these urls
		//initialize
		st->m_numUrlDocsReceived=0;

		StateBD2 *st2;
		try { st2 = new (StateBD2); }
		catch ( ... ) {
			g_errno = ENOMEM;
			log("blaster: Failed. "
			    "Could not allocate %li bytes for query. "
			    "Returning HTTP status of 500.",
			    (long)sizeof(StateBD2));
			return;
		}
		mnew ( st2 , sizeof(StateBD2) , "Blaster4" );
		//Point to the big state;
		st2->m_st=st;
		//Msg16 does 6 redirects, so I do 6 too
		st2->m_numRedirects=6;
		//st2->m_url.set(links1.getLink(i),links1.getLinkLen(i));
		st2->m_url = links1.getLink(i);
		// No need for a proxy ip here, since we are fetching
		// docs from different IPs. Faster this way
		bool status = g_httpServer.getDoc ( st2->m_url, // url
						    0,//ip
						    0 ,  // offset
						    -1 ,  // size
						    0 , // ifModifiedSince
						    st2,  // state
						    gotDocWrapper3, // callback
						    60*1000, // timeout
						    0, // proxy ip
						    0, // proxy port
						    30*1024*1024, //maxLen
						    30*1024*1024);//maxOtherLen
		// continue if it blocked
		if ( ! status ) continue;
		// If not blocked, there is an error.
		st->m_numUrlDocsReceived++;
	}
	st->m_numUrlDocsSent=numUrlsNotFound;

	// There might have been an error while sending the docs, so
	// check for that here
	if ( st->m_numUrlDocsReceived > 0 && 
	     st->m_numUrlDocsReceived <= st->m_numUrlDocsSent ){
		log(LOG_WARN,"blaster: %li docs could not be sent due to "
		    "error",st->m_numUrlDocsReceived);
		m_launched--;
		freeStateBD(st);
		return;
	}
		
	if (numUrlsNotFound==0){
		//job done for this pair
		log(LOG_WARN,"blaster: All urls from %s found in "
		    "%s",domain1,domain2);
		m_launched--;
		// Free stateBD
		freeStateBD(st);
		return;
	}
	log(LOG_WARN,"blaster: %li urls from %s Not found in %s",
	    numUrlsNotFound,domain1,domain2);
	if(m_justDisplay){
		m_launched--;
		// Free stateBD
		freeStateBD(st);
	}
	return;
}
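
At its core, gotDoc2() performs a set difference over links: every link found on page 2 is hashed into urlHash, and every link from page 1 whose hash is not in that table is treated as missing and fetched. Below is a minimal standalone sketch of that pattern, not the Gigablast code itself: it assumes the links have already been extracted into std::string vectors, and it substitutes std::unordered_set plus a simple lowercasing helper for the Links class and hash32Lower_a(). The filtering of self-links and search-engine/cache domains done above is omitted.

#include <algorithm>
#include <cctype>
#include <string>
#include <unordered_set>
#include <vector>

// Lowercase a copy of the url so the membership test is case-insensitive,
// mirroring the hash32Lower_a() call in gotDoc2().
static std::string lowered ( const std::string &u ) {
	std::string s = u;
	std::transform ( s.begin(), s.end(), s.begin(),
			 [](unsigned char c){ return std::tolower(c); } );
	return s;
}

// Return the links that appear in links1 but not in links2.
static std::vector<std::string> linksMissingFromDoc2 (
	const std::vector<std::string> &links1 ,
	const std::vector<std::string> &links2 ) {
	// index page 2's links once
	std::unordered_set<std::string> seen;
	for ( const std::string &u : links2 ) seen.insert ( lowered(u) );
	// keep the page 1 links that page 2 did not have
	std::vector<std::string> missing;
	for ( const std::string &u : links1 )
		if ( seen.count ( lowered(u) ) == 0 ) missing.push_back ( u );
	return missing;
}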
Code Example #2
// . merge all the replies together
// . put final merged docids into m_docIds[],m_bitScores[],m_scores[],...
// . this calls Msg51 to get cluster levels when done merging
// . Msg51 remembers clusterRecs from previous call to avoid repeating lookups
// . returns false if blocked, true otherwise
// . sets g_errno and returns true on error
bool Msg3a::mergeLists ( ) {

    // time how long the merge takes
    if ( m_debug ) {
        logf( LOG_DEBUG, "query: msg3a: --- Final DocIds --- " );
        m_startTime = gettimeofdayInMilliseconds();
    }

    // reset our final docids count here in case we are a re-call
    m_numDocIds = 0;
    // a secondary count, how many unique docids we scanned, and not
    // necessarily added to the m_docIds[] array
    //m_totalDocCount = 0; // long docCount = 0;
    m_moreDocIdsAvail = true;


    // shortcut
    //long numSplits = m_numHosts;//indexdbSplit;

    // . point to the various docids, etc. in each split reply
    // . tcPtr = term count. how many required query terms does the doc
    //   have? formerly called topExplicits in IndexTable2.cpp
    long long     *diPtr [MAX_INDEXDB_SPLIT];
    float         *rsPtr [MAX_INDEXDB_SPLIT];
    key_t         *ksPtr [MAX_INDEXDB_SPLIT];
    long long     *diEnd [MAX_INDEXDB_SPLIT];
    for ( long j = 0; j < m_numHosts ; j++ ) {
        Msg39Reply *mr =m_reply[j];
        // if we have gbdocid:| in query this could be NULL
        if ( ! mr ) {
            diPtr[j] = NULL;
            diEnd[j] = NULL;
            rsPtr[j] = NULL;
            ksPtr[j] = NULL;
            continue;
        }
        diPtr [j] = (long long *)mr->ptr_docIds;
        rsPtr [j] = (float     *)mr->ptr_scores;
        ksPtr [j] = (key_t     *)mr->ptr_clusterRecs;
        diEnd [j] = (long long *)(mr->ptr_docIds +
                                  mr->m_numDocIds * 8);
    }

    // clear if we had it
    if ( m_finalBuf ) {
        mfree ( m_finalBuf, m_finalBufSize, "Msg3aF" );
        m_finalBuf     = NULL;
        m_finalBufSize = 0;
    }

    //
    // HACK: START section stats merge
    //
    m_sectionStats.reset();
    long sneed = 0;
    for ( long j = 0; j < m_numHosts ; j++ ) {
        Msg39Reply *mr = m_reply[j];
        if ( ! mr ) continue;
        sneed += mr->size_siteHashList/4;
    }
    HashTableX dt;
    //char tmpBuf[5000];
    if (sneed&&!dt.set(4,0,sneed,NULL,0,false,
                       m_r->m_niceness,"uniqsit"))
        return true;
    for ( long j = 0; sneed && j < m_numHosts ; j++ ) {
        Msg39Reply *mr =m_reply[j];
        if ( ! mr ) continue;
        SectionStats *src = &mr->m_sectionStats;
        SectionStats *dst = &m_sectionStats;
        dst->m_onSiteDocIds      += src->m_onSiteDocIds;
        dst->m_offSiteDocIds     += src->m_offSiteDocIds;
        // now the list should be the unique site hashes that
        // had the section hash. we need to uniquify them again
        // here.
        long *p = (long *)mr->ptr_siteHashList;
        long np = mr->size_siteHashList / 4;
        for ( long k = 0 ; k < np ; k++ )
            // hash it up, no dups!
            dt.addKey(&p[k]);
        // update our count based on that
        dst->m_numUniqueSites = dt.getNumSlotsUsed();
    }
    if ( m_r->m_getSectionStats ) return true;
    //
    // HACK: END section stats merge
    //


    if ( m_docsToGet <= 0 ) {
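        // sanity check: deliberately seg fault so a bad docsToGet
        // shows up immediately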
        char *xx=NULL;
        *xx=0;
    }

    // . how much do we need to store final merged docids, etc.?
    // . docid=8 score=4 clusterRec=sizeof(key_t) clusterLevel=1
    //   scoreInfoPtr=sizeof(DocIdScore *)
    long need = m_docsToGet * (8+4+sizeof(key_t)+sizeof(DocIdScore *)+1);
    // allocate it
    m_finalBuf     = (char *)mmalloc ( need , "finalBuf" );
    m_finalBufSize = need;
    // g_errno should be set if this fails
    if ( ! m_finalBuf ) return true;
    // hook into it
    char *p = m_finalBuf;
    m_docIds        = (long long *)p;
    p += m_docsToGet * 8;
    m_scores        = (float     *)p;
    p += m_docsToGet * sizeof(float);
    m_clusterRecs   = (key_t     *)p;
    p += m_docsToGet * sizeof(key_t);
    m_clusterLevels = (char      *)p;
    p += m_docsToGet * 1;
    m_scoreInfos    = (DocIdScore **)p;
    p+=m_docsToGet*sizeof(DocIdScore *);

    // sanity check
    char *pend = m_finalBuf + need;
    if ( p != pend ) {
        char *xx = NULL;
        *xx =0;
    }
    // . now allocate for hash table
    // . get at least twice as many slots as docids
    HashTableT<long long,char> htable;
    // returns false and sets g_errno on error
    if ( ! htable.set ( m_docsToGet * 2 ) ) return true;
    // hash table for doing site clustering, provided we
    // are fully split and we got the site recs now
    HashTableT<long long,long> htable2;
    if ( m_r->m_doSiteClustering && ! htable2.set ( m_docsToGet * 2 ) )
        return true;

    //
    // ***MERGE ALL SPLITS INTO m_docIds[], etc.***
    //
    // . merge all lists in m_replyDocIds[splitNum]
    // . we may be re-called later after m_docsToGet is increased
    //   if too many docids were clustered/filtered out after the call
    //   to Msg51.
mergeLoop:

    // the winning docid will be diPtr[maxj]
    long maxj = -1;
    //Msg39Reply *mr;
    long hslot;

    // get the next highest-scoring docids from all split lists
    for ( long j = 0; j < m_numHosts; j++ ) {
        // . skip exhausted lists
        // . these both should be NULL if reply was skipped because
        //   we did a gbdocid:| query
        if ( diPtr[j] >= diEnd[j] ) continue;
        // compare the score
        if ( maxj == -1 ) {
            maxj = j;
            continue;
        }
        if ( *rsPtr[j] < *rsPtr[maxj] ) continue;
        if ( *rsPtr[j] > *rsPtr[maxj] ) {
            maxj = j;
            continue;
        }
        // prefer lower docids on top
        if ( *diPtr[j] < *diPtr[maxj] ) {
            maxj = j;
            continue;
        }
    }

    if ( maxj == -1 ) {
        m_moreDocIdsAvail = false;
        goto doneMerge;
    }

    // only do this logic if we have clusterdb recs included
    if ( m_r->m_doSiteClustering     &&
            // if the clusterLevel was set to CR_*errorCode* then this key
            // will be 0, so in that case, it might have been a not found
            // or whatever, so let it through regardless
            ksPtr[maxj]->n0 != 0LL &&
            ksPtr[maxj]->n1 != 0   ) {
        // get the site hash, a long
        long sh = g_clusterdb.getSiteHash26 ((char *)ksPtr[maxj]);
        // do we have enough from this hostname already?
        long slot = htable2.getSlot ( sh );
        // if this hostname already visible, do not over-display it...
        if ( slot >= 0 ) {
            // get the count
            long val = htable2.getValueFromSlot ( slot );
            // . if already 2 or more, give up
            // . if the site hash is 0, that usually means a
            //   "not found" in clusterdb, and the accompanying
            //   cluster level would be set as such, but since we
            //   did not copy the cluster levels over in the merge
            //   algo above, we don't know for sure... cluster recs
            //   are set to 0 in the Msg39.cpp clustering.
            if ( sh && val >= 2 ) goto skip;
            // inc the count
            val++;
            // store it
            htable2.setValue ( slot , val );
        }
        // . add it, this should be pre-allocated!
        // . returns false and sets g_errno on error
        else if ( ! htable2.addKey(sh,1) ) return true;
    }

    hslot = htable.getSlot ( *diPtr[maxj] );

    // . only add it to the final list if the docid is "unique"
    // . BUT since different event ids share the same docid, exception!
    if ( hslot < 0 ) {
        // always inc this
        //m_totalDocCount++;
        // only do this if we need more
        if ( m_numDocIds < m_docsToGet ) {
            // get DocIdScore class for this docid
            Msg39Reply *mr = m_reply[maxj];
            // point to the array of DocIdScores
            DocIdScore *ds = (DocIdScore *)mr->ptr_scoreInfo;
            long nds = mr->size_scoreInfo/sizeof(DocIdScore);
            DocIdScore *dp = NULL;
            for ( long i = 0 ; i < nds ; i++ ) {
                if ( ds[i].m_docId != *diPtr[maxj] )  continue;
                dp = &ds[i];
                break;
            }
            // add the max to the final merged lists
            m_docIds    [m_numDocIds] = *diPtr[maxj];

            // wtf?
            if ( ! dp ) {
                // this is empty if no scoring info
                // supplied!
                if ( m_r->m_getDocIdScoringInfo )
                    log("msg3a: CRAP! got empty score "
                        "info for "
                        "d=%lli",
                        m_docIds[m_numDocIds]);
                //char *xx=NULL; *xx=0;  261561804684
                // qry = www.yahoo
            }
            // point to the single DocIdScore for this docid
            m_scoreInfos[m_numDocIds] = dp;

            // reset this just in case
            if ( dp ) {
                dp->m_singleScores = NULL;
                dp->m_pairScores   = NULL;
            }

            // now fix DocIdScore::m_pairScores and m_singleScores
            // ptrs so they reference into the
            // Msg39Reply::ptr_pairScoreBuf and ptr_singleScoreBuf
            // like they should. it seems we do not free the
            // Msg39Replies so we should be ok referencing them.
            if ( dp && dp->m_singlesOffset >= 0 )
                dp->m_singleScores =
                    (SingleScore *)(mr->ptr_singleScoreBuf+
                                    dp->m_singlesOffset) ;
            if ( dp && dp->m_pairsOffset >= 0 )
                dp->m_pairScores =
                    (PairScore *)(mr->ptr_pairScoreBuf +
                                  dp->m_pairsOffset );


            // turn it into a float, that is what rscore_t is.
            // we do this to make it easier for PostQueryRerank.cpp
            m_scores    [m_numDocIds]=(float)*rsPtr[maxj];
            if ( m_r->m_doSiteClustering )
                m_clusterRecs[m_numDocIds]= *ksPtr[maxj];
            // clear this out
            //m_eventIdBits[m_numDocIds].clear();
            // set this for use below
            hslot = m_numDocIds;
            // point to next available slot to add to
            m_numDocIds++;
        }
        // if it has ALL the required query terms, count it
        //if ( *bsPtr[maxj] & 0x60 ) m_numAbove++;
        // . add it, this should be pre-allocated!
        // . returns false and sets g_errno on error
        if ( ! htable.addKey(*diPtr[maxj],1) ) return true;
    }

skip:
    // increment the split pointers from which we took the max
    rsPtr[maxj]++;
    diPtr[maxj]++;
    ksPtr[maxj]++;
    // get the next highest docid and add it in
    if ( m_numDocIds < m_docsToGet ) goto mergeLoop;

doneMerge:

    if ( m_debug ) {
        // show how long it took
        logf( LOG_DEBUG,"query: msg3a: [%lu] merged %li docs from %li "
              "splits in %llu ms. "
              ,
              (unsigned long)this,
              m_numDocIds, (long)m_numHosts,
              gettimeofdayInMilliseconds() - m_startTime
            );
        // show the final merged docids
        for ( long i = 0 ; i < m_numDocIds ; i++ ) {
            long sh = 0;
            if ( m_r->m_doSiteClustering )
                sh=g_clusterdb.getSiteHash26((char *)
                                             &m_clusterRecs[i]);
            // print out score_t
            logf(LOG_DEBUG,"query: msg3a: [%lu] "
                 "%03li) merged docId=%012llu "
                 "score=%.01f hosthash=0x%lx",
                 (unsigned long)this,
                 i,
                 m_docIds    [i] ,
                 (float)m_scores    [i] ,
                 sh );
        }
    }

    // if we had a full split, we should have gotten the cluster recs
    // from each split already
    memset ( m_clusterLevels , CR_OK , m_numDocIds );

    return true;
}
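
The heart of mergeLists() is the loop at mergeLoop: each host's reply arrives sorted best-first, so every pass picks the highest-scoring head element across all reply lists (ties broken in favor of the lower docid), advances that list, and skips docids already taken. Below is a compact sketch of just that k-way merge; HostList and Merged are made-up stand-ins for the Msg39Reply arrays, and the site clustering, scoring-info fixups and section-stats handling from the real function are left out.

#include <cstdint>
#include <unordered_set>
#include <vector>

// One sorted reply list from one host: parallel docid/score arrays.
struct HostList {
    const int64_t *docIds; // sorted best-first
    const float   *scores; // parallel to docIds
    long           n;      // entries in this reply
    long           pos;    // next unread entry
};

struct Merged { int64_t docId; float score; };

// Merge up to docsToGet unique docids from all host lists, best score first.
static std::vector<Merged> mergeReplies ( std::vector<HostList> &lists ,
                                          long docsToGet ) {
    std::vector<Merged> out;
    std::unordered_set<int64_t> seen;
    while ( (long)out.size() < docsToGet ) {
        // find the list whose head element has the best score
        long maxj = -1;
        for ( long j = 0 ; j < (long)lists.size() ; j++ ) {
            HostList &cur = lists[j];
            if ( cur.pos >= cur.n ) continue;          // list exhausted
            if ( maxj == -1 ) { maxj = j; continue; }
            HostList &best = lists[maxj];
            if ( cur.scores[cur.pos] < best.scores[best.pos] ) continue;
            if ( cur.scores[cur.pos] > best.scores[best.pos] ) {
                maxj = j;
                continue;
            }
            // tie on score: prefer the lower docid, like mergeLists()
            if ( cur.docIds[cur.pos] < best.docIds[best.pos] ) maxj = j;
        }
        if ( maxj == -1 ) break;                       // every list is done
        HostList &win = lists[maxj];
        int64_t d = win.docIds[win.pos];
        float   s = win.scores[win.pos];
        win.pos++;                                     // advance the winner
        if ( seen.insert(d).second )                   // skip duplicate docids
            out.push_back ( { d , s } );
    }
    return out;
}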