C++ (Cpp) SafeBuf::getBufStart Exemples

Exemple #1

0

Afficher le fichier

Fichier : qa.cpp Projet : firatkarakusoglu/open-source-search-engine

bool loadUrls ( ) {
	static bool s_loaded = false;
	if ( s_loaded ) return true;
	s_loaded = true;
	// use injectme3 file
	s_ubuf1.load("./injectme3");
	// scan for +++URL: xxxxx
	char *s = s_ubuf1.getBufStart();
	for ( ; *s ; s++ ) {
		if ( strncmp(s,"+++URL: ",8) ) continue;
		// got one
		// \0 term it for s_contentPtrs below
		*s = '\0';
		// find end of it
		s += 8;
		char *e = s;
		for ( ; *e && ! is_wspace_a(*e); e++ );
		// null term it
		if ( *e ) *e = '\0';
		// store ptr
		s_ubuf2.pushLong((long)s);
		// skip past that
		s = e;
		// point to content
		s_cbuf2.pushLong((long)(s+1));
	}
	// make array of url ptrs
	s_urlPtrs = (char **)s_ubuf2.getBufStart();
	s_contentPtrs= (char **)s_cbuf2.getBufStart();
	return true;
}

Exemple #2

0

Afficher le fichier

Fichier : qa.cpp Projet : FlavioFalcao/open-source-search-engine

bool getUrl( char *path , void (* callback) (void *state, TcpSocket *sock) ) {
	SafeBuf sb;
	sb.safePrintf ( "http://%s:%li%s"
			, iptoa(g_hostdb.m_myHost->m_ip)
			, (long)g_hostdb.m_myHost->m_port
			, path
			);
	Url u;
	u.set ( sb.getBufStart() );
	if ( ! g_httpServer.getDoc ( u.getUrl() ,
				     0 , // ip
				     0 , // offset
				     -1 , // size
				     0 , // ifmodsince
				     NULL ,
				     callback ,
				     60*1000, // timeout
				     0, // proxyip
				     0, // proxyport
				     -1, // maxtextdoclen
				     -1, // maxotherdoclen
				     NULL ) ) // useragent
		return false;
	// error?
	log("qa: getUrl error: %s",mstrerror(g_errno));
	return true;
}

Exemple #3

0

Afficher le fichier

Fichier : Log.cpp Projet : privacore/open-source-search-engine

// for example, RENAME log000 to log000-bak20131104-181932
static bool renameCurrentLogFile ( ) {
	File f;
	char tmp[16];
	sprintf(tmp,"log%03" PRId32,g_hostdb.m_hostId);
	f.set ( g_hostdb.m_dir , tmp );
	// make new filename like log000-bak20131104-181932
	time_t now = time(NULL);
	struct tm tm_buf;
	tm *tm1 = gmtime_r(&now,&tm_buf);
	char tmp2[64];
	strftime(tmp2,64,"%Y%m%d-%H%M%S",tm1);
	SafeBuf newName;
	if ( ! newName.safePrintf ( "%slog%03" PRId32"-bak%s",
				    g_hostdb.m_dir,
				    g_hostdb.m_hostId,
				    tmp2 ) ) {
		fprintf(stderr,"log rename failed\n");
		return false;
	}
	// rename log000 to log000-2013_11_04-18:19:32
	if ( f.doesExist() ) {
		//fprintf(stdout,"renaming file\n");
		f.rename ( newName.getBufStart() );
	}
	return true;
}

Exemple #4

0

Afficher le fichier

Fichier : qa.cpp Projet : firatkarakusoglu/open-source-search-engine

bool saveHashTable ( ) {
	if ( s_ht.m_numSlotsUsed <= 0 ) return true;
	SafeBuf fn;
	fn.safePrintf("%s/qa/",g_hostdb.m_dir);
	log("qa: saving crctable.dat");
	s_ht.save ( fn.getBufStart() , "crctable.dat" );
	return true;
}

Exemple #5

0

Afficher le fichier

Fichier : qa.cpp Projet : firatkarakusoglu/open-source-search-engine

bool qascrape ( ) {
	//
	// delete the 'qatest123' collection
	//
	//static bool s_x1 = false;
	if ( ! s_flags[0] ) {
		s_flags[0] = true;
		if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) )
			return false;
	}

	//
	// add the 'qatest123' collection
	//
	//static bool s_x2 = false;
	if ( ! s_flags[1] ) {
		s_flags[1] = true;
		if ( ! getUrl ( "/admin/addcoll?addcoll=qatest123&xml=1" , 
				// checksum of reply expected
				238170006 ) )
			return false;
	}


	// scrape it
	if ( ! s_flags[3] ) {
		s_flags[3] = true;
		SafeBuf sb;
		sb.safePrintf( "/admin/inject?c=qatest123&"
			       "format=xml&qts=test");
		if ( ! getUrl ( sb.getBufStart() , 999 ) )
			return false;
	}
		


	// verify no results for gbhopcount:2 query
	//static bool s_y4 = false;
	if ( ! s_flags[6] ) {
		s_flags[6] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
				"q=test",
				-1310551262 ) )
			return false;
	}


	//static bool s_fee2 = false;
	if ( ! s_flags[13] ) {
		s_flags[13] = true;
		log("qa: SUCCESSFULLY COMPLETED "
		    "QA SCRAPE TEST");
		return true;
	}

	return true;
}

Exemple #6

0

Afficher le fichier

Fichier : qa.cpp Projet : firatkarakusoglu/open-source-search-engine

// returns false if blocked, true otherwise, like on quick connect error
bool getUrl( char *path , long checkCRC = 0 , char *post = NULL ) {

	SafeBuf sb;
	sb.safePrintf ( "http://%s:%li%s"
			, iptoa(g_hostdb.m_myHost->m_ip)
			, (long)g_hostdb.m_myHost->m_httpPort
			, path
			);

	s_checkCRC = checkCRC;

	bool doPost = true;
	if ( strncmp ( path , "/search" , 7 ) == 0 )
		doPost = false;

	//Url u;
	s_url.set ( sb.getBufStart() );
	log("qa: getting %s",sb.getBufStart());
	if ( ! g_httpServer.getDoc ( s_url.getUrl() ,
				     0 , // ip
				     0 , // offset
				     -1 , // size
				     0 , // ifmodsince
				     NULL ,
				     gotReplyWrapper,
				     999999*1000, // timeout ms
				     0, // proxyip
				     0, // proxyport
				     -1, // maxtextdoclen
				     -1, // maxotherdoclen
				     NULL , // useragent
				     "HTTP/1.0" , // protocol
				     doPost , // doPost
				     NULL , // cookie
				     NULL , // additionalHeader
				     NULL , // fullRequest
				     post ) )
		return false;
	// error?
	processReply ( NULL , 0 );
	//log("qa: getUrl error: %s",mstrerror(g_errno));
	return true;
}

Exemple #7

0

Afficher le fichier

Fichier : HttpRequest.cpp Projet : BKJackson/open-source-search-engine

// . the url being reuqested
// . removes &code= facebook cruft
bool HttpRequest::getCurrentUrl ( SafeBuf &cu ) {
	// makre sure we got enough room
	if ( ! cu.reserve ( m_hostLen + 64 + m_plen + 1 + 1 ) ) return false;
	// need a "Host: "
	char *host = m_host;
	if ( ! host ) host = APPSUBDOMAIN;
	cu.safePrintf("http");
	if ( m_isSSL ) cu.pushChar('s');
	cu.safePrintf("://%s",host);
	char *path = m_path;
	long  plen = m_plen;
	if ( ! path ) {
		path = "/";
		plen = 1;
	}
	// . scan path and change \0 back to = or &
	// . similar logic in HttpServer.cpp for logging!
	char *dst = cu.getBuf();
	char *src = path;
	char *srcEnd = path + plen;
	char dd = '=';
	for ( ; src < srcEnd ; src++ , dst++ ) {
		*dst = *src;
		if ( *src ) continue;
		*dst = dd;
		if ( dd == '=' ) dd = '&';
		else             dd = '=';
	}
	*dst = '\0';
	// cut it off at facebook's &code=
	char *buf = cu.getBufStart();
	char *code = strstr( buf,"&code=");
	// fix for eventguru.com/blog.html?code=
	if ( ! code ) code = strstr(buf,"?code=");
	// hack that off if there
	if ( code ) {
		*code = '\0';
		dst = code;
	}
	// update length
	cu.setLength( dst - cu.getBufStart() );
	return true;
}

Exemple #8

0

Afficher le fichier

Fichier : qa.cpp Projet : firatkarakusoglu/open-source-search-engine

// . run a series of tests to ensure that gb is functioning properly
// . uses the ./qa subdirectory to hold archive pages, ips, spider dates to
//   ensure consistency between tests for exact replays
bool qatest ( ) {

	if ( s_registered ) {
		g_loop.unregisterSleepCallback(NULL,qatestWrapper);
		s_registered = false;
	}

	if ( ! s_callback ) s_callback = qatest;

	if ( ! g_qaSock ) return true;


	// returns true when done, false when blocked
	//if ( ! qainject ( ) ) return false;

	// returns true when done, false when blocked
	//if ( ! qaspider ( ) ) return false;

	long n = sizeof(s_qatests)/sizeof(QATest);
	for ( long i = 0 ; i < n ; i++ ) {
		QATest *qt = &s_qatests[i];
		if ( ! qt->m_doTest ) continue;
		// store that
		s_qt = qt;
		// point to flags
		s_flags = qt->m_flags;
		// call the qatest
		if ( ! qt->m_func() ) return false;
	}

	// save this
	saveHashTable();
	// do not reset since we don't reload it above!
	//s_ht.reset();

	//if ( g_numErrors )
	//	g_qaOutput.safePrintf("<input type=submit value=submit><br>");

	g_qaOutput.safePrintf("<br>DONE RUNNING QA TESTS<br>");


	// . print the output
	// . the result of each test is stored in the g_qaOutput safebuf
	g_httpServer.sendDynamicPage(g_qaSock,
				     g_qaOutput.getBufStart(),
				     g_qaOutput.length(),
				     -1/*cachetime*/);

	g_qaOutput.purge();

	g_qaSock = NULL;

	return true;
}

Exemple #9

0

Afficher le fichier

Fichier : PageInject.cpp Projet : chushuai/open-source-search-engine

void doneInjectingLinksWrapper ( void *state ) {
	Msg7 *msg7 = (Msg7 *)state;
	SafeBuf *sb = &msg7->m_sb;
	// copy the serps into ou rbuf
	if ( ! g_errno ) {
		// print header
		if ( sb->length() == 0 ) {
			// print header of page
			sb->safePrintf("<?xml version=\"1.0\" "
				       "encoding=\"UTF-8\" ?>\n"
				       "<response>\n" );
		}
		// serp header
		if ( msg7->m_round == 1 )
			sb->safePrintf("\t<googleResults>\n");
		else
			sb->safePrintf("\t<bingResults>\n");
		// print results
		sb->safeMemcpy(&msg7->m_xd.m_serpBuf);
		// end that
		if ( msg7->m_round == 1 )
			sb->safePrintf("\t</googleResults>\n");
		else
			sb->safePrintf("\t</bingResults>\n");
	}
	// do bing now
	if ( msg7->m_round == 1 ) {
		// return if it blocks
		if ( ! msg7->scrapeQuery() ) return;
	}

	// otherwise, parse out the search results so steve can display them
	if ( g_errno )
		sb->safePrintf("<error><![CDATA[%s]]></error>\n",
			       mstrerror(g_errno));
	// print header of page
	sb->safePrintf("</response>\n");
	// page is not more than 32k
	//char buf[1024*32];
	//char *p = buf;
	// return docid and hostid
	//p += sprintf ( p , "scraping status ");
	// print error msg out, too or "Success"
	//p += sprintf ( p , "%s", mstrerror(g_errno));
	TcpSocket *sock = msg7->m_socket;
	g_httpServer.sendDynamicPage ( sock, 
				       sb->getBufStart(),
				       sb->length(),
				       -1/*cachetime*/);
	// hopefully sb buffer is copied becaues this will free it:
	mdelete ( msg7, sizeof(Msg7) , "PageInject" );
	delete (msg7);
}

Exemple #10

0

Afficher le fichier

Fichier : qa.cpp Projet : FlavioFalcao/open-source-search-engine

bool deleteUrls ( ) {
	static long s_ii2 = 0;
	for ( ; s_ii2 < s_numUrls ; ) {
		// pre-inc it
		s_ii2++;
		// reject using html api
		SafeBuf sb;
		sb.safePrintf( "/admin/inject?c=qatest123&delete=1&u=");
		sb.urlEncode ( s_urlPtrs[s_ii2] );
		return getUrl ( sb.getBufStart() , qatestWrapper );
	}
	return true;
}

Exemple #11

0

Afficher le fichier

Fichier : qa.cpp Projet : FlavioFalcao/open-source-search-engine

// ensure search results are consistent
bool searchTest2 () {
	long nq = sizeof(s_queries)/sizeof(char *);
	for ( ; s_qi2 < nq ; ) {
		// pre-inc it
		s_qi2++;
		// inject using html api
		SafeBuf sb;
		// qa=1 tell gb to exclude "variable" or "random" things
		// from the serps so we can checksum it consistently
		sb.safePrintf ( "/search?c=qatest123&qa=1&q=" );
		sb.urlEncode ( s_queries[s_qi2] );
		return getUrl ( sb.getBufStart() , doneSearching2 );
	}
	return true;
}

Exemple #12

0

Afficher le fichier

Fichier : Json.cpp Projet : DeadNumbers/open-source-search-engine

bool JsonItem::getCompoundName ( SafeBuf &nameBuf ) {

	// reset, but don't free mem etc. just set m_length to 0
	nameBuf.reset();
	// get its full compound name like "meta.twitter.title"
	JsonItem *p = this;//ji;
	char *lastName = NULL;
	char *nameArray[20];
	int32_t  numNames = 0;
	for ( ; p ; p = p->m_parent ) {
		// empty name?
		if ( ! p->m_name ) continue;
		if ( ! p->m_name[0] ) continue;
		// dup? can happen with arrays. parent of string
		// in object, has same name as his parent, the
		// name of the array. "dupname":[{"a":"b"},{"c":"d"}]
		if ( p->m_name == lastName ) continue;
		// update
		lastName = p->m_name;
		// add it up
		nameArray[numNames++] = p->m_name;
		// breach?
		if ( numNames < 15 ) continue;
		log("build: too many names in json tag");
		break;
	}
	// assemble the names in reverse order which is correct order
	for ( int32_t i = 1 ; i <= numNames ; i++ ) {
		// copy into our safebuf
		if ( ! nameBuf.safeStrcpy ( nameArray[numNames-i]) ) 
			return false;
		// separate names with periods
		if ( ! nameBuf.pushChar('.') ) return false;
	}
	// remove last period
	nameBuf.removeLastChar('.');
	// and null terminate
	if ( ! nameBuf.nullTerm() ) return false;
	// change all :'s in names to .'s since : is reserved!
	char *px = nameBuf.getBufStart();
	for ( ; *px ; px++ ) if ( *px == ':' ) *px = '.';

	return true;
}

Exemple #13

0

Afficher le fichier

Fichier : Log.cpp Projet : FlavioFalcao/open-source-search-engine

bool Log::init ( char *filename ) {
	// set the main process id
	//s_pid = getpidtid();
	setPid();
	// init these
	m_numErrors =  0;
	m_bufPtr    =  0;
	m_fd        = -1;
	m_disabled  = false;

#ifdef DEBUG
	g_dbufSize = 4096;
	g_dbuf = (char*)mmalloc(g_dbufSize,"Log: DebugBuffer");
	if (!g_dbuf) fprintf(stderr, "Unable to init debug buffer");
#endif
	//	m_hostname  = g_conf.m_hostname;
	//	m_port      = port;
	// is there a filename to log our errors to?
	m_filename = filename;
	if ( ! m_filename ) return true;

	// skip this for now
	//return true;

	//
	// RENAME log000 to log000-2013_11_04-18:19:32
	//
	if ( g_conf.m_runAsDaemon ) {
		File f;
		char tmp[16];
		sprintf(tmp,"log%03li",g_hostdb.m_hostId);
		f.set ( g_hostdb.m_dir , tmp );
		// make new filename like log000-2013_11_04-18:19:32
		time_t now = getTimeLocal();
		tm *tm1 = gmtime((const time_t *)&now);
		char tmp2[64];
		strftime(tmp2,64,"%Y_%m_%d-%T",tm1);
		SafeBuf newName;
		if ( ! newName.safePrintf ( "%slog%03li-%s",
					    g_hostdb.m_dir,
					    g_hostdb.m_hostId,
					    tmp2 ) ) {
			fprintf(stderr,"log rename failed\n");
			return false;
		}
		// rename log000 to log000-2013_11_04-18:19:32
		if ( f.doesExist() ) {
			//fprintf(stdout,"renaming file\n");
			f.rename ( newName.getBufStart() );
		}
	}


	// open it for appending.
	// create with -rw-rw-r-- permissions if it's not there.
	m_fd = open ( m_filename , 
		      O_APPEND | O_CREAT | O_RDWR , 
		      S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH );
	if ( m_fd >= 0 ) return true;
	// bitch to stderr and return false on error
	fprintf(stderr,"could not open log file %s for appending\n",
		m_filename);
	return false;
}

Exemple #14

0

Afficher le fichier

Fichier : Msg39.cpp Projet : DeadNumbers/open-source-search-engine

// . now come here when we got the necessary index lists
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool Msg39::intersectLists ( ) { // bool updateReadInfo ) {
	// bail on error
	if ( g_errno ) { 
	hadError:
		log("msg39: Had error getting termlists: %s.",
		    mstrerror(g_errno));
		if ( ! g_errno ) { char *xx=NULL;*xx=0; }
		//sendReply (m_slot,this,NULL,0,0,true);
		return true; 
	}
	// timestamp log
	if ( m_debug ) {
		log(LOG_DEBUG,"query: msg39: [%"PTRFMT"] "
		    "Got %"INT32" lists in %"INT64" ms"
		    , (PTRTYPE)this,m_tmpq.getNumTerms(),
		     gettimeofdayInMilliseconds() - m_startTime);
		m_startTime = gettimeofdayInMilliseconds();
	}

	// breathe
	QUICKPOLL ( m_r->m_niceness );

	// ensure collection not deleted from under us
	CollectionRec *cr = g_collectiondb.getRec ( m_r->m_collnum );
	if ( ! cr ) {
		g_errno = ENOCOLLREC;
		goto hadError;
	}

	// . set the IndexTable so it can set it's score weights from the
	//   termFreqs of each termId in the query
	// . this now takes into account the special termIds used for sorting
	//   by date (0xdadadada and 0xdadadad2 & TERMID_MASK)
	// . it should weight them so much so that the summation of scores
	//   from other query terms cannot make up for a lower date score
	// . this will actually calculate the top
	// . this might also change m_tmpq.m_termSigns 
	// . this won't do anything if it was already called
	m_posdbTable.init ( &m_tmpq                ,
			    m_debug              ,
			    this                   ,
			    &m_tt                  ,
			    m_r->m_collnum,//ptr_coll          , 
			    &m_msg2 , // m_lists                ,
			    //m_tmpq.m_numTerms      , // m_numLists
			    m_r                              );

	// breathe
	QUICKPOLL ( m_r->m_niceness );

	// . we have to do this here now too
	// . but if we are getting weights, we don't need m_tt!
	// . actually we were using it before for rat=0/bool queries but
	//   i got rid of NO_RAT_SLOTS
	if ( ! m_allocedTree && ! m_posdbTable.allocTopTree() ) {
		if ( ! g_errno ) { char *xx=NULL;*xx=0; }
		//sendReply ( m_slot , this , NULL , 0 , 0 , true);
		return true;
	}

	// if msg2 had ALL empty lists we can cut it int16_t
	if ( m_posdbTable.m_topTree->m_numNodes == 0 ) {
		//estimateHitsAndSendReply ( );
		return true;
	}
		

	// we have to allocate this with each call because each call can
	// be a different docid range from doDocIdSplitLoop.
	if ( ! m_posdbTable.allocWhiteListTable() ) {
		log("msg39: Had error allocating white list table: %s.",
		    mstrerror(g_errno));
		if ( ! g_errno ) { char *xx=NULL;*xx=0; }
		//sendReply (m_slot,this,NULL,0,0,true);
		return true; 
	}


	// do not re do it if doing docid range splitting
	m_allocedTree = true;


	// . now we must call this separately here, not in allocTopTree()
	// . we have to re-set the QueryTermInfos with each docid range split
	//   since it will set the list ptrs from the msg2 lists
	if ( ! m_posdbTable.setQueryTermInfo () ) return true;

	// print query term bit numbers here
	for ( int32_t i = 0 ; m_debug && i < m_tmpq.getNumTerms() ; i++ ) {
		QueryTerm *qt = &m_tmpq.m_qterms[i];
		//utf16ToUtf8(bb, 256, qt->m_term, qt->m_termLen);
		char *tpc = qt->m_term + qt->m_termLen;
		char  tmp = *tpc;
		*tpc = '\0';
		SafeBuf sb;
		sb.safePrintf("query: msg39: BITNUM query term #%"INT32" \"%s\" "
			      "bitnum=%"INT32" ", i , qt->m_term, qt->m_bitNum );
		// put it back
		*tpc = tmp;
		logf(LOG_DEBUG,"%s",sb.getBufStart());
	}


	// timestamp log
	if ( m_debug ) {
		log(LOG_DEBUG,"query: msg39: [%"PTRFMT"] "
		    "Preparing to intersect "
		     "took %"INT64" ms",
		     (PTRTYPE)this, 
		    gettimeofdayInMilliseconds() - m_startTime );
		m_startTime = gettimeofdayInMilliseconds();
	}

	// time it
	int64_t start = gettimeofdayInMilliseconds();
	int64_t diff;

	// . don't bother making a thread if lists are small
	// . look at STAGE? in IndexReadInfo.cpp to see how we read in stages
	// . it's always saying msg39 handler is hogging cpu...could this be it
	//if ( m_msg2.getTotalRead() < 2000*8 ) goto skipThread;

	// debug
	//goto skipThread;

	// . NOW! let's do this in a thread so we can continue to service
	//   incoming requests
	// . don't launch more than 1 thread at a time for this
	// . set callback when thread done

	// breathe
	QUICKPOLL ( m_r->m_niceness );

	// . create the thread
	// . only one of these type of threads should be launched at a time
	if ( ! m_debug &&
	     g_threads.call ( INTERSECT_THREAD  , // threadType
			      m_r->m_niceness   ,
			      this              , // top 4 bytes must be cback
			      controlLoopWrapper2,//threadDoneWrapper ,
			      addListsWrapper   ) ) {
		m_blocked = true;
		return false;
	}
	// if it failed
	//log(LOG_INFO,"query: Intersect thread creation failed. Doing "
	//    "blocking. Hurts performance.");
	// check tree
	if ( m_tt.m_nodes == NULL ) {
		log(LOG_LOGIC,"query: msg39: Badness."); 
		char *xx = NULL; *xx = 0; }

	// sometimes we skip the thread
	//skipThread:
	// . addLists() should never have a problem
	// . g_errno should be set by prepareToAddLists() above if there is
	//   going to be a problem
	//if ( m_r->m_useNewAlgo )
	m_posdbTable.intersectLists10_r ( );
	//else
	//	m_posdbTable.intersectLists9_r ( );

	// time it
	diff = gettimeofdayInMilliseconds() - start;
	if ( diff > 10 ) log("query: Took %"INT64" ms for intersection",diff);

	// returns false if blocked, true otherwise
	//return addedLists ();
	return true;
}

Exemple #15

0

Afficher le fichier

Fichier : Msg39.cpp Projet : DeadNumbers/open-source-search-engine

// . returns false if blocked, true otherwise
// . sets g_errno on error
// . called either from 
//   1) doDocIdSplitLoop
//   2) or getDocIds2() if only 1 docidsplit
bool Msg39::getLists () {

	if ( m_debug ) m_startTime = gettimeofdayInMilliseconds();
	// . ask Indexdb for the IndexLists we need for these termIds
	// . each rec in an IndexList is a termId/score/docId tuple

	//
	// restrict to docid range?
	//
	// . get the docid start and end
	// . do docid paritioning so we can send to all hosts
	//   in the network, not just one stripe
	int64_t docIdStart = 0;
	int64_t docIdEnd = MAX_DOCID;
	// . restrict to this docid?
	// . will really make gbdocid:| searches much faster!
	int64_t dr = m_tmpq.m_docIdRestriction;
	if ( dr ) {
		docIdStart = dr;
		docIdEnd   = dr + 1;
	}
	// . override
	// . this is set from Msg39::doDocIdSplitLoop() to compute 
	//   search results in stages, so that we do not load massive
	//   termlists into memory and got OOM (out of memory)
	if ( m_r->m_minDocId != -1 ) docIdStart = m_r->m_minDocId;
	if ( m_r->m_maxDocId != -1 ) docIdEnd   = m_r->m_maxDocId+1;
	
	// if we have twins, then make sure the twins read different
	// pieces of the same docid range to make things 2x faster
	//bool useTwins = false;
	//if ( g_hostdb.getNumStripes() == 2 ) useTwins = true;
	//if ( useTwins ) {
	//	int64_t delta2 = ( docIdEnd - docIdStart ) / 2;
	//	if ( m_r->m_stripe == 0 ) docIdEnd = docIdStart + delta2;
	//	else                      docIdStart = docIdStart + delta2;
	//}
	// new striping logic:
	int32_t numStripes = g_hostdb.getNumStripes();
	int64_t delta2 = ( docIdEnd - docIdStart ) / numStripes;
	int32_t stripe = g_hostdb.getMyHost()->m_stripe;
	docIdStart += delta2 * stripe; // is this right?
	docIdEnd = docIdStart + delta2;
	// add 1 to be safe so we don't lose a docid
	docIdEnd++;
	// TODO: add triplet support later for this to split the
	// read 3 ways. 4 ways for quads, etc.
	//if ( g_hostdb.getNumStripes() >= 3 ) { char *xx=NULL;*xx=0;}
	// do not go over MAX_DOCID  because it gets masked and
	// ends up being 0!!! and we get empty lists
	if ( docIdEnd > MAX_DOCID ) docIdEnd = MAX_DOCID;
	// remember so Msg2.cpp can use them to restrict the termlists 
	// from "whiteList" as well
	m_docIdStart = docIdStart;
	m_docIdEnd   = docIdEnd;
	

	//
	// set startkey/endkey for each term/termlist
	//
	for ( int32_t i = 0 ; i < m_tmpq.getNumTerms() ; i++ ) {
		// breathe
		QUICKPOLL ( m_r->m_niceness );
		// int16_tcuts
		QueryTerm *qterm = &m_tmpq.m_qterms[i];
		char *sk = qterm->m_startKey;
		char *ek = qterm->m_endKey;
		// get the term id
		int64_t tid = m_tmpq.getTermId(i);
		// if only 1 stripe
		//if ( g_hostdb.getNumStripes() == 1 ) {
		//	docIdStart = 0;
		//	docIdEnd   = MAX_DOCID;
		//}
		// debug
		if ( m_debug )
			log("query: setting sk/ek for docids %"INT64""
			    " to %"INT64" for termid=%"INT64""
			    , docIdStart
			    , docIdEnd
			    , tid
			    );
		// store now in qterm
		g_posdb.makeStartKey ( sk , tid , docIdStart );
		g_posdb.makeEndKey   ( ek , tid , docIdEnd   );
		qterm->m_ks = sizeof(POSDBKEY);//key144_t);
	}

	// debug msg
	if ( m_debug || g_conf.m_logDebugQuery ) {
		for ( int32_t i = 0 ; i < m_tmpq.getNumTerms() ; i++ ) {
			// get the term in utf8
			//char bb[256];
			QueryTerm *qt = &m_tmpq.m_qterms[i];
			//utf16ToUtf8(bb, 256, qt->m_term, qt->m_termLen);
			char *tpc = qt->m_term + qt->m_termLen;
			char  tmp = *tpc;
			*tpc = '\0';
			char sign = qt->m_termSign;
			if ( sign == 0 ) sign = '0';
			QueryWord *qw = qt->m_qword;
			int32_t wikiPhrId = qw->m_wikiPhraseId;
			if ( m_tmpq.isPhrase(i) ) wikiPhrId = 0;
			char leftwikibigram = 0;
			char rightwikibigram = 0;
			if ( qt->m_leftPhraseTerm &&
			     qt->m_leftPhraseTerm->m_isWikiHalfStopBigram )
				leftwikibigram = 1;
			if ( qt->m_rightPhraseTerm &&
			     qt->m_rightPhraseTerm->m_isWikiHalfStopBigram )
				rightwikibigram = 1;
			/*
			char c = m_tmpq.getTermSign(i);
			char tt[512];
			int32_t ttlen = m_tmpq.getTermLen(i);
			if ( ttlen > 254 ) ttlen = 254;
			if ( ttlen < 0   ) ttlen = 0;
			// old:painful: convert each term from unicode to ascii
			gbmemcpy ( tt , m_tmpq.getTerm(i) , ttlen );
			*/
			int32_t isSynonym = 0;
			QueryTerm *st = qt->m_synonymOf;
			if ( st ) isSynonym = true;
			SafeBuf sb;
			// now we can display it
			//tt[ttlen]='\0';
			//if ( c == '\0' ) c = ' ';
			sb.safePrintf(
			     "query: msg39: [%"PTRFMT"] "
			     "query term #%"INT32" \"%s\" "
			     "phr=%"INT32" termId=%"UINT64" rawTermId=%"UINT64" "
			     //"estimatedTermFreq=%"INT64" (+/- ~16000) "
			     "tfweight=%.02f "
			     "sign=%c "
			     "numPlusses=%hhu "
			     "required=%"INT32" "
			     "fielcode=%"INT32" "

			     "ebit=0x%0"XINT64" "
			     "impBits=0x%0"XINT64" "

			     "wikiphrid=%"INT32" "
			     "leftwikibigram=%"INT32" "
			     "rightwikibigram=%"INT32" "
			     //"range.startTermNum=%hhi range.endTermNum=%hhi "
			     //"minRecSizes=%"INT32" "
			     "readSizeInBytes=%"INT32" "
			     //"ebit=0x%"XINT64" "
			     //"impBits=0x%"XINT64" "
			     "hc=%"INT32" "
			     "component=%"INT32" "
			     "otermLen=%"INT32" "
			     "isSynonym=%"INT32" "
			     "querylangid=%"INT32" " ,
			     (PTRTYPE)this ,
			     i          ,
			     qt->m_term,//bb ,
			     (int32_t)m_tmpq.isPhrase (i) ,
			     m_tmpq.getTermId      (i) ,
			     m_tmpq.getRawTermId   (i) ,
			     ((float *)m_r->ptr_termFreqWeights)[i] ,
			     sign , //c ,
			     0 , 
			     (int32_t)qt->m_isRequired,
			     (int32_t)qt->m_fieldCode,

			     (int64_t)qt->m_explicitBit  ,
			     (int64_t)qt->m_implicitBits ,

			     wikiPhrId,
			     (int32_t)leftwikibigram,
			     (int32_t)rightwikibigram,
			     ((int32_t *)m_r->ptr_readSizes)[i]         ,
			     //(int64_t)m_tmpq.m_qterms[i].m_explicitBit  ,
			     //(int64_t)m_tmpq.m_qterms[i].m_implicitBits ,
			     (int32_t)m_tmpq.m_qterms[i].m_hardCount ,
			     (int32_t)m_tmpq.m_componentCodes[i],
			     (int32_t)m_tmpq.getTermLen(i) ,
			     isSynonym,
			     (int32_t)m_tmpq.m_langId ); // ,tt
			// put it back
			*tpc = tmp;
			if ( st ) {
				int32_t stnum = st - m_tmpq.m_qterms;
				sb.safePrintf("synofterm#=%"INT32"",stnum);
				//sb.safeMemcpy(st->m_term,st->m_termLen);
				sb.pushChar(' ');
				sb.safePrintf("synwid0=%"INT64" ",qt->m_synWids0);
				sb.safePrintf("synwid1=%"INT64" ",qt->m_synWids1);
				sb.safePrintf("synalnumwords=%"INT32" ",
					      qt->m_numAlnumWordsInSynonym);
				// like for synonym "nj" it's base,
				// "new jersey" has 2 alnum words!
				sb.safePrintf("synbasealnumwords=%"INT32" ",
					      qt->m_numAlnumWordsInBase);
			}
			logf(LOG_DEBUG,"%s",sb.getBufStart());

		}
		m_tmpq.printBooleanTree();
	}
	// timestamp log
	if ( m_debug ) 
		log(LOG_DEBUG,"query: msg39: [%"PTRFMT"] "
		    "Getting %"INT32" index lists ",
		     (PTRTYPE)this,m_tmpq.getNumTerms());
	// . now get the index lists themselves
	// . return if it blocked
	// . not doing a merge (last parm) means that the lists we receive
	//   will be an appending of a bunch of lists so keys won't be in order
	// . merging is uneccessary for us here because we hash the keys anyway
	// . and merging takes up valuable cpu time
	// . caution: the index lists returned from Msg2 are now compressed
	// . now i'm merging because it's 10 times faster than hashing anyway
	//   and the reply buf should now always be <= minRecSizes so we can
	//   pre-allocate one better, and, 3) this should fix the yahoo.com 
	//   reindex bug
	char rdbId = RDB_POSDB;

	// . TODO: MDW: fix
	// . partap says there is a bug in this??? we can't cache UOR'ed lists?
	bool checkCache = false;
	// split is us????
	//int32_t split = g_hostdb.m_myHost->m_group;
	int32_t split = g_hostdb.m_myHost->m_shardNum;
	// call msg2
	if ( ! m_msg2.getLists ( rdbId                      ,
				 m_r->m_collnum,//m_r->ptr_coll              ,
				 m_r->m_maxAge              ,
				 m_r->m_addToCache          ,
				 //m_tmpq.m_qterms ,
				 &m_tmpq,
				 m_r->ptr_whiteList,
				 // we need to restrict docid range for
				 // whitelist as well! this is from
				 // doDocIdSplitLoop()
				 m_docIdStart,
				 m_docIdEnd,
				 // how much of each termlist to read in bytes
				 (int32_t *)m_r->ptr_readSizes ,
				 //m_tmpq.getNumTerms()       , // numLists
				 // 1-1 with query terms
				 m_lists                    ,
				 this                       ,
				 controlLoopWrapper,//gotListsWrapper      ,
				 m_r                        ,
				 m_r->m_niceness            ,
				 true                       , // do merge?
				 m_debug                  ,
				 NULL                       ,  // best hostids
				 m_r->m_restrictPosdbForQuery  ,
				 split                      ,
				 checkCache                 )) {
		m_blocked = true;
		return false;
	}

	// error?
	//if ( g_errno ) { 
	//	log("msg39: Had error getting termlists2: %s.",
	//	    mstrerror(g_errno));
	//	// don't bail out here because we are in docIdSplitLoop()
	//	//sendReply (m_slot,this,NULL,0,0,true);
	//	return true; 
	//}
	
	//return gotLists ( true );
	return true;
}

Exemple #16

0

Afficher le fichier

Fichier : Msg39.cpp Projet : DeadNumbers/open-source-search-engine

void Msg39::estimateHitsAndSendReply ( ) {

	// no longer in use
	m_inUse = false;

	// now this for the query loop on the QueryLogEntries.
	m_topDocId50 = 0LL;
	m_topScore50 = 0.0;

	// a little hack for the seo pipeline in xmldoc.cpp
	m_topDocId  = 0LL;
	m_topScore  = 0.0;
	m_topDocId2 = 0LL;
	m_topScore2 = 0.0;
	int32_t ti = m_tt.getHighNode();
	if ( ti >= 0 ) {
		TopNode *t = &m_tt.m_nodes[ti];
		m_topDocId = t->m_docId;
		m_topScore = t->m_score;
	}
	// try the 2nd one too
	int32_t ti2 = -1;
	if ( ti >= 0 ) ti2 = m_tt.getNext ( ti );
	if ( ti2 >= 0 ) {
		TopNode *t2 = &m_tt.m_nodes[ti2];
		m_topDocId2 = t2->m_docId;
		m_topScore2 = t2->m_score;
	}

	// convenience ptrs. we will store the docids/scores into these arrays
	int64_t *topDocIds;
	double    *topScores;
	key_t     *topRecs;

	// numDocIds counts docs in all tiers when using toptree.
	int32_t numDocIds = m_tt.m_numUsedNodes;

	// the msg39 reply we send back
	int32_t  replySize;
	char *reply;

	//m_numTotalHits = m_posdbTable.m_docIdVoteBuf.length() / 6;

	// make the reply?
	Msg39Reply mr;

	// this is what you want to look at if there is no seo.cpp module...
	if ( ! m_callback ) {
		// if we got clusterdb recs in here, use 'em
		if ( m_gotClusterRecs ) numDocIds = m_numVisible;
		
		// don't send more than the docs that are asked for
		if ( numDocIds > m_r->m_docsToGet) numDocIds =m_r->m_docsToGet;

		// # of QueryTerms in query
		int32_t nqt = m_tmpq.m_numTerms;
		// start setting the stuff
		mr.m_numDocIds = numDocIds;
		// copy # estiamted hits into 8 bytes of reply
		//int64_t est = m_posdbTable.m_estimatedTotalHits;
		// ensure it has at least as many results as we got
		//if ( est < numDocIds ) est = numDocIds;
		// or if too big...
		//if ( numDocIds < m_r->m_docsToGet ) est = numDocIds;
		// . total estimated hits
		// . this is now an EXACT count!
		mr.m_estimatedHits = m_numTotalHits;
		// sanity check
		mr.m_nqt = nqt;
		// the m_errno if any
		mr.m_errno = m_errno;
		// int16_tcut
		PosdbTable *pt = &m_posdbTable;
		// the score info, in no particular order right now
		mr.ptr_scoreInfo  = pt->m_scoreInfoBuf.getBufStart();
		mr.size_scoreInfo = pt->m_scoreInfoBuf.length();
		// that has offset references into posdbtable::m_pairScoreBuf 
		// and m_singleScoreBuf, so we need those too now
		mr.ptr_pairScoreBuf    = pt->m_pairScoreBuf.getBufStart();
		mr.size_pairScoreBuf   = pt->m_pairScoreBuf.length();
		mr.ptr_singleScoreBuf  = pt->m_singleScoreBuf.getBufStart();
		mr.size_singleScoreBuf = pt->m_singleScoreBuf.length();
		// save some time since seo.cpp gets from posdbtable directly,
		// so we can avoid serializing/copying this stuff at least
		if ( ! m_r->m_makeReply ) {
			mr.size_scoreInfo      = 0;
			mr.size_pairScoreBuf   = 0;
			mr.size_singleScoreBuf = 0;
		}
		//mr.m_sectionStats    = pt->m_sectionStats;
		// reserve space for these guys, we fill them in below
		mr.ptr_docIds       = NULL;
		mr.ptr_scores       = NULL;
		mr.ptr_clusterRecs  = NULL;
		// this is how much space to reserve
		mr.size_docIds      = 8 * numDocIds; // int64_t
		mr.size_scores      = sizeof(double) * numDocIds; // float
		// if not doing site clustering, we won't have these perhaps...
		if ( m_gotClusterRecs ) 
			mr.size_clusterRecs = sizeof(key_t) *numDocIds;
		else    
			mr.size_clusterRecs = 0;

		#define MAX_FACETS 20000

		/////////////////
		//
		// FACETS
		//
		/////////////////

		// We can have multiple gbfacet: terms in a query so
		// serialize all the QueryTerm::m_facetHashTables into
		// Msg39Reply::ptr_facetHashList.
		//
		// combine the facet hash lists of each query term into
		// a list of lists. each lsit is preceeded by the query term
		// id of the query term (like gbfacet:xpathsitehash12345)
		// followed by a 4 byte length of the following 32-bit
		// facet values
		int32_t need = 0;
		for ( int32_t i = 0 ; i < m_tmpq.m_numTerms; i++ ) {
			QueryTerm *qt = &m_tmpq.m_qterms[i];
			// skip if not facet
			if ( qt->m_fieldCode != FIELD_GBFACETSTR &&
			     qt->m_fieldCode != FIELD_GBFACETINT &&
			     qt->m_fieldCode != FIELD_GBFACETFLOAT )
				continue;
			HashTableX *ft = &qt->m_facetHashTable;
			if ( ft->m_numSlotsUsed == 0 ) continue;
			int32_t used = ft->m_numSlotsUsed;
			// limit for memory
			if ( used > (int32_t)MAX_FACETS ) {
				log("msg39: truncating facet list to 20000 "
				    "from %"INT32" for %s",used,qt->m_term);
				used = (int32_t)MAX_FACETS;
			}
			// store query term id 64 bit
			need += 8;
			// then size
			need += 4;
			// then buckets. keys and counts
			need += (4+sizeof(FacetEntry)) * used;
		}
		// allocate
		SafeBuf tmp;
		if ( ! tmp.reserve ( need ) ) {
			log("query: Could not allocate memory "
			    "to hold reply facets");
			sendReply(m_slot,this,NULL,0,0,true);
			return;
		}
		// point to there
		char *p = tmp.getBufStart();
		for ( int32_t i = 0 ; i < m_tmpq.m_numTerms ; i++ ) {
			QueryTerm *qt = &m_tmpq.m_qterms[i];
			// skip if not facet
			if ( qt->m_fieldCode != FIELD_GBFACETSTR &&
			     qt->m_fieldCode != FIELD_GBFACETINT &&
			     qt->m_fieldCode != FIELD_GBFACETFLOAT )
				continue;
			// get all the facet hashes and their counts
			HashTableX *ft = &qt->m_facetHashTable;
			// skip if none
			if ( ft->m_numSlotsUsed == 0 ) continue;
			// store query term id 64 bit
			*(int64_t *)p = qt->m_termId;
			p += 8;
			int32_t used = ft->getNumSlotsUsed();
			if ( used > (int32_t)MAX_FACETS ) 
				used = (int32_t)MAX_FACETS;
			// store count
			*(int32_t *)p = used;
			p += 4;
			int32_t count = 0;
			// for sanity check
			char *pend = p + (used * (4+sizeof(FacetEntry)));
			// serialize the key/val pairs
			for ( int32_t k = 0 ; k < ft->m_numSlots ; k++ ) {
				// skip empty buckets
				if ( ! ft->m_flags[k] ) continue;
				// store key. the hash of the facet value.
				*(int32_t *)p = ft->getKey32FromSlot(k); p += 4;
				// then store count
				//*(int32_t *)p = ft->getVal32FromSlot(k); p += 4;
				// now this has a docid on it so we can
				// lookup the text of the facet in Msg40.cpp
				FacetEntry *fe;
				fe = (FacetEntry *)ft->getValFromSlot(k);
				// sanity
				// no, count can be zero if its a range facet
				// that was never added to. we add those
				// empty FaceEntries only for range facets
				// in Posdb.cpp
				//if(fe->m_count == 0 ) { char *xx=NULL;*xx=0;}
				gbmemcpy ( p , fe , sizeof(FacetEntry) );
				p += sizeof(FacetEntry);
				// do not breach
				if ( ++count >= (int32_t)MAX_FACETS ) break;
			}
			// sanity check
			if ( p != pend ) { char *xx=NULL;*xx=0; }
			// do the next query term
		}
		// now point to that so it can be serialized below
		mr.ptr_facetHashList  = tmp.getBufStart();
		mr.size_facetHashList = p - tmp.getBufStart();//tmp.length();

		/////////////
		//
		// END FACETS
		//
		/////////////


		// . that is pretty much it,so serialize it into buffer,"reply"
		// . mr.ptr_docIds, etc., will point into the buffer so we can
		//   re-serialize into it below from the tree
		// . returns NULL and sets g_errno on error
		// . "true" means we should make mr.ptr_* reference into the 
		//   newly  serialized buffer.
		reply = serializeMsg ( sizeof(Msg39Reply), // baseSize
				       &mr.size_docIds, // firstSizeParm
				       &mr.size_clusterRecs,//lastSizePrm
				       &mr.ptr_docIds , // firstStrPtr
				       &mr , // thisPtr
				       &replySize , 
				       NULL , 
				       0 , 
				       true ) ;
		if ( ! reply ) {
			log("query: Could not allocated memory "
			    "to hold reply of docids to send back.");
			sendReply(m_slot,this,NULL,0,0,true);
			return;
		}
		topDocIds    = (int64_t *) mr.ptr_docIds;
		topScores    = (double    *) mr.ptr_scores;
		topRecs      = (key_t     *) mr.ptr_clusterRecs;
	}

	int32_t docCount = 0;
	// loop over all results in the TopTree
	for ( int32_t ti = m_tt.getHighNode() ; ti >= 0 ; 
	      ti = m_tt.getPrev(ti) ) {
		// get the guy
		TopNode *t = &m_tt.m_nodes[ti];
		// skip if clusterLevel is bad!
		if ( m_gotClusterRecs && t->m_clusterLevel != CR_OK ) 
			continue;

		// if not sending back a reply... we were called from seo.cpp
		// State3f logic to evaluate a QueryLogEntry, etc.
		if ( m_callback ) {
			// skip results past #50
			if ( docCount > 50 ) continue;
			// set this
			m_topScore50 = t->m_score;
			m_topDocId50 = t->m_docId;
			// that's it
			continue;
		}

		// get the docid ptr
		//char      *diptr = t->m_docIdPtr;
		//int64_t  docId = getDocIdFromPtr(diptr);
		// sanity check
		if ( t->m_docId < 0 ) { char *xx=NULL; *xx=0; }
		//add it to the reply
		topDocIds         [docCount] = t->m_docId;
		topScores         [docCount] = t->m_score;
		if ( m_tt.m_useIntScores ) 
			topScores[docCount] = (double)t->m_intScore;
		// supply clusterdb rec? only for full splits
		if ( m_gotClusterRecs ) 
			topRecs [docCount] = t->m_clusterRec;
		//topExplicits      [docCount] = 
		//	getNumBitsOn(t->m_explicits)
		docCount++;

		// 50th score? set this for seo.cpp. if less than 50 results
		// we want the score of the last doc then.
		if ( docCount <= 50 ) m_topScore50 = t->m_score;
		
		if ( m_debug ) {
			logf(LOG_DEBUG,"query: msg39: [%"PTRFMT"] "
			    "%03"INT32") docId=%012"UINT64" sum=%.02f",
			    (PTRTYPE)this, docCount,
			    t->m_docId,t->m_score);
		}
		//don't send more than the docs that are wanted
		if ( docCount >= numDocIds ) break;
	}
 	if ( docCount > 300 && m_debug )
		log("query: Had %"INT32" nodes in top tree",docCount);

	// this is sensitive info
	if ( m_debug ) {
		log(LOG_DEBUG,
		    "query: msg39: [%"PTRFMT"] "
		    "Intersected lists took %"INT64" (%"INT64") "
		    "ms "
		    "docIdsToGet=%"INT32" docIdsGot=%"INT32" "
		    "q=%s",
		    (PTRTYPE)this                        ,
		    m_posdbTable.m_addListsTime       ,
		    gettimeofdayInMilliseconds() - m_startTime ,
		    m_r->m_docsToGet                       ,
		    numDocIds                         ,
		    m_tmpq.getQuery()                 );
	}


	// if we blocked because we used a thread then call callback if
	// summoned from a msg3f handler and not a msg39 handler
	if ( m_callback ) {
		// if we blocked call user callback
		if ( m_blocked ) m_callback ( m_state );
		// if not sending back a udp reply, return now
		return;
	}

	// now send back the reply
	sendReply(m_slot,this,reply,replySize,replySize,false);
	return;
}

Exemple #17

0

Afficher le fichier

Fichier : PageAddUrl.cpp Projet : nikhs/open-source-search-engine

bool sendReply ( void *state , bool addUrlEnabled ) {
	// allow others to add now
	//s_inprogress = false;
	// get the state properly
	//gr *st1 = (gr *) state;
	GigablastRequest *gr = (GigablastRequest *)state;
	// in order to see what sites are being added log it, then we can
	// more easily remove sites from sitesearch.gigablast.com that are
	// being added but not being searched
	SafeBuf xb;
	if ( gr->m_urlsBuf ) {
		xb.safeTruncateEllipsis ( gr->m_urlsBuf , 200 );
		log(LOG_INFO,"http: add url %s (%s)",
		    xb.getBufStart(),mstrerror(g_errno));
	}

	char format = gr->m_hr.getReplyFormat();
	TcpSocket *sock    = gr->m_socket;

	if ( format == FORMAT_JSON || format == FORMAT_XML ) {
		bool status = g_httpServer.sendSuccessReply ( gr );
		// nuke state
		mdelete ( gr , sizeof(gr) , "PageAddUrl" );
		delete (gr);
		return status;
	}


	long ulen = 0;
	char *url = gr->m_urlsBuf;
	if ( url ) ulen = gbstrlen (url);

	// re-null it out if just http://
	bool printUrl = true;
	if ( ulen == 0 ) printUrl = false;
	if ( ! gr->m_urlsBuf       ) printUrl = false;
	if ( ulen==7 && printUrl && !strncasecmp(gr->m_url,"http://",7))
		printUrl = false;
	if ( ulen==8 && printUrl && !strncasecmp(gr->m_url,"https://",8))
		printUrl = false;

	// page is not more than 32k
	char buf[1024*32+MAX_URL_LEN*2];
	SafeBuf sb(buf, 1024*32+MAX_URL_LEN*2);
	
	//char rawbuf[1024*8];
	//SafeBuf rb(rawbuf, 1024*8);	
	//rb.safePrintf("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n");
	//rb.safePrintf("<status>\n");
	//CollectionRec *cr = g_collectiondb.getRec ( gr->m_coll );
	
	// collection name

	char tt [ 128 ];
	tt[0] = '\0';

	g_pages.printAdminTop ( &sb , sock , &gr->m_hr );

	// display url
	//char *url = gr->m_urlsBuf;
	//if ( url && ! url[0] ) url = NULL;

	// watch out for NULLs
	if ( ! url ) url = "http://";

	// if there was an error let them know
	//char msg[MAX_URL_LEN + 1024];
	SafeBuf mbuf;
	//char *pm = "";
	if ( g_errno ) {
		mbuf.safePrintf("<center><font color=red>");
		mbuf.safePrintf("Error adding url(s): <b>%s[%i]</b>", 
				mstrerror(g_errno) , g_errno);
		mbuf.safePrintf("</font></center>");
		//pm = msg;
		//rb.safePrintf("Error adding url(s): %s[%i]", 
		//	      mstrerror(g_errno) , g_errno);
	}
	else if ( printUrl ) {
		mbuf.safePrintf("<center><font color=red>");
		mbuf.safePrintf("<b><u>");
		mbuf.safeTruncateEllipsis(gr->m_urlsBuf,200);
		mbuf.safePrintf("</u></b> added to spider "
				 "queue "
				 "successfully<br><br>");
		mbuf.safePrintf("</font></center>");
		//rb.safePrintf("%s added to spider "
		//	      "queue successfully", url );
		//pm = msg;
		//url = "http://";
		//else
		//	pm = "Don't forget to <a href=/gigaboost.html>"
		//		"Gigaboost</a> your URL.";
	}


	if ( mbuf.length() ) sb.safeStrcpy ( mbuf.getBufStart() );

	g_parms.printParmTable ( &sb , sock , &gr->m_hr );

	// print the final tail
	g_pages.printTail ( &sb, true ); // admin?
	// clear g_errno, if any, so our reply send goes through
	g_errno = 0;

	// nuke state
	mdelete ( gr , sizeof(GigablastRequest) , "PageAddUrl" );
	delete (gr);

	return g_httpServer.sendDynamicPage (sock, 
					     sb.getBufStart(), 
					     sb.length(),
					     -1 ); // cachetime
}

Exemple #18

0

Afficher le fichier

Fichier : qa.cpp Projet : firatkarakusoglu/open-source-search-engine

bool qaspider1 ( ) {
	//
	// delete the 'qatest123' collection
	//
	//static bool s_x1 = false;
	if ( ! s_flags[0] ) {
		s_flags[0] = true;
		if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) )
			return false;
	}

	//
	// add the 'qatest123' collection
	//
	//static bool s_x2 = false;
	if ( ! s_flags[1] ) {
		s_flags[1] = true;
		if ( ! getUrl ( "/admin/addcoll?addcoll=qatest123&xml=1" , 
				// checksum of reply expected
				238170006 ) )
			return false;
	}

	// restrict hopcount to 0 or 1 in url filters so we do not spider
	// too deep
	//static bool s_z1 = false;
	if ( ! s_flags[2] ) {
		s_flags[2] = true;
		SafeBuf sb;
		sb.safePrintf("&c=qatest123&"
			      // make it the custom filter
			      "ufp=0&"

	       "fe=%%21ismanualadd+%%26%%26+%%21insitelist&hspl=0&hspl=1&fsf=0.000000&mspr=0&mspi=1&xg=1000&fsp=-3&"

			      // take out hopcount for now, just test quotas
			      //	       "fe1=tag%%3Ashallow+%%26%%26+hopcount%%3C%%3D1&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=3&"

			      // just one spider out allowed for consistency
	       "fe1=tag%%3Ashallow+%%26%%26+sitepages%%3C%%3D20&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=45&"

	       "fe2=default&hspl2=0&hspl2=1&fsf2=1.000000&mspr2=0&mspi2=1&xg2=1000&fsp2=45&"

		);
		if ( ! getUrl ( "/admin/filters",0,sb.getBufStart()) )
			return false;
	}

	// set the site list to 
	// a few sites
	//static bool s_z2 = false;
	if ( ! s_flags[3] ) {
		s_flags[3] = true;
		SafeBuf sb;
		sb.safePrintf("&c=qatest123&format=xml&sitelist=");
		sb.urlEncode("tag:shallow site:www.walmart.com\r\n"
			     "tag:shallow site:http://www.ibm.com/\r\n");
		sb.nullTerm();
		if ( ! getUrl ("/admin/settings",0,sb.getBufStart() ) )
			return false;
	}
		
	//
	// use the add url interface now
	// walmart.com above was not seeded because of the site: directive
	// so this will seed it.
	//
	//static bool s_y2 = false;
	if ( ! s_flags[4] ) {
		s_flags[4] = true;
		SafeBuf sb;
		// delim=+++URL:
		sb.safePrintf("&c=qatest123"
			      "&format=json"
			      "&strip=1"
			      "&spiderlinks=1"
			      "&urls=www.walmart.com+ibm.com"
			      );
		// . now a list of websites we want to spider
		// . the space is already encoded as +
		//sb.urlEncode(s_urls1);
		if ( ! getUrl ( "/admin/addurl",0,sb.getBufStart()) )
			return false;
	}

	//
	// wait for spidering to stop
	//
 checkagain:

	// wait until spider finishes. check the spider status page
	// in json to see when completed
	//static bool s_k1 = false;
	if ( ! s_flags[5] ) {
		// wait 5 seconds, call sleep timer... then call qatest()
		//usleep(5000000); // 5 seconds
		wait(3.0);
		s_flags[5] = true;
		return false;
	}

	if ( ! s_flags[15] ) {
		s_flags[15] = true;
		if ( ! getUrl ( "/admin/status?format=json&c=qatest123",0) )
			return false;
	}

	//static bool s_k2 = false;
	if ( ! s_flags[6] ) {
		// ensure spiders are done. 
		// "Nothing currently available to spider"
		if ( s_content&&!strstr(s_content,"Nothing currently avail")){
			s_flags[5] = false;
			s_flags[15] = false;
			goto checkagain;
		}
		s_flags[6] = true;
	}


	// wait for index msg4 to not be cached to ensure all results indexed
	if ( ! s_flags[22] ) {
		s_flags[22] = true;
		wait(1.5);
	}


	// verify no results for gbhopcount:2 query
	//static bool s_y4 = false;
	if ( ! s_flags[7] ) {
		s_flags[7] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
				"q=gbhopcount%3A2",
				-1672870556 ) )
			return false;
	}

	// but some for gbhopcount:0 query
	//static bool s_t0 = false;
	if ( ! s_flags[8] ) {
		s_flags[8] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
				"q=gbhopcount%3A0",
				908338607 ) )
			return false;
	}
	
	// check facet sections query for walmart
	//static bool s_y5 = false;
	if ( ! s_flags[9] ) {
		s_flags[9] = true;
		if ( ! getUrl ( "/search?c=qatest123&format=json&stream=1&"
				"q=gbfacetstr%3Agbxpathsitehash2492664135",
				55157060 ) )
			return false;
	}

	//static bool s_y6 = false;
	if ( ! s_flags[10] ) {
		s_flags[10] = true;
		if ( ! getUrl ( "/get?page=4&q=gbfacetstr:gbxpathsitehash2492664135&qlang=xx&c=qatest123&d=9861563119&cnsp=0" , 999 ) )
			return false;
	}

	// in xml
	//static bool s_y7 = false;
	if ( ! s_flags[11] ) {
		s_flags[11] = true;
		if ( ! getUrl ( "/get?xml=1&page=4&q=gbfacetstr:gbxpathsitehash2492664135&qlang=xx&c=qatest123&d=9861563119&cnsp=0" , 999 ) )
			return false;
	}

	// and json
	//static bool s_y8 = false;
	if ( ! s_flags[12] ) {
		s_flags[12] = true;
		if ( ! getUrl ( "/get?json=1&page=4&q=gbfacetstr:gbxpathsitehash2492664135&qlang=xx&c=qatest123&d=9861563119&cnsp=0" , 999 ) )
			return false;
	}


	// delete the collection
	//static bool s_fee = false;
	// if ( ! s_flags[13] ) {
	// 	s_flags[13] = true;
	// 	if ( ! getUrl ( "/admin/delcoll?delcoll=qatest123" ) )
	// 		return false;
	// }

	if ( ! s_flags[17] ) {
		s_flags[17] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
				"q=site2%3Awww.walmart.com+"
				"gbsortby%3Agbspiderdate",
				999 ) )
			return false;
	}

	// xpath is like a title here i think. check the returned
	// facet table in the left column
	if ( ! s_flags[18] ) {
		s_flags[18] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=html&"
				"q=gbfacetstr%3Agbxpathsitehash3624590799"
				, 999 ) )
			return false;
	}

	if ( ! s_flags[19] ) {
		s_flags[19] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&xml=1&"
				"q=gbfacetint%3Agbhopcount"
				, 999 ) )
			return false;
	}

	if ( ! s_flags[20] ) {
		s_flags[20] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&json=1&"
				"q=gbfacetint%3Alog.score"
				, 999 ) )
			return false;
	}

	if ( ! s_flags[21] ) {
		s_flags[21] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&xml=1&"
				"q=gbfacetfloat%3Atalks.rating"
				, 999 ) )
			return false;
	}

	if ( ! s_flags[23] ) {
		s_flags[23] = true;
		// test facets mixed with gigabits in left hand column
		if ( ! getUrl ( "/search?c=qatest123&qa=1&html=1&"
				"q=gbfacetint%3Agbhopcount+walmart"
				, 999 ) )
			return false;
	}


	//static bool s_fee2 = false;
	if ( ! s_flags[14] ) {
		s_flags[14] = true;
		log("qa: SUCCESSFULLY COMPLETED "
			"QA SPIDER1 TEST");
		return true;
	}

	return true;
}

Exemple #19

0

Afficher le fichier

Fichier : PageAddColl.cpp Projet : exename/open-source-search-engine

bool sendPageAddDelColl ( TcpSocket *s , HttpRequest *r , bool add ) {
	// get collection name
	//int32_t  nclen;
	//char *nc   = r->getString ( "nc" , &nclen );
	//int32_t  cpclen;
	//char *cpc  = r->getString ( "cpc" , &cpclen );

	g_errno = 0;

	//bool cast = r->getLong("cast",0);

	const char *msg = NULL;

	// if any host in network is dead, do not do this
	//if ( g_hostdb.hasDeadHost() ) msg = "A host in the network is dead.";

	char format = r->getReplyFormat();


	if ( format == FORMAT_XML || format == FORMAT_JSON ) {
		// no addcoll given?
		int32_t  page = g_pages.getDynamicPageNumber ( r );
		const char *addcoll = r->getString("addcoll",NULL);
		const char *delcoll = r->getString("delcoll",NULL);
		if ( ! addcoll ) addcoll = r->getString("addColl",NULL);
		if ( ! delcoll ) delcoll = r->getString("delColl",NULL);
		if ( page == PAGE_ADDCOLL && ! addcoll ) {
			g_errno = EBADENGINEER;
			const char *msg = "no addcoll parm provided";
			return g_httpServer.sendErrorReply(s,g_errno,msg,NULL);
		}
		if ( page == PAGE_DELCOLL && ! delcoll ) {
			g_errno = EBADENGINEER;
			const char *msg = "no delcoll parm provided";
			return g_httpServer.sendErrorReply(s,g_errno,msg,NULL);
		}
		return g_httpServer.sendSuccessReply(s,format);
	}

	// error?
	const char *action = r->getString("action",NULL);
	const char *addColl = r->getString("addcoll",NULL);


	char  buf [ 64*1024 ];
	SafeBuf p(buf, 64*1024);


	//
	// CLOUD SEARCH ENGINE SUPPORT - GIGABOT ERRORS
	//

	SafeBuf gtmp;
	char *gmsg = NULL;
	// is it too big?
	if ( action && addColl && strlen(addColl) > MAX_COLL_LEN ) {
		gtmp.safePrintf("search engine name is too long");
		gmsg = gtmp.getBufStart();
	}
	// from Collectiondb.cpp::addNewColl() ensure coll name is legit
	const char *x = addColl;
	for ( ; x && *x ; x++ ) {
		if ( is_alnum_a(*x) ) continue;
		if ( *x == '-' ) continue;
		if ( *x == '_' ) continue; // underscore now allowed
		break;
	}
	if ( x && *x ) {
		g_errno = EBADENGINEER;
		gtmp.safePrintf("<font color=red>Error. \"%s\" is a "
				"malformed name because it "
				"contains the '%c' character.</font><br><br>",
				addColl,*x);
		gmsg = gtmp.getBufStart();
	}

	//
	// END GIGABOT ERRORS
	//



	//
	// CLOUD SEARCH ENGINE SUPPORT
	//
	// if added the coll successfully, do not print same page, jump to
	// printing the basic settings page so they can add sites to it.
	// crap, this GET request, "r", is missing the "c" parm sometimes.
	// we need to use the "addcoll" parm anyway. maybe print a meta
	// redirect then?
	char guide = r->getLong("guide",0);
	// do not redirect if gmsg is set, there was a problem with the name
	if ( action && ! msg && format == FORMAT_HTML && guide && ! gmsg ) {
		//return g_parms.sendPageGeneric ( s, r, PAGE_BASIC_SETTINGS );
		// just redirect to it
		if ( addColl )
			p.safePrintf("<meta http-equiv=Refresh "
				      "content=\"0; URL=/admin/settings"
				      "?guide=1&c=%s\">",
				      addColl);
		return g_httpServer.sendDynamicPage (s,
						     p.getBufStart(),
						     p.length());
	}


	// print standard header
	g_pages.printAdminTop ( &p , s , r , NULL, 
				"onload=document."
				"getElementById('acbox').focus();");


	if ( g_errno ) {
		msg = mstrerror( g_errno );
	}

	if ( msg && ! guide ) {
		const char *cc = "deleting";
		if ( add ) cc = "adding";
		p.safePrintf (
			  "<center>\n"
			  "<font color=red>"
			  "<b>Error %s collection: %s. "
			  "See log file for details.</b>"
			  "</font>"
			  "</center><br>\n",cc,msg);
	}

	//
	// CLOUD SEARCH ENGINE SUPPORT
	//
	if ( add && guide )
		printGigabotAdvice ( &p , PAGE_ADDCOLL , r , gmsg );



	// print the add collection box
	if ( add /*&& (! nc[0] || g_errno ) */ ) {

		const char *t1 = "Add Collection";
		if ( guide ) t1 = "Add Search Engine";

		p.safePrintf (
			  "<center>\n<table %s>\n"
			   "<tr class=hdrow><td colspan=2>"
			  "<center><b>%s</b></center>"
			  "</td></tr>\n"
			  ,TABLE_STYLE
			  ,t1
			      );
		const char *t2 = "collection";
		if ( guide ) t2 = "search engine";
		const char *str = addColl;
		if ( ! addColl ) str = "";
		p.safePrintf (
			      "<tr bgcolor=#%s>"
			      "<td><b>name of new %s to add</td>\n"
			      "<td><input type=text name=addcoll size=30 "
			      "id=acbox "
			      "value=\"%s\">"
			      "</td></tr>\n"
			      , LIGHT_BLUE
			      , t2 
			      , str
			      );

		// don't show the clone box if we are under gigabot the guide
		if ( ! guide )
			p.safePrintf(
				     "<tr bgcolor=#%s>"
				     "<td><b>clone settings from this "
				     "collection</b>"
				     "<br><font size=1>Copy settings from "
				     "this pre-existing collection. Leave "
				     "blank to "
				     "accept default values.</font></td>\n"
				     "<td><input type=text name=clonecoll "
				     "size=30>"
				     "</td>"
				     "</tr>"
				     , LIGHT_BLUE
				     );

		// collection pwds
		p.safePrintf(
			     "<tr bgcolor=#%s>"
			     "<td><b>collection passwords"
			     "</b>"
			     "<br><font size=1>List of white space separated "
			     "passwords allowed to adminster collection."
			     "</font>"
			     "</td>\n"
			     "<td><input type=text name=collpwd "
			     "size=60>"
			     "</td>"
			     "</tr>"
			     , LIGHT_BLUE
			     );

		// ips box for security
		p.safePrintf(
			     "<tr bgcolor=#%s>"
			     "<td><b>collection ips"
			     "</b>"

			     "<br><font size=1>List of white space separated "
			     "IPs allowed to adminster collection."
			     "</font>"

			     "</td>\n"
			     "<td><input type=text name=collips "
			     "size=60>"
			     "</td>"
			     "</tr>"
			     , LIGHT_BLUE
			     );

		// now list collections from which to copy the config
		//p.safePrintf (
		//	  "<tr><td><b>copy configuration from this "
		//	  "collection</b><br><font size=1>Leave blank to "
		//	  "accept default values.</font></td>\n"
		//	  "<td><input type=text name=cpc value=\"%s\" size=30>"
		//	  "</td></tr>\n",coll);
		p.safePrintf ( "</table></center><br>\n");

		// wrap up the form started by printAdminTop
		g_pages.printAdminBottom ( &p );
		int32_t bufLen = p.length();
		return g_httpServer.sendDynamicPage (s,p.getBufStart(),bufLen);
	}

	// if we added a collection, print its page
	//if ( add && nc[0] && ! g_errno ) 
	//	return g_parms.sendPageGeneric2 ( s , r , PAGE_SEARCH ,
	//					  nc , pwd );

	if ( g_collectiondb.m_numRecsUsed <= 0 ) goto skip;

	// print all collections out in a checklist so you can check the
	// ones you want to delete, the values will be the id of that collectn
	p.safePrintf (
		  "<center>\n<table %s>\n"
		  "<tr class=hdrow><td><center><b>Delete Collections"
		  "</b></center></td></tr>\n"
		  "<tr bgcolor=#%s><td>"
		  "<center><b>Select the collections you wish to delete. "
		  //"<font color=red>This feature is currently under "
		  //"development.</font>"
		  "</b></center></td></tr>\n"
		  "<tr bgcolor=#%s><td>"
		  // table within a table
		  "<center><table width=20%%>\n",
		  TABLE_STYLE,
		  LIGHT_BLUE,
		  DARK_BLUE
		      );

	for ( int32_t i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
		CollectionRec *cr = g_collectiondb.m_recs[i];
		if ( ! cr ) continue;
		p.safePrintf (
			  "<tr bgcolor=#%s><td>"
			  "<input type=checkbox name=delcoll value=\"%s\"> "
			  "%s</td></tr>\n",
			  DARK_BLUE,
			  cr->m_coll,cr->m_coll);
	}
	p.safePrintf( "</table></center></td></tr></table><br>\n" );
skip:
	// wrap up the form started by printAdminTop
	g_pages.printAdminBottom ( &p );
	int32_t bufLen = p.length();
	return g_httpServer.sendDynamicPage (s,p.getBufStart(),bufLen);
}

Exemple #20

0

Afficher le fichier

Fichier : PageParser.cpp Projet : acchou/open-source-search-engine

bool processLoop ( void *state ) {
	// cast it
	State8 *st = (State8 *)state;
	// get the xmldoc
	XmlDoc *xd = &st->m_xd;

	// error?
	if ( g_errno ) return sendErrorReply ( st , g_errno );

	// shortcut
	SafeBuf *xbuf = &st->m_xbuf;

	if ( st->m_u && st->m_u[0] ) {
		// . save the ips.txt file if we are the test coll
		// . saveTestBuf() is a function in Msge1.cpp
		CollectionRec *cr = xd->getCollRec();
		if ( xd && cr && cr->m_coll && !strcmp(cr->m_coll,"qatest123"))
			// use same dir that XmlDoc::getTestDir() would use
			//saveTestBuf ( "test-page-parser" );
			saveTestBuf("qa");
		// now get the meta list, in the process it will print out a 
		// bunch of junk into st->m_xbuf
		char *metalist = xd->getMetaList ( );
		if ( ! metalist ) return sendErrorReply ( st , g_errno );
		// return false if it blocked
		if ( metalist == (void *)-1 ) return false;
		// for debug...
		if ( ! xd->m_indexCode ) xd->doConsistencyTest ( false );
		// print it out
		xd->printDoc( xbuf );
	}

	// print reason we can't analyze it (or index it)
	//if ( st->m_indexCode != 0 ) {
	//	xbuf->safePrintf ("<br><br><b>indexCode: %s</b>\n<br>", 
	//			  mstrerror(st->m_indexCode));
	//}

	// we are done
	g_inPageParser = false;

	// print the final tail
	//p += g_httpServer.printTail ( p , pend - p );

	//log("parser: send sock=%li",st->m_s->m_sd);
	
	// now encapsulate it in html head/tail and send it off
	bool status = g_httpServer.sendDynamicPage( st->m_s , 
						    xbuf->getBufStart(), 
						    xbuf->length() ,
						    -1, //cachtime
						    false ,//postreply?
						    NULL, //ctype
						    -1 , //httpstatus
						    NULL,//cookie
						    "utf-8");
	// delete the state now
	if ( st->m_freeIt ) {
		mdelete ( st , sizeof(State8) , "PageParser" );
		delete (st);
	}
	// return the status
	return status;
}

Exemple #21

0

Afficher le fichier

Fichier : Msg3a.cpp Projet : rdhananjaya/open-source-search-engine

bool Msg3a::gotAllSplitReplies ( ) {

    // if any of the split requests had an error, give up and set m_errno
    // but don't set if for non critical errors like query truncation
    if ( m_errno ) {
        g_errno = m_errno;
        return true;
    }

    // also reset the finalbuf and the oldNumTopDocIds
    if ( m_finalBuf ) {
        mfree ( m_finalBuf, m_finalBufSize, "Msg3aF" );
        m_finalBuf     = NULL;
        m_finalBufSize = 0;
    }

    // update our estimated total hits
    m_numTotalEstimatedHits = 0;

    for ( long i = 0; i < m_numHosts ; i++ ) {
        // get that host that gave us the reply
        //Host *h = g_hostdb.getHost(i);
        // . get the reply from multicast
        // . multicast should have destroyed all slots, but saved reply
        // . we are responsible for freeing the reply
        // . we need to call this even if g_errno or m_errno is
        //   set so we can free the replies in Msg3a::reset()
        // . if we don't call getBestReply() on it multicast should
        //   free it, because Multicast::m_ownReadBuf is still true
        Multicast *m = &m_mcast[i];
        bool freeit = false;
        long  replySize = 0;
        long  replyMaxSize;
        char *rbuf;
        Msg39Reply *mr;
        // . only get it if the reply not already full
        // . if reply already processed, skip
        // . perhaps it had no more docids to give us or all termlists
        //   were exhausted on its disk and this is a re-call
        // . we have to re-process it for count m_numTotalEstHits, etc.
        rbuf = m->getBestReply ( &replySize    ,
                                 &replyMaxSize ,
                                 &freeit       ,
                                 true          ); //stealIt?
        // cast it
        mr = (Msg39Reply *)rbuf;
        // in case of mem leak, re-label from "mcast" to this so we
        // can determine where it came from, "Msg3a-GBR"
        relabel( rbuf, replyMaxSize , "Msg3a-GBR" );
        // . we must be able to free it... we must own it
        // . this is true if we should free it, but we should not have
        //   to free it since it is owned by the slot?
        if ( freeit ) {
            log(LOG_LOGIC,"query: msg3a: Steal failed.");
            char *xx = NULL;
            *xx=0;
        }
        // bad reply?
        if ( ! mr ) {
            log(LOG_LOGIC,"query: msg3a: Bad NULL reply.");
            m_reply       [i] = NULL;
            m_replyMaxSize[i] = 0;
            // it might have been timd out, just ignore it!!
            continue;
            // if size is 0 it can be Msg39 giving us an error!
            g_errno = EBADREPLYSIZE;
            m_errno = EBADREPLYSIZE;
            // all reply buffers should be freed on reset()
            return true;
        }
        // how did this happen?
        if ( replySize < 29 && ! mr->m_errno ) {
            // if size is 0 it can be Msg39 giving us an error!
            g_errno = EBADREPLYSIZE;
            m_errno = EBADREPLYSIZE;
            log(LOG_LOGIC,"query: msg3a: Bad reply size of %li.",
                replySize);
            // all reply buffers should be freed on reset()
            return true;
        }

        // can this be non-null? we shouldn't be overwriting one
        // without freeing it...
        if ( m_reply[i] )
            // note the mem leak now
            log("query: mem leaking a 0x39 reply");

        // cast it and set it
        m_reply       [i] = mr;
        m_replyMaxSize[i] = replyMaxSize;
        // deserialize it (just sets the ptr_ and size_ member vars)
        //mr->deserialize ( );
        deserializeMsg ( sizeof(Msg39Reply) ,
                         &mr->size_docIds,
                         &mr->size_clusterRecs,
                         &mr->ptr_docIds,
                         mr->m_buf );

        // sanity check
        if ( mr->m_nqt != m_q->getNumTerms() ) {
            g_errno = EBADREPLY;
            m_errno = EBADREPLY;
            log("query: msg3a: Split reply qterms=%li != %li.",
                (long)mr->m_nqt,(long)m_q->getNumTerms() );
            return true;
        }
        // return if split had an error, but not for a non-critical
        // error like query truncation
        if ( mr->m_errno && mr->m_errno != EQUERYTRUNCATED ) {
            g_errno = mr->m_errno;
            m_errno = mr->m_errno;
            log("query: msg3a: Split had error: %s",
                mstrerror(g_errno));
            return true;
        }
        // skip down here if reply was already set
        //skip:
        // add of the total hits from each split, this is how many
        // total results the lastest split is estimated to be able to
        // return
        // . THIS should now be exact since we read all termlists
        //   of posdb...
        m_numTotalEstimatedHits += mr->m_estimatedHits;

        // debug log stuff
        if ( ! m_debug ) continue;
        // cast these for printing out
        long long *docIds    = (long long *)mr->ptr_docIds;
        score_t   *scores    = (score_t   *)mr->ptr_scores;
        // print out every docid in this split reply
        for ( long j = 0; j < mr->m_numDocIds ; j++ ) {
            // print out score_t
            logf( LOG_DEBUG,
                  "query: msg3a: [%lu] %03li) "
                  "split=%li docId=%012llu domHash=0x%02lx "
                  "score=%lu"                     ,
                  (unsigned long)this                      ,
                  j                                        ,
                  i                                        ,
                  docIds [j] ,
                  (long)g_titledb.getDomHash8FromDocId(docIds[j]),
                  (long)scores[j] );
        }
    }

    // this seems to always return true!
    mergeLists ( );

    if ( ! m_r->m_useSeoResultsCache ) return true;

    // now cache the reply
    SafeBuf cr;
    long dataSize = 4 + 4 + 4 + m_numDocIds * (8+4+4);
    long need = sizeof(key_t) + 4 + dataSize;
    bool status = cr.reserve ( need );
    // sanity
    if ( ( m_ckey.n0 & 0x01 ) == 0x00 ) {
        char *xx=NULL;
        *xx=0;
    }
    // ignore errors
    g_errno = 0;
    // return on error with g_errno cleared if cache add failed
    if ( ! status ) return true;
    // add to buf otherwise
    cr.safeMemcpy ( &m_ckey , sizeof(key_t) );
    cr.safeMemcpy ( &dataSize , 4 );
    long now = getTimeGlobal();
    cr.pushLong ( now );
    cr.pushLong ( m_numDocIds );
    cr.pushLong ( m_numTotalEstimatedHits );//Results );
    long max = m_numDocIds;
    // then the docids
    for ( long i = 0 ; i < max ; i++ )
        cr.pushLongLong(m_docIds[i] );
    for ( long i = 0 ; i < max ; i++ )
        cr.pushFloat(m_scores[i]);
    for ( long i = 0 ; i < max ; i++ )
        cr.pushLong(getSiteHash26(i));
    // sanity
    if ( cr.length() != need ) {
        char *xx=NULL;
        *xx=0;
    }
    // make these
    key_t startKey;
    key_t endKey;
    startKey = m_ckey;
    // clear delbit
    startKey.n0 &= 0xfffffffffffffffeLL;
    // end key is us
    endKey = m_ckey;
    // that is the single record
    m_seoCacheList.set ( cr.getBufStart() ,
                         cr.length(),
                         cr.getBufStart(), // alloc
                         cr.getCapacity(), // alloc size
                         (char *)&startKey,
                         (char *)&endKey,
                         -1, // fixeddatasize
                         true, // owndata?
                         false,// use half keys?
                         sizeof(key_t) );
    // do not allow cr to free it, msg1 will
    cr.detachBuf();
    // note it
    //log("seopipe: storing ckey=%s q=%s"
    //    ,KEYSTR(&m_ckey,12)
    //    ,m_r->ptr_query
    //    );
    //log("msg1: sending niceness=%li",(long)m_r->m_niceness);
    // this will often block, but who cares!? it just sends a request off
    if ( ! m_msg1.addList ( &m_seoCacheList ,
                            RDB_SERPDB,//RDB_CACHEDB,
                            m_r->ptr_coll,
                            this, // state
                            gotSerpdbReplyWrapper, // callback
                            false, // forcelocal?
                            m_r->m_niceness ) ) {
        //log("blocked");
        return false;
    }

    // we can safely delete m_msg17... just return true
    return true;
}

Exemple #22

0

Afficher le fichier

Fichier : PageIndexdb.cpp Projet : BKJackson/open-source-search-engine

// . make a web page from results stored in msg40
// . send it on TcpSocket "s" when done
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool gotIndexList ( void *state ) {
	// the state
	State10 *st = (State10 *) state;
	// launch more
	if ( ! launchRequests ( st ) ) return false;
	/*
	// get the date list
	//fprintf(stderr,"termId now=%lli\n",st->m_termId);
	//fprintf(stderr,"should be=%lli\n",(st->m_termId & TERMID_MASK));
	// . now get the indexList for this termId
	// . date is complemented, so start with bigger one first
	key128_t startKey = g_datedb.makeStartKey ( st->m_termId ,0xffffffff);
	key128_t endKey   = g_datedb.makeEndKey   ( st->m_termId ,0x0);
	// get the rdb ptr to titledb's rdb
	//Rdb *rdb = g_indexdb.getRdb();
	// -1 means read from all files in Indexdb
	long numFiles = -1;
	// make it zero if caller doesn't want to hit the disk
	if ( ! st->m_useDisk ) numFiles = 0;
	// get the title rec at or after this docId
	if ( ! st->m_msg0.getList ( -1 ,
				    0  ,
				    0  ,
				    0  ,    // max cache age
				    false , // add to cache?
				    RDB_DATEDB  , // rdbId of 2 = indexdb
				    st->m_coll ,
				    &st->m_list2  ,
				    (char *)&startKey  ,
				    (char *)&endKey    ,
				    st->m_numRecs * sizeof(key128_t),//recSizes
				    //st->m_useTree   , // include tree?
				    //st->m_useCache  , // include cache?
				    //false     , // add to cache?
				    //0         , // startFileNum
				    //numFiles  , // numFiles
				    st        , // state
				    gotIndexListWrapper2 ,
				    0  ) )  // niceness
		return false;
	// otherwise call gotResults which returns false if blocked, true else
	// and sets g_errno on error
	return gotIndexList2 ( (void *) st , NULL );
}


void gotIndexListWrapper2 ( void *state , RdbList *list ) {
	gotIndexList2 ( state , list );
}

void addedKeyWrapper ( void *state ) {
	gotIndexList2 ( state, NULL );
}

// . make a web page from results stored in msg40
// . send it on TcpSocket "s" when done
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool gotIndexList2 ( void *state , RdbList *list ) {
	// the state
	State10 *st = (State10 *) state;
	*/
	// get the socket
	TcpSocket *s = st->m_socket;
	// don't allow pages bigger than 128k in cache
	//char  buf [ 64*1024 ];
	// a ptr into "buf"
	//char *p    = buf;
	//char *pend = buf + 64*1024;
	/*
	// get termId
	key_t k = *(key_t *)st->m_list.getStartKey();
	long long termId = g_indexdb.getTermId ( k );
	// get groupId from termId
	//unsigned long groupId = k.n1 & g_hostdb.m_groupMask;
	unsigned long groupId = g_indexdb.getGroupIdFromKey ( &k );
	long hostnum = g_hostdb.makeHostId ( groupId );
	*/
	// check box " checked" strings
	char *ubs = "";
	char *uts = "";
	char *uds = "";
	char *ucs = "";
	char *add = "";
	char *del = "";
	if ( st->m_useDatedb) ubs = " checked";
	if ( st->m_useTree  ) uts = " checked";
	if ( st->m_useDisk  ) uds = " checked";
	if ( st->m_useCache ) ucs = " checked";
	if ( st->m_add      ) add = " checked";
	if ( st->m_del      ) del = " checked";

	SafeBuf *pbuf = &st->m_pbuf;

	g_pages.printAdminTop ( pbuf , st->m_socket , &st->m_r );

	// get base, returns NULL and sets g_errno to ENOCOLLREC on error
	RdbBase *base; 
	if (!(base=getRdbBase((uint8_t)RDB_INDEXDB,st->m_coll))) return true;

	// print the standard header for admin pages
	pbuf->safePrintf ( 
		  "<center>\n"
		  "<table cellpadding=2><tr><td colspan=4>"
		  "useDatedb:<input type=checkbox value=1 name=ub%s> "
		  "useTree:<input type=checkbox value=1 name=ut%s> "
		  "useDisk:<input type=checkbox value=1 name=ud%s> "
		  "useCache:<input type=checkbox value=1 name=uc%s> "
		  "ADD:<input type=checkbox value=1 name=add%s> "
		  "DELETE:<input type=checkbox value=1 name=del%s>"
		  "</td></tr><tr><td>"
		  "query:"
		  "</td><td>"
		  "<input type=text name=q value=\"%s\" size=20>"
		  "</td><td>"
		  "collection:"
		  "</td><td>"
		  "<input type=text name=c value=\"%s\" size=10>"
		  "</td></tr><tr><td>"
		  "termId:"
		  "</td><td>"
		  "<input type=text name=t value=%lli size=20>"
		  "</td><td>"
		  "numRecs:"
		  "</td><td>"
		  "<input type=text name=numRecs value=%li size=10> "
		  "</td></tr><tr><td>"
		  "docId:"
		  "</td><td>"
		  "<input type=text name=d value=%lli size=20> "
		  "</td><td>"
		  "score:"
		  "</td><td>"
		  "<input type=text name=score value=%li size=10> "
		  "</td><td>"
		  "<input type=submit value=ok border=0>"
		  "</td></tr>"
		  "<tr><td colspan=2>"
		  "term appears in about %lli docs +/- %li"
		  "</td></tr>"
		  //"<tr><td colspan=2>"
		  //"this indexlist held by host #%li and twins"
		  //"</td></tr>"
		  "</table>"
		  "</form><br><br>" ,
		  ubs, uts, uds, ucs, add, del,
		  st->m_query , st->m_coll , st->m_termId  , 
		  st->m_numRecs  ,
		  st->m_docId , (long)st->m_score ,
		  st->m_termFreq ,
		  2 * (long)GB_INDEXDB_PAGE_SIZE / 6 * 
		  base->getNumFiles() );
		  //hostnum );

	if ( g_errno || (st->m_list.isEmpty() ) ) {//&&st->m_list2.isEmpty())){
		if (g_errno)pbuf->safePrintf("Error = %s",mstrerror(g_errno));
		else        pbuf->safePrintf("List is empty");
		pbuf->safePrintf("</center>");
		// erase g_errno for sending
		g_errno = 0;
		// now encapsulate it in html head/tail and send it off
		bool status = g_httpServer.sendDynamicPage(s , 
							   pbuf->getBufStart(),
							   pbuf->length() );
		// delete it
		mdelete ( st , sizeof(State10) , "PageIndexdb" );
		delete (st);
		return status;
	}

	pbuf->safePrintf ( 
		  "<table cellpadding=1 border=1>" 
		  "<tr><td>#</td><td>score</td>"
		  "<td>docId</td><td>domHash</td></tr>");

	//if ( searchingEvents

	// now print the score/docId of indexlist
	long i = 0;
	for (   st->m_list.resetListPtr () ;
	      ! st->m_list.isExhausted  () ;
		st->m_list.skipCurrentRecord () ) {
		// break if buf is low
		//if ( p + 1024 >= pend ) break;
		// but set the ip/port to a host that has this titleRec
		// stored locally!
		long long     docId   = st->m_list.getCurrentDocId () ;
		unsigned long groupId = getGroupIdFromDocId ( docId );
		// get the first host's hostId in this groupId
		Host *h = g_hostdb.getFastestHostInGroup ( groupId );
		// . pick the first host to handle the cached titleRec request
		// . we assume it has the best time and is up!! TODO: fix!
		// . use local ip though if it was an internal request
		// . otherwise, use the external ip
		//unsigned long  ip   = h->m_externalIp;
		unsigned long  ip   = h->m_ip;
		// use the NAT mapped port
		unsigned short port = h->m_externalHttpPort;
		// log the first docid so we can blaster url: queries
		// to PageIndexdb and see if they are in indexdb
		if ( i == 0 ) 
			logf(LOG_INFO,"indexdb: %llu %s",docId,st->m_query);
		// adjust ip/port if local
		if ( st->m_isLocal ) {
			ip   = h->m_ip;
			port = h->m_httpPort;
		}
		unsigned long date = 0;
		if ( st->m_useDatedb )
			date = (unsigned long)st->m_list.getCurrentDate();
		uint8_t dh = g_titledb.getDomHash8FromDocId ( docId );
		char ds[32];
		ds[0]=0;
		if ( st->m_useDatedb ) sprintf (ds,"%lu/",date);
		pbuf->safePrintf ( 
			  "<tr><td>%li.</td>"
			  "<td>%s%i</td>"
			  "<td>"
			  //"<a href=http://%s:%hu/master/titledb?d=%llu>"
			  "<a href=/master/titledb?c=%s&d=%llu>"
			  "%llu"
			  //"<td><a href=/cgi/4.cgi?d=%llu>%llu"
			  "</td>"
			  "<td>"
			  "0x%02lx"
			  "</td>"
			  "</tr>\n" ,
			  i++,
			  ds, (int)st->m_list.getCurrentScore() ,
			  //iptoa(ip) , port ,
			  st->m_coll,
			  docId , 
			  docId ,
			  (long)dh );
	}	
	pbuf->safePrintf ( "</table>" );

	/*
	if ( ! st->m_list2.isEmpty() ) 
		p += sprintf ( p ,
			       "<br>"
			       "<br>"
			       "<table cellpadding=1 border=1>" 
			       "<tr><td>#</td><td>termId</td>"
			       "<td>date</td><td>score</td>"
			       "<td>docId</td></tr>");

	// now print the score/docId of datedb list
	i = 0;
	for (   st->m_list2.resetListPtr () ;
	      ! st->m_list2.isExhausted  () ;
		st->m_list2.skipCurrentRecord () ) {
		// break if buf is low
		if ( p + 1024 >= pend ) break;
		// but set the ip/port to a host that has this titleRec
		// stored locally!
		long long     docId   = st->m_list2.getCurrentDocId () ;
		unsigned long groupId = g_titledb.getGroupId ( docId );
		// get the first host's hostId in this groupId
		Host *h = g_hostdb.getFastestHostInGroup ( groupId );
		// . pick the first host to handle the cached titleRec request
		// . we assume it has the best time and is up!! TODO: fix!
		// . use local ip though if it was an internal request
		// . otherwise, use the external ip
		//unsigned long  ip   = h->m_externalIp;
		unsigned long  ip   = h->m_ip;
		// use the NAT mapped port
		unsigned short port = h->m_externalHttpPort;
		// adjust ip/port if local
		if ( st->m_isLocal ) {
			ip   = h->m_ip;
			port = h->m_httpPort;
		}
		// debug
		char kb[16];
		st->m_list2.getCurrentKey(kb);
		//log(LOG_INFO,"debug: n1=%016llx n0=%016llx",
		//    *(long long *)(kb+8),*(long long *)(kb+0));
		//if ( (unsigned long)st->m_list2.getCurrentDate() == 0 )
		//	log("STOP");
		sprintf ( p , 
			  "<tr><td>%li.</td>"
			  "<td>%llu</td>"
			  "<td>%lu</td><td>%i</td>"
			  "<td>"
			  //"<a href=http://%s:%hu/master/titledb?d=%llu>"
			  "<a href=/master/titledb?c=%s&d=%llu>"
			  "%llu"
			  //"<td><a href=/cgi/4.cgi?d=%llu>%llu"
			  "</td></tr>\n" ,
			  i++,
			  st->m_list2.getTermId16(kb) ,
			  (unsigned long)st->m_list2.getCurrentDate() ,
			  (int)st->m_list2.getCurrentScore() ,
			  //iptoa(ip) , port ,
			  st->m_coll,
			  docId , 
			  docId );
		p += gbstrlen ( p );
	}	
	*/
	if ( ! st->m_list.isEmpty() ) 
		pbuf->safePrintf ( "</table>" );


	// print msg if we could fit all into buf
	//if ( p + 1024 >= pend ) {
	//	sprintf ( p ,"... truncated ... no mem" );
	//	p += gbstrlen ( p );		
	//}
	// print the final tail
	//p += g_httpServer.printTail ( p , pend - p );
	pbuf->safePrintf ( "</center>\n");
	// now encapsulate it in html head/tail and send it off
	bool status = g_httpServer.sendDynamicPage ( s , 
						     pbuf->getBufStart() ,
						     pbuf->length() );
	// delete the state
	mdelete ( st , sizeof(State10) , "PageIndexdb" );
	delete (st) ;
	return status;
}

Exemple #23

0

Afficher le fichier

Fichier : PageInject.cpp Projet : BKJackson/open-source-search-engine

// . "uf" is printf url format to scrape with a %s for the query
// . example: uf="http://www.google.com/search?num=50&q=%s&scoring=d&filter=0";
bool Msg7::scrapeQuery ( ) {

	// advance round now in case we return early
	m_round++;

	// error?
	if ( m_qbuf.length() > 500 ) {
		g_errno = EQUERYTOOBIG;
		return true;
	}

	// first encode the query
	SafeBuf ebuf;
	ebuf.urlEncode ( m_qbuf.getBufStart() ); // queryUNEncoded );

	char *uf;
	if ( m_round == 1 )
		// set to 1 for debugging
		uf="http://www.google.com/search?num=20&"
			"q=%s&scoring=d&filter=0";
		//uf = "https://startpage.com/do/search?q=%s";
		//uf = "http://www.google.com/"
		//	"/cse?cx=013269018370076798483%3A8eec3papwpi&"
		//	"ie=UTF-8&q=%s&"
		//	"num=20";
	else
		uf="http://www.bing.com/search?q=%s";

	// skip bing for now
	//if ( m_round == 2 )
	//	return true;
	//if ( m_round == 1 )
	//	return true;
		
	// make the url we will download
	char ubuf[2048];
	sprintf ( ubuf , uf , ebuf.getBufStart() );

	// log it
	log("inject: SCRAPING %s",ubuf);

	SpiderRequest sreq;
	sreq.reset();
	// set the SpiderRequest
	strcpy(sreq.m_url, ubuf);
	// . tell it to only add the hosts of each outlink for now!
	// . that will be passed on to when XmlDoc calls Links::set() i guess
	// . xd will not reschedule the scraped url into spiderdb either
	sreq.m_isScraping = 1;
	sreq.m_fakeFirstIp = 1;
	long firstIp = hash32n(ubuf);
	if ( firstIp == 0 || firstIp == -1 ) firstIp = 1;
	sreq.m_firstIp = firstIp;
	// parent docid is 0
	sreq.setKey(firstIp,0LL,false);

	// forceDEl = false, niceness = 0
	m_xd.set4 ( &sreq , NULL , m_coll , NULL , 0 ); 

	//m_xd.m_isScraping = true;

	// download without throttling
	//m_xd.m_throttleDownload = false;

	// disregard this
	m_xd.m_useRobotsTxt = false;

	// this will tell it to index ahrefs first before indexing
	// the doc. but do NOT do this if we are from ahrefs.com
	// ourselves to avoid recursive explosion!!
	if ( m_useAhrefs )
		m_xd.m_useAhrefs = true;

	m_xd.m_reallyInjectLinks = m_injectLinks;

	//
	// rather than just add the links of the page to spiderdb,
	// let's inject them!
	//
	m_xd.setCallback ( this , doneInjectingLinksWrapper );

	// niceness is 0
	m_linkDedupTable.set(4,0,512,NULL,0,false,0,"ldtab2");

	// do we actually inject the links, or just scrape?
	if ( ! m_xd.injectLinks ( &m_linkDedupTable ,
				  NULL,
				  this , 
				  doneInjectingLinksWrapper ) ) 
		return false;
	// otherwise, just download the google/bing search results so we
	// can display them in xml
	//else if ( m_xd.getUtf8Content() == (char **)-1 )
	//	return false;
		
	// print reply..
	//printReply();
	return true;
}

Exemple #24

0

Afficher le fichier

Fichier : PageCatdb.cpp Projet : BILObilo/open-source-search-engine

bool sendReply ( void *state ) {
	StateCatdb *st = (StateCatdb*)state;
	// check for error
	if (g_errno) {
		if (st->m_catLookup)
			log("PageCatdb: Msg8b had error getting Site Rec: %s",
			    mstrerror(g_errno));
		else
			log("PageCatdb: Msg2a had error generating Catdb: %s",
			    mstrerror(g_errno));
		st->m_catLookup = false;
		g_errno = 0;
	}
	long long endTime = gettimeofdayInMilliseconds();
	// page buffer
	SafeBuf sb;
	sb.reserve(64*1024);
	// . print standard header
	// . do not print big links if only an assassin, just print host ids
	g_pages.printAdminTop ( &sb, st->m_socket , &st->m_r );

	sb.safePrintf(
		      "<style>"
		      ".poo { background-color:#%s;}\n"
		      "</style>\n" ,
		      LIGHT_BLUE );


	sb.safePrintf ( "<table %s>"
			"<tr><td colspan=2>"
			"<center><font size=+1><b>Catdb</b></font></center>"
			"</td></tr>", TABLE_STYLE );

	// instructions
	sb.safePrintf("<tr bgcolor=#%s>"
		      "<td colspan=3>"
		      "<font size=-2>"
		      "<center>"
		      "Don't just start using this, you need to follow the "
		      "instructions in the <i>admin guide</i> for adding "
		      "DMOZ support."
		      "</center>"
		      "</font>"
		      "</td>"
		      "</tr>"
		      ,DARK_BLUE
		      );

	// print the generate Catdb link
	sb.safePrintf ( "<tr class=poo><td>Update Catdb from DMOZ data.</td>"
			"<td><center>"
			"<a href=\"/master/catdb?c=%s&gencatdb=2\">"
			"Update Catdb</a> "
			"</center></td></tr>",
			st->m_coll );
	sb.safePrintf ( "<tr class=poo>"
			"<td>Generate New Catdb from DMOZ data.</td>"
			"<td><center>"
			"<a href=\"/master/catdb?c=%s&gencatdb=1\">"
			"Generate Catdb</a> "
			"</center></td></tr>",
			st->m_coll );
	if (st->m_genCatdb)
		sb.safePrintf ( "<tr class=poo>"
				"<td> Catdb Generation took %lli ms."
				"</td></tr>",
				endTime - st->m_startTime );
	// print Url Catgory Lookup
	sb.safePrintf ( "<tr class=poo><td>Lookup Category of Url.</td>"
			"<td><input type=text name=caturl size=80"
			" value=\"");
	if (st->m_catLookup) {
		sb.safeMemcpy(st->m_url.getUrl(), st->m_url.getUrlLen());
	}
	sb.safePrintf("\"></center></td></tr>" );
	// print Url Info if Lookup was done
	if (st->m_catLookup) {
		sb.safePrintf("<tr><td>");
		// print the url
		sb.safeMemcpy(st->m_url.getUrl(), st->m_url.getUrlLen());
		sb.safePrintf(" (%lli ms)</td><td>",
				endTime - st->m_startTime );
		// print each category id and path
		for (long i = 0; i < st->m_catRec.m_numCatids; i++) {
			sb.safePrintf("<b>[%li] ",
					st->m_catRec.m_catids[i]);
			g_categories->printPathFromId(&sb,
					st->m_catRec.m_catids[i]);
			sb.safePrintf("</b><br>");
			// lookup title and summary
			char  title[1024];
			long  titleLen = 0;
			char  summ[4096];
			long  summLen = 0;
			char  anchor[256];
			unsigned char anchorLen = 0;
			g_categories->getTitleAndSummary(
					st->m_url.getUrl(),
					st->m_url.getUrlLen(),
					st->m_catRec.m_catids[i],
					title,
					&titleLen,
					1023,
					summ,
					&summLen,
					4098,
					anchor,
					&anchorLen,
					255 );
			title[titleLen] = '\0';
			summ[summLen] = '\0';
			anchor[anchorLen] = '\0';
			// print title and summary
			sb.safePrintf("<b>Title:</b> %s<br>"
					"<b>Summary:</b> %s<br>",
					title, summ);
			if (anchorLen > 0)
				sb.safePrintf("<b>Anchor:</b> %s<br>",
						anchor);
			sb.safePrintf("<br>");
		}
		sb.safePrintf("<b>Filenum:</b> %li<br>",
				st->m_catRec.m_filenum);
		// print indirect catids
		if (st->m_catRec.m_numIndCatids > 0) {
			sb.safePrintf("<hr><b>Indirect Catids [%li]:"
					"</b><br>\n",
					st->m_catRec.m_numIndCatids );
			for (long i = 0;
				  i < st->m_catRec.m_numIndCatids; i++) {
				sb.safePrintf("%lu<br>",
					st->m_catRec.m_indCatids[i]);
			}
		}
		sb.safePrintf("</td></tr>");
	}
	// end it
	sb.safePrintf ( "</center></td></tr></table>" );
	// print submit button
	sb.safePrintf ( "<br><center>"
			"<input type=submit value=\"Submit\" border=0>"
			"</form></center>" );

	// print the final tail
	//p += g_httpServer.printTail ( p , pend - p );
	// clear g_errno, if any, so our reply send goes through
	g_errno = 0;
	// extract the socket
	TcpSocket *s = st->m_socket;
	// clear the state
	mdelete ( st, sizeof(StateCatdb), "PageCatdb" );
	delete st;
	// . send this page
	// . encapsulates in html header and tail
	// . make a Mime
	return g_httpServer.sendDynamicPage(s , sb.getBufStart(), sb.length());
}

Exemple #25

0

Afficher le fichier

Fichier : PageReindex.cpp Projet : exename/open-source-search-engine

void doneReindexing ( void *state ) {
	// cast it
	State13 *st = (State13 *)state;

	GigablastRequest *gr = &st->m_gr;

	// note it
	if ( gr->m_query && gr->m_query[0] )
		log(LOG_INFO,"admin: Done with query reindex. %s",
		    mstrerror(g_errno));

	////
	//
	// print the html page
	//
	/////

	HttpRequest *hr = &gr->m_hr;

	char format = hr->getReplyFormat();

	SafeBuf sb;

	const char *ct = "text/html";
	if ( format == FORMAT_JSON ) ct = "application/json";
	if ( format == FORMAT_XML  ) ct = "text/xml";

	if ( format == FORMAT_XML ) {
		sb.safePrintf("<response>\n"
			      "\t<statusCode>0</statusCode>\n"
			      "\t<statusMsg>Success</statusMsg>\n"
			      "\t<matchingResults>%" PRId32"</matchingResults>\n"
			      "</response>"
			      , st->m_msg1c.m_numDocIdsAdded
			      );
		g_httpServer.sendDynamicPage ( gr->m_socket,
					       sb.getBufStart(),
					       sb.length(),
					       -1,
					       false,ct);
		mdelete ( st , sizeof(State13) , "PageTagdb" );
		delete (st);
		return;
	}

	if ( format == FORMAT_JSON ) {
		sb.safePrintf("{\"response\":{\n"
			      "\t\"statusCode\":0,\n"
			      "\t\"statusMsg\":\"Success\",\n"
			      "\t\"matchingResults\":%" PRId32"\n"
			      "}\n"
			      "}\n"
			      , st->m_msg1c.m_numDocIdsAdded
			      );
		g_httpServer.sendDynamicPage ( gr->m_socket,
					       sb.getBufStart(),
					       sb.length(),
					       -1,
					       false,ct);
		mdelete ( st , sizeof(State13) , "PageTagdb" );
		delete (st);
		return;
	}



	g_pages.printAdminTop ( &sb , gr->m_socket , &gr->m_hr );

	sb.safePrintf("<style>"
		       ".poo { background-color:#%s;}\n"
		       "</style>\n" ,
		       LIGHT_BLUE );


	//
	// print error msg if any
	//

	if ( gr->m_query && gr->m_query[0] && ! g_errno )
		sb.safePrintf ( "<center><font color=red><b>Success. "
			  "Added %" PRId32" docid(s) to "
			  "spider queue.</b></font></center><br>" , 
			  st->m_msg1c.m_numDocIdsAdded );

	if ( gr->m_query && gr->m_query[0] && g_errno )
		sb.safePrintf ( "<center><font color=red><b>Error. "
				 "%s</b></font></center><br>" , 
				 mstrerror(g_errno));


	// print the reindex interface
	g_parms.printParmTable ( &sb , gr->m_socket , &gr->m_hr  );


	g_httpServer.sendDynamicPage ( gr->m_socket,
				       sb.getBufStart(),
				       sb.length(),
				       -1,
				       false);

	mdelete ( st , sizeof(State13) , "PageTagdb" );
	delete (st);
}

Exemple #26

0

Afficher le fichier

Fichier : PageAddUrl.cpp Projet : lemire/open-source-search-engine

bool sendReply ( void *state ) {
	GigablastRequest *gr = (GigablastRequest *)state;

	// in order to see what sites are being added log it, then we can
	// more easily remove sites from sitesearch.gigablast.com that are
	// being added but not being searched
	SafeBuf xb;
	if ( gr->m_urlsBuf ) {
		xb.safeTruncateEllipsis ( gr->m_urlsBuf , 200 );
		log( LOG_INFO, "http: add url %s (%s)", xb.getBufStart(), mstrerror( g_errno ) );
	}

	char format = gr->m_hr.getReplyFormat();
	TcpSocket *sock = gr->m_socket;

	if ( format == FORMAT_JSON || format == FORMAT_XML ) {
		bool status = g_httpServer.sendSuccessReply ( gr );
		// nuke state
		mdelete ( gr , sizeof(gr) , "PageAddUrl" );
		delete (gr);
		return status;
	}

	int32_t ulen = 0;
	const char *url = gr->m_urlsBuf;
	if ( url ) ulen = gbstrlen (url);

	// re-null it out if just http://
	bool printUrl = true;
	if ( ulen == 0 ) printUrl = false;
	if ( ! gr->m_urlsBuf       ) printUrl = false;
	if ( ulen==7 && printUrl && !strncasecmp(gr->m_url,"http://",7))
		printUrl = false;
	if ( ulen==8 && printUrl && !strncasecmp(gr->m_url,"https://",8))
		printUrl = false;

	// page is not more than 32k
	char buf[1024*32+MAX_URL_LEN*2];
	SafeBuf sb(buf, 1024*32+MAX_URL_LEN*2);

	g_pages.printAdminTop ( &sb , sock , &gr->m_hr );

	// if there was an error let them know
	SafeBuf mbuf;

	if ( g_errno ) {
		mbuf.safePrintf("<center><font color=red>");
		mbuf.safePrintf("Error adding url(s): <b>%s[%i]</b>", mstrerror(g_errno) , g_errno);
		mbuf.safePrintf("</font></center>");
	} else if ( printUrl ) {
		mbuf.safePrintf("<center><font color=red>");
		mbuf.safePrintf("<b><u>");
		mbuf.safeTruncateEllipsis(gr->m_urlsBuf,200);
		mbuf.safePrintf("</u></b></font> added to spider queue successfully<br><br>");
		mbuf.safePrintf("</font></center>");
	}

	if ( mbuf.length() ) {
		sb.safeStrcpy( mbuf.getBufStart() );
	}

	g_parms.printParmTable ( &sb , sock , &gr->m_hr );

	// print the final tail
	g_pages.printTail ( &sb, true ); // admin?

	// clear g_errno, if any, so our reply send goes through
	g_errno = 0;

	// nuke state
	mdelete ( gr , sizeof(GigablastRequest) , "PageAddUrl" );
	delete (gr);

	return g_httpServer.sendDynamicPage( sock, sb.getBufStart(), sb.length(), -1 ); // cachetime
}

Exemple #27

0

Afficher le fichier

Fichier : PageGet.cpp Projet : firatkarakusoglu/open-source-search-engine

// returns false if blocked, true otherwise
bool processLoop ( void *state ) {
	// get it
	State2 *st = (State2 *)state;
	// get the tcp socket from the state
	TcpSocket *s = st->m_socket;
	// get it
	XmlDoc *xd = &st->m_xd;

	if ( ! xd->m_loaded ) {
		// setting just the docid. niceness is 0.
		//xd->set3 ( st->m_docId , st->m_coll , 0 );
		// callback
		xd->setCallback ( state , processLoop );
		// . and tell it to load from the old title rec
		// . this sets xd->m_oldTitleRec/m_oldTitleRecSize
		// . this sets xd->ptr_* and all other member vars from
		//   the old title rec if found in titledb.
		if ( ! xd->loadFromOldTitleRec ( ) ) return false;
	}

	if ( g_errno ) return sendErrorReply ( st , g_errno );
	// now force it to load old title rec
	//char **tr = xd->getTitleRec();
	SafeBuf *tr = xd->getTitleRecBuf();
	// blocked? return false if so. it will call processLoop() when it rets
	if ( tr == (void *)-1 ) return false;
	// we did not block. check for error? this will free "st" too.
	if ( ! tr ) return sendErrorReply ( st , g_errno );
	// if title rec was empty, that is a problem
	if ( xd->m_titleRecBuf.length() == 0 ) 
		return sendErrorReply ( st , ENOTFOUND);

	// set callback
	char *na = xd->getIsNoArchive();
	// wait if blocked
	if ( na == (void *)-1 ) return false;
	// error?
	if ( ! na ) return sendErrorReply ( st , g_errno );
	// forbidden? allow turkeys through though...
	if ( ! st->m_isAdmin && *na )
		return sendErrorReply ( st , ENOCACHE );

	SafeBuf *sb = &st->m_sb;


	// &page=4 will print rainbow sections
	if ( ! st->m_printed && st->m_r.getLong("page",0) ) {
		// do not repeat this call
		st->m_printed = true;
		// this will call us again since we called
		// xd->setCallback() above to us
		if ( ! xd->printDocForProCog ( sb , &st->m_r ) )
			return false;
	}

	char *contentType = "text/html";
	char format = st->m_format;
	if ( format == FORMAT_XML ) contentType = "text/xml";
	if ( format == FORMAT_JSON ) contentType = "application/json";

	// if we printed a special page (like rainbow sections) then return now
	if ( st->m_printed ) {
		bool status = g_httpServer.sendDynamicPage (s,
							    //buf,bufLen,
							    sb->getBufStart(),
							    sb->getLength(),
							    -1,false,
							    //"text/html",
							    contentType,
							    -1, NULL, "utf8" );
		// nuke state2
		mdelete ( st , sizeof(State2) , "PageGet1" );
		delete (st);
		return status;
	}

	/*
	  // this was calling XmlDoc and setting sections, etc. to
	  // get the SpiderReply junk... no no no
	// is it banned or filtered? this ignores the TagRec in the titleRec
	// and uses msg8a to get it fresh instead
	char *vi = xd->getIsFiltered();//Visible( );
	// wait if blocked
	if ( vi == (void *)-1 ) return false;
	// error?
	if ( ! vi ) return sendErrorReply ( st , g_errno );
	// banned?
	if ( ! st->m_isAdmin && ! *vi ) return sendErrorReply (st,EDOCBANNED);
	*/

	// get the utf8 content
	char **utf8 = xd->getUtf8Content();
	//long   len  = xd->size_utf8Content - 1;
	// wait if blocked???
	if ( utf8 == (void *)-1 ) return false;
	// strange
	if ( xd->size_utf8Content<=0) {
		log("pageget: utf8 content <= 0");
		return sendErrorReply(st,EBADENGINEER );
	}
	// alloc error?
	if ( ! utf8 ) return sendErrorReply ( st , g_errno );

	// get this host
	Host *h = g_hostdb.getHost ( g_hostdb.m_hostId );
	if ( ! h ) {
		log("pageget: hostid %li is bad",g_hostdb.m_hostId);
		return sendErrorReply(st,EBADENGINEER );
	}


	char *content    = xd->ptr_utf8Content;
	long  contentLen = xd->size_utf8Content - 1;

	// shortcut
	char strip = st->m_strip;

	// alloc buffer now
	//char *buf = NULL;
	//long  bufMaxSize = 0;
	//bufMaxSize = len + ( 32 * 1024 ) ;
	//bufMaxSize = contentLen + ( 32 * 1024 ) ;
	//buf        = (char *)mmalloc ( bufMaxSize , "PageGet2" );
	//char *p          = buf;
	//char *bufEnd     = buf + bufMaxSize;
	//if ( ! buf ) {
	//	return sendErrorReply ( st , g_errno );
	//}

	// for undoing the header
	//char *start1 = p;
	long startLen1 = sb->length();

	// we are always utfu
	if ( strip != 2 )
		sb->safePrintf( "<meta http-equiv=\"Content-Type\" "
			     "content=\"text/html;charset=utf8\">\n");

	// base href
	//Url *base = &xd->m_firstUrl;
	//if ( xd->ptr_redirUrl.m_url[0] )
	//	base = &xd->m_redirUrl;
	char *base = xd->ptr_firstUrl;
	if ( xd->ptr_redirUrl ) base = xd->ptr_redirUrl;
	//Url *redir = *xd->getRedirUrl();
	if ( strip != 2 ) {
		sb->safePrintf ( "<BASE HREF=\"%s\">" , base );
		//p += gbstrlen ( p );
	}

	// default colors in case css files missing
	if ( strip != 2 ) {
		sb->safePrintf( "\n<style type=\"text/css\">\n"
			  "body{background-color:white;color:black;}\n"
			  "</style>\n");
		//p += gbstrlen ( p );
	}

	//char format = st->m_format;
	if ( format == FORMAT_XML ) sb->reset();
	if ( format == FORMAT_JSON ) sb->reset();

	// for undoing the stuff below
	long startLen2 = sb->length();//p;

	// query should be NULL terminated
	char *q    = st->m_q;
	long  qlen = st->m_qlen;

	char styleTitle[128] =  "font-size:14px;font-weight:600;"
				"color:#000000;";
	char styleText[128]  =  "font-size:14px;font-weight:400;"
				"color:#000000;";
	char styleLink[128] =  "font-size:14px;font-weight:400;"
				"color:#0000ff;";
	char styleTell[128] =  "font-size:14px;font-weight:600;"
				"color:#cc0000;";

	// get the url of the title rec
	Url *f = xd->getFirstUrl();

	bool printDisclaimer = st->m_printDisclaimer;

	if ( xd->m_contentType == CT_JSON )
		printDisclaimer = false;

	if ( format == FORMAT_XML ) printDisclaimer = false;
	if ( format == FORMAT_JSON ) printDisclaimer = false;

	char tbuf[100];
	tbuf[0] = 0;
	time_t lastSpiderDate = xd->m_spideredTime;

	if ( printDisclaimer ||
	     format == FORMAT_XML ||
	     format == FORMAT_JSON ) {
		struct tm *timeStruct = gmtime ( &lastSpiderDate );
		strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct);
	}

	// We should always be displaying this disclaimer.
	// - May eventually want to display this at a different location
	//   on the page, or on the click 'n' scroll browser page itself
	//   when this page is not being viewed solo.
	// CNS: if ( ! st->m_clickNScroll ) {
	if ( printDisclaimer ) {

		sb->safePrintf(//sprintf ( p , 
			  //"<BASE HREF=\"%s\">"
			  //"<table border=1 width=100%%>"
			  //"<tr><td>"
			  "<table border=\"1\" bgcolor=\"#"
			  BGCOLOR
			  "\" cellpadding=\"10\" "
			  //"id=\"gbcnsdisctable\" class=\"gbcnsdisctable_v\""
			  "cellspacing=\"0\" width=\"100%%\" color=\"#ffffff\">"
			  "<tr"
			  //" id=\"gbcnsdisctr\" class=\"gbcnsdisctr_v\""
			  "><td>"
			  //"<font face=times,sans-serif color=black size=-1>"
			  "<span style=\"%s\">"
			  "This is Gigablast's cached page of </span>"
			  "<a href=\"%s\" style=\"%s\">%s</a>"
			  "" , styleTitle, f->getUrl(), styleLink,
			  f->getUrl() );
		//p += gbstrlen ( p );
		// then the rest
		//sprintf(p , 
		sb->safePrintf(
			"<span style=\"%s\">. "
			"Gigablast is not responsible for the content of "
			"this page.</span>", styleTitle );
		//p += gbstrlen ( p );

		sb->safePrintf ( "<br/><span style=\"%s\">"
			  "Cached: </span>"
			  "<span style=\"%s\">",
			  styleTitle, styleText );
		//p += gbstrlen ( p );

		// then the spider date in GMT
		// time_t lastSpiderDate = xd->m_spideredTime;
		// struct tm *timeStruct = gmtime ( &lastSpiderDate );
		// char tbuf[100];
		// strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct);
		//p += gbstrlen ( p );
		sb->safeStrcpy(tbuf);

		// Moved over from PageResults.cpp
		sb->safePrintf( "</span> - <a href=\""
			      "/get?"
			      "q=%s&amp;c=%s&amp;rtq=%li&amp;"
			      "d=%lli&amp;strip=1\""
			      " style=\"%s\">"
			      "[stripped]</a>", 
			      q , st->m_coll , 
			      (long)st->m_rtq,
			      st->m_docId, styleLink ); 

		// a link to alexa
		if ( f->getUrlLen() > 5 ) {
			sb->safePrintf( " - <a href=\"http:"
					 "//web.archive.org/web/*/%s\""
					 " style=\"%s\">"
					 "[older copies]</a>" ,
					 f->getUrl(), styleLink );
		}

		if (st->m_noArchive){
			sb->safePrintf( " - <span style=\"%s\"><b>"
				     "[NOARCHIVE]</b></span>",
				     styleTell );
		}
		if (st->m_isBanned){
			sb->safePrintf(" - <span style=\"%s\"><b>"
				     "[BANNED]</b></span>",
				     styleTell );
		}

		// only print this if we got a query
		if ( qlen > 0 ) {
			sb->safePrintf("<br/><br/><span style=\"%s\"> "
				   "These search terms have been "
				   "highlighted:  ",
				   styleText );
			//p += gbstrlen ( p );
		}
		
	}

	// how much space left in p?
	//long avail = bufEnd - p;
	// . make the url that we're outputting for (like in PageResults.cpp)
	// . "thisUrl" is the baseUrl for click & scroll
	char thisUrl[MAX_URL_LEN];
	char *thisUrlEnd = thisUrl + MAX_URL_LEN;
	char *x = thisUrl;
	// . use the external ip of our gateway
	// . construct the NAT mapped port
	// . you should have used iptables to map port to the correct
	//   internal ip:port
	//unsigned long  ip   =g_conf.m_mainExternalIp  ; // h->m_externalIp;
	//unsigned short port=g_conf.m_mainExternalPort;//h->m_externalHttpPort
	// local check
	//if ( st->m_isLocal ) {
	unsigned long  ip   = h->m_ip;
	unsigned short port = h->m_httpPort;
	//}
	//sprintf ( x , "http://%s:%li/get?q=" , iptoa ( ip ) , port );
	// . we no longer put the port in here
	// . but still need http:// since we use <base href=>
	if (port == 80) sprintf(x,"http://%s/get?q=",iptoa(ip));
	else            sprintf(x,"http://%s:%hu/get?q=",iptoa(ip),port);
	x += gbstrlen ( x );
	// the query url encoded
	long elen = urlEncode ( x , thisUrlEnd - x , q , qlen );
	x += elen;
	// separate cgi vars with a &
	//sprintf ( x, "&seq=%li&rtq=%lid=%lli",
	//	  (long)st->m_seq,(long)st->m_rtq,st->m_msg22.getDocId());
	sprintf ( x, "&d=%lli",st->m_docId );
	x += gbstrlen(x);		
	// set our query for highlighting
	Query qq;
	qq.set2 ( q, st->m_langId , true );

	// print the query terms into our highlight buffer
	Highlight hi;
	// make words so we can set the scores to ignore fielded terms
	Words qw;
	qw.set ( q            ,  // content being highlighted, utf8
		 qlen         ,  // content being highlighted, utf8
		 TITLEREC_CURRENT_VERSION,
		 true         ,  // computeIds
		 false        ); // hasHtmlEntities?
	// . assign scores of 0 to query words that should be ignored
	// . TRICKY: loop over words in qq.m_qwords, but they should be 1-1
	//   with words in qw.
	// . sanity check
	//if ( qw.getNumWords() != qq.m_numWords ) { char *xx = NULL; *xx = 0;}
	// declare up here
	Matches m;
	// do the loop
	//Scores ss;
	//ss.set ( &qw , NULL );
	//for ( long i = 0 ; i < qq.m_numWords ; i++ )
	//	if ( ! m.matchWord ( &qq.m_qwords[i],i ) ) ss.m_scores[i] = 0;
	// now set m.m_matches[] to those words in qw that match a query word
	// or phrase in qq.
	m.setQuery ( &qq );
	//m.addMatches ( &qw , &ss , true );
	m.addMatches ( &qw );
	long hilen = 0;

	// CNS: if ( ! st->m_clickNScroll ) {
	// and highlight the matches
	if ( printDisclaimer ) {
		hilen = hi.set ( //p       ,
				 //avail   ,
				sb ,
				 &qw     , // words to highlight
				 &m      , // matches relative to qw
				 false   , // doSteming
				 false   , // st->m_clickAndScroll , 
				 (char *)thisUrl );// base url for ClcknScrll
		//p += hilen;
		// now an hr
		//memcpy ( p , "</span></table></table>\n" , 24 );   p += 24;
		sb->safeStrcpy("</span></table></table>\n");
	}


	bool includeHeader = st->m_includeHeader;

	// do not show header for json object display
	if ( xd->m_contentType == CT_JSON )
		includeHeader = false;

	if ( format == FORMAT_XML ) includeHeader = false;
	if ( format == FORMAT_JSON ) includeHeader = false;

	//mfree(uq, uqCapacity, "PageGet");
	// undo the header writes if we should
	if ( ! includeHeader ) {
		// including base href is off by default when not including
		// the header, so the caller must explicitly turn it back on
		if ( st->m_includeBaseHref ) sb->m_length=startLen2;//p=start2;
		else                         sb->m_length=startLen1;//p=start1;
	}

	//sb->safeStrcpy(tbuf);



	if ( format == FORMAT_XML ) {
		sb->safePrintf("<response>\n");
		sb->safePrintf("<statusCode>0</statusCode>\n");
		sb->safePrintf("<statusMsg>Success</statusMsg>\n");
		sb->safePrintf("<url><![CDATA[");
		sb->cdataEncode(xd->m_firstUrl.m_url);
		sb->safePrintf("]]></url>\n");
		sb->safePrintf("<docId>%llu</docId>\n",xd->m_docId);
		sb->safePrintf("\t<cachedTimeUTC>%lu</cachedTimeUTC>\n",
			      lastSpiderDate);
		sb->safePrintf("\t<cachedTimeStr>%s</cachedTimeStr>\n",tbuf);
	}

	if ( format == FORMAT_JSON ) {
		sb->safePrintf("{\"response\":{\n");
		sb->safePrintf("\t\"statusCode\":0,\n");
		sb->safePrintf("\t\"statusMsg\":\"Success\",\n");
		sb->safePrintf("\t\"url\":\"");
		sb->jsonEncode(xd->m_firstUrl.m_url);
		sb->safePrintf("\",\n");
		sb->safePrintf("\t\"docId\":%llu,\n",xd->m_docId);
		sb->safePrintf("\t\"cachedTimeUTC\":%lu,\n",lastSpiderDate);
		sb->safePrintf("\t\"cachedTimeStr\":\"%s\",\n",tbuf);
	}

	// identify start of <title> tag we wrote out
	char *sbstart = sb->getBufStart();
	char *sbend   = sb->getBufEnd();
	char *titleStart = NULL;
	char *titleEnd   = NULL;
	for ( char *t = sbstart ; t < sbend ; t++ ) {
		// title tag?
		if ( t[0]!='<' ) continue;
		if ( to_lower_a(t[1])!='t' ) continue;
		if ( to_lower_a(t[2])!='i' ) continue;
		if ( to_lower_a(t[3])!='t' ) continue;
		if ( to_lower_a(t[4])!='l' ) continue;
		if ( to_lower_a(t[5])!='e' ) continue;
		// point to it
		char *x = t + 5;
		// max - to keep things fast
		char *max = x + 500;
		for ( ; *x && *x != '>' && x < max ; x++ );
		x++;
		// find end
		char *e = x;
		for ( ; *e && e < max ; e++ ) {
			if ( e[0]=='<' &&
			     to_lower_a(e[1])=='/' &&
			     to_lower_a(e[2])=='t' &&
			     to_lower_a(e[3])=='i' &&
			     to_lower_a(e[4])=='t' &&
			     to_lower_a(e[5])=='l' &&
			     to_lower_a(e[6])=='e' )
				break;
		}
		if ( e < max ) {
			titleStart = x;
			titleEnd   = e;
		}
		break;
	}

	// . print title at top!
	// . consider moving
	if ( titleStart ) {

		char *ebuf = st->m_r.getString("eb");
		if ( ! ebuf ) ebuf = "";

		//p += sprintf ( p , 
		sb->safePrintf(
			       "<table border=1 "
			       "cellpadding=10 "
			       "cellspacing=0 "
			       "width=100%% "
			       "color=#ffffff>" );

		long printLinks = st->m_r.getLong("links",0);

		if ( ! printDisclaimer && printLinks )
			sb->safePrintf(//p += sprintf ( p , 
				       // first put cached and live link
				       "<tr>"
				       "<td bgcolor=lightyellow>"
				       // print cached link
				       //"<center>"
				       "&nbsp; "
				       "<b>"
				       "<a "
				       "style=\"font-size:18px;font-weight:600;"
				       "color:#000000;\" "
				       "href=\""
				       "/get?"
				       "c=%s&d=%lli&qh=0&cnsp=1&eb=%s\">"
				       "cached link</a>"
				       " &nbsp; "
				       "<a "
				       "style=\"font-size:18px;font-weight:600;"
				       "color:#000000;\" "
				       "href=%s>live link</a>"
				       "</b>"
				       //"</center>"
				       "</td>"
				       "</tr>\n"
				       ,st->m_coll
				       ,st->m_docId 
				       ,ebuf
				       ,thisUrl // st->ptr_ubuf
				       );

		if ( printLinks ) {
			sb->safePrintf(//p += sprintf ( p ,
				       "<tr><td bgcolor=pink>"
				       "<span style=\"font-size:18px;"
				       "font-weight:600;"
				       "color:#000000;\">"
				       "&nbsp; "
				       "<b>PAGE TITLE:</b> "
				       );
			long tlen = titleEnd - titleStart;
			sb->safeMemcpy ( titleStart , tlen );
			sb->safePrintf ( "</span></td></tr>" );
		}

		sb->safePrintf( "</table><br>\n" );

	}

	// is the content preformatted?
	bool pre = false;
	char ctype = (char)xd->m_contentType;
	if ( ctype == CT_TEXT ) pre = true ; // text/plain
	if ( ctype == CT_DOC  ) pre = true ; // filtered msword
	if ( ctype == CT_PS   ) pre = true ; // filtered postscript

	if ( format == FORMAT_XML ) pre = false;
	if ( format == FORMAT_JSON ) pre = false;

	// if it is content-type text, add a <pre>
	if ( pre ) {//p + 5 < bufEnd && pre ) {
		sb->safePrintf("<pre>");
		//p += 5;
	}

	if ( st->m_strip == 1 )
		contentLen = stripHtml( content, contentLen, 
					(long)xd->m_version, st->m_strip );
	// it returns -1 and sets g_errno on error, line OOM
	if ( contentLen == -1 ) {
		//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );	
		return sendErrorReply ( st , g_errno );
	}

	Xml xml;
	Words ww;

	// if no highlighting, skip it
	bool queryHighlighting = st->m_queryHighlighting;
	if ( st->m_strip == 2 ) queryHighlighting = false;

	// do not do term highlighting if json
	if ( xd->m_contentType == CT_JSON )
		queryHighlighting = false;

	SafeBuf tmp;
	SafeBuf *xb = sb;
	if ( format == FORMAT_XML ) xb = &tmp;
	if ( format == FORMAT_JSON ) xb = &tmp;
	

	if ( ! queryHighlighting ) {
		xb->safeMemcpy ( content , contentLen );
		//p += contentLen ;
	}
	else {
		// get the content as xhtml (should be NULL terminated)
		//Words *ww = xd->getWords();
		if ( ! xml.set ( content , contentLen , false ,
				 0 , false , TITLEREC_CURRENT_VERSION ,
				 false , 0 , CT_HTML ) ) { // niceness is 0
			//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
			return sendErrorReply ( st , g_errno );
		}			
		if ( ! ww.set ( &xml , true , 0 ) ) { // niceness is 0
			//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
			return sendErrorReply ( st , g_errno );
		}
		// sanity check
		//if ( ! xd->m_wordsValid ) { char *xx=NULL;*xx=0; }
		// how much space left in p?
		//avail = bufEnd - p;

		Matches m;
		m.setQuery ( &qq );
		m.addMatches ( &ww );
		hilen = hi.set ( xb , // p , avail , 
				 &ww , &m ,
				 false /*doStemming?*/ ,  
				 st->m_clickAndScroll , 
				 thisUrl /*base url for click & scroll*/);
		//p += hilen;
		log(LOG_DEBUG, "query: Done highlighting cached page content");
	}


	if ( format == FORMAT_XML ) {
		sb->safePrintf("\t<content><![CDATA[");
		sb->cdataEncode ( xb->getBufStart() );
		sb->safePrintf("]]></content>\n");
		sb->safePrintf("</response>\n");
	}

	if ( format == FORMAT_JSON ) {
		sb->safePrintf("\t\"content\":\"\n");
		sb->jsonEncode ( xb->getBufStart() );
		sb->safePrintf("\"\n}\n}\n");
	}


	// if it is content-type text, add a </pre>
	if ( pre ) { // p + 6 < bufEnd && pre ) {
		sb->safeMemcpy ( "</pre>" , 6 );
		//p += 6;
	}

	// calculate bufLen
	//long bufLen = p - buf;

	long ct = xd->m_contentType;

	// now filter the entire buffer to escape out the xml tags
	// so it is displayed nice
	SafeBuf newbuf;

	if ( ct == CT_XML ) {
		// encode the xml tags into &lt;tagname&gt; sequences
		if ( !newbuf.htmlEncodeXmlTags ( sb->getBufStart() ,
						 sb->getLength(),
						 0)){// niceness=0
			//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
			return sendErrorReply ( st , g_errno );
		}
		// free out buffer that we alloc'd before returning since this
		// should have copied it into another buffer
		//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );	
		// reassign
		//buf    = newbuf.getBufStart();
		//bufLen = newbuf.length();
		sb->stealBuf ( &newbuf );
	}

	// now encapsulate it in html head/tail and send it off
	// sendErr:
	contentType = "text/html";
	if ( strip == 2 ) contentType = "text/xml";
	// xml is usually buggy and this throws browser off
	//if ( ctype == CT_XML ) contentType = "text/xml";

	if ( xd->m_contentType == CT_JSON )
		contentType = "application/json";

	if ( format == FORMAT_XML ) contentType = "text/xml";
	if ( format == FORMAT_JSON ) contentType = "application/json";

	// safebuf, sb, is a member of "st" so this should copy the buffer
	// when it constructs the http reply, and we gotta call delete(st)
	// AFTER this so sb is still valid.
	bool status = g_httpServer.sendDynamicPage (s,
						    //buf,bufLen,
						    sb->getBufStart(),
						    sb->getLength(),
						    -1,false,
						    contentType,
						     -1, NULL, "utf8" );

	// nuke state2
	mdelete ( st , sizeof(State2) , "PageGet1" );
	delete (st);


	// free out buffer that we alloc'd before returning since this
	// should have copied it into another buffer

	//if      ( ct == CT_XML ) newbuf.purge();
	//else if ( buf          ) mfree ( buf , bufMaxSize , "PageGet2" );
	
	// and convey the status
	return status;
}

Exemple #28

0

Afficher le fichier

Fichier : qa.cpp Projet : firatkarakusoglu/open-source-search-engine

bool qainject2 ( ) {

	//if ( ! s_callback ) s_callback = qainject2;

	//
	// delete the 'qatest123' collection
	//
	//static bool s_x1 = false;
	if ( ! s_flags[0] ) {
		s_flags[0] = true;
		if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) )
			return false;
	}

	//
	// add the 'qatest123' collection
	//
	//static bool s_x2 = false;
	if ( ! s_flags[1] ) {
		s_flags[1] = true;
		if ( ! getUrl ( "/admin/addcoll?addcoll=qatest123&xml=1" , 
				// checksum of reply expected
				238170006 ) )
			return false;
	}


	//
	// try delimeter based injecting
	//
	//static bool s_y2 = false;
	if ( ! s_flags[7] ) {
		s_flags[7] = true;
		SafeBuf sb;
		// delim=+++URL:
		sb.safePrintf("&c=qatest123&deleteurl=0&"
			      "delim=%%2B%%2B%%2BURL%%3A&format=xml&u=xyz.com&"
			      "hasmime=1&content=");
		// use injectme3 file
		SafeBuf ubuf;
		ubuf.load("./injectme3");
		sb.urlEncode(ubuf.getBufStart());
		if ( ! getUrl ( "/admin/inject",
				// check reply, seems to have only a single 
				// docid in it
				-1970198487, sb.getBufStart()) )
			return false;
	}

	// now query check
	//static bool s_y4 = false;
	if ( ! s_flags[8] ) {
		wait(1.5);
		s_flags[8] = true;
		return false;
	}

	if ( ! s_flags[14] ) {
		s_flags[14] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=%2Bthe",
				-1804253505 ) )
			return false;
	}

	//static bool s_y5 = false;
	if ( ! s_flags[9] ) {
		s_flags[9] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=sports"
				"+news&ns=1&tml=20&smxcpl=30&"
				"sw=10&showimages=1"
				,-1874756636 ) )
			return false;
	}

	//static bool s_y6 = false;
	if ( ! s_flags[10] ) {
		s_flags[10] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=sports"
				"+news&ns=1&tml=20&smxcpl=30&"
				"sw=10&showimages=0&hacr=1"
				,1651330319 ) )
			return false;
	}

	//static bool s_y7 = false;
	if ( ! s_flags[11] ) {
		s_flags[11] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=sports"
				"+news&ns=1&tml=20&smxcpl=30&"
				"sw=10&showimages=0&sc=1"
				,-1405546537 ) )
			return false;
	}


	//
	// delete the 'qatest123' collection
	//
	if ( ! s_flags[12] ) {
		s_flags[12] = true;
		if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) )
			return false;
	}


	//static bool s_fee2 = false;
	if ( ! s_flags[13] ) {
		s_flags[13] = true;
		log("qa: SUCCESSFULLY COMPLETED "
			"QA INJECT TEST 2");
		//if ( s_callback == qainject ) exit(0);
		return true;
	}


	return true;
}

Exemple #29

0

Afficher le fichier

Fichier : PageParser.cpp Projet : acchou/open-source-search-engine

bool gotXmlDoc ( void *state ) {
	// cast it
	State8 *st = (State8 *)state;
	// get the xmldoc
	XmlDoc *xd = &st->m_xd;

	// if we loaded from old title rec, it should be there!


	// . save the ips.txt file if we are the test coll
	// . saveTestBuf() is a function in Msge1.cpp
	//if ( xd && xd->m_coll && ! strcmp ( xd->m_coll , "qatest123")) 
	//	// use same dir that XmlDoc::getTestDir() would use
	//	saveTestBuf ( "test-page-parser" );

	// error?
	if ( g_errno ) return sendErrorReply ( st , g_errno );

	// shortcut
	SafeBuf *xbuf = &st->m_xbuf;

	bool printIt = false;
	if ( st->m_u && st->m_u[0] ) printIt = true;
	if ( st->m_docId != -1LL ) printIt = true;
	if ( st->m_donePrinting ) printIt = false;

	// do not re-call this if printDocForProCog blocked... (check length())
	if ( printIt ) {
		// mark as done
		st->m_donePrinting = true;
		// always re-compute the page inlinks dynamically, do not
		// use the ptr_linkInfo1 stored in titlerec!!
		// NO! not if set from titlerec/docid
		if ( st->m_recompute )
			xd->m_linkInfo1Valid = false;
		// try a recompute regardless, because we do not store the
		// bad inlinkers, and ppl want to see why they are bad!
		//xd->m_linkInfo1Valid = false;
		// now get the meta list, in the process it will print out a 
		// bunch of junk into st->m_xbuf
		//char *metalist = xd->getMetaList ( );
		//if ( ! metalist ) return sendErrorReply ( st , g_errno );
		// return false if it blocked
		//if ( metalist == (void *)-1 ) return false;
		// for debug...
		//if ( ! xd->m_indexCode ) xd->doConsistencyTest ( false );
		// . print it out
		// . returns false if blocks, true otherwise
		// . sets g_errno on error
		if ( ! xd->printDocForProCog ( xbuf , &st->m_r ) )
			return false;
		// error?
		if ( g_errno ) return sendErrorReply ( st , g_errno );
	}

	long isXml = st->m_r.getLong("xml",0);
	char ctype2 = CT_HTML;
	if ( isXml ) ctype2 = CT_XML;
	// now encapsulate it in html head/tail and send it off
	bool status = g_httpServer.sendDynamicPage( st->m_s , 
						    xbuf->getBufStart(), 
						    xbuf->length() ,
						    -1, //cachtime
						    false ,//postreply?
						    &ctype2,
						    -1 , //httpstatus
						    NULL,//cookie
						    "utf-8");
	// delete the state now
	if ( st->m_freeIt ) {
		mdelete ( st , sizeof(State8) , "PageParser" );
		delete (st);
	}
	// return the status
	return status;
}

Exemple #30

0

Afficher le fichier

Fichier : PageTitledb.cpp Projet : chushuai/open-source-search-engine

// . make a web page from results stored in msg40
// . send it on TcpSocket "s" when done
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool gotTitleRec ( void *state ) {
	// cast the State4 out
	State4 *st = (State4 *) state;
	// get the socket
	TcpSocket *s = st->m_socket;

	SafeBuf sb;
	// get it's docId
	long long docId = st->m_docId;
	// make the query string for passing to different hosts
	char  qs[64];
	sprintf(qs,"&d=%lli",docId);
	if ( docId==0LL ) qs[0] = 0;
	// print standard header
	sb.reserve2x ( 32768 );
	g_pages.printAdminTop (&sb, st->m_socket, &st->m_r );
	//PAGE_TITLEDB,
	//		       st->m_username,//NULL ,
	//		       st->m_coll , st->m_pwd , s->m_ip , qs );
	// shortcut
	XmlDoc *xd = &st->m_xd;

	// . deal with errors
	// . print none if non title rec at or after the provided docId
	if ( g_errno || docId == 0LL || xd->m_titleRecBuf.length() <= 0 ) {
		// print docId in box
		sb.safePrintf (  "<center>\nEnter docId: "
				 "<input type=text name=d value=%lli size=15>",
				 docId);
		sb.safePrintf ( "</form><br>\n" );
		if ( docId == 0 ) 
			sb.safePrintf("<br>");
		else if ( g_errno ) 
			sb.safePrintf("<br><br>Error = %s",mstrerror(g_errno));
		else 
			sb.safePrintf("<br><br>No titleRec for that docId "
				      "or higher");
		// print where it should be
		//unsigned long gid = getGroupIdFromDocId ( docId );
		//Host *hosts = g_hostdb.getGroup(gid);
		long shardNum = getShardNumFromDocId ( docId );
		Host *hosts = g_hostdb.getShard ( shardNum );
		long hostId = -1;
		if ( hosts ) hostId = hosts[0].m_hostId;
		sb.safePrintf("<br><br>docId on host #%li and twins.",hostId);
		sb.safePrintf ( "\n</center>" );
		mdelete ( st , sizeof(State4) , "PageTitledb");
		delete (st);
		// erase g_errno for sending
		g_errno = 0;
		// now encapsulate it in html head/tail and send it off
		return g_httpServer.sendDynamicPage ( s , 
						      sb.getBufStart(),
						      sb.length() );
	}
	// print docId in box
	sb.safePrintf ("<br>\n"
		       "<center>Enter docId: "
		       "<input type=text name=d value=%lli size=15>", docId );
	// print where it should be
	//unsigned long gid = getGroupIdFromDocId ( docId );
	//Host *hosts = g_hostdb.getGroup(gid);
	long shardNum = getShardNumFromDocId ( docId );
	Host *hosts = g_hostdb.getShard ( shardNum );
	long hostId = -1;
	if ( hosts ) hostId = hosts[0].m_hostId;
	sb.safePrintf("<br><br>docId on host #%li and twins.",hostId);
	sb.safePrintf ( "</form><br>\n" );

	//char *coll    = st->m_coll;

	Title *ti = xd->getTitle();
	if ( ! ti ) {
		log ( "admin: Could not set title" );
		return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
	}
	// sanity check. should not block
	if ( ! xd->m_titleValid ) { char *xx=NULL;*xx=0; }

	// print it out
	xd->printDoc ( &sb );

	// don't forget to cleanup
	mdelete ( st , sizeof(State4) , "PageTitledb");
	delete (st);
	// now encapsulate it in html head/tail and send it off
	return g_httpServer.sendDynamicPage (s, sb.getBufStart(), sb.length());
}