Ejemplos de SafeBuf::safeMemcpy en C++ (Cpp)

Lenguaje de programación: C++ (Cpp)

Clase / Tipo: SafeBuf

Método / Función: safeMemcpy

Ejemplos en hotexamples.com: 12

C++ (Cpp) SafeBuf::safeMemcpy - 12 ejemplos encontrados. Estos son los ejemplos en C++ (Cpp) del mundo real mejor valorados de SafeBuf::safeMemcpy extraídos de proyectos de código abierto. Puedes valorar ejemplos para ayudarnos a mejorar la calidad de los ejemplos.

Métodos usados con frecuencia

Mostrar Ocultar

safePrintf(30)

getBufStart(30)

safeMemcpy(12)

urlEncode(8)

nullTerm(8)

reserve(7)

pushChar(5)

reset(5)

getBuf(4)

fillFromFile(3)

safeStrcpy(3)

load(3)

pushLong(3)

htmlEncode(3)

safeTruncateEllipsis(3)

detachBuf(2)

getBufPtr(2)

replaceChar(1)

save(1)

reserve2x(1)

setLength(1)

stealBuf(1)

cdataEncode(1)

pushFloat(1)

removeLastChar(1)

pushLongLong(1)

purge(1)

base64Encode(1)

length(1)

jsonEncode(1)

incrementLength(1)

htmlEncodeXmlTags(1)

getLength(1)

getCapacity(1)

dumpToFile(1)

getBufEnd(1)

base64Decode(1)

Ejemplo n.º 1

Mostrar archivo

Archivo: HashTableT.cpp Proyecto: BILObilo/open-source-search-engine

bool HashTableT<Key_t, Val_t>::serialize(SafeBuf& sb) {
	sb += m_numSlots;
	sb += m_numSlotsUsed;
	if(m_numSlots == 0) return true;
	bool x = true;
	x &= sb.safeMemcpy((char*)m_keys, sizeof(Key_t) * m_numSlots);
	x &= sb.safeMemcpy((char*)m_vals, sizeof(Val_t) * m_numSlots);
	return x;
}

Ejemplo n.º 2

Mostrar archivo

Archivo: PageInject.cpp Proyecto: chushuai/open-source-search-engine

void doneInjectingLinksWrapper ( void *state ) {
	Msg7 *msg7 = (Msg7 *)state;
	SafeBuf *sb = &msg7->m_sb;
	// copy the serps into ou rbuf
	if ( ! g_errno ) {
		// print header
		if ( sb->length() == 0 ) {
			// print header of page
			sb->safePrintf("<?xml version=\"1.0\" "
				       "encoding=\"UTF-8\" ?>\n"
				       "<response>\n" );
		}
		// serp header
		if ( msg7->m_round == 1 )
			sb->safePrintf("\t<googleResults>\n");
		else
			sb->safePrintf("\t<bingResults>\n");
		// print results
		sb->safeMemcpy(&msg7->m_xd.m_serpBuf);
		// end that
		if ( msg7->m_round == 1 )
			sb->safePrintf("\t</googleResults>\n");
		else
			sb->safePrintf("\t</bingResults>\n");
	}
	// do bing now
	if ( msg7->m_round == 1 ) {
		// return if it blocks
		if ( ! msg7->scrapeQuery() ) return;
	}

	// otherwise, parse out the search results so steve can display them
	if ( g_errno )
		sb->safePrintf("<error><![CDATA[%s]]></error>\n",
			       mstrerror(g_errno));
	// print header of page
	sb->safePrintf("</response>\n");
	// page is not more than 32k
	//char buf[1024*32];
	//char *p = buf;
	// return docid and hostid
	//p += sprintf ( p , "scraping status ");
	// print error msg out, too or "Success"
	//p += sprintf ( p , "%s", mstrerror(g_errno));
	TcpSocket *sock = msg7->m_socket;
	g_httpServer.sendDynamicPage ( sock, 
				       sb->getBufStart(),
				       sb->length(),
				       -1/*cachetime*/);
	// hopefully sb buffer is copied becaues this will free it:
	mdelete ( msg7, sizeof(Msg7) , "PageInject" );
	delete (msg7);
}

Ejemplo n.º 3

Mostrar archivo

Archivo: blaster.cpp Proyecto: BKJackson/open-source-search-engine

bool getWords() {
	FILE *fd = fopen ( "/usr/share/dict/words" , "r" );
	if ( ! fd ) {
		log("blaster:: failed to open /usr/share/dict/words %s",
		    mstrerror(errno)); 
		return 1; 
	}
	char tmp[1024];
	while ( fgets ( tmp , 1024 , fd ) ) {
		long len = gbstrlen(tmp);
		if(len > 2 && tmp[len-2] == 's' && tmp[len-3] == '\'') continue;
		s_windices += s_words.length();
		s_words.safeMemcpy(tmp, len-1); //copy in data minus the newline
		s_words += '\0';
	}
	fclose ( fd );
	log("blaster: read %li words, %li bytes in from dictionary.", 
	    s_windices.length() / sizeof(long), s_words.length());
	return true;
}

Ejemplo n.º 4

Mostrar archivo

Archivo: PageGet.cpp Proyecto: firatkarakusoglu/open-source-search-engine

// returns false if blocked, true otherwise
bool processLoop ( void *state ) {
	// get it
	State2 *st = (State2 *)state;
	// get the tcp socket from the state
	TcpSocket *s = st->m_socket;
	// get it
	XmlDoc *xd = &st->m_xd;

	if ( ! xd->m_loaded ) {
		// setting just the docid. niceness is 0.
		//xd->set3 ( st->m_docId , st->m_coll , 0 );
		// callback
		xd->setCallback ( state , processLoop );
		// . and tell it to load from the old title rec
		// . this sets xd->m_oldTitleRec/m_oldTitleRecSize
		// . this sets xd->ptr_* and all other member vars from
		//   the old title rec if found in titledb.
		if ( ! xd->loadFromOldTitleRec ( ) ) return false;
	}

	if ( g_errno ) return sendErrorReply ( st , g_errno );
	// now force it to load old title rec
	//char **tr = xd->getTitleRec();
	SafeBuf *tr = xd->getTitleRecBuf();
	// blocked? return false if so. it will call processLoop() when it rets
	if ( tr == (void *)-1 ) return false;
	// we did not block. check for error? this will free "st" too.
	if ( ! tr ) return sendErrorReply ( st , g_errno );
	// if title rec was empty, that is a problem
	if ( xd->m_titleRecBuf.length() == 0 ) 
		return sendErrorReply ( st , ENOTFOUND);

	// set callback
	char *na = xd->getIsNoArchive();
	// wait if blocked
	if ( na == (void *)-1 ) return false;
	// error?
	if ( ! na ) return sendErrorReply ( st , g_errno );
	// forbidden? allow turkeys through though...
	if ( ! st->m_isAdmin && *na )
		return sendErrorReply ( st , ENOCACHE );

	SafeBuf *sb = &st->m_sb;


	// &page=4 will print rainbow sections
	if ( ! st->m_printed && st->m_r.getLong("page",0) ) {
		// do not repeat this call
		st->m_printed = true;
		// this will call us again since we called
		// xd->setCallback() above to us
		if ( ! xd->printDocForProCog ( sb , &st->m_r ) )
			return false;
	}

	char *contentType = "text/html";
	char format = st->m_format;
	if ( format == FORMAT_XML ) contentType = "text/xml";
	if ( format == FORMAT_JSON ) contentType = "application/json";

	// if we printed a special page (like rainbow sections) then return now
	if ( st->m_printed ) {
		bool status = g_httpServer.sendDynamicPage (s,
							    //buf,bufLen,
							    sb->getBufStart(),
							    sb->getLength(),
							    -1,false,
							    //"text/html",
							    contentType,
							    -1, NULL, "utf8" );
		// nuke state2
		mdelete ( st , sizeof(State2) , "PageGet1" );
		delete (st);
		return status;
	}

	/*
	  // this was calling XmlDoc and setting sections, etc. to
	  // get the SpiderReply junk... no no no
	// is it banned or filtered? this ignores the TagRec in the titleRec
	// and uses msg8a to get it fresh instead
	char *vi = xd->getIsFiltered();//Visible( );
	// wait if blocked
	if ( vi == (void *)-1 ) return false;
	// error?
	if ( ! vi ) return sendErrorReply ( st , g_errno );
	// banned?
	if ( ! st->m_isAdmin && ! *vi ) return sendErrorReply (st,EDOCBANNED);
	*/

	// get the utf8 content
	char **utf8 = xd->getUtf8Content();
	//long   len  = xd->size_utf8Content - 1;
	// wait if blocked???
	if ( utf8 == (void *)-1 ) return false;
	// strange
	if ( xd->size_utf8Content<=0) {
		log("pageget: utf8 content <= 0");
		return sendErrorReply(st,EBADENGINEER );
	}
	// alloc error?
	if ( ! utf8 ) return sendErrorReply ( st , g_errno );

	// get this host
	Host *h = g_hostdb.getHost ( g_hostdb.m_hostId );
	if ( ! h ) {
		log("pageget: hostid %li is bad",g_hostdb.m_hostId);
		return sendErrorReply(st,EBADENGINEER );
	}


	char *content    = xd->ptr_utf8Content;
	long  contentLen = xd->size_utf8Content - 1;

	// shortcut
	char strip = st->m_strip;

	// alloc buffer now
	//char *buf = NULL;
	//long  bufMaxSize = 0;
	//bufMaxSize = len + ( 32 * 1024 ) ;
	//bufMaxSize = contentLen + ( 32 * 1024 ) ;
	//buf        = (char *)mmalloc ( bufMaxSize , "PageGet2" );
	//char *p          = buf;
	//char *bufEnd     = buf + bufMaxSize;
	//if ( ! buf ) {
	//	return sendErrorReply ( st , g_errno );
	//}

	// for undoing the header
	//char *start1 = p;
	long startLen1 = sb->length();

	// we are always utfu
	if ( strip != 2 )
		sb->safePrintf( "<meta http-equiv=\"Content-Type\" "
			     "content=\"text/html;charset=utf8\">\n");

	// base href
	//Url *base = &xd->m_firstUrl;
	//if ( xd->ptr_redirUrl.m_url[0] )
	//	base = &xd->m_redirUrl;
	char *base = xd->ptr_firstUrl;
	if ( xd->ptr_redirUrl ) base = xd->ptr_redirUrl;
	//Url *redir = *xd->getRedirUrl();
	if ( strip != 2 ) {
		sb->safePrintf ( "<BASE HREF=\"%s\">" , base );
		//p += gbstrlen ( p );
	}

	// default colors in case css files missing
	if ( strip != 2 ) {
		sb->safePrintf( "\n<style type=\"text/css\">\n"
			  "body{background-color:white;color:black;}\n"
			  "</style>\n");
		//p += gbstrlen ( p );
	}

	//char format = st->m_format;
	if ( format == FORMAT_XML ) sb->reset();
	if ( format == FORMAT_JSON ) sb->reset();

	// for undoing the stuff below
	long startLen2 = sb->length();//p;

	// query should be NULL terminated
	char *q    = st->m_q;
	long  qlen = st->m_qlen;

	char styleTitle[128] =  "font-size:14px;font-weight:600;"
				"color:#000000;";
	char styleText[128]  =  "font-size:14px;font-weight:400;"
				"color:#000000;";
	char styleLink[128] =  "font-size:14px;font-weight:400;"
				"color:#0000ff;";
	char styleTell[128] =  "font-size:14px;font-weight:600;"
				"color:#cc0000;";

	// get the url of the title rec
	Url *f = xd->getFirstUrl();

	bool printDisclaimer = st->m_printDisclaimer;

	if ( xd->m_contentType == CT_JSON )
		printDisclaimer = false;

	if ( format == FORMAT_XML ) printDisclaimer = false;
	if ( format == FORMAT_JSON ) printDisclaimer = false;

	char tbuf[100];
	tbuf[0] = 0;
	time_t lastSpiderDate = xd->m_spideredTime;

	if ( printDisclaimer ||
	     format == FORMAT_XML ||
	     format == FORMAT_JSON ) {
		struct tm *timeStruct = gmtime ( &lastSpiderDate );
		strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct);
	}

	// We should always be displaying this disclaimer.
	// - May eventually want to display this at a different location
	//   on the page, or on the click 'n' scroll browser page itself
	//   when this page is not being viewed solo.
	// CNS: if ( ! st->m_clickNScroll ) {
	if ( printDisclaimer ) {

		sb->safePrintf(//sprintf ( p , 
			  //"<BASE HREF=\"%s\">"
			  //"<table border=1 width=100%%>"
			  //"<tr><td>"
			  "<table border=\"1\" bgcolor=\"#"
			  BGCOLOR
			  "\" cellpadding=\"10\" "
			  //"id=\"gbcnsdisctable\" class=\"gbcnsdisctable_v\""
			  "cellspacing=\"0\" width=\"100%%\" color=\"#ffffff\">"
			  "<tr"
			  //" id=\"gbcnsdisctr\" class=\"gbcnsdisctr_v\""
			  "><td>"
			  //"<font face=times,sans-serif color=black size=-1>"
			  "<span style=\"%s\">"
			  "This is Gigablast's cached page of </span>"
			  "<a href=\"%s\" style=\"%s\">%s</a>"
			  "" , styleTitle, f->getUrl(), styleLink,
			  f->getUrl() );
		//p += gbstrlen ( p );
		// then the rest
		//sprintf(p , 
		sb->safePrintf(
			"<span style=\"%s\">. "
			"Gigablast is not responsible for the content of "
			"this page.</span>", styleTitle );
		//p += gbstrlen ( p );

		sb->safePrintf ( "<br/><span style=\"%s\">"
			  "Cached: </span>"
			  "<span style=\"%s\">",
			  styleTitle, styleText );
		//p += gbstrlen ( p );

		// then the spider date in GMT
		// time_t lastSpiderDate = xd->m_spideredTime;
		// struct tm *timeStruct = gmtime ( &lastSpiderDate );
		// char tbuf[100];
		// strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct);
		//p += gbstrlen ( p );
		sb->safeStrcpy(tbuf);

		// Moved over from PageResults.cpp
		sb->safePrintf( "</span> - <a href=\""
			      "/get?"
			      "q=%s&amp;c=%s&amp;rtq=%li&amp;"
			      "d=%lli&amp;strip=1\""
			      " style=\"%s\">"
			      "[stripped]</a>", 
			      q , st->m_coll , 
			      (long)st->m_rtq,
			      st->m_docId, styleLink ); 

		// a link to alexa
		if ( f->getUrlLen() > 5 ) {
			sb->safePrintf( " - <a href=\"http:"
					 "//web.archive.org/web/*/%s\""
					 " style=\"%s\">"
					 "[older copies]</a>" ,
					 f->getUrl(), styleLink );
		}

		if (st->m_noArchive){
			sb->safePrintf( " - <span style=\"%s\"><b>"
				     "[NOARCHIVE]</b></span>",
				     styleTell );
		}
		if (st->m_isBanned){
			sb->safePrintf(" - <span style=\"%s\"><b>"
				     "[BANNED]</b></span>",
				     styleTell );
		}

		// only print this if we got a query
		if ( qlen > 0 ) {
			sb->safePrintf("<br/><br/><span style=\"%s\"> "
				   "These search terms have been "
				   "highlighted:  ",
				   styleText );
			//p += gbstrlen ( p );
		}
		
	}

	// how much space left in p?
	//long avail = bufEnd - p;
	// . make the url that we're outputting for (like in PageResults.cpp)
	// . "thisUrl" is the baseUrl for click & scroll
	char thisUrl[MAX_URL_LEN];
	char *thisUrlEnd = thisUrl + MAX_URL_LEN;
	char *x = thisUrl;
	// . use the external ip of our gateway
	// . construct the NAT mapped port
	// . you should have used iptables to map port to the correct
	//   internal ip:port
	//unsigned long  ip   =g_conf.m_mainExternalIp  ; // h->m_externalIp;
	//unsigned short port=g_conf.m_mainExternalPort;//h->m_externalHttpPort
	// local check
	//if ( st->m_isLocal ) {
	unsigned long  ip   = h->m_ip;
	unsigned short port = h->m_httpPort;
	//}
	//sprintf ( x , "http://%s:%li/get?q=" , iptoa ( ip ) , port );
	// . we no longer put the port in here
	// . but still need http:// since we use <base href=>
	if (port == 80) sprintf(x,"http://%s/get?q=",iptoa(ip));
	else            sprintf(x,"http://%s:%hu/get?q=",iptoa(ip),port);
	x += gbstrlen ( x );
	// the query url encoded
	long elen = urlEncode ( x , thisUrlEnd - x , q , qlen );
	x += elen;
	// separate cgi vars with a &
	//sprintf ( x, "&seq=%li&rtq=%lid=%lli",
	//	  (long)st->m_seq,(long)st->m_rtq,st->m_msg22.getDocId());
	sprintf ( x, "&d=%lli",st->m_docId );
	x += gbstrlen(x);		
	// set our query for highlighting
	Query qq;
	qq.set2 ( q, st->m_langId , true );

	// print the query terms into our highlight buffer
	Highlight hi;
	// make words so we can set the scores to ignore fielded terms
	Words qw;
	qw.set ( q            ,  // content being highlighted, utf8
		 qlen         ,  // content being highlighted, utf8
		 TITLEREC_CURRENT_VERSION,
		 true         ,  // computeIds
		 false        ); // hasHtmlEntities?
	// . assign scores of 0 to query words that should be ignored
	// . TRICKY: loop over words in qq.m_qwords, but they should be 1-1
	//   with words in qw.
	// . sanity check
	//if ( qw.getNumWords() != qq.m_numWords ) { char *xx = NULL; *xx = 0;}
	// declare up here
	Matches m;
	// do the loop
	//Scores ss;
	//ss.set ( &qw , NULL );
	//for ( long i = 0 ; i < qq.m_numWords ; i++ )
	//	if ( ! m.matchWord ( &qq.m_qwords[i],i ) ) ss.m_scores[i] = 0;
	// now set m.m_matches[] to those words in qw that match a query word
	// or phrase in qq.
	m.setQuery ( &qq );
	//m.addMatches ( &qw , &ss , true );
	m.addMatches ( &qw );
	long hilen = 0;

	// CNS: if ( ! st->m_clickNScroll ) {
	// and highlight the matches
	if ( printDisclaimer ) {
		hilen = hi.set ( //p       ,
				 //avail   ,
				sb ,
				 &qw     , // words to highlight
				 &m      , // matches relative to qw
				 false   , // doSteming
				 false   , // st->m_clickAndScroll , 
				 (char *)thisUrl );// base url for ClcknScrll
		//p += hilen;
		// now an hr
		//memcpy ( p , "</span></table></table>\n" , 24 );   p += 24;
		sb->safeStrcpy("</span></table></table>\n");
	}


	bool includeHeader = st->m_includeHeader;

	// do not show header for json object display
	if ( xd->m_contentType == CT_JSON )
		includeHeader = false;

	if ( format == FORMAT_XML ) includeHeader = false;
	if ( format == FORMAT_JSON ) includeHeader = false;

	//mfree(uq, uqCapacity, "PageGet");
	// undo the header writes if we should
	if ( ! includeHeader ) {
		// including base href is off by default when not including
		// the header, so the caller must explicitly turn it back on
		if ( st->m_includeBaseHref ) sb->m_length=startLen2;//p=start2;
		else                         sb->m_length=startLen1;//p=start1;
	}

	//sb->safeStrcpy(tbuf);



	if ( format == FORMAT_XML ) {
		sb->safePrintf("<response>\n");
		sb->safePrintf("<statusCode>0</statusCode>\n");
		sb->safePrintf("<statusMsg>Success</statusMsg>\n");
		sb->safePrintf("<url><![CDATA[");
		sb->cdataEncode(xd->m_firstUrl.m_url);
		sb->safePrintf("]]></url>\n");
		sb->safePrintf("<docId>%llu</docId>\n",xd->m_docId);
		sb->safePrintf("\t<cachedTimeUTC>%lu</cachedTimeUTC>\n",
			      lastSpiderDate);
		sb->safePrintf("\t<cachedTimeStr>%s</cachedTimeStr>\n",tbuf);
	}

	if ( format == FORMAT_JSON ) {
		sb->safePrintf("{\"response\":{\n");
		sb->safePrintf("\t\"statusCode\":0,\n");
		sb->safePrintf("\t\"statusMsg\":\"Success\",\n");
		sb->safePrintf("\t\"url\":\"");
		sb->jsonEncode(xd->m_firstUrl.m_url);
		sb->safePrintf("\",\n");
		sb->safePrintf("\t\"docId\":%llu,\n",xd->m_docId);
		sb->safePrintf("\t\"cachedTimeUTC\":%lu,\n",lastSpiderDate);
		sb->safePrintf("\t\"cachedTimeStr\":\"%s\",\n",tbuf);
	}

	// identify start of <title> tag we wrote out
	char *sbstart = sb->getBufStart();
	char *sbend   = sb->getBufEnd();
	char *titleStart = NULL;
	char *titleEnd   = NULL;
	for ( char *t = sbstart ; t < sbend ; t++ ) {
		// title tag?
		if ( t[0]!='<' ) continue;
		if ( to_lower_a(t[1])!='t' ) continue;
		if ( to_lower_a(t[2])!='i' ) continue;
		if ( to_lower_a(t[3])!='t' ) continue;
		if ( to_lower_a(t[4])!='l' ) continue;
		if ( to_lower_a(t[5])!='e' ) continue;
		// point to it
		char *x = t + 5;
		// max - to keep things fast
		char *max = x + 500;
		for ( ; *x && *x != '>' && x < max ; x++ );
		x++;
		// find end
		char *e = x;
		for ( ; *e && e < max ; e++ ) {
			if ( e[0]=='<' &&
			     to_lower_a(e[1])=='/' &&
			     to_lower_a(e[2])=='t' &&
			     to_lower_a(e[3])=='i' &&
			     to_lower_a(e[4])=='t' &&
			     to_lower_a(e[5])=='l' &&
			     to_lower_a(e[6])=='e' )
				break;
		}
		if ( e < max ) {
			titleStart = x;
			titleEnd   = e;
		}
		break;
	}

	// . print title at top!
	// . consider moving
	if ( titleStart ) {

		char *ebuf = st->m_r.getString("eb");
		if ( ! ebuf ) ebuf = "";

		//p += sprintf ( p , 
		sb->safePrintf(
			       "<table border=1 "
			       "cellpadding=10 "
			       "cellspacing=0 "
			       "width=100%% "
			       "color=#ffffff>" );

		long printLinks = st->m_r.getLong("links",0);

		if ( ! printDisclaimer && printLinks )
			sb->safePrintf(//p += sprintf ( p , 
				       // first put cached and live link
				       "<tr>"
				       "<td bgcolor=lightyellow>"
				       // print cached link
				       //"<center>"
				       "&nbsp; "
				       "<b>"
				       "<a "
				       "style=\"font-size:18px;font-weight:600;"
				       "color:#000000;\" "
				       "href=\""
				       "/get?"
				       "c=%s&d=%lli&qh=0&cnsp=1&eb=%s\">"
				       "cached link</a>"
				       " &nbsp; "
				       "<a "
				       "style=\"font-size:18px;font-weight:600;"
				       "color:#000000;\" "
				       "href=%s>live link</a>"
				       "</b>"
				       //"</center>"
				       "</td>"
				       "</tr>\n"
				       ,st->m_coll
				       ,st->m_docId 
				       ,ebuf
				       ,thisUrl // st->ptr_ubuf
				       );

		if ( printLinks ) {
			sb->safePrintf(//p += sprintf ( p ,
				       "<tr><td bgcolor=pink>"
				       "<span style=\"font-size:18px;"
				       "font-weight:600;"
				       "color:#000000;\">"
				       "&nbsp; "
				       "<b>PAGE TITLE:</b> "
				       );
			long tlen = titleEnd - titleStart;
			sb->safeMemcpy ( titleStart , tlen );
			sb->safePrintf ( "</span></td></tr>" );
		}

		sb->safePrintf( "</table><br>\n" );

	}

	// is the content preformatted?
	bool pre = false;
	char ctype = (char)xd->m_contentType;
	if ( ctype == CT_TEXT ) pre = true ; // text/plain
	if ( ctype == CT_DOC  ) pre = true ; // filtered msword
	if ( ctype == CT_PS   ) pre = true ; // filtered postscript

	if ( format == FORMAT_XML ) pre = false;
	if ( format == FORMAT_JSON ) pre = false;

	// if it is content-type text, add a <pre>
	if ( pre ) {//p + 5 < bufEnd && pre ) {
		sb->safePrintf("<pre>");
		//p += 5;
	}

	if ( st->m_strip == 1 )
		contentLen = stripHtml( content, contentLen, 
					(long)xd->m_version, st->m_strip );
	// it returns -1 and sets g_errno on error, line OOM
	if ( contentLen == -1 ) {
		//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );	
		return sendErrorReply ( st , g_errno );
	}

	Xml xml;
	Words ww;

	// if no highlighting, skip it
	bool queryHighlighting = st->m_queryHighlighting;
	if ( st->m_strip == 2 ) queryHighlighting = false;

	// do not do term highlighting if json
	if ( xd->m_contentType == CT_JSON )
		queryHighlighting = false;

	SafeBuf tmp;
	SafeBuf *xb = sb;
	if ( format == FORMAT_XML ) xb = &tmp;
	if ( format == FORMAT_JSON ) xb = &tmp;
	

	if ( ! queryHighlighting ) {
		xb->safeMemcpy ( content , contentLen );
		//p += contentLen ;
	}
	else {
		// get the content as xhtml (should be NULL terminated)
		//Words *ww = xd->getWords();
		if ( ! xml.set ( content , contentLen , false ,
				 0 , false , TITLEREC_CURRENT_VERSION ,
				 false , 0 , CT_HTML ) ) { // niceness is 0
			//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
			return sendErrorReply ( st , g_errno );
		}			
		if ( ! ww.set ( &xml , true , 0 ) ) { // niceness is 0
			//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
			return sendErrorReply ( st , g_errno );
		}
		// sanity check
		//if ( ! xd->m_wordsValid ) { char *xx=NULL;*xx=0; }
		// how much space left in p?
		//avail = bufEnd - p;

		Matches m;
		m.setQuery ( &qq );
		m.addMatches ( &ww );
		hilen = hi.set ( xb , // p , avail , 
				 &ww , &m ,
				 false /*doStemming?*/ ,  
				 st->m_clickAndScroll , 
				 thisUrl /*base url for click & scroll*/);
		//p += hilen;
		log(LOG_DEBUG, "query: Done highlighting cached page content");
	}


	if ( format == FORMAT_XML ) {
		sb->safePrintf("\t<content><![CDATA[");
		sb->cdataEncode ( xb->getBufStart() );
		sb->safePrintf("]]></content>\n");
		sb->safePrintf("</response>\n");
	}

	if ( format == FORMAT_JSON ) {
		sb->safePrintf("\t\"content\":\"\n");
		sb->jsonEncode ( xb->getBufStart() );
		sb->safePrintf("\"\n}\n}\n");
	}


	// if it is content-type text, add a </pre>
	if ( pre ) { // p + 6 < bufEnd && pre ) {
		sb->safeMemcpy ( "</pre>" , 6 );
		//p += 6;
	}

	// calculate bufLen
	//long bufLen = p - buf;

	long ct = xd->m_contentType;

	// now filter the entire buffer to escape out the xml tags
	// so it is displayed nice
	SafeBuf newbuf;

	if ( ct == CT_XML ) {
		// encode the xml tags into &lt;tagname&gt; sequences
		if ( !newbuf.htmlEncodeXmlTags ( sb->getBufStart() ,
						 sb->getLength(),
						 0)){// niceness=0
			//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
			return sendErrorReply ( st , g_errno );
		}
		// free out buffer that we alloc'd before returning since this
		// should have copied it into another buffer
		//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );	
		// reassign
		//buf    = newbuf.getBufStart();
		//bufLen = newbuf.length();
		sb->stealBuf ( &newbuf );
	}

	// now encapsulate it in html head/tail and send it off
	// sendErr:
	contentType = "text/html";
	if ( strip == 2 ) contentType = "text/xml";
	// xml is usually buggy and this throws browser off
	//if ( ctype == CT_XML ) contentType = "text/xml";

	if ( xd->m_contentType == CT_JSON )
		contentType = "application/json";

	if ( format == FORMAT_XML ) contentType = "text/xml";
	if ( format == FORMAT_JSON ) contentType = "application/json";

	// safebuf, sb, is a member of "st" so this should copy the buffer
	// when it constructs the http reply, and we gotta call delete(st)
	// AFTER this so sb is still valid.
	bool status = g_httpServer.sendDynamicPage (s,
						    //buf,bufLen,
						    sb->getBufStart(),
						    sb->getLength(),
						    -1,false,
						    contentType,
						     -1, NULL, "utf8" );

	// nuke state2
	mdelete ( st , sizeof(State2) , "PageGet1" );
	delete (st);


	// free out buffer that we alloc'd before returning since this
	// should have copied it into another buffer

	//if      ( ct == CT_XML ) newbuf.purge();
	//else if ( buf          ) mfree ( buf , bufMaxSize , "PageGet2" );
	
	// and convey the status
	return status;
}

Ejemplo n.º 5

Mostrar archivo

Archivo: Turkdb.cpp Proyecto: DeadNumbers/open-source-search-engine

bool sendTurkPageReply ( State60 *st ) {

	XmlDoc *xd = &st->m_xd;
	//char *content    = xd->ptr_utf8Content;
	//int32_t  contentLen = xd->size_utf8Content - 1;

	// count the total number of EventDesc classes for all evids
	//char *evd = xd->ptr_eventData;
	//EventDisplay *ed = (EventDisplay *)evd;
	//char *addr = evd + (int32_t)ed->m_addr;
	//char timeZoneOffset = getTimeZoneFromAddr ( addr );

	// in case getSections() block come right back in
	xd->setCallback ( st , xdcallback );

	// . set niceness to 1 so all this processing doesn't slow queries down
	// . however, g_niceness should still be zero... hmmm...
	xd->m_niceness = 1;

	// default to 1 niceness
	st->m_niceness = 1;

	// now set the sections class
	Sections *ss = xd->getSections();

	// now for each section with alnum text, telescope up as far as 
	// possible without containing anymore alnum text than what it 
	// contained. set SEC_CONTROL bit. such sections will have the
	// 2 green/blue dots, that are used for turning on/off title/desc.
	// but really the indians will only turn off sections that should
	// not have a title/desc.
	for ( Section *si = ss->m_rootSection ; si ; si = si->m_next ) {
		// breathe
		QUICKPOLL(st->m_niceness);
		// skip if does not have text
		if ( si->m_firstWordPos < 0 ) continue;
		// otherwise, find biggest parent that contains just that text
		Section *p    = si->m_parent;
		Section *last = si;
		for ( ; p ; p = p->m_parent ) {
			if ( p->m_firstWordPos != si->m_firstWordPos ) break;
			if ( p->m_lastWordPos  != si->m_lastWordPos  ) break;
			last = p;
		}
		// set that bit then
		last->m_flags |= SEC_CONTROL;
		// and speed up the loop
		si = last;
	}

	// * now each SEC_CONTROL sections have a fence activated by a turker

	// * an event title or description can not span a fence. it must be
	//   confined within a fence. however, it is allowed to include
	//   title or description from a "title section".

	// * hold shift down to designate as title section when clicking it

	// * show the raw text of each event changing as you fence
	//   sections in or out.  show in a right frame.

	// * show list of events on page in the top frame. can toggle them
	//   all individually.

	// * and remove no-display from all tags so we can see everything.

	// * highlight addresses, not just dates.

	// * each section hash has its own unique bg color when activated

	// * with a single click, completely reject an event because:
	//   contains bad time, address, title or desc. specify which so
	//   we can improve our algo.

	// * when selecting an individual event, scroll to its tod...

	// * remove all color from webpage that we can so our colors show up

	// * remove all imgs. just src them to dev null.

	// * allow for entering a custom title for an event or all events
	//   that are or will ever appear on the page. 

	// * when displaying the text of the events, use hyphens to
	//   delineate the section topology. strike out text as a section
	//   fence is activated.

	// * when a section is activated is it easier to just redownload
	//   the whole text of the page? maybe just the text frame?

	// * clicking on an individual sentence section should just remove
	//   that sentence. that is kinda a special content hash removal
	//   tag. like "Click here for video."

	// * when an event id is selected i guess activate its bgcolor to
	//   be light blue for all sentences currently in the event that
	//   are not in activated sections. (make exception for designated 
	//   title sections). so we need multiple tags for each events
	//   sentence div section. if sentence is split use multiple div tags
	//   then to keep the order. so each event sentence would have 
	//   <div ev1=1 ev2=1 ev10=1>...</div> if it is in event ids 1,2 and
	//   10. that way we can activate it when one of those event ids is
	//   activated.


	SafeBuf sb;

	// int16_tcuts
	if ( ! xd->m_wordsValid ) { char *xx=NULL;*xx=0; }
	Words     *words = &xd->m_words;
	int32_t       nw    = words->getNumWords();
	char     **wptrs = words->getWords();
	int32_t      *wlens = words->getWordLens();
	nodeid_t  *tids  = words->getTagIds();

	// a special array for printing </div> tags
	char *endCounts = (char *)mcalloc ( nw ,"endcounts");
	if ( ! endCounts ) return sendErrorReply ( st , g_errno );


	// 
	// now loop over all the words. if word starts a section that has
	// SEC_CONTROL bit set, and print out the section hash and a color
	// tag to be activated if the turkey activates us.
	// CAUTION: word may start multiple sections.
	//
	for ( int32_t i = 0 ; i < nw ; i++ ) { 
		// get section ptr
		Section *sj = ss->m_sectionPtrs[i];
		// sanity check. sj must be first section ptr that starts @ a
		if ( sj && sj->m_a==i && sj->m_prev && sj->m_prev->m_a==i ) {
			char *xx=NULL;*xx=0; }
		// . does word #i start a section?
		// . if section is control, print out the control
		while ( sj && sj->m_a == i ) {
			// print this section's hash
			if ( sj->m_flags & SEC_CONTROL) {
				// after the turkeys have made all the edits
				// they need to submit the changes they made.
				// how can we get that data sent back to the
				// back end? we need to send back the colors
				// of the sections that have been activated
				// i guess. just do a loop over them.
				sb.safePrintf("<div nobreak gbsecid=%"UINT32" "
					      "bgcolor=#%"XINT32" "
					      "onclick=gbtogglecolor()>",
					      (uint32_t)sj->m_tagHash,
					      (uint32_t)sj->m_tagHash);
				// sanity check
				if ( sj->m_b < 0  ) { char *xx=NULL;*xx=0; }
				if ( sj->m_b > nw ) { char *xx=NULL;*xx=0; }
				// and inc the /div count for that word
				endCounts[sj->m_b-1]++;
			}
			// try next section too
			sj = sj->m_next;
		}
		// if this is a tag, remove any coloring
		if ( tids[i] ) {
		}
		// print the word, be it a tag, alnum, punct
		sb.safeMemcpy ( wptrs[i] , wlens[i] );
		// end a div tag?
		if ( ! endCounts[i] ) continue;
		// might be many so loop it
		for ( int32_t j = 0 ; j < endCounts[i] ; j++ )
			sb.safePrintf("</div>");
	}			







	return false;
}

Ejemplo n.º 6

Mostrar archivo

Archivo: PageCatdb.cpp Proyecto: BILObilo/open-source-search-engine

bool sendReply ( void *state ) {
	StateCatdb *st = (StateCatdb*)state;
	// check for error
	if (g_errno) {
		if (st->m_catLookup)
			log("PageCatdb: Msg8b had error getting Site Rec: %s",
			    mstrerror(g_errno));
		else
			log("PageCatdb: Msg2a had error generating Catdb: %s",
			    mstrerror(g_errno));
		st->m_catLookup = false;
		g_errno = 0;
	}
	long long endTime = gettimeofdayInMilliseconds();
	// page buffer
	SafeBuf sb;
	sb.reserve(64*1024);
	// . print standard header
	// . do not print big links if only an assassin, just print host ids
	g_pages.printAdminTop ( &sb, st->m_socket , &st->m_r );

	sb.safePrintf(
		      "<style>"
		      ".poo { background-color:#%s;}\n"
		      "</style>\n" ,
		      LIGHT_BLUE );


	sb.safePrintf ( "<table %s>"
			"<tr><td colspan=2>"
			"<center><font size=+1><b>Catdb</b></font></center>"
			"</td></tr>", TABLE_STYLE );

	// instructions
	sb.safePrintf("<tr bgcolor=#%s>"
		      "<td colspan=3>"
		      "<font size=-2>"
		      "<center>"
		      "Don't just start using this, you need to follow the "
		      "instructions in the <i>admin guide</i> for adding "
		      "DMOZ support."
		      "</center>"
		      "</font>"
		      "</td>"
		      "</tr>"
		      ,DARK_BLUE
		      );

	// print the generate Catdb link
	sb.safePrintf ( "<tr class=poo><td>Update Catdb from DMOZ data.</td>"
			"<td><center>"
			"<a href=\"/master/catdb?c=%s&gencatdb=2\">"
			"Update Catdb</a> "
			"</center></td></tr>",
			st->m_coll );
	sb.safePrintf ( "<tr class=poo>"
			"<td>Generate New Catdb from DMOZ data.</td>"
			"<td><center>"
			"<a href=\"/master/catdb?c=%s&gencatdb=1\">"
			"Generate Catdb</a> "
			"</center></td></tr>",
			st->m_coll );
	if (st->m_genCatdb)
		sb.safePrintf ( "<tr class=poo>"
				"<td> Catdb Generation took %lli ms."
				"</td></tr>",
				endTime - st->m_startTime );
	// print Url Catgory Lookup
	sb.safePrintf ( "<tr class=poo><td>Lookup Category of Url.</td>"
			"<td><input type=text name=caturl size=80"
			" value=\"");
	if (st->m_catLookup) {
		sb.safeMemcpy(st->m_url.getUrl(), st->m_url.getUrlLen());
	}
	sb.safePrintf("\"></center></td></tr>" );
	// print Url Info if Lookup was done
	if (st->m_catLookup) {
		sb.safePrintf("<tr><td>");
		// print the url
		sb.safeMemcpy(st->m_url.getUrl(), st->m_url.getUrlLen());
		sb.safePrintf(" (%lli ms)</td><td>",
				endTime - st->m_startTime );
		// print each category id and path
		for (long i = 0; i < st->m_catRec.m_numCatids; i++) {
			sb.safePrintf("<b>[%li] ",
					st->m_catRec.m_catids[i]);
			g_categories->printPathFromId(&sb,
					st->m_catRec.m_catids[i]);
			sb.safePrintf("</b><br>");
			// lookup title and summary
			char  title[1024];
			long  titleLen = 0;
			char  summ[4096];
			long  summLen = 0;
			char  anchor[256];
			unsigned char anchorLen = 0;
			g_categories->getTitleAndSummary(
					st->m_url.getUrl(),
					st->m_url.getUrlLen(),
					st->m_catRec.m_catids[i],
					title,
					&titleLen,
					1023,
					summ,
					&summLen,
					4098,
					anchor,
					&anchorLen,
					255 );
			title[titleLen] = '\0';
			summ[summLen] = '\0';
			anchor[anchorLen] = '\0';
			// print title and summary
			sb.safePrintf("<b>Title:</b> %s<br>"
					"<b>Summary:</b> %s<br>",
					title, summ);
			if (anchorLen > 0)
				sb.safePrintf("<b>Anchor:</b> %s<br>",
						anchor);
			sb.safePrintf("<br>");
		}
		sb.safePrintf("<b>Filenum:</b> %li<br>",
				st->m_catRec.m_filenum);
		// print indirect catids
		if (st->m_catRec.m_numIndCatids > 0) {
			sb.safePrintf("<hr><b>Indirect Catids [%li]:"
					"</b><br>\n",
					st->m_catRec.m_numIndCatids );
			for (long i = 0;
				  i < st->m_catRec.m_numIndCatids; i++) {
				sb.safePrintf("%lu<br>",
					st->m_catRec.m_indCatids[i]);
			}
		}
		sb.safePrintf("</td></tr>");
	}
	// end it
	sb.safePrintf ( "</center></td></tr></table>" );
	// print submit button
	sb.safePrintf ( "<br><center>"
			"<input type=submit value=\"Submit\" border=0>"
			"</form></center>" );

	// print the final tail
	//p += g_httpServer.printTail ( p , pend - p );
	// clear g_errno, if any, so our reply send goes through
	g_errno = 0;
	// extract the socket
	TcpSocket *s = st->m_socket;
	// clear the state
	mdelete ( st, sizeof(StateCatdb), "PageCatdb" );
	delete st;
	// . send this page
	// . encapsulates in html header and tail
	// . make a Mime
	return g_httpServer.sendDynamicPage(s , sb.getBufStart(), sb.length());
}

Ejemplo n.º 7

Mostrar archivo

Archivo: Msg3a.cpp Proyecto: rdhananjaya/open-source-search-engine

bool Msg3a::gotAllSplitReplies ( ) {

    // if any of the split requests had an error, give up and set m_errno
    // but don't set if for non critical errors like query truncation
    if ( m_errno ) {
        g_errno = m_errno;
        return true;
    }

    // also reset the finalbuf and the oldNumTopDocIds
    if ( m_finalBuf ) {
        mfree ( m_finalBuf, m_finalBufSize, "Msg3aF" );
        m_finalBuf     = NULL;
        m_finalBufSize = 0;
    }

    // update our estimated total hits
    m_numTotalEstimatedHits = 0;

    for ( long i = 0; i < m_numHosts ; i++ ) {
        // get that host that gave us the reply
        //Host *h = g_hostdb.getHost(i);
        // . get the reply from multicast
        // . multicast should have destroyed all slots, but saved reply
        // . we are responsible for freeing the reply
        // . we need to call this even if g_errno or m_errno is
        //   set so we can free the replies in Msg3a::reset()
        // . if we don't call getBestReply() on it multicast should
        //   free it, because Multicast::m_ownReadBuf is still true
        Multicast *m = &m_mcast[i];
        bool freeit = false;
        long  replySize = 0;
        long  replyMaxSize;
        char *rbuf;
        Msg39Reply *mr;
        // . only get it if the reply not already full
        // . if reply already processed, skip
        // . perhaps it had no more docids to give us or all termlists
        //   were exhausted on its disk and this is a re-call
        // . we have to re-process it for count m_numTotalEstHits, etc.
        rbuf = m->getBestReply ( &replySize    ,
                                 &replyMaxSize ,
                                 &freeit       ,
                                 true          ); //stealIt?
        // cast it
        mr = (Msg39Reply *)rbuf;
        // in case of mem leak, re-label from "mcast" to this so we
        // can determine where it came from, "Msg3a-GBR"
        relabel( rbuf, replyMaxSize , "Msg3a-GBR" );
        // . we must be able to free it... we must own it
        // . this is true if we should free it, but we should not have
        //   to free it since it is owned by the slot?
        if ( freeit ) {
            log(LOG_LOGIC,"query: msg3a: Steal failed.");
            char *xx = NULL;
            *xx=0;
        }
        // bad reply?
        if ( ! mr ) {
            log(LOG_LOGIC,"query: msg3a: Bad NULL reply.");
            m_reply       [i] = NULL;
            m_replyMaxSize[i] = 0;
            // it might have been timd out, just ignore it!!
            continue;
            // if size is 0 it can be Msg39 giving us an error!
            g_errno = EBADREPLYSIZE;
            m_errno = EBADREPLYSIZE;
            // all reply buffers should be freed on reset()
            return true;
        }
        // how did this happen?
        if ( replySize < 29 && ! mr->m_errno ) {
            // if size is 0 it can be Msg39 giving us an error!
            g_errno = EBADREPLYSIZE;
            m_errno = EBADREPLYSIZE;
            log(LOG_LOGIC,"query: msg3a: Bad reply size of %li.",
                replySize);
            // all reply buffers should be freed on reset()
            return true;
        }

        // can this be non-null? we shouldn't be overwriting one
        // without freeing it...
        if ( m_reply[i] )
            // note the mem leak now
            log("query: mem leaking a 0x39 reply");

        // cast it and set it
        m_reply       [i] = mr;
        m_replyMaxSize[i] = replyMaxSize;
        // deserialize it (just sets the ptr_ and size_ member vars)
        //mr->deserialize ( );
        deserializeMsg ( sizeof(Msg39Reply) ,
                         &mr->size_docIds,
                         &mr->size_clusterRecs,
                         &mr->ptr_docIds,
                         mr->m_buf );

        // sanity check
        if ( mr->m_nqt != m_q->getNumTerms() ) {
            g_errno = EBADREPLY;
            m_errno = EBADREPLY;
            log("query: msg3a: Split reply qterms=%li != %li.",
                (long)mr->m_nqt,(long)m_q->getNumTerms() );
            return true;
        }
        // return if split had an error, but not for a non-critical
        // error like query truncation
        if ( mr->m_errno && mr->m_errno != EQUERYTRUNCATED ) {
            g_errno = mr->m_errno;
            m_errno = mr->m_errno;
            log("query: msg3a: Split had error: %s",
                mstrerror(g_errno));
            return true;
        }
        // skip down here if reply was already set
        //skip:
        // add of the total hits from each split, this is how many
        // total results the lastest split is estimated to be able to
        // return
        // . THIS should now be exact since we read all termlists
        //   of posdb...
        m_numTotalEstimatedHits += mr->m_estimatedHits;

        // debug log stuff
        if ( ! m_debug ) continue;
        // cast these for printing out
        long long *docIds    = (long long *)mr->ptr_docIds;
        score_t   *scores    = (score_t   *)mr->ptr_scores;
        // print out every docid in this split reply
        for ( long j = 0; j < mr->m_numDocIds ; j++ ) {
            // print out score_t
            logf( LOG_DEBUG,
                  "query: msg3a: [%lu] %03li) "
                  "split=%li docId=%012llu domHash=0x%02lx "
                  "score=%lu"                     ,
                  (unsigned long)this                      ,
                  j                                        ,
                  i                                        ,
                  docIds [j] ,
                  (long)g_titledb.getDomHash8FromDocId(docIds[j]),
                  (long)scores[j] );
        }
    }

    // this seems to always return true!
    mergeLists ( );

    if ( ! m_r->m_useSeoResultsCache ) return true;

    // now cache the reply
    SafeBuf cr;
    long dataSize = 4 + 4 + 4 + m_numDocIds * (8+4+4);
    long need = sizeof(key_t) + 4 + dataSize;
    bool status = cr.reserve ( need );
    // sanity
    if ( ( m_ckey.n0 & 0x01 ) == 0x00 ) {
        char *xx=NULL;
        *xx=0;
    }
    // ignore errors
    g_errno = 0;
    // return on error with g_errno cleared if cache add failed
    if ( ! status ) return true;
    // add to buf otherwise
    cr.safeMemcpy ( &m_ckey , sizeof(key_t) );
    cr.safeMemcpy ( &dataSize , 4 );
    long now = getTimeGlobal();
    cr.pushLong ( now );
    cr.pushLong ( m_numDocIds );
    cr.pushLong ( m_numTotalEstimatedHits );//Results );
    long max = m_numDocIds;
    // then the docids
    for ( long i = 0 ; i < max ; i++ )
        cr.pushLongLong(m_docIds[i] );
    for ( long i = 0 ; i < max ; i++ )
        cr.pushFloat(m_scores[i]);
    for ( long i = 0 ; i < max ; i++ )
        cr.pushLong(getSiteHash26(i));
    // sanity
    if ( cr.length() != need ) {
        char *xx=NULL;
        *xx=0;
    }
    // make these
    key_t startKey;
    key_t endKey;
    startKey = m_ckey;
    // clear delbit
    startKey.n0 &= 0xfffffffffffffffeLL;
    // end key is us
    endKey = m_ckey;
    // that is the single record
    m_seoCacheList.set ( cr.getBufStart() ,
                         cr.length(),
                         cr.getBufStart(), // alloc
                         cr.getCapacity(), // alloc size
                         (char *)&startKey,
                         (char *)&endKey,
                         -1, // fixeddatasize
                         true, // owndata?
                         false,// use half keys?
                         sizeof(key_t) );
    // do not allow cr to free it, msg1 will
    cr.detachBuf();
    // note it
    //log("seopipe: storing ckey=%s q=%s"
    //    ,KEYSTR(&m_ckey,12)
    //    ,m_r->ptr_query
    //    );
    //log("msg1: sending niceness=%li",(long)m_r->m_niceness);
    // this will often block, but who cares!? it just sends a request off
    if ( ! m_msg1.addList ( &m_seoCacheList ,
                            RDB_SERPDB,//RDB_CACHEDB,
                            m_r->ptr_coll,
                            this, // state
                            gotSerpdbReplyWrapper, // callback
                            false, // forcelocal?
                            m_r->m_niceness ) ) {
        //log("blocked");
        return false;
    }

    // we can safely delete m_msg17... just return true
    return true;
}

Ejemplo n.º 8

Mostrar archivo

Archivo: Summary.cpp Proyecto: exename/open-source-search-engine

// . return the score of the highest-scoring window containing match #m
// . window is defined by the half-open interval [a,b) where a and b are 
//   word #'s in the Words array indicated by match #m
// . return -1 and set g_errno on error
int64_t Summary::getBestWindow ( Matches *matches, int32_t mm, int32_t *lasta,
                                 int32_t *besta, int32_t *bestb, char *gotIt,
                                 char *retired, int32_t maxExcerptLen ) {
	// get the window around match #mm
	Match *m = &matches->m_matches[mm];

	// what is the word # of match #mm?
	int32_t matchWordNum = m->m_wordNum;

	// what Words/Pos/Bits classes is this match in?
	Words *words = m->m_words;
	Section **sp = NULL;
	int32_t *pos = m->m_pos->m_pos;

	// use "m_swbits" not "m_bits", that is what Bits::setForSummary() uses
	const swbit_t *bb = m->m_bits->m_swbits;

	// shortcut
	if ( m->m_sections ) {
		sp = m->m_sections->m_sectionPtrs;
	}

	int32_t nw = words->getNumWords();
	int64_t *wids = words->getWordIds();
	nodeid_t *tids = words->getTagIds();

	// . sanity check
	// . this prevents a core i've seen
	if ( matchWordNum >= nw ) {
		log("summary: got overflow condition for q=%s",m_q->m_orig);

		// assume no best window
		*besta = -1;
		*bestb = -1;
		*lasta = matchWordNum;
		return 0;
	}

	// . we NULLify the section ptrs if we already used the word in another summary.
	int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_IN_TITLE;
	if ( (bb[matchWordNum] & D_USED) || ( sp && (sp[matchWordNum]->m_flags & badFlags) ) ) {
		// assume no best window
		*besta = -1;
		*bestb = -1;
		*lasta = matchWordNum;
		return 0;
	}

	// . "a" is the left fence post of the window (it is a word # in Words)
	// . go to the left as far as we can 
	// . thus we decrement "a"
	int32_t a = matchWordNum;

	// "posa" is the character position of the END of word #a
	int32_t posa = pos[a+1];
	int32_t firstFrag = -1;
	bool startOnQuote = false;
	bool goodStart = false;
	int32_t wordCount = 0;

	// . decrease "a" as int32_t as we stay within maxNumCharsPerLine
	// . avoid duplicating windows by using "lasta", the last "a" of the
	//   previous call to getBestWindow(). This can happen if our last
	//   central query term was close to this one.
	for ( ; a > 0 && posa - pos[a-1] < maxExcerptLen && a > *lasta; a-- ) {
		// . don't include any "dead zone", 
		// . dead zones have already been used for the summary, and
		//   we are getting a second/third/... excerpt here now then
		// stop if its the start of a sentence, too
		// stop before title word
		if ( (bb[a-1] & D_USED) || (bb[a] & D_STARTS_SENTENCE) || ( bb[a-1] & D_IN_TITLE )) {
			goodStart = true;
			break;
		}

		// don't go beyond an LI, TR, P tag
		if ( tids && ( tids[a-1] == TAG_LI ||
		               tids[a-1] == TAG_TR ||
		               tids[a-1] == TAG_P  ||
		               tids[a-1] == TAG_DIV ) ) {
			goodStart = true;
			break;
		}

		// stop if its the start of a quoted sentence
		if ( a+1<nw && (bb[a+1] & D_IN_QUOTES) && 
		     words->getWord(a)[0] == '\"' ){
			startOnQuote = true;
			goodStart    = true;
			break;
		}

		// find out the first instance of a fragment (comma, etc)
		// watch out! because frag also means 's' in there's
		if ( ( bb[a] & D_STARTS_FRAG ) && !(bb[a-1] & D_IS_STRONG_CONNECTOR) && firstFrag == -1 ) {
			firstFrag = a;
		}

		if ( wids[a] ) {
			wordCount++;
		}
	}

	// if didn't find a good start, then start at the start of the frag
	if ( !goodStart && firstFrag != -1 ) {
		a = firstFrag;
	}

	// don't let punct or tag word start a line, unless a quote
	if ( a < matchWordNum && !wids[a] && words->getWord(a)[0] != '\"' ){
		while ( a < matchWordNum && !wids[a] ) a++;
		
		// do not break right after a "strong connector", like 
		// apostrophe
		while ( a < matchWordNum && a > 0 && 
			( bb[a-1] & D_IS_STRONG_CONNECTOR ) )
			a++;
		
		// don't let punct or tag word start a line
		while ( a < matchWordNum && !wids[a] ) a++;
	}

	// remember, b is not included in the summary, the summary is [a,b-1]
	// remember to include all words in a matched phrase
	int32_t b = matchWordNum + m->m_numWords ;
	int32_t endQuoteWordNum = -1;
	int32_t numTagsCrossed = 0;

	for ( ; b <= nw; b++ ) {
		if ( b == nw ) {
			break;
		}

		if ( pos[b+1] - pos[a] >= maxExcerptLen ) {
			break;
		}
		
		if ( startOnQuote && words->getWord(b)[0] == '\"' ) {
			endQuoteWordNum = b;
		}

		// don't include any dead zone, those are already-used samples
		if ( bb[b] & D_USED ) {
			break;
		}

		// stop on a title word
		if ( bb[b] & D_IN_TITLE ) {
			break;
		}

		if ( wids[b] ) {
			wordCount++;
		}

		// don't go beyond an LI or TR backtag
		if ( tids && ( tids[b] == (BACKBIT|TAG_LI) ||
		               tids[b] == (BACKBIT|TAG_TR) ) ) {
			numTagsCrossed++;

			// try to have atleast 10 words in the summary
			if ( wordCount > 10 ) {
				break;
			}
		}

		// go beyond a P or DIV backtag in case the earlier char is a
		// ':'. This came from a special case for wikipedia pages 
		// eg. http://en.wikipedia.org/wiki/Flyover
		if ( tids && ( tids[b] == (BACKBIT|TAG_P)  ||
		               tids[b] == (BACKBIT|TAG_DIV) )) {
			numTagsCrossed++;

			// try to have atleast 10 words in the summary
			if ( wordCount > 10 && words->getWord(b-1)[0] != ':' ) {
				break;
			}
		}
	}

	// don't end on a lot of punct words
	if ( b > matchWordNum && !wids[b-1]){
		// remove more than one punct words. if we're ending on a quote
		// keep it
		while ( b > matchWordNum && !wids[b-2] && endQuoteWordNum != -1 && b > endQuoteWordNum ) {
			b--;
		}
		
		// do not break right after a "strong connector", like apostrophe
		while ( b > matchWordNum && (bb[b-2] & D_IS_STRONG_CONNECTOR) ) {
			b--;
		}
	}

	Match *ms = matches->m_matches;

	// make m_matches.m_matches[mi] the first match in our [a,b) window
	int32_t mi ;

	// . the match at the center of the window is match #"mm", so that
	//   matches->m_matches[mm] is the Match class
	// . set "mi" to it and back up "mi" as int32_t as >= a
	for ( mi = mm ; mi > 0 && ms[mi-1].m_wordNum >=a ; mi-- )
		;

	// now get the score of this excerpt. Also mark all the represented 
	// query words. Mark the represented query words in the array that
	// comes to us. also mark how many times the same word is repeated in
	// this summary.
	int64_t score = 0LL;

	// is a url contained in the summary, that looks bad! punish!
	bool hasUrl = false;

	// the word count we did above was just an approximate. count it right
	wordCount = 0;

	// for debug
	//char buf[5000];
	//char *xp = buf;
	SafeBuf xp;

	// wtf?
	if ( b > nw ) {
		b = nw;
	}

	// first score from the starting match down to a, including match
	for ( int32_t i = a ; i < b ; i++ ) {
		// debug print out
		if ( g_conf.m_logDebugSummary ) {
			int32_t len = words->getWordLen(i);
			char cs;
			for (int32_t k=0;k<len; k+=cs ) {
				const char *c = words->getWord(i)+k;
				cs = getUtf8CharSize(c);
				if ( is_binary_utf8 ( c ) ) {
					continue;
				}
				xp.safeMemcpy ( c , cs );
				xp.nullTerm();
			}
		}

		// skip if in bad section, marquee, select, script, style
		if ( sp && (sp[i]->m_flags & badFlags) ) {
			continue;
		}

		// don't count just numeric words
		if ( words->isNum(i) ) {
			continue;
		}

		// check if there is a url. best way to check for '://'
		if ( wids && !wids[i] ) {
			const char *wrd = words->getWord(i);
			int32_t  wrdLen = words->getWordLen(i);
			if ( wrdLen == 3 && wrd[0] == ':' && wrd[1] == '/' &&  wrd[2] == '/' ) {
				hasUrl = true;
			}
		}

		// skip if not wid
		if ( ! wids[i] ) {
			continue;
		}

		// just make every word 100 pts
		int32_t t = 100;

		// penalize it if in one of these sections
		if ( bb[i] & ( D_IN_PARENS | D_IN_SUP | D_IN_LIST ) ) {
			t /= 2;
		}

		// boost it if in bold or italics
		if ( bb[i] & D_IN_BOLDORITALICS ) {
			t *= 2;
		}

		// add the score for this word
		score += t;

		// print the score, "t"
		if ( g_conf.m_logDebugSummary ) {
			xp.safePrintf("(%" PRId32")",t);
		}

		// count the alpha words we got
		wordCount++;

		// if no matches left, skip
		if ( mi >= matches->m_numMatches ) {
			continue;
		}

		// get the match
		Match *next = &ms[mi];

		// skip if not a match
		if ( i != next->m_wordNum ) {
			continue;
		}

		// must be a match in this class
		if ( next->m_words != words ) {
			continue;
		}

		// advance it
		mi++;

		// which query word # does it match
		int32_t qwn = next->m_qwordNum;

		if ( qwn < 0 || qwn >= m_q->m_numWords ){g_process.shutdownAbort(true);}

		// undo old score
		score -= t;

		// add 100000 per match
		t = 100000;

		// weight based on tf, goes from 0.1 to 1.0
		t = (int32_t)((float)t * m_wordWeights [ qwn ]);

		// if it is a query stop word, make it 10000 pts
		if ( m_q->m_qwords[qwn].m_isQueryStopWord ) {
			t = 0;//10000;
		}

		// penalize it if in one of these sections
		if ( bb[i] & ( D_IN_PARENS | D_IN_SUP | D_IN_LIST ) ) {
			t /= 2;
		}

		if ( gotIt[qwn] > 0 ) {
			// have we matched it in this [a,b) already?
			if ( gotIt[qwn] == 1 ) {
				t /= 15;
			} else {
				// if we have more than 2 matches in the same window,
				// it may not give a good summary. give a heavy penalty
				t -= 200000;
			}
		} else if ( retired [qwn] > 0 ) {
			// have we matched it already in a winning window?
			t /= 12;
		}

		// add it back
		score += t;

		if ( g_conf.m_logDebugSummary ) {
			xp.safePrintf ("[%" PRId32"]{qwn=%" PRId32",ww=%f}",t,qwn,
				       m_wordWeights[qwn]);
		}

		// inc the query word count for this window
		if ( gotIt[qwn] < 100 ) {
			gotIt[qwn]++;
		}
	}

	int32_t oldScore = score;
	
	// apply the bonus if it starts or a sentence
	// only apply if the score is positive and if the wordcount is decent
	if ( score > 0 && wordCount > 7 ){
		// a match can give us 10k to 100k pts based on the tf weights
		// so we don't want to overwhelm that too much, so let's make
		// this a 20k bonus if it starts a sentence
		if ( bb[a] & D_STARTS_SENTENCE ) {
			score += 8000;
		} else if ( bb[a] & D_STARTS_FRAG ) {
			// likewise, a fragment, like after a comma
			score += 4000;
		}

		// 1k if the match word is very close to the
		// start of a sentence, lets say 3 alphawords
		if ( matchWordNum - a < 7 ) {
			score += 1000;
		}
	}

	// a summary isn't really a summary if its less than 7 words.
	// reduce the score, but still give it a decent score.
	// minus 5M.
	if ( wordCount < 7 ) {
		score -= 20000;
	}

	// summaries that cross a lot of tags are usually bad, penalize them
	if ( numTagsCrossed > 1 ) {
		score -= (numTagsCrossed * 20000);
	}

	if ( hasUrl ) {
		score -= 8000;
	}

	// show it
	if ( g_conf.m_logDebugSummary ) {
		log(LOG_DEBUG, "sum: score=%08" PRId32" prescore=%08" PRId32" a=%05" PRId32" b=%05" PRId32" %s",
		     (int32_t)score,oldScore,(int32_t)a,(int32_t)b,
		     xp.getBufStart());
	}

	// set lasta, besta, bestb
	*lasta = a;
	*besta = a;
	*bestb = b;

	return score;
}

Ejemplo n.º 9

Mostrar archivo

Archivo: PageInject.cpp Proyecto: chushuai/open-source-search-engine

//
// . ENTRY POINT FOR IMPORTING TITLEDB RECS FROM ANOTHER CLUSTER
// . when user clicks 'begin' in import page we come here..
// . so when that parm changes in Parms.cpp we sense that and call
//   beginImport(CollectionRec *cr)
// . or on startup we call resumeImports to check each coll for 
//   an import in progress.
// . search for files named titledb*.dat
// . if none found just return
// . when msg7 inject competes it calls this
// . call this from sleep wrapper in Process.cpp
// . returns false if would block (outstanding injects), true otherwise
// . sets g_errno on error
bool ImportState::importLoop ( ) {

	CollectionRec *cr = g_collectiondb.getRec ( m_collnum );

	if ( ! cr || g_hostdb.m_hostId != 0 ) { 
		// if coll was deleted!
		log("import: collnum %li deleted while importing into",
		    (long)m_collnum);
		//if ( m_numOut > m_numIn ) return true;
		// delete the entire import state i guess
		// what happens if we have a msg7 reply come back in?
		// it should see the collrec is NULL and just fail.
		mdelete ( this, sizeof(ImportState) , "impstate");
		delete (this);
		return true;
	}

 INJECTLOOP:

	// stop if waiting on outstanding injects
	long long out = m_numOut - m_numIn;
	if ( out >= cr->m_numImportInjects ) {
		g_errno = 0;
		return false;
	}
	

	if ( ! cr->m_importEnabled ) {
		// wait for all to return
		if ( out > 0 ) return false;
		// then delete it
		log("import: collnum %li import loop disabled",
		    (long)m_collnum);
		mdelete ( this, sizeof(ImportState) , "impstate");
		delete (this);
		return true;
	}




	// scan each titledb file scanning titledb0001.dat first,
	// titledb0003.dat second etc.

	//long long offset = -1;
	// . when offset is too big for current m_bigFile file then
	//   we go to the next and set offset to 0.
	// . sets m_bf and m_fileOffset
	if ( ! setCurrentTitleFileAndOffset ( ) ) {//cr  , -1 );
		log("import: import: no files to read");
		//goto INJECTLOOP;
		return true;
	}



	// this is -1 if none remain!
	if ( m_fileOffset == -1 ) {
		log("import: import fileoffset is -1. done.");
		return true;
	}

	long long saved = m_fileOffset;

	//Msg7 *msg7;
	//GigablastRequest *gr;
	//SafeBuf *sbuf = NULL;

	long need = 12;
	long dataSize = -1;
	//XmlDoc xd;
	key_t tkey;
	bool status;
	SafeBuf tmp;
	SafeBuf *sbuf = &tmp;
	long long docId;
	long shardNum;
	long key;
	Multicast *mcast;
	char *req;
	long reqSize;

	if ( m_fileOffset >= m_bfFileSize ) {
		log("inject: import: done processing file %li %s",
		    m_bfFileId,m_bf.getFilename());
		goto nextFile;
	}
	
	// read in title rec key and data size
	status = m_bf.read ( &tkey, sizeof(key_t) , m_fileOffset );
	
	//if ( n != 12 ) goto nextFile;
	if ( g_errno ) {
		log("inject: import: reading file error: %s. advancing "
		    "to next file",mstrerror(g_errno));
		goto nextFile;
	}

	m_fileOffset += 12;

	// if negative key, skip
	if ( (tkey.n0 & 0x01) == 0 ) {
		goto INJECTLOOP;
	}

	// if non-negative then read in size
	status = m_bf.read ( &dataSize , 4 , m_fileOffset );
	if ( g_errno ) {
		log("main: failed to read in title rec "
		    "file. %s. Skipping file %s",
		    mstrerror(g_errno),m_bf.getFilename());
		goto nextFile;
	}
	m_fileOffset += 4;
	need += 4;
	need += dataSize;
	need += 4; // collnum, first 4 bytes
	if ( dataSize < 0 || dataSize > 500000000 ) {
		log("main: could not scan in titledb rec of "
		    "corrupt dataSize of %li. BAILING ENTIRE "
		    "SCAN of file %s",dataSize,m_bf.getFilename());
		goto nextFile;
	}

	//gr = &msg7->m_gr;

	//XmlDoc *xd = getAvailXmlDoc();
	//msg7 = getAvailMsg7();
	mcast = getAvailMulticast();

	// if none, must have to wait for some to come back to us
	if ( ! mcast ) {
		// restore file offset
		//m_fileOffset = saved;
		// no, must have been a oom or something
		log("import: import no mcast available");
		return true;//false;
	}
	
	// this is for holding a compressed titlerec
	//sbuf = &mcast->m_sbuf;//&gr->m_sbuf;

	// point to start of buf
	sbuf->reset();

	// ensure we have enough room
	sbuf->reserve ( need );

	// collnum first 4 bytes
	sbuf->pushLong( (long)m_collnum );

	// store title key
	sbuf->safeMemcpy ( &tkey , sizeof(key_t) );

	// then datasize if any. neg rec will have -1 datasize
	if ( dataSize >= 0 ) 
		sbuf->pushLong ( dataSize );

	// then read data rec itself into it, compressed titlerec part
	if ( dataSize > 0 ) {
		// read in the titlerec after the key/datasize
		status = m_bf.read ( sbuf->getBuf() ,
				     dataSize ,
				     m_fileOffset );
		if ( g_errno ) { // n != dataSize ) {
			log("main: failed to read in title rec "
			    "file. %s. Skipping file %s",
			    mstrerror(g_errno),m_bf.getFilename());
			// essentially free up this msg7 now
			//msg7->m_inUse = false;
			//msg7->reset();
			goto nextFile;
		}
		// advance
		m_fileOffset += dataSize;
		// it's good, count it
		sbuf->m_length += dataSize;
	}

	// set xmldoc from the title rec
	//xd->set ( sbuf.getBufStart() );
	//xd->m_masterState = NULL;
	//xd->m_masterCallback ( titledbInjectLoop );

	// we use this so we know where the doc we are injecting
	// was in the foregien titledb file. so we can update our bookmark
	// code.
	mcast->m_hackFileOff = saved;//m_fileOffset;
	mcast->m_hackFileId  = m_bfFileId;

	//
	// inject a title rec buf this time, we are doing an import
	// FROM A TITLEDB FILE!!!
	//
	//gr->m_titleRecBuf = &sbuf;

	// break it down into gw
	// xd.set2 ( sbuf.getBufStart() ,
	// 	  sbuf.length() , // max size
	// 	  cr->m_coll, // use our coll
	// 	  NULL , // pbuf for page parser
	// 	  1 , // niceness
	// 	  NULL ); //sreq );

	// // note it
	// log("import: importing %s",xd.m_firstUrl.getUrl());

	// now we can set gr for the injection
	// TODO: inject the whole "sbuf" so we get sitenuminlinks etc
	// all exactly the same...
	// gr->m_url = xd.getFirstUrl()->getUrl();
	// gr->m_queryToScrape = NULL;
	// gr->m_contentDelim = 0;
	// gr->m_contentTypeStr = g_contentTypeStrings [xd.m_contentType];
	// gr->m_contentFile = NULL;
	// gr->m_content = xd.ptr_utf8Content;
	// gr->m_diffbotReply = NULL;
	// gr->m_injectLinks = false;
	// gr->m_spiderLinks = true;
	// gr->m_shortReply = false;
	// gr->m_newOnly = false;
	// gr->m_deleteUrl = false;
	// gr->m_recycle = true; // recycle content? or sitelinks?
	// gr->m_dedup = false;
	// gr->m_hasMime = false;
	// gr->m_doConsistencyTesting = false;
	// gr->m_getSections = false;
	// gr->m_gotSections = false;
	// gr->m_charset = xd.m_charset;
	// gr->m_hopCount = xd.m_hopCount;


	//
	// point to next doc in the titledb file
	//
	//m_fileOffset += need;

	// get docid from key
	docId = g_titledb.getDocIdFromKey ( &tkey );

	// get shard that holds the titlerec for it
	shardNum = g_hostdb.getShardNumFromDocId ( docId );

	// for selecting which host in the shard receives it
	key = (long)docId;


	m_numOut++;

	// then index it. master callback will be called
	//if ( ! xd->index() ) return false;

	// TODO: make this forward the request to an appropriate host!!
	// . gr->m_sbuf is set to the titlerec so this should handle that
	//   and use XmlDoc::set4() or whatever
	// if ( msg7->injectTitleRec ( msg7 , // state
	// 			    gotMsg7ReplyWrapper , // callback
	// 			    cr )) {
	// 	// it didn't block somehow...
	// 	msg7->m_inUse = false;
	// 	msg7->gotMsg7Reply();
	// }


	req = sbuf->getBufStart();
	reqSize = sbuf->length();

	if ( reqSize != need ) { char *xx=NULL;*xx=0 ; }

	// do not free it, let multicast free it after sending it
	sbuf->detachBuf();


	if ( ! mcast->send ( req ,
			     reqSize ,
			     0x07 ,
			     true , // ownmsg?
			     shardNum,
			     false, // send to whole shard?
			     key , // for selecting host in shard
			     mcast , // state
			     NULL , // state2
			     gotMulticastReplyWrapper ,
			     999999 ) ) { // total timeout in seconds
		log("import: import mcast had error: %s",mstrerror(g_errno));
		m_numIn++;
	}

	goto INJECTLOOP;

 nextFile:
	// invalidate this flag
	//m_offIsValid = false;
	// . and call this function. we add one to m_bfFileId so we
	//   do not re-get the file we just injected.
	// . sets m_bf and m_fileOffset
	// . returns false if nothing to read
	if ( ! setCurrentTitleFileAndOffset ( ) ) { //cr , m_bfFileId+1 );
		log("import: import: no files left to read");
		//goto INJECTLOOP;
		return true;
	}

	// if it returns NULL we are done!
	log("main: titledb injection loop completed. waiting for "
	    "outstanding injects to return.");
		
	if ( m_numOut > m_numIn )
		return false;

	log("main: all injects have returned. DONE.");

	// dummy return
	return true;
}

Ejemplo n.º 10

Mostrar archivo

Archivo: PageThesaurus.cpp Proyecto: BILObilo/open-source-search-engine

bool sendPageThesaurus( TcpSocket *s, HttpRequest *r ) {
	SafeBuf p;
	char getBuf[64]; // holds extra values for GET method
	char formBuf[256]; // holds extra values for forms
	snprintf(getBuf, 64, "c=%s", 
		 r->getString("c", 0, ""));
	snprintf(formBuf, 256, 
		 "<input type=hidden name=\"c\" value=\"%s\">",
		 //"<input type=hidden name=\"pwd\" value=\"%s\">",
		 r->getString("c", 0, ""));
	g_pages.printAdminTop( &p, s, r);
	
	if (r->getLong("cancel", 0) != 0) {
		g_thesaurus.cancelRebuild();
		p.safePrintf("<br><br>\n");
		p.safePrintf(
		  "<center><b><font color=#ff0000>"
		  "rebuild canceled"
		  "</font></b></center>");
	}

	if (r->getLong("rebuild", 0) != 0) {
		bool full = r->getLong("full", 0);
		p.safePrintf("<br><br>\n");
		if (g_thesaurus.rebuild(0, full)) {
			p.safePrintf(
			  "<center><b><font color=#ff0000>"
			  "error starting rebuild, check log for details"
			  "</font></b></center>");
		} else {
			p.safePrintf(
			  "<center><b><font color=#ff0000>"
			  "rebuild started"
			  "</font></b></center>");
		}
	}
	
	if (r->getLong("rebuildaff", 0) != 0) {
		bool full = r->getLong("full", 0);
		p.safePrintf("<br><br>\n");
		if (g_thesaurus.rebuildAffinity(0, full)) {
			p.safePrintf(
			  "<center><b><font color=#ff0000>"
			  "error starting rebuild, check log for details"
			  "</font></b></center>");
		} else {
			p.safePrintf(
			  "<center><b><font color=#ff0000>"
			  "rebuild started"
			  "</font></b></center>");
		}
	}

	if (r->getLong("distribute", 0) != 0) {
		char cmd[1024];
		p.safePrintf("<br><br>\n");
		if (g_thesaurus.m_affinityState) {
			p.safePrintf(
			  "<center><b><font color=#ff0000>"
			  "cannot distribute during rebuild"
			  "</font></b></center>");
		} else {
			for ( long i = 0; i < g_hostdb.getNumHosts() ; i++ ) {
				Host *h = g_hostdb.getHost(i);
				snprintf(cmd, 512,
					"rcp -r "
					"./dict/thesaurus.* "
					"%s:%s/dict/ &",
					iptoa(h->m_ip),
					h->m_dir);
				log(LOG_INFO, "admin: %s", cmd);
				system( cmd );
			}
			p.safePrintf(
			  "<center><b><font color=#ff0000>"
			  "data distributed"
			  "</font></b></center>");
		}	
	}

	if (r->getLong("reload", 0) != 0) {
		p.safePrintf("<br><br>\n");
		if (r->getLong("cast", 0) != 0) {
			p.safePrintf(
			  "<center><b><font color=#ff0000>"
			  "reload command broadcast"
			  "</font></b></center>");
		} else if (g_thesaurus.init()) {
			p.safePrintf(
			  "<center><b><font color=#ff0000>"
			  "thesaurus data reloaded"
			  "</font></b></center>");
		} else {
			p.safePrintf(
			  "<center><b><font color=#ff0000>"
			  "error reloading thesaurus data"
			  "</font></b></center>");
		}
	}

	long manualAddLen = 0;
	char *manualAdd = NULL;
	SafeBuf manualAddBuf;
	if ((manualAdd = r->getString("manualadd", &manualAddLen))) {
		trimWhite(manualAdd);
		manualAddLen = gbstrlen(manualAdd);
		File manualFile;
		manualFile.set(g_hostdb.m_dir, "dict/thesaurus-manual.txt");
		if (manualFile.open(O_WRONLY | O_CREAT | O_TRUNC) &&
			(manualFile.write(manualAdd, manualAddLen, 0) ==
			 manualAddLen)) {
			char newl = '\n'; // for write()
			if (manualAdd[manualAddLen-1] != '\n')
				manualFile.write(&newl, 1, manualAddLen);
			p.safePrintf(
			  "<center><b><font color=#ff0000>"
			  "updated manual add file sucessfully"
			  "</font></b></center>");
		} else {
			p.safePrintf(
			  "<center><b><font color=#ff0000>"
			  "error writing manual add file"
			  "</font></b></center>");
		}
	} else {
		char ff[PATH_MAX];
		snprintf(ff, PATH_MAX, "%sdict/thesaurus-manual.txt",
			g_hostdb.m_dir);
		if (manualAddBuf.fillFromFile(ff)) {
			if (*(manualAddBuf.getBuf()-1) != '\n')
				manualAddBuf.pushChar('\n');
			manualAdd = manualAddBuf.getBufStart();
			manualAddLen = manualAddBuf.length();
		}
	}

	long affinityAddLen = 0;
	char *affinityAdd = NULL;
	SafeBuf affinityAddBuf;
	if ((affinityAdd = r->getString("affinityadd", &affinityAddLen))) {
		trimWhite(affinityAdd);
		affinityAddLen = gbstrlen(affinityAdd);
		File affinityFile;
		affinityFile.set(g_hostdb.m_dir, 
			"dict/thesaurus-affinity.txt");
		if (affinityFile.open(O_WRONLY | O_CREAT | O_TRUNC) &&
			(affinityFile.write(affinityAdd, affinityAddLen, 0) ==
			 affinityAddLen)) {
			char newl = '\n'; // for write()
			if (affinityAdd[affinityAddLen-1] != '\n')
				affinityFile.write(&newl, 1, affinityAddLen);
			p.safePrintf(
			  "<center><b><font color=#ff0000>"
			  "updated affinity add file sucessfully"
			  "</font></b></center>");
		} else {
			p.safePrintf(
			  "<center><b><font color=#ff0000>"
			  "error writing affinity add file"
			  "</font></b></center>");
		}
	} else {
		char ff[PATH_MAX];
		snprintf(ff, PATH_MAX, "%sdict/thesaurus-affinity.txt",
			g_hostdb.m_dir);
		if (affinityAddBuf.fillFromFile(ff)) {
			if (*(affinityAddBuf.getBuf()-1) != '\n')
				affinityAddBuf.pushChar('\n');
			affinityAdd = affinityAddBuf.getBufStart();
			affinityAddLen = affinityAddBuf.length();
		}
	}
	

	char *syn = r->getString("synonym");
	long len = 0;
	if (syn) len = gbstrlen(syn);

	if (len) {
		SynonymInfo info;
		bool r = g_thesaurus.getAllInfo(syn, &info, len, SYNBIT_ALL);
		p.safePrintf("<br><br>\n");
		p.safePrintf ( 
		  "<table cellpadding=4 width=100%% bgcolor=#%s border=1>"
		  "<tr>"
		  "<td colspan=2 bgcolor=#%s>"
		  "<center><b>Synonym List (%ld)</b></center>"
		  "</td>"
		  "</tr>\n",
		  LIGHT_BLUE, DARK_BLUE, info.m_numSyns);
		if (r) {
			p.safePrintf("<tr>"
			  "<td align=right><tt>%s</tt></td>"
			  "<td align=left>"
			  "<tt>1.000/%08lX (1.000/%08lX)</tt>"
			  "</td>"
			  "</tr>\n", syn, MAX_AFFINITY, MAX_AFFINITY);
			for (long i = 0; i < info.m_numSyns; i++) {
				// get the reverse affinity as well
				long aff = g_thesaurus.getAffinity(
					info.m_syn[i], syn,
					info.m_len[i], len);
				p.safePrintf( 
				  "<tr>"
				  "<td width=40%% align=right>"
				  "<tt>");
				p.safeMemcpy(info.m_syn[i], info.m_len[i]);
				p.safePrintf("</tt>"
				  "</td>"
				  "<td width=60%% align=left>"
				  "<tt>");
				if (info.m_affinity[i] >= 0) {
					p.safePrintf("%0.3f/%08lX ",
				  	  (float)info.m_affinity[i] 
					  	/ MAX_AFFINITY,
					  info.m_affinity[i]);
				} else {
					p.safePrintf("u ");
				}
				if (aff >= 0) {
					p.safePrintf("(%0.3f/%08lX) ",
					  (float)aff / MAX_AFFINITY, 
					  aff);
				} else {
					p.safePrintf("(u) ");
				}
				p.safePrintf("(%ld) (%ld) (%ld) (%ld) "
					     "(%lld) (%lld)",
				  (long)info.m_type[i], (long)info.m_sort[i],
				  info.m_firstId[i], info.m_lastId[i],
				  info.m_leftSynHash[i], 
				  info.m_rightSynHash[i]);
				for (int j = info.m_firstId[i]; 
					j <= info.m_lastId[i];
					j++) {
					p.safePrintf(" (%lld)",
						info.m_termId[j]);
				}
				p.safePrintf(
				  "</tt>"
				  "</td>"
				  "</tr>\n");
			}
			p.safePrintf("</table>");
		} else {
			p.safePrintf("<tr>"
			  "<td align=center><font color=#FF0000>"
			  "synonym not found: %s"
			  "</font></td>"
			  "</tr>\n",
			  syn);
		}
	}

	p.safePrintf ( "<br><br>\n" );

	p.safePrintf ( 
		  "<table cellpadding=4 width=100%% bgcolor=#%s border=1>"
		  "<tr>"
		  "<td colspan=2 bgcolor=#%s>"
		  "<center><b>Thesaurus Controls"
		  "</b></center></td>"
		  "</tr>\n",
		  LIGHT_BLUE, DARK_BLUE);
	
	p.safePrintf (
		  "<tr>"
		  "<td width=37%%><b>rebuild all data</b><br>"
		  "<font size=1>"
		  "rebuilds synonyms and then begins the rebuild process for "
		  "affinity data; this should only be run on one host, as the "
		  "data is copied when the process is finished; full rebuild "
		  "does not use existing affinity data"
		  "</font>"
		  "</td>"
		  "<td width=12%% bgcolor=#0000ff>"
		  "<center><b><a href=\"/master/thesaurus?rebuild=1&%s\">"
		  "rebuild all data</a> <a href=\"/master/thesaurus?"
		  "rebuild=1&full=1&%s\">(full)</a></b></center>"
		  "</td>"
		  "</tr>\n", getBuf, getBuf);

	p.safePrintf (
		  "<tr>"
		  "<td width=37%%><b>distribute data</b><br>"
		  "<font size=1>"
		  "distributes all thesaurus data to all hosts, this is "
		  "normally done automatically but if there was a problem "
		  "with the copy, this lets you do it manually"
		  "</font>"
		  "</td>"
		  "<td width=12%% bgcolor=#0000ff>"
		  "<center><b><a href=\"/master/thesaurus?distribute=1&%s\">"
		  "distribute data</a></b></center>"
		  "</td>"
		  "</tr>\n", getBuf);

	p.safePrintf (
		  "<tr>"
		  "<td width=37%%><b>reload data</b><br>"
		  "<font size=1>"
		  "reloads the synonyms and affinity table on this host only"
		  "</font>"
		  "</td>"
		  "<td width=12%% bgcolor=#0000ff>"
		  "<center><b>"
		  "<a href=\"/master/thesaurus?reload=1&cast=0&%s\">"
		  "reload data</a></b></center>"
		  "</td>"
		  "</tr>\n", getBuf);

	p.safePrintf (
		  "<tr>"
		  "<td width=37%%><b>reload data (all hosts)</b><br>"
		  "<font size=1>"
		  "reloads the synonyms and affinity table on all hosts"
		  "</font>"
		  "</td>"
		  "<td width=12%% bgcolor=#0000ff>"
		  "<center><b>"
		  "<a href=\"/master/thesaurus?reload=1&cast=1&%s\">"
		  "reload data (all hosts)</a></b></center>"
		  "</td>"
		  "</tr>\n", getBuf);

	p.safePrintf (
		  "<tr>"
		  "<td width=37%%><b>list synonyms</b><br>"
		  "<font size=1>"
		  "enter a word here to list all synonym entries and their "
		  "affinities"
		  "</font>"
		  "</td>"
		  "<td width=12%%>"
		  "<form action=\"/master/thesaurus>\">"
		  "<input type=text name=synonym size=20>"
		  "<input type=submit value=Submit>"
		  "%s"
		  "</form></td>"
		  "</tr>\n", formBuf);
		
	p.safePrintf (
		  "<tr>"
		  "<td colspan=2 bgcolor=#%s>"
		  "<center><b>Affinity Controls"
		  "</b></center></td>"
		  "</tr>\n",
		  DARK_BLUE);

	p.safePrintf (
		  "<tr>"
		  "<td width=37%%><b>cancel running rebuild</b><br>"
		  "<font size=1>"
		  "cancels the rebuild and throws all intermediate data away"
		  "</font>"
		  "</td>"
		  "<td width=12%% bgcolor=#0000ff>"
		  "<center><b><a href=\"/master/thesaurus?cancel=1&%s\">"
		  "cancel running rebuild</a></b></center>"
		  "</td>"
		  "</tr>\n", getBuf);
	
	p.safePrintf (
		  "<tr>"
		  "<td width=37%%><b>rebuild affinity only</b><br>"
		  "<font size=1>"
		  "begins the rebuild process for affinity data, has no "
		  "effect if a rebuild is already in progress; full rebuild "
		  "does not reuse existing affinity data"
		  "</font>"
		  "</td>"
		  "<td width=12%% bgcolor=#0000ff>"
		  "<center><b><a href=\"/master/thesaurus?rebuildaff=1&%s\">"
		  "rebuild affinity</a> <a href=\"/master/thesaurus?"
		  "rebuildaff=1&full=1&%s\">(full)</a></b></center>"
		  "</td>"
		  "</tr>\n", getBuf, getBuf);
	
	p.safePrintf (
		  "<tr>"
		  "<td colspan=2 bgcolor=#%s>"
		  "<center><b>Manual File Controls"
		  "</b></td>"
		  "</tr>\n",
		  DARK_BLUE);

	p.safePrintf (
		  "<tr>"
		  "<td align=center colspan=2>");
	
	p.safePrintf(
		  "<b>manually added pairs</b><br>\n"
		  "<font size=1>place word pairs here that should be linked "
		  "as synonyms, one pair per line, seperated by a pipe '|' "
		  "character, optionally followed by another pipe and a type "
		  "designation; any badly formatted lines will be silently "
		  "ignored</font><br>\n"
		  "<form action=\"/master/thesaurus\" method=post>"
		  "<textarea name=\"manualadd\" rows=20 cols=80>");

	if (manualAdd && manualAddLen) {
		p.htmlEncode(manualAdd, manualAddLen, true);
	}
	
	p.safePrintf (
		  "</textarea><br>"
		  "<input type=submit value=Submit>"
		  "<input type=reset value=Reset>"
		  "%s"
		  "</form></td>"
		  "</tr>\n",
		  formBuf);

	
	p.safePrintf (
		  "<tr>"
		  "<td align=center colspan=2>"
		  "<b>affinity value overrides</b><br>\n"
		  "<font size=1>place word/phrase pairs here that should have "
		  "there affinity values overridden, format is "
		  "\"word1|word2|value\", where value is a floating point, "
		  "integer (either decimal or hex), or the word \"max\"; "
		  "any badly formatted lines will be silently ignored; note "
		  "that these pairs will only work if the thesaurus otherwise "
		  "has an entry for them, so add them to the manual add file "
		  "above if need be</font><br>\n"
		  "<form action=\"/master/thesaurus\" method=post>"
		  "<textarea name=\"affinityadd\" rows=20 cols=80>");

	if (affinityAdd && affinityAddLen) {
		p.htmlEncode(affinityAdd, affinityAddLen, true);
	}
	
	p.safePrintf (
		  "</textarea><br>"
		  "<input type=submit value=Submit>"
		  "<input type=reset value=Reset>"
		  "%s"
		  "</form></td>"
		  "</tr>\n", 
		  formBuf);


	p.safePrintf ( "</table>\n" );
	p.safePrintf ( "<br><br>\n" );

	p.safePrintf (
		  "<table cellpadding=4 width=100%% bgcolor=#%s border=1>"
		  "<tr>"
		  "<td colspan=2 bgcolor=#%s>"
		  "<center><b>Affinity Builder Status"
		  "</b></td>"
		  "</tr>\n",
		  LIGHT_BLUE, DARK_BLUE);

	long long a, b, c, d, e, f, g, h, i, j, k;
	StateAffinity *aff = g_thesaurus.m_affinityState;
	if (!aff) {
		p.safePrintf (
		  "<tr><td colspan=2>"
		  "<center><b>Not running</b></center>"
		  "</td></tr>\n");
		a = b = c = d = e = f = g = h = i = j = k = 0;
	} else {
		a = aff->m_oldTable->getNumSlotsUsed();
		b = aff->m_oldTable->getNumSlotsUsed() - aff->m_n;
		c = aff->m_n;
		d = (gettimeofdayInMilliseconds() - aff->m_time) / 1000;
		if (!d || !(c / d)) { 
			e = 0;
		} else {
			e = b / (c / d);
		}
		f = aff->m_sent;
		g = aff->m_recv;
		h = aff->m_errors;
		i = aff->m_old;
		j = aff->m_cache;
		k = aff->m_hitsTable.getNumSlotsUsed();
	}
	p.safePrintf (
		  "<tr><td><b># of total pairs</b></td>"
		  "<td>%lli</td></tr>\n"
		  "<tr><td><b># of pairs remaining</b></td>"
		  "<td>%lli</td></tr>\n"
		  "<tr><td><b># of pairs processed</b></td>"
		  "<td>%lli</td></tr>\n"
		  "<tr><td><b>elapsed time in seconds</b></td>"
		  "<td>%lli</td></tr>\n"
		  "<tr><td><b>estimated remaining time in seconds</b></td>"
		  "<td>%lli</td></tr>\n"
		  "<tr><td><b># of requests sent</b></td>"
		  "<td>%lli</td></tr>\n"
		  "<tr><td><b># of requests received</b></td>"
		  "<td>%lli</td></tr>\n"
		  "<tr><td><b># of request errors</b></td>"
		  "<td>%lli</td></tr>\n"
		  "<tr><td><b># of old values reused</b></td>"
		  "<td>%lli</td></tr>\n"
		  "<tr><td><b># of cache hits</b></td>"
		  "<td>%lli</td></tr>\n"
		  "<tr><td><b>cache size</b></td>"
		  "<td>%lli</td></tr>\n",
		  a, b, c, d, e, f, g, h, i, j, k);
	p.safePrintf ( "</table>\n" );

	return g_httpServer.sendDynamicPage ( s, p.getBufStart(), p.length() );
}

Ejemplo n.º 11

Mostrar archivo

Archivo: Test.cpp Proyecto: BKJackson/open-source-search-engine

// this should be called when all docs have finished spidering
void Test::stopIt ( ) {

	// sanity
	if ( m_isAdding ) { char *xx=NULL;*xx=0; }
	// flag that we are done
	m_isRunning = false;

	// print time
	log("test: took %lli ms to complete injections.",
	    gettimeofdayInMilliseconds() - m_testStartTime );

	// get this before setting testParserEnabled to false
	char *testDir = g_test.getTestDir();

	// turn this off now too
	g_conf.m_testParserEnabled = false;
	g_conf.m_testSpiderEnabled = false;



	// save all!
	bool disabled = g_threads.m_disabled;
	g_threads.disableThreads();
	// save it blocking style
	g_process.save();
	if ( ! disabled ) g_threads.enableThreads();

	// save ips.txt
	saveTestBuf ( testDir );

	log("test: test completed. making qa.html");

	//
	//
	// NOW MAKE THE qa.html FILE
	//
	//

	// only analyze up to last 7 runs
	long start = m_runId - 7;
	if ( start < 0 ) start = 0;

	SafeBuf sb;
	sb.safePrintf("<table border=1>\n");
	sb.safePrintf("<tr>"
		      "<td><b><nobr>run id</nobr></b></td>"
		      "<td><b><nobr>conf diff</nobr></b></td>"
		      "<td><b><nobr>coll diff</nobr></b></td>"
		      "<td><b><nobr>run info</nobr></b></td>"
		      "</tr>\n");

	// take diffs between this run and the last run for confparms
	for ( long i = m_runId ; i > start ; i-- ) {
		// shortcut
		char *dir = g_hostdb.m_dir;
		// make diff filename
		char diff1[200];
		sprintf(diff1,"%s/%s/run.%li.confparms.txt.diff",dir,
			testDir,i);
		File f1;
		f1.set(diff1);
		if ( ! f1.doesExist() ) {
			char df1[200];
			char df2[200];
			sprintf(df1,"%s/%s/run.%li.confparms.txt",dir,
				testDir,i);
			sprintf(df2,"%s/%s/run.%li.confparms.txt",dir,
				testDir,i-1);
			// do the diff
			char cmd[600];
			sprintf(cmd,"diff %s %s > %s",df1,df2,diff1);
			log("test: system(\"%s\")",cmd);
			system (cmd);
		}
		long fs1 = f1.getFileSize();
		sb.safePrintf("<tr><td>%li</td><td>%li</td>", i,fs1);

		// make diff filename
		char diff2[200];
		sprintf(diff2,"%s/%s/run.%li.collparms.txt.diff",dir,
			testDir,i);
		File f2;
		f2.set(diff2);
		if ( ! f2.doesExist() ) {
			char df1[200];
			char df2[200];
			sprintf(df1,"%s/%s/run.%li.collparms.txt",dir,
				testDir,i);
			sprintf(df2,"%s/%s/run.%li.collparms.txt",dir,
				testDir,i-1);
			// do the diff
			char cmd[600];
			sprintf(cmd,"diff %s %s > %s",df1,df2,diff2);
			log("test: system(\"%s\")",cmd);
			system (cmd);
		}
		long fs2 = f2.getFileSize();
		sb.safePrintf("<td>%li</td>", fs2);

		// the version
		char vf[200]; 
		sprintf(vf,"%s/%s/run.%li.version.txt",dir,testDir,i);
		File f3; 
		f3.set ( vf );
		long fs3 = f3.getFileSize();
		char vbuf[1000];
		vbuf[0] = 0;
		if ( fs3 > 0 ) {
			f3.open(O_RDONLY);
			long rs = f3.read(vbuf,fs3,0);
			vbuf[fs3] = '\0';
			if ( rs <= 0 ) continue;
			f3.close();
		}
		// show it
		sb.safePrintf("<td><pre>%s</pre></td></tr>\n", vbuf);
	}
	sb.safePrintf("</table>\n");
	sb.safePrintf("<br>\n");


	//
	// now diff each parser output file for each url in urls.txt
	//


	//
	// loop over url buf first so we can print one table per url
	//

	char *next = NULL;
	// reset the url buf ptr
	m_urlPtr = m_urlBuf;
	// count em
	long count = 0;

	// ptrs to each url table
	long  un = 0;
	long  uptr [5000]; // offsets now, not char ptr since buf gets reallocd
	char  udiff[5000];
	long  ulen [5000];
	long  uhits[5000]; // critical errors! validateOutput() choked!
	long  uunchecked[5000]; // events/addresses found but were not validatd
	long  umiss[5000];
	long  usort[5000];
	long  uevents[5000];
	SafeBuf tmp;

	long niceness = MAX_NICENESS;

	// advance to next url
	for ( ; m_urlPtr < m_urlEnd ; m_urlPtr = next ) {
		// breathe
		QUICKPOLL(niceness);
		// we converted all non-url chars into \0's so skip those!
		for ( ; m_urlPtr<m_urlEnd && !*m_urlPtr ; m_urlPtr++ );
		// breach check
		if ( m_urlPtr >= m_urlEnd ) break;
		// set this up
		next = m_urlPtr;
		// compute next url ptr
		for ( ; next < m_urlEnd && *next ; next++ );
		// point to this url
		char *u = m_urlPtr;
		// get hash
		long long h = hash64 ( u , gbstrlen(u) );
		// shortcut
		char *dir = g_hostdb.m_dir;


		// print into a secondary safe buf with a ptr to
		// it so we can sort that and transfer into the
		// primary safebuf later
		uptr[un] = tmp.length();
		// assume no diff
		udiff[un] = 0;

		// print number
		tmp.safePrintf("%li) ",count++);
		// . link to our stored http server reply
		// . TODO: link it to our [cached] copy in the test coll!!!
		char local[1200];
		sprintf(local,"/%s/doc.%llu.html",testDir,h);
		tmp.safePrintf("<a href=\"%s\"><b>%s</b></a> ",local,u);
		// link to live page
		tmp.safePrintf(" <a href=\"%s\">live</a> ",u);
		// link to page parser
		char ubuf[2000];
		urlEncode(ubuf,2000,u,gbstrlen(u),true);
		tmp.safePrintf(" <a href=\"/master/parser?c=test&"
			       "u=%s\">parser</a> ",ubuf);
		//tmp.safePrintf(" (%llu)",h);
		tmp.safePrintf("<br>\n");
		//tmp.safePrintf("<br>\n");
		tmp.safePrintf("<table border=1>\n");
		tmp.safePrintf("<tr>"
			      "<td><b><nobr>run id</nobr></b></td>"
			      "<td><b><nobr>crit hits</nobr></b></td>"
			      "<td><b><nobr>crit errors</nobr></b></td>"
			      "<td><b><nobr># e</nobr></b></td>"
			      "<td><b><nobr>unchecked</nobr></b></td>"
			      "<td><b><nobr>diff chars</nobr></b></td>"
			      "<td><b><nobr>diff file</nobr></b></td>"
			      "<td><b><nobr>full output</nobr></b></td>"
			      "</tr>\n");

		//SafeBuf sd;

		// loop over all the runs now, starting with latest run first
		for ( long ri = m_runId ; ri >= start ; ri-- ) {

			QUICKPOLL(niceness);

			// the diff filename
			char pdiff[200];
			sprintf(pdiff,"%s/%s/parse.%llu.%li.html.diff",dir,
				testDir,h,ri);
			File f;
			f.set(pdiff);
			long fs = f.getFileSize();
			if ( ! f.doesExist() && ri > 0 ) {
				// make the parse filename
				char pbuf1[200];
				char pbuf2[200];
				sprintf(pbuf1,"%s/%s/parse.%llu.%li.html",
					dir,testDir,h,ri);
				sprintf(pbuf2,"%s/%s/parse.%llu.%li.html",
					dir,testDir,h,ri-1);
				// sanity check
				//File tf; tf.set(pbuf1);
				//if ( ! tf.doesExist()) {char *xx=NULL;*xx=0;}
				// tmp file name
				char tmp1[200];
				char tmp2[200];
				sprintf(tmp1,"%s/%s/t1.html",dir,testDir);
				sprintf(tmp2,"%s/%s/t2.html",dir,testDir);
				// filter first
				char cmd[600];
				sprintf(cmd,
					"cat %s | "
					"grep -v \"<!--ignore-->\" "
					" > %s", pbuf1,tmp1);
				system(cmd);
				sprintf(cmd,
					"cat %s | "
					"grep -v \"<!--ignore-->\" "
					" > %s", pbuf2,tmp2);
				system(cmd);
				// make the system cmd to do the diff
				sprintf(cmd,
					"echo \"<pre>\" > %s ; "
					"diff -w --text %s %s "
					// ignore this table header row
					//" | grep -v \"R#4\""
					" >> %s",
					pdiff,
					tmp1,tmp2,pdiff);
				log("test: system(\"%s\")",cmd);
				system(cmd);
				// try again
				f.set(pdiff);
				fs = f.getFileSize();
			}

			QUICKPOLL(niceness);

			// this means 0 . it just has the <pre> tag in it!
			if ( fs < 0 || fs == 6 ) fs = 0;
			// . if no diff and NOT current run, do not print it
			// . print it if the run right before the current 
			//   now always too
			if ( ri != m_runId && ri != m_runId-1 && fs == 0 ) 
				continue;
			// relative filename
			char rel[200];
			sprintf(rel,"/%s/parse.%llu.%li.html.diff",
				testDir,h,ri);
			char full[200];
			sprintf(full,"/%s/parse.%llu.%li.html",
				testDir,h,ri);
			char validate[200];
			sprintf(validate,
				"/%s/parse-shortdisplay.%llu.%li.html",
				testDir,h,ri);
			// use red font for current run that has a diff!
			char *t1 = "";
			char *t2 = "";
			if ( ri == m_runId && fs != 0 ) {
				t1 = "<font color=pink><b>";
				t2 = "</b></font>";
				// a diff
				udiff[un] = 1;
			}

			// . get critical errors
			// . i.e. XmlDoc::validateOutput() could not validate
			//   a particular event or address that was in the
			//   url's "validated.uh64.txt" file since the admin
			//   clicked on the checkbox in the page parser output
			// . if we do not find such a tag in the parser output
			//   any more then Spider.cpp creates this file!
			if ( ri == m_runId ) {
				char cfile[256];
				sprintf(cfile,"%s/%s/critical.%llu.%li.txt",
					g_hostdb.m_dir,testDir,h,ri);
				SafeBuf ttt;
				ttt.fillFromFile(cfile);
				// first long is misses, then hits then events
				umiss[un] = 0;
				uhits[un] = 0;
				uevents[un] = 0;
				uunchecked[un] = 0;
				if ( ttt.length() >= 3 )
					sscanf(ttt.getBufStart(),
					       "%li %li %li %li",
					       &umiss[un],
					       &uhits[un],
					       &uevents[un],
					       &uunchecked[un]);
				usort[un] = umiss[un] + uunchecked[un];
				//File cf;
				//cf.set(cfile);
				//if ( cf.doesExist()) ucrit[un] = 1;
				//else                 ucrit[un] = 0;
			}

			// more critical?
			if ( ri == m_runId && umiss[un] != 0 ) {
				t1 = "<font color=red><b>";
				t2 = "</b></font>";
			}

			// . these are good to have
			// . if you don't have 1+ critical hits then you
			//   probably need to be validate by the qa guy
			char *uhb1 = "";
			char *uhb2 = "";
			if ( ri == m_runId && uhits[un] != 0 ) {
				uhb1 = "<font color=green><b>**";
				uhb2 = "**</b></font>";
			}

			QUICKPOLL(niceness);

			char *e1 = "<td>";
			char *e2 = "</td>";
			long ne = uevents[un];
			if ( ne ) { 
				e1="<td bgcolor=orange><b><font color=brown>"; 
				e2="</font></b></td>"; 
			}
			char *u1 = "<td>";
			char *u2 = "</td>";
			if ( uunchecked[un] ) {
				u1="<td bgcolor=purple><b><font color=white>"; 
				u2="</font></b></td>"; 
			}
				
			// print the row!
			tmp.safePrintf("<tr>"
				      "<td>%s%li%s</td>"
				       "<td>%s%li%s</td>" // critical hits
				       "<td>%s%li%s</td>" // critical misses
				       "%s%li%s" // # events
				       "%s%li%s" // unchecked
				       "<td>%s%li%s</td>" // filesize of diff
				      // diff filename
				      "<td><a href=\"%s\">%s%s%s</a></td>"
				      // full parser output
				      "<td>"
				       "<a href=\"%s\">full</a> | "
				       "<a href=\"%s\">validate</a> "
				       "</td>"
				      "</tr>\n",
				      t1,ri,t2,
				      uhb1,uhits[un],uhb2,
				      t1,umiss[un],t2,
				      e1,ne,e2,
				       u1,uunchecked[un],u2,
				      t1,fs,t2,
				      rel,t1,rel,t2,
				      full,
				       validate);


			// only fill "sd" for the most recent guy
			if ( ri != m_runId ) continue;

			// now concatenate the parse-shortdisplay file
			// to this little table so qa admin can check/uncheck
			// validation checkboxes for addresses and events
			//sprintf(cfile,
			//	"%s/test/parse-shortdisplay.%llu.%li.html",
			//	g_hostdb.m_dir,h,ri);
			//sd.fillFromFile ( cfile );
		}
		// end table
		tmp.safePrintf("</table>\n");

		// . and a separate little section for the checkboxes
		// . should already be in tables, etc.
		// . each checkbox should provide its own uh64 when it
		//   calls senddiv() when clicked now
		//tmp.cat ( sd );

		tmp.safePrintf("<br>\n");
		tmp.safePrintf("<br>\n");
		// set this
		ulen[un] = tmp.length() - uptr[un] ;
		// sanity check
		if ( ulen[un] > 10000000 ) { char *xx=NULL;*xx=0; }
		// inc it
		un++;
		// increase the 5000!!
		if ( un >= 5000 ) { char *xx=NULL; *xx=0; }
	}


	char flag ;
 bubble:
	flag = 0;
	// sort the url tables
	for ( long i = 0 ; i < un - 1 ; i++ ) {
		QUICKPOLL(niceness);
		if ( usort[i] >  usort[i+1] ) continue;
		if ( usort[i] == usort[i+1] ) 
			if ( udiff[i] >= udiff[i+1] ) continue;
		// swap em
		long  tp = uptr[i];
		long  td = udiff[i];
		long  um = umiss[i];
		long  us = usort[i];
		long  uh = uhits[i];
		long  tl = ulen [i];
		uptr[i] = uptr[i+1];
		umiss[i] = umiss[i+1];
		usort[i] = usort[i+1];
		uhits[i] = uhits[i+1];
		udiff[i] = udiff[i+1];
		ulen[i]  = ulen[i+1];
		uptr[i+1] = tp;
		umiss[i+1] = um;
		usort[i+1] = us;
		uhits[i+1] = uh;
		udiff[i+1] = td;
		ulen [i+1] = tl;
		flag = 1;
	}
	if ( flag ) goto bubble;

	// transfer into primary safe buf now
	for ( long i = 0 ; i < un ; i++ ) 
		sb.safeMemcpy(tmp.getBufStart() + uptr[i],ulen[i]);


	sb.safePrintf("</html>\n");

	char dfile[200];
	sprintf(dfile,"%s/%s/qa.html",g_hostdb.m_dir,testDir);
	sb.dumpToFile ( dfile );

	// free the buffer of urls
	reset();

	// turn off spiders
	g_conf.m_spideringEnabled = 0;

	// all done
	return;
}

Ejemplo n.º 12

Mostrar archivo

Archivo: HttpMime.cpp Proyecto: privacore/open-source-search-engine

bool HttpMime::addCookieHeader(const char *cookieJar, const char *url, SafeBuf *sb) {
	Url tmpUrl;
	tmpUrl.set(url);

	SafeBuf tmpSb;

	size_t cookieJarLen = strlen(cookieJar);

	const char *lineStartPos = cookieJar;
	const char *lineEndPos = NULL;
	while ((lineEndPos = (const char*)memchr(lineStartPos, '\n', cookieJarLen - (lineStartPos - cookieJar))) != NULL) {
		const char *currentPos = lineStartPos;
		const char *tabPos = NULL;
		unsigned fieldCount = 0;

		bool skipCookie = false;
		const char *domain = NULL;
		int32_t domainLen = 0;
		while (fieldCount < 5 && (tabPos = (const char*)memchr(currentPos, '\t', lineEndPos - currentPos)) != NULL) {
			switch (fieldCount) {
				case 0:
					// domain
					domain = currentPos;
					domainLen = tabPos - currentPos;
					break;
				case 1:
					// flag
					if (memcmp(currentPos, "TRUE", 4) == 0) {
						// allow subdomain
						if (tmpUrl.getHostLen() >= domainLen) {
							if (!endsWith(tmpUrl.getHost(), tmpUrl.getHostLen(), domain, domainLen)) {
								// doesn't end with domain - ignore cookie
								skipCookie = true;
								break;
							}
						} else {
							skipCookie = true;
							break;
						}
					} else {
						// only specific domain
						if (tmpUrl.getHostLen() != domainLen || strncasecmp(domain, tmpUrl.getHost(), domainLen) != 0) {
							// non-matching domain - ignore cookie
							skipCookie = true;
							break;
						}
					}
					break;
				case 2: {
					// path
					const char *path = currentPos;
					int32_t pathLen = tabPos - currentPos;
					if (strncasecmp(path, tmpUrl.getPath(), pathLen) == 0) {
						if (tmpUrl.getPathLen() != pathLen) {
							if (path[pathLen - 1] != '/' && tmpUrl.getPath()[tmpUrl.getPathLen() - 1] != '/') {
								// non-matching path - ignore cookie
								skipCookie = true;
								break;
							}
						}
					} else {
						// non-matching path - ignore cookie
						skipCookie = true;
						break;
					}
				} break;
				case 3:
					// secure

					break;
				case 4:
					// expiration

					break;
			}

			currentPos = tabPos + 1;
			++fieldCount;
		}

		if (!skipCookie) {
			tmpSb.safeMemcpy(currentPos, lineEndPos - currentPos);
			tmpSb.pushChar(';');
		}

		lineStartPos = lineEndPos + 1;
	}
	// we don't need to care about the last line (we always end on \n)

	if (tmpSb.length() > 0) {
		sb->safeStrcpy("Cookie: ");
		sb->safeMemcpy(&tmpSb);
		sb->safeStrcpy("\r\n");
	}

	return true;
}