TEST( XmlTest, MetaDescription) {
	const char* input_strs[] =  {
	    // valid
	    "totally valid description",
	    "“inside special quotes” and outside",

	    // invalid
	    "my \"invalid\" double quote description",
	    "\"someone has quotes\", and nobody else has it"
	    "'my 'invalid' single quote description'",
	    "it's a description",
	    "what is this quote \" doing here?"
	};

	const char* format_strs[] = {
	    "<meta name=\"description\" content=\"%s\">",
	    "<meta name=\"description\" content='%s'>",
	    "<meta name=\"description\" content=\"%s\" ng-attr-content=\"{{meta.description}}\">",
	    "<meta name=\"description\" content='%s' ng-attr-content=\"{{meta.description}}\" >",
	    "<meta name=\"description\" ng-attr-content=\"{{meta.description}}\" content=\"%s\">",
	    "<meta name=\"description\" ng-attr-content=\"{{meta.description}}\" content='%s'>",
	    "<meta name=\"description\" content=\"%s\" other-content=\"%s\">",
	    "<meta name=\"description\" content='%s' other-content='%s'>",
	    "<meta content=\"%s\" name=\"description\">",
	    "<meta content='%s' name=\"description\">",
	    "<meta name=\"description\" other-content=\"%s\" content=\"%s\">",
	    "<meta name=\"description\" other-content='%s' content='%s'>"
	};

	size_t len = sizeof( input_strs ) / sizeof( input_strs[0] );
	size_t format_len = sizeof( format_strs ) / sizeof( format_strs[0] );

	for ( size_t i = 0; i < len; i++ ) {
		for (size_t j = 0; j < format_len; j++) {
			const char *input_str = input_strs[i];

			char desc[MAX_BUF_SIZE];
			std::sprintf(desc, format_strs[j], input_str, input_str);

			char input[MAX_BUF_SIZE];
			std::sprintf(input, HTML_HEAD_FORMAT, desc);

			Xml xml;
			ASSERT_TRUE(xml.set(input, strlen(input), 0, CT_HTML));

			char buf[MAX_BUF_SIZE];
			int32_t bufLen = MAX_BUF_SIZE;
			int32_t contentLen = 0;

			ASSERT_TRUE(xml.getTagContent("name", "description", buf, bufLen, 0, bufLen, &contentLen, false, TAG_META));
			EXPECT_EQ(strlen(input_str), contentLen);
			EXPECT_STREQ(input_str, buf);
		}
	}
}
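// The tests in this file assume MAX_BUF_SIZE and HTML_HEAD_FORMAT are defined
// by the surrounding test harness. A minimal sketch of plausible definitions
// (the values below are assumptions for illustration, not the project's
// actual ones):
#ifndef MAX_BUF_SIZE
#define MAX_BUF_SIZE 4096
#endif
#ifndef HTML_HEAD_FORMAT
#define HTML_HEAD_FORMAT "<html><head>%s</head><body><p>some body text</p></body></html>"
#endif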
void parse_doc_icu(char *s, int len, bool doHash, char *charset){
	Xml xml;
	xml.set( s, len, TITLEREC_CURRENT_VERSION, 0, CT_HTML );

	// Extract text from (x)html
	char *text_buf = (char*)malloc(64*1024);
	int32_t textLen = xml.getText( text_buf, 64 * 1024, 0, 99999999, doFilterSpaces );
	Words w;
	w.set(text_buf, textLen, doHash);
	free(text_buf);
}
void parse_doc_8859_1(char *s, int len, bool doHash,char *charset)
{
	Xml xml;
	xml.set( s, len, TITLEREC_CURRENT_VERSION, 0, CT_HTML );

	// Extract text from (x)html
	char *text_buf = (char*)malloc(len+1);
	int32_t textLen = xml.getText( text_buf, len, 0, 99999999, doFilterSpaces );
	Words words;

	// just tokenize words
	words.set(text_buf, textLen, doHash);
	free(text_buf);
}
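// A minimal sketch of driving the two helpers above on an in-memory document.
// Both helpers ignore their charset argument, so NULL is passed here; the
// wrapper name parse_doc_example is hypothetical.
void parse_doc_example ( char *html , int htmlLen , bool doHash ) {
	parse_doc_icu    ( html , htmlLen , doHash , NULL );
	parse_doc_8859_1 ( html , htmlLen , doHash , NULL );
}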
TEST( XmlTest, MetaDescriptionStripTags) {
	const char* input_strs[] =  {
	    "my title<br> my <b>very important</b> text",
	    "Lesser than (<) and greater than (>).",
	    "We shouldn't strip <3 out",
	    "123 < 1234; 1234 > 123",
	    "<p style='text-align: center;'>A color cartoon drawing of a clapping cod fish ( rebus in the danish language for klaptorsk )</p>"
	};

	const char* expected_outputs[] = {
	    "my title. my very important text",
	    "Lesser than (<) and greater than (>).",
	    "We shouldn't strip <3 out",
	    "123 < 1234; 1234 > 123",
	    "A color cartoon drawing of a clapping cod fish ( rebus in the danish language for klaptorsk ). "
	};

	const char* format_str = "<meta name=\"description\" content=\"%s\">";

	size_t len = sizeof( input_strs ) / sizeof( input_strs[0] );

	ASSERT_EQ(sizeof(input_strs)/sizeof(input_strs[0]), sizeof(expected_outputs)/sizeof(expected_outputs[0]));

	for ( size_t i = 0; i < len; i++ ) {
		const char *input_str = input_strs[i];
		const char *output_str = expected_outputs[i];

		char desc[MAX_BUF_SIZE];
		std::sprintf(desc, format_str, input_str);

		char input[MAX_BUF_SIZE];
		std::sprintf(input, HTML_HEAD_FORMAT, desc);

		Xml xml;
		ASSERT_TRUE(xml.set(input, strlen(input), 0, CT_HTML));

		char buf[MAX_BUF_SIZE];
		int32_t bufLen = MAX_BUF_SIZE;
		int32_t contentLen = 0;

		ASSERT_TRUE(xml.getTagContent("name", "description", buf, bufLen, 0, bufLen, &contentLen, false, TAG_META));
		EXPECT_EQ(strlen(output_str), contentLen);
		EXPECT_STREQ(output_str, buf);
	}
}
void parse_doc_icu(char *s, int len, bool doHash, char *charset){
	Xml xml;
	xml.set(csUTF8,s,len,false, 0,false, TITLEREC_CURRENT_VERSION);
	//fprintf(stderr,"\nparse_doc_icu\n");	
	// Extract text from (x)html
	char *text_buf = (char*)malloc(64*1024);
	long textLen = xml.getText(text_buf, 
				   64*1024, 
				   0,
				   99999999,
				   false,
				   true,
				   false,
				   doFilterSpaces,
				   false);
	Words w;
	w.set(true,false, text_buf, textLen, TITLEREC_CURRENT_VERSION,doHash);
	free(text_buf);
}
static void generateSummary( Summary &summary, char *htmlInput, const char *queryStr, const char *urlStr ) {
	Xml xml;
	ASSERT_TRUE(xml.set(htmlInput, strlen(htmlInput), 0, CT_HTML));

	Words words;
	ASSERT_TRUE(words.set(&xml, true));

	Bits bits;
	ASSERT_TRUE(bits.set(&words));

	Url url;
	url.set(urlStr);

	Sections sections;
	ASSERT_TRUE(sections.set(&words, &bits, &url, "", CT_HTML));

	Query query;
	ASSERT_TRUE(query.set2(queryStr, langEnglish, true));

	LinkInfo linkInfo;
	memset ( &linkInfo , 0 , sizeof(LinkInfo) );
	linkInfo.m_lisize = sizeof(LinkInfo);

	Title title;
	ASSERT_TRUE(title.setTitle(&xml, &words, 80, &query, &linkInfo, &url, NULL, 0, CT_HTML, langEnglish));

	Pos pos;
	ASSERT_TRUE(pos.set(&words));

	Bits bitsForSummary;
	ASSERT_TRUE(bitsForSummary.setForSummary(&words));

	Phrases phrases;
	ASSERT_TRUE(phrases.set(&words, &bits));

	Matches matches;
	matches.setQuery(&query);
	ASSERT_TRUE(matches.set(&words, &phrases, &sections, &bitsForSummary, &pos, &xml, &title, &url, &linkInfo));

	summary.setSummary(&xml, &words, &sections, &pos, &query, 180, 3, 3, 180, &url, &matches, title.getTitle(), title.getTitleLen());
}
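// A minimal sketch of how generateSummary() could be exercised from a test.
// The Summary accessors getSummary()/getSummaryLen() are assumptions about
// that class's interface; adjust them if the real API differs.
TEST( SummaryTest, QueryTermAppearsInSummary ) {
	char htmlInput[] =
	    "<html><head><title>test title</title></head>"
	    "<body><p>a short paragraph about spaceships landing on the moon</p>"
	    "</body></html>";

	Summary summary;
	generateSummary( summary, htmlInput, "spaceships", "http://www.example.com/" );

	// the generated summary should mention the query term
	EXPECT_TRUE( strstr( summary.getSummary(), "spaceships" ) != NULL );
	EXPECT_GT( summary.getSummaryLen(), 0 );
}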
void parse_doc_8859_1(char *s, int len, bool doHash,char *charset)
{
	Xml xml;
	xml.set(csASCII,s,len,false, 0, false, TITLEREC_CURRENT_VERSION);
	//fprintf(stderr,"\nparse_doc_8859_1\n");

	// Extract text from (x)html
	char *text_buf = (char*)malloc(len+1);
	xml.getText(text_buf, 
		    len, 
		    0,
		    99999999,
		    false,
		    true,
		    false,
		    doFilterSpaces,
		    false);
	Words words;

	// just tokenize words
	words.set(false, text_buf, TITLEREC_CURRENT_VERSION, doHash);
	free(text_buf);
}
// returns false if blocked, true otherwise
bool processLoop ( void *state ) {
	// get it
	State2 *st = (State2 *)state;
	// get the tcp socket from the state
	TcpSocket *s = st->m_socket;
	// get it
	XmlDoc *xd = &st->m_xd;

	if ( ! xd->m_loaded ) {
		// setting just the docid. niceness is 0.
		//xd->set3 ( st->m_docId , st->m_coll , 0 );
		// callback
		xd->setCallback ( state , processLoop );
		// . and tell it to load from the old title rec
		// . this sets xd->m_oldTitleRec/m_oldTitleRecSize
		// . this sets xd->ptr_* and all other member vars from
		//   the old title rec if found in titledb.
		if ( ! xd->loadFromOldTitleRec ( ) ) return false;
	}

	if ( g_errno ) return sendErrorReply ( st , g_errno );
	// now force it to load old title rec
	//char **tr = xd->getTitleRec();
	SafeBuf *tr = xd->getTitleRecBuf();
	// blocked? return false if so. it will call processLoop() when it rets
	if ( tr == (void *)-1 ) return false;
	// we did not block. check for error? this will free "st" too.
	if ( ! tr ) return sendErrorReply ( st , g_errno );
	// if title rec was empty, that is a problem
	if ( xd->m_titleRecBuf.length() == 0 ) 
		return sendErrorReply ( st , ENOTFOUND);

	// set callback
	char *na = xd->getIsNoArchive();
	// wait if blocked
	if ( na == (void *)-1 ) return false;
	// error?
	if ( ! na ) return sendErrorReply ( st , g_errno );
	// forbidden? allow turkeys through though...
	if ( ! st->m_isAdmin && *na )
		return sendErrorReply ( st , ENOCACHE );

	SafeBuf *sb = &st->m_sb;


	// &page=4 will print rainbow sections
	if ( ! st->m_printed && st->m_r.getLong("page",0) ) {
		// do not repeat this call
		st->m_printed = true;
		// this will call us again since we called
		// xd->setCallback() above to us
		if ( ! xd->printDocForProCog ( sb , &st->m_r ) )
			return false;
	}

	char *contentType = "text/html";
	char format = st->m_format;
	if ( format == FORMAT_XML ) contentType = "text/xml";
	if ( format == FORMAT_JSON ) contentType = "application/json";

	// if we printed a special page (like rainbow sections) then return now
	if ( st->m_printed ) {
		bool status = g_httpServer.sendDynamicPage (s,
							    //buf,bufLen,
							    sb->getBufStart(),
							    sb->getLength(),
							    -1,false,
							    //"text/html",
							    contentType,
							    -1, NULL, "utf8" );
		// nuke state2
		mdelete ( st , sizeof(State2) , "PageGet1" );
		delete (st);
		return status;
	}

	/*
	  // this was calling XmlDoc and setting sections, etc. to
	  // get the SpiderReply junk... no no no
	// is it banned or filtered? this ignores the TagRec in the titleRec
	// and uses msg8a to get it fresh instead
	char *vi = xd->getIsFiltered();//Visible( );
	// wait if blocked
	if ( vi == (void *)-1 ) return false;
	// error?
	if ( ! vi ) return sendErrorReply ( st , g_errno );
	// banned?
	if ( ! st->m_isAdmin && ! *vi ) return sendErrorReply (st,EDOCBANNED);
	*/

	// get the utf8 content
	char **utf8 = xd->getUtf8Content();
	//long   len  = xd->size_utf8Content - 1;
	// wait if blocked???
	if ( utf8 == (void *)-1 ) return false;
	// strange
	if ( xd->size_utf8Content<=0) {
		log("pageget: utf8 content <= 0");
		return sendErrorReply(st,EBADENGINEER );
	}
	// alloc error?
	if ( ! utf8 ) return sendErrorReply ( st , g_errno );

	// get this host
	Host *h = g_hostdb.getHost ( g_hostdb.m_hostId );
	if ( ! h ) {
		log("pageget: hostid %li is bad",g_hostdb.m_hostId);
		return sendErrorReply(st,EBADENGINEER );
	}


	char *content    = xd->ptr_utf8Content;
	long  contentLen = xd->size_utf8Content - 1;

	// shortcut
	char strip = st->m_strip;

	// alloc buffer now
	//char *buf = NULL;
	//long  bufMaxSize = 0;
	//bufMaxSize = len + ( 32 * 1024 ) ;
	//bufMaxSize = contentLen + ( 32 * 1024 ) ;
	//buf        = (char *)mmalloc ( bufMaxSize , "PageGet2" );
	//char *p          = buf;
	//char *bufEnd     = buf + bufMaxSize;
	//if ( ! buf ) {
	//	return sendErrorReply ( st , g_errno );
	//}

	// for undoing the header
	//char *start1 = p;
	long startLen1 = sb->length();

	// we are always utf8
	if ( strip != 2 )
		sb->safePrintf( "<meta http-equiv=\"Content-Type\" "
			     "content=\"text/html;charset=utf8\">\n");

	// base href
	//Url *base = &xd->m_firstUrl;
	//if ( xd->ptr_redirUrl.m_url[0] )
	//	base = &xd->m_redirUrl;
	char *base = xd->ptr_firstUrl;
	if ( xd->ptr_redirUrl ) base = xd->ptr_redirUrl;
	//Url *redir = *xd->getRedirUrl();
	if ( strip != 2 ) {
		sb->safePrintf ( "<BASE HREF=\"%s\">" , base );
		//p += gbstrlen ( p );
	}

	// default colors in case css files missing
	if ( strip != 2 ) {
		sb->safePrintf( "\n<style type=\"text/css\">\n"
			  "body{background-color:white;color:black;}\n"
			  "</style>\n");
		//p += gbstrlen ( p );
	}

	//char format = st->m_format;
	if ( format == FORMAT_XML ) sb->reset();
	if ( format == FORMAT_JSON ) sb->reset();

	// for undoing the stuff below
	long startLen2 = sb->length();//p;

	// query should be NULL terminated
	char *q    = st->m_q;
	long  qlen = st->m_qlen;

	char styleTitle[128] =  "font-size:14px;font-weight:600;"
				"color:#000000;";
	char styleText[128]  =  "font-size:14px;font-weight:400;"
				"color:#000000;";
	char styleLink[128] =  "font-size:14px;font-weight:400;"
				"color:#0000ff;";
	char styleTell[128] =  "font-size:14px;font-weight:600;"
				"color:#cc0000;";

	// get the url of the title rec
	Url *f = xd->getFirstUrl();

	bool printDisclaimer = st->m_printDisclaimer;

	if ( xd->m_contentType == CT_JSON )
		printDisclaimer = false;

	if ( format == FORMAT_XML ) printDisclaimer = false;
	if ( format == FORMAT_JSON ) printDisclaimer = false;

	char tbuf[100];
	tbuf[0] = 0;
	time_t lastSpiderDate = xd->m_spideredTime;

	if ( printDisclaimer ||
	     format == FORMAT_XML ||
	     format == FORMAT_JSON ) {
		struct tm *timeStruct = gmtime ( &lastSpiderDate );
		strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct);
	}

	// We should always be displaying this disclaimer.
	// - May eventually want to display this at a different location
	//   on the page, or on the click 'n' scroll browser page itself
	//   when this page is not being viewed solo.
	// CNS: if ( ! st->m_clickNScroll ) {
	if ( printDisclaimer ) {

		sb->safePrintf(//sprintf ( p , 
			  //"<BASE HREF=\"%s\">"
			  //"<table border=1 width=100%%>"
			  //"<tr><td>"
			  "<table border=\"1\" bgcolor=\"#"
			  BGCOLOR
			  "\" cellpadding=\"10\" "
			  //"id=\"gbcnsdisctable\" class=\"gbcnsdisctable_v\""
			  "cellspacing=\"0\" width=\"100%%\" color=\"#ffffff\">"
			  "<tr"
			  //" id=\"gbcnsdisctr\" class=\"gbcnsdisctr_v\""
			  "><td>"
			  //"<font face=times,sans-serif color=black size=-1>"
			  "<span style=\"%s\">"
			  "This is Gigablast's cached page of </span>"
			  "<a href=\"%s\" style=\"%s\">%s</a>"
			  "" , styleTitle, f->getUrl(), styleLink,
			  f->getUrl() );
		//p += gbstrlen ( p );
		// then the rest
		//sprintf(p , 
		sb->safePrintf(
			"<span style=\"%s\">. "
			"Gigablast is not responsible for the content of "
			"this page.</span>", styleTitle );
		//p += gbstrlen ( p );

		sb->safePrintf ( "<br/><span style=\"%s\">"
			  "Cached: </span>"
			  "<span style=\"%s\">",
			  styleTitle, styleText );
		//p += gbstrlen ( p );

		// then the spider date in GMT
		// time_t lastSpiderDate = xd->m_spideredTime;
		// struct tm *timeStruct = gmtime ( &lastSpiderDate );
		// char tbuf[100];
		// strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct);
		//p += gbstrlen ( p );
		sb->safeStrcpy(tbuf);

		// Moved over from PageResults.cpp
		sb->safePrintf( "</span> - <a href=\""
			      "/get?"
			      "q=%s&amp;c=%s&amp;rtq=%li&amp;"
			      "d=%lli&amp;strip=1\""
			      " style=\"%s\">"
			      "[stripped]</a>", 
			      q , st->m_coll , 
			      (long)st->m_rtq,
			      st->m_docId, styleLink ); 

		// a link to alexa
		if ( f->getUrlLen() > 5 ) {
			sb->safePrintf( " - <a href=\"http:"
					 "//web.archive.org/web/*/%s\""
					 " style=\"%s\">"
					 "[older copies]</a>" ,
					 f->getUrl(), styleLink );
		}

		if (st->m_noArchive){
			sb->safePrintf( " - <span style=\"%s\"><b>"
				     "[NOARCHIVE]</b></span>",
				     styleTell );
		}
		if (st->m_isBanned){
			sb->safePrintf(" - <span style=\"%s\"><b>"
				     "[BANNED]</b></span>",
				     styleTell );
		}

		// only print this if we got a query
		if ( qlen > 0 ) {
			sb->safePrintf("<br/><br/><span style=\"%s\"> "
				   "These search terms have been "
				   "highlighted:  ",
				   styleText );
			//p += gbstrlen ( p );
		}
		
	}

	// how much space left in p?
	//long avail = bufEnd - p;
	// . make the url that we're outputting for (like in PageResults.cpp)
	// . "thisUrl" is the baseUrl for click & scroll
	char thisUrl[MAX_URL_LEN];
	char *thisUrlEnd = thisUrl + MAX_URL_LEN;
	char *x = thisUrl;
	// . use the external ip of our gateway
	// . construct the NAT mapped port
	// . you should have used iptables to map port to the correct
	//   internal ip:port
	//unsigned long  ip   =g_conf.m_mainExternalIp  ; // h->m_externalIp;
	//unsigned short port=g_conf.m_mainExternalPort;//h->m_externalHttpPort
	// local check
	//if ( st->m_isLocal ) {
	unsigned long  ip   = h->m_ip;
	unsigned short port = h->m_httpPort;
	//}
	//sprintf ( x , "http://%s:%li/get?q=" , iptoa ( ip ) , port );
	// . we no longer put the port in here
	// . but still need http:// since we use <base href=>
	if (port == 80) sprintf(x,"http://%s/get?q=",iptoa(ip));
	else            sprintf(x,"http://%s:%hu/get?q=",iptoa(ip),port);
	x += gbstrlen ( x );
	// the query url encoded
	long elen = urlEncode ( x , thisUrlEnd - x , q , qlen );
	x += elen;
	// separate cgi vars with a &
	//sprintf ( x, "&seq=%li&rtq=%lid=%lli",
	//	  (long)st->m_seq,(long)st->m_rtq,st->m_msg22.getDocId());
	sprintf ( x, "&d=%lli",st->m_docId );
	x += gbstrlen(x);		
	// set our query for highlighting
	Query qq;
	qq.set2 ( q, st->m_langId , true );

	// print the query terms into our highlight buffer
	Highlight hi;
	// make words so we can set the scores to ignore fielded terms
	Words qw;
	qw.set ( q            ,  // content being highlighted, utf8
		 qlen         ,  // content being highlighted, utf8
		 TITLEREC_CURRENT_VERSION,
		 true         ,  // computeIds
		 false        ); // hasHtmlEntities?
	// . assign scores of 0 to query words that should be ignored
	// . TRICKY: loop over words in qq.m_qwords, but they should be 1-1
	//   with words in qw.
	// . sanity check
	//if ( qw.getNumWords() != qq.m_numWords ) { char *xx = NULL; *xx = 0;}
	// declare up here
	Matches m;
	// do the loop
	//Scores ss;
	//ss.set ( &qw , NULL );
	//for ( long i = 0 ; i < qq.m_numWords ; i++ )
	//	if ( ! m.matchWord ( &qq.m_qwords[i],i ) ) ss.m_scores[i] = 0;
	// now set m.m_matches[] to those words in qw that match a query word
	// or phrase in qq.
	m.setQuery ( &qq );
	//m.addMatches ( &qw , &ss , true );
	m.addMatches ( &qw );
	long hilen = 0;

	// CNS: if ( ! st->m_clickNScroll ) {
	// and highlight the matches
	if ( printDisclaimer ) {
		hilen = hi.set ( //p       ,
				 //avail   ,
				sb ,
				 &qw     , // words to highlight
				 &m      , // matches relative to qw
				 false   , // doSteming
				 false   , // st->m_clickAndScroll , 
				 (char *)thisUrl );// base url for ClcknScrll
		//p += hilen;
		// now an hr
		//memcpy ( p , "</span></table></table>\n" , 24 );   p += 24;
		sb->safeStrcpy("</span></table></table>\n");
	}


	bool includeHeader = st->m_includeHeader;

	// do not show header for json object display
	if ( xd->m_contentType == CT_JSON )
		includeHeader = false;

	if ( format == FORMAT_XML ) includeHeader = false;
	if ( format == FORMAT_JSON ) includeHeader = false;

	//mfree(uq, uqCapacity, "PageGet");
	// undo the header writes if we should
	if ( ! includeHeader ) {
		// including base href is off by default when not including
		// the header, so the caller must explicitly turn it back on
		if ( st->m_includeBaseHref ) sb->m_length=startLen2;//p=start2;
		else                         sb->m_length=startLen1;//p=start1;
	}

	//sb->safeStrcpy(tbuf);



	if ( format == FORMAT_XML ) {
		sb->safePrintf("<response>\n");
		sb->safePrintf("<statusCode>0</statusCode>\n");
		sb->safePrintf("<statusMsg>Success</statusMsg>\n");
		sb->safePrintf("<url><![CDATA[");
		sb->cdataEncode(xd->m_firstUrl.m_url);
		sb->safePrintf("]]></url>\n");
		sb->safePrintf("<docId>%llu</docId>\n",xd->m_docId);
		sb->safePrintf("\t<cachedTimeUTC>%lu</cachedTimeUTC>\n",
			      lastSpiderDate);
		sb->safePrintf("\t<cachedTimeStr>%s</cachedTimeStr>\n",tbuf);
	}

	if ( format == FORMAT_JSON ) {
		sb->safePrintf("{\"response\":{\n");
		sb->safePrintf("\t\"statusCode\":0,\n");
		sb->safePrintf("\t\"statusMsg\":\"Success\",\n");
		sb->safePrintf("\t\"url\":\"");
		sb->jsonEncode(xd->m_firstUrl.m_url);
		sb->safePrintf("\",\n");
		sb->safePrintf("\t\"docId\":%llu,\n",xd->m_docId);
		sb->safePrintf("\t\"cachedTimeUTC\":%lu,\n",lastSpiderDate);
		sb->safePrintf("\t\"cachedTimeStr\":\"%s\",\n",tbuf);
	}

	// identify start of <title> tag we wrote out
	char *sbstart = sb->getBufStart();
	char *sbend   = sb->getBufEnd();
	char *titleStart = NULL;
	char *titleEnd   = NULL;
	for ( char *t = sbstart ; t < sbend ; t++ ) {
		// title tag?
		if ( t[0]!='<' ) continue;
		if ( to_lower_a(t[1])!='t' ) continue;
		if ( to_lower_a(t[2])!='i' ) continue;
		if ( to_lower_a(t[3])!='t' ) continue;
		if ( to_lower_a(t[4])!='l' ) continue;
		if ( to_lower_a(t[5])!='e' ) continue;
		// point to it
		char *x = t + 5;
		// max - to keep things fast
		char *max = x + 500;
		for ( ; *x && *x != '>' && x < max ; x++ );
		x++;
		// find end
		char *e = x;
		for ( ; *e && e < max ; e++ ) {
			if ( e[0]=='<' &&
			     to_lower_a(e[1])=='/' &&
			     to_lower_a(e[2])=='t' &&
			     to_lower_a(e[3])=='i' &&
			     to_lower_a(e[4])=='t' &&
			     to_lower_a(e[5])=='l' &&
			     to_lower_a(e[6])=='e' )
				break;
		}
		if ( e < max ) {
			titleStart = x;
			titleEnd   = e;
		}
		break;
	}

	// . print title at top!
	// . consider moving
	if ( titleStart ) {

		char *ebuf = st->m_r.getString("eb");
		if ( ! ebuf ) ebuf = "";

		//p += sprintf ( p , 
		sb->safePrintf(
			       "<table border=1 "
			       "cellpadding=10 "
			       "cellspacing=0 "
			       "width=100%% "
			       "color=#ffffff>" );

		long printLinks = st->m_r.getLong("links",0);

		if ( ! printDisclaimer && printLinks )
			sb->safePrintf(//p += sprintf ( p , 
				       // first put cached and live link
				       "<tr>"
				       "<td bgcolor=lightyellow>"
				       // print cached link
				       //"<center>"
				       "&nbsp; "
				       "<b>"
				       "<a "
				       "style=\"font-size:18px;font-weight:600;"
				       "color:#000000;\" "
				       "href=\""
				       "/get?"
				       "c=%s&d=%lli&qh=0&cnsp=1&eb=%s\">"
				       "cached link</a>"
				       " &nbsp; "
				       "<a "
				       "style=\"font-size:18px;font-weight:600;"
				       "color:#000000;\" "
				       "href=%s>live link</a>"
				       "</b>"
				       //"</center>"
				       "</td>"
				       "</tr>\n"
				       ,st->m_coll
				       ,st->m_docId 
				       ,ebuf
				       ,thisUrl // st->ptr_ubuf
				       );

		if ( printLinks ) {
			sb->safePrintf(//p += sprintf ( p ,
				       "<tr><td bgcolor=pink>"
				       "<span style=\"font-size:18px;"
				       "font-weight:600;"
				       "color:#000000;\">"
				       "&nbsp; "
				       "<b>PAGE TITLE:</b> "
				       );
			long tlen = titleEnd - titleStart;
			sb->safeMemcpy ( titleStart , tlen );
			sb->safePrintf ( "</span></td></tr>" );
		}

		sb->safePrintf( "</table><br>\n" );

	}

	// is the content preformatted?
	bool pre = false;
	char ctype = (char)xd->m_contentType;
	if ( ctype == CT_TEXT ) pre = true ; // text/plain
	if ( ctype == CT_DOC  ) pre = true ; // filtered msword
	if ( ctype == CT_PS   ) pre = true ; // filtered postscript

	if ( format == FORMAT_XML ) pre = false;
	if ( format == FORMAT_JSON ) pre = false;

	// if it is content-type text, add a <pre>
	if ( pre ) {//p + 5 < bufEnd && pre ) {
		sb->safePrintf("<pre>");
		//p += 5;
	}

	if ( st->m_strip == 1 )
		contentLen = stripHtml( content, contentLen, 
					(long)xd->m_version, st->m_strip );
	// it returns -1 and sets g_errno on error, like OOM
	if ( contentLen == -1 ) {
		//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );	
		return sendErrorReply ( st , g_errno );
	}

	Xml xml;
	Words ww;

	// if no highlighting, skip it
	bool queryHighlighting = st->m_queryHighlighting;
	if ( st->m_strip == 2 ) queryHighlighting = false;

	// do not do term highlighting if json
	if ( xd->m_contentType == CT_JSON )
		queryHighlighting = false;

	SafeBuf tmp;
	SafeBuf *xb = sb;
	if ( format == FORMAT_XML ) xb = &tmp;
	if ( format == FORMAT_JSON ) xb = &tmp;
	

	if ( ! queryHighlighting ) {
		xb->safeMemcpy ( content , contentLen );
		//p += contentLen ;
	}
	else {
		// get the content as xhtml (should be NULL terminated)
		//Words *ww = xd->getWords();
		if ( ! xml.set ( content , contentLen , false ,
				 0 , false , TITLEREC_CURRENT_VERSION ,
				 false , 0 , CT_HTML ) ) { // niceness is 0
			//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
			return sendErrorReply ( st , g_errno );
		}			
		if ( ! ww.set ( &xml , true , 0 ) ) { // niceness is 0
			//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
			return sendErrorReply ( st , g_errno );
		}
		// sanity check
		//if ( ! xd->m_wordsValid ) { char *xx=NULL;*xx=0; }
		// how much space left in p?
		//avail = bufEnd - p;

		Matches m;
		m.setQuery ( &qq );
		m.addMatches ( &ww );
		hilen = hi.set ( xb , // p , avail , 
				 &ww , &m ,
				 false /*doStemming?*/ ,  
				 st->m_clickAndScroll , 
				 thisUrl /*base url for click & scroll*/);
		//p += hilen;
		log(LOG_DEBUG, "query: Done highlighting cached page content");
	}


	if ( format == FORMAT_XML ) {
		sb->safePrintf("\t<content><![CDATA[");
		sb->cdataEncode ( xb->getBufStart() );
		sb->safePrintf("]]></content>\n");
		sb->safePrintf("</response>\n");
	}

	if ( format == FORMAT_JSON ) {
		sb->safePrintf("\t\"content\":\"\n");
		sb->jsonEncode ( xb->getBufStart() );
		sb->safePrintf("\"\n}\n}\n");
	}


	// if it is content-type text, add a </pre>
	if ( pre ) { // p + 6 < bufEnd && pre ) {
		sb->safeMemcpy ( "</pre>" , 6 );
		//p += 6;
	}

	// calculate bufLen
	//long bufLen = p - buf;

	long ct = xd->m_contentType;

	// now filter the entire buffer to escape out the xml tags
	// so it is displayed nice
	SafeBuf newbuf;

	if ( ct == CT_XML ) {
		// encode the xml tags into &lt;tagname&gt; sequences
		if ( !newbuf.htmlEncodeXmlTags ( sb->getBufStart() ,
						 sb->getLength(),
						 0)){// niceness=0
			//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
			return sendErrorReply ( st , g_errno );
		}
		// free out buffer that we alloc'd before returning since this
		// should have copied it into another buffer
		//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );	
		// reassign
		//buf    = newbuf.getBufStart();
		//bufLen = newbuf.length();
		sb->stealBuf ( &newbuf );
	}

	// now encapsulate it in html head/tail and send it off
	// sendErr:
	contentType = "text/html";
	if ( strip == 2 ) contentType = "text/xml";
	// xml is usually buggy and this throws browser off
	//if ( ctype == CT_XML ) contentType = "text/xml";

	if ( xd->m_contentType == CT_JSON )
		contentType = "application/json";

	if ( format == FORMAT_XML ) contentType = "text/xml";
	if ( format == FORMAT_JSON ) contentType = "application/json";

	// safebuf, sb, is a member of "st" so this should copy the buffer
	// when it constructs the http reply, and we gotta call delete(st)
	// AFTER this so sb is still valid.
	bool status = g_httpServer.sendDynamicPage (s,
						    //buf,bufLen,
						    sb->getBufStart(),
						    sb->getLength(),
						    -1,false,
						    contentType,
						     -1, NULL, "utf8" );

	// nuke state2
	mdelete ( st , sizeof(State2) , "PageGet1" );
	delete (st);


	// free out buffer that we alloc'd before returning since this
	// should have copied it into another buffer

	//if      ( ct == CT_XML ) newbuf.purge();
	//else if ( buf          ) mfree ( buf , bufMaxSize , "PageGet2" );
	
	// and convey the status
	return status;
}
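// A hedged sketch of the convention processLoop() follows: every getter that
// can block returns a real pointer, NULL (error, g_errno is set), or
// (void *)-1 (blocked; the callback registered via xd->setCallback() will
// re-enter processLoop() later). The helper below only restates that pattern
// for one getter and is illustrative, not part of the real handler.
static bool checkNoArchiveNonBlocking ( XmlDoc *xd , State2 *st ) {
	char *na = xd->getIsNoArchive();
	// blocked: bail out and wait for the registered callback
	if ( na == (void *)-1 ) return false;
	// error: g_errno was set, reply and free state
	if ( ! na ) return sendErrorReply ( st , g_errno );
	// got a real answer
	return true;
}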
void DataFeed::parse ( char *dataFeedPage,
		       long  dataFeedPageLen ) {
	// use Xml Class to parse up the page
	Xml xml;
	xml.set ( csUTF8, dataFeedPage, dataFeedPageLen, false, 0, false,
		  TITLEREC_CURRENT_VERSION );
	// get the nodes
	long numNodes  = xml.getNumNodes();
	XmlNode *nodes = xml.getNodes();
	// to count the tiers, result levels, and level costs
	long currTier = 0;
	long currResultLevel = 0;
	long currLevelCost = 0;
	// pull out the keywords for the data feed
	for (long i = 0; i < numNodes; i++) {
		// skip if this isn't a meta tag, shouldn't happen
		if (nodes[i].m_nodeId != 68)
			continue;
		// get the meta tag name
		//long tagLen;
		//char *tag = xml.getString(i, "name", &tagLen);
		long  ucTagLen;
		char *ucTag = xml.getString(i, "name", &ucTagLen);
		char tag[256];
		long tagLen = utf16ToLatin1 ( tag, 256,
					      (UChar*)ucTag, ucTagLen>>1 );
		// skip if empty
		if (!tag || tagLen <= 0)
			continue;
		// get the content
		long ucConLen;
		char *ucCon = xml.getString(i, "content", &ucConLen);
		char con[1024];
		long conLen = utf16ToLatin1 ( con, 1024,
					      (UChar*)ucCon, ucConLen>>1 );
		if (!con || conLen <= 0)
			continue;
		// match the meta tag to its local var and copy content
		if (tagLen == 10 && strncasecmp(tag, "customerid", 10) == 0)
			m_customerId = atoll(con);
		else if (tagLen == 11 && strncasecmp(tag, "datafeedurl", 11) == 0)
			setUrl(con, conLen);
		else if (tagLen == 8 && strncasecmp(tag, "passcode", 8) == 0)
			m_passcodeLen = setstr(m_passcode, MAX_PASSCODELEN, con, conLen);
		else if (tagLen == 6 && strncasecmp(tag, "status", 6) == 0)
			m_isActive = (bool)atoi(con);
		else if (tagLen == 6 && strncasecmp(tag, "locked", 6) == 0)
			m_isLocked = (bool)atoi(con);
		else if (tagLen == 14 && 
			 strncasecmp(tag, "dfcreationtime", 14) == 0)
			m_creationTime = atol(con);
		else if (tagLen == 8 && strncasecmp(tag, "numtiers", 8) == 0)
			m_priceTable.m_numTiers = atol(con);
		else if (tagLen == 15 && strncasecmp(tag, "numresultlevels", 15) == 0)
			m_priceTable.m_numResultLevels = atol(con);
		else if (tagLen == 10 && strncasecmp(tag, "monthlyfee", 10) == 0)
			m_priceTable.m_monthlyFee = atol(con);
		else if (tagLen == 7 && strncasecmp(tag, "tiermax", 7) == 0) {
			m_priceTable.m_tierMax[currTier] = (unsigned long)atol(con);
			currTier++;
		}
		else if (tagLen == 11 && strncasecmp(tag, "resultlevel", 11) == 0) {
			m_priceTable.m_resultLevels[currResultLevel] = (unsigned long)atol(con);
			currResultLevel++;
		}
		else if (tagLen == 9 && strncasecmp(tag, "levelcost", 9) == 0) {
			m_priceTable.m_levelCosts[currLevelCost] = (unsigned long)atol(con);
			currLevelCost++;
		}
		else
			log(LOG_INFO, "datafeed: Invalid Meta Tag Parsed [%li]:"
			    " %s", tagLen, tag);
	}
}
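// A hedged sketch of the kind of page DataFeed::parse() consumes. The meta
// tag names come from the checks in the loop above; the layout and values
// are assumptions for illustration only, as is the helper name.
void parseExampleDataFeedPage ( DataFeed &df ) {
	char page[] =
	    "<html><head>"
	    "<meta name=\"customerid\"  content=\"12345\">"
	    "<meta name=\"datafeedurl\" content=\"http://feeds.example.com/df.xml\">"
	    "<meta name=\"status\"      content=\"1\">"
	    "<meta name=\"numtiers\"    content=\"2\">"
	    "<meta name=\"tiermax\"     content=\"1000\">"
	    "<meta name=\"tiermax\"     content=\"10000\">"
	    "</head></html>";
	df.parse ( page , (long)strlen(page) );
}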
void Blaster::gotDoc4 ( void *state, TcpSocket *s){
	StateBD *st=(StateBD *)state;
	st->m_numUrlDocsReceived++;
	if (!s) {
		//Shouldn't happen, but still putting a checkpoint
		log (LOG_WARN,"blaster: Got a null s in gotDoc4."
		     "Happened because ip could not be found for gigablast"
		     "server");
		if (st->m_numUrlDocsReceived==st->m_numUrlDocsSent){
			m_launched--;
			// Free stateBD
			freeStateBD(st);
		}
		return;
	}
	// bail if got cut off
	if ( s->m_readOffset == 0 ) {
		log("blasterDiff : lost the Request in gotDoc4");
		if (st->m_numUrlDocsReceived==st->m_numUrlDocsSent){
			m_launched--;
			freeStateBD(st);
		}
		return;
	}
	char *reply = s->m_readBuf ;
	long  size  = s->m_readOffset;
	HttpMime mime;
	mime.set ( reply , size , NULL );
	char *content    = reply + mime.getMimeLen();
	long  contentLen = size  - mime.getMimeLen();

	//short csEnum = get_iana_charset(mime.getCharset(), 
	//				mime.getCharsetLen());
	/*	if (csEnum == csUnknown)
		log(LOG_DEBUG, "blaster: Unknown charset : %s", mime.getCharset());*/
	
	Xml xml;
	if (!xml.set(
		     content, 
		     contentLen,
		     false,
		     0,
		     false,
		     TITLEREC_CURRENT_VERSION)){
		log(LOG_WARN,"blaster: Couldn't set XML Class in gotDoc4");
	}
	Links links;
	Url *url=mime.getLocationUrl();
	if (!links.set(0,//siterec xml
		       &xml,
		       url,
		       false,
		       NULL,
		       TITLEREC_CURRENT_VERSION,
		       0,
		       false,
		       NULL)){
		log(LOG_WARN, "blaster: Coudn't set Links class in gotDoc4");
	}
	for (long i=0;i<links.getNumLinks();i++){
		char *ss=links.getLink(i);
		char *p;
		// This page *should* always be a gigablast page. So not adding
		// checks for msn or yahoo or google page.
		p=strstr(ss,"google.");
		if(p) continue;
		p=strstr(ss,"cache:");  //googles cache page
		if(p) continue;
		p= strstr(ss,"gigablast.");
		if(p) continue;
		p= strstr(ss,"web.archive.org");//older copies on gigablast
		if(p) continue;
		p= strstr(ss,"search.yahoo.com");//from gigablast search
		if(p) continue;
		p= strstr(ss,"search.msn.com");//from gigablast search
		if(p) continue;
		p= strstr(ss,"s.teoma.com");//from gigablast search
		if(p) continue;
		p= strstr(ss,"search.dmoz.org");//from gigablast search
		if(p) continue;
		p= strstr(ss,"www.answers.com");//from gigablast search
		if(p) continue;
		if (m_verbose)
			log(LOG_WARN,"blaster: Link Present on server2=%s",ss);
	}
	
	// So if one of the links that is returned is the exact url,
	// then we know that the url is present.So get the url from the
	// mime, search for it in the links that are returned.
	char tmp[1024];
	char *sendBuf=s->m_sendBuf;
	char *p1,*p2;

	// First get the Host, which is the domain. Since socket s is going to
	// be useless after this function, changing m_sendBuf instead of using 
	// more space
	p1=strstr(sendBuf,"%3A");
	if(p1){
		p1+=3;
		p2=strstr(p1," HTTP");
		if (p2){
			//Since I do not care about the sendbuf anymore
			*p2='\0';
		}
	}
	if (!p1 || !p2){
		log(LOG_WARN,"blasterdiff: Could not find search link"
		    "from m_sendBuf in gotdoc4");
	}
	else{
		sprintf(tmp,"%s",p1);
		//log(LOG_WARN,"blaster: tmp in gotDoc4 = %s",tmp);
		bool isFound=false;
		// So now we search for tmp in the links
		for (long i=0;i<links.getNumLinks();i++){
			if(strstr(links.getLink(i),tmp) && 
			   links.getLinkLen(i)==(int)gbstrlen(tmp)){
				isFound=true;
				log(LOG_WARN,"blaster: %s in results1 but not"
				    " in results2 for query %s but does exist"
				    " in server2",tmp,st->m_u1);//->getQuery()
			}
		}
		if (!isFound)
			log(LOG_WARN,"blaster: %s in results1 but not"
			    " in results2 for query %s and does NOT exist"
			    " in server2",tmp,st->m_u1); // ->getQuery()
	}
	

      	if (st->m_numUrlDocsReceived==st->m_numUrlDocsSent){
		m_launched--;
		// Free stateBD
		freeStateBD(st);
	}
	return;
}
// returns length of stripped content, but will set g_errno and return -1
// on error
int32_t stripHtml( char *content, int32_t contentLen, int32_t version, int32_t strip ) {
	if ( !strip ) {
		log( LOG_WARN, "query: html stripping not required!" );
		return contentLen;
	}
	if ( ! content )
		return 0;
	if ( contentLen == 0 )
		return 0;

	// filter content if we should
	// keep this on the big stack so "content" still references something
	Xml tmpXml;
	// . get the content as xhtml (should be NULL terminated)
	// . parse as utf8 since all we are doing is messing with 
	//   the tags...content manipulation comes later
	if ( !tmpXml.set( content, contentLen, version, CT_HTML ) ) {
		return -1;
	}

	//if( strip == 4 )
	//	return tmpXml.getText( content, contentLen );

	// go tag by tag
	int32_t     n       = tmpXml.getNumNodes();
	XmlNode *nodes   = tmpXml.getNodes();
	// Xml class may have converted to utf16
	content    = tmpXml.getContent();
	contentLen = tmpXml.getContentLen();
	char    *x       = content;
	char    *xend    = content + contentLen;
	int32_t     stackid = -1;
	int32_t     stackc  =  0;
	char     skipIt  =  0;
	// . hack COL tag to NOT require a back tag
	// . do not leave it that way as it could mess up our parsing
	//g_nodes[25].m_hasBackTag = 0;
	for ( int32_t i = 0 ; i < n ; i++ ) {
		// get id of this node
		int32_t id = nodes[i].m_nodeId;
		
		// if strip is 4, just remove the script tag
		if( strip == 4 ){
			if ( id ){
				if ( id == TAG_SCRIPT ){
					skipIt ^= 1;
					continue;
				}
			}
			else if ( skipIt ) continue;
			goto keepit;
		}
		
		// if strip is 3, ALL tags will be removed!
		if( strip == 3 ) {
			if( id ) {
				// . we dont want anything in between:
				//   - script tags (83)
				//   - style tags  (111)
				if ((id == TAG_SCRIPT) || (id == TAG_STYLE)) skipIt ^= 1;
				// save img to have alt text kept.
				if ( id == TAG_IMG  ) goto keepit;
				continue;
			}
			else {
				if( skipIt ) continue;
				goto keepit;
			}
		}
		// get it
		int32_t fk;
		if   ( strip == 1 ) fk = g_nodes[id].m_filterKeep1;
		else                fk = g_nodes[id].m_filterKeep2;
		// if tag is <link ...> only keep it if it has
		// rel="stylesheet" or rel=stylesheet
		if ( strip == 2 && id == TAG_LINK ) { // <link> tag id
			int32_t   fflen;
			char *ff = nodes[i].getFieldValue ( "rel" , &fflen );
			if ( ff && fflen == 10 &&
			     strncmp(ff,"stylesheet",10) == 0 )
				goto keepit;
		}
		// just remove just the tag if this is 2
		if ( fk == 2 ) continue;
		// keep it if not in a stack
		if ( ! stackc && fk ) goto keepit;
		// if no front/back for tag, just skip it
		if ( ! nodes[i].m_hasBackTag ) continue;
		// start stack if none
		if ( stackc == 0 ) {
			// but not if this is a back tag
			if ( nodes[i].m_node[1] == '/' ) continue;
			// now start the stack
			stackid = id;
			stackc  =  1;
			continue;
		}
		// skip if this tag does not match what is on stack
		if ( id != stackid ) continue;
		// if ANOTHER front tag, inc stack
		if ( nodes[i].m_node[1] != '/' ) stackc++;
		// otherwise, dec the stack count
		else                             stackc--;
		// . ensure not negative from excess back tags
		// . reset stackid to -1 to indicate no stack
		if ( stackc <= 0 ) { stackid= -1; stackc = 0; }
		// skip it
		continue;
	keepit:
		// replace images with their alt text
		int32_t vlen;
		char *v;
		if ( id == TAG_IMG ) {
			v = nodes[i].getFieldValue("alt", &vlen );
			// try title if no alt text
			if ( ! v )
				v = nodes[i].getFieldValue("title", &vlen );
			if ( v ) { gbmemcpy ( x, v, vlen ); x += vlen; }
			continue;
		}
		// remove background image from body,table,td tags
		if ( id == TAG_BODY || id == TAG_TABLE || id == TAG_TD ) {
			v = nodes[i].getFieldValue("background", &vlen);
			// remove background, just sabotage it
			if ( v ) v[-4] = 'x';
		}
		// store it
		gbmemcpy ( x , nodes[i].m_node , nodes[i].m_nodeLen );
		x += nodes[i].m_nodeLen;
		// sanity check
		if ( x > xend ) { g_process.shutdownAbort(true);}
	}
	contentLen = x - content;
	content [ contentLen ] = '\0';
	// unhack COL tag
	//g_nodes[25].m_hasBackTag = 1;
	return contentLen;
}
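// A minimal sketch of calling stripHtml() in place on a mutable buffer.
// strip==1 selects the m_filterKeep1 table used above; on failure stripHtml()
// returns -1 with g_errno set (mstrerror() is assumed to be available, as in
// the rest of this codebase).
int32_t stripInPlace ( char *buf , int32_t bufLen ) {
	int32_t newLen = stripHtml ( buf , bufLen , TITLEREC_CURRENT_VERSION , 1 );
	if ( newLen == -1 ) {
		log ( LOG_WARN , "query: stripHtml failed: %s" , mstrerror(g_errno) );
		return -1;
	}
	// stripHtml() NUL-terminates the shortened buffer at newLen
	return newLen;
}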