bool JsonItem::getCompoundName ( SafeBuf &nameBuf ) {

	// reset, but don't free mem etc. just set m_length to 0
	nameBuf.reset();
	// get its full compound name like "meta.twitter.title"
	JsonItem *p = this;
	char *lastName = NULL;
	char *nameArray[20];
	int32_t  numNames = 0;
	for ( ; p ; p = p->m_parent ) {
		// empty name?
		if ( ! p->m_name ) continue;
		if ( ! p->m_name[0] ) continue;
		// dup? can happen with arrays. parent of string
		// in object, has same name as his parent, the
		// name of the array. "dupname":[{"a":"b"},{"c":"d"}]
		if ( p->m_name == lastName ) continue;
		// update
		lastName = p->m_name;
		// add it up
		nameArray[numNames++] = p->m_name;
		// stop well short of nameArray's 20 slots
		if ( numNames < 15 ) continue;
		log("build: too many names in json tag");
		break;
	}
	// assemble the names in reverse order to restore the original order
	for ( int32_t i = 1 ; i <= numNames ; i++ ) {
		// copy into our safebuf
		if ( ! nameBuf.safeStrcpy ( nameArray[numNames-i]) ) 
			return false;
		// separate names with periods
		if ( ! nameBuf.pushChar('.') ) return false;
	}
	// remove last period
	nameBuf.removeLastChar('.');
	// and null terminate
	if ( ! nameBuf.nullTerm() ) return false;
	// change all :'s in names to .'s since : is reserved!
	char *px = nameBuf.getBufStart();
	for ( ; *px ; px++ ) if ( *px == ':' ) *px = '.';

	return true;
}
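
// A usage sketch for getCompoundName() (illustrative, not part of the
// original file; it assumes the Json parser API used elsewhere in this
// codebase, i.e. Json::parseJsonStringIntoJsonItems(), JsonItem::m_next
// and the JT_STRING type). For {"meta":{"twitter":{"title":"Foo"}}} the
// string item "Foo" should yield the compound name "meta.twitter.title".
static void logCompoundNames ( char *json ) {
	Json parser;
	// parse with niceness 0; returns the first item or NULL on error
	JsonItem *ji = parser.parseJsonStringIntoJsonItems ( json , 0 );
	for ( ; ji ; ji = ji->m_next ) {
		// only leaf string values have interesting compound names
		if ( ji->m_type != JT_STRING ) continue;
		SafeBuf nameBuf;
		if ( ! ji->getCompoundName ( nameBuf ) ) continue;
		log("json: compound name = %s", nameBuf.getBufStart());
	}
}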
// returns false if blocked, true otherwise
bool processLoop ( void *state ) {
	// get it
	State2 *st = (State2 *)state;
	// get the tcp socket from the state
	TcpSocket *s = st->m_socket;
	// get the xml doc from the state
	XmlDoc *xd = &st->m_xd;

	if ( ! xd->m_loaded ) {
		// setting just the docid. niceness is 0.
		//xd->set3 ( st->m_docId , st->m_coll , 0 );
		// callback
		xd->setCallback ( state , processLoop );
		// . and tell it to load from the old title rec
		// . this sets xd->m_oldTitleRec/m_oldTitleRecSize
		// . this sets xd->ptr_* and all other member vars from
		//   the old title rec if found in titledb.
		if ( ! xd->loadFromOldTitleRec ( ) ) return false;
	}

	if ( g_errno ) return sendErrorReply ( st , g_errno );
	// now force it to load old title rec
	//char **tr = xd->getTitleRec();
	SafeBuf *tr = xd->getTitleRecBuf();
	// blocked? return false if so. it will call processLoop() when it rets
	if ( tr == (void *)-1 ) return false;
	// we did not block. check for error? this will free "st" too.
	if ( ! tr ) return sendErrorReply ( st , g_errno );
	// if title rec was empty, that is a problem
	if ( xd->m_titleRecBuf.length() == 0 ) 
		return sendErrorReply ( st , ENOTFOUND);

	// set callback
	char *na = xd->getIsNoArchive();
	// wait if blocked
	if ( na == (void *)-1 ) return false;
	// error?
	if ( ! na ) return sendErrorReply ( st , g_errno );
	// forbidden? allow turkeys through though...
	if ( ! st->m_isMasterAdmin && *na )
		return sendErrorReply ( st , ENOCACHE );

	SafeBuf *sb = &st->m_sb;


	// &page=4 will print rainbow sections
	if ( ! st->m_printed && st->m_r.getLong("page",0) ) {
		// do not repeat this call
		st->m_printed = true;
		// this will call us again since we called
		// xd->setCallback() above to us
		if ( ! xd->printDocForProCog ( sb , &st->m_r ) )
			return false;
	}

	char *contentType = "text/html";
	char format = st->m_format;
	if ( format == FORMAT_XML ) contentType = "text/xml";
	if ( format == FORMAT_JSON ) contentType = "application/json";

	// if we printed a special page (like rainbow sections) then return now
	if ( st->m_printed ) {
		bool status = g_httpServer.sendDynamicPage (s,
							    //buf,bufLen,
							    sb->getBufStart(),
							    sb->getLength(),
							    -1,false,
							    //"text/html",
							    contentType,
							    -1, NULL, "utf8" );
		// nuke state2
		mdelete ( st , sizeof(State2) , "PageGet1" );
		delete (st);
		return status;
	}

	/*
	  // this was calling XmlDoc and setting sections, etc. to
	  // get the SpiderReply junk... no no no
	// is it banned or filtered? this ignores the TagRec in the titleRec
	// and uses msg8a to get it fresh instead
	char *vi = xd->getIsFiltered();//Visible( );
	// wait if blocked
	if ( vi == (void *)-1 ) return false;
	// error?
	if ( ! vi ) return sendErrorReply ( st , g_errno );
	// banned?
	if ( ! st->m_isMasterAdmin && ! *vi ) return sendErrorReply (st,EDOCBANNED);
	*/

	// get the utf8 content
	char **utf8 = xd->getUtf8Content();
	//int32_t   len  = xd->size_utf8Content - 1;
	// wait if blocked
	if ( utf8 == (void *)-1 ) return false;
	// alloc error? check this before the size, which is not valid when
	// getUtf8Content() fails outright
	if ( ! utf8 ) return sendErrorReply ( st , g_errno );
	// strange
	if ( xd->size_utf8Content <= 0 ) {
		log("pageget: utf8 content <= 0");
		return sendErrorReply ( st , EBADENGINEER );
	}

	// get this host
	Host *h = g_hostdb.getHost ( g_hostdb.m_hostId );
	if ( ! h ) {
		log("pageget: hostid %"INT32" is bad",g_hostdb.m_hostId);
		return sendErrorReply(st,EBADENGINEER );
	}


	char *content    = xd->ptr_utf8Content;
	int32_t  contentLen = xd->size_utf8Content - 1;

	// shortcut
	char strip = st->m_strip;

	// alloc buffer now
	//char *buf = NULL;
	//int32_t  bufMaxSize = 0;
	//bufMaxSize = len + ( 32 * 1024 ) ;
	//bufMaxSize = contentLen + ( 32 * 1024 ) ;
	//buf        = (char *)mmalloc ( bufMaxSize , "PageGet2" );
	//char *p          = buf;
	//char *bufEnd     = buf + bufMaxSize;
	//if ( ! buf ) {
	//	return sendErrorReply ( st , g_errno );
	//}

	// for undoing the header
	//char *start1 = p;
	int32_t startLen1 = sb->length();

	// we are always utf8
	if ( strip != 2 )
		sb->safePrintf( "<meta http-equiv=\"Content-Type\" "
			     "content=\"text/html;charset=utf8\">\n");

	// base href
	//Url *base = &xd->m_firstUrl;
	//if ( xd->ptr_redirUrl.m_url[0] )
	//	base = &xd->m_redirUrl;
	char *base = xd->ptr_firstUrl;
	if ( xd->ptr_redirUrl ) base = xd->ptr_redirUrl;
	//Url *redir = *xd->getRedirUrl();
	if ( strip != 2 ) {
		sb->safePrintf ( "<BASE HREF=\"%s\">" , base );
		//p += gbstrlen ( p );
	}

	// default colors in case css files missing
	if ( strip != 2 ) {
		sb->safePrintf( "\n<style type=\"text/css\">\n"
			  "body{background-color:white;color:black;}\n"
			  "</style>\n");
		//p += gbstrlen ( p );
	}

	//char format = st->m_format;
	if ( format == FORMAT_XML ) sb->reset();
	if ( format == FORMAT_JSON ) sb->reset();

	if ( xd->m_contentType == CT_JSON ) sb->reset();
	if ( xd->m_contentType == CT_XML  ) sb->reset();
	if ( xd->m_contentType == CT_STATUS ) sb->reset();

	// for undoing the stuff below
	int32_t startLen2 = sb->length();//p;

	// query should be NULL terminated
	char *q    = st->m_qsb.getBufStart();
	int32_t  qlen = st->m_qsb.getLength(); // m_qlen;

	char styleTitle[128] =  "font-size:14px;font-weight:600;"
				"color:#000000;";
	char styleText[128]  =  "font-size:14px;font-weight:400;"
				"color:#000000;";
	char styleLink[128] =  "font-size:14px;font-weight:400;"
				"color:#0000ff;";
	char styleTell[128] =  "font-size:14px;font-weight:600;"
				"color:#cc0000;";

	// get the url of the title rec
	Url *f = xd->getFirstUrl();

	bool printDisclaimer = st->m_printDisclaimer;

	if ( xd->m_contentType == CT_JSON )
		printDisclaimer = false;

	if ( xd->m_contentType == CT_STATUS )
		printDisclaimer = false;

	if ( format == FORMAT_XML ) printDisclaimer = false;
	if ( format == FORMAT_JSON ) printDisclaimer = false;

	char tbuf[100];
	tbuf[0] = 0;
	time_t lastSpiderDate = xd->m_spideredTime;

	if ( printDisclaimer ||
	     format == FORMAT_XML ||
	     format == FORMAT_JSON ) {
		struct tm *timeStruct = gmtime ( &lastSpiderDate );
		strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct);
	}

	// We should always be displaying this disclaimer.
	// - May eventually want to display this at a different location
	//   on the page, or on the click 'n' scroll browser page itself
	//   when this page is not being viewed solo.
	// CNS: if ( ! st->m_clickNScroll ) {
	if ( printDisclaimer ) {

		sb->safePrintf(//sprintf ( p , 
			  //"<BASE HREF=\"%s\">"
			  //"<table border=1 width=100%%>"
			  //"<tr><td>"
			  "<table border=\"1\" bgcolor=\"#"
			  BGCOLOR
			  "\" cellpadding=\"10\" "
			  //"id=\"gbcnsdisctable\" class=\"gbcnsdisctable_v\""
			  "cellspacing=\"0\" width=\"100%%\" color=\"#ffffff\">"
			  "<tr"
			  //" id=\"gbcnsdisctr\" class=\"gbcnsdisctr_v\""
			  "><td>"
			  //"<font face=times,sans-serif color=black size=-1>"
			  "<span style=\"%s\">"
			  "This is Gigablast's cached page of </span>"
			  "<a href=\"%s\" style=\"%s\">%s</a>"
			  "" , styleTitle, f->getUrl(), styleLink,
			  f->getUrl() );
		//p += gbstrlen ( p );
		// then the rest
		//sprintf(p , 
		sb->safePrintf(
			"<span style=\"%s\">. "
			"Gigablast is not responsible for the content of "
			"this page.</span>", styleTitle );
		//p += gbstrlen ( p );

		sb->safePrintf ( "<br/><span style=\"%s\">"
			  "Cached: </span>"
			  "<span style=\"%s\">",
			  styleTitle, styleText );
		//p += gbstrlen ( p );

		// then the spider date in GMT
		// time_t lastSpiderDate = xd->m_spideredTime;
		// struct tm *timeStruct = gmtime ( &lastSpiderDate );
		// char tbuf[100];
		// strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct);
		//p += gbstrlen ( p );
		sb->safeStrcpy(tbuf);

		// Moved over from PageResults.cpp
		sb->safePrintf( "</span> - <a href=\""
			      "/get?"
			      "q=%s&amp;c=%s&amp;rtq=%"INT32"&amp;"
			      "d=%"INT64"&amp;strip=1\""
			      " style=\"%s\">"
			      "[stripped]</a>", 
			      q , st->m_coll , 
			      (int32_t)st->m_rtq,
			      st->m_docId, styleLink ); 

		// a link to alexa
		if ( f->getUrlLen() > 5 ) {
			sb->safePrintf( " - <a href=\"http:"
					 "//web.archive.org/web/*/%s\""
					 " style=\"%s\">"
					 "[older copies]</a>" ,
					 f->getUrl(), styleLink );
		}

		if (st->m_noArchive){
			sb->safePrintf( " - <span style=\"%s\"><b>"
				     "[NOARCHIVE]</b></span>",
				     styleTell );
		}
		if (st->m_isBanned){
			sb->safePrintf(" - <span style=\"%s\"><b>"
				     "[BANNED]</b></span>",
				     styleTell );
		}

		// only print this if we got a query
		if ( qlen > 0 ) {
			sb->safePrintf("<br/><br/><span style=\"%s\"> "
				   "These search terms have been "
				   "highlighted:  ",
				   styleText );
			//p += gbstrlen ( p );
		}
		
	}

	// how much space left in p?
	//int32_t avail = bufEnd - p;
	// . make the url that we're outputting for (like in PageResults.cpp)
	// . "thisUrl" is the baseUrl for click & scroll
	char thisUrl[MAX_URL_LEN];
	char *thisUrlEnd = thisUrl + MAX_URL_LEN;
	char *x = thisUrl;
	// . use the external ip of our gateway
	// . construct the NAT mapped port
	// . you should have used iptables to map port to the correct
	//   internal ip:port
	//uint32_t  ip   =g_conf.m_mainExternalIp  ; // h->m_externalIp;
	//uint16_t port=g_conf.m_mainExternalPort;//h->m_externalHttpPort
	// local check
	//if ( st->m_isLocal ) {
	uint32_t  ip   = h->m_ip;
	uint16_t port = h->m_httpPort;
	//}
	//sprintf ( x , "http://%s:%"INT32"/get?q=" , iptoa ( ip ) , port );
	// . we no longer put the port in here
	// . but still need http:// since we use <base href=>
	if (port == 80) sprintf(x,"http://%s/get?q=",iptoa(ip));
	else            sprintf(x,"http://%s:%hu/get?q=",iptoa(ip),port);
	x += gbstrlen ( x );
	// the query url encoded
	int32_t elen = urlEncode ( x , thisUrlEnd - x , q , qlen );
	x += elen;
	// separate cgi vars with a &
	//sprintf ( x, "&seq=%"INT32"&rtq=%"INT32"d=%"INT64"",
	//	  (int32_t)st->m_seq,(int32_t)st->m_rtq,st->m_msg22.getDocId());
	sprintf ( x, "&d=%"INT64"",st->m_docId );
	x += gbstrlen(x);		
	// set our query for highlighting
	Query qq;
	qq.set2 ( q, st->m_langId , true );

	// print the query terms into our highlight buffer
	Highlight hi;
	// make words so we can set the scores to ignore fielded terms
	Words qw;
	qw.set ( q            ,  // content being highlighted, utf8
		 qlen         ,  // content being highlighted, utf8
		 TITLEREC_CURRENT_VERSION,
		 true         ,  // computeIds
		 false        ); // hasHtmlEntities?
	// . assign scores of 0 to query words that should be ignored
	// . TRICKY: loop over words in qq.m_qwords, but they should be 1-1
	//   with words in qw.
	// . sanity check
	//if ( qw.getNumWords() != qq.m_numWords ) { char *xx = NULL; *xx = 0;}
	// declare up here
	Matches m;
	// do the loop
	//Scores ss;
	//ss.set ( &qw , NULL );
	//for ( int32_t i = 0 ; i < qq.m_numWords ; i++ )
	//	if ( ! m.matchWord ( &qq.m_qwords[i],i ) ) ss.m_scores[i] = 0;
	// now set m.m_matches[] to those words in qw that match a query word
	// or phrase in qq.
	m.setQuery ( &qq );
	//m.addMatches ( &qw , &ss , true );
	m.addMatches ( &qw );
	int32_t hilen = 0;

	// CNS: if ( ! st->m_clickNScroll ) {
	// and highlight the matches
	if ( printDisclaimer ) {
		hilen = hi.set ( //p       ,
				 //avail   ,
				sb ,
				 &qw     , // words to highlight
				 &m      , // matches relative to qw
				 false   , // doStemming
				 false   , // st->m_clickAndScroll , 
				 (char *)thisUrl );// base url for ClcknScrll
		//p += hilen;
		// now an hr
		//gbmemcpy ( p , "</span></table></table>\n" , 24 );   p += 24;
		sb->safeStrcpy("</span></table></table>\n");
	}


	bool includeHeader = st->m_includeHeader;

	// do not show header for json object display
	if ( xd->m_contentType == CT_JSON )
		includeHeader = false;
	if ( xd->m_contentType == CT_XML )
		includeHeader = false;
	if ( xd->m_contentType == CT_STATUS )
		includeHeader = false;

	if ( format == FORMAT_XML ) includeHeader = false;
	if ( format == FORMAT_JSON ) includeHeader = false;

	//mfree(uq, uqCapacity, "PageGet");
	// undo the header writes if we should
	if ( ! includeHeader ) {
		// including base href is off by default when not including
		// the header, so the caller must explicitly turn it back on
		if ( st->m_includeBaseHref ) sb->m_length=startLen2;//p=start2;
		else                         sb->m_length=startLen1;//p=start1;
	}

	//sb->safeStrcpy(tbuf);



	if ( format == FORMAT_XML ) {
		sb->safePrintf("<response>\n");
		sb->safePrintf("<statusCode>0</statusCode>\n");
		sb->safePrintf("<statusMsg>Success</statusMsg>\n");
		sb->safePrintf("<url><![CDATA[");
		sb->cdataEncode(xd->m_firstUrl.m_url);
		sb->safePrintf("]]></url>\n");
		sb->safePrintf("<docId>%"UINT64"</docId>\n",xd->m_docId);
		sb->safePrintf("\t<cachedTimeUTC>%"INT32"</cachedTimeUTC>\n",
			       (int32_t)lastSpiderDate);
		sb->safePrintf("\t<cachedTimeStr>%s</cachedTimeStr>\n",tbuf);
	}

	if ( format == FORMAT_JSON ) {
		sb->safePrintf("{\"response\":{\n");
		sb->safePrintf("\t\"statusCode\":0,\n");
		sb->safePrintf("\t\"statusMsg\":\"Success\",\n");
		sb->safePrintf("\t\"url\":\"");
		sb->jsonEncode(xd->m_firstUrl.m_url);
		sb->safePrintf("\",\n");
		sb->safePrintf("\t\"docId\":%"UINT64",\n",xd->m_docId);
		sb->safePrintf("\t\"cachedTimeUTC\":%"INT32",\n",
			       (int32_t)lastSpiderDate);
		sb->safePrintf("\t\"cachedTimeStr\":\"%s\",\n",tbuf);
	}

	
	// identify start of <title> tag we wrote out
	char *sbstart = sb->getBufStart();
	char *sbend   = sb->getBufEnd();
	char *titleStart = NULL;
	char *titleEnd   = NULL;

	char ctype = (char)xd->m_contentType;

	// do not calc title or print it if doc is xml or json
	if ( ctype == CT_XML ) sbend = sbstart;
	if ( ctype == CT_JSON ) sbend = sbstart;
	if ( ctype == CT_STATUS ) sbend = sbstart;

	for ( char *t = sbstart ; t < sbend ; t++ ) {
		// title tag?
		if ( t[0]!='<' ) continue;
		if ( to_lower_a(t[1])!='t' ) continue;
		if ( to_lower_a(t[2])!='i' ) continue;
		if ( to_lower_a(t[3])!='t' ) continue;
		if ( to_lower_a(t[4])!='l' ) continue;
		if ( to_lower_a(t[5])!='e' ) continue;
		// point to it
		char *x = t + 5;
		// max - to keep things fast
		char *max = x + 500;
		for ( ; *x && *x != '>' && x < max ; x++ );
		x++;
		// find end
		char *e = x;
		for ( ; *e && e < max ; e++ ) {
			if ( e[0]=='<' &&
			     to_lower_a(e[1])=='/' &&
			     to_lower_a(e[2])=='t' &&
			     to_lower_a(e[3])=='i' &&
			     to_lower_a(e[4])=='t' &&
			     to_lower_a(e[5])=='l' &&
			     to_lower_a(e[6])=='e' )
				break;
		}
		if ( e < max ) {
			titleStart = x;
			titleEnd   = e;
		}
		break;
	}

	// . print title at top!
	// . consider moving
	if ( titleStart ) {

		char *ebuf = st->m_r.getString("eb");
		if ( ! ebuf ) ebuf = "";

		//p += sprintf ( p , 
		sb->safePrintf(
			       "<table border=1 "
			       "cellpadding=10 "
			       "cellspacing=0 "
			       "width=100%% "
			       "color=#ffffff>" );

		int32_t printLinks = st->m_r.getLong("links",0);

		if ( ! printDisclaimer && printLinks )
			sb->safePrintf(//p += sprintf ( p , 
				       // first put cached and live link
				       "<tr>"
				       "<td bgcolor=lightyellow>"
				       // print cached link
				       //"<center>"
				       "&nbsp; "
				       "<b>"
				       "<a "
				       "style=\"font-size:18px;font-weight:600;"
				       "color:#000000;\" "
				       "href=\""
				       "/get?"
				       "c=%s&d=%"INT64"&qh=0&cnsp=1&eb=%s\">"
				       "cached link</a>"
				       " &nbsp; "
				       "<a "
				       "style=\"font-size:18px;font-weight:600;"
				       "color:#000000;\" "
				       "href=%s>live link</a>"
				       "</b>"
				       //"</center>"
				       "</td>"
				       "</tr>\n"
				       ,st->m_coll
				       ,st->m_docId 
				       ,ebuf
				       ,thisUrl // st->ptr_ubuf
				       );

		if ( printLinks ) {
			sb->safePrintf(//p += sprintf ( p ,
				       "<tr><td bgcolor=pink>"
				       "<span style=\"font-size:18px;"
				       "font-weight:600;"
				       "color:#000000;\">"
				       "&nbsp; "
				       "<b>PAGE TITLE:</b> "
				       );
			int32_t tlen = titleEnd - titleStart;
			sb->safeMemcpy ( titleStart , tlen );
			sb->safePrintf ( "</span></td></tr>" );
		}

		sb->safePrintf( "</table><br>\n" );

	}

	// is the content preformatted?
	bool pre = false;
	if ( ctype == CT_TEXT ) pre = true ; // text/plain
	if ( ctype == CT_DOC  ) pre = true ; // filtered msword
	if ( ctype == CT_PS   ) pre = true ; // filtered postscript

	if ( format == FORMAT_XML ) pre = false;
	if ( format == FORMAT_JSON ) pre = false;

	// if it is content-type text, add a <pre>
	if ( pre ) {//p + 5 < bufEnd && pre ) {
		sb->safePrintf("<pre>");
		//p += 5;
	}

	if ( st->m_strip == 1 )
		contentLen = stripHtml( content, contentLen, 
					(int32_t)xd->m_version, st->m_strip );
	// it returns -1 and sets g_errno on error, like OOM
	if ( contentLen == -1 ) {
		//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );	
		return sendErrorReply ( st , g_errno );
	}

	Xml xml;
	Words ww;

	// if no highlighting, skip it
	bool queryHighlighting = st->m_queryHighlighting;
	if ( st->m_strip == 2 ) queryHighlighting = false;

	// do not do term highlighting if json
	if ( xd->m_contentType == CT_JSON )
		queryHighlighting = false;
	if ( xd->m_contentType == CT_STATUS )
		queryHighlighting = false;

	SafeBuf tmp;
	SafeBuf *xb = sb;
	if ( format == FORMAT_XML ) xb = &tmp;
	if ( format == FORMAT_JSON ) xb = &tmp;
	

	if ( ! queryHighlighting ) {
		xb->safeMemcpy ( content , contentLen );
		xb->nullTerm();
		//p += contentLen ;
	}
	else {
		// get the content as xhtml (should be NULL terminated)
		//Words *ww = xd->getWords();
		if ( ! xml.set ( content , contentLen , false ,
				 0 , false , TITLEREC_CURRENT_VERSION ,
				 false , 0 , CT_HTML ) ) { // niceness is 0
			//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
			return sendErrorReply ( st , g_errno );
		}			
		if ( ! ww.set ( &xml , true , 0 ) ) { // niceness is 0
			//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
			return sendErrorReply ( st , g_errno );
		}
		// sanity check
		//if ( ! xd->m_wordsValid ) { char *xx=NULL;*xx=0; }
		// how much space left in p?
		//avail = bufEnd - p;

		Matches m;
		m.setQuery ( &qq );
		m.addMatches ( &ww );
		hilen = hi.set ( xb , // p , avail , 
				 &ww , &m ,
				 false /*doStemming?*/ ,  
				 st->m_clickAndScroll , 
				 thisUrl /*base url for click & scroll*/);
		//p += hilen;
		log(LOG_DEBUG, "query: Done highlighting cached page content");
	}


	if ( format == FORMAT_XML ) {
		sb->safePrintf("\t<content><![CDATA[");
		sb->cdataEncode ( xb->getBufStart() );
		sb->safePrintf("]]></content>\n");
		sb->safePrintf("</response>\n");
	}

	if ( format == FORMAT_JSON ) {
		sb->safePrintf("\t\"content\":\"\n");
		sb->jsonEncode ( xb->getBufStart() );
		sb->safePrintf("\"\n}\n}\n");
	}


	// if it is content-type text, add a </pre>
	if ( pre ) { // p + 6 < bufEnd && pre ) {
		sb->safeMemcpy ( "</pre>" , 6 );
		//p += 6;
	}

	// calculate bufLen
	//int32_t bufLen = p - buf;

	/*

	  MDW: return the xml page as is now. 9/28/2014

	int32_t ct = xd->m_contentType;

	// now filter the entire buffer to escape out the xml tags
	// so it is displayed nice
	SafeBuf newbuf;

	if ( ct == CT_XML ) {
		// encode the xml tags into &lt;tagname&gt; sequences
		if ( !newbuf.htmlEncodeXmlTags ( sb->getBufStart() ,
						 sb->getLength(),
						 0)){// niceness=0
			//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
			return sendErrorReply ( st , g_errno );
		}
		// free out buffer that we alloc'd before returning since this
		// should have copied it into another buffer
		//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );	
		// reassign
		//buf    = newbuf.getBufStart();
		//bufLen = newbuf.length();
		sb->stealBuf ( &newbuf );
	}
	*/

	// now encapsulate it in html head/tail and send it off
	// sendErr:
	contentType = "text/html";
	if ( strip == 2 ) contentType = "text/xml";
	// xml is usually buggy and this throws browser off
	//if ( ctype == CT_XML ) contentType = "text/xml";

	if ( xd->m_contentType == CT_JSON )
		contentType = "application/json";

	if ( xd->m_contentType == CT_STATUS )
		contentType = "application/json";

	if ( xd->m_contentType == CT_XML )
		contentType = "test/xml";

	if ( format == FORMAT_XML ) contentType = "text/xml";
	if ( format == FORMAT_JSON ) contentType = "application/json";

	// safebuf, sb, is a member of "st" so this should copy the buffer
	// when it constructs the http reply, and we gotta call delete(st)
	// AFTER this so sb is still valid.
	bool status = g_httpServer.sendDynamicPage (s,
						    //buf,bufLen,
						    sb->getBufStart(),
						    sb->getLength(),
						    -1,false,
						    contentType,
						     -1, NULL, "utf8" );

	// nuke state2
	mdelete ( st , sizeof(State2) , "PageGet1" );
	delete (st);


	// free out buffer that we alloc'd before returning since this
	// should have copied it into another buffer

	//if      ( ct == CT_XML ) newbuf.purge();
	//else if ( buf          ) mfree ( buf , bufMaxSize , "PageGet2" );
	
	// and convey the status
	return status;
}
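
// The non-blocking convention processLoop() leans on, in miniature. This
// is an illustrative sketch only: getFoo() is a hypothetical stand-in for
// accessors like getTitleRecBuf() or getIsNoArchive(). Such accessors
// return -1 cast to a pointer when they have launched an asynchronous
// read and will fire the callback registered with xd->setCallback()
// later; the caller then returns false ("blocked") and is re-entered from
// the top. Because each accessor caches its result in a member variable,
// steps that already completed return instantly on re-entry.
//
//	char *foo = xd->getFoo();                  // hypothetical accessor
//	if ( foo == (void *)-1 ) return false;     // blocked, will re-enter
//	if ( ! foo ) return sendErrorReply ( st , g_errno ); // hard error
//	// foo is valid from here on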
bool qaspider1 ( ) {
	//
	// delete the 'qatest123' collection
	//
	//static bool s_x1 = false;
	if ( ! s_flags[0] ) {
		s_flags[0] = true;
		if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) )
			return false;
	}

	//
	// add the 'qatest123' collection
	//
	//static bool s_x2 = false;
	if ( ! s_flags[1] ) {
		s_flags[1] = true;
		if ( ! getUrl ( "/admin/addcoll?addcoll=qatest123&xml=1" , 
				// checksum of reply expected
				238170006 ) )
			return false;
	}

	// restrict hopcount to 0 or 1 in url filters so we do not spider
	// too deep
	//static bool s_z1 = false;
	if ( ! s_flags[2] ) {
		s_flags[2] = true;
		SafeBuf sb;
		sb.safePrintf("&c=qatest123&"
			      // make it the custom filter
			      "ufp=0&"

	       "fe=%%21ismanualadd+%%26%%26+%%21insitelist&hspl=0&hspl=1&fsf=0.000000&mspr=0&mspi=1&xg=1000&fsp=-3&"

			      // take out hopcount for now, just test quotas
			      //	       "fe1=tag%%3Ashallow+%%26%%26+hopcount%%3C%%3D1&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=3&"

			      // just one spider out allowed for consistency
	       "fe1=tag%%3Ashallow+%%26%%26+sitepages%%3C%%3D20&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=45&"

	       "fe2=default&hspl2=0&hspl2=1&fsf2=1.000000&mspr2=0&mspi2=1&xg2=1000&fsp2=45&"

		);
		if ( ! getUrl ( "/admin/filters",0,sb.getBufStart()) )
			return false;
	}

	// set the site list to a few sites
	//static bool s_z2 = false;
	if ( ! s_flags[3] ) {
		s_flags[3] = true;
		SafeBuf sb;
		sb.safePrintf("&c=qatest123&format=xml&sitelist=");
		sb.urlEncode("tag:shallow site:www.walmart.com\r\n"
			     "tag:shallow site:http://www.ibm.com/\r\n");
		sb.nullTerm();
		if ( ! getUrl ("/admin/settings",0,sb.getBufStart() ) )
			return false;
	}
		
	//
	// use the add url interface now
	// walmart.com above was not seeded because of the site: directive
	// so this will seed it.
	//
	//static bool s_y2 = false;
	if ( ! s_flags[4] ) {
		s_flags[4] = true;
		SafeBuf sb;
		// delim=+++URL:
		sb.safePrintf("&c=qatest123"
			      "&format=json"
			      "&strip=1"
			      "&spiderlinks=1"
			      "&urls=www.walmart.com+ibm.com"
			      );
		// . now a list of websites we want to spider
		// . the space is already encoded as +
		//sb.urlEncode(s_urls1);
		if ( ! getUrl ( "/admin/addurl",0,sb.getBufStart()) )
			return false;
	}

	//
	// wait for spidering to stop
	//
 checkagain:

	// wait until spider finishes. check the spider status page
	// in json to see when completed
	//static bool s_k1 = false;
	if ( ! s_flags[5] ) {
		// wait 3 seconds, then the sleep callback re-calls qatest()
		wait(3.0);
		s_flags[5] = true;
		return false;
	}

	if ( ! s_flags[15] ) {
		s_flags[15] = true;
		if ( ! getUrl ( "/admin/status?format=json&c=qatest123",0) )
			return false;
	}

	//static bool s_k2 = false;
	if ( ! s_flags[6] ) {
		// ensure spiders are done. 
		// "Nothing currently available to spider"
		if ( s_content&&!strstr(s_content,"Nothing currently avail")){
			s_flags[5] = false;
			s_flags[15] = false;
			goto checkagain;
		}
		s_flags[6] = true;
	}


	// wait for the msg4 index buffers to flush so all results are
	// searchable
	if ( ! s_flags[22] ) {
		s_flags[22] = true;
		wait(1.5);
		return false;
	}


	// verify no results for gbhopcount:2 query
	//static bool s_y4 = false;
	if ( ! s_flags[7] ) {
		s_flags[7] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
				"q=gbhopcount%3A2",
				-1672870556 ) )
			return false;
	}

	// but some for gbhopcount:0 query
	//static bool s_t0 = false;
	if ( ! s_flags[8] ) {
		s_flags[8] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
				"q=gbhopcount%3A0",
				908338607 ) )
			return false;
	}
	
	// check facet sections query for walmart
	//static bool s_y5 = false;
	if ( ! s_flags[9] ) {
		s_flags[9] = true;
		if ( ! getUrl ( "/search?c=qatest123&format=json&stream=1&"
				"q=gbfacetstr%3Agbxpathsitehash2492664135",
				55157060 ) )
			return false;
	}

	//static bool s_y6 = false;
	if ( ! s_flags[10] ) {
		s_flags[10] = true;
		if ( ! getUrl ( "/get?page=4&q=gbfacetstr:gbxpathsitehash2492664135&qlang=xx&c=qatest123&d=9861563119&cnsp=0" , 999 ) )
			return false;
	}

	// in xml
	//static bool s_y7 = false;
	if ( ! s_flags[11] ) {
		s_flags[11] = true;
		if ( ! getUrl ( "/get?xml=1&page=4&q=gbfacetstr:gbxpathsitehash2492664135&qlang=xx&c=qatest123&d=9861563119&cnsp=0" , 999 ) )
			return false;
	}

	// and json
	//static bool s_y8 = false;
	if ( ! s_flags[12] ) {
		s_flags[12] = true;
		if ( ! getUrl ( "/get?json=1&page=4&q=gbfacetstr:gbxpathsitehash2492664135&qlang=xx&c=qatest123&d=9861563119&cnsp=0" , 999 ) )
			return false;
	}


	// delete the collection
	//static bool s_fee = false;
	// if ( ! s_flags[13] ) {
	// 	s_flags[13] = true;
	// 	if ( ! getUrl ( "/admin/delcoll?delcoll=qatest123" ) )
	// 		return false;
	// }

	if ( ! s_flags[17] ) {
		s_flags[17] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
				"q=site2%3Awww.walmart.com+"
				"gbsortby%3Agbspiderdate",
				999 ) )
			return false;
	}

	// xpath is like a title here i think. check the returned
	// facet table in the left column
	if ( ! s_flags[18] ) {
		s_flags[18] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=html&"
				"q=gbfacetstr%3Agbxpathsitehash3624590799"
				, 999 ) )
			return false;
	}

	if ( ! s_flags[19] ) {
		s_flags[19] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&xml=1&"
				"q=gbfacetint%3Agbhopcount"
				, 999 ) )
			return false;
	}

	if ( ! s_flags[20] ) {
		s_flags[20] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&json=1&"
				"q=gbfacetint%3Alog.score"
				, 999 ) )
			return false;
	}

	if ( ! s_flags[21] ) {
		s_flags[21] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&xml=1&"
				"q=gbfacetfloat%3Atalks.rating"
				, 999 ) )
			return false;
	}

	if ( ! s_flags[23] ) {
		s_flags[23] = true;
		// test facets mixed with gigabits in left hand column
		if ( ! getUrl ( "/search?c=qatest123&qa=1&html=1&"
				"q=gbfacetint%3Agbhopcount+walmart"
				, 999 ) )
			return false;
	}


	//static bool s_fee2 = false;
	if ( ! s_flags[14] ) {
		s_flags[14] = true;
		log("qa: SUCCESSFULLY COMPLETED "
			"QA SPIDER1 TEST");
		return true;
	}

	return true;
}
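
// The s_flags[] idiom used throughout these qa tests, in miniature
// (illustrative sketch, not a real step): each step sets its flag BEFORE
// issuing the request, so when getUrl() blocks and its completion
// callback re-runs the whole test function, finished steps are skipped
// and execution resumes at the first step whose flag is still clear.
// Returning false means "blocked, call me again"; returning true means
// the whole test is done.
//
//	if ( ! s_flags[n] ) {
//		s_flags[n] = true;                 // mark first, then block
//		if ( ! getUrl ( "/some/page" ) )   // hypothetical step
//			return false;
//	}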
//
// the injection qa test suite
//
bool qainject1 ( ) {

	//if ( ! s_callback ) s_callback = qainject1;

	//
	// delete the 'qatest123' collection
	//
	//static bool s_x1 = false;
	if ( ! s_flags[0] ) {
		s_flags[0] = true;
		if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) )
			return false;
	}

	//
	// add the 'qatest123' collection
	//
	//static bool s_x2 = false;
	if ( ! s_flags[1] ) {
		s_flags[1] = true;
		if ( ! getUrl ( "/admin/addcoll?addcoll=qatest123&xml=1" , 
				// checksum of reply expected
				238170006 ) )
			return false;
	}

	// this only loads once
	loadUrls();
	long max = s_ubuf2.length()/(long)sizeof(char *);
	//max = 1;

	//
	// inject urls, return false if not done yet
	//
	//static bool s_x4 = false;
	if ( ! s_flags[2] ) {
		// TODO: try delimiter based injection too
		//static long s_ii = 0;
		for ( ; s_flags[20] < max ; ) {
			// inject using html api
			SafeBuf sb;
			sb.safePrintf("&c=qatest123&deleteurl=0&"
				      "format=xml&u=");
			sb.urlEncode ( s_urlPtrs[s_flags[20]] );
			// the content
			sb.safePrintf("&hasmime=1");
			// sanity
			//if ( strstr(s_urlPtrs[s_flags[20]],"wdc.htm") )
			//	log("hey");
			sb.safePrintf("&content=");
			sb.urlEncode(s_contentPtrs[s_flags[20]] );
			sb.nullTerm();
			// pre-inc it in case getUrl() blocks
			s_flags[20]++;//ii++;
			if ( ! getUrl("/admin/inject",
				      0, // no idea what crc to expect
				      sb.getBufStart()) )
				return false;
		}
		s_flags[2] = true;
	}

	// +the
	//static bool s_x5 = false;
	if ( ! s_flags[3] ) {
		wait(1.5);
		s_flags[3] = true;
		return false;
	}

	if ( ! s_flags[16] ) {
		s_flags[16] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=%2Bthe",
				702467314 ) )
			return false;
	}

	// sports news
	//static bool s_x7 = false;
	if ( ! s_flags[4] ) {
		s_flags[4] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
				"q=sports+news",2009472889 ) )
		     return false;
	}

	// 'washer & dryer' does some algorithmic synonyms 'washer and dryer'
	if ( ! s_flags[15] ) {
		s_flags[15] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
				"debug=1&q=washer+%26+dryer",9999 ) )
		     return false;
	}

	//
	// mdw: query reindex test
	//
	// if ( ! s_flags[30] ) {
	// 	s_flags[30] = true;
	// 	if ( ! getUrl ( "/admin/reindex?c=qatest123&qa=1&format=xml&"
	// 			"debug=1&q=sports",9999 ) )
	// 		return false;
	// }

	// // temp end it here
	// return true;

	//
	// eject/delete the urls
	//
	//static long s_ii2 = 0;
	for ( ; s_flags[5] < max ; ) {
		// eject the url using the html api
		SafeBuf sb;
		sb.safePrintf( "/admin/inject?c=qatest123&deleteurl=1&"
			       "format=xml&u=");
		sb.urlEncode ( s_urlPtrs[s_flags[5]] );
		sb.nullTerm();
		// pre-inc it in case getUrl() blocks
		//s_ii2++;
		s_flags[5]++;
		if ( ! getUrl ( sb.getBufStart() , 0 ) )
			return false;
	}

	//
	// make sure no results left, +the
	//
	if ( ! s_flags[6] ) {
		wait(1.5);
		s_flags[6] = true;
		return false;
	}

	if ( ! s_flags[14] ) {
		s_flags[14] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=2&format=xml&q=%2Bthe",
				-1672870556 ) )
			return false;
	}

	//static bool s_fee2 = false;
	if ( ! s_flags[13] ) {
		s_flags[13] = true;
		log("qa: SUCCESSFULLY COMPLETED "
			"QA INJECT TEST 1");
		//if ( s_callback == qainject ) exit(0);
		return true;
	}


	return true;
}
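
// A note on the inject/delete loops above (illustrative): the loop
// counter itself lives in s_flags[] (s_flags[20] for injects, s_flags[5]
// for deletes) and is incremented BEFORE the call that may block, so when
// the callback re-enters qainject1() the loop resumes at the next url
// instead of re-submitting the same one:
//
//	sb.urlEncode ( s_urlPtrs[s_flags[20]] );
//	s_flags[20]++;                    // advance before getUrl() blocks
//	if ( ! getUrl("/admin/inject", 0, sb.getBufStart()) )
//		return false;             // re-entry picks up the next url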
bool qaspider2 ( ) {
	//
	// delete the 'qatest123' collection
	//
	//static bool s_x1 = false;
	if ( ! s_flags[0] ) {
		s_flags[0] = true;
		if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) )
			return false;
	}

	//
	// add the 'qatest123' collection
	//
	//static bool s_x2 = false;
	if ( ! s_flags[1] ) {
		s_flags[1] = true;
		if ( ! getUrl ( "/admin/addcoll?addcoll=qatest123&xml=1" , 
				// checksum of reply expected
				238170006 ) )
			return false;
	}

	// restrict hopcount to 0 or 1 in url filters so we do not spider
	// too deep
	//static bool s_z1 = false;
	if ( ! s_flags[2] ) {
		s_flags[2] = true;
		SafeBuf sb;
		sb.safePrintf("&c=qatest123&"
			      // make it the custom filter
			      "ufp=0&"

	       "fe=%%21ismanualadd+%%26%%26+%%21insitelist&hspl=0&hspl=1&fsf=0.000000&mspr=0&mspi=1&xg=1000&fsp=-3&"

			      // take out hopcount for now, just test quotas
			      //	       "fe1=tag%%3Ashallow+%%26%%26+hopcount%%3C%%3D1&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=3&"

			      // sitepages is a little fuzzy so take it
			      // out for this test and use hopcount!!!
			      //"fe1=tag%%3Ashallow+%%26%%26+sitepages%%3C%%3D20&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=45&"
			      "fe1=tag%%3Ashallow+%%26%%26+hopcount<%%3D1&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=45&"

	       "fe2=default&hspl2=0&hspl2=1&fsf2=1.000000&mspr2=0&mspi2=1&xg2=1000&fsp2=45&"

		);
		if ( ! getUrl ( "/admin/filters",0,sb.getBufStart()) )
			return false;
	}

	// set the site list to a few sites
	// these should auto seed so no need to use addurl
	//static bool s_z2 = false;
	if ( ! s_flags[3] ) {
		s_flags[3] = true;
		SafeBuf sb;
		sb.safePrintf("&c=qatest123&format=xml&sitelist=");
		sb.urlEncode(//walmart has too many pages at depth 1, so remove it
			     //"tag:shallow www.walmart.com\r\n"
			     "tag:shallow http://www.ibm.com/\r\n");
		sb.nullTerm();
		if ( ! getUrl ("/admin/settings",0,sb.getBufStart() ) )
			return false;
	}
		

	//
	// wait for spidering to stop
	//
 checkagain:

	// wait until spider finishes. check the spider status page
	// in json to see when completed
	//static bool s_k1 = false;
	if ( ! s_flags[4] ) {
		//usleep(5000000); // 5 seconds
		s_flags[4] = true;
		wait(3.0);
		return false;
	}

	if ( ! s_flags[14] ) {
		s_flags[14] = true;
		if ( ! getUrl ( "/admin/status?format=json&c=qatest123",0) )
			return false;
	}

	//static bool s_k2 = false;
	if ( ! s_flags[5] ) {
		// ensure spiders are done. 
		// "Nothing currently available to spider"
		if ( s_content&&!strstr(s_content,"Nothing currently avail")){
			s_flags[4] = false;
			s_flags[14] = false;
			goto checkagain;
		}
		s_flags[5] = true;
	}




	// verify no results for gbhopcount:2 query
	//static bool s_y4 = false;
	if ( ! s_flags[6] ) {
		s_flags[6] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
				"q=gbhopcount%3A2",
				-1310551262 ) )
			return false;
	}

	// but some for gbhopcount:0 query
	//static bool s_t0 = false;
	if ( ! s_flags[7] ) {
		s_flags[7] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&n=500&"
				"q=gbhopcount%3A0",
				999 ) )
			return false;
	}
	
	// check facet sections query for walmart
	//static bool s_y5 = false;
	if ( ! s_flags[8] ) {
		s_flags[8] = true;
		if ( ! getUrl ( "/search?c=qatest123&format=json&stream=0&"
				"q=gbfacetstr%3Agbxpathsitehash3311332088",
				999 ) )
			return false;
	}

	// wait for the msg4 index buffers to flush, like qaspider1 does
	if ( ! s_flags[15] ) {
		s_flags[15] = true;
		wait(1.5);
		return false;
	}



	//static bool s_y6 = false;
	if ( ! s_flags[9] ) {
		s_flags[9] = true;
		if ( ! getUrl ( "/get?page=4&q=gbfacetstr:gbxpathsitehash3311332088&qlang=xx&c=qatest123&d=9577169402&cnsp=0" , 999 ) )
			return false;
	}

	// in xml
	//static bool s_y7 = false;
	if ( ! s_flags[10] ) {
		s_flags[10] = true;
		if ( ! getUrl ( "/get?xml=1&page=4&q=gbfacetstr:gbxpathsitehash2492664135&qlang=xx&c=qatest123&d=9577169402&cnsp=0" , 999 ) )
			return false;
	}

	// and json
	//static bool s_y8 = false;
	if ( ! s_flags[11] ) {
		s_flags[11] = true;
		if ( ! getUrl ( "/get?json=1&page=4&q=gbfacetstr:gbxpathsitehash2492664135&qlang=xx&c=qatest123&d=9577169402&cnsp=0" , 999 ) )
			return false;
	}


	// delete the collection
	//static bool s_fee = false;
	// if ( ! s_flags[12] ) {
	// 	s_flags[12] = true;
	// 	if ( ! getUrl ( "/admin/delcoll?delcoll=qatest123" ) )
	// 		return false;
	// }

	//static bool s_fee2 = false;
	if ( ! s_flags[13] ) {
		s_flags[13] = true;
		log("qa: SUCCESSFULLY COMPLETED "
		    "QA SPIDER2 TEST");
		return true;
	}

	return true;
}
// . return the score of the highest-scoring window containing match #m
// . window is defined by the half-open interval [a,b) where a and b are 
//   word #'s in the Words array indicated by match #m
// . return -1 and set g_errno on error
int64_t Summary::getBestWindow ( Matches *matches, int32_t mm, int32_t *lasta,
                                 int32_t *besta, int32_t *bestb, char *gotIt,
                                 char *retired, int32_t maxExcerptLen ) {
	// get the window around match #mm
	Match *m = &matches->m_matches[mm];

	// what is the word # of match #mm?
	int32_t matchWordNum = m->m_wordNum;

	// what Words/Pos/Bits classes is this match in?
	Words *words = m->m_words;
	Section **sp = NULL;
	int32_t *pos = m->m_pos->m_pos;

	// use "m_swbits" not "m_bits", that is what Bits::setForSummary() uses
	const swbit_t *bb = m->m_bits->m_swbits;

	// shortcut
	if ( m->m_sections ) {
		sp = m->m_sections->m_sectionPtrs;
	}

	int32_t nw = words->getNumWords();
	int64_t *wids = words->getWordIds();
	nodeid_t *tids = words->getTagIds();

	// . sanity check
	// . this prevents a core i've seen
	if ( matchWordNum >= nw ) {
		log("summary: got overflow condition for q=%s",m_q->m_orig);

		// assume no best window
		*besta = -1;
		*bestb = -1;
		*lasta = matchWordNum;
		return 0;
	}

	// . skip this match if its word was already used in another excerpt
	//   or sits in a section we never quote (script/style/select/title)
	int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_IN_TITLE;
	if ( (bb[matchWordNum] & D_USED) || ( sp && (sp[matchWordNum]->m_flags & badFlags) ) ) {
		// assume no best window
		*besta = -1;
		*bestb = -1;
		*lasta = matchWordNum;
		return 0;
	}

	// . "a" is the left fence post of the window (it is a word # in Words)
	// . go to the left as far as we can 
	// . thus we decrement "a"
	int32_t a = matchWordNum;

	// "posa" is the character position of the END of word #a
	int32_t posa = pos[a+1];
	int32_t firstFrag = -1;
	bool startOnQuote = false;
	bool goodStart = false;
	int32_t wordCount = 0;

	// . decrease "a" as int32_t as we stay within maxNumCharsPerLine
	// . avoid duplicating windows by using "lasta", the last "a" of the
	//   previous call to getBestWindow(). This can happen if our last
	//   central query term was close to this one.
	for ( ; a > 0 && posa - pos[a-1] < maxExcerptLen && a > *lasta; a-- ) {
		// . don't include any "dead zone", 
		// . dead zones have already been used for the summary, and
		//   we are getting a second/third/... excerpt here now then
		// stop if its the start of a sentence, too
		// stop before title word
		if ( (bb[a-1] & D_USED) || (bb[a] & D_STARTS_SENTENCE) || ( bb[a-1] & D_IN_TITLE )) {
			goodStart = true;
			break;
		}

		// don't go beyond an LI, TR, P tag
		if ( tids && ( tids[a-1] == TAG_LI ||
		               tids[a-1] == TAG_TR ||
		               tids[a-1] == TAG_P  ||
		               tids[a-1] == TAG_DIV ) ) {
			goodStart = true;
			break;
		}

		// stop if its the start of a quoted sentence
		if ( a+1<nw && (bb[a+1] & D_IN_QUOTES) && 
		     words->getWord(a)[0] == '\"' ){
			startOnQuote = true;
			goodStart    = true;
			break;
		}

		// find out the first instance of a fragment (comma, etc)
		// watch out! because frag also means 's' in there's
		if ( ( bb[a] & D_STARTS_FRAG ) && !(bb[a-1] & D_IS_STRONG_CONNECTOR) && firstFrag == -1 ) {
			firstFrag = a;
		}

		if ( wids[a] ) {
			wordCount++;
		}
	}

	// if we didn't find a good start, then start at the start of the frag
	if ( !goodStart && firstFrag != -1 ) {
		a = firstFrag;
	}

	// don't let punct or tag word start a line, unless a quote
	if ( a < matchWordNum && !wids[a] && words->getWord(a)[0] != '\"' ){
		while ( a < matchWordNum && !wids[a] ) a++;
		
		// do not break right after a "strong connector", like 
		// apostrophe
		while ( a < matchWordNum && a > 0 && 
			( bb[a-1] & D_IS_STRONG_CONNECTOR ) )
			a++;
		
		// don't let punct or tag word start a line
		while ( a < matchWordNum && !wids[a] ) a++;
	}

	// remember, b is not included in the summary, the summary is [a,b-1]
	// remember to include all words in a matched phrase
	int32_t b = matchWordNum + m->m_numWords ;
	int32_t endQuoteWordNum = -1;
	int32_t numTagsCrossed = 0;

	for ( ; b <= nw; b++ ) {
		if ( b == nw ) {
			break;
		}

		if ( pos[b+1] - pos[a] >= maxExcerptLen ) {
			break;
		}
		
		if ( startOnQuote && words->getWord(b)[0] == '\"' ) {
			endQuoteWordNum = b;
		}

		// don't include any dead zone, those are already-used samples
		if ( bb[b] & D_USED ) {
			break;
		}

		// stop on a title word
		if ( bb[b] & D_IN_TITLE ) {
			break;
		}

		if ( wids[b] ) {
			wordCount++;
		}

		// don't go beyond an LI or TR backtag
		if ( tids && ( tids[b] == (BACKBIT|TAG_LI) ||
		               tids[b] == (BACKBIT|TAG_TR) ) ) {
			numTagsCrossed++;

			// try to have at least 10 words in the summary
			if ( wordCount > 10 ) {
				break;
			}
		}

		// go beyond a P or DIV backtag in case the earlier char is a
		// ':'. This came from a special case for wikipedia pages 
		// eg. http://en.wikipedia.org/wiki/Flyover
		if ( tids && ( tids[b] == (BACKBIT|TAG_P)  ||
		               tids[b] == (BACKBIT|TAG_DIV) )) {
			numTagsCrossed++;

			// try to have at least 10 words in the summary
			if ( wordCount > 10 && words->getWord(b-1)[0] != ':' ) {
				break;
			}
		}
	}

	// don't end on a lot of punct words
	if ( b > matchWordNum && !wids[b-1]){
		// strip multiple trailing punct words, but if we're ending
		// on a quote, keep it
		while ( b > matchWordNum && !wids[b-2] && endQuoteWordNum != -1 && b > endQuoteWordNum ) {
			b--;
		}
		
		// do not break right after a "strong connector", like apostrophe
		while ( b > matchWordNum && (bb[b-2] & D_IS_STRONG_CONNECTOR) ) {
			b--;
		}
	}

	Match *ms = matches->m_matches;

	// make m_matches.m_matches[mi] the first match in our [a,b) window
	int32_t mi ;

	// . the match at the center of the window is match #"mm", so that
	//   matches->m_matches[mm] is the Match class
	// . set "mi" to it and back up "mi" as int32_t as >= a
	for ( mi = mm ; mi > 0 && ms[mi-1].m_wordNum >=a ; mi-- )
		;

	// now get the score of this excerpt. Also mark all the represented 
	// query words. Mark the represented query words in the array that
	// comes to us. also mark how many times the same word is repeated in
	// this summary.
	int64_t score = 0LL;

	// is a url contained in the summary, that looks bad! punish!
	bool hasUrl = false;

	// the word count we did above was just an approximation. count it right
	wordCount = 0;

	// for debug
	//char buf[5000];
	//char *xp = buf;
	SafeBuf xp;

	// wtf?
	if ( b > nw ) {
		b = nw;
	}

	// now score every word in the window [a,b)
	for ( int32_t i = a ; i < b ; i++ ) {
		// debug print out
		if ( g_conf.m_logDebugSummary ) {
			int32_t len = words->getWordLen(i);
			char cs;
			for (int32_t k=0;k<len; k+=cs ) {
				const char *c = words->getWord(i)+k;
				cs = getUtf8CharSize(c);
				if ( is_binary_utf8 ( c ) ) {
					continue;
				}
				xp.safeMemcpy ( c , cs );
				xp.nullTerm();
			}
		}

		// skip if in bad section, marquee, select, script, style
		if ( sp && (sp[i]->m_flags & badFlags) ) {
			continue;
		}

		// don't count just numeric words
		if ( words->isNum(i) ) {
			continue;
		}

		// check if there is a url in the summary; '://' is the telltale
		if ( wids && !wids[i] ) {
			const char *wrd = words->getWord(i);
			int32_t  wrdLen = words->getWordLen(i);
			if ( wrdLen == 3 && wrd[0] == ':' && wrd[1] == '/' &&  wrd[2] == '/' ) {
				hasUrl = true;
			}
		}

		// skip if not wid
		if ( ! wids[i] ) {
			continue;
		}

		// just make every word 100 pts
		int32_t t = 100;

		// penalize it if in one of these sections
		if ( bb[i] & ( D_IN_PARENS | D_IN_SUP | D_IN_LIST ) ) {
			t /= 2;
		}

		// boost it if in bold or italics
		if ( bb[i] & D_IN_BOLDORITALICS ) {
			t *= 2;
		}

		// add the score for this word
		score += t;

		// print the score, "t"
		if ( g_conf.m_logDebugSummary ) {
			xp.safePrintf("(%" PRId32")",t);
		}

		// count the alpha words we got
		wordCount++;

		// if no matches left, skip
		if ( mi >= matches->m_numMatches ) {
			continue;
		}

		// get the match
		Match *next = &ms[mi];

		// skip if not a match
		if ( i != next->m_wordNum ) {
			continue;
		}

		// must be a match in this class
		if ( next->m_words != words ) {
			continue;
		}

		// advance it
		mi++;

		// which query word # does it match
		int32_t qwn = next->m_qwordNum;

		if ( qwn < 0 || qwn >= m_q->m_numWords ){g_process.shutdownAbort(true);}

		// undo old score
		score -= t;

		// add 100000 per match
		t = 100000;

		// weight based on tf, goes from 0.1 to 1.0
		t = (int32_t)((float)t * m_wordWeights [ qwn ]);

		// if it is a query stop word, give it no points
		if ( m_q->m_qwords[qwn].m_isQueryStopWord ) {
			t = 0;
		}

		// penalize it if in one of these sections
		if ( bb[i] & ( D_IN_PARENS | D_IN_SUP | D_IN_LIST ) ) {
			t /= 2;
		}

		if ( gotIt[qwn] > 0 ) {
			// have we matched it in this [a,b) already?
			if ( gotIt[qwn] == 1 ) {
				t /= 15;
			} else {
				// if we have more than 2 matches in the same window,
				// it may not give a good summary. give a heavy penalty
				t -= 200000;
			}
		} else if ( retired [qwn] > 0 ) {
			// have we matched it already in a winning window?
			t /= 12;
		}

		// add it back
		score += t;

		if ( g_conf.m_logDebugSummary ) {
			xp.safePrintf ("[%" PRId32"]{qwn=%" PRId32",ww=%f}",t,qwn,
				       m_wordWeights[qwn]);
		}

		// inc the query word count for this window
		if ( gotIt[qwn] < 100 ) {
			gotIt[qwn]++;
		}
	}

	int32_t oldScore = score;
	
	// apply the bonus if it starts a sentence
	// only apply if the score is positive and if the wordcount is decent
	if ( score > 0 && wordCount > 7 ){
		// a match can give us 10k to 100k pts based on the tf weights
		// so we don't want to overwhelm that too much, so let's make
		// this an 8k bonus if it starts a sentence
		if ( bb[a] & D_STARTS_SENTENCE ) {
			score += 8000;
		} else if ( bb[a] & D_STARTS_FRAG ) {
			// likewise, a fragment, like after a comma
			score += 4000;
		}

		// 1k if the match word is very close to the
		// start of a sentence, let's say within 7 words
		if ( matchWordNum - a < 7 ) {
			score += 1000;
		}
	}

	// a summary isn't really a summary if it's less than 7 words.
	// reduce the score, but still give it a decent score.
	// minus 20k.
	if ( wordCount < 7 ) {
		score -= 20000;
	}

	// summaries that cross a lot of tags are usually bad, penalize them
	if ( numTagsCrossed > 1 ) {
		score -= (numTagsCrossed * 20000);
	}

	if ( hasUrl ) {
		score -= 8000;
	}

	// show it
	if ( g_conf.m_logDebugSummary ) {
		log(LOG_DEBUG, "sum: score=%08" PRId32" prescore=%08" PRId32" a=%05" PRId32" b=%05" PRId32" %s",
		     (int32_t)score,oldScore,(int32_t)a,(int32_t)b,
		     xp.getBufStart());
	}

	// set lasta, besta, bestb
	*lasta = a;
	*besta = a;
	*bestb = b;

	return score;
}
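
// A worked example of the scoring above (the numbers follow directly from
// the code): a plain alpha word in the window adds 100 points, halved to
// 50 inside parens/sup/list sections and doubled to 200 in bold/italics.
// A word matching query word #qwn adds 100000 * m_wordWeights[qwn]
// instead, e.g. 25000 for a tf weight of 0.25; a second occurrence of the
// same query word inside this window scores 25000/15 = 1666, and a third
// takes a flat 200000-point penalty. A window that crosses two LI/TR
// back-tags then loses 2 * 20000 = 40000, while one that starts a
// sentence and has more than 7 alpha words earns +8000.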
// . "uf" is printf url format to scrape with a %s for the query
// . example: uf="http://www.google.com/search?num=50&q=%s&scoring=d&filter=0";
bool Msg7::scrapeQuery ( ) {

	// advance round now in case we return early
	m_round++;

	GigablastRequest *gr = &m_gr;

	// error?
	char *qts = gr->m_queryToScrape;
	if ( ! qts ) { char *xx=NULL;*xx=0; }

	if ( gbstrlen(qts) > 500 ) {
		g_errno = EQUERYTOOBIG;
		return true;
	}

	// first encode the query
	SafeBuf ebuf;
	ebuf.urlEncode ( qts ); // queryUNEncoded );
	ebuf.nullTerm();

	char *uf;
	if ( m_round == 1 )
		// set to 1 for debugging
		uf="http://www.google.com/search?num=20&"
			"q=%s&scoring=d&filter=0";
		//uf = "https://startpage.com/do/search?q=%s";
		//uf = "http://www.google.com/"
		//	"/cse?cx=013269018370076798483%3A8eec3papwpi&"
		//	"ie=UTF-8&q=%s&"
		//	"num=20";
	else
		uf="http://www.bing.com/search?q=%s";

	// skip bing for now
	//if ( m_round == 2 )
	//	return true;
	//if ( m_round == 1 )
	//	return true;
		
	// make the url we will download
	char ubuf[2048];
	sprintf ( ubuf , uf , ebuf.getBufStart() );

	// log it
	log("inject: SCRAPING %s",ubuf);

	SpiderRequest sreq;
	sreq.reset();
	// set the SpiderRequest
	strcpy(sreq.m_url, ubuf);
	// . tell it to only add the hosts of each outlink for now!
	// . that will be passed on to when XmlDoc calls Links::set() i guess
	// . xd will not reschedule the scraped url into spiderdb either
	sreq.m_isScraping = 1;
	sreq.m_fakeFirstIp = 1;
	int32_t firstIp = hash32n(ubuf);
	if ( firstIp == 0 || firstIp == -1 ) firstIp = 1;
	sreq.m_firstIp = firstIp;
	// parent docid is 0
	sreq.setKey(firstIp,0LL,false);

	char *coll2 = gr->m_coll;
	CollectionRec *cr = g_collectiondb.getRec ( coll2 );
	// collection got nuked?
	if ( ! cr ) { g_errno = ENOCOLLREC; return true; }

	// forceDel = false, niceness = 0
	m_xd.set4 ( &sreq , NULL , cr->m_coll , NULL , 0 ); 

	//m_xd.m_isScraping = true;

	// download without throttling
	//m_xd.m_throttleDownload = false;

	// disregard this
	m_xd.m_useRobotsTxt = false;

	// this will tell it to index ahrefs first before indexing
	// the doc. but do NOT do this if we are from ahrefs.com
	// ourselves to avoid recursive explosion!!
	if ( m_useAhrefs )
		m_xd.m_useAhrefs = true;

	m_xd.m_reallyInjectLinks = true;//gr->m_injectLinks;

	//
	// rather than just add the links of the page to spiderdb,
	// let's inject them!
	//
	m_xd.setCallback ( this , doneInjectingLinksWrapper );

	// niceness is 0
	m_linkDedupTable.set(4,0,512,NULL,0,false,0,"ldtab2");

	// do we actually inject the links, or just scrape?
	if ( ! m_xd.injectLinks ( &m_linkDedupTable ,
				  NULL,
				  this , 
				  doneInjectingLinksWrapper ) ) 
		return false;
	// otherwise, just download the google/bing search results so we
	// can display them in xml
	//else if ( m_xd.getUtf8Content() == (char **)-1 )
	//	return false;
		
	// print reply..
	//printReply();
	return true;
}
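
// A worked example of the url construction above (illustrative): with
// gr->m_queryToScrape = "gigablast", the encoded query is unchanged, so
// round 1 downloads
//	http://www.google.com/search?num=20&q=gigablast&scoring=d&filter=0
// and round 2 falls through to
//	http://www.bing.com/search?q=gigablast
// The url is then hashed with hash32n() to fabricate a nonzero m_firstIp,
// giving the SpiderRequest a stable key without a dns lookup.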
bool HttpMime::addToCookieJar(Url *currentUrl, SafeBuf *sb) {
	/// @note Slightly modified from Netscape HTTP Cookie File format
	/// Difference is we only have one column for name/value

	// http://www.cookiecentral.com/faq/#3.5
	// The layout of Netscape's cookies.txt file is such that each line contains one name-value pair.
	// An example cookies.txt file may have an entry that looks like this:
	// .netscape.com TRUE / FALSE 946684799 NETSCAPE_ID 100103
	//
	// Each line represents a single piece of stored information. A tab is inserted between each of the fields.
	// From left-to-right, here is what each field represents:
	//
	// domain - The domain that created AND that can read the variable.
	// flag - A TRUE/FALSE value indicating if all machines within a given domain can access the variable. This value is set automatically by the browser, depending on the value you set for domain.
	// path - The path within the domain that the variable is valid for.
	// secure - A TRUE/FALSE value indicating if a secure connection with the domain is needed to access the variable.
	// expiration - The UNIX time that the variable will expire on. UNIX time is defined as the number of seconds since Jan 1, 1970 00:00:00 GMT.
	// name/value - The name/value of the variable.

	/// @todo ALC we should sort cookie-list
	// The user agent SHOULD sort the cookie-list in the following order:
	// *  Cookies with longer paths are listed before cookies with shorter paths.
	// *  Among cookies that have equal-length path fields, cookies with earlier creation-times are listed
	// before cookies with later creation-times.

	// fill in cookies from cookieJar
	std::map<std::string, httpcookie_t> oldCookies;

	const char *cookieJar = sb->getBufStart();
	int32_t cookieJarLen = sb->length();

	const char *lineStartPos = cookieJar;
	const char *lineEndPos = NULL;
	while ((lineEndPos = (const char*)memchr(lineStartPos, '\n', cookieJarLen - (lineStartPos - cookieJar))) != NULL) {
		const char *currentPos = lineStartPos;
		const char *tabPos = NULL;
		unsigned fieldCount = 0;

		httpcookie_t cookie = {};
		while (fieldCount < 5 && (tabPos = (const char*)memchr(currentPos, '\t', lineEndPos - currentPos)) != NULL) {
			switch (fieldCount) {
				case 0:
					// domain
					cookie.m_domain = currentPos;
					cookie.m_domainLen = tabPos - currentPos;
					break;
				case 1:
					// flag
					if (memcmp(currentPos, "TRUE", 4) != 0) {
						cookie.m_defaultDomain = true;
					}
					break;
				case 2: {
					// path
					cookie.m_path = currentPos;
					cookie.m_pathLen = tabPos - currentPos;
				} break;
				case 3:
					// secure
					cookie.m_secure = (memcmp(currentPos, "TRUE", 4) == 0);
					break;
				case 4:
					// expiration
					break;
			}

			currentPos = tabPos + 1;
			++fieldCount;
		}

		cookie.m_cookie = currentPos;
		cookie.m_cookieLen = lineEndPos - currentPos;

		const char *equalPos = (const char *)memchr(cookie.m_cookie, '=', cookie.m_cookieLen);
		if (equalPos) {
			cookie.m_nameLen = equalPos - cookie.m_cookie;

			oldCookies[std::string(cookie.m_cookie, cookie.m_nameLen)] = cookie;
		}

		lineStartPos = lineEndPos + 1;
	}
	// we don't need to care about the last line (we always end on \n)

	SafeBuf newCookieJar;

	// add old cookies
	for (auto &pair : oldCookies) {
		if (m_cookies.find(pair.first) == m_cookies.end()) {
			addCookie(pair.second, *currentUrl, &newCookieJar);
		}
	}

	// add new cookies
	for (auto &pair : m_cookies) {
		addCookie(pair.second, *currentUrl, &newCookieJar);
	}

	newCookieJar.nullTerm();

	// replace old with new
	sb->reset();
	sb->safeMemcpy(&newCookieJar);
	sb->nullTerm();

	return true;
}
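
// A merge-semantics example for addToCookieJar() (illustrative): suppose
// the jar already holds these two lines (tab-separated)
//	.example.com	TRUE	/	FALSE	0	sid=old
//	.example.com	TRUE	/	FALSE	0	lang=en
// and the response being processed carried "Set-Cookie: sid=new", so
// m_cookies holds one entry keyed "sid". The rewritten jar keeps
// "lang=en", whose name is absent from m_cookies, but drops "sid=old" in
// favor of "sid=new", because old cookies are copied only when their name
// does not appear in m_cookies.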