bool JsonItem::getCompoundName ( SafeBuf &nameBuf ) {
	// reset, but don't free mem etc. just set m_length to 0
	nameBuf.reset();
	// get its full compound name like "meta.twitter.title"
	JsonItem *p = this;
	char *lastName = NULL;
	char *nameArray[20];
	int32_t numNames = 0;
	for ( ; p ; p = p->m_parent ) {
		// empty name?
		if ( ! p->m_name    ) continue;
		if ( ! p->m_name[0] ) continue;
		// dup? can happen with arrays. the parent of a string
		// in an object has the same name as its own parent, the
		// name of the array. "dupname":[{"a":"b"},{"c":"d"}]
		if ( p->m_name == lastName ) continue;
		// update
		lastName = p->m_name;
		// add it up
		nameArray[numNames++] = p->m_name;
		// breach? leave some headroom in nameArray
		if ( numNames < 15 ) continue;
		log("build: too many names in json tag");
		break;
	}
	// assemble the names in reverse order, which is the correct order
	for ( int32_t i = 1 ; i <= numNames ; i++ ) {
		// copy into our safebuf
		if ( ! nameBuf.safeStrcpy ( nameArray[numNames-i] ) )
			return false;
		// separate names with periods
		if ( ! nameBuf.pushChar('.') ) return false;
	}
	// remove last period
	nameBuf.removeLastChar('.');
	// and null terminate
	if ( ! nameBuf.nullTerm() ) return false;
	// change all :'s in names to .'s since : is reserved!
	char *px = nameBuf.getBufStart();
	for ( ; *px ; px++ )
		if ( *px == ':' ) *px = '.';
	return true;
}
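// A minimal, self-contained sketch of the name flattening that
// getCompoundName() performs: walk the parent chain collecting names,
// then emit them in reverse, period-separated. Node and compoundName()
// are illustrative stand-ins, not the real JsonItem API.
#include <cstdio>
#include <string>

struct Node { const char *name; Node *parent; };

static std::string compoundName ( Node *p ) {
	const char *names[20];
	int n = 0;
	const char *last = NULL;
	for ( ; p ; p = p->parent ) {
		if ( ! p->name || ! p->name[0] ) continue;
		if ( p->name == last ) continue; // array dup, same pointer
		last = p->name;
		names[n++] = p->name;
		if ( n >= 15 ) break; // same safety cap as above
	}
	std::string out;
	for ( int i = n - 1 ; i >= 0 ; i-- ) {
		out += names[i];
		if ( i > 0 ) out += '.';
	}
	return out;
}

int main() {
	// {"meta":{"twitter":{"title":"..."}}}  ->  "meta.twitter.title"
	Node meta    = { "meta"    , NULL     };
	Node twitter = { "twitter" , &meta    };
	Node title   = { "title"   , &twitter };
	printf("%s\n", compoundName(&title).c_str());
	return 0;
}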
// returns false if blocked, true otherwise
bool processLoop ( void *state ) {
	// get the state
	State2 *st = (State2 *)state;
	// get the tcp socket from the state
	TcpSocket *s = st->m_socket;
	// get the doc
	XmlDoc *xd = &st->m_xd;

	if ( ! xd->m_loaded ) {
		// setting just the docid. niceness is 0.
		//xd->set3 ( st->m_docId , st->m_coll , 0 );
		// callback
		xd->setCallback ( state , processLoop );
		// . and tell it to load from the old title rec
		// . this sets xd->m_oldTitleRec/m_oldTitleRecSize
		// . this sets xd->ptr_* and all other member vars from
		//   the old title rec if found in titledb.
		if ( ! xd->loadFromOldTitleRec ( ) ) return false;
	}

	if ( g_errno ) return sendErrorReply ( st , g_errno );

	// now force it to load the old title rec
	SafeBuf *tr = xd->getTitleRecBuf();
	// blocked? return false if so. it will call processLoop() when it
	// completes.
	if ( tr == (void *)-1 ) return false;
	// we did not block. check for error? this will free "st" too.
	if ( ! tr ) return sendErrorReply ( st , g_errno );
	// if the title rec was empty, that is a problem
	if ( xd->m_titleRecBuf.length() == 0 )
		return sendErrorReply ( st , ENOTFOUND );

	// set callback
	char *na = xd->getIsNoArchive();
	// wait if blocked
	if ( na == (void *)-1 ) return false;
	// error?
	if ( ! na ) return sendErrorReply ( st , g_errno );
	// forbidden? allow turkeys through though...
	if ( ! st->m_isMasterAdmin && *na )
		return sendErrorReply ( st , ENOCACHE );

	SafeBuf *sb = &st->m_sb;

	// &page=4 will print rainbow sections
	if ( ! st->m_printed && st->m_r.getLong("page",0) ) {
		// do not repeat this call
		st->m_printed = true;
		// this will call us again since we called
		// xd->setCallback() above to us
		if ( ! xd->printDocForProCog ( sb , &st->m_r ) )
			return false;
	}

	char *contentType = "text/html";
	char format = st->m_format;
	if ( format == FORMAT_XML  ) contentType = "text/xml";
	if ( format == FORMAT_JSON ) contentType = "application/json";

	// if we printed a special page (like rainbow sections) then return now
	if ( st->m_printed ) {
		bool status = g_httpServer.sendDynamicPage ( s,
							     sb->getBufStart(),
							     sb->getLength(),
							     -1, false,
							     contentType,
							     -1, NULL, "utf8" );
		// nuke state2
		mdelete ( st , sizeof(State2) , "PageGet1" );
		delete ( st );
		return status;
	}

	/*
	// this was calling XmlDoc and setting sections, etc. to
	// get the SpiderReply junk... no no no
	// is it banned or filtered? this ignores the TagRec in the titleRec
	// and uses msg8a to get it fresh instead
	char *vi = xd->getIsFiltered();
	// wait if blocked
	if ( vi == (void *)-1 ) return false;
	// error?
	if ( ! vi ) return sendErrorReply ( st , g_errno );
	// banned?
	if ( ! st->m_isMasterAdmin && ! *vi )
		return sendErrorReply ( st , EDOCBANNED );
	*/

	// get the utf8 content
	char **utf8 = xd->getUtf8Content();
	// wait if blocked
	if ( utf8 == (void *)-1 ) return false;
	// strange
	if ( xd->size_utf8Content <= 0 ) {
		log("pageget: utf8 content <= 0");
		return sendErrorReply ( st , EBADENGINEER );
	}
	// alloc error?
	if ( ! utf8 ) return sendErrorReply ( st , g_errno );

	// get this host
	Host *h = g_hostdb.getHost ( g_hostdb.m_hostId );
	if ( ! h ) {
		log("pageget: hostid %"INT32" is bad",g_hostdb.m_hostId);
		return sendErrorReply ( st , EBADENGINEER );
	}

	char   *content    = xd->ptr_utf8Content;
	int32_t contentLen = xd->size_utf8Content - 1;

	// shortcut
	char strip = st->m_strip;

	// for undoing the header
	int32_t startLen1 = sb->length();

	// we are always utf8
	if ( strip != 2 )
		sb->safePrintf( "<meta http-equiv=\"Content-Type\" "
				"content=\"text/html;charset=utf8\">\n");

	// base href
	char *base = xd->ptr_firstUrl;
	if ( xd->ptr_redirUrl ) base = xd->ptr_redirUrl;
	if ( strip != 2 )
		sb->safePrintf ( "<BASE HREF=\"%s\">" , base );

	// default colors in case css files are missing
	if ( strip != 2 )
		sb->safePrintf( "\n<style type=\"text/css\">\n"
				"body{background-color:white;color:black;}\n"
				"</style>\n");

	if ( format == FORMAT_XML           ) sb->reset();
	if ( format == FORMAT_JSON          ) sb->reset();
	if ( xd->m_contentType == CT_JSON   ) sb->reset();
	if ( xd->m_contentType == CT_XML    ) sb->reset();
	if ( xd->m_contentType == CT_STATUS ) sb->reset();

	// for undoing the stuff below
	int32_t startLen2 = sb->length();

	// query should be NULL terminated
	char   *q    = st->m_qsb.getBufStart();
	int32_t qlen = st->m_qsb.getLength();

	char styleTitle[128] = "font-size:14px;font-weight:600;"
			       "color:#000000;";
	char styleText[128]  = "font-size:14px;font-weight:400;"
			       "color:#000000;";
	char styleLink[128]  = "font-size:14px;font-weight:400;"
			       "color:#0000ff;";
	char styleTell[128]  = "font-size:14px;font-weight:600;"
			       "color:#cc0000;";

	// get the url of the title rec
	Url *f = xd->getFirstUrl();

	bool printDisclaimer = st->m_printDisclaimer;
	if ( xd->m_contentType == CT_JSON   ) printDisclaimer = false;
	if ( xd->m_contentType == CT_STATUS ) printDisclaimer = false;
	if ( format == FORMAT_XML  ) printDisclaimer = false;
	if ( format == FORMAT_JSON ) printDisclaimer = false;

	char tbuf[100];
	tbuf[0] = 0;
	time_t lastSpiderDate = xd->m_spideredTime;

	if ( printDisclaimer ||
	     format == FORMAT_XML ||
	     format == FORMAT_JSON ) {
		struct tm *timeStruct = gmtime ( &lastSpiderDate );
		strftime ( tbuf, 100, "%b %d, %Y UTC", timeStruct );
	}

	// We should always be displaying this disclaimer.
	// - May eventually want to display this at a different location
	//   on the page, or on the click 'n' scroll browser page itself
	//   when this page is not being viewed solo.
	if ( printDisclaimer ) {
		sb->safePrintf(
			       "<table border=\"1\" bgcolor=\"#" BGCOLOR "\" "
			       "cellpadding=\"10\" "
			       "cellspacing=\"0\" width=\"100%%\" "
			       "color=\"#ffffff\">"
			       "<tr><td>"
			       "<span style=\"%s\">"
			       "This is Gigablast's cached page of </span>"
			       "<a href=\"%s\" style=\"%s\">%s</a>"
			       , styleTitle, f->getUrl(), styleLink,
			       f->getUrl() );
		// then the rest
		sb->safePrintf(
			       "<span style=\"%s\">. "
			       "Gigablast is not responsible for the content "
			       "of this page.</span>",
			       styleTitle );
		sb->safePrintf ( "<br/><span style=\"%s\">"
				 "Cached: </span>"
				 "<span style=\"%s\">",
				 styleTitle, styleText );
		// then the spider date in GMT
		sb->safeStrcpy(tbuf);
		// Moved over from PageResults.cpp
		sb->safePrintf( "</span> - <a href=\""
				"/get?"
				"q=%s&c=%s&rtq=%"INT32"&"
				"d=%"INT64"&strip=1\""
				" style=\"%s\">"
				"[stripped]</a>",
				q , st->m_coll , (int32_t)st->m_rtq,
				st->m_docId, styleLink );
		// a link to alexa
		if ( f->getUrlLen() > 5 ) {
			sb->safePrintf( " - <a href=\"http:"
					"//web.archive.org/web/*/%s\""
					" style=\"%s\">"
					"[older copies]</a>" ,
					f->getUrl(), styleLink );
		}
		if ( st->m_noArchive ) {
			sb->safePrintf( " - <span style=\"%s\"><b>"
					"[NOARCHIVE]</b></span>",
					styleTell );
		}
		if ( st->m_isBanned ) {
			sb->safePrintf( " - <span style=\"%s\"><b>"
					"[BANNED]</b></span>",
					styleTell );
		}
		// only print this if we got a query
		if ( qlen > 0 ) {
			sb->safePrintf("<br/><br/><span style=\"%s\"> "
				       "These search terms have been "
				       "highlighted: ",
				       styleText );
		}
	}

	// . make the url that we're outputting (like in PageResults.cpp)
	// . "thisUrl" is the base url for click & scroll
	char  thisUrl[MAX_URL_LEN];
	char *thisUrlEnd = thisUrl + MAX_URL_LEN;
	char *x = thisUrl;
	// . use the external ip of our gateway
	// . construct the NAT mapped port
	// . you should have used iptables to map the port to the correct
	//   internal ip:port
	//uint32_t ip   = g_conf.m_mainExternalIp;   // h->m_externalIp;
	//uint16_t port = g_conf.m_mainExternalPort; // h->m_externalHttpPort
	uint32_t ip   = h->m_ip;
	uint16_t port = h->m_httpPort;
	// . we no longer put the port in here if it is 80
	// . but still need http:// since we use <base href=>
	if ( port == 80 ) sprintf(x,"http://%s/get?q=",iptoa(ip));
	else              sprintf(x,"http://%s:%hu/get?q=",iptoa(ip),port);
	x += gbstrlen ( x );
	// the query, url encoded
	int32_t elen = urlEncode ( x , thisUrlEnd - x , q , qlen );
	x += elen;
	// separate cgi vars with a &
	sprintf ( x, "&d=%"INT64"", st->m_docId );
	x += gbstrlen(x);

	// set our query for highlighting
	Query qq;
	qq.set2 ( q, st->m_langId , true );

	// print the query terms into our highlight buffer
	Highlight hi;

	// make words so we can set the scores to ignore fielded terms
	Words qw;
	qw.set ( q    ,  // content being highlighted, utf8
		 qlen ,  // length of content being highlighted
		 TITLEREC_CURRENT_VERSION,
		 true ,  // computeIds
		 false );// hasHtmlEntities?

	// . assign scores of 0 to query words that should be ignored
	// . TRICKY: loop over words in qq.m_qwords, but they should be 1-1
	//   with words in qw.
	// . sanity check
	//if ( qw.getNumWords() != qq.m_numWords ) { char *xx = NULL; *xx = 0;}

	// declare up here
	Matches m;

	// now set m.m_matches[] to those words in qw that match a query word
	// or phrase in qq.
	m.setQuery ( &qq );
	m.addMatches ( &qw );

	int32_t hilen = 0;

	// and highlight the matches
	if ( printDisclaimer ) {
		hilen = hi.set ( sb ,
				 &qw , // words to highlight
				 &m  , // matches relative to qw
				 false , // doStemming
				 false , // st->m_clickAndScroll
				 (char *)thisUrl ); // base url for ClcknScrll
		// now close up the disclaimer
		sb->safeStrcpy("</span></table></table>\n");
	}

	bool includeHeader = st->m_includeHeader;
	// do not show the header for json object display
	if ( xd->m_contentType == CT_JSON   ) includeHeader = false;
	if ( xd->m_contentType == CT_XML    ) includeHeader = false;
	if ( xd->m_contentType == CT_STATUS ) includeHeader = false;
	if ( format == FORMAT_XML  ) includeHeader = false;
	if ( format == FORMAT_JSON ) includeHeader = false;

	// undo the header writes if we should
	if ( ! includeHeader ) {
		// including base href is off by default when not including
		// the header, so the caller must explicitly turn it back on
		if ( st->m_includeBaseHref ) sb->m_length = startLen2;
		else                         sb->m_length = startLen1;
	}

	if ( format == FORMAT_XML ) {
		sb->safePrintf("<response>\n");
		sb->safePrintf("<statusCode>0</statusCode>\n");
		sb->safePrintf("<statusMsg>Success</statusMsg>\n");
		sb->safePrintf("<url><![CDATA[");
		sb->cdataEncode(xd->m_firstUrl.m_url);
		sb->safePrintf("]]></url>\n");
		sb->safePrintf("<docId>%"UINT64"</docId>\n",xd->m_docId);
		sb->safePrintf("\t<cachedTimeUTC>%"INT32"</cachedTimeUTC>\n",
			       (int32_t)lastSpiderDate);
		sb->safePrintf("\t<cachedTimeStr>%s</cachedTimeStr>\n",tbuf);
	}

	if ( format == FORMAT_JSON ) {
		sb->safePrintf("{\"response\":{\n");
		sb->safePrintf("\t\"statusCode\":0,\n");
		sb->safePrintf("\t\"statusMsg\":\"Success\",\n");
		sb->safePrintf("\t\"url\":\"");
		sb->jsonEncode(xd->m_firstUrl.m_url);
		sb->safePrintf("\",\n");
		sb->safePrintf("\t\"docId\":%"UINT64",\n",xd->m_docId);
		sb->safePrintf("\t\"cachedTimeUTC\":%"INT32",\n",
			       (int32_t)lastSpiderDate);
		sb->safePrintf("\t\"cachedTimeStr\":\"%s\",\n",tbuf);
	}

	// identify start of <title> tag we wrote out
	char *sbstart = sb->getBufStart();
	char *sbend   = sb->getBufEnd();
	char *titleStart = NULL;
	char *titleEnd   = NULL;
	char  ctype = (char)xd->m_contentType;
	// do not calc the title or print it if the doc is xml or json
	if ( ctype == CT_XML    ) sbend = sbstart;
	if ( ctype == CT_JSON   ) sbend = sbstart;
	if ( ctype == CT_STATUS ) sbend = sbstart;

	for ( char *t = sbstart ; t < sbend ; t++ ) {
		// title tag?
		if ( t[0] != '<' ) continue;
		if ( to_lower_a(t[1]) != 't' ) continue;
		if ( to_lower_a(t[2]) != 'i' ) continue;
		if ( to_lower_a(t[3]) != 't' ) continue;
		if ( to_lower_a(t[4]) != 'l' ) continue;
		if ( to_lower_a(t[5]) != 'e' ) continue;
		// point to it
		char *x = t + 5;
		// max - to keep things fast
		char *max = x + 500;
		for ( ; *x && *x != '>' && x < max ; x++ );
		x++;
		// find the end
		char *e = x;
		for ( ; *e && e < max ; e++ ) {
			if ( e[0] == '<' &&
			     to_lower_a(e[1]) == '/' &&
			     to_lower_a(e[2]) == 't' &&
			     to_lower_a(e[3]) == 'i' &&
			     to_lower_a(e[4]) == 't' &&
			     to_lower_a(e[5]) == 'l' &&
			     to_lower_a(e[6]) == 'e' )
				break;
		}
		if ( e < max ) {
			titleStart = x;
			titleEnd   = e;
		}
		break;
	}

	// . print the title at top!
	// . consider moving
	if ( titleStart ) {
		char *ebuf = st->m_r.getString("eb");
		if ( ! ebuf ) ebuf = "";
		sb->safePrintf( "<table border=1 "
				"cellpadding=10 "
				"cellspacing=0 "
				"width=100%% "
				"color=#ffffff>" );
		int32_t printLinks = st->m_r.getLong("links",0);
		if ( ! printDisclaimer && printLinks )
			sb->safePrintf(
				       // first put cached and live link
				       "<tr>"
				       "<td bgcolor=lightyellow>"
				       // print cached link
				       " "
				       "<b>"
				       "<a "
				       "style=\"font-size:18px;"
				       "font-weight:600;"
				       "color:#000000;\" "
				       "href=\""
				       "/get?"
				       "c=%s&d=%"INT64"&qh=0&cnsp=1&eb=%s\">"
				       "cached link</a>"
				       " "
				       "<a "
				       "style=\"font-size:18px;"
				       "font-weight:600;"
				       "color:#000000;\" "
				       "href=%s>live link</a>"
				       "</b>"
				       "</td>"
				       "</tr>\n"
				       ,st->m_coll
				       ,st->m_docId
				       ,ebuf
				       ,thisUrl );
		if ( printLinks ) {
			sb->safePrintf( "<tr><td bgcolor=pink>"
					"<span style=\"font-size:18px;"
					"font-weight:600;"
					"color:#000000;\">"
					" "
					"<b>PAGE TITLE:</b> " );
			int32_t tlen = titleEnd - titleStart;
			sb->safeMemcpy ( titleStart , tlen );
			sb->safePrintf ( "</span></td></tr>" );
		}
		sb->safePrintf( "</table><br>\n" );
	}

	// is the content preformatted?
	bool pre = false;
	if ( ctype == CT_TEXT ) pre = true; // text/plain
	if ( ctype == CT_DOC  ) pre = true; // filtered msword
	if ( ctype == CT_PS   ) pre = true; // filtered postscript
	if ( format == FORMAT_XML  ) pre = false;
	if ( format == FORMAT_JSON ) pre = false;

	// if it is content-type text, add a <pre>
	if ( pre ) sb->safePrintf("<pre>");

	if ( st->m_strip == 1 )
		contentLen = stripHtml( content, contentLen,
					(int32_t)xd->m_version, st->m_strip );
	// it returns -1 and sets g_errno on error, like OOM
	if ( contentLen == -1 )
		return sendErrorReply ( st , g_errno );

	Xml xml;
	Words ww;

	// if no highlighting, skip it
	bool queryHighlighting = st->m_queryHighlighting;
	if ( st->m_strip == 2 ) queryHighlighting = false;
	// do not do term highlighting if json
	if ( xd->m_contentType == CT_JSON   ) queryHighlighting = false;
	if ( xd->m_contentType == CT_STATUS ) queryHighlighting = false;

	SafeBuf tmp;
	SafeBuf *xb = sb;
	if ( format == FORMAT_XML  ) xb = &tmp;
	if ( format == FORMAT_JSON ) xb = &tmp;

	if ( ! queryHighlighting ) {
		xb->safeMemcpy ( content , contentLen );
		xb->nullTerm();
	}
	else {
		// get the content as xhtml (should be NULL terminated)
		if ( ! xml.set ( content , contentLen , false , 0 , false ,
				 TITLEREC_CURRENT_VERSION , false , 0 ,
				 CT_HTML ) ) // niceness is 0
			return sendErrorReply ( st , g_errno );
		if ( ! ww.set ( &xml , true , 0 ) ) // niceness is 0
			return sendErrorReply ( st , g_errno );
		Matches m;
		m.setQuery ( &qq );
		m.addMatches ( &ww );
		hilen = hi.set ( xb ,
				 &ww ,
				 &m ,
				 false , // doStemming?
				 st->m_clickAndScroll ,
				 thisUrl ); // base url for click & scroll
		log(LOG_DEBUG, "query: Done highlighting cached page content");
	}

	if ( format == FORMAT_XML ) {
		sb->safePrintf("\t<content><![CDATA[");
		sb->cdataEncode ( xb->getBufStart() );
		sb->safePrintf("]]></content>\n");
		sb->safePrintf("</response>\n");
	}

	if ( format == FORMAT_JSON ) {
		sb->safePrintf("\t\"content\":\"");
		sb->jsonEncode ( xb->getBufStart() );
		sb->safePrintf("\"\n}\n}\n");
	}

	// if it is content-type text, add a </pre>
	if ( pre ) sb->safeMemcpy ( "</pre>" , 6 );

	/* MDW: return the xml page as is now. 9/28/2014

	int32_t ct = xd->m_contentType;

	// now filter the entire buffer to escape out the xml tags
	// so it is displayed nicely
	SafeBuf newbuf;
	if ( ct == CT_XML ) {
		// encode the xml tags into &lt;tagname&gt; sequences
		if ( ! newbuf.htmlEncodeXmlTags ( sb->getBufStart() ,
						  sb->getLength(),
						  0 ) ) // niceness=0
			return sendErrorReply ( st , g_errno );
		sb->stealBuf ( &newbuf );
	}
	*/

	// now encapsulate it in html head/tail and send it off
	contentType = "text/html";
	if ( strip == 2 ) contentType = "text/xml";
	// xml is usually buggy and this throws the browser off
	//if ( ctype == CT_XML ) contentType = "text/xml";

	if ( xd->m_contentType == CT_JSON   ) contentType = "application/json";
	if ( xd->m_contentType == CT_STATUS ) contentType = "application/json";
	if ( xd->m_contentType == CT_XML    ) contentType = "text/xml";

	if ( format == FORMAT_XML  ) contentType = "text/xml";
	if ( format == FORMAT_JSON ) contentType = "application/json";

	// safebuf, sb, is a member of "st" so this should copy the buffer
	// when it constructs the http reply, and we gotta call delete(st)
	// AFTER this so sb is still valid.
	bool status = g_httpServer.sendDynamicPage ( s,
						     sb->getBufStart(),
						     sb->getLength(),
						     -1, false,
						     contentType,
						     -1, NULL, "utf8" );

	// nuke state2
	mdelete ( st , sizeof(State2) , "PageGet1" );
	delete ( st );

	// and convey the status
	return status;
}
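// The (void *)-1 checks above are Gigablast's non-blocking convention: an
// accessor returns a real pointer when ready, NULL with g_errno set on
// error, or -1 when the operation blocked and the registered callback
// (here processLoop itself) will be invoked again later. Below is a
// minimal, self-contained sketch of that tri-state idiom; FakeDoc and
// processExample are illustrative stand-ins, not the real XmlDoc API.
#include <cstdio>

struct FakeDoc {
	int m_calls = 0;
	// pretend to be async: "block" on the first call, succeed after
	const char **getContent() {
		static const char *s_content = "<html>hi</html>";
		if ( ++m_calls < 2 ) return (const char **)-1;
		return &s_content;
	}
};

static bool processExample ( FakeDoc *doc ) {
	const char **c = doc->getContent();
	if ( c == (const char **)-1 ) return false; // blocked: wait for callback
	if ( ! c ) return true;                     // error path: reply + free state
	printf("got content: %s\n", *c);
	return true;                                // completed without blocking
}

int main() {
	FakeDoc doc;
	// the event loop would normally re-invoke us via the callback
	while ( ! processExample(&doc) ) { }
	return 0;
}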
bool qaspider1 ( ) {
	//
	// delete the 'qatest123' collection
	//
	if ( ! s_flags[0] ) {
		s_flags[0] = true;
		if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) )
			return false;
	}

	//
	// add the 'qatest123' collection
	//
	if ( ! s_flags[1] ) {
		s_flags[1] = true;
		if ( ! getUrl ( "/admin/addcoll?addcoll=qatest123&xml=1" ,
				// checksum of reply expected
				238170006 ) )
			return false;
	}

	// restrict hopcount to 0 or 1 in url filters so we do not spider
	// too deep
	if ( ! s_flags[2] ) {
		s_flags[2] = true;
		SafeBuf sb;
		sb.safePrintf("&c=qatest123&"
			      // make it the custom filter
			      "ufp=0&"
			      "fe=%%21ismanualadd+%%26%%26+%%21insitelist&hspl=0&hspl=1&fsf=0.000000&mspr=0&mspi=1&xg=1000&fsp=-3&"
			      // take out hopcount for now, just test quotas
			      //"fe1=tag%%3Ashallow+%%26%%26+hopcount%%3C%%3D1&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=3&"
			      // just one spider out allowed for consistency
			      "fe1=tag%%3Ashallow+%%26%%26+sitepages%%3C%%3D20&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=45&"
			      "fe2=default&hspl2=0&hspl2=1&fsf2=1.000000&mspr2=0&mspi2=1&xg2=1000&fsp2=45&"
			      );
		if ( ! getUrl ( "/admin/filters",0,sb.getBufStart()) )
			return false;
	}

	// set the site list to a few sites
	if ( ! s_flags[3] ) {
		s_flags[3] = true;
		SafeBuf sb;
		sb.safePrintf("&c=qatest123&format=xml&sitelist=");
		sb.urlEncode("tag:shallow site:www.walmart.com\r\n"
			     "tag:shallow site:http://www.ibm.com/\r\n");
		sb.nullTerm();
		if ( ! getUrl ("/admin/settings",0,sb.getBufStart() ) )
			return false;
	}

	//
	// use the add url interface now.
	// walmart.com above was not seeded because of the site: directive,
	// so this will seed it.
	//
	if ( ! s_flags[4] ) {
		s_flags[4] = true;
		SafeBuf sb;
		// . now a list of websites we want to spider
		// . the space is already encoded as +
		sb.safePrintf("&c=qatest123"
			      "&format=json"
			      "&strip=1"
			      "&spiderlinks=1"
			      "&urls=www.walmart.com+ibm.com" );
		if ( ! getUrl ( "/admin/addurl",0,sb.getBufStart()) )
			return false;
	}

	//
	// wait for spidering to stop
	//
 checkagain:

	// wait until the spider finishes. check the spider status page
	// in json to see when completed
	if ( ! s_flags[5] ) {
		// wait 3 seconds, call the sleep timer... then call qatest()
		wait(3.0);
		s_flags[5] = true;
		return false;
	}

	if ( ! s_flags[15] ) {
		s_flags[15] = true;
		if ( ! getUrl ( "/admin/status?format=json&c=qatest123",0) )
			return false;
	}

	if ( ! s_flags[6] ) {
		// ensure spiders are done.
		// "Nothing currently available to spider"
		if ( s_content && !strstr(s_content,"Nothing currently avail")){
			s_flags[5]  = false;
			s_flags[15] = false;
			goto checkagain;
		}
		s_flags[6] = true;
	}

	// wait for the index msg4 cache to flush to ensure all results are
	// indexed
	if ( ! s_flags[22] ) {
		s_flags[22] = true;
		wait(1.5);
	}

	// verify no results for gbhopcount:2 query
	if ( ! s_flags[7] ) {
		s_flags[7] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
				"q=gbhopcount%3A2", -1672870556 ) )
			return false;
	}

	// but some for gbhopcount:0 query
	if ( ! s_flags[8] ) {
		s_flags[8] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
				"q=gbhopcount%3A0", 908338607 ) )
			return false;
	}

	// check facet sections query for walmart
	if ( ! s_flags[9] ) {
		s_flags[9] = true;
		if ( ! getUrl ( "/search?c=qatest123&format=json&stream=1&"
				"q=gbfacetstr%3Agbxpathsitehash2492664135",
				55157060 ) )
			return false;
	}

	if ( ! s_flags[10] ) {
		s_flags[10] = true;
		if ( ! getUrl ( "/get?page=4&q=gbfacetstr:gbxpathsitehash2492664135&qlang=xx&c=qatest123&d=9861563119&cnsp=0" , 999 ) )
			return false;
	}

	// in xml
	if ( ! s_flags[11] ) {
		s_flags[11] = true;
		if ( ! getUrl ( "/get?xml=1&page=4&q=gbfacetstr:gbxpathsitehash2492664135&qlang=xx&c=qatest123&d=9861563119&cnsp=0" , 999 ) )
			return false;
	}

	// and json
	if ( ! s_flags[12] ) {
		s_flags[12] = true;
		if ( ! getUrl ( "/get?json=1&page=4&q=gbfacetstr:gbxpathsitehash2492664135&qlang=xx&c=qatest123&d=9861563119&cnsp=0" , 999 ) )
			return false;
	}

	// delete the collection
	//if ( ! s_flags[13] ) {
	//	s_flags[13] = true;
	//	if ( ! getUrl ( "/admin/delcoll?delcoll=qatest123" ) )
	//		return false;
	//}

	if ( ! s_flags[17] ) {
		s_flags[17] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
				"q=site2%3Awww.walmart.com+"
				"gbsortby%3Agbspiderdate", 999 ) )
			return false;
	}

	// the xpath is like a title here i think. check the returned
	// facet table in the left column
	if ( ! s_flags[18] ) {
		s_flags[18] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=html&"
				"q=gbfacetstr%3Agbxpathsitehash3624590799" ,
				999 ) )
			return false;
	}

	if ( ! s_flags[19] ) {
		s_flags[19] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&xml=1&"
				"q=gbfacetint%3Agbhopcount" , 999 ) )
			return false;
	}

	if ( ! s_flags[20] ) {
		s_flags[20] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&json=1&"
				"q=gbfacetint%3Alog.score" , 999 ) )
			return false;
	}

	if ( ! s_flags[21] ) {
		s_flags[21] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&xml=1&"
				"q=gbfacetfloat%3Atalks.rating" , 999 ) )
			return false;
	}

	if ( ! s_flags[23] ) {
		s_flags[23] = true;
		// test facets mixed with gigabits in the left hand column
		if ( ! getUrl ( "/search?c=qatest123&qa=1&html=1&"
				"q=gbfacetint%3Agbhopcount+walmart" , 999 ) )
			return false;
	}

	if ( ! s_flags[14] ) {
		s_flags[14] = true;
		log("qa: SUCCESSFULLY COMPLETED "
		    "QA SPIDER1 TEST");
		return true;
	}

	return true;
}
//
// the injection qa test suite
//
bool qainject1 ( ) {
	//
	// delete the 'qatest123' collection
	//
	if ( ! s_flags[0] ) {
		s_flags[0] = true;
		if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) )
			return false;
	}

	//
	// add the 'qatest123' collection
	//
	if ( ! s_flags[1] ) {
		s_flags[1] = true;
		if ( ! getUrl ( "/admin/addcoll?addcoll=qatest123&xml=1" ,
				// checksum of reply expected
				238170006 ) )
			return false;
	}

	// this only loads once
	loadUrls();
	long max = s_ubuf2.length()/(long)sizeof(char *);

	//
	// inject urls, return false if not done yet
	//
	if ( ! s_flags[2] ) {
		// TODO: try delimiter based injection too
		for ( ; s_flags[20] < max ; ) {
			// inject using the html api
			SafeBuf sb;
			sb.safePrintf("&c=qatest123&deleteurl=0&"
				      "format=xml&u=");
			sb.urlEncode ( s_urlPtrs[s_flags[20]] );
			// the content
			sb.safePrintf("&hasmime=1");
			sb.safePrintf("&content=");
			sb.urlEncode(s_contentPtrs[s_flags[20]] );
			sb.nullTerm();
			// pre-inc it in case getUrl() blocks
			s_flags[20]++;
			if ( ! getUrl("/admin/inject",
				      0, // no idea what crc to expect
				      sb.getBufStart()) )
				return false;
		}
		s_flags[2] = true;
	}

	// +the
	if ( ! s_flags[3] ) {
		wait(1.5);
		s_flags[3] = true;
		return false;
	}

	if ( ! s_flags[16] ) {
		s_flags[16] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=%2Bthe",
				702467314 ) )
			return false;
	}

	// sports news
	if ( ! s_flags[4] ) {
		s_flags[4] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
				"q=sports+news",2009472889 ) )
			return false;
	}

	// 'washer & dryer' does some algorithmic synonyms 'washer and dryer'
	if ( ! s_flags[15] ) {
		s_flags[15] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
				"debug=1&q=washer+%26+dryer",9999 ) )
			return false;
	}

	// mdw: query reindex test
	//if ( ! s_flags[30] ) {
	//	s_flags[30] = true;
	//	if ( ! getUrl ( "/admin/reindex?c=qatest123&qa=1&format=xml&"
	//			"debug=1&q=sports",9999 ) )
	//		return false;
	//}

	//
	// eject/delete the urls
	//
	for ( ; s_flags[5] < max ; ) {
		// eject using the html api
		SafeBuf sb;
		sb.safePrintf( "/admin/inject?c=qatest123&deleteurl=1&"
			       "format=xml&u=");
		sb.urlEncode ( s_urlPtrs[s_flags[5]] );
		sb.nullTerm();
		// pre-inc it in case getUrl() blocks
		s_flags[5]++;
		if ( ! getUrl ( sb.getBufStart() , 0 ) )
			return false;
	}

	//
	// make sure no results left, +the
	//
	if ( ! s_flags[6] ) {
		wait(1.5);
		s_flags[6] = true;
		return false;
	}

	if ( ! s_flags[14] ) {
		s_flags[14] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=2&format=xml&q=%2Bthe",
				-1672870556 ) )
			return false;
	}

	if ( ! s_flags[13] ) {
		s_flags[13] = true;
		log("qa: SUCCESSFULLY COMPLETED "
		    "QA INJECT TEST 1");
		return true;
	}

	return true;
}
bool qaspider2 ( ) {
	//
	// delete the 'qatest123' collection
	//
	if ( ! s_flags[0] ) {
		s_flags[0] = true;
		if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) )
			return false;
	}

	//
	// add the 'qatest123' collection
	//
	if ( ! s_flags[1] ) {
		s_flags[1] = true;
		if ( ! getUrl ( "/admin/addcoll?addcoll=qatest123&xml=1" ,
				// checksum of reply expected
				238170006 ) )
			return false;
	}

	// restrict hopcount to 0 or 1 in url filters so we do not spider
	// too deep
	if ( ! s_flags[2] ) {
		s_flags[2] = true;
		SafeBuf sb;
		sb.safePrintf("&c=qatest123&"
			      // make it the custom filter
			      "ufp=0&"
			      "fe=%%21ismanualadd+%%26%%26+%%21insitelist&hspl=0&hspl=1&fsf=0.000000&mspr=0&mspi=1&xg=1000&fsp=-3&"
			      // sitepages is a little fuzzy so take it
			      // out for this test and use hopcount!!!
			      //"fe1=tag%%3Ashallow+%%26%%26+sitepages%%3C%%3D20&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=45&"
			      "fe1=tag%%3Ashallow+%%26%%26+hopcount<%%3D1&hspl1=0&hspl1=1&fsf1=1.000000&mspr1=1&mspi1=1&xg1=1000&fsp1=45&"
			      "fe2=default&hspl2=0&hspl2=1&fsf2=1.000000&mspr2=0&mspi2=1&xg2=1000&fsp2=45&"
			      );
		if ( ! getUrl ( "/admin/filters",0,sb.getBufStart()) )
			return false;
	}

	// set the site list to a few sites.
	// these should auto seed so no need to use addurl
	if ( ! s_flags[3] ) {
		s_flags[3] = true;
		SafeBuf sb;
		sb.safePrintf("&c=qatest123&format=xml&sitelist=");
		// walmart has too many pages at depth 1, so remove it
		//sb.urlEncode("tag:shallow www.walmart.com\r\n");
		sb.urlEncode("tag:shallow http://www.ibm.com/\r\n");
		sb.nullTerm();
		if ( ! getUrl ("/admin/settings",0,sb.getBufStart() ) )
			return false;
	}

	//
	// wait for spidering to stop
	//
 checkagain:

	// wait until the spider finishes. check the spider status page
	// in json to see when completed
	if ( ! s_flags[4] ) {
		s_flags[4] = true;
		wait(3.0);
		return false;
	}

	if ( ! s_flags[14] ) {
		s_flags[14] = true;
		if ( ! getUrl ( "/admin/status?format=json&c=qatest123",0) )
			return false;
	}

	if ( ! s_flags[5] ) {
		// ensure spiders are done.
		// "Nothing currently available to spider"
		if ( s_content && !strstr(s_content,"Nothing currently avail")){
			s_flags[4]  = false;
			s_flags[14] = false;
			goto checkagain;
		}
		s_flags[5] = true;
	}

	// verify no results for gbhopcount:2 query
	if ( ! s_flags[6] ) {
		s_flags[6] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
				"q=gbhopcount%3A2", -1310551262 ) )
			return false;
	}

	// but some for gbhopcount:0 query
	if ( ! s_flags[7] ) {
		s_flags[7] = true;
		if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&n=500&"
				"q=gbhopcount%3A0", 999 ) )
			return false;
	}

	// check facet sections query for walmart
	if ( ! s_flags[8] ) {
		s_flags[8] = true;
		if ( ! getUrl ( "/search?c=qatest123&format=json&stream=0&"
				"q=gbfacetstr%3Agbxpathsitehash3311332088",
				999 ) )
			return false;
	}

	// wait for some reason
	if ( ! s_flags[15] ) {
		s_flags[15] = true;
		wait(1.5);
		return false;
	}

	if ( ! s_flags[9] ) {
		s_flags[9] = true;
		if ( ! getUrl ( "/get?page=4&q=gbfacetstr:gbxpathsitehash3311332088&qlang=xx&c=qatest123&d=9577169402&cnsp=0" , 999 ) )
			return false;
	}

	// in xml
	if ( ! s_flags[10] ) {
		s_flags[10] = true;
		if ( ! getUrl ( "/get?xml=1&page=4&q=gbfacetstr:gbxpathsitehash2492664135&qlang=xx&c=qatest123&d=9577169402&cnsp=0" , 999 ) )
			return false;
	}

	// and json
	if ( ! s_flags[11] ) {
		s_flags[11] = true;
		if ( ! getUrl ( "/get?json=1&page=4&q=gbfacetstr:gbxpathsitehash2492664135&qlang=xx&c=qatest123&d=9577169402&cnsp=0" , 999 ) )
			return false;
	}

	// delete the collection
	//if ( ! s_flags[12] ) {
	//	s_flags[12] = true;
	//	if ( ! getUrl ( "/admin/delcoll?delcoll=qatest123" ) )
	//		return false;
	//}

	if ( ! s_flags[13] ) {
		s_flags[13] = true;
		log("qa: SUCCESSFULLY COMPLETED "
		    "QA SPIDER2 TEST");
		return true;
	}

	return true;
}
// . return the score of the highest-scoring window containing match #m
// . the window is defined by the half-open interval [a,b) where a and b are
//   word #'s in the Words array indicated by match #m
// . return -1 and set g_errno on error
int64_t Summary::getBestWindow ( Matches *matches, int32_t mm, int32_t *lasta,
				 int32_t *besta, int32_t *bestb, char *gotIt,
				 char *retired, int32_t maxExcerptLen ) {
	// get the window around match #mm
	Match *m = &matches->m_matches[mm];

	// what is the word # of match #mm?
	int32_t matchWordNum = m->m_wordNum;

	// what Words/Pos/Bits classes is this match in?
	Words *words = m->m_words;
	Section **sp = NULL;
	int32_t *pos = m->m_pos->m_pos;

	// use "m_swbits" not "m_bits"; that is what Bits::setForSummary() uses
	const swbit_t *bb = m->m_bits->m_swbits;

	// shortcut
	if ( m->m_sections ) {
		sp = m->m_sections->m_sectionPtrs;
	}

	int32_t   nw   = words->getNumWords();
	int64_t  *wids = words->getWordIds();
	nodeid_t *tids = words->getTagIds();

	// . sanity check
	// . this prevents a core i've seen
	if ( matchWordNum >= nw ) {
		log("summary: got overflow condition for q=%s",m_q->m_orig);

		// assume no best window
		*besta = -1;
		*bestb = -1;
		*lasta = matchWordNum;
		return 0;
	}

	// . we NULLify the section ptrs if we already used the word in
	//   another summary.
	int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_IN_TITLE;
	if ( (bb[matchWordNum] & D_USED) ||
	     ( sp && (sp[matchWordNum]->m_flags & badFlags) ) ) {
		// assume no best window
		*besta = -1;
		*bestb = -1;
		*lasta = matchWordNum;
		return 0;
	}

	// . "a" is the left fence post of the window (it is a word # in Words)
	// . go to the left as far as we can
	// . thus we decrement "a"
	int32_t a = matchWordNum;

	// "posa" is the character position of the END of word #a
	int32_t posa = pos[a+1];
	int32_t firstFrag = -1;
	bool startOnQuote = false;
	bool goodStart = false;
	int32_t wordCount = 0;

	// . decrease "a" as long as we stay within maxExcerptLen
	// . avoid duplicating windows by using "lasta", the last "a" of the
	//   previous call to getBestWindow(). This can happen if our last
	//   central query term was close to this one.
	for ( ; a > 0 && posa - pos[a-1] < maxExcerptLen && a > *lasta; a-- ) {
		// . don't include any "dead zone"
		// . dead zones have already been used for the summary, and
		//   we are getting a second/third/... excerpt here now.
		// also stop if it's the start of a sentence,
		// and stop before a title word.
		if ( (bb[a-1] & D_USED) ||
		     (bb[a] & D_STARTS_SENTENCE) ||
		     (bb[a-1] & D_IN_TITLE) ) {
			goodStart = true;
			break;
		}

		// don't go beyond an LI, TR, P or DIV tag
		if ( tids && ( tids[a-1] == TAG_LI ||
			       tids[a-1] == TAG_TR ||
			       tids[a-1] == TAG_P  ||
			       tids[a-1] == TAG_DIV ) ) {
			goodStart = true;
			break;
		}

		// stop if it's the start of a quoted sentence
		if ( a+1 < nw && (bb[a+1] & D_IN_QUOTES) &&
		     words->getWord(a)[0] == '\"' ) {
			startOnQuote = true;
			goodStart = true;
			break;
		}

		// find the first instance of a fragment start (comma, etc.)
		// watch out! "frag" also means the 's' in "there's"
		if ( ( bb[a] & D_STARTS_FRAG ) &&
		     !(bb[a-1] & D_IS_STRONG_CONNECTOR) &&
		     firstFrag == -1 ) {
			firstFrag = a;
		}

		if ( wids[a] ) {
			wordCount++;
		}
	}

	// if we didn't find a good start, then start at the start of the frag
	if ( !goodStart && firstFrag != -1 ) {
		a = firstFrag;
	}

	// don't let a punct or tag word start a line, unless it is a quote
	if ( a < matchWordNum && !wids[a] && words->getWord(a)[0] != '\"' ) {
		while ( a < matchWordNum && !wids[a] ) a++;

		// do not break right after a "strong connector", like an
		// apostrophe
		while ( a < matchWordNum && a > 0 &&
			( bb[a-1] & D_IS_STRONG_CONNECTOR ) )
			a++;

		// don't let a punct or tag word start a line
		while ( a < matchWordNum && !wids[a] ) a++;
	}

	// remember, b is not included in the summary, the summary is [a,b-1].
	// remember to include all words in a matched phrase
	int32_t b = matchWordNum + m->m_numWords;
	int32_t endQuoteWordNum = -1;
	int32_t numTagsCrossed = 0;

	for ( ; b <= nw; b++ ) {
		if ( b == nw ) {
			break;
		}

		if ( pos[b+1] - pos[a] >= maxExcerptLen ) {
			break;
		}

		if ( startOnQuote && words->getWord(b)[0] == '\"' ) {
			endQuoteWordNum = b;
		}

		// don't include any dead zone, those are already-used samples
		if ( bb[b] & D_USED ) {
			break;
		}

		// stop on a title word
		if ( bb[b] & D_IN_TITLE ) {
			break;
		}

		if ( wids[b] ) {
			wordCount++;
		}

		// don't go beyond an LI or TR backtag
		if ( tids && ( tids[b] == (BACKBIT|TAG_LI) ||
			       tids[b] == (BACKBIT|TAG_TR) ) ) {
			numTagsCrossed++;

			// try to have at least 10 words in the summary
			if ( wordCount > 10 ) {
				break;
			}
		}

		// go beyond a P or DIV backtag in case the earlier char is a
		// ':'. This came from a special case for wikipedia pages,
		// e.g. http://en.wikipedia.org/wiki/Flyover
		if ( tids && ( tids[b] == (BACKBIT|TAG_P) ||
			       tids[b] == (BACKBIT|TAG_DIV) ) ) {
			numTagsCrossed++;

			// try to have at least 10 words in the summary
			if ( wordCount > 10 && words->getWord(b-1)[0] != ':' ) {
				break;
			}
		}
	}

	// don't end on a lot of punct words
	if ( b > matchWordNum && !wids[b-1] ) {
		// remove more than one punct word. if we're ending on a quote,
		// keep it
		while ( b > matchWordNum && !wids[b-2] &&
			endQuoteWordNum != -1 && b > endQuoteWordNum ) {
			b--;
		}

		// do not break right after a "strong connector", like an
		// apostrophe
		while ( b > matchWordNum &&
			(bb[b-2] & D_IS_STRONG_CONNECTOR) ) {
			b--;
		}
	}

	Match *ms = matches->m_matches;

	// make m_matches.m_matches[mi] the first match in our [a,b) window
	int32_t mi;

	// . the match at the center of the window is match #"mm", so that
	//   matches->m_matches[mm] is the Match class
	// . set "mi" to it and back up "mi" as long as >= a
	for ( mi = mm ; mi > 0 && ms[mi-1].m_wordNum >= a ; mi-- )
		;

	// now get the score of this excerpt. also mark all the represented
	// query words in the array that comes to us, and mark how many times
	// the same word is repeated in this summary.
	int64_t score = 0LL;

	// is a url contained in the summary? that looks bad! punish!
	bool hasUrl = false;

	// the word count we did above was just approximate. count it right
	wordCount = 0;

	// for debug
	SafeBuf xp;

	// clamp b; shouldn't happen, but be safe
	if ( b > nw ) {
		b = nw;
	}

	// first score from the starting match down to a, including the match
	for ( int32_t i = a ; i < b ; i++ ) {
		// debug printout
		if ( g_conf.m_logDebugSummary ) {
			int32_t len = words->getWordLen(i);
			char cs;
			for ( int32_t k = 0 ; k < len ; k += cs ) {
				const char *c = words->getWord(i)+k;
				cs = getUtf8CharSize(c);
				if ( is_binary_utf8 ( c ) ) {
					continue;
				}
				xp.safeMemcpy ( c , cs );
				xp.nullTerm();
			}
		}

		// skip if in a bad section: marquee, select, script, style
		if ( sp && (sp[i]->m_flags & badFlags) ) {
			continue;
		}

		// don't count purely numeric words
		if ( words->isNum(i) ) {
			continue;
		}

		// check if there is a url. best way is to check for '://'
		if ( wids && !wids[i] ) {
			const char *wrd = words->getWord(i);
			int32_t wrdLen = words->getWordLen(i);
			if ( wrdLen == 3 && wrd[0] == ':' && wrd[1] == '/' &&
			     wrd[2] == '/' ) {
				hasUrl = true;
			}
		}

		// skip if not a word id
		if ( ! wids[i] ) {
			continue;
		}

		// just make every word 100 pts
		int32_t t = 100;

		// penalize it if in one of these sections
		if ( bb[i] & ( D_IN_PARENS | D_IN_SUP | D_IN_LIST ) ) {
			t /= 2;
		}

		// boost it if in bold or italics
		if ( bb[i] & D_IN_BOLDORITALICS ) {
			t *= 2;
		}

		// add the score for this word
		score += t;

		// print the score, "t"
		if ( g_conf.m_logDebugSummary ) {
			xp.safePrintf("(%" PRId32")",t);
		}

		// count the alpha words we got
		wordCount++;

		// if no matches left, skip
		if ( mi >= matches->m_numMatches ) {
			continue;
		}

		// get the match
		Match *next = &ms[mi];

		// skip if not a match
		if ( i != next->m_wordNum ) {
			continue;
		}

		// must be a match in this class
		if ( next->m_words != words ) {
			continue;
		}

		// advance it
		mi++;

		// which query word # does it match?
		int32_t qwn = next->m_qwordNum;
		if ( qwn < 0 || qwn >= m_q->m_numWords ) {
			g_process.shutdownAbort(true);
		}

		// undo the old score
		score -= t;

		// add 100000 per match
		t = 100000;

		// weight based on tf, goes from 0.1 to 1.0
		t = (int32_t)((float)t * m_wordWeights [ qwn ]);

		// if it is a query stop word, it contributes no points
		if ( m_q->m_qwords[qwn].m_isQueryStopWord ) {
			t = 0;
		}

		// penalize it if in one of these sections
		if ( bb[i] & ( D_IN_PARENS | D_IN_SUP | D_IN_LIST ) ) {
			t /= 2;
		}

		if ( gotIt[qwn] > 0 ) {
			// have we matched it in this [a,b) already?
			if ( gotIt[qwn] == 1 ) {
				t /= 15;
			} else {
				// if we have more than 2 matches in the same
				// window, it may not give a good summary.
				// give a heavy penalty
				t -= 200000;
			}
		} else if ( retired [qwn] > 0 ) {
			// have we matched it already in a winning window?
			t /= 12;
		}

		// add it back
		score += t;

		if ( g_conf.m_logDebugSummary ) {
			xp.safePrintf ("[%" PRId32"]{qwn=%" PRId32",ww=%f}",
				       t, qwn, m_wordWeights[qwn]);
		}

		// inc the query word count for this window
		if ( gotIt[qwn] < 100 ) {
			gotIt[qwn]++;
		}
	}

	int32_t oldScore = score;

	// apply a bonus if the window starts a sentence or fragment.
	// only apply it if the score is positive and the word count is decent
	if ( score > 0 && wordCount > 7 ) {
		// a match can give us 10k to 100k pts based on the tf weights
		// so we don't want to overwhelm that too much; make this an
		// 8k bonus if it starts a sentence
		if ( bb[a] & D_STARTS_SENTENCE ) {
			score += 8000;
		} else if ( bb[a] & D_STARTS_FRAG ) {
			// likewise, a fragment, like after a comma
			score += 4000;
		}

		// 1k if the match word is very close to the start of a
		// sentence, let's say within 7 alpha words
		if ( matchWordNum - a < 7 ) {
			score += 1000;
		}
	}

	// a summary isn't really a summary if it's less than 7 words.
	// reduce the score, but still give it a decent score.
	if ( wordCount < 7 ) {
		score -= 20000;
	}

	// summaries that cross a lot of tags are usually bad; penalize them
	if ( numTagsCrossed > 1 ) {
		score -= (numTagsCrossed * 20000);
	}

	if ( hasUrl ) {
		score -= 8000;
	}

	// show it
	if ( g_conf.m_logDebugSummary ) {
		log(LOG_DEBUG,
		    "sum: score=%08" PRId32" prescore=%08" PRId32
		    " a=%05" PRId32" b=%05" PRId32" %s",
		    (int32_t)score, oldScore, (int32_t)a, (int32_t)b,
		    xp.getBufStart());
	}

	// set lasta, besta, bestb
	*lasta = a;
	*besta = a;
	*bestb = b;

	return score;
}
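// Back-of-the-envelope sketch of the scoring rules in getBestWindow()
// above, using constants copied from the code; the window itself is
// invented for illustration. Plain alphabetic words are worth 100 each,
// a query-term match is worth 100000 scaled by its tf weight, windows of
// more than 7 words that start a sentence get +8000, a match within 7
// words of the window start gets +1000, and crossing more than one
// LI/TR/P/DIV boundary costs 20000 per tag crossed.
#include <cstdio>

int main() {
	long score = 0;
	score += 7 * 100;              // seven ordinary alpha words
	score += (long)(100000 * 0.8); // one query match, tf weight 0.8
	score += 8000;                 // window starts a sentence (8 words > 7)
	score += 1000;                 // match sits near the window start
	score -= 2 * 20000;            // crossed two back-tags (numTagsCrossed > 1)
	printf("window score = %ld\n", score); // -> 49700
	return 0;
}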
// . "uf" is printf url format to scrape with a %s for the query // . example: uf="http://www.google.com/search?num=50&q=%s&scoring=d&filter=0"; bool Msg7::scrapeQuery ( ) { // advance round now in case we return early m_round++; GigablastRequest *gr = &m_gr; // error? char *qts = gr->m_queryToScrape; if ( ! qts ) { char *xx=NULL;*xx=0; } if ( gbstrlen(qts) > 500 ) { g_errno = EQUERYTOOBIG; return true; } // first encode the query SafeBuf ebuf; ebuf.urlEncode ( qts ); // queryUNEncoded ); ebuf.nullTerm(); char *uf; if ( m_round == 1 ) // set to 1 for debugging uf="http://www.google.com/search?num=20&" "q=%s&scoring=d&filter=0"; //uf = "https://startpage.com/do/search?q=%s"; //uf = "http://www.google.com/" // "/cse?cx=013269018370076798483%3A8eec3papwpi&" // "ie=UTF-8&q=%s&" // "num=20"; else uf="http://www.bing.com/search?q=%s"; // skip bing for now //if ( m_round == 2 ) // return true; //if ( m_round == 1 ) // return true; // make the url we will download char ubuf[2048]; sprintf ( ubuf , uf , ebuf.getBufStart() ); // log it log("inject: SCRAPING %s",ubuf); SpiderRequest sreq; sreq.reset(); // set the SpiderRequest strcpy(sreq.m_url, ubuf); // . tell it to only add the hosts of each outlink for now! // . that will be passed on to when XmlDoc calls Links::set() i guess // . xd will not reschedule the scraped url into spiderdb either sreq.m_isScraping = 1; sreq.m_fakeFirstIp = 1; long firstIp = hash32n(ubuf); if ( firstIp == 0 || firstIp == -1 ) firstIp = 1; sreq.m_firstIp = firstIp; // parent docid is 0 sreq.setKey(firstIp,0LL,false); char *coll2 = gr->m_coll; CollectionRec *cr = g_collectiondb.getRec ( coll2 ); // forceDEl = false, niceness = 0 m_xd.set4 ( &sreq , NULL , cr->m_coll , NULL , 0 ); //m_xd.m_isScraping = true; // download without throttling //m_xd.m_throttleDownload = false; // disregard this m_xd.m_useRobotsTxt = false; // this will tell it to index ahrefs first before indexing // the doc. but do NOT do this if we are from ahrefs.com // ourselves to avoid recursive explosion!! if ( m_useAhrefs ) m_xd.m_useAhrefs = true; m_xd.m_reallyInjectLinks = true;//gr->m_injectLinks; // // rather than just add the links of the page to spiderdb, // let's inject them! // m_xd.setCallback ( this , doneInjectingLinksWrapper ); // niceness is 0 m_linkDedupTable.set(4,0,512,NULL,0,false,0,"ldtab2"); // do we actually inject the links, or just scrape? if ( ! m_xd.injectLinks ( &m_linkDedupTable , NULL, this , doneInjectingLinksWrapper ) ) return false; // otherwise, just download the google/bing search results so we // can display them in xml //else if ( m_xd.getUtf8Content() == (char **)-1 ) // return false; // print reply.. //printReply(); return true; }
bool HttpMime::addToCookieJar(Url *currentUrl, SafeBuf *sb) {
	/// @note Slightly modified from the Netscape HTTP Cookie File format.
	/// The difference is we only have one column for name/value.

	// http://www.cookiecentral.com/faq/#3.5
	// The layout of Netscape's cookies.txt file is such that each line
	// contains one name-value pair. An example cookies.txt file may have
	// an entry that looks like this:
	//
	//   .netscape.com  TRUE  /  FALSE  946684799  NETSCAPE_ID  100103
	//
	// Each line represents a single piece of stored information. A tab
	// is inserted between each of the fields. From left-to-right, here
	// is what each field represents:
	//
	// domain     - The domain that created AND that can read the variable.
	// flag       - A TRUE/FALSE value indicating if all machines within
	//              a given domain can access the variable. This value is
	//              set automatically by the browser, depending on the
	//              value you set for domain.
	// path       - The path within the domain that the variable is
	//              valid for.
	// secure     - A TRUE/FALSE value indicating if a secure connection
	//              with the domain is needed to access the variable.
	// expiration - The UNIX time that the variable will expire on. UNIX
	//              time is defined as the number of seconds since
	//              Jan 1, 1970 00:00:00 GMT.
	// name/value - The name/value of the variable.

	/// @todo ALC we should sort the cookie-list
	// The user agent SHOULD sort the cookie-list in the following order:
	// * Cookies with longer paths are listed before cookies with
	//   shorter paths.
	// * Among cookies that have equal-length path fields, cookies with
	//   earlier creation-times are listed before cookies with later
	//   creation-times.

	// fill in cookies from the cookie jar
	std::map<std::string, httpcookie_t> oldCookies;

	const char *cookieJar = sb->getBufStart();
	int32_t cookieJarLen = sb->length();
	const char *lineStartPos = cookieJar;
	const char *lineEndPos = NULL;

	while ((lineEndPos = (const char*)memchr(lineStartPos, '\n', cookieJarLen - (lineStartPos - cookieJar))) != NULL) {
		const char *currentPos = lineStartPos;
		const char *tabPos = NULL;
		unsigned fieldCount = 0;

		httpcookie_t cookie = {};

		while (fieldCount < 5 && (tabPos = (const char*)memchr(currentPos, '\t', lineEndPos - currentPos)) != NULL) {
			switch (fieldCount) {
				case 0:
					// domain
					cookie.m_domain = currentPos;
					cookie.m_domainLen = tabPos - currentPos;
					break;
				case 1:
					// flag
					if (memcmp(currentPos, "TRUE", 4) != 0) {
						cookie.m_defaultDomain = true;
					}
					break;
				case 2:
					// path
					cookie.m_path = currentPos;
					cookie.m_pathLen = tabPos - currentPos;
					break;
				case 3:
					// secure
					cookie.m_secure = (memcmp(currentPos, "TRUE", 4) == 0);
					break;
				case 4:
					// expiration
					break;
			}

			currentPos = tabPos + 1;
			++fieldCount;
		}

		cookie.m_cookie = currentPos;
		cookie.m_cookieLen = lineEndPos - currentPos;

		const char *equalPos = (const char *)memchr(cookie.m_cookie, '=', cookie.m_cookieLen);
		if (equalPos) {
			cookie.m_nameLen = equalPos - cookie.m_cookie;
			oldCookies[std::string(cookie.m_cookie, cookie.m_nameLen)] = cookie;
		}

		lineStartPos = lineEndPos + 1;
	}
	// we don't need to care about the last line (we always end on \n)

	SafeBuf newCookieJar;

	// add old cookies that were not superseded by a new cookie
	for (auto &pair : oldCookies) {
		if (m_cookies.find(pair.first) == m_cookies.end()) {
			addCookie(pair.second, *currentUrl, &newCookieJar);
		}
	}

	// add new cookies
	for (auto &pair : m_cookies) {
		addCookie(pair.second, *currentUrl, &newCookieJar);
	}

	newCookieJar.nullTerm();

	// replace the old jar with the new one
	sb->reset();
	sb->safeMemcpy(&newCookieJar);
	sb->nullTerm();

	return true;
}
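// Self-contained sketch of one line of the (slightly modified) Netscape
// cookie-jar format parsed above: five tab-separated metadata fields,
// then a single joined name=value column. The example line is invented;
// the real jar lives in the SafeBuf passed to addToCookieJar().
#include <cstdio>
#include <cstring>

int main() {
	const char *line =
		".netscape.com\tTRUE\t/\tFALSE\t946684799\tNETSCAPE_ID=100103\n";
	const char *p = line;
	const char *tab;
	int field = 0;
	// domain, flag, path, secure, expiration
	while ( field < 5 && (tab = strchr(p, '\t')) != NULL ) {
		printf("field %d: %.*s\n", field, (int)(tab - p), p);
		p = tab + 1;
		field++;
	}
	// whatever remains before the newline is the name=value column
	const char *nl = strchr(p, '\n');
	printf("name=value: %.*s\n", (int)(nl - p), p);
	return 0;
}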