C++ (Cpp) to_lower_a Examples

Programming Language: C++ (Cpp)

Method/Function: to_lower_a

Examples at hotexamples.com: 12

C++ (Cpp) to_lower_a - 12 examples found. These are the top rated real world C++ (Cpp) examples of to_lower_a extracted from open source projects. You can rate examples to help us improve the quality of examples.

Example #1

Show file

File: fctypes.cpp Project: privacore/open-source-search-engine

// . like strstr but haystack may not be NULL terminated
// . needle, however, IS null terminated
char *strncasestr ( char *haystack , const char *needle , 
		    int32_t haystackSize, int32_t needleSize ) {
	int32_t n = haystackSize - needleSize ;
	for ( int32_t i = 0 ; i <= n ; i++ ) {
		// keep looping if first chars do not match
		if ( to_lower_a(haystack[i]) != to_lower_a(needle[0]) ) 
			continue;
		// if needle was only 1 char it's a match
		if ( ! needle[1] ) return &haystack[i];
		// compare the whole strings now
		if ( strncasecmp ( &haystack[i] , needle , needleSize ) == 0 ) 
			return &haystack[i];			
	}
	return NULL;
}

Example #2

Show file

File: hash.cpp Project: RogerTheGreat/open-source-search-engine

uint32_t hash32Lower_a ( char *s,int32_t len,uint32_t startHash) {
    uint32_t h = startHash;
    register int32_t i = 0;
    while ( i < len ) {
        h ^= (uint32_t) g_hashtab [(unsigned char)i]
             [(unsigned char)to_lower_a(s[i])];
        i++;
    }
    return h;
}

Example #3

Show file

File: LanguageIdentifier.cpp Project: BlaBlaNet/open-source-search-engine

uint8_t getLanguageFromAbbr2 ( char *str , int32_t len ) {
	// truncate
	if ( len > 5 ) len = 5;
	// copy it and check it
	char lang[6];
	for ( int32_t j = 0 ; j < len ; j++ )
		lang[j] = to_lower_a(str[j]);
	lang[len]='\0';
	return getLanguageFromAbbr(lang);
}

Example #4

Show file

File: hash.cpp Project: BILObilo/open-source-search-engine

unsigned long hash32Lower_a ( char *s,long len,unsigned long startHash){
	unsigned long h = startHash;
	long i = 0;
	while ( i < len ) {
		h ^= (unsigned long) g_hashtab [(unsigned char)i] 
			[(unsigned char)to_lower_a(s[i])];
		i++;
	}
	return h;
}

Example #5

Show file

File: fctypes.cpp Project: privacore/open-source-search-engine

char *strncasestr( char *haystack, int32_t haylen, const char *needle){
	int32_t matchLen = 0;
	int32_t needleLen = strlen(needle);
	for (int32_t i = 0; i < haylen;i++){
		char c1 = to_lower_a(haystack[i]);
		char c2 = to_lower_a(needle[matchLen]);
		if ( c1 != c2 ){
			// no match
			matchLen = 0;
			continue;
		}
		// we matched another character
		matchLen++;
		if (matchLen < needleLen) continue;
		
		// we've matched the whole string
		return haystack + i - matchLen + 1;
	}
	return NULL;
}

Example #6

Show file

File: CountryCode.cpp Project: ariosx/open-source-search-engine

// get the id from a 2 character country code
uint8_t getCountryId ( char *cc ) {
	static bool s_init = false;
	static char buf[2000];
	static HashTableX ht;
	char tmp[4];
	if ( ! s_init ) {
		s_init = true;
		// hash them up
		ht.set ( 4 , 1 , -1,buf,2000,false,MAX_NICENESS,"ctryids");
		// now add in all the country codes
		long n = (long) sizeof(s_countryCode) / sizeof(char *); 
		for ( long i = 0 ; i < n ; i++ ) {
			char *s    = (char *)s_countryCode[i];
			//long  slen = gbstrlen ( s );
			// sanity check
			if ( !s[0] || !s[1] || s[2]) { char *xx=NULL;*xx=0; }
			// map it to a 4 byte key
			tmp[0]=s[0];
			tmp[1]=s[1];
			tmp[2]=0;
			tmp[3]=0;
			// a val of 0 does not mean empty in HashTableX,
			// that is an artifact of HashTableT
			uint8_t val = i; // +1;
			// add 1 cuz 0 means lang unknown
			if ( ! ht.addKey ( tmp , &val ) ) {
				char *xx=NULL;*xx=0; }
		}
	}
	// lookup
	tmp[0]=to_lower_a(cc[0]);
	tmp[1]=to_lower_a(cc[1]);
	tmp[2]=0;
	tmp[3]=0;
	long slot = ht.getSlot ( tmp );
	if ( slot < 0 ) return 0;
	void *val = ht.getValueFromSlot ( slot );
	return *(uint8_t *)val ;
}

Example #7

Show file

File: PageParser.cpp Project: acchou/open-source-search-engine

// . a new interface so Msg3b can call this with "s" set to NULL
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool sendPageParser2 ( TcpSocket   *s , 
		       HttpRequest *r ,
		       State8      *st ,
		       long long    docId ,
		       Query       *q ,
		       // in query term space, not imap space
		       long long   *termFreqs       ,
		       // in imap space
		       float       *termFreqWeights ,
		       // in imap space
		       float       *affWeights      ,
		       void        *state ,
		       void       (* callback)(void *state) ) {

	//log("parser: read sock=%li",s->m_sd);

	// might a simple request to addsomething to validated.*.txt file
	// from XmlDoc::print() or XmlDoc::validateOutput()
	char *add = r->getString("add",NULL);
	//long long uh64 = r->getLongLong("uh64",0LL);
	char *uh64str = r->getString("uh64",NULL);
	//char *divTag = r->getString("div",NULL);
	if ( uh64str ) {
		// convert add to number
		long addNum = 0;
		if ( to_lower_a(add[0])=='t' ) // "true" or "false"?
			addNum = 1;
		// convert it. skip beginning "str" inserted to prevent
		// javascript from messing with the long long since it
		// was rounding it!
		//long long uh64 = atoll(uh64str);//+3);
		// urldecode that
		//long divTagLen = gbstrlen(divTag);
		//long newLen  = urlDecode ( divTag , divTag , divTagLen );
		// null term?
		//divTag[newLen] = '\0';
		// do it. this is defined in XmlDoc.cpp
		//addCheckboxSpan ( uh64 , divTag , addNum );
		// make basic reply
		char *reply;
		reply = "HTTP/1.0 200 OK\r\n"
			"Connection: Close\r\n";
		// that is it! send a basic reply ok
		bool status = g_httpServer.sendDynamicPage( s , 
							    reply,
							    gbstrlen(reply),
							    -1, //cachtime
							    false ,//postreply?
							    NULL, //ctype
							    -1 , //httpstatus
							    NULL,//cookie
							    "utf-8");
		return status;
	}

	// make a state
	if (   st ) st->m_freeIt = false;
	if ( ! st ) {
		try { st = new (State8); }
		catch ( ... ) {
			g_errno = ENOMEM;
			log("PageParser: new(%i): %s", 
			    (int)sizeof(State8),mstrerror(g_errno));
			return g_httpServer.sendErrorReply(s,500,
						       mstrerror(g_errno));}
		mnew ( st , sizeof(State8) , "PageParser" );
		st->m_freeIt = true;
	}
	// msg3b uses this to get a score from the query
	st->m_state           = state;
	st->m_callback        = callback;
	st->m_q               = q;
	st->m_termFreqs       = termFreqs;
	st->m_termFreqWeights = termFreqWeights;
	st->m_affWeights      = affWeights;
	//st->m_total           = (score_t)-1;
	st->m_indexCode       = 0;
	st->m_blocked         = false;
	st->m_didRootDom      = false;
	st->m_didRootWWW      = false;
	st->m_wasRootDom      = false;
	st->m_u               = NULL;
	st->m_recompute       = false;
	//st->m_url.reset();

	// do not allow more than one to be launched at a time if in 
	// a quickpoll. will cause quickpoll in quickpoll.
	g_inPageParser = true;

	// password, too
	long pwdLen = 0;
	char *pwd = r->getString ( "pwd" , &pwdLen );
	if ( pwdLen > 31 ) pwdLen = 31;
	if ( pwdLen > 0 ) strncpy ( st->m_pwd , pwd , pwdLen );
	st->m_pwd[pwdLen]='\0';

	// save socket ptr
	st->m_s = s;
	st->m_r.copy ( r );
	// get the collection
	char *coll    = r->getString ( "c" , &st->m_collLen ,NULL /*default*/);
	if ( st->m_collLen > MAX_COLL_LEN )
		return sendErrorReply ( st , ENOBUFS );
	if ( ! coll )
		return sendErrorReply ( st , ENOCOLLREC );
	strcpy ( st->m_coll , coll );

	// version to use, if -1 use latest
	st->m_titleRecVersion = r->getLong("version",-1);
	if ( st->m_titleRecVersion == -1 ) 
		st->m_titleRecVersion = TITLEREC_CURRENT_VERSION;
	// default to 0 if not provided
	st->m_hopCount = r->getLong("hc",0);
	//long  ulen    = 0;
	//char *u     = r->getString ( "u" , &ulen     , NULL /*default*/);
	long  old     = r->getLong   ( "old", 0 );
	// set query
	long qlen;
	char *qs = r->getString("q",&qlen,NULL);
	if ( qs ) st->m_tq.set2 ( qs , langUnknown , true );
	// url will override docid if given
	if ( ! st->m_u || ! st->m_u[0] ) 
		st->m_docId = r->getLongLong ("docid",-1);
	else    
		st->m_docId = -1;
	// set url in state class (may have length 0)
	//if ( u ) st->m_url.set ( u , ulen );
	//st->m_urlLen = ulen;
	st->m_u = st->m_r.getString("u",&st->m_ulen,NULL);
	// should we recycle link info?
	st->m_recycle  = r->getLong("recycle",0);
	st->m_recycle2 = r->getLong("recycleimp",0);
	st->m_render   = r->getLong("render" ,0);
	// for quality computation... takes way longer cuz we have to 
	// lookup the IP address of every outlink, so we can get its root
	// quality using Msg25 which needs to filter out voters from that IP
	// range.
	st->m_oips     = r->getLong("oips"    ,0);

	long  linkInfoLen  = 0;
	// default is NULL
	char *linkInfoColl = r->getString ( "oli" , &linkInfoLen, NULL );
	if ( linkInfoColl ) strcpy ( st->m_linkInfoColl , linkInfoColl );
	else st->m_linkInfoColl[0] = '\0';
	
	// set the flag in our SafeBuf class so that Words.cpp knows to show 
	// html or html source depending on this value
	st->m_xbuf.m_renderHtml = st->m_render;

	// should we use the old title rec?
	st->m_old    = old;
	// are we coming from a local machine?
	st->m_isLocal = r->isLocal();
 	//no more setting the default root quality to 30, instead if we do not
 	// know it setting it to -1
 	st->m_rootQuality=-1;







	// header
	SafeBuf *xbuf = &st->m_xbuf;
	xbuf->safePrintf("<meta http-equiv=\"Content-Type\" "
			 "content=\"text/html; charset=utf-8\">\n");

	// print standard header
	g_pages.printAdminTop ( xbuf , st->m_s , &st->m_r );


	// print the standard header for admin pages
	char *dd     = "";
	char *rr     = "";
	char *rr2    = "";
	char *render = "";
	char *oips   = "";
	char *us     = "";
	if ( st->m_u && st->m_u[0] ) us = st->m_u;
	//if ( st->m_sfn != -1 ) sprintf ( rtu , "%li",st->m_sfn );
	if ( st->m_old ) dd = " checked";
	if ( st->m_recycle            ) rr     = " checked";
	if ( st->m_recycle2           ) rr2    = " checked";
	if ( st->m_render             ) render = " checked";
	if ( st->m_oips               ) oips   = " checked";

	xbuf->safePrintf(
			 "<style>"
			 ".poo { background-color:#%s;}\n"
			 "</style>\n" ,
			 LIGHT_BLUE );


	long clen;
	char *contentParm = r->getString("content",&clen,"");
	
	// print the input form
	xbuf->safePrintf (
		       "<style>\n"
		       "h2{font-size: 12px; color: #666666;}\n"

		       ".gbtag { border: 1px solid gray;"
		  	"background: #ffffef;display:inline;}\n"
		  	".gbcomment { border: 1px solid gray;"
		  	"color: #888888; font-style:italic; "
		  	"background: #ffffef;display:inline;}\n"

		       ".token { border: 1px solid gray;"
		       "background: #f0ffff;display:inline;}\n"
		       ".spam { border: 1px solid gray;"
		       "background: #af0000;"
		       "color: #ffffa0;}"
		       ".hs {color: #009900;}"
		       "</style>\n"
		       "<center>"

		  "<table %s>"

			  "<tr><td colspan=5><center><b>"
			  "Parser"
			  "</b></center></td></tr>\n"

		  "<tr class=poo>"
		  "<td>"
		  "<b>url</b>"
			  "<br><font size=-2>"
			  "Type in <b>FULL</b> url to parse."
			  "</font>"
		  "</td>"

		  "</td>"
		  "<td>"
		  "<input type=text name=u value=\"%s\" size=\"40\">\n"
		  "</td>"
		  "</tr>"


			  /*
		  "<tr class=poo>"
		  "<td>"
		  "Parser version to use: "
		  "</td>"
		  "<td>"
		  "<input type=text name=\"version\" size=\"4\" value=\"-1\"> "
		  "</td>"
		  "<td>"
		  "(-1 means to use latest title rec version)<br>"
		  "</td>"
		  "</tr>"
			  */

			  /*
		  "<tr class=poo>"
		  "<td>"
		  "Hop count to use: "
		  "</td>"
		  "<td>"
		  "<input type=text name=\"hc\" size=\"4\" value=\"%li\"> "
		  "</td>"
		  "<td>"
		  "(-1 is unknown. For root urls hopcount is always 0)<br>"
		  "</td>"
		  "</tr>"
			  */

		  "<tr class=poo>"
		  "<td>"
			  "<b>use cached</b>"

			  "<br><font size=-2>"
			  "Load page from cache (titledb)?"
			  "</font>"

		  "</td>"
		  "<td>"
		  "<input type=checkbox name=old value=1%s> "
		  "</td>"
		  "</tr>"

			  /*
		  "<tr class=poo>"
		  "<td>"
		  "Reparse root:"
		  "</td>"
		  "<td>"
		  "<input type=checkbox name=artr value=1%s> "
		  "</td>"
		  "<td>"
		  "Apply selected ruleset to root to update quality"
		  "</td>"
		  "</tr>"
			  */

		  "<tr class=poo>"
		  "<td>"
		  "<b>recycle link info</b>"

			  "<br><font size=-2>"
			  "Recycle the link info from the title rec"
			  "Load page from cache (titledb)?"
			  "</font>"

		  "</td>"
		  "<td>"
		  "<input type=checkbox name=recycle value=1%s> "
		  "</td>"
		  "</tr>"

			  /*
		  "<tr class=poo>"
		  "<td>"
		  "Recycle Link Info Imported:"
		  "</td>"
		  "<td>"
		  "<input type=checkbox name=recycleimp value=1%s> "
		  "</td>"
		  "<td>"
		  "Recycle the link info imported from other coll"
		  "</td>"
		  "</tr>"
			  */

		  "<tr class=poo>"
		  "<td>"
		  "<b>render html</b>"

			  "<br><font size=-2>"
			  "Render document content as HTML"
			  "</font>"

		  "</td>"
		  "<td>"
		  "<input type=checkbox name=render value=1%s> "
		  "</td>"
		  "</tr>"

			  /*
		  "<tr class=poo>"
		  "<td>"
		  "Lookup outlinks' ruleset, ips, quality:"
		  "</td>"
		  "<td>"
		  "<input type=checkbox name=oips value=1%s> "
		  "</td>"
		  "<td>"
		  "To compute quality lookup IP addresses of roots "
		  "of outlinks."
		  "</td>"
		  "</tr>"

		  "<tr class=poo>"
		  "<td>"
		  "LinkInfo Coll:"
		  "</td>"
		  "<td>"
		  "<input type=text name=\"oli\" size=\"10\" value=\"\"> "
		  "</td>"
		  "<td>"
		  "Leave empty usually. Uses this coll to lookup link info."
		  "</td>"
		  "</tr>"
			  */

		  "<tr class=poo>"
		  "<td>"
		  "<b>optional query</b>"

			  "<br><font size=-2>"
			  "Leave empty usually. For title generation only."
			  "</font>"

		  "</td>"
		  "<td>"
		  "<input type=text name=\"q\" size=\"20\" value=\"\"> "
		  "</td>"
		       "</tr>",

			  TABLE_STYLE,
			  us ,
			   dd,
			   rr, 
			   render
			  );

	xbuf->safePrintf(
		  "<tr class=poo>"
			  "<td>"
			  "<b>content type below is</b>"
			  "<br><font size=-2>"
			  "Is the content below HTML? XML? JSON?"
			  "</font>"
			  "</td>"

		  "<td>"
		       //"<input type=checkbox name=xml value=1> "
		       "<select name=ctype>\n"
		       "<option value=%li selected>HTML</option>\n"
		       "<option value=%li selected>XML</option>\n"
		       "<option value=%li selected>JSON</option>\n"
		       "</select>\n"

		  "</td>"
		       "</tr>",
		       (long)CT_HTML,
		       (long)CT_XML,
		       (long)CT_JSON
			  );

	xbuf->safePrintf(

			  "<tr class=poo>"
			  "<td><b>content</b>"
			  "<br><font size=-2>"
			  "Use this content for the provided <i>url</i> "
			  "rather than downloading it from the web."
			  "</td>"

			  "<td>"
			  "<textarea rows=10 cols=80 name=content>"
			  "%s"
			  "</textarea>"
			  "</td>"
			  "</tr>"

		  "</table>"
		  "</center>"
		  "</form>"
		  "<br>",

			  //oips ,
			  contentParm );



	xbuf->safePrintf(
			 "<center>"
			 "<input type=submit value=Submit>"
			 "</center>"
			 );


	// just print the page if no url given
	if ( ! st->m_u || ! st->m_u[0] ) return processLoop ( st );


	XmlDoc *xd = &st->m_xd;
	// set this up
	SpiderRequest sreq;
	sreq.reset();
	strcpy(sreq.m_url,st->m_u);
	long firstIp = hash32n(st->m_u);
	if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
	// parentdocid of 0
	sreq.setKey( firstIp, 0LL, false );
	sreq.m_isPageParser = 1;
	sreq.m_hopCount = st->m_hopCount;
	sreq.m_hopCountValid = 1;
	sreq.m_fakeFirstIp   = 1;
	sreq.m_firstIp = firstIp;
	Url nu;
	nu.set(sreq.m_url);
	sreq.m_domHash32 = nu.getDomainHash32();
	sreq.m_siteHash32 = nu.getHostHash32();

	// . get provided content if any
	// . will be NULL if none provided
	// . "content" may contain a MIME
	long  contentLen = 0;
	char *content = r->getString ( "content" , &contentLen , NULL );
	// is the "content" url-encoded? default is true.
	bool contentIsEncoded = true;
	// mark doesn't like to url-encode his content
	if ( ! content ) {
		content    = r->getUnencodedContent    ();
		contentLen = r->getUnencodedContentLen ();
		contentIsEncoded = false;
	}
	// ensure null
	if ( contentLen == 0 ) content = NULL;

	uint8_t contentType = CT_HTML;
	if ( r->getBool("xml",0) ) contentType = CT_XML;

	contentType = r->getLong("ctype",contentType);//CT_HTML);


	// if facebook, load xml content from title rec...
	bool isFacebook = (bool)strstr(st->m_u,"http://www.facebook.com/");
	if ( isFacebook && ! content ) {
		long long docId = g_titledb.getProbableDocId(st->m_u);
		sprintf(sreq.m_url ,"%llu", docId );
		sreq.m_isPageReindex = true;
	}

	// hack
	if ( content ) {
		st->m_dbuf.purge();
		st->m_dbuf.safeStrcpy(content);
		//char *data = strstr(content,"\r\n\r\n");
		//long dataPos = 0;
		//if ( data ) dataPos = (data + 4) - content;
		//st->m_dbuf.convertJSONtoXML(0,dataPos);
		//st->m_dbuf.decodeJSON(0);
		content = st->m_dbuf.getBufStart();
	}

	// . use the enormous power of our new XmlDoc class
	// . this returns false if blocked
	if ( ! xd->set4 ( &sreq       ,
			  NULL        ,
			  st->m_coll  ,
			  &st->m_wbuf        ,
			  0 ,//PP_NICENESS ))
			  content ,
			  false, // deletefromindex
			  0, // forced ip
			  contentType ))
		// return error reply if g_errno is set
		return sendErrorReply ( st , g_errno );
	// make this our callback in case something blocks
	xd->setCallback ( st , processLoop );
	// . set xd from the old title rec if recycle is true
	// . can also use XmlDoc::m_loadFromOldTitleRec flag
	if ( st->m_recycle ) xd->m_recycleContent = true;

	return processLoop ( st );
}

Example #8

Show file

File: PageGet.cpp Project: firatkarakusoglu/open-source-search-engine

// returns false if blocked, true otherwise
bool processLoop ( void *state ) {
	// get it
	State2 *st = (State2 *)state;
	// get the tcp socket from the state
	TcpSocket *s = st->m_socket;
	// get it
	XmlDoc *xd = &st->m_xd;

	if ( ! xd->m_loaded ) {
		// setting just the docid. niceness is 0.
		//xd->set3 ( st->m_docId , st->m_coll , 0 );
		// callback
		xd->setCallback ( state , processLoop );
		// . and tell it to load from the old title rec
		// . this sets xd->m_oldTitleRec/m_oldTitleRecSize
		// . this sets xd->ptr_* and all other member vars from
		//   the old title rec if found in titledb.
		if ( ! xd->loadFromOldTitleRec ( ) ) return false;
	}

	if ( g_errno ) return sendErrorReply ( st , g_errno );
	// now force it to load old title rec
	//char **tr = xd->getTitleRec();
	SafeBuf *tr = xd->getTitleRecBuf();
	// blocked? return false if so. it will call processLoop() when it rets
	if ( tr == (void *)-1 ) return false;
	// we did not block. check for error? this will free "st" too.
	if ( ! tr ) return sendErrorReply ( st , g_errno );
	// if title rec was empty, that is a problem
	if ( xd->m_titleRecBuf.length() == 0 ) 
		return sendErrorReply ( st , ENOTFOUND);

	// set callback
	char *na = xd->getIsNoArchive();
	// wait if blocked
	if ( na == (void *)-1 ) return false;
	// error?
	if ( ! na ) return sendErrorReply ( st , g_errno );
	// forbidden? allow turkeys through though...
	if ( ! st->m_isAdmin && *na )
		return sendErrorReply ( st , ENOCACHE );

	SafeBuf *sb = &st->m_sb;


	// &page=4 will print rainbow sections
	if ( ! st->m_printed && st->m_r.getLong("page",0) ) {
		// do not repeat this call
		st->m_printed = true;
		// this will call us again since we called
		// xd->setCallback() above to us
		if ( ! xd->printDocForProCog ( sb , &st->m_r ) )
			return false;
	}

	char *contentType = "text/html";
	char format = st->m_format;
	if ( format == FORMAT_XML ) contentType = "text/xml";
	if ( format == FORMAT_JSON ) contentType = "application/json";

	// if we printed a special page (like rainbow sections) then return now
	if ( st->m_printed ) {
		bool status = g_httpServer.sendDynamicPage (s,
							    //buf,bufLen,
							    sb->getBufStart(),
							    sb->getLength(),
							    -1,false,
							    //"text/html",
							    contentType,
							    -1, NULL, "utf8" );
		// nuke state2
		mdelete ( st , sizeof(State2) , "PageGet1" );
		delete (st);
		return status;
	}

	/*
	  // this was calling XmlDoc and setting sections, etc. to
	  // get the SpiderReply junk... no no no
	// is it banned or filtered? this ignores the TagRec in the titleRec
	// and uses msg8a to get it fresh instead
	char *vi = xd->getIsFiltered();//Visible( );
	// wait if blocked
	if ( vi == (void *)-1 ) return false;
	// error?
	if ( ! vi ) return sendErrorReply ( st , g_errno );
	// banned?
	if ( ! st->m_isAdmin && ! *vi ) return sendErrorReply (st,EDOCBANNED);
	*/

	// get the utf8 content
	char **utf8 = xd->getUtf8Content();
	//long   len  = xd->size_utf8Content - 1;
	// wait if blocked???
	if ( utf8 == (void *)-1 ) return false;
	// strange
	if ( xd->size_utf8Content<=0) {
		log("pageget: utf8 content <= 0");
		return sendErrorReply(st,EBADENGINEER );
	}
	// alloc error?
	if ( ! utf8 ) return sendErrorReply ( st , g_errno );

	// get this host
	Host *h = g_hostdb.getHost ( g_hostdb.m_hostId );
	if ( ! h ) {
		log("pageget: hostid %li is bad",g_hostdb.m_hostId);
		return sendErrorReply(st,EBADENGINEER );
	}


	char *content    = xd->ptr_utf8Content;
	long  contentLen = xd->size_utf8Content - 1;

	// shortcut
	char strip = st->m_strip;

	// alloc buffer now
	//char *buf = NULL;
	//long  bufMaxSize = 0;
	//bufMaxSize = len + ( 32 * 1024 ) ;
	//bufMaxSize = contentLen + ( 32 * 1024 ) ;
	//buf        = (char *)mmalloc ( bufMaxSize , "PageGet2" );
	//char *p          = buf;
	//char *bufEnd     = buf + bufMaxSize;
	//if ( ! buf ) {
	//	return sendErrorReply ( st , g_errno );
	//}

	// for undoing the header
	//char *start1 = p;
	long startLen1 = sb->length();

	// we are always utfu
	if ( strip != 2 )
		sb->safePrintf( "<meta http-equiv=\"Content-Type\" "
			     "content=\"text/html;charset=utf8\">\n");

	// base href
	//Url *base = &xd->m_firstUrl;
	//if ( xd->ptr_redirUrl.m_url[0] )
	//	base = &xd->m_redirUrl;
	char *base = xd->ptr_firstUrl;
	if ( xd->ptr_redirUrl ) base = xd->ptr_redirUrl;
	//Url *redir = *xd->getRedirUrl();
	if ( strip != 2 ) {
		sb->safePrintf ( "<BASE HREF=\"%s\">" , base );
		//p += gbstrlen ( p );
	}

	// default colors in case css files missing
	if ( strip != 2 ) {
		sb->safePrintf( "\n<style type=\"text/css\">\n"
			  "body{background-color:white;color:black;}\n"
			  "</style>\n");
		//p += gbstrlen ( p );
	}

	//char format = st->m_format;
	if ( format == FORMAT_XML ) sb->reset();
	if ( format == FORMAT_JSON ) sb->reset();

	// for undoing the stuff below
	long startLen2 = sb->length();//p;

	// query should be NULL terminated
	char *q    = st->m_q;
	long  qlen = st->m_qlen;

	char styleTitle[128] =  "font-size:14px;font-weight:600;"
				"color:#000000;";
	char styleText[128]  =  "font-size:14px;font-weight:400;"
				"color:#000000;";
	char styleLink[128] =  "font-size:14px;font-weight:400;"
				"color:#0000ff;";
	char styleTell[128] =  "font-size:14px;font-weight:600;"
				"color:#cc0000;";

	// get the url of the title rec
	Url *f = xd->getFirstUrl();

	bool printDisclaimer = st->m_printDisclaimer;

	if ( xd->m_contentType == CT_JSON )
		printDisclaimer = false;

	if ( format == FORMAT_XML ) printDisclaimer = false;
	if ( format == FORMAT_JSON ) printDisclaimer = false;

	char tbuf[100];
	tbuf[0] = 0;
	time_t lastSpiderDate = xd->m_spideredTime;

	if ( printDisclaimer ||
	     format == FORMAT_XML ||
	     format == FORMAT_JSON ) {
		struct tm *timeStruct = gmtime ( &lastSpiderDate );
		strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct);
	}

	// We should always be displaying this disclaimer.
	// - May eventually want to display this at a different location
	//   on the page, or on the click 'n' scroll browser page itself
	//   when this page is not being viewed solo.
	// CNS: if ( ! st->m_clickNScroll ) {
	if ( printDisclaimer ) {

		sb->safePrintf(//sprintf ( p , 
			  //"<BASE HREF=\"%s\">"
			  //"<table border=1 width=100%%>"
			  //"<tr><td>"
			  "<table border=\"1\" bgcolor=\"#"
			  BGCOLOR
			  "\" cellpadding=\"10\" "
			  //"id=\"gbcnsdisctable\" class=\"gbcnsdisctable_v\""
			  "cellspacing=\"0\" width=\"100%%\" color=\"#ffffff\">"
			  "<tr"
			  //" id=\"gbcnsdisctr\" class=\"gbcnsdisctr_v\""
			  "><td>"
			  //"<font face=times,sans-serif color=black size=-1>"
			  "<span style=\"%s\">"
			  "This is Gigablast's cached page of </span>"
			  "<a href=\"%s\" style=\"%s\">%s</a>"
			  "" , styleTitle, f->getUrl(), styleLink,
			  f->getUrl() );
		//p += gbstrlen ( p );
		// then the rest
		//sprintf(p , 
		sb->safePrintf(
			"<span style=\"%s\">. "
			"Gigablast is not responsible for the content of "
			"this page.</span>", styleTitle );
		//p += gbstrlen ( p );

		sb->safePrintf ( "<br/><span style=\"%s\">"
			  "Cached: </span>"
			  "<span style=\"%s\">",
			  styleTitle, styleText );
		//p += gbstrlen ( p );

		// then the spider date in GMT
		// time_t lastSpiderDate = xd->m_spideredTime;
		// struct tm *timeStruct = gmtime ( &lastSpiderDate );
		// char tbuf[100];
		// strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct);
		//p += gbstrlen ( p );
		sb->safeStrcpy(tbuf);

		// Moved over from PageResults.cpp
		sb->safePrintf( "</span> - <a href=\""
			      "/get?"
			      "q=%s&amp;c=%s&amp;rtq=%li&amp;"
			      "d=%lli&amp;strip=1\""
			      " style=\"%s\">"
			      "[stripped]</a>", 
			      q , st->m_coll , 
			      (long)st->m_rtq,
			      st->m_docId, styleLink ); 

		// a link to alexa
		if ( f->getUrlLen() > 5 ) {
			sb->safePrintf( " - <a href=\"http:"
					 "//web.archive.org/web/*/%s\""
					 " style=\"%s\">"
					 "[older copies]</a>" ,
					 f->getUrl(), styleLink );
		}

		if (st->m_noArchive){
			sb->safePrintf( " - <span style=\"%s\"><b>"
				     "[NOARCHIVE]</b></span>",
				     styleTell );
		}
		if (st->m_isBanned){
			sb->safePrintf(" - <span style=\"%s\"><b>"
				     "[BANNED]</b></span>",
				     styleTell );
		}

		// only print this if we got a query
		if ( qlen > 0 ) {
			sb->safePrintf("<br/><br/><span style=\"%s\"> "
				   "These search terms have been "
				   "highlighted:  ",
				   styleText );
			//p += gbstrlen ( p );
		}
		
	}

	// how much space left in p?
	//long avail = bufEnd - p;
	// . make the url that we're outputting for (like in PageResults.cpp)
	// . "thisUrl" is the baseUrl for click & scroll
	char thisUrl[MAX_URL_LEN];
	char *thisUrlEnd = thisUrl + MAX_URL_LEN;
	char *x = thisUrl;
	// . use the external ip of our gateway
	// . construct the NAT mapped port
	// . you should have used iptables to map port to the correct
	//   internal ip:port
	//unsigned long  ip   =g_conf.m_mainExternalIp  ; // h->m_externalIp;
	//unsigned short port=g_conf.m_mainExternalPort;//h->m_externalHttpPort
	// local check
	//if ( st->m_isLocal ) {
	unsigned long  ip   = h->m_ip;
	unsigned short port = h->m_httpPort;
	//}
	//sprintf ( x , "http://%s:%li/get?q=" , iptoa ( ip ) , port );
	// . we no longer put the port in here
	// . but still need http:// since we use <base href=>
	if (port == 80) sprintf(x,"http://%s/get?q=",iptoa(ip));
	else            sprintf(x,"http://%s:%hu/get?q=",iptoa(ip),port);
	x += gbstrlen ( x );
	// the query url encoded
	long elen = urlEncode ( x , thisUrlEnd - x , q , qlen );
	x += elen;
	// separate cgi vars with a &
	//sprintf ( x, "&seq=%li&rtq=%lid=%lli",
	//	  (long)st->m_seq,(long)st->m_rtq,st->m_msg22.getDocId());
	sprintf ( x, "&d=%lli",st->m_docId );
	x += gbstrlen(x);		
	// set our query for highlighting
	Query qq;
	qq.set2 ( q, st->m_langId , true );

	// print the query terms into our highlight buffer
	Highlight hi;
	// make words so we can set the scores to ignore fielded terms
	Words qw;
	qw.set ( q            ,  // content being highlighted, utf8
		 qlen         ,  // content being highlighted, utf8
		 TITLEREC_CURRENT_VERSION,
		 true         ,  // computeIds
		 false        ); // hasHtmlEntities?
	// . assign scores of 0 to query words that should be ignored
	// . TRICKY: loop over words in qq.m_qwords, but they should be 1-1
	//   with words in qw.
	// . sanity check
	//if ( qw.getNumWords() != qq.m_numWords ) { char *xx = NULL; *xx = 0;}
	// declare up here
	Matches m;
	// do the loop
	//Scores ss;
	//ss.set ( &qw , NULL );
	//for ( long i = 0 ; i < qq.m_numWords ; i++ )
	//	if ( ! m.matchWord ( &qq.m_qwords[i],i ) ) ss.m_scores[i] = 0;
	// now set m.m_matches[] to those words in qw that match a query word
	// or phrase in qq.
	m.setQuery ( &qq );
	//m.addMatches ( &qw , &ss , true );
	m.addMatches ( &qw );
	long hilen = 0;

	// CNS: if ( ! st->m_clickNScroll ) {
	// and highlight the matches
	if ( printDisclaimer ) {
		hilen = hi.set ( //p       ,
				 //avail   ,
				sb ,
				 &qw     , // words to highlight
				 &m      , // matches relative to qw
				 false   , // doSteming
				 false   , // st->m_clickAndScroll , 
				 (char *)thisUrl );// base url for ClcknScrll
		//p += hilen;
		// now an hr
		//memcpy ( p , "</span></table></table>\n" , 24 );   p += 24;
		sb->safeStrcpy("</span></table></table>\n");
	}


	bool includeHeader = st->m_includeHeader;

	// do not show header for json object display
	if ( xd->m_contentType == CT_JSON )
		includeHeader = false;

	if ( format == FORMAT_XML ) includeHeader = false;
	if ( format == FORMAT_JSON ) includeHeader = false;

	//mfree(uq, uqCapacity, "PageGet");
	// undo the header writes if we should
	if ( ! includeHeader ) {
		// including base href is off by default when not including
		// the header, so the caller must explicitly turn it back on
		if ( st->m_includeBaseHref ) sb->m_length=startLen2;//p=start2;
		else                         sb->m_length=startLen1;//p=start1;
	}

	//sb->safeStrcpy(tbuf);



	if ( format == FORMAT_XML ) {
		sb->safePrintf("<response>\n");
		sb->safePrintf("<statusCode>0</statusCode>\n");
		sb->safePrintf("<statusMsg>Success</statusMsg>\n");
		sb->safePrintf("<url><![CDATA[");
		sb->cdataEncode(xd->m_firstUrl.m_url);
		sb->safePrintf("]]></url>\n");
		sb->safePrintf("<docId>%llu</docId>\n",xd->m_docId);
		sb->safePrintf("\t<cachedTimeUTC>%lu</cachedTimeUTC>\n",
			      lastSpiderDate);
		sb->safePrintf("\t<cachedTimeStr>%s</cachedTimeStr>\n",tbuf);
	}

	if ( format == FORMAT_JSON ) {
		sb->safePrintf("{\"response\":{\n");
		sb->safePrintf("\t\"statusCode\":0,\n");
		sb->safePrintf("\t\"statusMsg\":\"Success\",\n");
		sb->safePrintf("\t\"url\":\"");
		sb->jsonEncode(xd->m_firstUrl.m_url);
		sb->safePrintf("\",\n");
		sb->safePrintf("\t\"docId\":%llu,\n",xd->m_docId);
		sb->safePrintf("\t\"cachedTimeUTC\":%lu,\n",lastSpiderDate);
		sb->safePrintf("\t\"cachedTimeStr\":\"%s\",\n",tbuf);
	}

	// identify start of <title> tag we wrote out
	char *sbstart = sb->getBufStart();
	char *sbend   = sb->getBufEnd();
	char *titleStart = NULL;
	char *titleEnd   = NULL;
	for ( char *t = sbstart ; t < sbend ; t++ ) {
		// title tag?
		if ( t[0]!='<' ) continue;
		if ( to_lower_a(t[1])!='t' ) continue;
		if ( to_lower_a(t[2])!='i' ) continue;
		if ( to_lower_a(t[3])!='t' ) continue;
		if ( to_lower_a(t[4])!='l' ) continue;
		if ( to_lower_a(t[5])!='e' ) continue;
		// point to it
		char *x = t + 5;
		// max - to keep things fast
		char *max = x + 500;
		for ( ; *x && *x != '>' && x < max ; x++ );
		x++;
		// find end
		char *e = x;
		for ( ; *e && e < max ; e++ ) {
			if ( e[0]=='<' &&
			     to_lower_a(e[1])=='/' &&
			     to_lower_a(e[2])=='t' &&
			     to_lower_a(e[3])=='i' &&
			     to_lower_a(e[4])=='t' &&
			     to_lower_a(e[5])=='l' &&
			     to_lower_a(e[6])=='e' )
				break;
		}
		if ( e < max ) {
			titleStart = x;
			titleEnd   = e;
		}
		break;
	}

	// . print title at top!
	// . consider moving
	if ( titleStart ) {

		char *ebuf = st->m_r.getString("eb");
		if ( ! ebuf ) ebuf = "";

		//p += sprintf ( p , 
		sb->safePrintf(
			       "<table border=1 "
			       "cellpadding=10 "
			       "cellspacing=0 "
			       "width=100%% "
			       "color=#ffffff>" );

		long printLinks = st->m_r.getLong("links",0);

		if ( ! printDisclaimer && printLinks )
			sb->safePrintf(//p += sprintf ( p , 
				       // first put cached and live link
				       "<tr>"
				       "<td bgcolor=lightyellow>"
				       // print cached link
				       //"<center>"
				       "&nbsp; "
				       "<b>"
				       "<a "
				       "style=\"font-size:18px;font-weight:600;"
				       "color:#000000;\" "
				       "href=\""
				       "/get?"
				       "c=%s&d=%lli&qh=0&cnsp=1&eb=%s\">"
				       "cached link</a>"
				       " &nbsp; "
				       "<a "
				       "style=\"font-size:18px;font-weight:600;"
				       "color:#000000;\" "
				       "href=%s>live link</a>"
				       "</b>"
				       //"</center>"
				       "</td>"
				       "</tr>\n"
				       ,st->m_coll
				       ,st->m_docId 
				       ,ebuf
				       ,thisUrl // st->ptr_ubuf
				       );

		if ( printLinks ) {
			sb->safePrintf(//p += sprintf ( p ,
				       "<tr><td bgcolor=pink>"
				       "<span style=\"font-size:18px;"
				       "font-weight:600;"
				       "color:#000000;\">"
				       "&nbsp; "
				       "<b>PAGE TITLE:</b> "
				       );
			long tlen = titleEnd - titleStart;
			sb->safeMemcpy ( titleStart , tlen );
			sb->safePrintf ( "</span></td></tr>" );
		}

		sb->safePrintf( "</table><br>\n" );

	}

	// is the content preformatted?
	bool pre = false;
	char ctype = (char)xd->m_contentType;
	if ( ctype == CT_TEXT ) pre = true ; // text/plain
	if ( ctype == CT_DOC  ) pre = true ; // filtered msword
	if ( ctype == CT_PS   ) pre = true ; // filtered postscript

	if ( format == FORMAT_XML ) pre = false;
	if ( format == FORMAT_JSON ) pre = false;

	// if it is content-type text, add a <pre>
	if ( pre ) {//p + 5 < bufEnd && pre ) {
		sb->safePrintf("<pre>");
		//p += 5;
	}

	if ( st->m_strip == 1 )
		contentLen = stripHtml( content, contentLen, 
					(long)xd->m_version, st->m_strip );
	// it returns -1 and sets g_errno on error, line OOM
	if ( contentLen == -1 ) {
		//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );	
		return sendErrorReply ( st , g_errno );
	}

	Xml xml;
	Words ww;

	// if no highlighting, skip it
	bool queryHighlighting = st->m_queryHighlighting;
	if ( st->m_strip == 2 ) queryHighlighting = false;

	// do not do term highlighting if json
	if ( xd->m_contentType == CT_JSON )
		queryHighlighting = false;

	SafeBuf tmp;
	SafeBuf *xb = sb;
	if ( format == FORMAT_XML ) xb = &tmp;
	if ( format == FORMAT_JSON ) xb = &tmp;
	

	if ( ! queryHighlighting ) {
		xb->safeMemcpy ( content , contentLen );
		//p += contentLen ;
	}
	else {
		// get the content as xhtml (should be NULL terminated)
		//Words *ww = xd->getWords();
		if ( ! xml.set ( content , contentLen , false ,
				 0 , false , TITLEREC_CURRENT_VERSION ,
				 false , 0 , CT_HTML ) ) { // niceness is 0
			//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
			return sendErrorReply ( st , g_errno );
		}			
		if ( ! ww.set ( &xml , true , 0 ) ) { // niceness is 0
			//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
			return sendErrorReply ( st , g_errno );
		}
		// sanity check
		//if ( ! xd->m_wordsValid ) { char *xx=NULL;*xx=0; }
		// how much space left in p?
		//avail = bufEnd - p;

		Matches m;
		m.setQuery ( &qq );
		m.addMatches ( &ww );
		hilen = hi.set ( xb , // p , avail , 
				 &ww , &m ,
				 false /*doStemming?*/ ,  
				 st->m_clickAndScroll , 
				 thisUrl /*base url for click & scroll*/);
		//p += hilen;
		log(LOG_DEBUG, "query: Done highlighting cached page content");
	}


	if ( format == FORMAT_XML ) {
		sb->safePrintf("\t<content><![CDATA[");
		sb->cdataEncode ( xb->getBufStart() );
		sb->safePrintf("]]></content>\n");
		sb->safePrintf("</response>\n");
	}

	if ( format == FORMAT_JSON ) {
		sb->safePrintf("\t\"content\":\"\n");
		sb->jsonEncode ( xb->getBufStart() );
		sb->safePrintf("\"\n}\n}\n");
	}


	// if it is content-type text, add a </pre>
	if ( pre ) { // p + 6 < bufEnd && pre ) {
		sb->safeMemcpy ( "</pre>" , 6 );
		//p += 6;
	}

	// calculate bufLen
	//long bufLen = p - buf;

	long ct = xd->m_contentType;

	// now filter the entire buffer to escape out the xml tags
	// so it is displayed nice
	SafeBuf newbuf;

	if ( ct == CT_XML ) {
		// encode the xml tags into &lt;tagname&gt; sequences
		if ( !newbuf.htmlEncodeXmlTags ( sb->getBufStart() ,
						 sb->getLength(),
						 0)){// niceness=0
			//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );
			return sendErrorReply ( st , g_errno );
		}
		// free out buffer that we alloc'd before returning since this
		// should have copied it into another buffer
		//if ( buf ) mfree ( buf , bufMaxSize , "PageGet2" );	
		// reassign
		//buf    = newbuf.getBufStart();
		//bufLen = newbuf.length();
		sb->stealBuf ( &newbuf );
	}

	// now encapsulate it in html head/tail and send it off
	// sendErr:
	contentType = "text/html";
	if ( strip == 2 ) contentType = "text/xml";
	// xml is usually buggy and this throws browser off
	//if ( ctype == CT_XML ) contentType = "text/xml";

	if ( xd->m_contentType == CT_JSON )
		contentType = "application/json";

	if ( format == FORMAT_XML ) contentType = "text/xml";
	if ( format == FORMAT_JSON ) contentType = "application/json";

	// safebuf, sb, is a member of "st" so this should copy the buffer
	// when it constructs the http reply, and we gotta call delete(st)
	// AFTER this so sb is still valid.
	bool status = g_httpServer.sendDynamicPage (s,
						    //buf,bufLen,
						    sb->getBufStart(),
						    sb->getLength(),
						    -1,false,
						    contentType,
						     -1, NULL, "utf8" );

	// nuke state2
	mdelete ( st , sizeof(State2) , "PageGet1" );
	delete (st);


	// free out buffer that we alloc'd before returning since this
	// should have copied it into another buffer

	//if      ( ct == CT_XML ) newbuf.purge();
	//else if ( buf          ) mfree ( buf , bufMaxSize , "PageGet2" );
	
	// and convey the status
	return status;
}

Example #9

Show file

File: XmlNode.cpp Project: Doken-Tokuyama/open-source-search-engine

// Return the value of the specified "field" within this node.
// the case of "field" does not matter.
char *XmlNode::getFieldValue ( char *field , int32_t *valueLen ) {
	// reset this to 0
	*valueLen = 0;
	// scan for the field name in our node
	int32_t flen = gbstrlen(field);
	char inQuotes = '\0';
	int32_t i;

	// scan the characters in the node, looking for the field name in ascii
	for ( i = 1; i + flen < m_nodeLen ; i++ ) {
		// skip the field if it's quoted
		if ( inQuotes) {
			if (m_node[i] == inQuotes ) inQuotes = 0;
			continue;
		}
		// set inQuotes to the quote if we're in quotes
		if ( (m_node[i]=='\"' || m_node[i]=='\'')){ 
			inQuotes = m_node[i];
			continue;
		} 
		// a field name must be preceeded by non-alnum
		if ( is_alnum_a ( m_node[i-1] ) ) continue;
		// the first character of this field shout match field[0]
		if ( to_lower_a (m_node[i]) != to_lower_a(field[0] )) continue;
		// field just be immediately followed by an = or space
		if (m_node[i+flen]!='='&&!is_wspace_a(m_node[i+flen]))continue;
		// field names must match
		if ( strncasecmp ( &m_node[i], field, flen ) != 0 ) continue;
		// break cuz we got a match for our field name
		break;
	}


	// return NULL if no matching field
	if ( i + flen >= m_nodeLen ) return NULL;

	// advance i over the fieldname so it pts to = or space
	i += flen;

	// advance i over spaces
	while ( i < m_nodeLen && is_wspace_a ( m_node[i] ) ) i++;

	// advance over the equal sign, return NULL if does not exist
	if ( i < m_nodeLen && m_node[i++] != '=' ) return NULL;

	// advance i over spaces after the equal sign
	while ( i < m_nodeLen && is_wspace_a ( m_node[i] ) ) i++;
	
	// now parse out the value of this field (could be in quotes)
	inQuotes = '\0';

	// set inQuotes to the quote if we're in quotes
	if ( m_node[i]=='\"' || m_node[i]=='\'') inQuotes = m_node[i++]; 

	// mark this as the start of the value
	int start=i;

	// advance i until we hit a space, or we hit a that quote if inQuotes
	if (inQuotes) {
		while (i<m_nodeLen && m_node[i] != inQuotes ) 
			i++;
	}
	else {
		while ( i<m_nodeLen &&
			!is_wspace_a(m_node[i])&&
			m_node[i]!='>')
			i++;
	}

	// set the length of the value
	*valueLen = i - start;

	// return a ptr to the value
	return m_node + start;
}

Example #10

Show file

File: Words.cpp Project: BillWangCS/open-source-search-engine

// . return the value of the specified "field" within this html tag, "s"
// . the case of "field" does not matter
char *getFieldValue ( char *s , 
		      long  slen ,
		      char *field , 
		      long *valueLen ) {
	// reset this to 0
	*valueLen = 0;
	// scan for the field name in our node
	long flen = gbstrlen(field);
	char inQuotes = '\0';
	long i;

	// make it sane
	if ( slen > 2000 ) slen = 2000;

	for ( i = 1; i + flen < slen ; i++ ) {
		// skip the field if it's quoted
		if ( inQuotes) {
			if (s[i] == inQuotes ) inQuotes = 0;
			continue;
		}
		// set inQuotes to the quote if we're in quotes
		if ( (s[i]=='\"' || s[i]=='\'')){ 
			inQuotes = s[i];
			continue;
		} 
		// if not in quote tag might end
		if ( s[i] == '>' && ! inQuotes ) return NULL;
		// a field name must be preceeded by non-alnum
		if ( is_alnum_a ( s[i-1] ) ) continue;
		// the first character of this field shout match field[0]
		if ( to_lower_a (s[i]) != to_lower_a(field[0] )) continue;
		// field just be immediately followed by an = or space
		if (s[i+flen]!='='&&!is_wspace_a(s[i+flen]))continue;
		// field names must match
		if ( strncasecmp ( &s[i], field, flen ) != 0 ) continue;
		// break cuz we got a match for our field name
		break;
	}
	
	
	// return NULL if no matching field
	if ( i + flen >= slen ) return NULL;

	// advance i over the fieldname so it pts to = or space
	i += flen;

	// advance i over spaces
	while ( i < slen && is_wspace_a ( s[i] ) ) i++;

	// advance over the equal sign, return NULL if does not exist
	if ( i < slen && s[i++] != '=' ) return NULL;

	// advance i over spaces after the equal sign
	while ( i < slen && is_wspace_a ( s[i] ) ) i++;
	
	// now parse out the value of this field (could be in quotes)
	inQuotes = '\0';

	// set inQuotes to the quote if we're in quotes
	if ( s[i]=='\"' || s[i]=='\'') inQuotes = s[i++]; 

	// mark this as the start of the value
	int start=i;

	// advance i until we hit a space, or we hit a that quote if inQuotes
	if (inQuotes) while (i<slen && s[i] != inQuotes ) i++;
	else while ( i<slen &&!is_wspace_a(s[i])&&s[i]!='>')i++;

	// set the length of the value
	*valueLen = i - start;

	// return a ptr to the value
	return s + start;
}

Example #11

Show file

File: Matches.cpp Project: privacore/open-source-search-engine

// . TODO: support stemming later. each word should then have multiple ids.
// . add to our m_matches[] array iff addToMatches is true, otherwise we just
//   set the m_foundTermVector for doing the BIG HACK described in Summary.cpp
bool Matches::addMatches(Words *words, Phrases *phrases, Sections *sections, Bits *bits, Pos *pos, mf_t flags ) {
	// if no query term, bail.
	if ( m_numSlots <= 0 ) {
		return true;
	}

	// . do not breach
	// . happens a lot with a lot of link info text
	if ( m_numMatchGroups >= MAX_MATCHGROUPS ) {
		return true;
	}

	Section *sp = NULL;
	if ( sections ) {
		sp = sections->m_sections;
	}

	mf_t eflag = 0;

	m_numMatchGroups++;

	const int64_t *pids = NULL;
	if ( phrases ) {
		pids = phrases->getPhraseIds2();
	}

	// set convenience vars
	uint32_t mask = m_numSlots - 1;
	const int64_t *wids = words->getWordIds();
	const int32_t *wlens = words->getWordLens();
	const char * const *wptrs = words->getWordPtrs();
	nodeid_t *tids = words->getTagIds();
	int32_t nw = words->getNumWords();
	int32_t n;
	int32_t matchStack = 0;
	int64_t nextMatchWordIdMustBeThis = 0;
	int32_t nextMatchWordPos = 0;
	int32_t lasti = -3;

	if ( getNumXmlNodes() > 512 ) { g_process.shutdownAbort(true); }

	int32_t badFlags =SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_IN_TITLE;

	int32_t qwn;
	int32_t numQWords;
	int32_t numWords;

	//
	// . set m_matches[] array
	// . loop over all words in the document
	//
	for ( int32_t i = 0 ; i < nw ; i++ ) {
		//if      (tids && (tids[i]            ) == TAG_A) 
		//	inAnchTag = true;
		//else if (tids && (tids[i]&BACKBITCOMP) == TAG_A) 
		//	inAnchTag = false;

		if ( tids && tids[i] ){
			// tagIds don't have wids and are skipped
			continue;
		}

		// skip if wid is 0, it is not an alnum word then
		if ( ! wids[i] ) {
			continue;
		}

		// count the number of alnum words
		m_numAlnums++;

		// clear this
		eflag = 0;

		// NO NO, a score of -1 means in a select tag, and
		// we do index that!! so only skip if wscores is 0 now.
		// -1 means in script, style, select or marquee. it is
		// indexed but with very little weight... this is really
		// a hack in Scores.cpp and should be fixed.
		// in Scores.cpp we set even the select tag stuff to -1...
		//if ( wscores && wscores[i] == -1 ) continue;
		if ( sp && (sp->m_flags & badFlags) ) continue;


		// . does it match a query term?
		// . hash to the slot in the hash table
		n = ((uint32_t)wids[i]) & mask;
		//n2 = swids[i]?((uint32_t)swids[i]) & mask:n;
	chain1:
		// skip if slot is empty (doesn't match query term)
		//if ( ! m_qtableIds[n] && ! m_qtableIds[n2]) continue;
		if ( ! m_qtableIds[n] ) goto tryPhrase;
		// otherwise chain
		if ( (m_qtableIds[n] != wids[i]) ) {
			if ( m_qtableIds[n] && ++n >= m_numSlots ) n = 0;
			goto chain1;
		}
		// we got one!
		goto gotMatch;


		//
		// fix so we hihglight "woman's" when query term is "woman"
		// for 'spiritual books for women' query
		// 
	tryPhrase:
		// try without 's if it had it
		if ( wlens[i] >= 3 && 
		     wptrs[i][wlens[i]-2] == '\'' &&
		     to_lower_a(wptrs[i][wlens[i]-1]) == 's' ) {
			// move 's from word hash... very tricky
			int64_t nwid = wids[i];
			// undo hash64Lower_utf8 in hash.h
			nwid ^= g_hashtab[wlens[i]-1][(uint8_t)'s'];
			nwid ^= g_hashtab[wlens[i]-2][(uint8_t)'\''];
			n = ((uint32_t)nwid) & mask;
		chain2:
			if ( ! m_qtableIds[n] ) goto tryPhrase2;
			if ( (m_qtableIds[n] != nwid) ) {
				if ( m_qtableIds[n] && ++n >= m_numSlots ) n=0;
				goto chain2;
			}
			qwn = m_qtableWordNums[n];
			numWords = 1;
			numQWords = 1;
			// we got one!
			goto gotMatch2;
		}

	tryPhrase2:
		// try phrase first
		if ( pids && pids[i] ) {
			n = ((uint32_t)pids[i]) & mask;
		chain3:
			if ( ! m_qtableIds[n] ) continue;
			if ( (m_qtableIds[n] != pids[i]) ) {
				if ( m_qtableIds[n] && ++n >= m_numSlots)n = 0;
				goto chain3;
			}
			// what query word # do we match?
			qwn = m_qtableWordNums[n];
			// get that query word #
			QueryWord *qw = &m_q->m_qwords[qwn];
			// . do we match it as a single word?
			// . did they search for "bluetribe" ...?
			if ( qw->m_rawWordId == pids[i] ) {
				// set our # of words basically to 3
				numWords = 3;
				// matching a single query word
				numQWords = 1;
				// got a match
				goto gotMatch2;
			}
			if ( qw->m_phraseId == pids[i] ) {
				// might match more if we had more query
				// terms in the quote
				numWords = getNumWordsInMatch( words, i, n, &numQWords, &qwn, true );

				// this is 0 if we were an unmatched quote
				if ( numWords <= 0 ) continue;

				// got a match
				goto gotMatch2;
			}
			// otherwise we are matching a query phrase id
			log("matches: wtf? query word not matched for "
			    "highlighting... strange.");
			// assume one word for now
			numWords = 1;
			numQWords = 1;
			goto gotMatch2;
		}

		//
		// shucks, no match
		//
		continue;

	gotMatch:
		// what query word # do we match?
		qwn = m_qtableWordNums[n];
		

		// . how many words are in this match?
		// . it may match a single word or a phrase or both
		// . this will be 1 for just matching a single word, and 
		//   multiple words for quotes/phrases. The number of words
		//   in both cases will included unmatched punctuation words
		//   and tags in between matching words.
		numQWords = 0;
		numWords = getNumWordsInMatch( words, i, n, &numQWords, &qwn, true );
		// this is 0 if we were an unmatched quote
		if ( numWords <= 0 ) continue;

	gotMatch2:
		// get query word
		QueryWord *qw = &m_q->m_qwords[qwn];
		// point to next word in the query
		QueryWord *nq = NULL;
		if ( qwn+2 < m_q->m_numWords ) nq = &m_q->m_qwords[qwn+2];

		// . if only one word matches and its a stop word, make sure
		//   it's next to the correct words in the query
		// . if phraseId is 0, that means we do not start a phrase,
		//   because stop words can start phrases if they are the
		//   first word, are capitalized, or have breaking punct before
		//   them.
		if ( numWords == 1 && 
		     ! qw->m_inQuotes &&
		     m_q->m_numWords > 2 &&
		     qw->m_wordSign == '\0' &&
		     (nq && nq->m_wordId) && // no field names can follow
		     //(qw->m_isQueryStopWord || qw->m_isStopWord ) ) {
		     // we no longer consider single alnum chars to be
		     // query stop words as stated in StopWords.cpp to fix
		     // the query 'j. w. eagan'
		     qw->m_isQueryStopWord ) {
			// if stop word does not start a phrase in the query 
			// then he must have a matched word before him in the
			// document. if he doesn't then do not count as a match
			if ( qw->m_phraseId == 0LL && i-2 != lasti ) {
				// peel off anybody before us
				m_numMatches -= matchStack;
				if ( m_numMatches < 0 ) m_numMatches = 0;
				// don't forget to reset the match stack
				matchStack = 0;	

				continue; 
			}
			// if we already have a match stack, we must
			// be in nextMatchWordPos
			if ( matchStack && nextMatchWordPos != i ) {
				// peel off anybody before us 
				m_numMatches -= matchStack;
				if ( m_numMatches < 0 ) m_numMatches = 0;
				// don't forget to reset the match stack
				matchStack = 0;	
				//continue; 
			}
			// if the phraseId is 0 and the previous word
			// is a match, then we're ok, but put us on a stack
			// so if we lose a match, we'll be erased
			QueryWord *nq = &m_q->m_qwords[qwn+2];
			// next match is only required if next word in query
			// is indeed valid.
			if ( nq->m_wordId && nq->m_fieldCode == 0 ) {
				nextMatchWordIdMustBeThis = nq->m_rawWordId;
				nextMatchWordPos          = i + 2;
				matchStack++;
			}
		}
		else if ( matchStack ) {
			// if the last word matched was a stop word, we have to
			// match otherwise we have to remove the whole stack.
			if ( qw->m_rawWordId != nextMatchWordIdMustBeThis ||
			     i                > nextMatchWordPos ) {
				m_numMatches -= matchStack;
				// ensure we never go negative like for 
				// www.experian.com query
				if ( m_numMatches < 0 ) m_numMatches = 0;
			}
			// always reset this here if we're not a stop word
			matchStack = 0;
		}

		// record word # of last match
		lasti = i;

		// otherwise, store it in our m_matches[] array
		Match *m = &m_matches[m_numMatches];

		// the word # in the doc, and how many of 'em are in the match
		m->m_wordNum  = i;
		m->m_numWords = numWords;

		// the word # in the query, and how many of 'em we match
		m->m_qwordNum  = qwn;
		m->m_numQWords = numQWords;

		// get the first query word # of this match
		qw = &m_q->m_qwords[qwn];

		// convenience, used by Summary.cpp
		m->m_words    = words;
		m->m_sections = sections;
		m->m_bits     = bits;
		m->m_pos      = pos;
		m->m_flags    = flags | eflag ;

		// add to our vector. we want to know where each QueryWord
		// is. i.e. in the title, link text, meta tag, etc. so
		// the proximity algo in Summary.cpp can use that info.
		m_qwordFlags[qwn] |= flags;

		// advance
		m_numMatches++;

		// we get atleast MAX_MATCHES
		if ( m_numMatches < MAX_MATCHES ) {
			continue;
		}

		break;
	}

	// peel off anybody before us
	m_numMatches -= matchStack;
	if ( m_numMatches < 0 ) m_numMatches = 0;

	return true;
}

Example #12

Show file

File: matches2.cpp Project: DeadNumbers/open-source-search-engine

char *getMatches2 ( Needle *needles          , 
		    int32_t    numNeedles       ,
		    char   *haystack         , 
		    int32_t    haystackSize     ,
		    char   *linkPos          ,
		    int32_t   *needleNum        ,
		    bool    stopAtFirstMatch ,
		    bool   *hadPreMatch      ,
		    bool    saveQuickTables  ,
		    int32_t    niceness         ) {

	// assume not
	if ( hadPreMatch ) *hadPreMatch = false;
	// empty haystack? then no matches
	if ( ! haystack || haystackSize <= 0 ) return NULL;
	// JAB: no needles? then no matches
	if ( ! needles  || numNeedles   <= 0 ) return NULL;

	//char tmp[8192];
	//char *t    = tmp;
	//char *tend = tmp + 8192;

	// reset counts to 0
	//if ( ! stopAtFirstMatch )
	//	for ( int32_t i=0 ; i < numNeedles ; i++ ) 
	//		needles[i].m_count = 0;

	// are we responsible for init'ing string lengths? this is much
	// faster than having to specify lengths manually.
	for ( int32_t i=0 ; i < numNeedles; i++ ) {
		// breathe
		QUICKPOLL(niceness);
		// clear
		needles[i].m_count      = 0;
		needles[i].m_firstMatch = NULL;
		// set the string size in bytes if not provided
		if ( needles[i].m_stringSize == 0 )
			needles[i].m_stringSize = gbstrlen(needles[i].m_string);
	}

	// . set up the quick tables.
	// . utf16 is not as effective here because half the bytes are zeroes!
	// . TODO: use a static cache of like 4 of these tables where the key
	//         is the Needles ptr ... done
	int32_t numNeedlesToInit = numNeedles;
	char space[256 * 4 * sizeof(BITVEC)];
	char *buf = NULL;

	BITVEC *s0;
	BITVEC *s1;
	BITVEC *s2;
	BITVEC *s3;

	/*
	static bool s_quickTableInit = false;
	static char s_qtbuf[128*(12+1)*2];

	int32_t slot = -1;
	if(saveQuickTables) {
		if ( ! s_quickTableInit ) {
			s_quickTableInit = true;
			s_quickTables.set(8,4,128,s_qtbuf,256*13,false,0,"qx");
		}
		uint64_t key = (uint32_t)needles;
		slot = s_quickTables.getSlot(&key);
		if ( slot >= 0 ) {
			buf = s_quickTables.getValueFromSlot(slot);
			numNeedlesToInit = 0;
		}
	}
	*/

	if(!buf) {
		buf = space;
		memset ( buf , 0 , sizeof(BITVEC)*256*4);
	}

	/*
	if( useQuickTables && slot == -1 ) {
		//buf = (char*)mcalloc(sizeof(uint32_t)*256*5,
		//		     "matches");
		if(buf) s_quickTables.addKey(&key, &buf);
		//sanity check, no reason why there needs to be a 
		//limit, I just don't expect there to be this many
		//static needles at this point.
		if(s_quickTables.getNumSlotsUsed() > 32){
			char *xx=NULL; *xx = 0;
		}
	}
	*/

	// try 64 bit bit vectors now since we doubled # of needles
	int32_t offset = 0;
	s0 = (BITVEC *)(buf + offset);
	offset += sizeof(BITVEC)*256;
	s1 = (BITVEC *)(buf + offset);
	offset += sizeof(BITVEC)*256;
	s2 = (BITVEC *)(buf + offset);
	offset += sizeof(BITVEC)*256;
	s3 = (BITVEC *)(buf + offset);
	offset += sizeof(BITVEC)*256;

	BITVEC mask;

	// set the letter tables, s0[] through sN[], for each needle
	for ( int32_t i = 0 ; i < numNeedlesToInit ; i++ ) {
		// breathe
		QUICKPOLL(niceness);
		unsigned char *w    = (unsigned char *)needles[i].m_string;
		unsigned char *wend = w + needles[i].m_stringSize;
		// BITVEC is now 64 bits
		mask = (1<<(i&0x3f)); // (1<<(i%64));
		// if the needle is small, fill up the remaining letter tables
		// with its mask... so it matches any character in haystack.
		s0[(unsigned char)to_lower_a(*w)] |= mask;
		s0[(unsigned char)to_upper_a(*w)] |= mask;
		w += 1;//step;
		if ( w >= wend ) {
			for ( int32_t j = 0 ; j < 256 ; j++ )  {
				s1[j] |= mask;
				s2[j] |= mask;
				s3[j] |= mask;
			}
			continue;
		}

		s1[(unsigned char)to_lower_a(*w)] |= mask;
		s1[(unsigned char)to_upper_a(*w)] |= mask;
		w += 1;//step;
		if ( w >= wend ) {
			for ( int32_t j = 0 ; j < 256 ; j++ )  {
				s2[j] |= mask;
				s3[j] |= mask;
			}
			continue;
		}

		s2[(unsigned char)to_lower_a(*w)] |= mask;
		s2[(unsigned char)to_upper_a(*w)] |= mask;
		w += 1;//step;
		if ( w >= wend ) {
			for ( int32_t j = 0 ; j < 256 ; j++ )  {
				s3[j] |= mask;
			}
			continue;
		}

		s3[(unsigned char)to_lower_a(*w)] |= mask;
		s3[(unsigned char)to_upper_a(*w)] |= mask;
		w += 1;//step;
	}

	// return a ptr to the first match if we should, this is it
	char *retVal = NULL;
	// debug vars
	//int32_t debugCount = 0;
	//int32_t pp = 0;
	// now find the first needle in the haystack
	unsigned char *p    = (unsigned char *)haystack;
	unsigned char *pend = (unsigned char *)haystack + haystackSize;
	char          *dend = (char *)pend;

	// do not breach!
	pend -= 4;

	for ( ; p < pend ; p++ ) {
		// breathe
		QUICKPOLL(niceness);
		//if ( (char *)p - (char *)haystack >= 12508 )
		//	log("hey");
		// analytics...
		
		// is this a possible match? (this should be VERY fast)
		mask  = s0[*(p+0)];
		if ( ! mask ) continue;
		mask &= s1[*(p+1)];
		if ( ! mask ) continue;
		mask &= s2[*(p+2)];
		if ( ! mask ) continue;
		mask &= s3[*(p+3)];
		if ( ! mask ) continue;
		//debugCount++;
		/*
		// display
		char oo[148];
		char *xx ;
		xx = oo;
		//gbmemcpy ( xx , p , 8 );
		for ( int32_t k = 0 ; k < 5 ; k++ ) {
			*xx++ = p[k];
		}
		gbmemcpy ( xx , "..." , 3 );
		xx += 3;
		*/
		//
		// XXX: do a hashtable lookup here so we have the candidate
		//      matches in a chain... 
		// XXX: for small needles which match frequently let's have
		//      a single char hash table, a 2 byte char hash table,
		//      etc. so if we have small needles we check the hash
		//      in those tables first, but only if mask & SMALL_NEEDLE
		//      is true! the single byte needle hash table can just
		//      be a lookup table. just XOR the bytes together for
		//      the hash.
		// XXX: just hash the mask into a table to get candidate
		//      matches in a chain? but there's 4B hashes!!
		// we got a good candidate, loop through all the needles
		for ( int32_t j = 0 ; j < numNeedles ; j++ ) {
			// skip if does not match mask, will save time
			if ( ! ((1<<(j&0x3f)) & mask) ) continue;
			if( needles[j].m_stringSize > 3) {
				// ensure first 4 bytes matches this needle's
				if (needles[j].m_string[0]!=to_lower_a(*(p+0)))
					continue;
				if (needles[j].m_string[1]!=to_lower_a(*(p+1)))
					continue;
				if (needles[j].m_string[2]!=to_lower_a(*(p+2)))
					continue;
				if (needles[j].m_string[3]!=to_lower_a(*(p+3)))
					continue;
			}
			// get needle size
			int32_t msize = needles[j].m_stringSize;
			// can p possibly be big enough?
			if ( pend - p < msize ) continue;
			// needle is "m" now
			char *m    = needles[j].m_string;
			char *mend = needles[j].m_stringSize + m;
			// use a tmp ptr for ptr into haystack
			char *d = (char *)p;
			// skip first 4 bytes since we know they match
			if(msize > 3) {
				d += 4;
				m += 4;
			}
			// loop over each char in "m"
			//for ( ; *m ; m++ ) {
			for ( ; m < mend ; m++ ) {
				//while ( ! *d && d < dend ) d++;
				//while ( ! *m && m < mend ) m++;
				// if we are a non alnum, that will match
				// any string of non-alnums, like a space
				// for instance. the 0 byte does not count
				// because it is used in utf16 a lot. this
				// may trigger some false matches in utf16
				// but, oh well... this way "link partner"
				// will match "link  - partner" in the haystk
				if ( is_wspace_a(*m) && m < mend ) {
					// skip all in "d" then.
					while (d<dend&&is_wspace_a(*d)) d++;
					// advance m then
					continue;
				}
				// make sure we match otherwise
				if ( *m != to_lower_a(*d) ) break;
				// ok, we matched, go to next
				d++;
			}
			// if not null, keep going
			if ( m < mend ) continue;
			// if this needle is "special" AND it occurs AFTER
			// linkPos, then do not consider it a match. this is
			// if we have a comment section indicator, like
			// "div id=\"comment" AND it occurs AFTER linkPos
			// (the char ptr to our link in the haystack) then
			// the match does not count.
			if ( linkPos && needles[j].m_isSection && 
			     (char *)p>linkPos ) {
				// record this for LinkText.cpp
				if ( hadPreMatch ) *hadPreMatch = true;
				continue;
			}
			// store ptr if NULL
			if ( ! needles[j].m_firstMatch )
				needles[j].m_firstMatch = (char *)p;
			// return ptr to needle in "haystack"
			if ( stopAtFirstMatch ) {
				// ok, we got a match
				if ( needleNum ) *needleNum = j;
				//return (char *)p;
				retVal = (char *)p;
				p = pend;
				break;
			}
			// otherwise, just count it
			needles[j].m_count++;
			// see if we match another needle, fixes bug
			// of matching "anal" but not "analy[tics]"
			continue;
			// advance to next char in the haystack
			break;
		}
		// ok, we did not match any needles, advance p and try again
	}

	//
	// HACK:
	// 
	// repeat above loop but for the last 4 characters in haystack!!
	// this fixes a electric fence mem breach core
	//
	// it is slower because we check for \0
	//
	pend += 4;

	for ( ; p < pend ; p++ ) {
		// breathe
		QUICKPOLL(niceness);
		//if ( (char *)p - (char *)haystack >= 12508 )
		//	log("hey");
		// is this a possible match? (this should be VERY fast)
		mask  = s0[*(p+0)];
		if ( ! mask ) continue;
		if ( p+1 < pend ) {
			mask &= s1[*(p+1)];
			if ( ! mask ) continue;
		}
		if ( p+2 < pend ) {
			mask &= s2[*(p+2)];
			if ( ! mask ) continue;
		}
		if ( p+3 < pend ) {
			mask &= s3[*(p+3)];
			if ( ! mask ) continue;
		}
		//debugCount++;
		/*
		// display
		char oo[148];
		char *xx ;
		xx = oo;
		//gbmemcpy ( xx , p , 8 );
		for ( int32_t k = 0 ; k < 5 ; k++ ) {
			*xx++ = p[k];
		}
		gbmemcpy ( xx , "..." , 3 );
		xx += 3;
		*/
		//
		// XXX: do a hashtable lookup here so we have the candidate
		//      matches in a chain... 
		// XXX: for small needles which match frequently let's have
		//      a single char hash table, a 2 byte char hash table,
		//      etc. so if we have small needles we check the hash
		//      in those tables first, but only if mask & SMALL_NEEDLE
		//      is true! the single byte needle hash table can just
		//      be a lookup table. just XOR the bytes together for
		//      the hash.
		// XXX: just hash the mask into a table to get candidate
		//      matches in a chain? but there's 4B hashes!!
		// we got a good candidate, loop through all the needles
		for ( int32_t j = 0 ; j < numNeedles ; j++ ) {
			// skip if does not match mask, will save time
			if ( ! ((1<<(j&0x3f)) & mask) ) continue;
			if( needles[j].m_stringSize > 3) {
				// ensure first 4 bytes matches this needle's
				if (needles[j].m_string[0]!=to_lower_a(*(p+0)))
					continue;
				if (!p[1] ||
				    needles[j].m_string[1]!=to_lower_a(*(p+1)))
					continue;
				if (!p[2] ||
				    needles[j].m_string[2]!=to_lower_a(*(p+2)))
					continue;
				if (!p[3] ||
				    needles[j].m_string[3]!=to_lower_a(*(p+3)))
					continue;
			}
			// get needle size
			int32_t msize = needles[j].m_stringSize;
			// can p possibly be big enough?
			if ( pend - p < msize ) continue;
			// needle is "m" now
			char *m    = needles[j].m_string;
			char *mend = needles[j].m_stringSize + m;
			// use a tmp ptr for ptr into haystack
			char *d = (char *)p;
			// skip first 4 bytes since we know they match
			if(msize > 3) {
				d += 4;
				m += 4;
			}
			// loop over each char in "m"
			//for ( ; *m ; m++ ) {
			for ( ; m < mend ; m++ ) {
				//while ( ! *d && d < dend ) d++;
				//while ( ! *m && m < mend ) m++;
				// if we are a non alnum, that will match
				// any string of non-alnums, like a space
				// for instance. the 0 byte does not count
				// because it is used in utf16 a lot. this
				// may trigger some false matches in utf16
				// but, oh well... this way "link partner"
				// will match "link  - partner" in the haystk
				if ( is_wspace_a(*m) && m < mend ) {
					// skip all in "d" then.
					while (d<dend&&is_wspace_a(*d)) d++;
					// advance m then
					continue;
				}
				// make sure we match otherwise
				if ( *m != to_lower_a(*d) ) break;
				// ok, we matched, go to next
				d++;
			}
			// if not null, keep going
			if ( m < mend ) continue;
			// if this needle is "special" AND it occurs AFTER
			// linkPos, then do not consider it a match. this is
			// if we have a comment section indicator, like
			// "div id=\"comment" AND it occurs AFTER linkPos
			// (the char ptr to our link in the haystack) then
			// the match does not count.
			if ( linkPos && needles[j].m_isSection && 
			     (char *)p>linkPos ) {
				// record this for LinkText.cpp
				if ( hadPreMatch ) *hadPreMatch = true;
				continue;
			}
			// store ptr if NULL
			if ( ! needles[j].m_firstMatch )
				needles[j].m_firstMatch = (char *)p;
			// return ptr to needle in "haystack"
			if ( stopAtFirstMatch ) {
				// ok, we got a match
				if ( needleNum ) *needleNum = j;
				//return (char *)p;
				retVal = (char *)p;
				p = pend;
				break;
			}
			// otherwise, just count it
			needles[j].m_count++;
			// advance to next char in the haystack
			break;
		}
		// ok, we did not match any needles, advance p and try again
	}


	//if ( debugCount > 0 ) pp = haystackSize / debugCount;
	//log("build: debug count = %"INT32" uc=%"INT32" hsize=%"INT32" "
	//    "1 in %"INT32" chars matches.",
	//    debugCount,(int32_t)isHaystackUtf16,haystackSize,pp);

	// before we exit, clean up
	return retVal;
}