Exemplos de is_wspace_a em C++ (Cpp)

Exemplo n.º 1

0

Exibir arquivo

Arquivo: HashTableT.cpp Projeto: BILObilo/open-source-search-engine

// hash the space (or +) separated list of numbers in this string
//template<class Key_t, class Val_t> 
//bool HashTableT<Key_t,Val_t>::hashFromString ( HashTableT *ht , char *x ) {
bool hashFromString ( HashTableT<long long,char> *ht , char *x ) {
	if ( ! x ) return true;
	char *xend = x + gbstrlen(x);
	long  n    = 1;
	for ( char *s = x ; s < xend ; s++ ) 
		// i am assuming this is ascii here!
		if (is_wspace_a(*s)||*s == '+') n++;
	// double # slots to nd*2 so that hashtable is somewhat sparse --> fast
	if ( ! ht->set ( n * 2 , NULL , 0 , false ) ) return false;
	// now populate with the docids
	for ( char *s = x ; s < xend ; ) {
		// skip the plusses
		while ( s < xend && (is_wspace_a(*s) || *s == '+') ) s++;
		// are we done?
		if ( s >= xend ) break;
		// get the docid, a long long (64 bits)
		long long d = atoll ( s );
		// add it, should never fail!
		if ( ! ht->addKey ( d , 1 ) ) return false;
		// skip till +
		while ( s < xend && (*s != '+' && !is_wspace_a(*s)) ) s++;
		// are we done?
		if ( s >= xend ) break;
	}
	return true;
}

Exemplo n.º 2

0

Exibir arquivo

Arquivo: fctypes.cpp Projeto: privacore/open-source-search-engine

double atof2 ( const char *s, int32_t len ) {
	// skip over spaces
	const char *end = s + len;
	while ( s < end && is_wspace_a ( *s ) ) { s++; len--; }
	// return 0 if all spaces
	if ( s == end ) return 0;
	char tmpBuf[128];
	if ( len >= 128 ) len = 127;
	//strncpy ( dst , s , len );

	const char *p = s;
	const char *srcEnd = s + len;
	char *dst = tmpBuf;
	// remove commas
	for ( ; p < srcEnd ; p++ ) {
		// skip commas
		if ( *p == ',' ) continue;
		// otherwise store it
		*dst++ = *p;
	}
	// null term
	*dst = '\0';
	//buf[len] = '\0';
	return atof ( tmpBuf );
}

Exemplo n.º 3

0

Exibir arquivo

Arquivo: HttpMime.cpp Projeto: privacore/open-source-search-engine

bool HttpMime::getField(const char **field, size_t *fieldLen) {
	size_t currentLinePos = m_valueStartPos;

	const char *colonPos = (const char *)memchr(m_currentLine + currentLinePos, ':', m_currentLineLen);

	// no colon
	if (colonPos == NULL) {
		return false;
	}

	currentLinePos = colonPos - m_currentLine;
	m_valueStartPos = currentLinePos + 1;

	*field = m_currentLine;
	*fieldLen = currentLinePos;

	// strip ending whitespaces
	while (*fieldLen > 0 && is_wspace_a(m_currentLine[*fieldLen - 1])) {
		--(*fieldLen);
	}

	logTrace(g_conf.m_logTraceHttpMime, "field='%.*s'", static_cast<int>(*fieldLen), *field);

	return (*fieldLen > 0);
}

Exemplo n.º 4

0

Exibir arquivo

Arquivo: qa.cpp Projeto: firatkarakusoglu/open-source-search-engine

bool loadUrls ( ) {
	static bool s_loaded = false;
	if ( s_loaded ) return true;
	s_loaded = true;
	// use injectme3 file
	s_ubuf1.load("./injectme3");
	// scan for +++URL: xxxxx
	char *s = s_ubuf1.getBufStart();
	for ( ; *s ; s++ ) {
		if ( strncmp(s,"+++URL: ",8) ) continue;
		// got one
		// \0 term it for s_contentPtrs below
		*s = '\0';
		// find end of it
		s += 8;
		char *e = s;
		for ( ; *e && ! is_wspace_a(*e); e++ );
		// null term it
		if ( *e ) *e = '\0';
		// store ptr
		s_ubuf2.pushLong((long)s);
		// skip past that
		s = e;
		// point to content
		s_cbuf2.pushLong((long)(s+1));
	}
	// make array of url ptrs
	s_urlPtrs = (char **)s_ubuf2.getBufStart();
	s_contentPtrs= (char **)s_cbuf2.getBufStart();
	return true;
}

Exemplo n.º 5

0

Exibir arquivo

Arquivo: Conf.cpp Projeto: BlaBlaNet/open-source-search-engine

bool isInWhiteSpaceList ( char *p , char *buf ) {

	if ( ! p ) return false;

	char *match = strstr ( buf , p );
	if ( ! match ) return false;
	
	int32_t len = gbstrlen(p);

	// ensure book-ended by whitespace
	if (  match && 
	      (match == buf || is_wspace_a(match[-1])) &&
	      (!match[len] || is_wspace_a(match[len])) )
		return true;

	// no match
	return false;
}

Exemplo n.º 6

0

Exibir arquivo

Arquivo: Json.cpp Projeto: DeadNumbers/open-source-search-engine

bool endsInCurly ( char *s , int32_t slen ) {
	char *e = s + slen - 1;
	// don't backup more than 30 chars
	char *m = e - 30;
	if ( m < s ) m = s;
	// \0?
	if ( e > m && *e == '\0' ) e--;
	// scan backwards, skipping whitespace
	for ( ; e > m && is_wspace_a(*e) ; e-- );
	// should be a } now to be valid json
	if ( e >= m && *e == '}' ) return true;
	return false;
}

Exemplo n.º 7

0

Exibir arquivo

Arquivo: fctypes.cpp Projeto: privacore/open-source-search-engine

int64_t atoll2 ( const char *s, int32_t len ) {
	// skip over spaces
	const char *end = s + len;
	while ( s < end && is_wspace_a ( *s ) ) s++;
	// return 0 if all spaces
	if ( s == end ) return 0;
	int32_t i   = 0;
	int64_t val = 0LL;
	bool negative = false;
	if ( s[0] == '-' ) { negative = true; i++; }
	while ( i < len && is_digit(s[i]) ) val = val * 10LL + ( s[i++] - '0');
	if ( negative ) return -val;
	return val;
}

Exemplo n.º 8

0

Exibir arquivo

Arquivo: HttpRequest.cpp Projeto: BKJackson/open-source-search-engine

long HttpRequest::getLong ( char *field , long defaultLong ) {
	 long len;
	 char *value = getValue ( field, &len, NULL );
	 // return default if no match
	 if ( ! value || len == 0 ) return defaultLong;
	 // otherwise, it's a match
	 char c = value[len];
	 value[len] = '\0';
	 long res = atol ( value );
	 value[len] = c;
	 if ( res == 0 ) {
		 // may be an error. if so return the default
		 long i = 0;
		 while ( i < len && is_wspace_a(value[i]) ) i++;
		 if ( i < len && (value[i] == '-' || value[i] == '+') ) i++;
		 if ( i >= len || !is_digit(value[i]) ) return defaultLong;
	 }
	 return res;
 }

Exemplo n.º 9

0

Exibir arquivo

Arquivo: HttpMime.cpp Projeto: privacore/open-source-search-engine

bool HttpMime::getValue(const char **value, size_t *valueLen) {
	// strip starting whitespaces
	while (is_wspace_a(m_currentLine[m_valueStartPos]) && (m_valueStartPos < m_currentLineLen)) {
		++m_valueStartPos;
	}

	*value = m_currentLine + m_valueStartPos;
	*valueLen = m_currentLineLen - m_valueStartPos;

	const char *semicolonPos = (const char *)memchr(*value, ';', *valueLen);
	if (semicolonPos) {
		// value should end at semicolon if present
		*valueLen = semicolonPos - *value;
		m_attributeStartPos = semicolonPos - m_currentLine + 1;
	}

	logTrace(g_conf.m_logTraceHttpMime, "value='%.*s'", static_cast<int>(*valueLen), *value);

	return (*valueLen > 0);
}

Exemplo n.º 10

0

Exibir arquivo

Arquivo: HttpRequest.cpp Projeto: BKJackson/open-source-search-engine

double HttpRequest::getDouble ( char *field , double defaultDouble ) {
	 long len;
	 char *value = getValue ( field, &len, NULL );
	 // return default if no match
	 if ( ! value || len == 0 ) return defaultDouble;
	 // otherwise, it's a match
	 char c = value[len];
	 value[len] = '\0';
	 double res = strtod ( value , NULL );
	 value[len] = c;
	 if ( res == +0.0 ) {
		 // may be an error. if so return the default
		 long i = 0;
		 while ( i < len && is_wspace_a(value[i]) ) i++;
		 if ( i < len && 
		      (value[i] == '-' || 
		       value[i] == '+' || 
		       value[i] == '.') ) i++;
		 if ( i >= len || !is_digit(value[i]) ) return defaultDouble;
	 }
	 return res;
}

Exemplo n.º 11

0

Exibir arquivo

Arquivo: SearchInput.cpp Projeto: FlavioFalcao/open-source-search-engine

bool SearchInput::set ( TcpSocket *sock , HttpRequest *r , Query *q ) {

	// store list of collection #'s to search here. usually just one.
	m_collnumBuf.reset();

	// zero out everything, set niceness to 0
	clear ( 0 ) ;

	// save it now
	m_socket = sock;

	// get coll rec
	long  collLen9;
	char *coll9 = r->getString ( "c" , &collLen9 );

	m_firstCollnum = -1;

	CollectionRec *cr = NULL;

	// now convert list of space-separated coll names into list of collnums
	char *p = coll9;

	// if no collection list was specified look for "token=" and
	// use those to make collections. hack for diffbot.
	char *token = r->getString("token",NULL);
	// find all collections under this token
	for ( long i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
		// must not have a "&c="
		if ( p ) break;
		// must have a "&token="
		if ( ! token ) break;
		// skip if empty
		CollectionRec *tmpcr = g_collectiondb.m_recs[i];
		if ( ! tmpcr ) continue;
		// skip if does not match token
		if ( strcmp(token,tmpcr->m_diffbotToken.getBufStart()) ) 
			continue;
		// . we got a match
		// . set initial junk
		if ( ! cr ) {
			cr = tmpcr;
			m_firstCollnum = tmpcr->m_collnum;
		}
		// save the collection #
		if ( ! m_collnumBuf.safeMemcpy ( &tmpcr->m_collnum, 
						 sizeof(collnum_t) ) )
			return false;
	}

	// if we had a "&c=..." in the GET request process that
	if ( p ) {
	loop:
		char *end = p;
		for ( ; *end && ! is_wspace_a(*end) ; end++ );
		// temp null
		char c = *end;
		*end = '\0';
		CollectionRec *tmpcr = g_collectiondb.getRec ( p );
		// set defaults from the FIRST one
		if ( tmpcr && ! cr ) {
			cr = tmpcr;
			m_firstCollnum = tmpcr->m_collnum;
		}
		if ( ! tmpcr ) { 
			g_errno = ENOCOLLREC;
			log("query: missing collection %s",p);
			g_msg = " (error: no such collection)";		
			return false;
		}
		// add to our list
		if (!m_collnumBuf.safeMemcpy(&cr->m_collnum,sizeof(collnum_t)))
			return false;
		// restore the \0 character we wrote in there
		*end = c;
		// advance
		p = end;
		// skip to next collection name if there is one
		while ( *p && is_wspace_a(*p) ) p++; 
		// now add it's collection # to m_collnumBuf if there
		if ( *p ) goto loop;
	}


	//if (! coll){coll = g_conf.m_defaultColl; collLen = gbstrlen(coll); }
	//if ( ! coll )
	//	coll = g_conf.getDefaultColl(r->getHost(), r->getHostLen());
	//if ( ! coll || ! coll[0] )
	//	coll = "main";
	//if ( ! coll ) { g_errno = ENOCOLLREC; return false; }
	//collLen = gbstrlen(coll);
	//CollectionRec *cr = g_collectiondb.getRec ( coll9 );
	//if ( ! cr ) { 
	//	g_errno = ENOCOLLREC;
	//	g_msg = " (error: no such collection)";		
	//	return false;
	//}

	// set all to 0 just to avoid any inconsistencies
	//long size = (char *)&m_END_TEST - (char *)&m_START;
	//memset ( this , 0x00 , size );
	//setToDefaults( cr , 0 ); // niceness

	m_cr = cr;

	if ( ! cr ) {
		log("si: collection does not exist");
		g_errno = ENOCOLLREC;
		return false;
	}

	//m_coll2    = m_cr->m_coll;
	//m_collLen2 = gbstrlen(m_coll2);

	

	// from ::reset()
	m_languageWeightFactor = 0.33;

	// Set IP for language detection.
	// (among other things)
	if ( sock ) m_queryIP = sock->m_ip;
	else        m_queryIP = 0;
	m_hr = r;

	// keep ptr to the query class to use
	m_q        = q;

	// set this here since its size can be variable
	m_sq = r->getString("sq",&m_sqLen);
	// negative docids
	m_noDocIds = r->getString("nodocids",&m_noDocIdsLen);
	// negative sites
	m_noSiteIds = r->getString("nositeids",&m_noSiteIdsLen);

	// Msg5e calls Msg40 with this set to true in the searchInput
	// so it can analyze the entire pages of each search result so it
	// can find the article start/end tag sequence indicators
	m_getTitleRec = r->getLong("gettrs",0);

	m_getSitePops = r->getLong("getsitepops",0 );

        // does this collection ban this IP?
	/*
	long  encapIp = 0; 
m	if (! cr->hasSearchPermission ( sock, encapIp ) ) {
		g_errno = ENOPERM;
		g_msg = " (error: permission denied)";
		return false;
	}
	*/

	// set all search parms in SearchInput to defaults
	for ( long i = 0 ; i < g_parms.m_numSearchParms ; i++ ) {
		Parm *m = g_parms.m_searchParms[i];
		// sanity
		if ( m->m_soff < 0 ) { char *xx=NULL;*xx=0; }
		char *x = (char *)this + m->m_soff;
		// what is the def val ptr
		char *def = NULL;
		if      ( m->m_off >= 0 && m->m_obj == OBJ_COLL )
			def = ((char *)cr) + m->m_off;
		else if ( m->m_off >= 0 && m->m_obj == OBJ_CONF )
			def = ((char *)&g_conf) + m->m_off;
		// set it based on type
		if      ( m->m_type == TYPE_LONG ) {
			long v = 0;
			if ( def )
				v = *(long *)def;
			else if ( m->m_def ) 
				v = atol(m->m_def);
			*(long *)x = v;
		}
		else if ( m->m_type == TYPE_BOOL ) {
			long v = 0;
			if ( def ) 
				v = *(char *)def;
			else if ( m->m_def ) 
				v = atol(m->m_def);
			// sanity test!
			if ( v != 0 && v != 1 )
				log("query: got non-bool default "
				    "for bool parm %s",m->m_title);
			if ( v ) *(char *)x = 1;
			else     *(char *)x = 0;
		}
		else if ( m->m_type == TYPE_CHAR ) {
			if ( def )
				*(char *)x = *(char *)def;
			else if ( m->m_def ) 
				*(char *)x = atol(m->m_def);
		}
		else if ( m->m_type == TYPE_FLOAT ) {
			float v = 0;
			if ( def )
				v = *(float *)def;
			else if ( m->m_def ) 
				v = atof(m->m_def);
			*(float *)x = (float)v;
		}
		else if ( m->m_type == TYPE_DOUBLE ) {
			double v = 0;
			if ( def )
				v = *(double *)def;
			else if ( m->m_def ) 
				v = atof(m->m_def);
			*(double *)x = (double)v;
		}
		else if ( m->m_type == TYPE_LONG_LONG ) {
			long long v = 0;
			if ( def )
				v = *(long long *)def;
			else if ( m->m_def ) 
				v = atoll(m->m_def);
			*(long long *)x = (long long)v;
		}
		else if ( m->m_type == TYPE_STRING ||
			  m->m_type == TYPE_STRINGBOX ) {
			//if ( m->m_cgi && strcmp ( m->m_cgi, "erpc" ) == 0 )
			//	log("hey1");
			//if ( m->m_cgi && strcmp ( m->m_scgi, "q" ) == 0 )
			//	log("hey1");
			char *v = NULL;
			if ( def )
				v = (char *)def;
			else if ( m->m_def ) 
				v = m->m_def;
			*(char **)x = v;
			// set the length
			if ( ! v ) *(long *)(x-4) = 0;
			else       *(long *)(x-4) = gbstrlen(v);
		}
	}

	// this is just used to determine in PageResults.cpp if we should
	// show admin knobs next to each result...
	// default to off for now. default back on.
	m_isAdmin = r->getLong("admin",1);
	//if ( m_isAdmin ) m_isAdmin = g_users.hasPermission ( r,PAGE_MASTER);
	// local ip?
	if ( ! r->isLocal() ) m_isAdmin = 0;

	// default set does not take into account g_conf,
	// so we will take care of that here ourselves...
	m_adFeedEnabled  = g_conf.m_adFeedEnabled;
	//m_excludeLinkText = g_conf.m_excludeLinkText;
	//m_excludeMetaText = g_conf.m_excludeMetaText;

	// we need to get some cgi values in order to correct the defaults
	// based on if we're doing an xml feed, have a site: query, etc.
	//long  xml      = r->getLong ( "xml" , 0 ); // was "raw"
	long  siteLen  = 0; r->getString ("site",&siteLen);
	long  sitesLen = 0; 
	char *sites = r->getString ("sites",&sitesLen,NULL);

	// save it if there
	if ( sites && sitesLen > 0 && 
	     ( ! m_whiteListBuf.safeStrcpy(sites)||
	       ! m_whiteListBuf.nullTerm() ) )
		return log("query: unable to strcpy whitelist");
	

	char format = getFormatFromRequest ( r );

	// now override automatic defaults for special cases
	if ( format != FORMAT_HTML ) {
		m_familyFilter            = 0;
		// this is causing me a headache when on when i dont know it
		m_restrictIndexdbForQuery   = false;
		// this is hackish
		if ( r->getLong("rt",0) ) m_restrictIndexdbForQuery=false;
		m_numTopicsToDisplay      = 0;
		m_doQueryHighlighting     = 0;
		m_spellCheck              = 0;
		m_refs_numToGenerate      = 0;
		m_refs_docsToScan         = 0;
		// default scoring info to off
		m_getDocIdScoringInfo = false;
	}
	else if ( m_siteLen > 0 ) {
		m_restrictIndexdbForQuery = false;
		m_doSiteClustering        = false;
		m_ipRestrictForTopics     = false;
	}
	else if ( m_whiteListBuf.length() > 0 ) {
		m_ipRestrictForTopics     = false;
	}

	m_doIpClustering          = false;
	//m_sitesQueryLen           = 0;

	// set the user ip, "uip"
	long uip = m_queryIP;
	char *uipStr = m_hr->getString ("uip" , NULL );
	long tmpIp = 0; if ( uipStr ) tmpIp = atoip(uipStr);
	if ( tmpIp ) uip = tmpIp;

	//
	//
	// BEGIN MAIN PARM SETTING LOOP
	//
	//

	// loop through all possible cgi parms to set SearchInput
	for ( long i = 0 ; i < g_parms.m_numSearchParms ; i++ ) {
		Parm *m = g_parms.m_searchParms[i];
		char *x = (char *)this + m->m_soff;
		// what is the parm's cgi name?
		char *cgi = m->m_scgi;
		if ( ! cgi ) cgi = m->m_cgi;
		// sanity check
		if ( ! m->m_sparm ) {
			log("query: Failed search input sanity check.");
			char *xx = NULL; *xx = 0;
		}
		// . break it down by type now
		// . get it from request and store it in SearchInput
		if ( m->m_type == TYPE_LONG ) {
			// default was set above
			long def = *(long *)x;
			// assume default
			long v = def;
			// but cgi parms override cookie
			v = r->getLong ( cgi , v );
			// but if its a privledged parm and we're not an admin
			// then do not allow overrides, but m_priv of 3 means
			// to not display for clients, but to allow overrides
			if ( ! m_isAdmin && m->m_priv && m->m_priv!=3) v = def;
			// bounds checks
			if ( v < m->m_smin ) v = m->m_smin;
			if ( v > m->m_smax ) v = m->m_smax;
			if ( m->m_sminc >= 0 ) {
				long vmin = *(long *)((char *)cr+m->m_sminc);
				if ( v < vmin ) v = vmin;
			}
			if ( m->m_smaxc >= 0 ) {
				long vmax = *(long *)((char *)cr+m->m_smaxc);
				if ( v > vmax ) v = vmax;
			}
			// set it
			*(long *)x = v;
			// do not print start result num (m->m_sprop is 0 for 
			// "s" now)
			//if ( cgi[0] == 's' && cgi[1] == '\0' ) continue;
			// should we propagate it? true by default
			//if ( ! m->m_sprop ) continue;
			// if it is the same as its default, and the default is
			// always from m_def and never from the CollectionRec, 
			// then do not both storing it in here! what's the 
			// point?
			if ( v == def && m->m_off < 0 ) continue;
			// if not default do not propagate
			if ( v == def ) continue;
			// . include for sure if explicitly provided
			// . vp will be NULL if "cgi" is not explicitly listed 
			//   as a cgi parm. otherwise, even if *vp == '\0', vp
			//   is non-NULL.
			// . crap, it can be in the cookie now
			//char *vp = r->getValue(cgi, NULL, NULL);
			// if not given at all, do not propagate
			//if ( ! vp ) continue;
			// store in up if different from default, even if
			// same as default ("def") because default may be
			// changed by the admin since m->m_off >= 0
			//if ( m->m_sprpg && up + gbstrlen(cgi) + 20 < upend ) 
			//	up += sprintf ( up , "%s=%li&", cgi , v );
			//if ( m->m_sprpp && pp + gbstrlen(cgi) + 80 < ppend )
			//	pp += sprintf ( pp , "<input type=hidden "
			//			"name=%s value=\"%li\">\n", 
			//			cgi , v );
		}
		else if ( m->m_type == TYPE_LONG_LONG ) {
			// default was set above
			long def = *(long long *)x;
			// assume default
			long long v = def;
			// but cgi parms override cookie
			v = r->getLongLong ( cgi , v );
			// but if its a privledged parm and we're not an admin
			// then do not allow overrides, but m_priv of 3 means
			// to not display for clients, but to allow overrides
			if ( ! m_isAdmin && m->m_priv && m->m_priv!=3) v = def;
			// set it
			*(long long *)x = v;
			// if it is the same as its default, and the default is
			// always from m_def and never from the CollectionRec, 
			// then do not both storing it in here! what's the 
			// point?
			if ( v == def && m->m_off < 0 ) continue;
			// if not default do not propagate
			if ( v == def ) continue;
		}
		else if ( m->m_type == TYPE_FLOAT ) {
			// default was set above
			float def = *(float *)x;
			// get overriding from http request, if any
			float v;
			// but if its a privledged parm and we're not an admin
			// then do not allow overrides
			if ( ! m_isAdmin && m->m_priv && m->m_priv!=3) v = def;
			else v = r->getFloat( cgi , def );
			// bounds checks
			if ( v < m->m_smin ) v = m->m_smin;
			if ( v > m->m_smax ) v = m->m_smax;
			if ( m->m_sminc >= 0 ) {
				float vmin = *(float *)((char *)cr+m->m_sminc);
				if ( v < vmin ) v = vmin;
			}
			if ( m->m_smaxc >= 0 ) {
				float vmax = *(float *)((char *)cr+m->m_smaxc);
				if ( v > vmax ) v = vmax;
			}
			// set it
			*(float *)x = v;
			// do not print start result num
			//if ( cgi[0] == 's' && cgi[1] == '\0' ) continue;

			// include for sure if explicitly provided
			char *vp = r->getValue(cgi, NULL, NULL);
			if ( ! vp ) continue;
			// unchanged from default?
			if ( v == def ) continue;
			// store in up different from default
			//if ((vp||v!= def) && up + gbstrlen(cgi)+20 < upend ) 
			//	up += sprintf ( up , "%s=%f&", cgi , v );
			//if ((vp||v!= def) && pp + gbstrlen(cgi)+20 < ppend )
			//	pp += sprintf ( pp , "<input type=hidden "
			//			"name=%s value=\"%f\">\n", 
			//			cgi , v );
		}
		else if ( m->m_type == TYPE_DOUBLE ) {
			// default was set above
			double def = *(double *)x;
			// get overriding from http request, if any
			double v;
			// but if its a privledged parm and we're not an admin
			// then do not allow overrides
			if ( ! m_isAdmin && m->m_priv && m->m_priv!=3) v = def;
			else v = r->getDouble( cgi , def );
			// bounds checks
			if ( v < m->m_smin ) v = m->m_smin;
			if ( v > m->m_smax ) v = m->m_smax;
			if ( m->m_sminc >= 0 ) {
				double vmin=*(double *)((char *)cr+m->m_sminc);
				if ( v < vmin ) v = vmin;
			}
			if ( m->m_smaxc >= 0 ) {
				double vmax=*(double *)((char *)cr+m->m_smaxc);
				if ( v > vmax ) v = vmax;
			}
			// set it
			*(double *)x = v;
			// include for sure if explicitly provided
			char *vp = r->getValue(cgi, NULL, NULL);
			if ( ! vp ) continue;
			// unchanged from default?
			if ( v == def ) continue;
		}

		else if ( m->m_type == TYPE_BOOL ) {
			// default was set above
			long def = *(char *)x;
			if ( def != 0 ) def = 1; // normalize
			// assume default
			long v = def;
			// cgi parms override cookie
			v = r->getBool ( cgi , v );
			// but if no perm, use default
			if ( ! m_isAdmin && m->m_priv && m->m_priv!=3) v = def;
			if ( v != 0 ) v = 1; // normalize
			*(char *)x = v;
			// don't propagate rcache
			//if ( ! strcmp(cgi,"rcache") ) continue;
			// should we propagate it? true by default
			//if ( ! m->m_sprop ) continue;
			// if it is the same as its default, and the default is
			// always from m_def and never from the CollectionRec, 
			// then do not both storing it in here! what's the 
			// point?
			if ( v == def && m->m_off < 0 ) continue;
			// if not default do not propagate
			if ( v == def ) continue;
			// . include for sure if explicitly provided
			// . vp will be NULL if "cgi" is not explicitly listed 
			//   as a cgi parm. otherwise, even if *vp == '\0', vp 
			//   is non-NULL.
			// . crap, it can be in the cookie now!
			//char *vp = r->getValue(cgi, NULL, NULL);
			// if not given at all, do not propagate
			//if ( ! vp ) continue;
			// store in up if different from default, even if
			// same as default ("def") because default may be
			// changed by the admin since m->m_off >= 0
			//if ( m->m_sprpg && up + gbstrlen(cgi) + 10 < upend )
			//	up += sprintf ( up , "%s=%li&", cgi , v );
			//if ( m->m_sprpp && pp + gbstrlen(cgi) + 80 < ppend )
			//	pp += sprintf ( pp , "<input type=hidden "
			//			"name=%s value=\"%li\">\n", 
			//			cgi , v );
		}
		else if ( m->m_type == TYPE_CHAR ) {
			// default was set above
			char def = *(char *)x;
			*(char *)x = r->getLong ( cgi, def );
			// use this
			long v = *(char *)x;
			// store in up if different from default, even if
			// same as default ("def") because default may be
			// changed by the admin since m->m_off >= 0. nah,
			// let's try to reduce cgi parm pollution...
			if ( v == def ) continue;
			//if ( m->m_sprpg && up + gbstrlen(cgi) + 10 < upend )
			//	up += sprintf ( up , "%s=%li&", cgi , v );
			//if ( m->m_sprpp && pp + gbstrlen(cgi) + 80 < ppend )
			//	pp += sprintf ( pp , "<input type=hidden "
			//			"name=%s value=\"%li\">\n", 
			//			cgi , v );
		}
		else if ( m->m_type == TYPE_STRING ||
			  m->m_type == TYPE_STRINGBOX ) {
			//if ( m->m_cgi && strcmp ( m->m_cgi, "qlang" ) == 0 )
			//	log("hey2");
			char *def = *(char **)x;
			// get overriding from http request, if any
			long len = 0;
			char *v = NULL;
			// . cgi parms override cookie
			// . is this url encoded?
			v = r->getString ( cgi , &len , v );
			// if not specified explicitly, default it and continue
			if ( ! v ) {
				// sanity
				if  ( ! def ) def = "";
				*(char **)x = def;
				// length preceeds char ptr in SearchInput
				*(long *)(x - 4) = gbstrlen(def);
				continue;
			}
			// if something was specified, override, it might
			// be length zero, too
			*(char **)x = v;
			// length preceeds char ptr in SearchInput
			*(long *)(x - 4) = len;
			// do not store if query, that needs to be last so
			// related topics can append to it
			//if ( cgi[0] == 'q' && cgi[1] == '\0' ) continue;
			// should we propagate it? true by default
			//if ( ! m->m_sprop ) continue;
			// if not given at all, do not propagate
			//if ( ! vp ) continue;
			// if it is the same as its default, and the default is
			// always from m_def and never from the CollectionRec, 
			// then do not both storing it in here! what's the 
			// point?
			//if ( v && v == def && !strcmp(def,v) && m->m_off < 0)
			//	continue;
			// Need to set qcs based on page encoding...
			// not propagated
			if (!strncmp(cgi, "qcs", 3))
				continue;
			// do not propagate defaults
			if ( v == def ) continue;
			// store in up if different from default, even if
			// same as default ("def") because default may be
			// changed by the admin since m->m_off >= 0
			//if( m->m_sprpg && up+gbstrlen(cgi)+len+6  < upend ) {
			//	up += sprintf ( up , "%s=", cgi );
			//	up  += urlEncode ( up , upend-up-2 , v , len );
			//	*up++ = '&';
			//}
			// propogate hidden inputs
			//if ( m->m_sprpp && up+gbstrlen(cgi)+len+80 < upend )
			//	pp += sprintf ( pp , "<input type=hidden "
			//			"name=%s value=\"%s\">\n", 
			//			cgi , v );
		}
	}

	// now add the special "qh" parm whose default value changes 
	// depending on if we are widget related or not
	long qhDefault = 1;
	m_doQueryHighlighting = r->getLong("qh",qhDefault);


	//
	// TODO: use Parms.cpp defaults
	//
	TopicGroup *tg = &m_topicGroups[0];

	//
	//
	// gigabits
	//
	//
	tg->m_numTopics = 50;
	tg->m_maxTopics = 50;
	tg->m_docsToScanForTopics = m_docsToScanForTopics;
	tg->m_minTopicScore = 0;
	tg->m_maxWordsPerTopic = 6;
	tg->m_meta[0] = '\0';
	tg->m_delimeter = '\0';
	tg->m_useIdfForTopics = false;
	tg->m_dedup = true;
	// need to be on at least 2 pages!
	tg->m_minDocCount = 2;
	tg->m_ipRestrict = true;
	tg->m_dedupSamplePercent = 80;
	tg->m_topicRemoveOverlaps = true;
	tg->m_topicSampleSize = 4096;
	// max sequential punct chars allowedin a topic
	tg->m_topicMaxPunctLen = 1;
	m_numTopicGroups = 1;

	// use "&dg=1" to debug gigabits
	m_debugGigabits = r->getLong("dg",0);

	// override
	m_format = format;

	// . omit scoring info from the xml feed for now
	// . we have to roll this out to gk144 net i think
	//if ( m_format != FORMAT_HTML )
	//	m_getDocIdScoringInfo = 0;

	// turn off by default!
	if ( ! r->getLong("gigabits",0) ) {
		m_numTopicGroups = 0;
	}

	//////////////////////////////////////
	//
	// transform input into classes
	//
	//////////////////////////////////////

	// USER_ADMIN, ...
	m_username = g_users.getUsername(r);
	// if collection is NULL default to one in g_conf
	//if ( ! m_coll2 || ! m_coll2[0] ) { 
	//	//m_coll = g_conf.m_defaultColl; 
	//	m_coll2 = g_conf.getDefaultColl(r->getHost(), r->getHostLen());
	//	m_collLen2 = gbstrlen(m_coll2); 
	//}

	// reset this
	m_gblang = 0;

	// use gblang then!
	long gglen;
	char *gg = r->getString ( "clang" , &gglen , NULL );
	if ( gg && gglen > 1 )
		m_gblang = getLanguageFromAbbr(gg);

	// allow for "qlang" if still don't have it
	//long gglen2;
	//char *gg2 = r->getString ( "qlang" , &gglen2 , NULL );
	//if ( m_gblang == 0 && gg2 && gglen2 > 1 )
	//	m_gblang = getLanguageFromAbbr(gg2);
	
	// fix query by removing lang:xx from ask.com queries
	//char *end = m_query + m_queryLen -8;
	//if ( m_queryLen > 8 && m_query && end > m_query && 
	//     strncmp(end," lang:",6)==0 ) {
	//	char *asklang = m_query+m_queryLen - 2;
	//	m_gblang = getLanguageFromAbbr(asklang);
	//	m_queryLen -= 8;
	//	m_query[m_queryLen] = 0;
	//	
	//}

	// . returns false and sets g_errno on error
	// . sets m_qbuf1 and m_qbuf2
	if ( ! setQueryBuffers (r) )
		return log("query: setQueryBuffers: %s",mstrerror(g_errno));

	/* --- Virtual host language detection --- */
	if(r->getHost()) {
		bool langset = getLanguageFromAbbr(m_defaultSortLanguage);
		char *cp;
		if(!langset && (cp = strrchr(r->getHost(), '.'))) {
			uint8_t lang = getLanguageFromUserAgent(++cp);
			if(lang) {
				// char langbuf[128];
				// sprintf(langbuf, "qlang=%s\0", getLanguageAbbr(lang));
				//m_defaultSortLanguage = getLanguageAbbr(lang);
                                char *tmp = getLanguageAbbr(lang);
                                strncpy(m_defaultSortLanguage, tmp, 6);
				// log(LOG_INFO,
				//	getLanguageString(lang), r->getHost(), this);
			}
		}
	}
	/* --- End Virtual host language detection --- */


	//char *qs1 = m_defaultSortLanguage;

	// this overrides though
	//long qlen2;
	//char *qs2 = r->getString ("qlang",&qlen2,NULL);
	//if ( qs2 ) qs1 = qs2;
	
	//m_queryLang = getLanguageFromAbbr ( qs1 );

	m_queryLang = detectQueryLanguage();

	char *qs1 = getLangAbbr(m_queryLang);

	log("query: using default lang of %s",getLangAbbr(m_queryLang));

	if ( qs1 && qs1[0] && ! m_queryLang )
		log("query: qlang of \"%s\" is NOT SUPPORTED",qs1);



	char *prepend = r->getString("prepend",NULL,NULL);

	// . the query to use for highlighting... can be overriden with "hq"
	// . we need the language id for doing synonyms
	if ( prepend && prepend[0] )
		m_hqq.set2 ( prepend , m_queryLang , true );
	else if ( m_highlightQuery && m_highlightQuery[0] )
		m_hqq.set2 ( m_highlightQuery , m_queryLang , true );
	else if ( m_query && m_query[0] )
		m_hqq.set2 ( m_query , m_queryLang , true );

	// log it here
	log("query: got query %s",m_sbuf1.getBufStart());

	// . now set from m_qbuf1, the advanced/composite query buffer
	// . returns false and sets g_errno on error (ETOOMANYOPERANDS)
	if ( ! m_q->set2 ( m_sbuf1.getBufStart(), 
			   m_queryLang , 
			   m_queryExpansion ) ) {
		g_msg = " (error: query has too many operands)";
		return false;
	}

	if ( m_q->m_truncated && m_q->m_isBoolean ) {
		g_errno = ETOOMANYOPERANDS;
		g_msg = " (error: query has too many operands)";
		return false;
	}


	// do not allow querier to use the links: query operator unless they 
	// are admin or the search controls explicitly allow links:
	//if ( m_q->m_hasLinksOperator && ! m_isAdmin  &&
	//     !cr->m_allowLinksSearch ) {
	//	g_errno = ENOPERM;
	//	g_msg = " (error: permission denied)";
	//	return false;
	//}

	// miscellaneous
	m_showBanned = false;
	//if ( m_isAdmin ) m_showBanned = true;
	// admin can say &sb=0 explicitly to not show banned results
	// . if you are searching a diffbot collection, you are the admin
	//   i guess...
	if ( m_isAdmin || cr->m_isCustomCrawl )
		m_showBanned = r->getLong("sb",m_showBanned);


	if ( m_q->m_hasUrlField  ) m_ipRestrictForTopics = false;
	if ( m_q->m_hasIpField   ) {
		m_ipRestrictForTopics = false;
		//if( m_isAdmin ) m_showBanned = true;
	}
	if ( m_q->m_hasPositiveSiteField ) {
		m_ipRestrictForTopics = false;
		m_doSiteClustering    = false;
	}
	if ( m_q->m_hasQuotaField ) {
		m_doSiteClustering    = false;
		m_doDupContentRemoval = false;
	}


	m_familyFilter = r->getLong("ff",0);

	long codeLen;
	char *code = r->getString ("code",&codeLen,NULL);
	// set m_endUser
	if ( ! codeLen || ! code || strcmp(code,"gbfront")==0 )
		m_endUser = true;
	else
		m_endUser = false;


	if(codeLen && !m_endUser) {
		m_maxResults = cr->m_maxSearchResultsForClients;
	}
	else {
		m_maxResults = cr->m_maxSearchResults;
	}
	// don't let admin bewilder himself
	if ( m_maxResults < 1 ) m_maxResults = 500;

	// we can't get this kind of constraint from generic Parms routines
	if ( m_firstResultNum + m_docsWanted > m_maxResults ) 
		m_firstResultNum = m_maxResults - m_docsWanted;
	if(m_firstResultNum < 0) m_firstResultNum = 0;

	// DEBUG: temp hack
	// static bool first = true;
	//  if ( first ) { 
	//  	first = false;
	//  	m_firstResultNum = 10;
	//  }


	// if useCache is -1 then pick a default value
	if ( m_useCache == -1 ) {
		// assume yes as default
		m_useCache = 1;
		// . if query has url: or site: term do NOT use cache by def.
		// . however, if spider is off then use the cache by default
		if ( g_conf.m_spideringEnabled ) {
			if      ( m_q->m_hasPositiveSiteField ) m_useCache = 0;
			else if ( m_q->m_hasIpField   ) m_useCache = 0;
			else if ( m_q->m_hasUrlField  ) m_useCache = 0;
			else if ( m_siteLen  > 0      ) m_useCache = 0;
			else if ( m_whiteListBuf.length() ) m_useCache = 0;
			else if ( m_urlLen   > 0      ) m_useCache = 0;
		}
	}
	// never use cache if doing a rerank (msg3b)
	//if ( m_rerankRuleset >= 0 ) m_useCache = 0;
	bool readFromCache = false;
	if ( m_useCache ==  1  ) readFromCache = true;
	if ( m_rcache   ==  0  ) readFromCache = false;
	if ( m_useCache ==  0  ) readFromCache = false;
	// if useCache is false, don't write to cache if it was not specified
	if ( m_wcache == -1 ) {
		if ( m_useCache ==  0 ) m_wcache = 0;
		else                    m_wcache = 1;
	}
	// save it
	m_rcache = readFromCache;

	/*
	m_language = 0;
	// convert m_languageCode to a number for m_language
	if ( m_languageCode ) {
		m_language = (unsigned char)atoi(m_languageCode);
		if ( m_language == 0 )
			m_language = getLanguageFromAbbr(m_languageCode);
	}
	*/

	// a hack for buzz for backwards compatibility
	//if ( strstr ( m_q->m_orig,"gbkeyword:r36p1" ) )
	//	m_ruleset = 36;

	//
	// . turn this off for now
	// . it is used in setClusterLevels() to use clusterdb to filter our
	//   search results via Msg39, so it is not the most efficient.
	// . plus i am deleting most foreign language pages from the index
	//   so we can just focus on english and that will give us more english
	//   pages that we could normally get. we don't have resources to
	//   de-spam the other languages, etc.
	// . turn it back on, i took out the setClusterLevels() use of that
	//   because we got the langid in the posdb keys now
	//
	//m_language = 0;

	// convert m_defaultSortCountry to a number for m_countryHint
	m_countryHint = g_countryCode.getIndexOfAbbr(m_defaultSortCountry);


	return true;
}

Exemplo n.º 12

0

Exibir arquivo

Arquivo: XmlNode.cpp Projeto: Doken-Tokuyama/open-source-search-engine

// Return the value of the specified "field" within this node.
// the case of "field" does not matter.
char *XmlNode::getFieldValue ( char *field , int32_t *valueLen ) {
	// reset this to 0
	*valueLen = 0;
	// scan for the field name in our node
	int32_t flen = gbstrlen(field);
	char inQuotes = '\0';
	int32_t i;

	// scan the characters in the node, looking for the field name in ascii
	for ( i = 1; i + flen < m_nodeLen ; i++ ) {
		// skip the field if it's quoted
		if ( inQuotes) {
			if (m_node[i] == inQuotes ) inQuotes = 0;
			continue;
		}
		// set inQuotes to the quote if we're in quotes
		if ( (m_node[i]=='\"' || m_node[i]=='\'')){ 
			inQuotes = m_node[i];
			continue;
		} 
		// a field name must be preceeded by non-alnum
		if ( is_alnum_a ( m_node[i-1] ) ) continue;
		// the first character of this field shout match field[0]
		if ( to_lower_a (m_node[i]) != to_lower_a(field[0] )) continue;
		// field just be immediately followed by an = or space
		if (m_node[i+flen]!='='&&!is_wspace_a(m_node[i+flen]))continue;
		// field names must match
		if ( strncasecmp ( &m_node[i], field, flen ) != 0 ) continue;
		// break cuz we got a match for our field name
		break;
	}


	// return NULL if no matching field
	if ( i + flen >= m_nodeLen ) return NULL;

	// advance i over the fieldname so it pts to = or space
	i += flen;

	// advance i over spaces
	while ( i < m_nodeLen && is_wspace_a ( m_node[i] ) ) i++;

	// advance over the equal sign, return NULL if does not exist
	if ( i < m_nodeLen && m_node[i++] != '=' ) return NULL;

	// advance i over spaces after the equal sign
	while ( i < m_nodeLen && is_wspace_a ( m_node[i] ) ) i++;
	
	// now parse out the value of this field (could be in quotes)
	inQuotes = '\0';

	// set inQuotes to the quote if we're in quotes
	if ( m_node[i]=='\"' || m_node[i]=='\'') inQuotes = m_node[i++]; 

	// mark this as the start of the value
	int start=i;

	// advance i until we hit a space, or we hit a that quote if inQuotes
	if (inQuotes) {
		while (i<m_nodeLen && m_node[i] != inQuotes ) 
			i++;
	}
	else {
		while ( i<m_nodeLen &&
			!is_wspace_a(m_node[i])&&
			m_node[i]!='>')
			i++;
	}

	// set the length of the value
	*valueLen = i - start;

	// return a ptr to the value
	return m_node + start;
}

Exemplo n.º 13

0

Exibir arquivo

Arquivo: XmlNode.cpp Projeto: Doken-Tokuyama/open-source-search-engine

// . return the length of a node starting at "node"
int32_t getTagLen ( char *node ) { // , int32_t version ) {
	// see if it's not a node
	//if ( node[0] != '<' ) return 0;
	// skip over first <
	int32_t i ;
	// . keep looping until we hit a < or > OR while we're in quotes
	// . ignore < and > when they're in quotes
	for ( i = 1 ; node[i] ; i++ ) {
		// this switch should speed things up... no!
		if ( node[i] != '<'  &&
		     node[i] != '>'  &&
		     node[i] != '\"' &&
		     node[i] != '\''  )
			continue;
		// this is about 1.3 times faster than above (with -O2 on both)
		//if ( ! is_tag_control_char ( node[i] ) ) continue;
		if ( node[i] == '<' ) break;
		if ( node[i] == '>' ) {
			break;
			//if ( node[i-1]!='b') break;
			//if ( i -2 < 0      ) break;
			//if ( node[i-2]!='g') break;
			// we had a "gb>" which means that these 3 chars
			// we originally a &gt; html encoded entity which
			// we decoded for easier parsing
			//continue;
		}
		//if (version >= 70 && version < 77) continue;

		// we can have double quotes within single quotes
		if ( node [ i ] == '\"' ) {
			// scan back looking for equal sign...
			int32_t k; for ( k = i - 1 ; k > 1 ; k-- ) {
				if ( is_wspace_a(node[k]) ) continue;
				break;
			}
			if ( k <= 1 ) continue;
			// . if an equal sign did not immediately preceed
			//   this double quote then ignore the double quote
			// . this now fixes the harwoodmuseum.org issue
			//   talked about below
			if ( node[k] != '=' ) continue;
			// skip over this first quote
			i++;
			while ( node[i] && node[i]!='\"' ) {
				// crap some pages have unbalanced quotes.
				// see /test/doc.14541556377486183454.html
				if ( node[i  ]=='>' && 
				     node[i-1]=='\"' ) {
					i--;
					break;
				}
				// like an img tag hits a </a> for
				// http://www.harwoodmuseum.org/press_deta
				// il.php?ID=44
				// BUT this f***s up
				// onclick="tb_show('<b>Community Calendar</b>'
				// on the </b> which is legitamately in quotes
				//if ( node[i  ]=='<' && 
				//     node[i+1]=='/' ) {
				//	i--;
				//	break;
				//}
				if ( node[i  ]=='>' && 
				     node[i-1]==' ' &&
				     node[i-2]=='\"' ) {
					i--;
					break;
				}
				// skip this char
				i++;
			}
			// return the length if tag ended abuptly
			if ( ! node[i] ) return i;
			// back-to-back quotes? common mistake
			if ( node[i+1] == '\"' ) i++;
			continue;
		}
		// continue if we don't have a " '" or "='"
		if ( node [ i ] != '\'' ) continue;
		if ( node[i-1] != '=' && !is_wspace_a( node[i-1] ) ) continue;
		// skip to end of quote
		while ( node[i] && node[i]!='\'' ) i++;
	}
	// skip i over the >
	if ( node[i] == '>' ) i++; 
	// . else we found no closure outside of quotes so be more stringent
	// . look for closure with regard to quotes
	else for ( i=1; node[i] && node[i] != '>' && node[i] != '<';i++);
	// return the LENGTH of the whole node
	return i ;
}

Exemplo n.º 14

0

Exibir arquivo

Arquivo: Log.cpp Projeto: privacore/open-source-search-engine

bool Log::logR ( int64_t now, int32_t type, const char *msg, bool forced ) {
	if ( ! g_loggingEnabled ) {
		return true;
	}

	// return true if we should not log this
	if ( ! forced && ! shouldLog ( type , msg ) ) {
		return true;
	}

	// get "msg"'s length
	int32_t msgLen = strlen ( msg );

	ScopedLock sl(s_lock);

	// do a timestamp, too. use the time synced with host #0 because
	// it is easier to debug because all log timestamps are in sync.
	if ( now == 0 ) now = gettimeofdayInMillisecondsGlobalNoCore();

	// . skip all logging if power out, we do not want to screw things up
	// . allow logging for 10 seconds after power out though
	if ( ! g_process.m_powerIsOn && now - g_process.m_powerOffTime >10000){
		return false;
	}

	// chop off any spaces at the end of the msg.
	while ( is_wspace_a ( msg [ msgLen - 1 ] ) && msgLen > 0 ) msgLen--;

	// a tmp buffer
	char tt [ MAX_LINE_LEN ];
	char *p    = tt;


	if (m_logPrefix) {
		if ( m_logTimestamps ) {
			if( m_logReadableTimestamps ) {
				time_t now_t = (time_t)(now / 1000);
				struct tm tm_buf;
				struct tm *stm = localtime_r(&now_t,&tm_buf);

				p += sprintf ( p , "%04d%02d%02d-%02d%02d%02d-%03d %04" PRId32" ", stm->tm_year+1900,stm->tm_mon+1,stm->tm_mday,stm->tm_hour,stm->tm_min,stm->tm_sec,(int)(now%1000), g_hostdb.m_hostId );
			} else {
				if ( g_hostdb.getNumHosts() <= 999 )
					p += sprintf ( p , "%" PRIu64 " %03" PRId32 " ", (uint64_t)now , g_hostdb.m_hostId );
				else if ( g_hostdb.getNumHosts() <= 9999 )
					p += sprintf ( p , "%" PRIu64" %04" PRId32" ", (uint64_t)now , g_hostdb.m_hostId );
				else if ( g_hostdb.getNumHosts() <= 99999 )
					p += sprintf ( p , "%" PRIu64" %05" PRId32" ", (uint64_t)now , g_hostdb.m_hostId );
			}
		}

		// Get thread id. pthread_self instead?
		unsigned tid=(unsigned)syscall(SYS_gettid);
		p += sprintf(p, "%06u ", tid);

		// Log level
		p += sprintf(p, "%s ", getTypeString(type));
	}

	// then message itself
	const char *x = msg;
	int32_t avail = (MAX_LINE_LEN) - (p - tt) - 1;
	if ( msgLen > avail ) msgLen = avail;
	if ( *x == ':' ) x++;
	if ( *x == ' ' ) x++;
	strncpy ( p , x , avail );
	// capitalize for consistency. no, makes grepping log msgs harder.
	//if ( is_alpha_a(*p) ) *p = to_upper_a(*p);
	p += strlen(p);
	// back up over spaces
	while ( p[-1] == ' ' ) p--;
	// end in period or ? or !
	//if ( p[-1] != '?' && p[-1] != '.' && p[-1] != '!' )
	//	*p++ = '.';
	*p ='\0';
	// the total length, not including the \0
	int32_t tlen = p - tt;

	// . filter out nasty chars from the message
	// . replace with ~'s
	char cs;
	char *ttp    = tt;
	char *ttpend = tt + tlen;
	for ( ; ttp < ttpend ; ttp += cs ) {
		cs = getUtf8CharSize ( ttp );
		if ( is_binary_utf8 ( ttp ) ) {
			for ( int32_t k = 0 ; k < cs ; k++ ) *ttp++ = '.';
			// careful not to skip the already skipped bytes
			cs = 0;
			continue;
		}
	}

	// . if filesize would be too big then make a new log file
	// . should make a new m_fd
	if ( m_logFileSize + tlen+1 > MAXLOGFILESIZE && g_conf.m_logToFile )
		makeNewLogFile();

	if ( m_fd >= 0 ) {
		write ( m_fd , tt , tlen );
		write ( m_fd , "\n", 1 );
		m_logFileSize += tlen + 1;
	}
	else {
		// print it out for now
		fprintf ( stderr, "%s\n", tt );
	}

	return false;
}

Exemplo n.º 15

0

Exibir arquivo

Arquivo: Test.cpp Projeto: BKJackson/open-source-search-engine

// come here once per second i guess
void Test::initTestRun ( ) {

	g_errno = 0;

	// . all hosts should have their g_conf.m_repairMode parm set
	// . it is global now, not collection based, since we need to
	//   lock down titledb for the scan and there could be recs from
	//   the collection we are repairing in titledb's rdbtree, which,
	//   when dumped, would mess up our scan.
	if ( ! g_conf.m_testSpiderEnabled && ! g_conf.m_testParserEnabled ) {
		char *xx=NULL;*xx=0; }

	// if both enabled, core
	if ( g_conf.m_testSpiderEnabled && g_conf.m_testParserEnabled ) {
		char *xx=NULL;*xx=0; }

	// if the power went off
	if ( ! g_process.m_powerIsOn ) return;

	// return if currently running
	// no, admin can re-init even if running now
	//if ( m_isRunning ) { char *xx=NULL;*xx=0; }//return;

	// must be host #0 only
	if ( g_hostdb.m_myHost->m_hostId != 0 ) return;
	
	// if was initially in this mode, don't do anything
	//if ( m_testSpiderEnabledSaved ) return;
	//if ( m_testParserEnabledSaved ) return;

	// you must have the "test" coll already setup!
	CollectionRec *cr = g_collectiondb.getRec("test");
	if ( ! cr ) {
		// note it
		log("test: please add a collection named \"test\" first.");
		// stop the test
		g_conf.m_testParserEnabled = false;
		g_conf.m_testSpiderEnabled = false;
		// all done
		return;
	}

	char *testDir = getTestDir();

	// scan for file named "run.start.%li.txt" which is a dump of all
	// the conf and parms 
	char filename[100];
	File f;
	long i; for ( i = 0 ; i < 9999 ; i++ ) {
		// make filename. base it off working dir, g_hostdb.m_dir
		sprintf ( filename,"%s/%s/run.%li.collparms.txt",
			  g_hostdb.m_dir,testDir,i );
		// exist?
		f.set ( filename );
		// open files
		long status = f.doesExist();
		// error?
		if ( status == -1 ) {
			// note it in the log
			log("test: doesExist() returned -1");
			// end the test
			g_conf.m_testParserEnabled = false;
			g_conf.m_testSpiderEnabled = false;
			// all done
			return;
		}
		// try next i if this one in use
		if ( status ) continue;
		// got one
		break;
	}
	// close it
	f.close();

	// create the run.%li.version.txt file
	char cmd[1000];
	char vfile[200];
	sprintf(vfile,"%s/%s/run.%li.version.txt",g_hostdb.m_dir,testDir,i);
	sprintf(cmd,
		"%s/gb -v >& %s ; "
		"echo -n \"RUN START TIME: \" >> %s ; "
		"date >> %s",
		g_hostdb.m_dir,vfile,
		vfile,
		vfile);
	system(cmd);


	// save it
	m_runId = i;

	cr = g_collectiondb.getRec ( "test" );
	if ( ! cr ) {
		// and no more of this
		g_conf.m_testParserEnabled = false;
		g_conf.m_testSpiderEnabled = false;
		return;
	}
	// set these
	m_coll = cr->m_coll;

	// turn on spiders
	//cr->m_spideringEnabled = 1;

	// crap i guess this too!!!
	//g_conf.m_spideringEnabled = 1;


	//
	// log out the global parms
	//
	char fbuf[100]; 
	// print our global parms into a file called run.%li.start.txt
	sprintf(fbuf,"%s/%s/run.%li.confparms.txt",g_hostdb.m_dir,testDir,i);
	// this saves it as xml i think
	g_parms.saveToXml ( (char *)&g_conf , fbuf );

	//
	// log out the coll specific parms
	//
	// update name
	sprintf(fbuf,"%s/%s/run.%li.collparms.txt",g_hostdb.m_dir,testDir,i);
	// save that
	g_parms.saveToXml ( (char *)cr , fbuf );

	// get the list of urls to download and inject in order
	sprintf(fbuf,"%s/%s/urls.txt",g_hostdb.m_dir,testDir);
	// set it
	f.set ( fbuf ) ;
	// read it in
	long fsize = f.getFileSize();
	// add one for \0 termination
	long need = fsize + 1;
	// read it in
	char *buf = (char *)mmalloc ( need ,"qatest");
	// error?
	if ( ! buf ) {
		// note it
		log("test: failed to alloc %li bytes for url buf",fsize);
		// disable testing
		g_conf.m_testParserEnabled = false;
		g_conf.m_testSpiderEnabled = false;
		// all done
		return;
	}
	// open it
	f.open ( O_RDONLY );
	// read it in
	long rs = f.read ( buf , fsize , 0 ) ;
	// check it
	if ( rs != fsize ) {
		// note it
		log("test: failed to read %li bytes of urls.txt file",fsize);
		// disable testing
		g_conf.m_testParserEnabled = false;
		g_conf.m_testSpiderEnabled = false;
		// all done
		return;
	}
	// save it
	m_urlBuf = buf;
	// null term it just in case
	buf[need-1] = '\0';
	// end of it, including the terminating \0
	m_urlEnd = buf + need;
	// init url offset
	m_urlPtr = m_urlBuf;

	// reset just in case
	//m_spiderLinks = false;
	m_bypassMenuElimination = false;

	// first check for spiderlinks=1|true
	for ( char *p = m_urlBuf ; p < m_urlEnd ; p++ ) {
		//if ( p[0] != 's' ) continue;
		//if ( p[1] != 'p' ) continue;
		//if ( ! strncmp(p,"spiderlinks",11) ) 
		//	m_spiderLinks = true;
		//if ( ! strncmp(p,"bypassmenuelimination",21) ) 
		//	m_bypassMenuElimination = true;
	}

	// force max spiders to one because one page is often dependent
	// on the previous page!
	//if ( ! m_spiderLinks ) cr->m_maxNumSpiders = 1;
	// need to make it 6 since some priorities essentially lock the
	// ips up that have urls in higher priorities. i.e. once we dole 
	// a url out for ip X, then if later we add a high priority url for
	// IP X it can't get spidered until the one that is doled does.
	//else                   cr->m_maxNumSpiders = 6;

	// . first space out all comments
	// . comments are nice because we know why the url is in urls.txt
	for ( char *p = m_urlBuf ; p < m_urlEnd ; p++ ) {
		// skip if not start of a comment line
		if ( *p != '#' ) continue;
		// if not preceeded by a \n or start, skip
		if ( p > m_urlBuf && *(p-1) != '\n' ) continue;
		// ok, nuke it
		for ( ; *p && *p !='\n' ; p++ ) *p = ' ';
	}

	// if we hit "\nSTOP\n" then white out that and all past it
	for ( char *p = m_urlBuf ; p < m_urlEnd ; p++ ) {
		// skip if not start of a comment line
		if ( *p != '\n' ) continue;
		// check it
		if ( strncmp(p,"\nSTOP\n",6) ) continue;
		// white out
		for ( ; *p ; p++ ) {
			// until we HIT RESUME
			if ( *p == '\n' && ! strncmp(p,"\nRESUME\n",8) ) {
				p[1] = ' ';
				p[2] = ' ';
				p[3] = ' ';
				p[4] = ' ';
				p[5] = ' ';
				p[6] = ' ';
				break;
			}
			*p = ' ';
		}
		// all done
		//break;
	}

	// then NULL terminate all urls by converting all white space to \0s
	for ( char *p = m_urlBuf ; p < m_urlEnd ; p++ )
		// all non url chars to \0
		if ( is_wspace_a(*p) ) *p = '\0';
	

	// flag this
	m_isRunning = true;

	// and this
	m_isAdding = true;

	m_testStartTime = gettimeofdayInMilliseconds();

	// set up dedup table
	m_dt.set ( 8,0,0,NULL,0,false,MAX_NICENESS,"testdedup");

	// remove all old files for now to avoid system diffs
	log("test: beginning injection");

	// . now inject each url in order, one at a time using msg7 i guess
	// . returns true if all done
	if ( ! injectLoop() ) return;
	// close it up
	//stopIt();
}

Exemplo n.º 16

0

Exibir arquivo

Arquivo: HttpRequest.cpp Projeto: BKJackson/open-source-search-engine

 // . parse an incoming request
 // . return false and set g_errno on error
 // . CAUTION: we destroy "req" by replacing it's last char with a \0
 // . last char must be \n or \r for it to be a proper request anyway
 bool HttpRequest::set ( char *origReq , long origReqLen , TcpSocket *sock ) {
	 // reset number of cgi field terms
	 reset();

	 if ( ! m_reqBuf.reserve ( origReqLen + 1 ) ) {
		 log("http: failed to copy request: %s",mstrerror(g_errno));
		 return false;
	 }

	 // copy it to avoid mangling it
	 m_reqBuf.safeMemcpy ( origReq , origReqLen );
	 // NULL term
	 m_reqBuf.pushChar('\0');

	 m_reqBufValid = true;

	 // and point to that
	 char *req    = m_reqBuf.getBufStart();
	 long  reqLen = m_reqBuf.length() - 1;

	 // save this
	 m_userIP = 0; if ( sock ) m_userIP = sock->m_ip;
	 m_isSSL  = 0; if ( sock ) m_isSSL = (bool)sock->m_ssl;

	 // TcpServer should always give us a NULL terminated request
	 if ( req[reqLen] != '\0' ) { char *xx = NULL; *xx = 0; }
	 
	 // how long is the first line, the primary request
	 long i;
	 // for ( i = 0 ; i<reqLen && i<MAX_REQ_LEN && 
	 //	       req[i]!='\n' && req[i]!='\r'; i++);
	 // . now fill up m_buf, used to log the request
	 // . make sure the url was encoded correctly
	 // . we don't want assholes encoding every char so we can't see what
	 //   url they are submitting to be spidered/indexed
	 // . also, don't de-code encoded ' ' '+' '?' '=' '&' because that would
	 //   change the meaning of the url
	 // . and finally, non-ascii chars that don't display correctly
	 // . this should NULL terminate m_buf, too
	 // . turn this off for now, just try to log a different way
	 // m_bufLen = urlNormCode ( m_buf , MAX_REQ_LEN - 1 , req , i );
	 // ensure it's big enough to be a valid request
	 if ( reqLen < 5 ) { 
		 log("http: got reqlen<5 = %s",req);
		 g_errno = EBADREQUEST; 
		 return false; 
	 }
	 // or if first line too long
	 //if ( i >= 1024 )  { g_errno = EBADREQUEST; return false; }
	 // get the type, must be GET or HEAD
	 if      ( strncmp ( req , "GET "  , 4 ) == 0 ) m_requestType = 0;
	 // these means a compressed reply was requested. use by query
	 // compression proxies.
	 else if ( strncmp ( req , "ZET "  , 4 ) == 0 ) m_requestType = 0;
	 else if ( strncmp ( req , "HEAD " , 5 ) == 0 ) m_requestType = 1;
	 else if ( strncmp ( req , "POST " , 5 ) == 0 ) m_requestType = 2;
	 else { 
		 log("http: got bad request cmd: %s",req);
		 g_errno = EBADREQUEST; 
		 return false; 
	 }
	 // . NULL terminate the request (a destructive operation!)
	 // . this removes the last \n in the trailing \r\n 
	 // . shit, but it f***s up POST requests
	 if ( m_requestType != 2 ) { req [ reqLen - 1 ] = '\0'; reqLen--; }

	 // POST requests can be absolutely huge if you are injecting a 100MB
	 // file, so limit our strstrs to the end of the mime
	 char *d = NULL;
	 char  dc;
	 // check for body if it was a POST request
	 if ( m_requestType == 2 ) {
		 d = strstr ( req , "\r\n\r\n" );
		 if ( d ) { dc = *d; *d = '\0'; }
		 else log("http: Got POST request without \\r\\n\\r\\n.");
	 }

	 // . point to the file path 
	 // . skip over the "GET "
	 long filenameStart = 4 ;
	 // skip over extra char if it's a "HEAD " request
	 if ( m_requestType == 1 || m_requestType == 2 ) filenameStart++;

	 // are we a redirect?
	 i = filenameStart;
	 m_redirLen = 0;
	 if ( strncmp ( &req[i] , "/?redir=" , 8 ) == 0 ) {
		 for ( long k = i+8; k<reqLen && m_redirLen<126 ; k++) {
			 if ( req[k] == '\r' ) break;
			 if ( req[k] == '\n' ) break;
			 if ( req[k] == '\t' ) break;
			 if ( req[k] ==  ' ' ) break;
			 m_redir[m_redirLen++] = req[k];
		 }
	 }
	 m_redir[m_redirLen] = '\0';

	 // find a \n space \r or ? that delimits the filename
	 for ( i = filenameStart ; i < reqLen ; i++ ) {
		 if ( is_wspace_a ( req [ i ] ) ) break;
		 if ( req [ i ] == '?' ) break;
	 }

	 // now calc the filename length
	 m_filenameLen = i - filenameStart;
	 // return false and set g_errno if it's 0
	 if ( m_filenameLen <= 0  ) { 
		 log("http: got filenameLen<=0: %s",req);
		 g_errno = EBADREQUEST; 
		 return false; 
	 }
	 // . bitch if too big
	 // . leave room for strcatting "index.html" below
	 if ( m_filenameLen >= MAX_HTTP_FILENAME_LEN - 10 ) { 
		 log("http: got filenameLen>=max");
		 g_errno = EBADREQUEST; 
		 return false; 
	 }
	 // . decode the filename into m_filename and reassign it's length
	 // . decode %2F to / , etc...
	 m_filenameLen = urlDecode(m_filename,req+filenameStart,m_filenameLen);
	 // NULL terminate m_filename
	 m_filename [ m_filenameLen ] = '\0';
	 // does it have a file extension AFTER the last / in the filename?
	 bool hasExtension = false;
	 for ( long j = m_filenameLen-1 ; j >= 0 ; j-- ) {
		 if ( m_filename[j] == '.' ) { hasExtension = true; break; }
		 if ( m_filename[j] == '/' ) break;
	 }
	 // if it has no file extension append a /index.html
	 if ( ! hasExtension && m_filename [ m_filenameLen - 1 ] == '/' ) {
		 strcat ( m_filename , "index.html" );
		 m_filenameLen = gbstrlen ( m_filename );
	 }
	 // set file offset/size defaults
	 m_fileOffset = 0;
	 // -1 means ALL the file from m_fileOffset onwards
	 m_fileSize   = -1;  
	 // "e" points to where the range actually starts, if any
	 //char *e;
	 // . TODO: speed up by doing one strstr for Range: and maybe range:
	 // . do they have a Range: 0-100\n in the mime denoting a partial get?
	 //char *s = strstr ( req ,"Range:bytes=" );
	 //e = s + 12;
	 // try alternate formats
	 //if ( ! s ) { s = strstr ( req ,"Range: bytes=" ); e = s + 13; }
	 //if ( ! s ) { s = strstr ( req ,"Range: "       ); e = s +  7; }
	 // parse out the range if we got one
	 //if ( s ) {
	 //	long x = 0;
	 //	sscanf ( e ,"%li-%li" , &m_fileOffset , &x );
	 //	// get all file if range's 2nd number is non-existant
	 //	if ( x == 0 ) m_fileSize = -1;
	 //	else          m_fileSize = x - m_fileOffset;
	 //	// ensure legitimacy
	 //	if ( m_fileOffset < 0 ) m_fileOffset = 0;
	 //}
	 // reset our hostname
	 m_hostLen = 0;
	 // assume request is NOT from local network
	 //m_isAdmin = false;
	 m_isLocal = false;
	 // get the virtual hostname they want to use
	 char *s = strstr ( req ,"Host:" );
	 // try alternate formats
	 if ( ! s ) s = strstr ( req , "host:" ); 
	 // must be on its own line, otherwise it's not valid
	 if ( s && s > req && *(s-1) !='\n' ) s = NULL;
	 // parse out the host if we got one
	 if ( s ) {
		 // skip field name, host:
		 s += 5;
		 // skip e to beginning of the host name after "host:"
		 while ( *s==' ' || *s=='\t' ) s++;
		 // find end of the host name
		 char *end = s;
		 while ( *end && !is_wspace_a(*end) ) end++;
		 // . now *end should be \0, \n, \r, ' ', ...
		 // . get host len
		 m_hostLen = end - s;
		 // truncate if too big
		 if ( m_hostLen >= 255 ) m_hostLen = 254;
		 // copy into hostname
		 memcpy ( m_host , s , m_hostLen );
	 }
	 // NULL terminate it
	 m_host [ m_hostLen ] = '\0';

	 // get Referer: field
	 s = strstr ( req ,"Referer:" );
	 // find another
	 if ( ! s ) s = strstr ( req ,"referer:" );
	 // must be on its own line, otherwise it's not valid
	 if ( s && s > req && *(s-1) !='\n' ) s = NULL;
	 // assume no referer
	 m_refLen = 0;
	 // parse out the referer if we got one
	 if ( s ) {
		 // skip field name, referer:
		 s += 8;
		 // skip e to beginning of the host name after ':'
		 while ( *s==' ' || *s=='\t' ) s++;
		 // find end of the host name
		 char *end = s;
		 while ( *end && !is_wspace_a(*end) ) end++;
		 // . now *end should be \0, \n, \r, ' ', ...
		 // . get len
		 m_refLen = end - s;
		 // truncate if too big
		 if ( m_refLen >= 255 ) m_refLen = 254;
		 // copy into m_ref
		 memcpy ( m_ref , s , m_refLen );
	 }
	 // NULL terminate it
	 m_ref [ m_refLen ] = '\0';

	 // get User-Agent: field
	 s = strstr ( req ,"User-Agent:" );
	 // find another
	 if ( ! s ) s = strstr ( req ,"user-agent:" );
	 // must be on its own line, otherwise it's not valid
	 if ( s && s > req && *(s-1) !='\n' ) s = NULL;
	 // assume empty
	 long len = 0;
	 // parse out the referer if we got one
	 if ( s ) {
		 // skip field name, referer:
		 s += 11;
		 // skip e to beginning of the host name after ':'
		 while ( *s==' ' || *s=='\t' ) s++;
		 // find end of the agent name
		 char *end = s;
		 while ( *end && *end!='\n' && *end!='\r' ) end++;
		 // . now *end should be \0, \n, \r, ' ', ...
		 // . get agent len
		 len = end - s;
		 // truncate if too big
		 if ( len > 127 ) len = 127;
		 // copy into m_userAgent
		 memcpy ( m_userAgent , s , len );
	 }
	 // NULL terminate it
	 m_userAgent [ len ] = '\0';

	 m_isMSIE = false;
	 if ( strstr ( m_userAgent , "MSIE" ) )
		 m_isMSIE = true;

	 // get Cookie: field
	 s = strstr ( req, "Cookie:" );
	 // find another
	 if ( !s ) s = strstr ( req, "cookie:" );
	 // must be on its own line, otherwise it's not valid
	 if ( s && s > req && *(s-1) != '\n' ) s = NULL;
	 // assume empty
	 // m_cookieBufLen = 0;
	 m_cookiePtr = s;
	 // parse out the cookie if we got one
	 if ( s ) {
		 // skip field name, Cookie:
		 s += 7;
		 // skip s to beginning of cookie after ':'
		 while ( *s == ' ' || *s == '\t' ) s++;
		 // find end of the cookie
		 char *end = s;
		 while ( *end && *end != '\n' && *end != '\r' ) end++;
		 // save length
		 m_cookieLen = end - m_cookiePtr;
		 // get cookie len
		 //m_cookieBufLen = end - s;
		 // trunc if too big
		 //if (m_cookieBufLen > 1023) m_cookieBufLen = 1023;
		 // copy into m_cookieBuf
		 //memcpy(m_cookieBuf, s, m_cookieBufLen);
	 }
	 // NULL terminate it
	 if ( m_cookiePtr ) m_cookiePtr[m_cookieLen] = '\0';
	 //m_cookieBuf[m_cookieBufLen] = '\0';
	 // convert every '&' in cookie to a \0 for parsing the fields
	 // for ( long j = 0 ; j < m_cookieBufLen ; j++ ) 
	 //	 if ( m_cookieBuf[j] == '&' ) m_cookieBuf[j] = '\0';

	 // mark it as cgi if it has a ?
	 bool isCgi = ( req [ i ] == '?' ) ;
	 // reset m_filename length to exclude the ?* stuff
	 if ( isCgi ) {
		 // skip over the '?'
		 i++;
		 // find a space the delmits end of cgi
		 long j;
		 for ( j = i; j < reqLen; j++) if (is_wspace_a(req[j])) break;
		 // now add it
		 if ( ! addCgi ( &req[i] , j-i ) ) return false;
		 // update i
		 i = j;
	 }

	 // . set path ptrs
	 // . the whole /cgi/14.cgi?coll=xxx&..... thang
	 m_path = req + filenameStart;
	 m_plen = i - filenameStart;
	 // we're local if hostname is 192.168.[0|1].y
	 //if ( strncmp(iptoa(sock->m_ip),"192.168.1.",10) == 0) {
	 //	m_isAdmin = true; m_isLocal = true; }
	 //if ( strncmp(iptoa(sock->m_ip),"192.168.0.",10) == 0) {
	 //	m_isAdmin = true; m_isLocal = true; }
	 //if(strncmp(iptoa(sock->m_ip),"192.168.1.",10) == 0) m_isLocal = true;
	 //if(strncmp(iptoa(sock->m_ip),"192.168.0.",10) == 0) m_isLocal = true;
	 if ( sock && strncmp(iptoa(sock->m_ip),"192.168.",8) == 0) 
		 m_isLocal = true;
	 if ( sock && strncmp(iptoa(sock->m_ip),"10.",3) == 0) 
		 m_isLocal = true;
	 // steve cook's comcast at home:
	 // if ( sock && strncmp(iptoa(sock->m_ip),"68.35.100.143",13) == 0) 
	 // m_isLocal = true;
	 // procog's ip
	 // if ( sock && strncmp(iptoa(sock->m_ip),"216.168.36.21",13) == 0) 
	 //	 m_isLocal = true;

	 // roadrunner ip
	 // if ( sock && strncmp(iptoa(sock->m_ip),"66.162.42.131",13) == 0) 
	 //	 m_isLocal = true;

	 // cnsp ip
	 //if ( sock && strncmp(iptoa(sock->m_ip),"67.130.216.27",13) == 0) 
	 //	 m_isLocal = true;

	 // emily parker
	 //if ( sock && strncmp(iptoa(sock->m_ip),"69.92.68.202",12) == 0) 
	 //m_isLocal = true;
	 

	 // 127.0.0.1
	 if ( sock && sock->m_ip == 16777343 )
		 m_isLocal = true;
	 // steve cook's webserver
	 //if ( sock && strncmp(iptoa(sock->m_ip),"216.168.36.21",13) == 0) 
	 //	 m_isLocal = true;
	 // . also if we're coming from lenny at my house consider it local
	 // . this is a security risk, however... TODO: FIX!!!
	 //if ( sock->m_ip == atoip ("68.35.105.199" , 13 ) ) m_isAdmin = true;
	 // . TODO: now add any cgi data from a POST.....
	 // . look after the mime
	 //char *d = NULL;
	 // check for body if it was a POST request
	 //if ( m_requestType == 2 ) d = strstr ( req , "\r\n\r\n" );

	 // now put d's char back, just in case... does it really matter?
	 if ( d ) *d = dc;

	 // return true now if no cgi stuff to parse
	 if ( d ) {
		 char *post    = d + 4;
		 long  postLen = reqLen-(d+4-req) ;
		 // post sometimes has a \r or\n after it
		 while ( postLen > 0 && post[postLen-1]=='\r' ) postLen--;
		 // add it to m_cgiBuf, filter and everything
		 if ( ! addCgi ( post , postLen ) ) return false;
	 }
	 // sometimes i don't want to be admin
	 //if ( getLong ( "admin" , 1 ) == 0 ) m_isAdmin = false;
	 // success
	 
	 /////
	 // Handle Extra parms...

	 char *ep = g_conf.m_extraParms;
	 char *epend = g_conf.m_extraParms + g_conf.m_extraParmsLen;

	 char *qstr = m_cgiBuf;
	 long qlen = m_cgiBufLen;

	 while (ep < epend){
		 char buf[AUTOBAN_TEXT_SIZE];
		 long bufLen = 0;
		 // get next substring
		 while (*ep && ep < epend && *ep != ' ' && *ep != '\n'){
			 buf[bufLen++] = *ep++;
		 }
		 // skip whitespace
		 while (*ep && ep < epend && *ep == ' '){
			 ep++;
		 }
		 // null terminate 
		 buf[bufLen] = '\0';

		
		 // No match
		 if (!bufLen ||
		     !strnstr(qstr, qlen, buf)){
			 // skip to end of line
			 while (*ep && ep < epend && *ep != '\n') ep++;
			 // skip newline
			 while (*ep && ep < epend && *ep == '\n') ep++;
			 // try next substr
			 continue;
		 }
		 // found a match...
		 // get parm string
		 bufLen = 0;
		 while (*ep && ep < epend && *ep != '\n'){
			 buf[bufLen++] = *ep++;
		 }
		 buf[bufLen] = '\0';
		 
		 // skip newline
		 while (*ep && ep < epend && *ep == '\n') ep++;

		 logf(LOG_DEBUG, "query: appending \"%s\" to query", buf);
		
		 long newSize = m_cgiBuf2Size + bufLen+1;
		 char *newBuf = (char*)mmalloc(newSize, "extraParms");
		 if (!newBuf){
			 return log("query: unable to allocate %ld bytes "
				    "for extraParms", newSize);
		 }
		 char *p = newBuf;
		 if (m_cgiBuf2Size) {
			 memcpy(newBuf, m_cgiBuf2, m_cgiBuf2Size);
			 p += m_cgiBuf2Size-1;
			 mfree(m_cgiBuf2, m_cgiBuf2Size, "extraParms");
			 m_cgiBuf2 = NULL;
			 m_cgiBuf2Size = 0;
		 }
		 memcpy(p, buf, bufLen);
		 m_cgiBuf2 = newBuf;
		 m_cgiBuf2Size = newSize;
		 p += bufLen;
		 *p = '\0';
	 }

	 // Put '\0' back into the HttpRequest buffer...
	 if (m_cgiBuf){
		 // do not mangle the "ucontent"!
		 long cgiBufLen = m_cgiBufLen;
		 cgiBufLen -= m_ucontentLen;
		 char *buf = m_cgiBuf;
		 for (long i = 0; i < cgiBufLen ; i++) 
			 if (buf[i] == '&') buf[i] = '\0';
		 // don't decode the ucontent= field!
		 long decodeLen = m_cgiBufLen;
		 // so subtract that
		 if ( m_ucontent ) decodeLen -= m_ucontentLen;
		 // decode everything
		 long len = urlDecode ( m_cgiBuf , m_cgiBuf , decodeLen );
		 // we're parsing crap after the null if the last parm 
		 // has no value
		 //memset(m_cgiBuf+len, '\0', m_cgiBufLen-len);
		 m_cgiBufLen = len;
		 // ensure that is null i guess
		 if ( ! m_ucontent ) m_cgiBuf[len] = '\0';
	 }
	
	 if (m_cgiBuf2){
		 char *buf = m_cgiBuf2;
		 for (long i = 0; i < m_cgiBuf2Size-1 ; i++) 
			 if (buf[i] == '&') buf[i] = '\0';
		 long len = urlDecode ( m_cgiBuf2 , m_cgiBuf2 , m_cgiBuf2Size);
		 memset(m_cgiBuf2+len, '\0', m_cgiBuf2Size-len);
	 }
	 // . parse the fields after the ? in a cgi filename
	 // . or fields in the content if it's a POST
	 // . m_cgiBuf must be and is NULL terminated for this
	 parseFields ( m_cgiBuf , m_cgiBufLen );
	 // Add extra parms to the request.  
	 if (m_cgiBuf2Size){
		 parseFields(m_cgiBuf2, m_cgiBuf2Size);
	 }

	 // urldecode the cookie buf too!!
	 if ( m_cookiePtr ) {
		 char *p = m_cookiePtr;
		 for (long i = 0; i < m_cookieLen ; i++) {
			 //if (p[i] == '&') p[i] = '\0';
			 // cookies are separated with ';' in the request only
			 if (p[i] == ';') p[i] = '\0';
			 // a hack for the metacookie=....
			 // which uses &'s to separate its subcookies
			 // this is a hack for msie's limit of 50 cookies
			 if ( p[i] == '&' ) p[i] = '\0';
			 // set m_metaCookie to start of meta cookie
			 if ( p[i] == 'm' && p[i+1] == 'e' &&
			      strncmp(p,"metacookie",10) == 0 )
				 m_metaCookie = p;
		 }
		 long len = urlDecode ( m_cookiePtr , 
					m_cookiePtr,
					m_cookieLen );
		 // we're parsing crap after the null if the last parm 
		 // has no value
		 memset(m_cookiePtr+len, '\0', m_cookieLen-len);
		 m_cookieLen = len;
	 }

	 return true;
 }

Exemplo n.º 17

0

Exibir arquivo

Arquivo: Log.cpp Projeto: FlavioFalcao/open-source-search-engine

bool Log::logR ( long long now , long type , char *msg , bool asterisk ,
		 bool forced ) {

	// filter if we should
	//if ( forced ) goto skipfilter;

	// return true if we should not log this
	if ( ! forced && ! shouldLog ( type , msg ) ) return true;
	// skipfilter:
	// can we log if we're a sig handler? don't take changes
	if ( g_inSigHandler ) 
		return logLater ( now , type , msg , NULL );
	//if ( g_inSigHandler ) return false;
	// get "msg"'s length
	long msgLen = gbstrlen ( msg );

#ifdef PTHREADS
	// lock for threads
	pthread_mutex_lock ( &s_lock );
#endif

	// do a timestamp, too. use the time synced with host #0 because
	// it is easier to debug because all log timestamps are in sync.
	if ( now == 0 ) now = gettimeofdayInMillisecondsGlobalNoCore();

	// . skip all logging if power out, we do not want to screw things up
	// . allow logging for 10 seconds after power out though
	if ( ! g_process.m_powerIsOn && now - g_process.m_powerOffTime >10000){
#ifdef PTHREADS
		pthread_mutex_unlock ( &s_lock );
#endif
		return false;
	}

	//if ( now == 0 ) now  = g_nowApprox;
	// chop off any spaces at the end of the msg.
	while ( is_wspace_a ( msg [ msgLen - 1 ] ) && msgLen > 0 ) msgLen--;
	// get this pid
	pid_t pid = getpidtid();
	// a tmp buffer
	char tt [ MAX_LINE_LEN ];
	char *p    = tt;
	char *pend = tt + MAX_LINE_LEN;
	/*
	// print timestamp, hostid, type
	if ( g_hostdb.m_numHosts <= 999 ) 
		sprintf ( p , "%llu %03li %s ",
			  now , g_hostdb.m_hostId , getTypeString(type) );
	else if ( g_hostdb.m_numHosts <= 9999 ) 
		sprintf ( p , "%llu %04li %s ",
			  now , g_hostdb.m_hostId , getTypeString(type) );
	else if ( g_hostdb.m_numHosts <= 99999 ) 
		sprintf ( p , "%llu %05li %s ",
			  now , g_hostdb.m_hostId , getTypeString(type) );
	*/


	// print timestamp, hostid, type

	if ( m_logTimestamps ) {
		if ( g_hostdb.m_numHosts <= 999 ) 
			sprintf ( p , "%llu %03li ",
				  now , g_hostdb.m_hostId );
		else if ( g_hostdb.m_numHosts <= 9999 ) 
			sprintf ( p , "%llu %04li ",
				  now , g_hostdb.m_hostId );
		else if ( g_hostdb.m_numHosts <= 99999 ) 
			sprintf ( p , "%llu %05li ",
				  now , g_hostdb.m_hostId );
		p += gbstrlen ( p );
	}

	// msg resource
	char *x = msg;
	long cc = 7;
	// the first 7 bytes or up to the : must be ascii
	//while ( p < pend && *x && is_alnum_a(*x) ) { *p++ = *x++; cc--; }
	// space pad
	//while ( cc-- > 0 ) *p++ = ' ';
	// ignore the label for now...
	while ( p < pend && *x && is_alnum_a(*x) ) { x++; cc--; }
	// thread id if in "thread"
	if ( pid != s_pid && s_pid != -1 ) {
		//sprintf ( p , "[%li] " , (long)getpid() );
		sprintf ( p , "[%lu] " , (unsigned long)pid );
		p += gbstrlen ( p );
	}
	// then message itself
	long avail = (MAX_LINE_LEN) - (p - tt) - 1;
	if ( msgLen > avail ) msgLen = avail;
	if ( *x == ':' ) x++;
	if ( *x == ' ' ) x++;
	strncpy ( p , x , avail );
	// capitalize for consistency. no, makes grepping log msgs harder.
	//if ( is_alpha_a(*p) ) *p = to_upper_a(*p);
	p += gbstrlen(p);
	// back up over spaces
	while ( p[-1] == ' ' ) p--;
	// end in period or ? or !
	//if ( p[-1] != '?' && p[-1] != '.' && p[-1] != '!' )
	//	*p++ = '.';
	*p ='\0';
	// the total length, not including the \0
	long tlen = p - tt;

	// call sprintf, but first make sure we have room in m_buf and in
	// the arrays. who know how much room the sprintf is going to need???
	// NOTE: TODO: this is shaky -- fix it!
	if ( m_bufPtr + tlen  >= 1024 * 32 ||  m_numErrors  >= MAX_LOG_MSGS){
		// this sets m_bufPtr to 0
		if ( ! dumpLog ( ) ) {
			fprintf(stderr,"Log::log: could not dump to file!\n");
#ifdef PTHREADS
			pthread_mutex_unlock ( &s_lock );
#endif
			return false;
		}
	}
	// . filter out nasty chars from the message
	// . replace with ~'s
	char cs;
	char *ttp    = tt;
	char *ttpend = tt + tlen;
	for ( ; ttp < ttpend ; ttp += cs ) {
		cs = getUtf8CharSize ( ttp );
		if ( is_binary_utf8 ( ttp ) ) {
			for ( long k = 0 ; k < cs ; k++ ) *ttp++ = '.';
			// careful not to skip the already skipped bytes
			cs = 0;
			continue;
		}
		// convert \n's and \r's to spaces
		if ( *ttp == '\n' ) *ttp = ' ';
		if ( *ttp == '\r' ) *ttp = ' ';
		if ( *ttp == '\t' ) *ttp = ' ';
	}

	if ( m_fd >= 0 ) {
		write ( m_fd , tt , tlen );
		write ( m_fd , "\n", 1 );
	}
	else {
		// print it out for now
		fprintf ( stderr, "%s\n", tt );
	}

	// set the stuff in the array
	m_errorMsg      [m_numErrors] = msg;
	m_errorMsgLen   [m_numErrors] = msgLen;
	m_errorTime     [m_numErrors] = now;
	m_errorType     [m_numErrors] = type;
	// increase the # of errors
	m_numErrors++;

#ifdef PTHREADS
	// unlock for threads
	pthread_mutex_unlock ( &s_lock );
#endif
	return false;
}

Exemplo n.º 18

0

Exibir arquivo

Arquivo: HttpRequest.cpp Projeto: privacore/open-source-search-engine

// . parse an incoming request
// . return false and set g_errno on error
// . CAUTION: we destroy "req" by replacing it's last char with a \0
// . last char must be \n or \r for it to be a proper request anyway
bool HttpRequest::set ( char *origReq , int32_t origReqLen , TcpSocket *sock ) {
	// reset number of cgi field terms
	reset();

	 if ( ! m_reqBuf.reserve ( origReqLen + 1 ) ) {
		 log("http: failed to copy request: %s",mstrerror(g_errno));
		 return false;
	 }

	 // copy it to avoid mangling it
	 m_reqBuf.safeMemcpy ( origReq , origReqLen );
	 // NULL term
	 m_reqBuf.pushChar('\0');

	 m_reqBufValid = true;

	 // and point to that
	 char *req    = m_reqBuf.getBufStart();

	 if( !req ) { 
		 log(LOG_ERROR, "http: req is NULL");
		 g_errno = EBADREQUEST; 
		 return false; 
	 }


	 int32_t  reqLen = m_reqBuf.length() - 1;

	 // save this
	 m_userIP = sock ? sock->m_ip : 0;
	 m_isSSL  = sock ? (sock->m_ssl!=NULL) : false;

	 // TcpServer should always give us a NULL terminated request
	 if ( req[reqLen] != '\0' ) { g_process.shutdownAbort(true); }
	 
	 // how long is the first line, the primary request
	 // int32_t i;
	 // for ( i = 0 ; i<reqLen && i<MAX_REQ_LEN && 
	 //	       req[i]!='\n' && req[i]!='\r'; i++);
	 // . now fill up m_buf, used to log the request
	 // . make sure the url was encoded correctly
	 // . we don't want assholes encoding every char so we can't see what
	 //   url they are submitting to be spidered/indexed
	 // . also, don't de-code encoded ' ' '+' '?' '=' '&' because that would
	 //   change the meaning of the url
	 // . and finally, non-ascii chars that don't display correctly
	 // . this should NULL terminate m_buf, too
	 // . turn this off for now, just try to log a different way
	 // m_bufLen = urlNormCode ( m_buf , MAX_REQ_LEN - 1 , req , i );
	 // ensure it's big enough to be a valid request
	 if ( reqLen < 5 ) { 
		 log(LOG_WARN, "http: got reqlen %" PRId32"<5 = %s",reqLen,req);
		 g_errno = EBADREQUEST; 
		 return false; 
	 }

	 int32_t cmdLen = 0;

	 // or if first line too long
	 //if ( i >= 1024 )  { g_errno = EBADREQUEST; return false; }
	 // get the type, must be GET or HEAD
	 if      ( strncmp ( req , "GET "  , 4 ) == 0 ) {
		 m_requestType = RT_GET;
		 cmdLen = 3;
	 }
	 // these means a compressed reply was requested. use by query
	 // compression proxies.
	 else if ( strncmp ( req , "ZET "  , 4 ) == 0 ) {
		 m_requestType = RT_GET;
		 cmdLen = 3;
	 }
	 else if ( strncmp ( req , "HEAD " , 5 ) == 0 ) {
		 m_requestType = RT_HEAD;
		 cmdLen = 4;
	 }
	 else if ( strncmp ( req , "POST " , 5 ) == 0 )  {
		 m_requestType = RT_POST;
		 cmdLen = 4;
	 }
	 else if ( strncmp ( req , "CONNECT " , 8 ) == 0 ) {
		 // take this out until it stops losing descriptors and works
		 //m_requestType = RT_CONNECT;
		 //cmdLen = 7;
		 // we no longer insert section info. emmanuel gets section
		 // info when injecting a doc now i think in PageInject.cpp.
		 // we do not proxy https requests because we can't
		 // decrypt the page contents to cache them or to insert
		 // the sectiondb voting markup, so it's kinda pointless...
		 // and i'm not aiming to be a full-fledge squid proxy.
		 log("http: CONNECT request not supported because we "
		   "can't insert section markup and we can't cache: %s",req);
		 g_errno = EBADREQUEST; 
		 return false; 
	 }
	 else { 
		 log("http: got bad request cmd: %s",req);
		 g_errno = EBADREQUEST; 
		 return false; 
	 }
	 // . NULL terminate the request (a destructive operation!)
	 // . this removes the last \n in the trailing \r\n 
	 // . shit, but it f***s up POST requests
	 if ( m_requestType != RT_POST ) { 
		 req [ reqLen - 1 ] = '\0'; 
		 reqLen--; 
	 }

	 // POST requests can be absolutely huge if you are injecting a 100MB
	 // file, so limit our strstrs to the end of the mime
	 char *d = NULL;
	 char  dc;
	 // check for body if it was a POST request
	 if ( m_requestType == RT_POST ) {
		 d = strstr ( req , "\r\n\r\n" );
		 if ( d ) { dc = *d; *d = '\0'; }
		 else log("http: Got POST request without \\r\\n\\r\\n.");
	 }

	 // is it a proxy request?
	 m_isSquidProxyRequest = false;
	 if ( strncmp ( req + cmdLen + 1, "http://" ,7) == 0 ||
	      strncmp ( req + cmdLen + 1, "https://",8) == 0 ) {
		 m_isSquidProxyRequest = true;
		 // set url parms for it
		 m_squidProxiedUrl = req + cmdLen + 1;
		 char *p = m_squidProxiedUrl + 7;
		 if ( *p == '/' ) p++; // https:// ?
		 // stop at whitespace or \0
		 for ( ; *p && ! is_wspace_a(*p) ; p++ );
		 // that's the length of it
		 m_squidProxiedUrlLen = p - m_squidProxiedUrl;
	 }
	 else if ( m_requestType == RT_CONNECT ) {
		 m_isSquidProxyRequest = true;
		 // set url parms for it
		 m_squidProxiedUrl = req + cmdLen + 1;
		 // usually its like CONNECT diffbot.com:443
		 char *p = m_squidProxiedUrl;
		 // stop at whitespace or \0
		 for ( ; *p && ! is_wspace_a(*p) ; p++ );
		 // that's the length of it
		 m_squidProxiedUrlLen = p - m_squidProxiedUrl;
	 }

	 // check authentication
	 char *auth = NULL;
	 if ( m_isSquidProxyRequest && req )
		 auth = strstr(req,"Proxy-authorization: Basic ");

	 //if ( m_isSquidProxyRequest && ! auth ) {
	 //	 log("http: no auth in proxy request %s",req);
	 //	 g_errno = EBADREQUEST; 
	 //	 return false; 
	 //}

	 SafeBuf tmp;
	 if ( auth ) {
		 // find end of it
		 char *p = auth;
		 for ( ; *p && *p != '\r' && *p != '\n' ; p++ );
		 tmp.base64Decode ( auth , p - auth );
	 }

	 // assume incorrect username/password
	 bool matched = false;
	 if ( m_isSquidProxyRequest ) {
		 // now try to match in g_conf.m_proxyAuth safebuf of
		 // username:password space-separated list
		 char *p = g_conf.m_proxyAuth.getBufStart();
		 // loop over those
		 for ( ; p && *p ; ) {
			 // skip initial white space
			 for ( ; *p && is_wspace_a(*p); p++ );
			 // skip to end of username:password thing
			 char *end = p;
			 for ( ; *end && !is_wspace_a(*end); end++);
			 // save
			 char *start = p;
			 // advance
			 p = end;
			 // this is always a match
			 if ( end-start == 3 && strncmp(start,"*:*",3) == 0 ) {
				 matched = true;
				 break;
			 }
			 // compare now
			 if ( tmp.length() != end-start ) 
				 continue;
			 if ( strncmp(tmp.getBufStart(),start,end-start) != 0 )
				 continue;
			 // we got a match
			 matched = true;
			 break;
		 }
	 }

	 // incorrect username:passwrod?
	 if ( m_isSquidProxyRequest && ! matched ) {
		 log("http: bad username:password in proxy request %s",req);
		 g_errno = EPERMDENIED;
		 return false; 
	 }

	 // if proxy request to download a url through us, we are done
	 if ( m_isSquidProxyRequest ) return true;

	 bool multipart = false;
	 if ( m_requestType == 2 ) { // is POST?
		 char *cd ;
		 cd = gb_strcasestr(req,"Content-Type: multipart/form-data");
		 if ( cd ) multipart = true;
	 }

	 // . point to the file path 
	 // . skip over the "GET "
	 int32_t filenameStart = 4 ;
	 // skip over extra char if it's a "HEAD " request
	 if ( m_requestType == RT_HEAD || m_requestType == RT_POST ) 
		 filenameStart++;

	 // are we a redirect?
	 int32_t i = filenameStart;
	 m_redirLen = 0;
	 if ( strncmp ( &req[i] , "/?redir=" , 8 ) == 0 ) {
		 for ( int32_t k = i+8; k<reqLen && m_redirLen<126 ; k++) {
			 if ( req[k] == '\r' ) break;
			 if ( req[k] == '\n' ) break;
			 if ( req[k] == '\t' ) break;
			 if ( req[k] ==  ' ' ) break;
			 m_redir[m_redirLen++] = req[k];
		 }
	 }
	 m_redir[m_redirLen] = '\0';

	 // find a \n space \r or ? that delimits the filename
	 for ( i = filenameStart ; i < reqLen ; i++ ) {
		 if ( is_wspace_a ( req [ i ] ) ) break;
		 if ( req [ i ] == '?' ) break;
	 }

	 // now calc the filename length
	 m_filenameLen = i - filenameStart;
	 // return false and set g_errno if it's 0
	 if ( m_filenameLen <= 0  ) { 
		 log("http: got filenameLen<=0: %s",req);
		 g_errno = EBADREQUEST; 
		 return false; 
	 }
	 // . bitch if too big
	 // . leave room for strcatting "index.html" below
	 if ( m_filenameLen >= MAX_HTTP_FILENAME_LEN - 10 ) { 
		 log("http: got filenameLen>=max");
		 g_errno = EBADREQUEST; 
		 return false; 
	 }
	 // . decode the filename into m_filename and reassign it's length
	 // . decode %2F to / , etc...
	 m_filenameLen = urlDecode(m_filename,req+filenameStart,m_filenameLen);
	 // NULL terminate m_filename
	 m_filename [ m_filenameLen ] = '\0';
	 // does it have a file extension AFTER the last / in the filename?
	 bool hasExtension = false;
	 for ( int32_t j = m_filenameLen-1 ; j >= 0 ; j-- ) {
		 if ( m_filename[j] == '.' ) { hasExtension = true; break; }
		 if ( m_filename[j] == '/' ) break;
	 }
	 // if it has no file extension append a /index.html
	 if ( ! hasExtension && m_filename [ m_filenameLen - 1 ] == '/' ) {
		 strcat ( m_filename , "index.html" );
		 m_filenameLen = strlen ( m_filename );
	 }


	 // . uses the TcpSocket::m_readBuf
	 // . if *p was ? then keep going
	 m_origUrlRequest = origReq + filenameStart;
	 char *p = origReq + m_filenameLen;
	 for ( ; *p && ! is_wspace_a(*p) ; p++ );
	 m_origUrlRequestLen = p - m_origUrlRequest;

	 // set file offset/size defaults
	 m_fileOffset = 0;
	 // -1 means ALL the file from m_fileOffset onwards
	 m_fileSize   = -1;  
	 // "e" points to where the range actually starts, if any
	 //char *e;
	 // . TODO: speed up by doing one strstr for Range: and maybe range:
	 // . do they have a Range: 0-100\n in the mime denoting a partial get?
	 //char *s = strstr ( req ,"Range:bytes=" );
	 //e = s + 12;
	 // try alternate formats
	 //if ( ! s ) { s = strstr ( req ,"Range: bytes=" ); e = s + 13; }
	 //if ( ! s ) { s = strstr ( req ,"Range: "       ); e = s +  7; }
	 // parse out the range if we got one
	 //if ( s ) {
	 //	int32_t x = 0;
	 //	sscanf ( e ,"%" PRId32"-%" PRId32 , &m_fileOffset , &x );
	 //	// get all file if range's 2nd number is non-existant
	 //	if ( x == 0 ) m_fileSize = -1;
	 //	else          m_fileSize = x - m_fileOffset;
	 //	// ensure legitimacy
	 //	if ( m_fileOffset < 0 ) m_fileOffset = 0;
	 //}
	 // reset our hostname
	 m_hostLen = 0;
	 // assume request is NOT from local network
	 //m_isMasterAdmin = false;
	 m_isLocal = false;
	 // get the virtual hostname they want to use
	 char *s = strstr ( req ,"Host:" );
	 // try alternate formats
	 if ( ! s ) s = strstr ( req , "host:" ); 
	 // must be on its own line, otherwise it's not valid
	 if ( s && s > req && *(s-1) !='\n' ) s = NULL;
	 // parse out the host if we got one
	 if ( s ) {
		 // skip field name, host:
		 s += 5;
		 // skip e to beginning of the host name after "host:"
		 while ( *s==' ' || *s=='\t' ) s++;
		 // find end of the host name
		 char *end = s;
		 while ( *end && !is_wspace_a(*end) ) end++;
		 // . now *end should be \0, \n, \r, ' ', ...
		 // . get host len
		 m_hostLen = end - s;
		 // truncate if too big
		 if ( m_hostLen >= 255 ) m_hostLen = 254;
		 // copy into hostname
		 gbmemcpy ( m_host , s , m_hostLen );
	 }
	 // NULL terminate it
	 m_host [ m_hostLen ] = '\0';

	 // get Referer: field
	 s = strstr ( req ,"Referer:" );
	 // find another
	 if ( ! s ) s = strstr ( req ,"referer:" );
	 // must be on its own line, otherwise it's not valid
	 if ( s && s > req && *(s-1) !='\n' ) s = NULL;
	 // assume no referer
	 m_refLen = 0;
	 // parse out the referer if we got one
	 if ( s ) {
		 // skip field name, referer:
		 s += 8;
		 // skip e to beginning of the host name after ':'
		 while ( *s==' ' || *s=='\t' ) s++;
		 // find end of the host name
		 char *end = s;
		 while ( *end && !is_wspace_a(*end) ) end++;
		 // . now *end should be \0, \n, \r, ' ', ...
		 // . get len
		 m_refLen = end - s;
		 // truncate if too big
		 if ( m_refLen >= 255 ) m_refLen = 254;
		 // copy into m_ref
		 gbmemcpy ( m_ref , s , m_refLen );
	 }
	 // NULL terminate it
	 m_ref [ m_refLen ] = '\0';

	 // get User-Agent: field
	 s = strstr ( req ,"User-Agent:" );
	 // find another
	 if ( ! s ) s = strstr ( req ,"user-agent:" );
	 // must be on its own line, otherwise it's not valid
	 if ( s && s > req && *(s-1) !='\n' ) s = NULL;
	 // assume empty
	 int32_t len = 0;
	 // parse out the referer if we got one
	 if ( s ) {
		 // skip field name, referer:
		 s += 11;
		 // skip e to beginning of the host name after ':'
		 while ( *s==' ' || *s=='\t' ) s++;
		 // find end of the agent name
		 char *end = s;
		 while ( *end && *end!='\n' && *end!='\r' ) end++;
		 // . now *end should be \0, \n, \r, ' ', ...
		 // . get agent len
		 len = end - s;
		 // truncate if too big
		 if ( len > 127 ) len = 127;
		 // copy into m_userAgent
		 gbmemcpy ( m_userAgent , s , len );
	 }
	 // NULL terminate it
	 m_userAgent [ len ] = '\0';

	 // get Cookie: field
	 s = strstr ( req, "Cookie:" );
	 // find another
	 if ( !s ) s = strstr ( req, "cookie:" );
	 // must be on its own line, otherwise it's not valid
	 if ( s && s > req && *(s-1) != '\n' ) s = NULL;
	 // assume empty
	 // m_cookieBufLen = 0;
	 m_cookiePtr = s;
	 // parse out the cookie if we got one
	 if ( s ) {
		 // skip field name, Cookie:
		 s += 7;
		 // skip s to beginning of cookie after ':'
		 while ( *s == ' ' || *s == '\t' ) s++;
		 // find end of the cookie
		 char *end = s;
		 while ( *end && *end != '\n' && *end != '\r' ) end++;
		 // save length
		 m_cookieLen = end - m_cookiePtr;
		 // get cookie len
		 //m_cookieBufLen = end - s;
		 // trunc if too big
		 //if (m_cookieBufLen > 1023) m_cookieBufLen = 1023;
		 // copy into m_cookieBuf
		 //gbmemcpy(m_cookieBuf, s, m_cookieBufLen);
	 }
	 // NULL terminate it
	 if ( m_cookiePtr ) m_cookiePtr[m_cookieLen] = '\0';
	 //m_cookieBuf[m_cookieBufLen] = '\0';
	 // convert every '&' in cookie to a \0 for parsing the fields
	 // for ( int32_t j = 0 ; j < m_cookieBufLen ; j++ ) 
	 //	 if ( m_cookieBuf[j] == '&' ) m_cookieBuf[j] = '\0';

	 // mark it as cgi if it has a ?
	 bool isCgi = ( req [ i ] == '?' ) ;
	 // reset m_filename length to exclude the ?* stuff
	 if ( isCgi ) {
		 // skip over the '?'
		 i++;
		 // find a space the delmits end of cgi
		 int32_t j;
		 for ( j = i; j < reqLen; j++) if (is_wspace_a(req[j])) break;
		 // now add it
		 if ( ! addCgi ( &req[i] , j-i ) ) return false;
		 // update i
		 i = j;
	 }

	 // . set path ptrs
	 // . the whole /cgi/14.cgi?coll=xxx&..... thang
	 m_path = req + filenameStart;
	 m_plen = i - filenameStart;
	 // we're local if hostname is 192.168.[0|1].y
	 //if ( strncmp(iptoa(sock->m_ip),"192.168.1.",10) == 0) {
	 //	m_isMasterAdmin = true; m_isLocal = true; }
	 //if ( strncmp(iptoa(sock->m_ip),"192.168.0.",10) == 0) {
	 //	m_isMasterAdmin = true; m_isLocal = true; }
	 //if(strncmp(iptoa(sock->m_ip),"192.168.1.",10) == 0) m_isLocal = true;
	 //if(strncmp(iptoa(sock->m_ip),"192.168.0.",10) == 0) m_isLocal = true;
	 if ( sock && strncmp(iptoa(sock->m_ip),"192.168.",8) == 0) 
		 m_isLocal = true;
	 if ( sock && strncmp(iptoa(sock->m_ip),"10.",3) == 0) 
		 m_isLocal = true;

	 // gotta scan all ips in hosts.conf as well...
	 // if we are coming from any of our own hosts.conf c blocks
	 // consider ourselves local
	 uint32_t last = 0;
	 for ( int32_t i = 0 ; i < g_hostdb.getNumHosts() ; i++ ) {
		 Host *h = g_hostdb.getHost(i);
		 // save time with this check
		 if ( h->m_ip == last ) continue;
		 // update it
		 last = h->m_ip;
		 // returns number of top bytes in comon
		 int32_t nt = sock ? ipCmp ( sock->m_ip , h->m_ip ) : 0;
		 // at least be in the same c-block as a host in hosts.conf
		 if ( nt < 3 ) continue;
		 m_isLocal = true;
		 break;
	 }
		 

	 // connectips/adminips
	 // for ( int32_t i = 0 ; i < g_conf.m_numConnectIps ; i++ ) {
	 // 	 if ( sock->m_ip != g_conf.m_connectIps[i] ) continue;
	 // 	 m_isLocal = true;
	 // 	 break;
	 // }

	 // 127.0.0.1
	 if ( sock && sock->m_ip == 16777343 )
		 m_isLocal = true;

	 // . TODO: now add any cgi data from a POST.....
	 // . look after the mime
	 //char *d = NULL;
	 // check for body if it was a POST request
	 //if ( m_requestType == RT_POST ) d = strstr ( req , "\r\n\r\n" );

	 // return true now if no cgi stuff to parse
	 if ( d ) {
	 	// now put d's char back, just in case... does it really matter?
		*d = dc;

		 char *post    = d + 4;
		 int32_t  postLen = reqLen-(d+4-req) ;
		 // post sometimes has a \r or\n after it
		 while ( postLen > 0 && post[postLen-1]=='\r' ) postLen--;
		 // add it to m_cgiBuf, filter and everything
		 if ( ! addCgi ( post , postLen ) ) return false;
	 }

	 // Put '\0' back into the HttpRequest buffer...
	 // crap, not if we are multi-part unencoded stuff...
	 if ( m_cgiBuf && ! multipart ) {
		 // do not mangle the "ucontent"!
		 int32_t cgiBufLen = m_cgiBufLen;
		 cgiBufLen -= m_ucontentLen;
		 char *buf = m_cgiBuf;
		 for (int32_t i = 0; i < cgiBufLen ; i++) 
			 if (buf[i] == '&') buf[i] = '\0';
		 // don't decode the ucontent= field!
		 int32_t decodeLen = m_cgiBufLen;
		 // so subtract that
		 if ( m_ucontent ) decodeLen -= m_ucontentLen;
		 // decode everything. fixed for %00 in &content= so it
		 // doesn't set our parms when injecting.
		 int32_t len = urlDecodeNoZeroes(m_cgiBuf,m_cgiBuf,decodeLen);
		 // we're parsing crap after the null if the last parm 
		 // has no value
		 //memset(m_cgiBuf+len, '\0', m_cgiBufLen-len);
		 m_cgiBufLen = len;
		 // ensure that is null i guess
		 if ( ! m_ucontent ) m_cgiBuf[len] = '\0';
	 }
	
	 if (m_cgiBuf2){
		 char *buf = m_cgiBuf2;
		 for (int32_t i = 0; i < m_cgiBuf2Size-1 ; i++) 
			 if (buf[i] == '&') buf[i] = '\0';
		 // decode everything. fixed for %00 in &content= so it
		 // doesn't set our parms when injecting.
		 int32_t len = urlDecodeNoZeroes ( m_cgiBuf2 , 
						   m_cgiBuf2 , 
						   m_cgiBuf2Size);
		 memset(m_cgiBuf2+len, '\0', m_cgiBuf2Size-len);
	 }
	 // . parse the fields after the ? in a cgi filename
	 // . or fields in the content if it's a POST
	 // . m_cgiBuf must be and is NULL terminated for this
	 parseFields ( m_cgiBuf , m_cgiBufLen );
	 // Add extra parms to the request.  
	 if (m_cgiBuf2Size){
		 parseFields(m_cgiBuf2, m_cgiBuf2Size);
	 }

	 // urldecode the cookie buf too!!
	 if ( m_cookiePtr ) {
		 char *p = m_cookiePtr;
		 for (int32_t i = 0; i < m_cookieLen ; i++) {
			 //if (p[i] == '&') p[i] = '\0';
			 // cookies are separated with ';' in the request only
			 if (p[i] == ';') p[i] = '\0';
			 // a hack for the metacookie=....
			 // which uses &'s to separate its subcookies
			 // this is a hack for msie's limit of 50 cookies
			 if ( p[i] == '&' ) p[i] = '\0';
			 // set m_metaCookie to start of meta cookie
			 if ( p[i] == 'm' && p[i+1] == 'e' &&
			      strncmp(p,"metacookie",10) == 0 )
				 m_metaCookie = p;
		 }
		 int32_t len = urlDecode ( m_cookiePtr , 
					m_cookiePtr,
					m_cookieLen );
		 // we're parsing crap after the null if the last parm 
		 // has no value
		 memset(m_cookiePtr+len, '\0', m_cookieLen-len);
		 m_cookieLen = len;
	 }

	 return true;
 }

Exemplo n.º 19

0

Exibir arquivo

Arquivo: HttpMime.cpp Projeto: BKJackson/open-source-search-engine

// returns false on bad mime
bool HttpMime::parse ( char *mime , long mimeLen , Url *url ) {
	// reset locUrl to 0
	m_locUrl.reset();
	// return if we have no valid complete mime
	if ( mimeLen == 0 ) return false;
	// status is on first line
	m_status = -1;
	// skip HTTP/x.x till we hit a space
	char *p = mime;
	char *pend = mime + mimeLen;
	while ( p < pend && !is_wspace_a(*p) ) p++;
	// then skip over spaces
	while ( p < pend &&  is_wspace_a(*p) ) p++;
	// return false on a problem
	if ( p == pend ) return false;
	// then read in the http status
	m_status = atol2 ( p , pend - p );
	// if no Content-Type: mime field was provided, assume html
	m_contentType = CT_HTML;
	// assume default charset
	m_charset    = NULL;
	m_charsetLen = 0;
	// set contentLen, lastModifiedDate, m_cookie
	p = mime;
	while ( p < pend ) {
		// compute the length of the string starting at p and ending
		// at a \n or \r
		long len = 0;
		while ( &p[len] < pend && p[len]!='\n' && p[len]!='\r' ) len++;
		// . if we could not find a \n or \r there was an error
		// . MIMEs must always end in \n or \r
		if ( &p[len] >= pend ) return false;
		// . stick a NULL at the end of the line 
		// . overwrites \n or \r TEMPORARILY
		char c = p [ len ];
		p [ len ] = '\0';
		// parse out some meaningful data
		if      ( strncasecmp ( p , "Content-Length:" ,15) == 0 ) {
			m_contentLengthPos = p + 15;
			m_contentLen = atol( m_contentLengthPos);
		}
		else if ( strncasecmp ( p , "Last-Modified:"  ,14) == 0 ) {
			m_lastModifiedDate=atotime(p+14);
			// do not let them exceed current time for purposes
			// of sorting by date using datedb (see Msg16.cpp)
			time_t now = time(NULL);
			if (m_lastModifiedDate > now) m_lastModifiedDate = now;
		}
		else if ( strncasecmp ( p , "Content-Type:"   ,13) == 0 ) 
			m_contentType = getContentTypePrivate ( p + 13 );
		else if ( strncasecmp ( p , "Set-Cookie: "   ,11) == 0 ) {
			m_cookie = p + 11;
			m_cookieLen = gbstrlen ( p + 11 );
		}
		else if ( strncasecmp ( p , "Location:"       , 9) == 0 ) {
			// point to it
			char *tt = p + 9;
			// skip if space
			if ( *tt == ' ' ) tt++;
			if ( *tt == ' ' ) tt++;
			// at least set this for Msg13.cpp to use
			m_locationField    = tt;
			m_locationFieldLen = gbstrlen(tt);
			// . we don't add the "www." because of slashdot.com
			// . we skip initial spaces in this Url::set() routine
			if(url)
				m_locUrl.set ( url, p + 9, len - 9,
					       false/*addWWW?*/);
		}
		else if ( strncasecmp ( p , "Content-Encoding:", 17) == 0 ) {
			//only support gzip now, it doesn't seem like servers
			//implement the other types much
			m_contentEncodingPos = p+17;
			if(strstr(m_contentEncodingPos, "gzip")) {
				m_contentEncoding = ET_GZIP;
			}
			else if(strstr(m_contentEncodingPos, "deflate")) {
				//zlib's compression
				m_contentEncoding = ET_DEFLATE;
			}
		}
		//else if ( strncasecmp ( p, "Cookie:", 7) == 0 )
		//	log (LOG_INFO, "mime: Got Cookie = %s", (p+7));
		// re-insert the character that we replaced with a '\0'
		p [ len ] = c;
		// go to next line
		p += len;
		// skip over the cruft at the end of this line
		while ( p < pend && ( *p=='\r' || *p=='\n' ) ) p++;
	}
	return true;
}

Exemplo n.º 20

0

Exibir arquivo

Arquivo: HttpMime.cpp Projeto: privacore/open-source-search-engine

// returns false on bad mime
bool HttpMime::parse(char *mime, int32_t mimeLen, Url *url) {
#ifdef _VALGRIND_
	VALGRIND_CHECK_MEM_IS_DEFINED(mime,mimeLen);
#endif
	// reset locUrl to 0
	m_locUrl.reset();

	// return if we have no valid complete mime
	if (mimeLen == 0) {
		return false;
	}

	// status is on first line
	m_status = -1;

	// skip HTTP/x.x till we hit a space
	char *p = mime;
	char *pend = mime + mimeLen;
	while (p < pend && !is_wspace_a(*p)) p++;
	// then skip over spaces
	while (p < pend && is_wspace_a(*p)) p++;
	// return false on a problem
	if (p == pend) return false;
	// then read in the http status
	m_status = atol2(p, pend - p);
	// if no Content-Type: mime field was provided, assume html
	m_contentType = CT_HTML;
	// assume default charset
	m_charset = NULL;
	m_charsetLen = 0;

	// skip over first line
	getNextLine();

	while (getNextLine()) {
		const char *field = NULL;
		size_t fieldLen = 0;

		if (getField(&field, &fieldLen)) {
			if (parseContentEncoding(field, fieldLen)) {
				continue;
			}

			if (parseContentLength(field, fieldLen)) {
				continue;
			}

			if (parseContentType(field, fieldLen)) {
				continue;
			}

			if (parseLocation(field, fieldLen, url)) {
				continue;
			}

			if (parseSetCookie(field, fieldLen)) {
				continue;
			}

			// add parsing of other header here
		}
	}

	return true;
}

Exemplo n.º 21

0

Exibir arquivo

Arquivo: matches2.cpp Projeto: DeadNumbers/open-source-search-engine

char *getMatches2 ( Needle *needles          , 
		    int32_t    numNeedles       ,
		    char   *haystack         , 
		    int32_t    haystackSize     ,
		    char   *linkPos          ,
		    int32_t   *needleNum        ,
		    bool    stopAtFirstMatch ,
		    bool   *hadPreMatch      ,
		    bool    saveQuickTables  ,
		    int32_t    niceness         ) {

	// assume not
	if ( hadPreMatch ) *hadPreMatch = false;
	// empty haystack? then no matches
	if ( ! haystack || haystackSize <= 0 ) return NULL;
	// JAB: no needles? then no matches
	if ( ! needles  || numNeedles   <= 0 ) return NULL;

	//char tmp[8192];
	//char *t    = tmp;
	//char *tend = tmp + 8192;

	// reset counts to 0
	//if ( ! stopAtFirstMatch )
	//	for ( int32_t i=0 ; i < numNeedles ; i++ ) 
	//		needles[i].m_count = 0;

	// are we responsible for init'ing string lengths? this is much
	// faster than having to specify lengths manually.
	for ( int32_t i=0 ; i < numNeedles; i++ ) {
		// breathe
		QUICKPOLL(niceness);
		// clear
		needles[i].m_count      = 0;
		needles[i].m_firstMatch = NULL;
		// set the string size in bytes if not provided
		if ( needles[i].m_stringSize == 0 )
			needles[i].m_stringSize = gbstrlen(needles[i].m_string);
	}

	// . set up the quick tables.
	// . utf16 is not as effective here because half the bytes are zeroes!
	// . TODO: use a static cache of like 4 of these tables where the key
	//         is the Needles ptr ... done
	int32_t numNeedlesToInit = numNeedles;
	char space[256 * 4 * sizeof(BITVEC)];
	char *buf = NULL;

	BITVEC *s0;
	BITVEC *s1;
	BITVEC *s2;
	BITVEC *s3;

	/*
	static bool s_quickTableInit = false;
	static char s_qtbuf[128*(12+1)*2];

	int32_t slot = -1;
	if(saveQuickTables) {
		if ( ! s_quickTableInit ) {
			s_quickTableInit = true;
			s_quickTables.set(8,4,128,s_qtbuf,256*13,false,0,"qx");
		}
		uint64_t key = (uint32_t)needles;
		slot = s_quickTables.getSlot(&key);
		if ( slot >= 0 ) {
			buf = s_quickTables.getValueFromSlot(slot);
			numNeedlesToInit = 0;
		}
	}
	*/

	if(!buf) {
		buf = space;
		memset ( buf , 0 , sizeof(BITVEC)*256*4);
	}

	/*
	if( useQuickTables && slot == -1 ) {
		//buf = (char*)mcalloc(sizeof(uint32_t)*256*5,
		//		     "matches");
		if(buf) s_quickTables.addKey(&key, &buf);
		//sanity check, no reason why there needs to be a 
		//limit, I just don't expect there to be this many
		//static needles at this point.
		if(s_quickTables.getNumSlotsUsed() > 32){
			char *xx=NULL; *xx = 0;
		}
	}
	*/

	// try 64 bit bit vectors now since we doubled # of needles
	int32_t offset = 0;
	s0 = (BITVEC *)(buf + offset);
	offset += sizeof(BITVEC)*256;
	s1 = (BITVEC *)(buf + offset);
	offset += sizeof(BITVEC)*256;
	s2 = (BITVEC *)(buf + offset);
	offset += sizeof(BITVEC)*256;
	s3 = (BITVEC *)(buf + offset);
	offset += sizeof(BITVEC)*256;

	BITVEC mask;

	// set the letter tables, s0[] through sN[], for each needle
	for ( int32_t i = 0 ; i < numNeedlesToInit ; i++ ) {
		// breathe
		QUICKPOLL(niceness);
		unsigned char *w    = (unsigned char *)needles[i].m_string;
		unsigned char *wend = w + needles[i].m_stringSize;
		// BITVEC is now 64 bits
		mask = (1<<(i&0x3f)); // (1<<(i%64));
		// if the needle is small, fill up the remaining letter tables
		// with its mask... so it matches any character in haystack.
		s0[(unsigned char)to_lower_a(*w)] |= mask;
		s0[(unsigned char)to_upper_a(*w)] |= mask;
		w += 1;//step;
		if ( w >= wend ) {
			for ( int32_t j = 0 ; j < 256 ; j++ )  {
				s1[j] |= mask;
				s2[j] |= mask;
				s3[j] |= mask;
			}
			continue;
		}

		s1[(unsigned char)to_lower_a(*w)] |= mask;
		s1[(unsigned char)to_upper_a(*w)] |= mask;
		w += 1;//step;
		if ( w >= wend ) {
			for ( int32_t j = 0 ; j < 256 ; j++ )  {
				s2[j] |= mask;
				s3[j] |= mask;
			}
			continue;
		}

		s2[(unsigned char)to_lower_a(*w)] |= mask;
		s2[(unsigned char)to_upper_a(*w)] |= mask;
		w += 1;//step;
		if ( w >= wend ) {
			for ( int32_t j = 0 ; j < 256 ; j++ )  {
				s3[j] |= mask;
			}
			continue;
		}

		s3[(unsigned char)to_lower_a(*w)] |= mask;
		s3[(unsigned char)to_upper_a(*w)] |= mask;
		w += 1;//step;
	}

	// return a ptr to the first match if we should, this is it
	char *retVal = NULL;
	// debug vars
	//int32_t debugCount = 0;
	//int32_t pp = 0;
	// now find the first needle in the haystack
	unsigned char *p    = (unsigned char *)haystack;
	unsigned char *pend = (unsigned char *)haystack + haystackSize;
	char          *dend = (char *)pend;

	// do not breach!
	pend -= 4;

	for ( ; p < pend ; p++ ) {
		// breathe
		QUICKPOLL(niceness);
		//if ( (char *)p - (char *)haystack >= 12508 )
		//	log("hey");
		// analytics...
		
		// is this a possible match? (this should be VERY fast)
		mask  = s0[*(p+0)];
		if ( ! mask ) continue;
		mask &= s1[*(p+1)];
		if ( ! mask ) continue;
		mask &= s2[*(p+2)];
		if ( ! mask ) continue;
		mask &= s3[*(p+3)];
		if ( ! mask ) continue;
		//debugCount++;
		/*
		// display
		char oo[148];
		char *xx ;
		xx = oo;
		//gbmemcpy ( xx , p , 8 );
		for ( int32_t k = 0 ; k < 5 ; k++ ) {
			*xx++ = p[k];
		}
		gbmemcpy ( xx , "..." , 3 );
		xx += 3;
		*/
		//
		// XXX: do a hashtable lookup here so we have the candidate
		//      matches in a chain... 
		// XXX: for small needles which match frequently let's have
		//      a single char hash table, a 2 byte char hash table,
		//      etc. so if we have small needles we check the hash
		//      in those tables first, but only if mask & SMALL_NEEDLE
		//      is true! the single byte needle hash table can just
		//      be a lookup table. just XOR the bytes together for
		//      the hash.
		// XXX: just hash the mask into a table to get candidate
		//      matches in a chain? but there's 4B hashes!!
		// we got a good candidate, loop through all the needles
		for ( int32_t j = 0 ; j < numNeedles ; j++ ) {
			// skip if does not match mask, will save time
			if ( ! ((1<<(j&0x3f)) & mask) ) continue;
			if( needles[j].m_stringSize > 3) {
				// ensure first 4 bytes matches this needle's
				if (needles[j].m_string[0]!=to_lower_a(*(p+0)))
					continue;
				if (needles[j].m_string[1]!=to_lower_a(*(p+1)))
					continue;
				if (needles[j].m_string[2]!=to_lower_a(*(p+2)))
					continue;
				if (needles[j].m_string[3]!=to_lower_a(*(p+3)))
					continue;
			}
			// get needle size
			int32_t msize = needles[j].m_stringSize;
			// can p possibly be big enough?
			if ( pend - p < msize ) continue;
			// needle is "m" now
			char *m    = needles[j].m_string;
			char *mend = needles[j].m_stringSize + m;
			// use a tmp ptr for ptr into haystack
			char *d = (char *)p;
			// skip first 4 bytes since we know they match
			if(msize > 3) {
				d += 4;
				m += 4;
			}
			// loop over each char in "m"
			//for ( ; *m ; m++ ) {
			for ( ; m < mend ; m++ ) {
				//while ( ! *d && d < dend ) d++;
				//while ( ! *m && m < mend ) m++;
				// if we are a non alnum, that will match
				// any string of non-alnums, like a space
				// for instance. the 0 byte does not count
				// because it is used in utf16 a lot. this
				// may trigger some false matches in utf16
				// but, oh well... this way "link partner"
				// will match "link  - partner" in the haystk
				if ( is_wspace_a(*m) && m < mend ) {
					// skip all in "d" then.
					while (d<dend&&is_wspace_a(*d)) d++;
					// advance m then
					continue;
				}
				// make sure we match otherwise
				if ( *m != to_lower_a(*d) ) break;
				// ok, we matched, go to next
				d++;
			}
			// if not null, keep going
			if ( m < mend ) continue;
			// if this needle is "special" AND it occurs AFTER
			// linkPos, then do not consider it a match. this is
			// if we have a comment section indicator, like
			// "div id=\"comment" AND it occurs AFTER linkPos
			// (the char ptr to our link in the haystack) then
			// the match does not count.
			if ( linkPos && needles[j].m_isSection && 
			     (char *)p>linkPos ) {
				// record this for LinkText.cpp
				if ( hadPreMatch ) *hadPreMatch = true;
				continue;
			}
			// store ptr if NULL
			if ( ! needles[j].m_firstMatch )
				needles[j].m_firstMatch = (char *)p;
			// return ptr to needle in "haystack"
			if ( stopAtFirstMatch ) {
				// ok, we got a match
				if ( needleNum ) *needleNum = j;
				//return (char *)p;
				retVal = (char *)p;
				p = pend;
				break;
			}
			// otherwise, just count it
			needles[j].m_count++;
			// see if we match another needle, fixes bug
			// of matching "anal" but not "analy[tics]"
			continue;
			// advance to next char in the haystack
			break;
		}
		// ok, we did not match any needles, advance p and try again
	}

	//
	// HACK:
	// 
	// repeat above loop but for the last 4 characters in haystack!!
	// this fixes a electric fence mem breach core
	//
	// it is slower because we check for \0
	//
	pend += 4;

	for ( ; p < pend ; p++ ) {
		// breathe
		QUICKPOLL(niceness);
		//if ( (char *)p - (char *)haystack >= 12508 )
		//	log("hey");
		// is this a possible match? (this should be VERY fast)
		mask  = s0[*(p+0)];
		if ( ! mask ) continue;
		if ( p+1 < pend ) {
			mask &= s1[*(p+1)];
			if ( ! mask ) continue;
		}
		if ( p+2 < pend ) {
			mask &= s2[*(p+2)];
			if ( ! mask ) continue;
		}
		if ( p+3 < pend ) {
			mask &= s3[*(p+3)];
			if ( ! mask ) continue;
		}
		//debugCount++;
		/*
		// display
		char oo[148];
		char *xx ;
		xx = oo;
		//gbmemcpy ( xx , p , 8 );
		for ( int32_t k = 0 ; k < 5 ; k++ ) {
			*xx++ = p[k];
		}
		gbmemcpy ( xx , "..." , 3 );
		xx += 3;
		*/
		//
		// XXX: do a hashtable lookup here so we have the candidate
		//      matches in a chain... 
		// XXX: for small needles which match frequently let's have
		//      a single char hash table, a 2 byte char hash table,
		//      etc. so if we have small needles we check the hash
		//      in those tables first, but only if mask & SMALL_NEEDLE
		//      is true! the single byte needle hash table can just
		//      be a lookup table. just XOR the bytes together for
		//      the hash.
		// XXX: just hash the mask into a table to get candidate
		//      matches in a chain? but there's 4B hashes!!
		// we got a good candidate, loop through all the needles
		for ( int32_t j = 0 ; j < numNeedles ; j++ ) {
			// skip if does not match mask, will save time
			if ( ! ((1<<(j&0x3f)) & mask) ) continue;
			if( needles[j].m_stringSize > 3) {
				// ensure first 4 bytes matches this needle's
				if (needles[j].m_string[0]!=to_lower_a(*(p+0)))
					continue;
				if (!p[1] ||
				    needles[j].m_string[1]!=to_lower_a(*(p+1)))
					continue;
				if (!p[2] ||
				    needles[j].m_string[2]!=to_lower_a(*(p+2)))
					continue;
				if (!p[3] ||
				    needles[j].m_string[3]!=to_lower_a(*(p+3)))
					continue;
			}
			// get needle size
			int32_t msize = needles[j].m_stringSize;
			// can p possibly be big enough?
			if ( pend - p < msize ) continue;
			// needle is "m" now
			char *m    = needles[j].m_string;
			char *mend = needles[j].m_stringSize + m;
			// use a tmp ptr for ptr into haystack
			char *d = (char *)p;
			// skip first 4 bytes since we know they match
			if(msize > 3) {
				d += 4;
				m += 4;
			}
			// loop over each char in "m"
			//for ( ; *m ; m++ ) {
			for ( ; m < mend ; m++ ) {
				//while ( ! *d && d < dend ) d++;
				//while ( ! *m && m < mend ) m++;
				// if we are a non alnum, that will match
				// any string of non-alnums, like a space
				// for instance. the 0 byte does not count
				// because it is used in utf16 a lot. this
				// may trigger some false matches in utf16
				// but, oh well... this way "link partner"
				// will match "link  - partner" in the haystk
				if ( is_wspace_a(*m) && m < mend ) {
					// skip all in "d" then.
					while (d<dend&&is_wspace_a(*d)) d++;
					// advance m then
					continue;
				}
				// make sure we match otherwise
				if ( *m != to_lower_a(*d) ) break;
				// ok, we matched, go to next
				d++;
			}
			// if not null, keep going
			if ( m < mend ) continue;
			// if this needle is "special" AND it occurs AFTER
			// linkPos, then do not consider it a match. this is
			// if we have a comment section indicator, like
			// "div id=\"comment" AND it occurs AFTER linkPos
			// (the char ptr to our link in the haystack) then
			// the match does not count.
			if ( linkPos && needles[j].m_isSection && 
			     (char *)p>linkPos ) {
				// record this for LinkText.cpp
				if ( hadPreMatch ) *hadPreMatch = true;
				continue;
			}
			// store ptr if NULL
			if ( ! needles[j].m_firstMatch )
				needles[j].m_firstMatch = (char *)p;
			// return ptr to needle in "haystack"
			if ( stopAtFirstMatch ) {
				// ok, we got a match
				if ( needleNum ) *needleNum = j;
				//return (char *)p;
				retVal = (char *)p;
				p = pend;
				break;
			}
			// otherwise, just count it
			needles[j].m_count++;
			// advance to next char in the haystack
			break;
		}
		// ok, we did not match any needles, advance p and try again
	}


	//if ( debugCount > 0 ) pp = haystackSize / debugCount;
	//log("build: debug count = %"INT32" uc=%"INT32" hsize=%"INT32" "
	//    "1 in %"INT32" chars matches.",
	//    debugCount,(int32_t)isHaystackUtf16,haystackSize,pp);

	// before we exit, clean up
	return retVal;
}

Exemplo n.º 22

0

Exibir arquivo

Arquivo: HttpMime.cpp Projeto: privacore/open-source-search-engine

bool HttpMime::getAttribute(const char **attribute, size_t *attributeLen, const char **attributeValue, size_t *attributeValueLen) {
	// initialize value
	*attribute = NULL;
	*attributeLen = 0;
	*attributeValue = NULL;
	*attributeValueLen = 0;

	// no attribute
	if (m_attributeStartPos == 0) {
		return false;
	}

	// strip starting whitespaces
	while (is_wspace_a(m_currentLine[m_attributeStartPos]) && (m_attributeStartPos < m_currentLineLen)) {
		++m_attributeStartPos;
	}

	*attribute = m_currentLine + m_attributeStartPos;
	*attributeLen = m_currentLineLen - m_attributeStartPos;

	// next attribute
	const char *semicolonPos = (const char *)memchr(*attribute, ';', *attributeLen);
	if (semicolonPos) {
		*attributeLen = semicolonPos - *attribute;
		m_attributeStartPos = semicolonPos - m_currentLine + 1;
	} else {
		m_attributeStartPos = 0;
	}

	// attribute value
	const char *equalPos = (const char *)memchr(*attribute, '=', *attributeLen);
	if (equalPos) {
		*attributeValueLen = *attributeLen;
		*attributeLen = equalPos - *attribute;
		*attributeValueLen -= *attributeLen + 1;
		*attributeValue = equalPos + 1;

		// strip ending attribute whitespace
		while (is_wspace_a((*attribute)[*attributeLen - 1])) {
			--(*attributeLen);
		}

		// strip starting attribute value whitespace/quote
		while (is_wspace_a((*attributeValue)[0]) || (*attributeValue)[0] == '"' || (*attributeValue)[0] == '\'') {
			++(*attributeValue);
			--(*attributeValueLen);
		}

		// strip ending attribute value whitespace/quote
		while (is_wspace_a((*attributeValue)[*attributeValueLen - 1]) || (*attributeValue)[*attributeValueLen - 1] == '"' || (*attributeValue)[*attributeValueLen - 1] == '\'') {
			--(*attributeValueLen);
		}
	}

	// cater for empty values between semicolon
	// eg: Set-Cookie: name=value; Path=/; ;SECURE; HttpOnly;
	if (*attributeLen == 0 && m_attributeStartPos) {
		return getAttribute(attribute, attributeLen, attributeValue, attributeValueLen);
	}

	logTrace(g_conf.m_logTraceHttpMime, "attribute='%.*s' value='%.*s'", static_cast<int>(*attributeLen), *attribute, static_cast<int>(*attributeValueLen), *attributeValue);

	return (*attributeLen > 0);
}

Exemplo n.º 23

0

Exibir arquivo

Arquivo: Json.cpp Projeto: DeadNumbers/open-source-search-engine

JsonItem *Json::parseJsonStringIntoJsonItems ( char *json , int32_t niceness ) {

	m_prev = NULL;

	m_stackPtr = 0;
	m_sb.purge();

	JsonItem *ji = NULL;

	if ( ! json ) return NULL;

	// how much space will we need to avoid any reallocs?
	char *p = json;
	bool inQuote = false;
	int32_t need = 0;
	for ( ; *p ; p++ ) {
		// ignore any escaped char. also \x1234
		if ( *p == '\\' ) {
			if ( p[1] ) p++;
			continue;
		}
		if ( *p == '\"' )
			inQuote = ! inQuote;
		if ( inQuote ) 
			continue;
		if ( *p == '{' ||
		     *p == ',' ||
		     *p == '[' ||
		     *p == ':' )
			// +1 for null terminating string of each item
			need += sizeof(JsonItem) +1;
	}
	// plus the length of the string to store it decoded etc.
	need += p - json;
	// plus a \0 for the value and a \0 for the name of each jsonitem
	need += 2;
	// prevent cores for now
	need += 10;
	// . to prevent safebuf from reallocating do this
	// . safeMemcpy() calls reserve(m_length+len) and reserves
	//   tries to alloc m_length + (m_length+len) so since,
	//   m_length+len should never be more than "need" we need to
	//   double up here
	need *= 2;
	// this should be enough
	if ( ! m_sb.reserve ( need ) ) return NULL;
	// for testing if we realloc
	char *mem = m_sb.getBufStart();

	int32_t  size;

	char *NAME = NULL;
	int32_t  NAMELEN = 0;

	// reset p
	p = json;
	// json maybe bad utf8 causing us to miss the \0 char, so use "pend"
	char *pend = json + gbstrlen(json);

	// scan
	for ( ; p < pend ; p += size ) {
		// get size
		size = getUtf8CharSize ( p );

		// skip spaces
		if ( is_wspace_a (*p) )
			continue;

		// skip commas
		if ( *p == ',' ) continue;

		// did we hit a '{'? that means the existing json item
		// is a parent of the item(s) inside the {}'s
		if ( *p == '{' ) {
			// if ji is non-null it must be a name like in
			// \"stats\":{\"fetchTime\":2069,....}
			// . this indicates the start of a json object
			// . addNewItem() will push the current item on stack
			ji = addNewItem();
			if ( ! ji ) return NULL;
			// current ji is an object type then
			ji->m_type = JT_OBJECT;
			// set the name
			ji->m_name    = NAME;
			ji->m_nameLen = NAMELEN;
			// this goes on the stack
			if ( m_stackPtr >= MAXJSONPARENTS ) return NULL;
			m_stack[m_stackPtr++] = ji;
			// and null this
			ji = NULL;
			continue;
		}
		// pop the stack?
		if ( *p == '}' ) {
			// just pop it and restore name cursor
			if ( m_stackPtr > 0 ) {
				JsonItem *px = m_stack[m_stackPtr-1];
				NAME    = px->m_name;
				NAMELEN = px->m_nameLen;
				m_stackPtr--;
			}
			continue;
		}
		// array of things?
		if ( *p == '[' ) {
			// make a newitem to put on stack
			ji = addNewItem();
			if ( ! ji ) return NULL;
			// current ji is an object type then
			ji->m_type = JT_ARRAY;
			// start of array hack. HACK!
			//ji->m_valueLong = (int32_t)p;
			ji->m_valueArray = p;
			// set the name
			ji->m_name    = NAME;
			ji->m_nameLen = NAMELEN;
			// init to a bogus value. should be set below.
			// at least this should avoid a core in XmlDoc.cpp
			// getTokenizedDiffbotReply()
			ji->m_valueLen = 0;
			// this goes on the stack
			if ( m_stackPtr >= MAXJSONPARENTS ) return NULL;
			m_stack[m_stackPtr++] = ji;
			ji = NULL;
			continue;
		}
		// pop the stack?
		if ( *p == ']' ) {
			// just pop it and restore name cursor
			if ( m_stackPtr > 0 ) {
				JsonItem *px = m_stack[m_stackPtr-1];
				NAME    = px->m_name;
				NAMELEN = px->m_nameLen;
				// start of array hack. HACK!
				char *start = (char *)px->m_valueArray;//Long;
				// include ending ']' in length of array
				px->m_valueLen = p - start + 1;
				m_stackPtr--;
			}
			continue;
		}

		// a quote?
		if ( *p == '\"' ) {
			// find end of quote
			char *end = p + 1;
			for ( ; *end ; end++ ) {
				// skip two chars if escaped
				if ( *end == '\\' && end[1] ) {
					end++; 
					continue;
				}
				// this quote is unescaped then
				if ( *end == '\"' ) break;
			}
			// field?
			char *x = end + 1;
			// skip spaces
			for ( ; *x && is_wspace_a(*x) ; x++ );
			// define the string
			char *str  = p + 1;
			int32_t  slen = end - str;
			// . if a colon follows, it was a field
			if ( *x == ':' ) {

				// we can't be the first thing in the safebuf
				// json must start with { or [ i guess
				// otherwise getFirstItem() won't work!
				if ( m_sb.m_length==0 ) {
					g_errno = EBADJSONPARSER;
					return NULL;
				}

				// let's push this now so we can \0 term
				char *savedStr = m_sb.getBuf();
				m_sb.safeMemcpy ( str , slen );
				m_sb.pushChar('\0');
				// just set the name cursor
				NAME    = savedStr;//str;
				NAMELEN = slen;
			}
			// . otherwise, it was field value, so index it
			// . TODO: later make field names compounded to
			//   better represent nesting?
			// . added 'else if (NAME){' fix for json=\"too small\"
			else if ( NAME ) {
				// make a new one in safebuf. our
				// parent will be the array type item.
				ji = addNewItem();
				if ( ! ji ) return NULL;
				// we are a string
				ji->m_type = JT_STRING;
				// use name cursor
				ji->m_name    = NAME;
				ji->m_nameLen = NAMELEN;
				// get length decoded
				int32_t curr = m_sb.length();
				// store decoded string right after jsonitem
				if ( !m_sb.safeDecodeJSONToUtf8 (str,slen,
								 niceness ))
					return NULL;
				// store length decoded json
				ji->m_valueLen = m_sb.length() - curr;
				// end with a \0
				m_sb.pushChar('\0');
				// ok, this one is done
				ji = NULL;
			}
			else {
				log("json: fieldless name in json");
				g_errno = EBADJSONPARSER;
				return NULL;
			}
			// skip over the string
			size = 0;
			p    = x;
			continue;
		}

		// true or false?
		if ( (*p == 't' && strncmp(p,"true",4)==0) ||
		     (*p == 'f' && strncmp(p,"false",5)==0) ) {
			// make a new one
			ji = addNewItem();
			if ( ! ji ) return NULL;
			// copy the number as a string as well
			int32_t curr = m_sb.length();
			// what is the length of it?
			int32_t slen = 4;
			ji->m_valueLong = 1;
			ji->m_valueDouble = 1.0;
			if ( *p == 'f' ) {
				slen = 5;
				ji->m_valueLong = 0;
				ji->m_valueDouble = 0;
			}
			// store decoded string right after jsonitem
			if ( !m_sb.safeDecodeJSONToUtf8 (p,slen,niceness))
				return NULL;
			// store length decoded json
			ji->m_valueLen = m_sb.length() - curr;
			// end with a \0
			m_sb.pushChar('\0');
			ji->m_type = JT_NUMBER;
			// use name cursor
			ji->m_name    = NAME;
			ji->m_nameLen = NAMELEN;
			ji = NULL;
			// skip over the string
			size = 1;
			//p    = end;
			continue;
		}
			


		// if we hit a digit they might not be in quotes like
		// "crawled":123
		if ( is_digit ( *p ) ||
		     // like .123 ?
		     ( *p == '.' && is_digit(p[1]) ) ) {
			// find end of the number
			char *end = p + 1;
			// . allow '.' for decimal numbers
			// . TODO: allow E for exponent
			for ( ; *end && (is_digit(*end) || *end=='.');end++) ;
			// define the string
			char *str  = p;
			int32_t  slen = end - str;
			// make a new one
			ji = addNewItem();
			if ( ! ji ) return NULL;
			// back up over negative sign?
			if ( str > json && str[-1] == '-' ) str--;
			// decode
			//char c = str[slen];
			//str[slen] = '\0';
			ji->m_valueLong = atol(str);
			ji->m_valueDouble = atof(str);
			// copy the number as a string as well
			int32_t curr = m_sb.length();
			// store decoded string right after jsonitem
			if ( !m_sb.safeDecodeJSONToUtf8 ( str, slen,niceness))
				return NULL;
			// store length decoded json
			ji->m_valueLen = m_sb.length() - curr;
			// end with a \0
			m_sb.pushChar('\0');
			//str[slen] = c;
			ji->m_type = JT_NUMBER;
			// use name cursor
			ji->m_name    = NAME;
			ji->m_nameLen = NAMELEN;
			ji = NULL;
			// skip over the string
			size = 0;
			p    = end;
			continue;
		}
	}

	// for testing if we realloc
	char *memEnd = m_sb.getBufStart();
	if ( mem != memEnd ) { char *xx=NULL;*xx=0; }

	return (JsonItem *)m_sb.getBufStart();
}

Exemplo n.º 24

0

Exibir arquivo

Arquivo: HttpRequest.cpp Projeto: BKJackson/open-source-search-engine

// *next is set to ptr into m_cgiBuf so that the next successive call to
// getString with the SAME "field" will start at *next. that way you
// can use the same cgi parameter multiple times. (like strstr kind of)
char *HttpRequest::getStringFromCookie ( char *field      ,
					 long *len        ,
					 char *defaultStr ,
					 long *next       ) {
	// get field len
	long flen = gbstrlen(field);
	// assume none
	if ( len ) *len = 0;
	// if no cookie, forget it
	if ( ! m_cookiePtr ) return defaultStr;
	// the end of the cookie
	//char *pend = m_cookieBuf + m_cookieBufLen;
	char *pend = m_cookiePtr + m_cookieLen;
	char *p    = m_cookiePtr;
	// skip over spaces and punct
	for ( ; p && p < pend ; p++ ) 
		if ( is_alnum_a(*p) ) break;
	// skip "Cookie:"
	if ( p + 7 < pend && ! strncasecmp(p,"cookie:",7) ) p += 7;
	// skip spaces after that
	for ( ; p && p < pend ; p++ ) 
		if ( is_alnum_a(*p) ) break;
	// crazy?
	if ( p >= pend ) return defaultStr;

	char *savedVal = NULL;
	// so we do not skip the first cookie, jump right in!
	// otherwise we lose the calendar cookie for msie
	goto entryPoint;
	// . loop over all xxx=yyy\0 thingies in the cookie
	// . we converted every '&' to a \0 when the cookiebuf was set above
	//for ( char *p = m_cookieBuf ; *p ; p += gbstrlen(p) + 1 ) {
	// . no, we just keep them as &'s because seems like cookies use ;'s
	//   as delimeters not so much &'s. and when we log the cookie in the
	//   log, i wanted to see the whole cookie, so having \0's in the
	//   cookie was messing that up.
	for ( ; p < pend ; p++ ) { 
		// need a \0
		// fixes "display=0&map=0&calendar=0;" that is only one cookie.
		// so do not grap value of map or calendar from that!!
		if ( *p ) continue;
		// back to back \0's? be careful how we skip over them!
		if ( ! p[1] ) continue;
		// skip that
		if ( ++p >= pend ) break;
		// skip whitespace that follows
		for ( ; p < pend ; p++ ) 
			if ( ! is_wspace_a(*p) ) break;
		// end of cookie?
		if ( p >= pend ) break;
	entryPoint:
		// check first char
		if ( *p != *field ) continue;
		// does it match? continue if not a match
		if ( strncmp ( p , field , flen ) ) continue;
		// point to value
		char *val = p + flen;
		// must be an equal sign
		if ( *val != '=' ) continue;
		// skip that sign
		val++;
		// . cookies terminate fields by space or ; or &
		// . skip to end of cookie value for this field
		char *e = val;
		// skip over alnum. might also be \0 if this function
		// was already called somewhere else!
		// we NULL separated each cookie and then urldecoded each
		// cookie above in the m_cookieBuf logic. cookies can contain
		// encoded ;'s and &'s so i took this checks out of this while
		// loop. like the widgetHeader has semicolons in it and it
		// stores in the cookie.
		while ( e < pend && *e ) e++;
		// that is the length
		if ( len ) *len = e - val;
		// NULL terminate it, we should have already logged the cookie
		// so it should be ok to NULL terminate now. we already
		// call urlDecode() now above... and make the &'s into \0's
		*e = '\0';
		// if we were in the meta cookie, return that...
		// otherwise if you visited this site before metacookies
		// were used you might have the cookie outside the meta
		// cookie AND inside the metacookie, and only the value
		// inside the metacookie is legit...
		if ( val > m_metaCookie ) return val;
		// otherwise, save it and try to get from meta cookie
		savedVal = val;
		// length
		//if ( len ) *len = gbstrlen(val);
		// this is the value!
		//return val;
	}
	// did we save something?
	if ( savedVal ) return savedVal;
	// no match
	return defaultStr;
}

Exemplo n.º 25

0

Exibir arquivo

Arquivo: Words.cpp Projeto: BillWangCS/open-source-search-engine

// . return the value of the specified "field" within this html tag, "s"
// . the case of "field" does not matter
char *getFieldValue ( char *s , 
		      long  slen ,
		      char *field , 
		      long *valueLen ) {
	// reset this to 0
	*valueLen = 0;
	// scan for the field name in our node
	long flen = gbstrlen(field);
	char inQuotes = '\0';
	long i;

	// make it sane
	if ( slen > 2000 ) slen = 2000;

	for ( i = 1; i + flen < slen ; i++ ) {
		// skip the field if it's quoted
		if ( inQuotes) {
			if (s[i] == inQuotes ) inQuotes = 0;
			continue;
		}
		// set inQuotes to the quote if we're in quotes
		if ( (s[i]=='\"' || s[i]=='\'')){ 
			inQuotes = s[i];
			continue;
		} 
		// if not in quote tag might end
		if ( s[i] == '>' && ! inQuotes ) return NULL;
		// a field name must be preceeded by non-alnum
		if ( is_alnum_a ( s[i-1] ) ) continue;
		// the first character of this field shout match field[0]
		if ( to_lower_a (s[i]) != to_lower_a(field[0] )) continue;
		// field just be immediately followed by an = or space
		if (s[i+flen]!='='&&!is_wspace_a(s[i+flen]))continue;
		// field names must match
		if ( strncasecmp ( &s[i], field, flen ) != 0 ) continue;
		// break cuz we got a match for our field name
		break;
	}
	
	
	// return NULL if no matching field
	if ( i + flen >= slen ) return NULL;

	// advance i over the fieldname so it pts to = or space
	i += flen;

	// advance i over spaces
	while ( i < slen && is_wspace_a ( s[i] ) ) i++;

	// advance over the equal sign, return NULL if does not exist
	if ( i < slen && s[i++] != '=' ) return NULL;

	// advance i over spaces after the equal sign
	while ( i < slen && is_wspace_a ( s[i] ) ) i++;
	
	// now parse out the value of this field (could be in quotes)
	inQuotes = '\0';

	// set inQuotes to the quote if we're in quotes
	if ( s[i]=='\"' || s[i]=='\'') inQuotes = s[i++]; 

	// mark this as the start of the value
	int start=i;

	// advance i until we hit a space, or we hit a that quote if inQuotes
	if (inQuotes) while (i<slen && s[i] != inQuotes ) i++;
	else while ( i<slen &&!is_wspace_a(s[i])&&s[i]!='>')i++;

	// set the length of the value
	*valueLen = i - start;

	// return a ptr to the value
	return s + start;
}

Exemplo n.º 26

0

Exibir arquivo

Arquivo: Synonyms.cpp Projeto: BillWangCS/open-source-search-engine

// . so now this adds a list of Synonyms to the m_pools[] and returns a ptr
//   to the first one.
// . then the parent caller can store that ptr in the m_wordToSyn[] array
//   which we pre-alloc upon calling the set() function based on the # of
//   words we got
// . returns # of synonyms stored into "tmpBuf"
long Synonyms::getSynonyms ( Words *words , 
			     long wordNum , 
			     uint8_t langId ,
			     char *tmpBuf ,
			     long niceness ) {

	// punct words have no synoyms
	if ( ! words->m_wordIds[wordNum] ) return 0;

	// store these
	m_words     = words;
	m_docLangId = langId;
	m_niceness = niceness;

	// sanity check
	if ( wordNum > m_words->m_numWords ) { char *xx=NULL;*xx=0; }

	// init the dedup table to dedup wordIds
	HashTableX dt;
	char dbuf[512];
	dt.set(8,0,12,dbuf,512,false,m_niceness,"altwrds");


	long maxSyns = (long)MAX_SYNS;

	char *bufPtr = tmpBuf;

	// point into buffer
	m_aids = (long long *)bufPtr;
	bufPtr += maxSyns * 8;

	// then the word ids
	m_wids0 = (long long *)bufPtr;
	bufPtr += maxSyns * 8;

	// second word ids, for multi alnum word synonyms, i.e. "New Jersey"
	m_wids1 = (long long *)bufPtr;
	bufPtr += maxSyns * 8;

	m_termPtrs = (char **)bufPtr;
	bufPtr += maxSyns * 4;

	m_termLens = (long *)bufPtr;
	bufPtr += maxSyns * 4;

	m_numAlnumWords = (long *)bufPtr;
	bufPtr += maxSyns * 4;

	m_numAlnumWordsInBase = (long *)bufPtr;
	bufPtr += maxSyns * 4;


	// source
	m_src = bufPtr;
	bufPtr += maxSyns;

	// cursors
	m_aidsPtr  = m_aids;
	m_wids0Ptr = m_wids0;
	m_wids1Ptr = m_wids1;
	m_srcPtr   = m_src;
	m_termPtrsPtr = m_termPtrs;
	m_termLensPtr = m_termLens;
	m_numAlnumWordsPtr = m_numAlnumWords;
	m_numAlnumWordsInBasePtr = m_numAlnumWordsInBase;

	
	char *w    = m_words->m_words   [wordNum];
	long  wlen = m_words->m_wordLens[wordNum];

	//
	// NOW hit wiktionary
	// Trust this less then our s_exceptions above, but more than
	// our morph computations below
	//

	char sourceId = SOURCE_WIKTIONARY;
	char *ss = NULL;
	long long bwid;
	char wikiLangId = m_docLangId;
	bool hadSpace ;
	long klen ;
	long baseNumAlnumWords;

 tryOtherLang:

	/*
	// if word only exists in one language, assume that language for word
	// even if m_docLangId is langUnknown (0)
	if ( ! ss &&
	     ! m_docLangId &&
	     ! wikiLangId ) {
		// get raw word id
		bwid = m_words->m_wordIds[wordNum];
		// each lang has its own bit
		long long bits = g_speller.getLangBits64 ( &bwid );
		// skip if not unique
		char count = getNumBitsOn64 ( bits ) ;
		// if we only got one lang we could be, assume that
		if ( count == 1 )
			// get it. bit #0 is english, so add 1
			wikiLangId = getBitPosLL((uint8_t *)&bits) + 1;
		// try setting based on script. greek. russian. etc.
		// if the word was not in the wiktionary.
		// this will be langUnknown if not definitive.
		else
			wikiLangId = getCharacterLanguage(w);
	}
	*/

	// try looking up bigram so "new jersey" gets "nj" as synonym
	if ( wikiLangId && 
	     wordNum+2< m_words->m_numWords &&
	     m_words->m_wordIds[wordNum+2]) {
		// get phrase id bigram then
		long conti = 0;
		bwid = hash64Lower_utf8_cont(w,wlen,0,&conti);
		// then the next word
		char *wp2 = m_words->m_words[wordNum+2];
		long  wlen2 = m_words->m_wordLens[wordNum+2];
		bwid = hash64Lower_utf8_cont(wp2,wlen2,bwid,&conti);
		baseNumAlnumWords = 2;
		ss = g_wiktionary.getSynSet( bwid, wikiLangId );
	}

	// need a language for wiktionary to work with
	if ( wikiLangId && ! ss ) {
		// get raw word id
		bwid = m_words->m_wordIds[wordNum];
		baseNumAlnumWords = 1;
		//if ( bwid == 1424622907102375150LL)
		//	log("a");
		ss = g_wiktionary.getSynSet( bwid, wikiLangId );
		// if that failed try removing 's from word if there
		if ( ! ss && 
		     wlen >= 3 &&
		     w[wlen-2]=='\'' && 
		     w[wlen-1]=='s' ) {
			long long cwid = hash64Lower_utf8(w,wlen-2);
			ss = g_wiktionary.getSynSet( cwid, wikiLangId );
		}
	}

	// even though a document may be in german it often has some
	// english words "pdf download" "copyright" etc. so if the word
	// has no synset in german, try it in english
	if ( //numPresets == 0 &&
	     ! ss &&
	     m_docLangId != langEnglish &&
	     wikiLangId  != langEnglish &&
	     m_docLangId &&
	     g_speller.getSynsInEnglish(w,wlen,m_docLangId,langEnglish) ) {
		// try english
		wikiLangId = langEnglish;
		sourceId   = SOURCE_WIKTIONARY_EN;
		goto tryOtherLang;
	}

	// if it was in wiktionary, just use that synset
	if ( ss ) {
		// prepare th
		HashTableX dedup;
		HashTableX *dd = NULL;
		char dbuf[512];
		long count = 0;
	addSynSet:
		// do we have another set following this
		char *next = g_wiktionary.getNextSynSet(bwid,m_docLangId,ss);
		// if so, init the dedup table then
		if ( next && ! dd ) {
			dd = &dedup;
			dd->set ( 8,0,8,dbuf,512,false,m_niceness,"sddbuf");
		}
		// skip over the pipe i guess
		char *pipe = ss + 2;
		// zh_ch?
		if ( *pipe == '_' ) pipe += 3;
		// sanity
		if ( *pipe != '|' ) { char *xx=NULL;*xx=0; }
		// point to word list
		char *p = pipe + 1;
		// hash up the list of words, they are in utf8 and
		char *e = p + 1;
		// save count in case we need to undo
		//long saved = m_numAlts[wordNum];
	hashLoop:


		// skip synonyms that are anagrams because its to ambiguous
		// the are mappings like
		// "PC" -> "PC,Personal Computer" 
		// "PC" -> "PC,Probable Cause" ... (lots more!)
		//bool isAnagram = true;
		for ( ; *e !='\n' && *e != ',' ; e++ ) ;
		//	if ( ! is_upper_a(*e) ) isAnagram = false;

		// get it
		long long h = hash64Lower_utf8_nospaces ( p , e - p );

		// skip if same as base word
		if ( h == bwid ) goto getNextSyn;

		// should we check for dups?
		if ( dd ) {
			// skip dups
			if ( dd->isInTable(&h) ) goto getNextSyn;
			// dedup. return false with g_errno set on error
			if ( ! dd->addKey(&h) ) return m_aidsPtr - m_aids;
		}
		// store it
		*m_aidsPtr++ = h;

		// store source
		*m_srcPtr++ = sourceId;

		hadSpace = false;
		klen = e - p;
		for ( long k = 0 ; k < klen ; k++ )
			if ( is_wspace_a(p[k]) ) hadSpace = true;

		*m_termPtrsPtr++ = p;
		*m_termLensPtr++ = e-p;

		// only for multi-word synonyms like "New Jersey"...
		*m_wids0Ptr = 0LL;
		*m_wids1Ptr = 0LL;
		*m_numAlnumWordsPtr = 1;

		// and for multi alnum word synonyms
		if ( hadSpace ) {
			Words sw;
			sw.setx ( p , e - p , m_niceness );
			*(long long *)m_wids0Ptr = sw.m_wordIds[0];
			*(long long *)m_wids1Ptr = sw.m_wordIds[2];
			*(long  *)m_numAlnumWordsPtr = sw.getNumAlnumWords();
		}

		m_wids0Ptr++;
		m_wids1Ptr++;
		m_numAlnumWordsPtr++;

		// how many words did we have to hash to find a synset?
		// i.e. "new jersey" would be 2, to get "nj"
		*m_numAlnumWordsInBasePtr++ = baseNumAlnumWords;

		// do not breach
		if ( ++count >= maxSyns ) goto done;
	getNextSyn:
		// loop for more
		if ( *e == ',' ) { e++; p = e; goto hashLoop; }
		// add in the next syn set, deduped
		if ( next ) { ss = next; goto addSynSet; }
		// wrap it up
	done:
		// all done
		return m_aidsPtr - m_aids;
	}


	// strip marks from THIS word, return -1 w/ g_errno set on error
	if ( ! addStripped ( w , wlen,&dt ) ) return m_aidsPtr - m_aids;

	// returns false with g_errno set
	if ( ! addAmpPhrase ( wordNum, &dt ) ) return m_aidsPtr - m_aids;

	// if we end in apostrophe, strip and add
	if ( wlen>= 3 &&
	     w[wlen-1] == 's' && 
	     w[wlen-2]=='\'' &&
	     ! addWithoutApostrophe ( wordNum, &dt ) )
		return m_aidsPtr - m_aids;

	return m_aidsPtr - m_aids;
}

Exemplo n.º 27

0

Exibir arquivo

Arquivo: SearchInput.cpp Projeto: BKJackson/open-source-search-engine

// . sets m_qbuf1[] and m_qbuf2[]
// . m_qbuf1[] is the advanced query
// . m_qbuf2[] is the query to be used for spell checking
// . returns false and set g_errno on error
bool SearchInput::setQueryBuffers ( ) {

	m_sbuf1.reset();
	m_sbuf2.reset();
	m_sbuf3.reset();

	short qcs = csUTF8;
	if (m_queryCharset && m_queryCharsetLen){
		// we need to convert the query string to utf-8
		qcs = get_iana_charset(m_queryCharset, m_queryCharsetLen);
		if (qcs == csUnknown) {
			//g_errno = EBADCHARSET;
			//g_msg = "(error: unknown query charset)";
			//return false;
			qcs = csUTF8;
		}
	}
	// prepend sites terms
	long numSites = 0;
	char *csStr = NULL;
	numSites = 0;
	csStr = get_charset_str(qcs);

	if ( m_sites && m_sites[0] ) {
		char *s = m_sites;
		char *t;
		long  len;
		m_sbuf1.pushChar('(');//*p++ = '(';
	loop:
		// skip white space
		while ( *s && ! is_alnum_a(*s) ) s++;
		// bail if done
		if ( ! *s ) goto done;
		// get length of it
		t = s;
		while ( *t && ! is_wspace_a(*t) ) t++;
		len = t - s;
		// add site: term
		//if ( p + 12 + len >= pend ) goto toobig;
		if ( numSites > 0 ) m_sbuf1.safeStrcpy ( " UOR " );
		m_sbuf1.safeStrcpy ( "site:" );
		//p += ucToUtf8(p, pend-p,s, len, csStr, 0,0);
		m_sbuf1.safeMemcpy ( s , len );
		//memcpy ( p , s , len     ); p += len;
		//*p++ = ' ';
		m_sbuf1.pushChar(' ');
		s = t;
		numSites++;
		goto loop;
	done:
		m_sbuf1.safePrintf(") | ");
		// inc totalLen
		m_sitesQueryLen = m_sitesLen + (numSites * 10);
	}
	// append site: term
	if ( m_siteLen > 0 ) {
		//if ( p > pstart ) *p++ = ' ';
		if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
		//memcpy ( p , "+site:" , 6 ); p += 6;
		m_sbuf1.safePrintf("+site:");
		//memcpy ( p , m_site , m_siteLen ); p += m_siteLen;
		m_sbuf1.safeMemcpy(m_site,m_siteLen);
	}


	// append gblang: term
	if( m_gblang > 0 ) {
		//if( p > pstart ) *p++ =  ' ';
		if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
		//p += sprintf( p, "+gblang:%li |", m_gblang );
		m_sbuf1.safePrintf( "+gblang:%li |", m_gblang );
	}
	// bookmark here so we can copy into st->m_displayQuery below
	//long displayQueryOffset = m_sbuf1.length();
	// append url: term
	if ( m_urlLen > 0 ) {
		//if ( p > pstart ) *p++ = ' ';
		if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
		//memcpy ( p , "+url:" , 5 ); p += 5;
		m_sbuf1.safeStrcpy ( "+url:");
		//memcpy ( p , m_url , m_urlLen ); p += m_urlLen;
		m_sbuf1.safeMemcpy ( m_url , m_urlLen );
	}
	// append url: term
	if ( m_linkLen > 0 ) {
		//if ( p > pstart ) *p++ = ' ';
		if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
		//memcpy ( p , "+link:" , 6 ); p += 6;
		m_sbuf1.safeStrcpy ( "+link:");
		//memcpy ( p , m_link , m_linkLen ); p += m_linkLen;
		m_sbuf1.safeMemcpy ( m_link , m_linkLen );
	}
	// append the natural query
	if ( m_queryLen > 0 ) {
		//if ( p  > pstart  ) *p++  = ' ';
		if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
		//p += ucToUtf8(p, pend-p, m_query, m_queryLen, csStr, 0,0);
		m_sbuf1.safeMemcpy ( m_query , m_queryLen );
		//memcpy ( p  , m_query , m_queryLen ); p  += m_queryLen;
		// add to spell checked buf, too		
		//if ( p2 > pstart2 ) *p2++ = ' ';
		if ( m_sbuf2.length() ) m_sbuf2.pushChar(' ');
		//p2 +=ucToUtf8(p2, pend2-p2, m_query, m_queryLen, csStr, 0,0);
		m_sbuf2.safeMemcpy ( m_query , m_queryLen );
		//memcpy ( p2 , m_query , m_queryLen ); p2 += m_queryLen;
	}
	if ( m_query2Len > 0 ) {
		//if ( p3 > pstart3 ) *p3++ = ' ';
		if ( m_sbuf3.length() ) m_sbuf3.pushChar(' ');
		//p3+=ucToUtf8(p3, pend3-p3, m_query2, m_query2Len, csStr,0,0);
		m_sbuf3.safeMemcpy ( m_query2 , m_query2Len );
	}
	//if (g_errno == EILSEQ){ // illegal character seq
	//	log("query: bad char set");
	//	g_errno = 0;
	//	if (qcs == csUTF8) {qcs = csISOLatin1;goto doOver;}
	//	if (qcs != csISOLatin1) {qcs = csUTF8;goto doOver;}
	//}
	// append quoted phrases to query
	if ( m_quoteLen1 > 0 ) {
		//if ( p  > pstart  ) *p++  = ' ';
		if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
		//*p++ = '+';
		//*p++ = '\"';
		m_sbuf1.safeStrcpy("+\"");
		//p += ucToUtf8(p, pend-p, m_quote1, m_quoteLen1, csStr, 0,0);
		m_sbuf1.safeMemcpy ( m_quote1 , m_quoteLen1 );
		//memcpy ( p , m_quote1 , m_quoteLen1 ); p += m_quoteLen1 ;
		//*p++ = '\"';
		m_sbuf1.safeStrcpy("\"");
		// add to spell checked buf, too
		//if ( p2 > pstart2 ) *p2++ = ' ';
		if ( m_sbuf2.length() ) m_sbuf2.pushChar(' ');
		//*p2++ = '+';
		//*p2++ = '\"';
		m_sbuf2.safeStrcpy("+\"");
		//p2+=ucToUtf8(p2, pend2-p2, m_quote1, m_quoteLen1, csStr,0,0);
		m_sbuf2.safeMemcpy ( m_quote1 , m_quoteLen1 );
		//memcpy ( p2 , m_quote1 , m_quoteLen1 ); p2 += m_quoteLen1 ;
		//*p2++ = '\"';
		m_sbuf2.safeStrcpy("\"");
	}
	//if (g_errno == EILSEQ){ // illegal character seq
	//	g_errno = 0;
	//	if (qcs == csUTF8) {qcs = csISOLatin1;goto doOver;}
	//	if (qcs != csISOLatin1) {qcs = csUTF8;goto doOver;}
	//}
	if ( m_quoteLen2 > 0 ) {
		//if ( p  > pstart  ) *p++  = ' ';
		if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
		//*p++ = '+';
		//*p++ = '\"';
		m_sbuf1.safeStrcpy("+\"");
		//p += ucToUtf8(p, pend-p, m_quote2, m_quoteLen2, csStr, 0,0);
		m_sbuf1.safeMemcpy ( m_quote2 , m_quoteLen2 );
		//memcpy ( p , m_quote2 , m_quoteLen2 ); p += m_quoteLen2 ;
		//*p++ = '\"';
		m_sbuf1.safeStrcpy("\"");
		// add to spell checked buf, too
		//if ( p2 > pstart2 ) *p2++ = ' ';
		if ( m_sbuf2.length() ) m_sbuf2.pushChar(' ');
		//*p2++ = '+';
		//*p2++ = '\"';
		m_sbuf2.safeStrcpy("+\"");
		//p2+=ucToUtf8(p2, pend2-p2, m_quote2, m_quoteLen2, csStr,0,0);
		m_sbuf2.safeMemcpy ( m_quote2 , m_quoteLen2 );
		//memcpy ( p2 , m_quote2 , m_quoteLen2 ); p2 += m_quoteLen2 ;
		//*p2++ = '\"';
		m_sbuf2.safeStrcpy("\"");
	}
	//if (g_errno == EILSEQ){ // illegal character seq
	//	g_errno = 0;
	//	if (qcs == csUTF8) {qcs = csISOLatin1;goto doOver;}
	//	if (qcs != csISOLatin1) {qcs = csUTF8;goto doOver;}
	//}
	
	// append plus terms
	if ( m_plusLen > 0 ) {
		char *s = m_plus, *send = m_plus + m_plusLen;
		//if ( p > pstart && p < pend ) *p++  = ' ';
		//if ( p2 > pstart2 && p2 < pend2) *p2++ = ' ';
		if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
		if ( m_sbuf2.length() ) m_sbuf2.pushChar(' ');
		while (s < send) {
			while (isspace(*s) && s < send) s++;
			char *s2 = s+1;
			if (*s == '\"') {
				// if there's no closing quote just treat
				// the end of the line as such
				while (*s2 != '\"' && s2 < send) s2++;
				if (s2 < send) s2++;
			} else {
				while (!isspace(*s2) && s2 < send) s2++;
			}
			if (s < send) break;
			//if (p < pend) *p++ = '+';
			//if (p2 < pend2) *p2++ = '+';
			m_sbuf1.pushChar('+');
			m_sbuf2.pushChar('+');
			//p += ucToUtf8(p, pend-p, s, s2-s, csStr, 0,0);
			//p2 += ucToUtf8(p2, pend2-p2, s, s2-s, csStr, 0,0);
			m_sbuf1.safeMemcpy ( s , s2 - s );
			m_sbuf2.safeMemcpy ( s , s2 - s );
			/*
			if (g_errno == EILSEQ) { // illegal character seq
				g_errno = 0;
				if (qcs == csUTF8) {
					qcs = csISOLatin1;
					goto doOver;
				}
				if (qcs != csISOLatin1) {
					qcs = csUTF8;
					goto doOver;
				}
			}
			*/
			s = s2 + 1;
			if (s < send) {
				//if (p < pend) *p++ = ' ';
				//if (p2 < pend2) *p2++ = ' ';
				if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
				if ( m_sbuf2.length() ) m_sbuf2.pushChar(' ');
			}
		}

	}  
	// append minus terms
	if ( m_minusLen > 0 ) {
		char *s = m_minus, *send = m_minus + m_minusLen;
		//if ( p > pstart && p < pend ) *p++  = ' ';
		//if ( p2 > pstart2 && p2 < pend2) *p2++ = ' ';
		if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
		if ( m_sbuf2.length() ) m_sbuf2.pushChar(' ');
		while (s < send) {
			while (isspace(*s) && s < send) s++;
			char *s2 = s+1;
			if (*s == '\"') {
				// if there's no closing quote just treat
				// the end of the line as such
				while (*s2 != '\"' && s2 < send) s2++;
				if (s2 < send) s2++;
			} else {
				while (!isspace(*s2) && s2 < send) s2++;
			}
			if (s < send) break;
			//if (p < pend) *p++ = '-';
			//if (p2 < pend2) *p2++ = '-';
			m_sbuf1.pushChar('-');
			m_sbuf2.pushChar('-');
			//p += ucToUtf8(p, pend-p, s, s2-s, csStr, 0,0);
			//p2 += ucToUtf8(p2, pend2-p2, s, s2-s, csStr, 0,0);
			m_sbuf1.safeMemcpy ( s , s2 - s );
			m_sbuf2.safeMemcpy ( s , s2 - s );
			/*
			if (g_errno == EILSEQ) { // illegal character seq
				g_errno = 0;
				if (qcs == csUTF8) {
					qcs = csISOLatin1;
					goto doOver;
				}
				if (qcs != csISOLatin1) {
					qcs = csUTF8;
					goto doOver;
				}
			}
			*/
			s = s2 + 1;
			if (s < send) {
				//if (p < pend) *p++ = ' ';
				//if (p2 < pend2) *p2++ = ' ';
				if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
				if ( m_sbuf2.length() ) m_sbuf2.pushChar(' ');
			}
		}
	}
	// append gbkeyword:numinlinks if they have &mininlinks=X, X>0
	long minInlinks = m_hr->getLong("mininlinks",0);
	if ( minInlinks > 0 ) {
		//if ( p > pstart ) *p++ = ' ';
		if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
		//char *str = "gbkeyword:numinlinks";
		//long  len = gbstrlen(str);
		//memcpy ( p , str , len );
		//p += len;
		m_sbuf1.safePrintf ( "gbkeyword:numinlinks");
	}

	// null terms
	m_sbuf1.pushChar('\0');
	m_sbuf2.pushChar('\0');
	m_sbuf3.pushChar('\0');

	// the natural query
	m_displayQuery = m_sbuf2.getBufStart();// + displayQueryOffset;

	if ( ! m_displayQuery ) m_displayQuery = "";

	while ( *m_displayQuery == ' ' ) m_displayQuery++;

	m_displayQueryLen = gbstrlen(m_displayQuery);//p-m_displayQuery


	//log("query: got query %s",m_sbuf1.getBufStart());
	//log("query: got display query %s",m_displayQuery);

	// urlencoded display query
	urlEncode(m_qe,
		  MAX_QUERY_LEN*2,
		  m_displayQuery,
		  m_displayQueryLen);
	
	return true;
}

Exemplo n.º 28

0

Exibir arquivo

Arquivo: SpiderProxy.cpp Projeto: privacore/open-source-search-engine

// . when the Conf::m_proxyIps parm is updated we call this to rebuild
//   s_iptab, our table of SpiderProxy instances, which has the proxies and 
//   their performance statistics.
// . we try to maintain stats of ip/ports that did NOT change when rebuilding.
bool buildProxyTable ( ) {

	// scan the NEW list of proxy ip/port pairs in g_conf
	char *p = g_conf.m_proxyIps.getBufStart();

	HashTableX tmptab;
	tmptab.set(8,0,16,NULL,0,false,"tmptab");

	// scan the user inputted space-separated list of ip:ports
	// (optional username:password@ip:port)
	for ( ; *p ; ) {
		// skip white space
		if ( is_wspace_a(*p) ) { p++; continue; }

		// skip http://
		if ( strncasecmp(p,"http://",7) == 0 ) { p += 7; continue; }

		// scan in an ip:port
		char *s = p; char *portStr = NULL;
		int32_t dc = 0, pc = 0, gc = 0, bc = 0;
		const char *msg;

		char *usernamePwd = NULL;
		int32_t usernamePwdLen = 0;
		char *ipStart = p;

		// scan all characters until we hit \0 or another whitespace
		for ( ; *s && !is_wspace_a(*s); s++) {

			if ( *s == '@' ) {
				// must be username:pwd
				if ( pc != 1 ) {
					msg = "bad username:password";
					goto hadError;
				}
				usernamePwd = p;
				usernamePwdLen = s - p;
				if ( usernamePwdLen >= MAXUSERNAMEPWD-2 ) {
					msg = "username:password too long";
					goto hadError;
				}
				dc = 0;
				gc = 0;
				bc = 0;
				pc = 0;
				portStr = NULL;
				ipStart = s+1;
				continue;
			}

			if ( *s == '.' ) { dc++; continue; }
			if ( *s == ':' ) { portStr=s; pc++; continue; }
			if ( is_digit(*s) ) { gc++; continue; }
			bc++;
			continue;
		}
		// ensure it is a legit ip:port combo
		msg = NULL;
		if ( gc < 4 ) 
			msg = "not enough digits for an ip";
		if ( pc > 1 )
			msg = "too many colons";
		if ( dc != 3 )
			msg = "need 3 dots for an ip address";
		if ( bc )
			msg = "got illegal char in ip:port listing";
		if ( msg ) {
		hadError:
			char c = *s;
			*s = '\0';
			log("buf: %s for %s",msg,p);
			*s = c;
			return false;
		}

		// convert it
		int32_t iplen = s - ipStart;
		if ( portStr ) iplen = portStr - ipStart;
		int32_t ip = atoip(ipStart,iplen);
		// another sanity check
		if ( ip == 0 || ip == -1 ) {
			log("spider: got bad proxy ip for %s",p);
			return false;
		}

		// and the port default is 80
		int32_t port = 80;
		if ( portStr ) port = atol2(portStr+1,s-portStr-1);
		if ( port < 0 || port > 65535 ) {
			log("spider: got bad proxy port for %s",p);
			return false;
		}


		// . we got a legit ip:port
		// . see if already in our table
		uint64_t ipKey = (uint32_t)ip;
		ipKey <<= 16;
		ipKey |= (uint16_t)(port & 0xffff);

		// also store into tmptable to see what we need to remove
		tmptab.addKey(&ipKey);

		// see if in table
		int32_t islot = s_iptab.getSlot( &ipKey);

		// advance p
		p = s;

		// if in there, keep it as is
		if ( islot >= 0 ) continue;

		// otherwise add new entry
		SpiderProxy newThing;
		memset ( &newThing , 0 , sizeof(SpiderProxy));
		newThing.m_ip = ip;
		newThing.m_port = port;
		newThing.m_lastDownloadTookMS = -1;
		newThing.m_lastSuccessfulTestMS = -1;

		gbmemcpy(newThing.m_usernamePwd,usernamePwd,usernamePwdLen);
		// ensure it is NULL terminated
		newThing.m_usernamePwd[usernamePwdLen] = '\0';

		if ( ! s_iptab.addKey ( &ipKey, &newThing ) )
			return false;
	}		

 redo:
	int32_t removed = 0;
	// scan all SpiderProxies in tmptab
	for ( int32_t i = 0 ; i < s_iptab.getNumSlots() ; i++ ) {
		// skip empty buckets in hashtable s_iptab
		if ( ! s_iptab.m_flags[i] ) continue;
		// get the key
		int64_t key = *(int64_t *)s_iptab.getKeyFromSlot(i);
		// must also exist in tmptab, otherwise it got removed by user
		if ( tmptab.isInTable ( &key ) ) continue;
		// skip if not in table
		if ( s_iptab.getSlot ( &key ) < 0 ) {
			log("sproxy: iptable hashing messed up");
			continue;
		}
		// shoot, it got removed. not in the new list of ip:ports
		s_iptab.removeKey ( &key );
		removed++;
		// hashtable is messed up now, start over
		//goto redo;
	}
	if ( removed ) goto redo;
	return true;
}

Exemplo n.º 29

0

Exibir arquivo

Arquivo: HttpMime.cpp Projeto: privacore/open-source-search-engine

void HttpMime::addCookie(const httpcookie_t &cookie, const Url &currentUrl, SafeBuf *cookieJar) {
	// don't add expired cookie into cookie jar
	if (cookie.m_expired) {
		return;
	}

	if (cookie.m_domain) {
		cookieJar->safeMemcpy(cookie.m_domain, cookie.m_domainLen);
		cookieJar->pushChar('\t');
		cookieJar->safeStrcpy(cookie.m_defaultDomain ? "FALSE\t" : "TRUE\t");
	} else {
		cookieJar->safeMemcpy(currentUrl.getHost(), currentUrl.getHostLen());
		cookieJar->pushChar('\t');

		cookieJar->safeStrcpy("FALSE\t");
	}

	if (cookie.m_path) {
		cookieJar->safeMemcpy(cookie.m_path, cookie.m_pathLen);
		cookieJar->pushChar('\t');
	} else {
		if (currentUrl.getPathLen()) {
			cookieJar->safeMemcpy(currentUrl.getPath(), currentUrl.getPathLen());
		} else {
			cookieJar->pushChar('/');
		}
		cookieJar->pushChar('\t');
	}

	if (cookie.m_secure) {
		cookieJar->safeStrcpy("TRUE\t");
	} else {
		cookieJar->safeStrcpy("FALSE\t");
	}

	// we're not using expiration field
	cookieJar->safeStrcpy("0\t");

	int32_t currentLen = cookieJar->length();
	cookieJar->safeMemcpy(cookie.m_cookie, cookie.m_cookieLen);

	// cater for multiline cookie
	const char *currentPos = cookieJar->getBufStart() + currentLen;
	const char *delPosStart = NULL;
	int32_t delLength = 0;
	while (currentPos < cookieJar->getBufPtr() - 1) {
		if (delPosStart) {
			if (is_wspace_a(*currentPos) || *currentPos == '\n' || *currentPos == '\r') {
				++delLength;
			} else {
				break;
			}
		} else {
			if (*currentPos == '\n' || *currentPos == '\r') {
				delPosStart = currentPos;
				++delLength;
			}
		}

		++currentPos;
	}
	cookieJar->removeChunk1(delPosStart, delLength);

	/// @todo ALC handle httpOnly attribute

	cookieJar->pushChar('\n');
}