C++ (Cpp) is_alnum_a Examples

Example #1

0

Show file

File: Entities.cpp Project: BlaBlaNet/open-source-search-engine

// . parses an html entity ("&name;", "&#123;" or "&#x1f;") that begins at "s"
// . the caller should make sure that s[maxLen] is the terminating \0
// . on success stores the decoded character in *c and returns the number of
//   chars the entity occupies (the trailing ';' is counted when present)
// . returns 0 if "s" does not start a recognized entity. in that case *c is
//   left untouched when we bail before decoding, and is 0 otherwise.
int32_t getEntity_a ( char *s , int32_t maxLen , uint32_t *c ) {
	// every entity begins with an ampersand
	if ( *s != '&' ) return 0;
	// a '#' right behind the ampersand marks a numeric entity
	const bool isNumeric = ( s[1] == '#' );
	// n is the running length of the candidate entity
	int32_t n = isNumeric ? 2 : 1;
	// swallow alnum chars, but give up scanning at index 9 to save time
	for ( ; n < maxLen && n < 9 ; n++ )
		if ( ! is_alnum_a ( s[n] ) ) break;
	// the closing semicolon is optional here; count it if we have it
	if ( n < maxLen && s[n] == ';' ) n++;
	// shortest entity is 3 chars (&gt), longest we know is 10
	if ( n < 3 || n > 10 ) return 0;
	// hand the whole thing ("&#12...;" or "&acute...;") to the decoder
	if ( ! isNumeric )      *c = getTextEntity        ( s , n );
	else if ( s[2] == 'x' ) *c = getHexadecimalEntity ( s , n );
	else                    *c = getDecimalEntity     ( s , n );
	// a decoded value of 0 means it was not really an entity
	return *c ? n : 0;
}

Example #2

0

Show file

File: Entities.cpp Project: lemire/open-source-search-engine

// . parses an html entity ("&name;", "&#123;" or "&#x1f;") that begins at "s"
// . the caller should make sure that s[maxLen] is the terminating \0
// . the entity MUST be closed by a ';' -- lenient browser-style references
//   without one are rejected
// . on success stores the decoded character in *c and returns the full length
//   of the entity including the ';'. returns 0 otherwise.
int32_t getEntity_a ( const char *s , int32_t maxLen , uint32_t *c ) {
	// every entity begins with an ampersand
	if ( *s != '&' ) return 0;

	// a '#' right behind the ampersand marks a numeric entity
	const bool numeric = ( s[1] == '#' );

	// pos walks over the entity body. stop at index 9 to save time.
	int32_t pos = numeric ? 2 : 1;
	while ( pos < maxLen && pos < 9 ) {
		if ( ! is_alnum_a( s[pos] ) ) break;
		++pos;
	}

	// no terminating semicolon means no valid character reference
	if ( pos == maxLen || s[pos] != ';' ) return 0;

	// length including the semicolon
	const int32_t len = pos + 1;

	// shortest entity is 3 chars, nothing is longer than "&curren;"
	if ( len > 10 || len < 3 ) return 0;

	// hand the whole thing ("&#12...;" or "&acute...;") to the decoder
	if ( ! numeric ) {
		*c = getTextEntity( s, len );
	} else if ( s[2] == 'x' ) {
		*c = getHexadecimalEntity( s, len );
	} else {
		*c = getDecimalEntity( s, len );
	}

	// a decoded value of 0 means it was not really an entity
	return *c ? len : 0;
}

Example #3

0

Show file

File: fctypes.cpp Project: privacore/open-source-search-engine

// . counts the words in the first "len" bytes of "s"
// . a word is a maximal run of ascii alnum chars and apostrophes
// . "titleVersion" is not used by this routine
int32_t getNumWords ( char *s , int32_t len, int32_t titleVersion ) {
	int32_t total = 0;
	// was the previous byte part of a word?
	bool prevInWord = false;
	for ( int32_t k = 0 ; k < len ; k++ ) {
		const bool inWord = is_alnum_a ( s[k] ) || s[k] == '\'';
		// count each word once, when we step onto its first char
		if ( inWord && ! prevInWord ) total++;
		prevInWord = inWord;
	}
	return total;
}

Example #4

0

Show file

File: Words.cpp Project: BillWangCS/open-source-search-engine

// . returns true if word #wordi is glued to a neighbor by url-like punctuation:
//   - it is not the last word and is directly followed by '/'
//   - it is not the last word and is directly followed by '.' or '?' which
//     in turn is directly followed by an ascii alnum char
//   - it is not the first word and is directly preceded by '/' or '?'
// . returns false otherwise
unsigned char Words::isBounded(int wordi) {
	// checks on what follows the word only apply if another word follows
	if ( wordi + 1 < m_numWords ) {
		// the char right after the end of this word
		const char after = getWord(wordi)[getWordLen(wordi)];
		if ( after == '/' )
			return(true);
		if ( ( after == '.' || after == '?' ) &&
		     is_alnum_a(getWord(wordi)[getWordLen(wordi)+1]) )
			return(true);
	}
	// checks on what precedes the word only apply if a word precedes it
	if ( wordi > 0 ) {
		// the char right before the start of this word
		const char before = getWord(wordi)[-1];
		if ( before == '/' || before == '?' )
			return(true);
	}
	return(false);
}

Example #5

0

Show file

File: XmlNode.cpp Project: Doken-Tokuyama/open-source-search-engine

// . maps the tag name at the start of "s" to its node id
// . the name ends at the first char that is not ascii alnum, '-' or '_'
// . the lookup is case insensitive (names are hashed as upper case)
// . if "retp" is non-NULL it receives the matching NodeType, or NULL
// . returns 0 if the tag name is unknown
// . the lookup table is built from g_nodes[] on the very first call
nodeid_t getTagId ( char *s , NodeType **retp ) {

	// one-time table construction
	static bool s_init = false;
	static HashTableX  s_ht;
	static char s_buf[10000];
	if ( ! s_init ) {
		// flip the flag first: the sanity call at the bottom of this
		// block re-enters this function and must skip the init
		s_init = true;
		// NOTE(review): key and data sizes are given as 4 bytes but
		// an int64_t key and a NodeType* are passed in below -- confirm
		// this is what HashTableX expects on 64-bit builds
		s_ht.set ( 4 ,4,1024,s_buf,10000,false,0,"tagids");//niceness=0
		// number of entries in g_nodes
		static int32_t nn = sizeof(g_nodes) / sizeof(NodeType);
		// index every node type by the hash of its name
		for ( int32_t k = 0 ; k < nn ; k++ ) {
			NodeType *entry   = &g_nodes[k];
			char     *tagName = g_nodes[k].m_nodeName;
			int32_t   tagLen  = gbstrlen(tagName);
			int64_t   key     = hash64Upper_a ( tagName,tagLen,0LL );
			if ( s_ht.addKey(&key,&entry) ) continue;
			// could not add it, core
			char *xx=NULL;*xx=0;
		}
		// the table must not have grown out of the static buffer
		if ( s_ht.m_numSlots != 1024 ) { char *xx=NULL;*xx=0; }
		// make sure a well known tag resolves correctly
		if ( getTagId ( "br" ) != TAG_BR ) { char *xx=NULL;*xx=0; }
	}


	// walk to the end of the tag name. hyphens are allowed, and so are
	// underscores since facebook has tags like <start_time>
	char *nameEnd = s;
	while ( *nameEnd ) {
		const char ch = *nameEnd;
		if ( ! is_alnum_a(ch) && ch != '-' && ch != '_' ) break;
		nameEnd++;
	}
	// hash the name the same way the table keys were hashed
	int64_t key = hash64Upper_a ( s , nameEnd - s , 0 );
	NodeType **slot = (NodeType **)s_ht.getValue(&key);
	// unknown tag
	if ( ! slot ) {
		if ( retp ) *retp = NULL;
		return 0;
	}
	// known tag
	if ( retp ) *retp = *slot;
	return (*slot)->m_nodeId;
}

Example #6

0

Show file

File: HttpRequest.cpp Project: BKJackson/open-source-search-engine

// . looks for "field=value" inside the cookie text
//   [m_cookiePtr, m_cookiePtr+m_cookieLen) and returns a pointer to "value"
// . the returned pointer points INTO the cookie buffer and the value is
//   \0 terminated in place, so this call modifies the cookie buffer
// . if "len" is non-NULL it is set to the length of the value (0 if the
//   field is not found)
// . returns "defaultStr" if there is no cookie or the field is not present
// . a match located after m_metaCookie is returned right away. a match
//   located before it is only remembered and returned if no later match
//   is found.
// . the original note on this function reads: *next is set to ptr into
//   m_cgiBuf so that the next successive call to getString with the SAME
//   "field" will start at *next, so the same cgi parameter can be used
//   multiple times (like strstr kind of).
//   NOTE(review): "next" is never referenced in this body -- confirm whether
//   that behavior is still expected by callers.
char *HttpRequest::getStringFromCookie ( char *field      ,
					 long *len        ,
					 char *defaultStr ,
					 long *next       ) {
	// get field len
	long flen = gbstrlen(field);
	// assume none
	if ( len ) *len = 0;
	// if no cookie, forget it
	if ( ! m_cookiePtr ) return defaultStr;
	// the end of the cookie
	//char *pend = m_cookieBuf + m_cookieBufLen;
	char *pend = m_cookiePtr + m_cookieLen;
	char *p    = m_cookiePtr;
	// skip over spaces and punct, stop on the first alnum char
	for ( ; p && p < pend ; p++ ) 
		if ( is_alnum_a(*p) ) break;
	// skip "Cookie:" (case insensitive)
	if ( p + 7 < pend && ! strncasecmp(p,"cookie:",7) ) p += 7;
	// skip spaces after that, again up to the first alnum char
	for ( ; p && p < pend ; p++ ) 
		if ( is_alnum_a(*p) ) break;
	// crazy? nothing but non-alnum chars in the cookie
	if ( p >= pend ) return defaultStr;

	// a match found outside the meta cookie is parked here
	char *savedVal = NULL;
	// so we do not skip the first cookie, jump right in!
	// otherwise we lose the calendar cookie for msie
	// (this jumps INTO the body of the loop below, past the code that
	// searches for the next \0 separator. a "continue" from there still
	// runs the loop's p++ and the p < pend test.)
	goto entryPoint;
	// . loop over all xxx=yyy\0 thingies in the cookie
	// . we converted every '&' to a \0 when the cookiebuf was set above
	//for ( char *p = m_cookieBuf ; *p ; p += gbstrlen(p) + 1 ) {
	// . no, we just keep them as &'s because seems like cookies use ;'s
	//   as delimeters not so much &'s. and when we log the cookie in the
	//   log, i wanted to see the whole cookie, so having \0's in the
	//   cookie was messing that up.
	for ( ; p < pend ; p++ ) { 
		// need a \0
		// fixes "display=0&map=0&calendar=0;" that is only one cookie.
		// so do not grap value of map or calendar from that!!
		if ( *p ) continue;
		// back to back \0's? be careful how we skip over them!
		// NOTE(review): reads p[1], which is *pend when p is the last
		// byte -- assumes the buffer is readable one byte past the end
		if ( ! p[1] ) continue;
		// skip that
		if ( ++p >= pend ) break;
		// skip whitespace that follows
		for ( ; p < pend ; p++ ) 
			if ( ! is_wspace_a(*p) ) break;
		// end of cookie?
		if ( p >= pend ) break;
	entryPoint:
		// check first char
		if ( *p != *field ) continue;
		// does it match? continue if not a match (case sensitive)
		if ( strncmp ( p , field , flen ) ) continue;
		// point to value
		char *val = p + flen;
		// must be an equal sign
		if ( *val != '=' ) continue;
		// skip that sign
		val++;
		// . cookies terminate fields by space or ; or &
		// . skip to end of cookie value for this field
		char *e = val;
		// skip over alnum. might also be \0 if this function
		// was already called somewhere else!
		// we NULL separated each cookie and then urldecoded each
		// cookie above in the m_cookieBuf logic. cookies can contain
		// encoded ;'s and &'s so i took this checks out of this while
		// loop. like the widgetHeader has semicolons in it and it
		// stores in the cookie.
		while ( e < pend && *e ) e++;
		// that is the length
		if ( len ) *len = e - val;
		// NULL terminate it, we should have already logged the cookie
		// so it should be ok to NULL terminate now. we already
		// call urlDecode() now above... and make the &'s into \0's
		// NOTE(review): if no \0 was found then e == pend and this
		// writes one byte past the cookie text -- confirm the buffer
		// has room for it
		*e = '\0';
		// if we were in the meta cookie, return that...
		// otherwise if you visited this site before metacookies
		// were used you might have the cookie outside the meta
		// cookie AND inside the metacookie, and only the value
		// inside the metacookie is legit...
		if ( val > m_metaCookie ) return val;
		// otherwise, save it and try to get from meta cookie
		savedVal = val;
		// length
		//if ( len ) *len = gbstrlen(val);
		// this is the value!
		//return val;
	}
	// did we save something?
	if ( savedVal ) return savedVal;
	// no match
	return defaultStr;
}

Example #7

0

Show file

File: XmlNode.cpp Project: Doken-Tokuyama/open-source-search-engine

// . finds attribute "field" inside this tag and returns a pointer to its
//   value. the pointer points into m_node and is NOT \0 terminated.
// . *valueLen receives the length of the value (quotes are not included)
// . attribute names are compared case insensitively
// . returns NULL (and *valueLen of 0) if the attribute is missing or is not
//   followed by an equal sign
char *XmlNode::getFieldValue ( char *field , int32_t *valueLen ) {
	// assume no value
	*valueLen = 0;
	const int32_t nameLen = gbstrlen(field);
	// holds the open quote char while we are inside a quoted value
	char quote = '\0';
	int32_t pos = 1;

	// hunt for the attribute name. pos starts at 1 to skip the '<'.
	for ( ; pos + nameLen < m_nodeLen ; pos++ ) {
		const char ch = m_node[pos];
		// inside a quoted value nothing counts until the quote closes
		if ( quote ) {
			if ( ch == quote ) quote = 0;
			continue;
		}
		// a quote opens a quoted value
		if ( ch == '\"' || ch == '\'' ) {
			quote = ch;
			continue;
		}
		// a name can not begin in the middle of an alnum run
		if ( is_alnum_a ( m_node[pos-1] ) ) continue;
		// cheap test on the first char before the full compare
		if ( to_lower_a ( ch ) != to_lower_a ( field[0] ) ) continue;
		// the name must be directly followed by '=' or whitespace
		const char follow = m_node[pos+nameLen];
		if ( follow != '=' && ! is_wspace_a ( follow ) ) continue;
		// full compare, done on a hit
		if ( strncasecmp ( m_node + pos , field , nameLen ) == 0 ) break;
	}

	// the scan ran off the end, so the attribute is not there
	if ( pos + nameLen >= m_nodeLen ) return NULL;

	// step over the name, landing on the '=' or on whitespace
	pos += nameLen;

	// step over whitespace in front of the '='
	while ( pos < m_nodeLen && is_wspace_a ( m_node[pos] ) ) pos++;

	// the next char has to be the '=', step over it as well
	if ( pos < m_nodeLen ) {
		if ( m_node[pos] != '=' ) return NULL;
		pos++;
	}

	// step over whitespace behind the '='
	while ( pos < m_nodeLen && is_wspace_a ( m_node[pos] ) ) pos++;

	// the value may be wrapped in single or double quotes
	quote = '\0';
	if ( m_node[pos] == '\"' || m_node[pos] == '\'' ) quote = m_node[pos++];

	// the value begins here
	int start = pos;

	// a quoted value runs up to the matching quote, an unquoted one up to
	// whitespace or the end of the tag
	for ( ; pos < m_nodeLen ; pos++ ) {
		const char ch = m_node[pos];
		if ( quote ) {
			if ( ch == quote ) break;
		}
		else if ( is_wspace_a ( ch ) || ch == '>' ) break;
	}

	*valueLen = pos - start;
	return m_node + start;
}

Example #8

0

Show file

File: XmlNode.cpp Project: Doken-Tokuyama/open-source-search-engine

// . the Xml class calls this to parse the single node that begins at "node"
// . a node is a CDATA section, a run of text, a comment or a tag
// . fills in this XmlNode and returns the node's length in bytes
// . "version" is not used by this routine
// . TODO: "node" is now guaranteed to be \0 terminated -- make this faster
int32_t XmlNode::set ( char *node , bool pureXml , int32_t version ) {
	// remember where the node begins
	m_node = node;

	// on the first call make sure g_nodes[] is ordered by node id
	static bool s_check = false;
	if ( ! s_check ) {
		s_check = true;
		// number of entries in g_nodes
		static int32_t nn = sizeof(g_nodes) / sizeof(NodeType);
		for ( int32_t k = 0 ; k < nn ; k++ ) {
			if ( g_nodes[k].m_nodeId == k ) continue;
			// table is out of order, core
			char *xx=NULL;*xx=0;
		}
	}

	// . clear this here rather than in Links.cpp because sometimes we
	//   think an anchor tag indicates a link when it is really just an
	//   <a href="javascript:..."> function call that Links.cpp ignored,
	//   yet we expect this to be valid
	m_isSelfLink = 0;

	// . CDATA sections used to be treated as text nodes, now they are
	//   their own kind of tag node. gb.conf and others always pass 0
	//   as their version.
	// . compare char by char and stop on the first mismatch so we never
	//   read beyond a terminating \0
	static const char s_cdataOpen[] = "<![CDATA[";
	int32_t matched = 0;
	while ( matched < 9 && node[matched] == s_cdataOpen[matched] )
		matched++;
	if ( matched == 9 )
		return setCDATANode ( node );

	// anything that does not open a tag is a text node
	if ( *node != '<' || ! isTagStart ( node ) ) {
		// text nodes have a node id of 0 and no hash
		m_nodeId     = 0;
		m_node       = node;
		m_hasBackTag = false;
		m_hash       = 0;
		// the text runs until the \0 or the start of the next tag
		char *p = node;
		for ( ; *p ; p++ )
			if ( *p == '<' && isTagStart ( p ) ) break;
		m_nodeLen    = p - node;
		m_pairTagNum = -1;
		return m_nodeLen;
	}

	// . comments are special since they end on "-->"
	// . <![if ....]> is treated as a comment as well
	if ( node[1] == '!' ) {
		if ( node[2] == '-' && node[3] == '-' )
			return setCommentNode ( node );
		if ( node[2] == '[' )
			return setCommentNode2 ( node );
	}

	// . a regular tag then, though it might be <!DOCTYPE ...> or similar
	m_nodeLen = getTagLen ( node );

	// . the name begins right behind the '<', or one char later if that
	//   char is not alnum (the / of a back tag, or ? or !)
	// . tag names must start with a letter, fwiw
	int32_t nameBegin = is_alnum_a ( node[1] ) ? 1 : 2;
	// . the name ends on the first char that is not a tag name char
	// . only ascii chars are allowed in tag names
	int32_t nameEnd = nameBegin;
	while ( nameEnd < m_nodeLen && is_tagname_char ( node[nameEnd] ) )
		nameEnd++;
	m_tagName    = &node [ nameBegin ];
	m_tagNameLen = nameEnd - nameBegin;

	// . the hash is what we compare on since it beats strcmp
	// . tag names are never utf8, so the ascii upper case hash will do
	m_hash = hash64Upper_a ( m_tagName , m_tagNameLen , 0LL);

	// in pure xml these are set before the node info lookup
	if ( pureXml ) {
		m_hasBackTag = true;
		m_isBreaking = true;
		m_isVisible  = true;
	}
	// look up the node id for this tag name
	m_nodeId = setNodeInfo ( m_hash );

	// . a tag that ends in "/>" or "?>" has no back tag
	// . this used to apply to pure xml only, now it applies to all tags
	const char beforeClose = m_node [ m_nodeLen - 2 ];
	if ( beforeClose == '/' || beforeClose == '?' ) m_hasBackTag = false;

	return m_nodeLen;
}

Example #9

0

Show file

File: Words.cpp Project: BillWangCS/open-source-search-engine

// . splits the text at "s" into "words" and appends them to this object's
//   arrays: m_words (start ptr), m_wordLens (byte length), m_wordIds (hash,
//   0 for tags and punctuation) and m_tagIds (tag id, 0 for non-tags)
// . three kinds of words are produced: html tags (only if m_hasTags is set),
//   runs of non-alnum chars ("punct words") and runs of alnum chars
// . scanning stops at offset "nodeLen", at a \0, or as soon as m_numWords
//   has reached m_preCount
// . if "computeWordIds" is true each alnum word gets the hash of its lower
//   cased utf8 bytes as its word id
// . "niceness" is only handed to QUICKPOLL()
// . always returns true
bool Words::addWords(char *s,long nodeLen,bool computeWordIds, long niceness) {
	// i is the current byte offset into s
	long  i = 0;
	// j is the offset where the current alnum word started
	long  j;
	//long  k = 0;
	long  wlen;
	//unsigned long e;
	//long  skip;
	// NOTE(review): never incremented in the live code below, only in the
	// commented out block near the end, so the log at "done" can not fire
	long badCount = 0;

	// set after stepping over one apostrophe inside the current alnum word
	bool hadApostrophe = false;

	// script of the most recently seen word char
	UCScript oldScript = ucScriptCommon;
	UCScript saved;
	UCProps props;

 uptop:

	// bad utf8 can cause a breach
	if ( i >= nodeLen ) goto done;

	if ( ! s[i] ) goto done;

	// not at an alnum char, so this is either a tag or a punct word
	if ( ! is_alnum_utf8(s+i) ) { // && m_numWords < m_preCount ) {

		if ( m_numWords >= m_preCount ) goto done;

		// tag?
		// NOTE(review): m_tagIds is written in this branch without the
		// NULL check that the other branches do -- presumably it is
		// always allocated when m_hasTags is set; confirm
		if ( s[i]=='<' && m_hasTags && isTagStart(s+i) ) {
			// get the tag id
			if ( s[i+1]=='/' ) {
				// skip over /
				m_tagIds [m_numWords] = ::getTagId(s+i+2);
				m_tagIds [m_numWords] |= BACKBIT;
			}
			else
				m_tagIds [m_numWords] = ::getTagId(s+i+1);
			// word start
			m_words    [m_numWords] = s + i;
			m_wordIds  [m_numWords] = 0LL;
			// skip till end
			long tagLen = getTagLen(s+i); // ,niceness);
			m_wordLens [m_numWords] = tagLen;
			m_numWords++;
			// advance
			i += tagLen;
			goto uptop;
		}

		// it is a punct word, find end of it
		char *start = s+i;
		//for (;s[i] && ! is_alnum_utf8(s+i);i+=getUtf8CharSize(s+i));
		for ( ; s[i] ; i += getUtf8CharSize(s+i)){
			// stop on < if we got tags
			if ( s[i] == '<' && m_hasTags ) break;
			// breathe
			QUICKPOLL(niceness);
			// if we are simple ascii, skip quickly
			if ( is_ascii(s[i]) ) {
				// accumulate NON-alnum chars
				if ( ! is_alnum_a(s[i]) ) continue;
				// update
				oldScript = ucScriptCommon;
				// otherwise, stop we got alnum
				break;
			}
			// if we are utf8 we stop on special props
			UChar32 c = utf8Decode ( s+i );
			// stop if word char
			if ( ! ucIsWordChar ( c ) ) continue;
			// update first though
			oldScript = ucGetScript ( c );
			// then stop
			break;
		}
		// record the punct word. it gets no word id and no tag id.
		m_words        [ m_numWords  ] = start;
		m_wordLens     [ m_numWords  ] = s+i - start;
		m_wordIds      [ m_numWords  ] = 0LL;
		if (m_tagIds) m_tagIds[m_numWords] = 0;
		m_numWords++;
		goto uptop;
	}

	// get an alnum word
	j = i;
 again:
	//for ( ; is_alnum_utf8 (&s[i] ) ; i += getUtf8CharSize(s+i) );
	for ( ; s[i] ; i += getUtf8CharSize(s+i) ) {
		// breathe
		QUICKPOLL(niceness);
		// simple ascii?
		if ( is_ascii(s[i]) ) {
			// accumulate alnum chars
			if ( is_alnum_a(s[i]) ) continue;
			// update
			oldScript = ucScriptCommon;
			// otherwise, stop we got punct
			break;
		}
		// get the code point of the utf8 char
		UChar32 c = utf8Decode ( s+i );
		// get props
		props = ucProperties ( c );
		// good stuff? these chars never end the word
		if ( props & (UC_IGNORABLE|UC_EXTEND) ) continue;
		// stop? if UC_WORCHAR is set, that means its an alnum
		if ( ! ( props & UC_WORDCHAR ) ) {
			// reset script between words
			oldScript = ucScriptCommon;
			break;
		}
		// save it
		saved = oldScript;
		// update here
		oldScript = ucGetScript(c);
		// treat ucScriptLatin (30) as common so we can have latin1
		// like char without breaking the word!
		if ( oldScript == ucScriptLatin ) oldScript = ucScriptCommon;
		// stop on this crap too i guess. like japanes chars?
		// each such char ends the word right after itself
		if ( props & ( UC_IDEOGRAPH | UC_HIRAGANA | UC_THAI ) ) {
			// include it
			i += getUtf8CharSize(s+i);
			// but stop
			break;
		}
		// script change? then the word ends in front of this char
		if ( saved != oldScript ) break;
	}
	
	// . java++, A++, C++ exception
	// . A+, C+, exception
	// . the plus signs become part of the word, but only if no alnum
	//   char follows them
	// . TODO: consider putting in Bits.cpp w/ D_CAN_BE_IN_PHRASE
	if ( s[i]=='+' ) {
		if ( s[i+1]=='+' && !is_alnum_utf8(&s[i+2]) ) i += 2;
		else if ( !is_alnum_utf8(&s[i+1]) ) i++;
	}
	// . c#, j#, ...
	if ( s[i]=='#' && !is_alnum_utf8(&s[i+1]) ) i++;
	
	// allow for words like we're dave's and i'm
	// only one apostrophe per word: step over it and keep scanning
	if(s[i]=='\''&&s[i+1]&&is_alnum_utf8(&s[i+1])&&!hadApostrophe){
		i++;
		hadApostrophe = true;
		goto again;
	}
	hadApostrophe = false;
	
	// get word length
	wlen = i - j;
	if ( m_numWords >= m_preCount ) goto done;
	m_words   [ m_numWords  ] = &s[j];
	m_wordLens[ m_numWords  ] = wlen;
	// . Lars says it's better to leave the accented chars intact
	// . google agrees
	// . but what about "re'sume"?
	if ( computeWordIds ) {
		long long h = hash64Lower_utf8(&s[j],wlen);
		m_wordIds [m_numWords] = h;
		// until we get an accent removal algo, comment this
		// out and possibly use the query synonym pipeline
		// to search without accents. MDW
		//long long h2 = hash64AsciiLowerE(&s[j],wlen);
		//if ( h2 != h ) m_stripWordIds [m_numWords] = h2;
		//else           m_stripWordIds [m_numWords] = 0LL;
		//m_stripWordIds[m_numWords] = 0;
	}
	if (m_tagIds) m_tagIds[m_numWords] = 0;
	m_numWords++;
	m_numAlnumWords++;
	// break on \0 or MAX_WORDS
	//if ( ! s[i] ) goto done;
	// get a punct word
	goto uptop;
	/*
	  j = i;
	  // delineate the "punctuation" word
	  for ( ; s[i] && !is_alnum_utf8(&s[i]);i+=getUtf8CharSize(s+i));
	  // bad utf8 could cause us to breach the node, so watch out!
	  if ( i > nodeLen ) {
	  badCount++;
	  i = nodeLen;
	  }
	  // get word length
	  wlen = i - j;
	  if ( m_numWords >= m_preCount ) goto done;
	  m_words        [m_numWords  ] = &s[j];
	  m_wordLens     [m_numWords  ] = wlen;
	  m_wordIds      [m_numWords  ] = 0LL;
	  if (m_tagIds) m_tagIds[m_numWords] = 0;
	  m_numWords++;
	*/

 done:
	// bad programming warning
	// more words than were pre-counted: log it and core on purpose
	if ( m_numWords > m_preCount ) {
		log(LOG_LOGIC,
		    "build: words: set: Fix counting routine.");
		char *xx = NULL; *xx = 0;
	}
	// compute total length: from s to the end of the last word
	if ( m_numWords <= 0 ) m_totalLen = 0;
	else m_totalLen = m_words[m_numWords-1] - s + m_wordLens[m_numWords-1];

	if ( badCount )
		log("words: had %li bad utf8 chars",badCount);

	return true;
}

Example #10

0

Show file

File: Words.cpp Project: BillWangCS/open-source-search-engine

// . finds attribute "field" inside the html tag "s" of length "slen" and
//   returns a pointer to its value. the pointer points into "s" and is NOT
//   \0 terminated.
// . *valueLen receives the length of the value (quotes are not included)
// . attribute names are compared case insensitively
// . only the first 2000 bytes of the tag are looked at
// . returns NULL (and *valueLen of 0) if the attribute is missing, if the
//   tag closes before it shows up, or if it is not followed by an equal sign
char *getFieldValue ( char *s , 
		      long  slen ,
		      char *field , 
		      long *valueLen ) {
	// assume no value
	*valueLen = 0;
	const long nameLen = gbstrlen(field);
	// holds the open quote char while we are inside a quoted value
	char quote = '\0';
	long pos = 1;

	// do not scan absurdly long tags
	if ( slen > 2000 ) slen = 2000;

	// hunt for the attribute name. pos starts at 1 to skip the '<'.
	for ( ; pos + nameLen < slen ; pos++ ) {
		const char ch = s[pos];
		// inside a quoted value nothing counts until the quote closes
		if ( quote ) {
			if ( ch == quote ) quote = 0;
			continue;
		}
		// a quote opens a quoted value
		if ( ch == '\"' || ch == '\'' ) {
			quote = ch;
			continue;
		}
		// an unquoted '>' closes the tag, nothing left to find
		if ( ch == '>' ) return NULL;
		// a name can not begin in the middle of an alnum run
		if ( is_alnum_a ( s[pos-1] ) ) continue;
		// cheap test on the first char before the full compare
		if ( to_lower_a ( ch ) != to_lower_a ( field[0] ) ) continue;
		// the name must be directly followed by '=' or whitespace
		const char follow = s[pos+nameLen];
		if ( follow != '=' && ! is_wspace_a ( follow ) ) continue;
		// full compare, done on a hit
		if ( strncasecmp ( s + pos , field , nameLen ) == 0 ) break;
	}

	// the scan ran off the end, so the attribute is not there
	if ( pos + nameLen >= slen ) return NULL;

	// step over the name, landing on the '=' or on whitespace
	pos += nameLen;

	// step over whitespace in front of the '='
	while ( pos < slen && is_wspace_a ( s[pos] ) ) pos++;

	// the next char has to be the '=', step over it as well
	if ( pos < slen ) {
		if ( s[pos] != '=' ) return NULL;
		pos++;
	}

	// step over whitespace behind the '='
	while ( pos < slen && is_wspace_a ( s[pos] ) ) pos++;

	// the value may be wrapped in single or double quotes
	quote = '\0';
	if ( s[pos] == '\"' || s[pos] == '\'' ) quote = s[pos++];

	// the value begins here
	int start = pos;

	// a quoted value runs up to the matching quote, an unquoted one up to
	// whitespace or the end of the tag
	for ( ; pos < slen ; pos++ ) {
		const char ch = s[pos];
		if ( quote ) {
			if ( ch == quote ) break;
		}
		else if ( is_wspace_a ( ch ) || ch == '>' ) break;
	}

	*valueLen = pos - start;
	return s + start;
}

Example #11

0

Show file

File: Collectiondb.cpp Project: BKJackson/open-source-search-engine

// . MDW: TODO: bring this back when we have a subdir for each collection
// . add a new rec
// . returns false and sets g_errno on error
// . use a collnum_t of -1 if it is new
bool Collectiondb::addRec ( char *coll , char *cpc , long cpclen , bool isNew ,
			    collnum_t collnum , bool isDump ,
			    bool saveIt ) {
	// sanity check
	if ( ( isNew && collnum >= 0) ||
	     (!isNew && collnum <  0) ) {
		log(LOG_LOGIC,"admin: Bad parms passed to addRec.");
		char *xx = NULL; *xx = 0;
	}
	// ensure coll name is legit
	char *p = coll;
	for ( ; *p ; p++ ) {
		if ( is_alnum_a(*p) ) continue;
		if ( *p == '-' ) continue;
		break;
	}
	if ( *p ) {
		g_errno = EBADENGINEER;
		log("admin: \"%s\" is a malformed collection name because it "
		    "contains the '%c' character.",coll,*p);
		return false;
	}
	// . scan for holes
	// . i is also known as the collection id
	long i ;
	if ( collnum >= 0 ) i = (long)collnum;
	else for ( i = 0 ; i < m_numRecs ; i++ ) if ( ! m_recs[i] ) break;
	// ceiling?
	if ( i >= MAX_COLLS ) {
		g_errno = ENOBUFS;
		return log("admin: Limit of %li collection reached. "
			   "Collection not created.",(long)MAX_COLLS);
	}
	// if empty... bail, no longer accepted, use "main"
	if ( ! coll || !coll[0] ) {
		g_errno = EBADENGINEER;
		return log("admin: Trying to create a new collection "
			   "but no collection name provided. Use the \"c\" "
			   "cgi parameter to specify it.");
	}
	// or if too big
	if ( gbstrlen(coll) > MAX_COLL_LEN ) {
		g_errno = ENOBUFS;
		return log("admin: Trying to create a new collection "
			   "whose name \"%s\" of %i chars is longer than the "
			   "max of %li chars.",coll,gbstrlen(coll),
			   (long)MAX_COLL_LEN);
	}
		
	// ensure does not already exist in memory
	if ( getCollnum ( coll ) >= 0 ) {
		g_errno = EEXIST;
		return log("admin: Trying to create collection \"%s\" but "
			   "already exists in memory.",coll);
	}
	// MDW: ensure not created on disk since time of last load
	char dname[512];
	sprintf(dname, "%scoll.%s.%li/",g_hostdb.m_dir,coll,i);
	if ( isNew && opendir ( dname ) ) {
		g_errno = EEXIST;
		return log("admin: Trying to create collection %s but "
			   "directory %s already exists on disk.",coll,dname);
	}
	//char fname[512];
	// ending '/' is ALWAYS included in g_hostdb.m_dir
	//sprintf ( fname , "%s%li.%s.conf",g_hostdb.m_dir,i,coll);
	//File f;
	//f.set ( fname );
	//if ( f.doesExist() ) {
	//	g_errno = EEXIST;
	//	return log("admin: Trying to create collection \"%s\" but "
	//		   "file %s already exists on disk.",coll,fname);
	//}
	// create the record in memory
	m_recs[i] = new (CollectionRec);
	if ( ! m_recs[i] ) 
		return log("admin: Failed to allocated %li bytes for new "
			   "collection record for \"%s\".",
			   (long)sizeof(CollectionRec),coll);
	mnew ( m_recs[i] , sizeof(CollectionRec) , "CollectionRec" ); 
	// get copy collection
	CollectionRec *cpcrec = NULL;
	if ( cpc && cpc[0] ) cpcrec = getRec ( cpc , cpclen );
	if ( cpc && cpc[0] && ! cpcrec )
		log("admin: Collection \"%s\" to copy config from does not "
		    "exist.",cpc);
	// get the default.conf from working dir if there
	g_parms.setToDefault( (char *)m_recs[i] );

	if ( isNew ) {
		// the default conf file
		char tmp1[1024];
		sprintf ( tmp1 , "%sdefault.conf" , g_hostdb.m_dir );
		// . set our parms from the file.
		// . accepts OBJ_COLLECTIONREC or OBJ_CONF
		g_parms.setFromFile ( m_recs[i] , NULL , tmp1 );
	}

	// this will override all
	if ( cpcrec ) {
		// copy it, but not the timedb hashtable, etc.
		long size = (char *)&(cpcrec->m_END_COPY) - (char *)cpcrec;
		// JAB: bad memcpy - no donut!
		// this is not how objects are supposed to be copied!!!
		memcpy ( m_recs[i] , cpcrec , size);//sizeof(CollectionRec) );
		// perform the cleanup that a copy constructor might do...
		//for (int rx = 0; rx < MAX_FILTERS; rx++)
		//	m_recs[i]->m_pRegExParser[rx] = NULL;
		// don't NUKE the filters!
		// m_recs[i]->m_numRegExs = 0;
		// OK - done with cleaning up...
		// but never copy over the collection hostname, that is
		// problematic
		m_recs[i]->m_collectionHostname [0] = '\0';
		m_recs[i]->m_collectionHostname1[0] = '\0';
		m_recs[i]->m_collectionHostname2[0] = '\0';
	}

	// set coll id and coll name for coll id #i
	strcpy ( m_recs[i]->m_coll , coll );
	m_recs[i]->m_collLen = gbstrlen ( coll );
	m_recs[i]->m_collnum = i;

	// point to this, so Rdb and RdbBase can reference it
	coll = m_recs[i]->m_coll;

	// . if has no password or ip add the default password, footbar
	// . no, just don't have any password, just use the 127.0.0.1 ip
	//   that is the loopback
	/*
	if ( m_recs[i]->m_numAdminIps  == 0 &&
	     m_recs[i]->m_numAdminPwds == 0    ) {
		m_recs[i]->m_numAdminIps = 1;
		m_recs[i]->m_adminIps[0] = atoip("0.0.0.0",7);
		//strcpy ( m_recs[i]->m_adminPwds[0] , "footbar23" );
		//m_recs[i]->m_numAdminPwds = 1;
		//log("admin: Using default password for new collection of "
		//    "'footbar23'.");
	}
	*/

	// collection name HACK for backwards compatibility
	//if ( strcmp ( coll , "main" ) == 0 ) {
	//	m_recs[i]->m_coll[0] = '\0';
	//	m_recs[i]->m_collLen = 0;
	//	//coll[0] = '\0';
	//}

	// MDW: create the new directory
	if ( isNew ) {
	retry22:
		if ( ::mkdir ( dname , 
			       S_IRUSR | S_IWUSR | S_IXUSR | 
			       S_IRGRP | S_IWGRP | S_IXGRP | 
			       S_IROTH | S_IXOTH ) ) {
			// valgrind?
			if ( errno == EINTR ) goto retry22;
			g_errno = errno;
			mdelete ( m_recs[i] , sizeof(CollectionRec) , 
				  "CollectionRec" ); 
			delete ( m_recs[i]);
			m_recs[i] = NULL;
			return log("admin: Creating directory %s had error: "
				   "%s.", dname,mstrerror(g_errno));
		}
		// save it into this dir... might fail!
		if ( ! m_recs[i]->save() ) {
			mdelete ( m_recs[i] , sizeof(CollectionRec) , 
				  "CollectionRec" ); 
			delete ( m_recs[i]);
			m_recs[i] = NULL;
			return log("admin: Failed to save file %s: %s",
				   dname,mstrerror(g_errno));
		}
	}
	// load if not new
	if ( ! isNew && ! m_recs[i]->load ( coll , i ) ) {
		mdelete ( m_recs[i], sizeof(CollectionRec), "CollectionRec" ); 
		delete ( m_recs[i]);
		m_recs[i] = NULL;
		return log("admin: Failed to load conf for collection "
			   "\"%s\".",coll);
	}
	// mark it as needing to be saved instead
	m_recs[i]->m_needsSave = false;
	// force this to off for now
	//m_recs[i]->m_queryExpansion = false;
	// reserve it
	if ( i >= m_numRecs ) m_numRecs = i + 1;
	// count it
	m_numRecsUsed++;
	// update the time
	updateTime();
	// if we are doing a dump from the command line, skip this stuff
	if ( isDump ) return true;
	bool verify = true;
	if(isNew) verify = false;
	// tell rdbs to add one, too
	//if ( ! g_indexdb.addColl    ( coll, verify ) ) goto hadError;
	if ( ! g_posdb.addColl    ( coll, verify ) ) goto hadError;
	//if ( ! g_datedb.addColl     ( coll, verify ) ) goto hadError;

	if ( ! g_titledb.addColl    ( coll, verify ) ) goto hadError;
	//if ( ! g_revdb.addColl      ( coll, verify ) ) goto hadError;
	//if ( ! g_sectiondb.addColl  ( coll, verify ) ) goto hadError;
	if ( ! g_tagdb.addColl      ( coll, verify ) ) goto hadError;
	//if ( ! g_catdb.addColl      ( coll, verify ) ) goto hadError;
	//if ( ! g_checksumdb.addColl ( coll, verify ) ) goto hadError;
	if ( ! g_spiderdb.addColl   ( coll, verify ) ) goto hadError;
	if ( ! g_doledb.addColl     ( coll, verify ) ) goto hadError;
	//if ( ! g_tfndb.addColl      ( coll, verify ) ) goto hadError;
	if ( ! g_clusterdb.addColl  ( coll, verify ) ) goto hadError;
	if ( ! g_linkdb.addColl     ( coll, verify ) ) goto hadError;
	// debug message
	log ( LOG_INFO, "admin: added collection \"%s\" (%li).",coll,(long)i);
	// tell SpiderCache about this collection, it will create a 
	// SpiderCollection class for it.
	//g_spiderCache.reset1();

	// . make it set is CollectionRec::m_sortByDateTable now
	// . everyone else uses setTimeOfDayInMilliseconds() in fctypes.cpp
	//   to call this function once their clock is synced with host #0
	//if ( g_hostdb.m_initialized && g_hostdb.m_hostId == 0 )
	//	initSortByDateTable(coll);
	//else if ( g_hostdb.m_initialized && isClockInSync() )
	//	initSortByDateTable(coll);
	// . do it for all regard-less
	// . once clock is in sync with host #0 we may do it again!
	//if ( g_hostdb.m_initialized )
	//	initSortByDateTable(coll);

	// success
	return true;
 hadError:
	log("admin: Had error adding new collection: %s.",mstrerror(g_errno));
	// do not delete it, might have failed to add because not enough
	// memory to read in the tree *-saved.dat file on disk!! and if
	// you delete in then core the *-saved.dat file gets overwritten!!!
	return false;
	/*
	g_indexdb.getRdb()->delColl    ( coll );
	g_datedb.getRdb()->delColl     ( coll );
	g_timedb.getRdb()->delColl     ( coll );
	g_titledb.getRdb()->delColl    ( coll );
	g_revdb.getRdb()->delColl      ( coll );
	g_sectiondb.getRdb()->delColl  ( coll );
	g_placedb.getRdb()->delColl    ( coll );
	g_tagdb.getRdb()->delColl      ( coll );
	//g_catdb.getRdb()->delColl      ( coll );
	//g_checksumdb.getRdb()->delColl ( coll );
	g_spiderdb.getRdb()->delColl   ( coll );
	g_doledb.getRdb()->delColl     ( coll );
	g_tfndb.getRdb()->delColl      ( coll );
	g_clusterdb.getRdb()->delColl  ( coll );
	g_linkdb.getRdb()->delColl     ( coll );
	deleteRec                      ( coll );
	return false;
	*/
}

Example #12

0

Show file

File: Entities.cpp Project: privacore/open-source-search-engine

// . s[maxLen] should be the NULL
// . returns full length of entity @ "s" if there is a valid one, 0 otherwise
// . sets *c to the iso character the entity represents (if there is one)
// JAB: const-ness for optimizer...
int32_t getEntity_a ( const char *s, int32_t maxLen, uint32_t codepoint[2], int32_t *codepointCount, int32_t *utf8Len ) {
	//TODO: handle multi-codepoint entitites
	*utf8Len=0;

	// ensure there's an & as first char
	if ( s[0] != '&' ) {
		return 0;
	}

	// compute maximum length of entity, if it's indeed an entity
	int32_t len = 1;
	if ( s[len] == '#' ) {
		len++;
	}

	// cut it off after <32> chars to save time and also to avoid parsing
	// obscenely long incorrect entitites (eg an ampersand followed by 2MB of letters)
	while ( len < maxLen && len < max_entity_name_len && is_alnum_a( s[len] ) ) {
		len++;
	}

	// character entity reference must end with a semicolon.
	// some browsers have lenient parsing, but we don't accept invalid
	// references.
	if ( len == maxLen || s[len] != ';' ) {
		//not a valid character entity reference
		return 0;
	}
	len++;

	// we don't have entities longer than what w3c specified
	if ( len > max_entity_name_len+1 ) {
		return 0;
	}

	// all entites are 3 or more chars (&gt)
	if ( len < 3 ) {
		return 0;
	}

	// . if it's a numeric entity like &#123 use this routine
	// . pass in the whole she-bang: "&#12...;" or "&acute...;
	if ( s[1] == '#' ) {
		if ( s[2] == 'x' ) {
			codepoint[0] = getHexadecimalEntity( s, len );
			*codepointCount = 1;
		} else {
			codepoint[0] = getDecimalEntity( s, len );
			*codepointCount = 1;
		}
	} else {
		// otherwise, it's a named entity
		const Entity *entity = getTextEntity( s, len );
		if(entity) {
			memcpy(codepoint, entity->codepoint, entity->codepoints*sizeof(int32_t));
			*codepointCount = entity->codepoints;
			*utf8Len = (int32_t)entity->utf8Len;
			return len;
		} else {
			return 0; //unknown named entity
		}
	}

	// return 0 if not an entity, length of entity if it is an entity
	if ( codepoint[0] ) {
		return len;
	} else {
		return 0;
	}
}

Example #13

0

Show file

File: Domains.cpp Project: lemire/open-source-search-engine

// . returns true if tld[0..tldLen) looks like a top level domain
// . "tld" must not include a leading period, e.g. "com" or "co.uk"
// . only alnum chars, '-' and '.' are allowed, anything else yields false
// . a label without any period is always accepted
// . a name with exactly one period must be listed in s_tlds[] below
// . two or more periods are never accepted
// . the lookup table (s_table) is populated lazily on the first call that
//   reaches the table lookup; returns false if that setup fails
static bool isTLD ( char *tld , int32_t tldLen ) {

	int32_t pcount = 0;
	// now they are random!
	for ( int32_t i = 0 ; i < tldLen ; i++ ) {
		// period count
		if ( tld[i] == '.' ) { pcount++; continue; }
		if ( ! is_alnum_a(tld[i]) && tld[i] != '-' ) return false;
	}

	if ( pcount == 0 ) return true;
	if ( pcount >= 2 ) return false;

	// otherwise, if one period, check table to see if qualified

	// we use this as our hashtable
	static bool       s_isInitialized = false;
	// . i shrunk this list a lot
	// . see backups for the hold list
	static const char * const s_tlds[] = {

	  // From: https://data.iana.org/TLD/tlds-alpha-by-domain.txt
	"AAA",
	"AARP",
	"ABB",
	"ABBOTT",
	"ABBVIE",
	"ABOGADO",
	"ABUDHABI",
	"AC",
	"ACADEMY",
	"ACCENTURE",
	"ACCOUNTANT",
	"ACCOUNTANTS",
	"ACO",
	"ACTIVE",
	"ACTOR",
	"AD",
	"ADAC",
	"ADS",
	"ADULT",
	"AE",
	"AEG",
	"AERO",
	"AF",
	"AFL",
	"AG",
	"AGAKHAN",
	"AGENCY",
	"AI",
	"AIG",
	"AIRFORCE",
	"AIRTEL",
	"AKDN",
	"AL",
	"ALIBABA",
	"ALIPAY",
	"ALLFINANZ",
	"ALLY",
	"ALSACE",
	"AM",
	"AMICA",
	"AMSTERDAM",
	"ANALYTICS",
	"ANDROID",
	"ANQUAN",
	"AO",
	"APARTMENTS",
	"APP",
	"APPLE",
	"AQ",
	"AQUARELLE",
	"AR",
	"ARAMCO",
	"ARCHI",
	"ARMY",
	"ARPA",
	"ARTE",
	"AS",
	"ASIA",
	"ASSOCIATES",
	"AT",
	"ATTORNEY",
	"AU",
	"AUCTION",
	"AUDI",
	"AUDIO",
	"AUTHOR",
	"AUTO",
	"AUTOS",
	"AVIANCA",
	"AW",
	"AWS",
	"AX",
	"AXA",
	"AZ",
	"AZURE",
	"BA",
	"BABY",
	"BAIDU",
	"BAND",
	"BANK",
	"BAR",
	"BARCELONA",
	"BARCLAYCARD",
	"BARCLAYS",
	"BAREFOOT",
	"BARGAINS",
	"BAUHAUS",
	"BAYERN",
	"BB",
	"BBC",
	"BBVA",
	"BCG",
	"BCN",
	"BD",
	"BE",
	"BEATS",
	"BEER",
	"BENTLEY",
	"BERLIN",
	"BEST",
	"BET",
	"BF",
	"BG",
	"BH",
	"BHARTI",
	"BI",
	"BIBLE",
	"BID",
	"BIKE",
	"BING",
	"BINGO",
	"BIO",
	"BIZ",
	"BJ",
	"BLACK",
	"BLACKFRIDAY",
	"BLOOMBERG",
	"BLUE",
	"BM",
	"BMS",
	"BMW",
	"BN",
	"BNL",
	"BNPPARIBAS",
	"BO",
	"BOATS",
	"BOEHRINGER",
	"BOM",
	"BOND",
	"BOO",
	"BOOK",
	"BOOTS",
	"BOSCH",
	"BOSTIK",
	"BOT",
	"BOUTIQUE",
	"BR",
	"BRADESCO",
	"BRIDGESTONE",
	"BROADWAY",
	"BROKER",
	"BROTHER",
	"BRUSSELS",
	"BS",
	"BT",
	"BUDAPEST",
	"BUGATTI",
	"BUILD",
	"BUILDERS",
	"BUSINESS",
	"BUY",
	"BUZZ",
	"BV",
	"BW",
	"BY",
	"BZ",
	"BZH",
	"CA",
	"CAB",
	"CAFE",
	"CAL",
	"CALL",
	"CAMERA",
	"CAMP",
	"CANCERRESEARCH",
	"CANON",
	"CAPETOWN",
	"CAPITAL",
	"CAR",
	"CARAVAN",
	"CARDS",
	"CARE",
	"CAREER",
	"CAREERS",
	"CARS",
	"CARTIER",
	"CASA",
	"CASH",
	"CASINO",
	"CAT",
	"CATERING",
	"CBA",
	"CBN",
	"CC",
	"CD",
	"CEB",
	"CENTER",
	"CEO",
	"CERN",
	"CF",
	"CFA",
	"CFD",
	"CG",
	"CH",
	"CHANEL",
	"CHANNEL",
	"CHASE",
	"CHAT",
	"CHEAP",
	"CHLOE",
	"CHRISTMAS",
	"CHROME",
	"CHURCH",
	"CI",
	"CIPRIANI",
	"CIRCLE",
	"CISCO",
	"CITIC",
	"CITY",
	"CITYEATS",
	"CK",
	"CL",
	"CLAIMS",
	"CLEANING",
	"CLICK",
	"CLINIC",
	"CLINIQUE",
	"CLOTHING",
	"CLOUD",
	"CLUB",
	"CLUBMED",
	"CM",
	"CN",
	"CO",
	"COACH",
	"CODES",
	"COFFEE",
	"COLLEGE",
	"COLOGNE",
	"COM",
	"COMMBANK",
	"COMMUNITY",
	"COMPANY",
	"COMPARE",
	"COMPUTER",
	"COMSEC",
	"CONDOS",
	"CONSTRUCTION",
	"CONSULTING",
	"CONTACT",
	"CONTRACTORS",
	"COOKING",
	"COOL",
	"COOP",
	"CORSICA",
	"COUNTRY",
	"COUPON",
	"COUPONS",
	"COURSES",
	"CR",
	"CREDIT",
	"CREDITCARD",
	"CREDITUNION",
	"CRICKET",
	"CROWN",
	"CRS",
	"CRUISES",
	"CSC",
	"CU",
	"CUISINELLA",
	"CV",
	"CW",
	"CX",
	"CY",
	"CYMRU",
	"CYOU",
	"CZ",
	"DABUR",
	"DAD",
	"DANCE",
	"DATE",
	"DATING",
	"DATSUN",
	"DAY",
	"DCLK",
	"DE",
	"DEALER",
	"DEALS",
	"DEGREE",
	"DELIVERY",
	"DELL",
	"DELOITTE",
	"DELTA",
	"DEMOCRAT",
	"DENTAL",
	"DENTIST",
	"DESI",
	"DESIGN",
	"DEV",
	"DIAMONDS",
	"DIET",
	"DIGITAL",
	"DIRECT",
	"DIRECTORY",
	"DISCOUNT",
	"DJ",
	"DK",
	"DM",
	"DNP",
	"DO",
	"DOCS",
	"DOG",
	"DOHA",
	"DOMAINS",
	"DOWNLOAD",
	"DRIVE",
	"DUBAI",
	"DURBAN",
	"DVAG",
	"DZ",
	"EARTH",
	"EAT",
	"EC",
	"EDEKA",
	"EDU",
	"EDUCATION",
	"EE",
	"EG",
	"EMAIL",
	"EMERCK",
	"ENERGY",
	"ENGINEER",
	"ENGINEERING",
	"ENTERPRISES",
	"EPSON",
	"EQUIPMENT",
	"ER",
	"ERNI",
	"ES",
	"ESQ",
	"ESTATE",
	"ET",
	"EU",
	"EUROVISION",
	"EUS",
	"EVENTS",
	"EVERBANK",
	"EXCHANGE",
	"EXPERT",
	"EXPOSED",
	"EXPRESS",
	"EXTRASPACE",
	"FAGE",
	"FAIL",
	"FAIRWINDS",
	"FAITH",
	"FAMILY",
	"FAN",
	"FANS",
	"FARM",
	"FASHION",
	"FAST",
	"FEEDBACK",
	"FERRERO",
	"FI",
	"FILM",
	"FINAL",
	"FINANCE",
	"FINANCIAL",
	"FIRESTONE",
	"FIRMDALE",
	"FISH",
	"FISHING",
	"FIT",
	"FITNESS",
	"FJ",
	"FK",
	"FLICKR",
	"FLIGHTS",
	"FLORIST",
	"FLOWERS",
	"FLSMIDTH",
	"FLY",
	"FM",
	"FO",
	"FOO",
	"FOOTBALL",
	"FORD",
	"FOREX",
	"FORSALE",
	"FORUM",
	"FOUNDATION",
	"FOX",
	"FR",
	"FRESENIUS",
	"FRL",
	"FROGANS",
	"FRONTIER",
	"FTR",
	"FUND",
	"FURNITURE",
	"FUTBOL",
	"FYI",
	"GA",
	"GAL",
	"GALLERY",
	"GALLO",
	"GALLUP",
	"GAME",
	"GARDEN",
	"GB",
	"GBIZ",
	"GD",
	"GDN",
	"GE",
	"GEA",
	"GENT",
	"GENTING",
	"GF",
	"GG",
	"GGEE",
	"GH",
	"GI",
	"GIFT",
	"GIFTS",
	"GIVES",
	"GIVING",
	"GL",
	"GLASS",
	"GLE",
	"GLOBAL",
	"GLOBO",
	"GM",
	"GMAIL",
	"GMBH",
	"GMO",
	"GMX",
	"GN",
	"GOLD",
	"GOLDPOINT",
	"GOLF",
	"GOO",
	"GOOG",
	"GOOGLE",
	"GOP",
	"GOT",
	"GOV",
	"GP",
	"GQ",
	"GR",
	"GRAINGER",
	"GRAPHICS",
	"GRATIS",
	"GREEN",
	"GRIPE",
	"GROUP",
	"GS",
	"GT",
	"GU",
	"GUCCI",
	"GUGE",
	"GUIDE",
	"GUITARS",
	"GURU",
	"GW",
	"GY",
	"HAMBURG",
	"HANGOUT",
	"HAUS",
	"HDFCBANK",
	"HEALTH",
	"HEALTHCARE",
	"HELP",
	"HELSINKI",
	"HERE",
	"HERMES",
	"HIPHOP",
	"HITACHI",
	"HIV",
	"HK",
	"HM",
	"HN",
	"HOCKEY",
	"HOLDINGS",
	"HOLIDAY",
	"HOMEDEPOT",
	"HOMES",
	"HONDA",
	"HORSE",
	"HOST",
	"HOSTING",
	"HOTELES",
	"HOTMAIL",
	"HOUSE",
	"HOW",
	"HR",
	"HSBC",
	"HT",
	"HTC",
	"HU",
	"HYUNDAI",
	"IBM",
	"ICBC",
	"ICE",
	"ICU",
	"ID",
	"IE",
	"IFM",
	"IINET",
	"IL",
	"IM",
	"IMAMAT",
	"IMMO",
	"IMMOBILIEN",
	"IN",
	"INDUSTRIES",
	"INFINITI",
	"INFO",
	"ING",
	"INK",
	"INSTITUTE",
	"INSURANCE",
	"INSURE",
	"INT",
	"INTERNATIONAL",
	"INVESTMENTS",
	"IO",
	"IPIRANGA",
	"IQ",
	"IR",
	"IRISH",
	"IS",
	"ISELECT",
	"ISMAILI",
	"IST",
	"ISTANBUL",
	"IT",
	"ITAU",
	"IWC",
	"JAGUAR",
	"JAVA",
	"JCB",
	"JCP",
	"JE",
	"JETZT",
	"JEWELRY",
	"JLC",
	"JLL",
	"JM",
	"JMP",
	"JNJ",
	"JO",
	"JOBS",
	"JOBURG",
	"JOT",
	"JOY",
	"JP",
	"JPMORGAN",
	"JPRS",
	"JUEGOS",
	"KAUFEN",
	"KDDI",
	"KE",
	"KERRYHOTELS",
	"KERRYLOGISTICS",
	"KERRYPROPERTIES",
	"KFH",
	"KG",
	"KH",
	"KI",
	"KIA",
	"KIM",
	"KINDER",
	"KITCHEN",
	"KIWI",
	"KM",
	"KN",
	"KOELN",
	"KOMATSU",
	"KP",
	"KPMG",
	"KPN",
	"KR",
	"KRD",
	"KRED",
	"KUOKGROUP",
	"KW",
	"KY",
	"KYOTO",
	"KZ",
	"LA",
	"LACAIXA",
	"LAMBORGHINI",
	"LAMER",
	"LANCASTER",
	"LAND",
	"LANDROVER",
	"LANXESS",
	"LASALLE",
	"LAT",
	"LATROBE",
	"LAW",
	"LAWYER",
	"LB",
	"LC",
	"LDS",
	"LEASE",
	"LECLERC",
	"LEGAL",
	"LEXUS",
	"LGBT",
	"LI",
	"LIAISON",
	"LIDL",
	"LIFE",
	"LIFEINSURANCE",
	"LIFESTYLE",
	"LIGHTING",
	"LIKE",
	"LIMITED",
	"LIMO",
	"LINCOLN",
	"LINDE",
	"LINK",
	"LIPSY",
	"LIVE",
	"LIVING",
	"LIXIL",
	"LK",
	"LOAN",
	"LOANS",
	"LOCUS",
	"LOL",
	"LONDON",
	"LOTTE",
	"LOTTO",
	"LOVE",
	"LR",
	"LS",
	"LT",
	"LTD",
	"LTDA",
	"LU",
	"LUPIN",
	"LUXE",
	"LUXURY",
	"LV",
	"LY",
	"MA",
	"MADRID",
	"MAIF",
	"MAISON",
	"MAKEUP",
	"MAN",
	"MANAGEMENT",
	"MANGO",
	"MARKET",
	"MARKETING",
	"MARKETS",
	"MARRIOTT",
	"MBA",
	"MC",
	"MD",
	"ME",
	"MED",
	"MEDIA",
	"MEET",
	"MELBOURNE",
	"MEME",
	"MEMORIAL",
	"MEN",
	"MENU",
	"MEO",
	"MG",
	"MH",
	"MIAMI",
	"MICROSOFT",
	"MIL",
	"MINI",
	"MK",
	"ML",
	"MLS",
	"MM",
	"MMA",
	"MN",
	"MO",
	"MOBI",
	"MOBILY",
	"MODA",
	"MOE",
	"MOI",
	"MOM",
	"MONASH",
	"MONEY",
	"MONTBLANC",
	"MORMON",
	"MORTGAGE",
	"MOSCOW",
	"MOTORCYCLES",
	"MOV",
	"MOVIE",
	"MOVISTAR",
	"MP",
	"MQ",
	"MR",
	"MS",
	"MT",
	"MTN",
	"MTPC",
	"MTR",
	"MU",
	"MUSEUM",
	"MUTUAL",
	"MUTUELLE",
	"MV",
	"MW",
	"MX",
	"MY",
	"MZ",
	"NA",
	"NADEX",
	"NAGOYA",
	"NAME",
	"NATURA",
	"NAVY",
	"NC",
	"NE",
	"NEC",
	"NET",
	"NETBANK",
	"NETWORK",
	"NEUSTAR",
	"NEW",
	"NEWS",
	"NEXT",
	"NEXTDIRECT",
	"NEXUS",
	"NF",
	"NG",
	"NGO",
	"NHK",
	"NI",
	"NICO",
	"NIKON",
	"NINJA",
	"NISSAN",
	"NISSAY",
	"NL",
	"NO",
	"NOKIA",
	"NORTHWESTERNMUTUAL",
	"NORTON",
	"NOWRUZ",
	"NP",
	"NR",
	"NRA",
	"NRW",
	"NTT",
	"NU",
	"NYC",
	"NZ",
	"OBI",
	"OFFICE",
	"OKINAWA",
	"OLAYAN",
	"OM",
	"OMEGA",
	"ONE",
	"ONG",
	"ONL",
	"ONLINE",
	"OOO",
	"ORACLE",
	"ORANGE",
	"ORG",
	"ORGANIC",
	"ORIGINS",
	"OSAKA",
	"OTSUKA",
	"OVH",
	"PA",
	"PAGE",
	"PAMPEREDCHEF",
	"PANERAI",
	"PARIS",
	"PARS",
	"PARTNERS",
	"PARTS",
	"PARTY",
	"PASSAGENS",
	"PE",
	"PET",
	"PF",
	"PG",
	"PH",
	"PHARMACY",
	"PHILIPS",
	"PHOTO",
	"PHOTOGRAPHY",
	"PHOTOS",
	"PHYSIO",
	"PIAGET",
	"PICS",
	"PICTET",
	"PICTURES",
	"PID",
	"PIN",
	"PING",
	"PINK",
	"PIZZA",
	"PK",
	"PL",
	"PLACE",
	"PLAY",
	"PLAYSTATION",
	"PLUMBING",
	"PLUS",
	"PM",
	"PN",
	"POHL",
	"POKER",
	// this entry had been mangled to "P**N" by a word filter, which can
	// never match because '*' is rejected by the character check above
	"PORN",
	"POST",
	"PR",
	"PRAXI",
	"PRESS",
	"PRO",
	"PROD",
	"PRODUCTIONS",
	"PROF",
	"PROGRESSIVE",
	"PROMO",
	"PROPERTIES",
	"PROPERTY",
	"PROTECTION",
	"PS",
	"PT",
	"PUB",
	"PW",
	"PWC",
	"PY",
	"QA",
	"QPON",
	"QUEBEC",
	"QUEST",
	"RACING",
	"RE",
	"READ",
	"REALTOR",
	"REALTY",
	"RECIPES",
	"RED",
	"REDSTONE",
	"REDUMBRELLA",
	"REHAB",
	"REISE",
	"REISEN",
	"REIT",
	"REN",
	"RENT",
	"RENTALS",
	"REPAIR",
	"REPORT",
	"REPUBLICAN",
	"REST",
	"RESTAURANT",
	"REVIEW",
	"REVIEWS",
	"REXROTH",
	"RICH",
	"RICOH",
	"RIO",
	"RIP",
	"RO",
	"ROCHER",
	"ROCKS",
	"RODEO",
	"ROOM",
	"RS",
	"RSVP",
	"RU",
	"RUHR",
	"RUN",
	"RW",
	"RWE",
	"RYUKYU",
	"SA",
	"SAARLAND",
	"SAFE",
	"SAFETY",
	"SAKURA",
	"SALE",
	"SALON",
	"SAMSUNG",
	"SANDVIK",
	"SANDVIKCOROMANT",
	"SANOFI",
	"SAP",
	"SAPO",
	"SARL",
	"SAS",
	"SAXO",
	"SB",
	"SBI",
	"SBS",
	"SC",
	"SCA",
	"SCB",
	"SCHAEFFLER",
	"SCHMIDT",
	"SCHOLARSHIPS",
	"SCHOOL",
	"SCHULE",
	"SCHWARZ",
	"SCIENCE",
	"SCOR",
	"SCOT",
	"SD",
	"SE",
	"SEAT",
	"SECURITY",
	"SEEK",
	"SELECT",
	"SENER",
	"SERVICES",
	"SEVEN",
	"SEW",
	"SEX",
	"SEXY",
	"SFR",
	"SG",
	"SH",
	"SHARP",
	"SHAW",
	"SHELL",
	"SHIA",
	"SHIKSHA",
	"SHOES",
	"SHOUJI",
	"SHOW",
	"SHRIRAM",
	"SI",
	"SINA",
	"SINGLES",
	"SITE",
	"SJ",
	"SK",
	"SKI",
	"SKIN",
	"SKY",
	"SKYPE",
	"SL",
	"SM",
	"SMILE",
	"SN",
	"SNCF",
	"SO",
	"SOCCER",
	"SOCIAL",
	"SOFTBANK",
	"SOFTWARE",
	"SOHU",
	"SOLAR",
	"SOLUTIONS",
	"SONG",
	"SONY",
	"SOY",
	"SPACE",
	"SPIEGEL",
	"SPOT",
	"SPREADBETTING",
	"SR",
	"SRL",
	"ST",
	"STADA",
	"STAR",
	"STARHUB",
	"STATEBANK",
	"STATEFARM",
	"STATOIL",
	"STC",
	"STCGROUP",
	"STOCKHOLM",
	"STORAGE",
	"STORE",
	"STREAM",
	"STUDIO",
	"STUDY",
	"STYLE",
	"SU",
	"SUCKS",
	"SUPPLIES",
	"SUPPLY",
	"SUPPORT",
	"SURF",
	"SURGERY",
	"SUZUKI",
	"SV",
	"SWATCH",
	"SWISS",
	"SX",
	"SY",
	"SYDNEY",
	"SYMANTEC",
	"SYSTEMS",
	"SZ",
	"TAB",
	"TAIPEI",
	"TALK",
	"TAOBAO",
	"TATAMOTORS",
	"TATAR",
	"TATTOO",
	"TAX",
	"TAXI",
	"TC",
	"TCI",
	"TD",
	"TEAM",
	"TECH",
	"TECHNOLOGY",
	"TEL",
	"TELECITY",
	"TELEFONICA",
	"TEMASEK",
	"TENNIS",
	"TEVA",
	"TF",
	"TG",
	"TH",
	"THD",
	"THEATER",
	"THEATRE",
	"TICKETS",
	"TIENDA",
	"TIFFANY",
	"TIPS",
	"TIRES",
	"TIROL",
	"TJ",
	"TK",
	"TL",
	"TM",
	"TMALL",
	"TN",
	"TO",
	"TODAY",
	"TOKYO",
	"TOOLS",
	"TOP",
	"TORAY",
	"TOSHIBA",
	"TOTAL",
	"TOURS",
	"TOWN",
	"TOYOTA",
	"TOYS",
	"TR",
	"TRADE",
	"TRADING",
	"TRAINING",
	"TRAVEL",
	"TRAVELERS",
	"TRAVELERSINSURANCE",
	"TRUST",
	"TRV",
	"TT",
	"TUBE",
	"TUI",
	"TUNES",
	"TUSHU",
	"TV",
	"TVS",
	"TW",
	"TZ",
	"UA",
	"UBS",
	"UG",
	"UK",
	"UNICOM",
	"UNIVERSITY",
	"UNO",
	"UOL",
	"US",
	"UY",
	"UZ",
	"VA",
	"VACATIONS",
	"VANA",
	"VC",
	"VE",
	"VEGAS",
	"VENTURES",
	"VERISIGN",
	"VERSICHERUNG",
	"VET",
	"VG",
	"VI",
	"VIAJES",
	"VIDEO",
	"VIG",
	"VIKING",
	"VILLAS",
	"VIN",
	"VIP",
	"VIRGIN",
	"VISION",
	"VISTA",
	"VISTAPRINT",
	"VIVA",
	"VLAANDEREN",
	"VN",
	"VODKA",
	"VOLKSWAGEN",
	"VOTE",
	"VOTING",
	"VOTO",
	"VOYAGE",
	"VU",
	"VUELOS",
	"WALES",
	"WALTER",
	"WANG",
	"WANGGOU",
	"WARMAN",
	"WATCH",
	"WATCHES",
	"WEATHER",
	"WEATHERCHANNEL",
	"WEBCAM",
	"WEBER",
	"WEBSITE",
	"WED",
	"WEDDING",
	"WEIBO",
	"WEIR",
	"WF",
	"WHOSWHO",
	"WIEN",
	"WIKI",
	"WILLIAMHILL",
	"WIN",
	"WINDOWS",
	"WINE",
	"WME",
	"WOLTERSKLUWER",
	"WORK",
	"WORKS",
	"WORLD",
	"WS",
	"WTC",
	"WTF",
	"XBOX",
	"XEROX",
	"XIHUAN",
	"XIN",
	"XN--11B4C3D",
	"XN--1CK2E1B",
	"XN--1QQW23A",
	"XN--30RR7Y",
	"XN--3BST00M",
	"XN--3DS443G",
	"XN--3E0B707E",
	"XN--3PXU8K",
	"XN--42C2D9A",
	"XN--45BRJ9C",
	"XN--45Q11C",
	"XN--4GBRIM",
	"XN--55QW42G",
	"XN--55QX5D",
	"XN--5TZM5G",
	"XN--6FRZ82G",
	"XN--6QQ986B3XL",
	"XN--80ADXHKS",
	"XN--80AO21A",
	"XN--80ASEHDB",
	"XN--80ASWG",
	"XN--8Y0A063A",
	"XN--90A3AC",
	"XN--90AIS",
	"XN--9DBQ2A",
	"XN--9ET52U",
	"XN--9KRT00A",
	"XN--B4W605FERD",
	"XN--BCK1B9A5DRE4C",
	"XN--C1AVG",
	"XN--C2BR7G",
	"XN--CCK2B3B",
	"XN--CG4BKI",
	"XN--CLCHC0EA0B2G2A9GCD",
	"XN--CZR694B",
	"XN--CZRS0T",
	"XN--CZRU2D",
	"XN--D1ACJ3B",
	"XN--D1ALF",
	"XN--E1A4C",
	"XN--ECKVDTC9D",
	"XN--EFVY88H",
	"XN--ESTV75G",
	"XN--FCT429K",
	"XN--FHBEI",
	"XN--FIQ228C5HS",
	"XN--FIQ64B",
	"XN--FIQS8S",
	"XN--FIQZ9S",
	"XN--FJQ720A",
	"XN--FLW351E",
	"XN--FPCRJ9C3D",
	"XN--FZC2C9E2C",
	"XN--G2XX48C",
	"XN--GCKR3F0F",
	"XN--GECRJ9C",
	"XN--H2BRJ9C",
	"XN--HXT814E",
	"XN--I1B6B1A6A2E",
	"XN--IMR513N",
	"XN--IO0A7I",
	"XN--J1AEF",
	"XN--J1AMH",
	"XN--J6W193G",
	"XN--JLQ61U9W7B",
	"XN--JVR189M",
	"XN--KCRX77D1X4A",
	"XN--KPRW13D",
	"XN--KPRY57D",
	"XN--KPU716F",
	"XN--KPUT3I",
	"XN--L1ACC",
	"XN--LGBBAT1AD8J",
	"XN--MGB9AWBF",
	"XN--MGBA3A3EJT",
	"XN--MGBA3A4F16A",
	"XN--MGBA7C0BBN0A",
	"XN--MGBAAM7A8H",
	"XN--MGBAB2BD",
	"XN--MGBAYH7GPA",
	"XN--MGBB9FBPOB",
	"XN--MGBBH1A71E",
	"XN--MGBC0A9AZCG",
	"XN--MGBCA7DZDO",
	"XN--MGBERP4A5D4AR",
	"XN--MGBPL2FH",
	"XN--MGBT3DHD",
	"XN--MGBTX2B",
	"XN--MGBX4CD0AB",
	"XN--MIX891F",
	"XN--MK1BU44C",
	"XN--MXTQ1M",
	"XN--NGBC5AZD",
	"XN--NGBE9E0A",
	"XN--NODE",
	"XN--NQV7F",
	"XN--NQV7FS00EMA",
	"XN--NYQY26A",
	"XN--O3CW4H",
	"XN--OGBPF8FL",
	"XN--P1ACF",
	"XN--P1AI",
	"XN--PBT977C",
	"XN--PGBS0DH",
	"XN--PSSY2U",
	"XN--Q9JYB4C",
	"XN--QCKA1PMC",
	"XN--QXAM",
	"XN--RHQV96G",
	"XN--ROVU88B",
	"XN--S9BRJ9C",
	"XN--SES554G",
	"XN--T60B56A",
	"XN--TCKWE",
	"XN--UNUP4Y",
	"XN--VERMGENSBERATER-CTB",
	"XN--VERMGENSBERATUNG-PWB",
	"XN--VHQUV",
	"XN--VUQ861B",
	"XN--W4R85EL8FHU5DNRA",
	"XN--WGBH1C",
	"XN--WGBL6A",
	"XN--XHQ521B",
	"XN--XKC2AL3HYE2A",
	"XN--XKC2DL3A5EE0H",
	"XN--Y9A3AQ",
	"XN--YFRO4I67O",
	"XN--YGBI2AMMX",
	"XN--ZFR164B",
	"XPERIA",
	"XXX",
	"XYZ",
	"YACHTS",
	"YAHOO",
	"YAMAXUN",
	"YANDEX",
	"YE",
	"YODOBASHI",
	"YOGA",
	"YOKOHAMA",
	"YOU",
	"YOUTUBE",
	"YT",
	"YUN",
	"ZA",
	"ZARA",
	"ZERO",
	"ZIP",
	"ZM",
	"ZONE",
	"ZUERICH",
	"ZW",


	// second-level suffixes. these are the entries the one-period
	// lookup below can actually match
	"AB.CA",
	"AC.AE",
	"AC.AT",
	"AC.CN",
	"AC.CR",
	"AC.CY",
	"AC.FJ",
	"AC.GG",
	"AC.ID",
	"AC.IL",
	"AC.IM",
	"AC.IN",
	"AC.JE",
	"AC.JP",
	"AC.KR",
	"AC.NZ",
	"AC.PA",
	"AC.TH",
	"AC.UG",
	"AC.UK",
	"AC.YU",
	"AC.ZA",
	"AD.JP",
	"AH.CN",
	"ALDERNEY.GG",
	"ALT.ZA",
	"ART.BR",
	"ART.DO",
	"ARTS.CO",
	"ARTS.VE",
	"ASN.AU",
	"ASN.LV",
	"BBS.TR",
	"BC.CA",
	"BIB.VE",
	"BJ.CN",
	"CO.AT",
	"CO.AO",
	"CO.CK",
	"CO.CR",
	"CO.GG",
	"CO.HU",
	"CO.ID",
	"CO.IL",
	"CO.IM",
	"CO.IN",
	"CO.JE",
	"CO.JP",
	"CO.KR",
	"COM.AR",
	"COM.AU",
	"COM.AZ",
	"COM.BB",
	"COM.BM",
	"COM.BR",
	"COM.BS",
	"COM.CN",
	"COM.CO",
	"COM.CU",
	"COM.CY",
	"COM.DO",
	"COM.EC",
	"COM.EG",
	"COM.FJ",
	"COM.GE",
	"COM.GU",
	"COM.HK",
	"COM.JO",
	"COM.KH",
	"COM.LA",
	"COM.LB",
	"COM.LC",
	"COM.LV",
	"COM.LY",
	"COM.MM",
	"COM.MO",
	"COM.MT",
	"COM.MX",
	"COM.MY",
	"COM.NA",
	"COM.NC",
	"COM.NI",
	"COM.NP",
	"COM.PA",
	"COM.PE",
	"COM.PH",
	"COM.PL",
	"COM.PY",
	"COM.RU",
	"COM.SG",
	"COM.SH",
	"COM.SY",
	"COM.TN",
	"COM.TR",
	"COM.TW",
	"COM.UA",
	"COM.UY",
	"COM.VE",
	"CONF.AU",
	"CONF.LV",
	"CO.NZ",
	"COOP",
	"CO.AE",
	"CO.SV",
	"CO.TH",
	"CO.UG",
	"CO.UK",
	"CO.VE",
	"CO.VI",
	"CO.YU",
	"CO.ZA",
	"CQ.CN",
	"CSIRO.AU",
	"ED.CR",
	"EDU.BM",
	"EDU.AR",
	"EDU.CN",
	"EDU.CO",
	"EDU.DO",
	"EDU.EC",
	"EDU.EG",
	"EDU.GE",
	"EDU.GU",
	"EDU.JO",
	"EDU.LC",
	"EDU.LV",
	"EDU.MM",
	"EDU.MO",
	"EDU.MY",
	"EDUNET.TN",
	"EDU.PA",
	"EDU.PY",
	"EDU.SG",
	"EDU.SH",
	"EDU.TR",
	"EDU.TW",
	"EDU.UY",
	"EDU.VE",
	"EDU.YU",
	"EDU.ZA",
	"ENS.TN",
	"ERNET.IN",
	"ESP.BR",
	"ETC.BR",
	"EUN.EG",
	"FI.CR",
	"FIN.EC",
	"FIN.TN",
	"FIRM.CO",
	"FIRM.VE",
	"G12.BR",
	"GD.CN",
	"GEN.NZ",
	"GOB.PA",
	"GO.CR",
	"GO.ID",
	"GO.KR",
	"GO.TH",
	"GO.UG",
	"GOV.AE",
	"GOV.AR",
	"GOV.AU",
	"GOV.BM",
	"GOV.BR",
	"GOV.CN",
	"GOV.CO",
	"GOV.CY",
	"GOV.DO",
	"GOV.EC",
	"GOV.EG",
	"GOVE.TW",
	"GOV.FJ",
	"GOV.GE",
	"GOV.GG",
	"GOV.GU",
	"GOV.IL",
	"GOV.IM",
	"GOV.IN",
	"GOV.JE",
	"GOV.JO",
	"GOV.JP",
	"GOV.LB",
	"GOV.LC",
	"GOV.LV",
	"GOV.MM",
	"GOV.MO",
	"GOV.MY",
	"GOV.SG",
	"GOV.SH",
	"GOV.TN",
	"GOVT.NZ",
	"GOV.TR",
	"GOV.UA",
	"GOV.UK",
	"GOV.VE",
	"GOV.ZA",
	"GS.CN",
	"GUERNSEY.GG",
	"GX.CN",
	"GZ.CN",
	"HB.CN",
	"HE.CN",
	"HI.CN",
	"HK.CN",
	"HL.CN",
	"HN.CN",
	"ID.AU",
	"ID.FJ",
	"ID.LV",
	"IND.BR",
	"IND.GG",
	"IND.JE",
	"IND.TN",
	"INF.BR",
	"INFO.AU",
	"INFO.CO",
	"INFO.HU",
	"INFO.TN",
	"INFO.VE",
	"INT.CO",
	"INTL.TN",
	"INT.VE",
	"JERSEY.JE",
	"JL.CN",
	"JS.CN",
	"K12.EC",
	"K12.IL",
	"K12.TR",
	"LKD.CO.IM",
	"LN.CN",
	"LTD.GG",
	"LTD.JE",
	"LTD.UK",
	"MB.CA",
	"MED.EC",
	"MIL.BR",
	"MIL.CO",
	"MIL.DO",
	"MIL.EC",
	"MIL.GE",
	"MIL.GU",
	"MIL.ID",
	"MIL.LB",
	"MIL.LV",
	"MIL.PH",
	"MIL.SH",
	"MIL.TR",
	"MIL.VE",
	"MIL.ZA",
	"MO.CN",
	"MOD.UK",
	"MUNI.IL",
	"MUSEUM",
	"NAME",
	"NAT.TN",
	"NB.CA",
	"NET.AR",
	"NET.AU",
	"NET.AZ",
	"NET.BB",
	"NET.BM",
	"NET.BR",
	"NET.BS",
	"NET.CN",
	"NET.CU",
	"NET.CY",
	"NET.DO",
	"NET.EC",
	"NET.EG",
	"NET.GE",
	"NET.GG",
	"NET.GU",
	"NET.HK",
	"NET.ID",
	"NET.IL",
	"NET.IM",
	"NET.IN",
	"NET.JE",
	"NET.JO",
	"NET.JP",
	"NET.KH",
	"NET.LA",
	"NET.LB",
	"NET.LC",
	"NET.LV",
	"NET.LY",
	"NET.MM",
	"NET.MO",
	"NET.MT",
	"NET.MX",
	"NET.MY",
	"NET.NA",
	"NET.NC",
	"NET.NP",
	"NET.NZ",
	"NET.PA",
	"NET.PE",
	"NET.PH",
	"NET.PL",
	"NET.PY",
	"NET.RU",
	"NET.SG",
	"NET.SH",
	"NET.SY",
	"NET.TH",
	"NET.TN",
	"NET.TR",
	"NET.TW",
	"NET.UA",
	"NET.UK",
	"NET.UY",
	"NET.VE",
	"NET.VI",
	"NET.ZA",
	"NF.CA",
	"NGO.PH",
	"NGO.ZA",
	"NHS.UK",
	"NIC.IM",
	"NIC.IN",
	"NM.CN",
	"NM.KR",
	"NOM.CO",
	"NOM.VE",
	"NOM.ZA",
	"NS.CA",
	"NSK.SU",
	"NT.CA",
	"NUI.HU",
	"NX.CN",
	"ON.CA",
	"OR.CR",
	"ORG.AE",
	"ORG.AR",
	"ORG.AU",
	"ORG.AZ",
	"ORG.BB",
	"ORG.BM",
	"ORG.BR",
	"ORG.BS",
	"ORG.CN",
	"ORG.CO",
	"ORG.CU",
	"ORG.CY",
	"ORG.DO",
	"ORG.EC",
	"ORG.EG",
	"ORG.FJ",
	"ORG.GE",
	"ORG.GG",
	"ORG.GU",
	"ORG.HK",
	"ORG.HU",
	"ORG.IL",
	"ORG.IM",
	"ORG.JE",
	"ORG.JP",
	"ORG.KH",
	"ORG.LA",
	"ORG.LB",
	"ORG.LC",
	"ORG.LV",
	"ORG.LY",
	"ORG.MM",
	"ORG.MO",
	"ORG.MT",
	"ORG.MX",
	"ORG.MY",
	"ORG.NA",
	"ORG.NC",
	"ORG.NZ",
	"ORG.PA",
	"ORG.PE",
	"ORG.PH",
	"ORG.PL",
	"ORG.PY",
	"ORG.RU",
	"ORG.SG",
	"ORG.SH",
	"ORG.SY",
	"ORG.TN",
	"ORG.TR",
	"ORG.TW",
	"ORG.UK",
	"ORG.UY",
	"ORG.VE",
	"ORG.VI",
	"ORG.YU",
	"ORG.ZA",
	"OR.ID",
	"OR.KR",
	"OR.TH",
	"ORT.NP",
	"OR.UG",
	"OZ.AU",
	"PE.CA",
	"PLC.CO.IM",
	"PLC.UK",
	"POLICE.UK",
	"PRIV.HU",
	"PSI.BR",
	"PVT.GE",
	"QC.CA",
	"QH.CN",
	"REC.BR",
	"REC.CO",
	"REC.VE",
	"RE.KR",
	"RES.IN",
	"RNRT.TN",
	"RNS.TN",
	"RNU.TN",
	"SA.CR",
	"SARK.GG",
	"SC.CN",
	"SCH.GG",
	"SCH.JE",
	"SCHOOL.FJ",
	"SCHOOL.ZA",
	"SCH.UK",
	"SCI.EG",
	"SH.CN",
	"SK.CA",
	"SLD.PA",
	"SN.CN",
	"STORE.CO",
	"STORE.VE",
	"SX.CN",
	"TEC.VE",
	"TELEMEMO.AU",
	"TJ.CN",
	"TM.HU",
	"TMP.BR",
	"TM.ZA",
	"TOURISM.TN",
	"TW.CN",
	"WEB.CO",
	"WEB.DO",
	"WEB.VE",
	"WEB.ZA",
	"XJ.CN",
	"XZ.CN",
	"YK.CA",
	"YN.CN",
	"ZJ.CN"
};

	if ( ! s_isInitialized ) {
		// set up the hash table
		if ( ! s_table.set ( 8 , 0, sizeof(s_tlds)*2,NULL,0,false,0,
				     "tldtbl") ) 
			return log("build: Could not init table of TLDs.");
		// now add in all the TLDs, keyed by their case-insensitive hash
		int32_t n = (int32_t)sizeof(s_tlds)/ sizeof(char *); 
		for ( int32_t i = 0 ; i < n ; i++ ) {
			const char      *d    = s_tlds[i];
			int32_t       dlen = gbstrlen ( d );
			int64_t  dh   = hash64Lower_a ( d , dlen );
			if ( ! s_table.addKey (&dh,NULL) )
				return log("build: dom table failed");
		}
		// only flag success once every entry made it into the table
		s_isInitialized = true;
	} 
	// hash the candidate the same way the table entries were hashed
	int64_t h = hash64Lower_a ( tld , tldLen ); // gbstrlen(tld));
	return s_table.isInTable ( &h );//getScoreFromTermId ( h );
}

Example #14

0

Show file

File: SiteGetter.cpp Project: Doken-Tokuyama/open-source-search-engine

//
// hardcoded support for popular formats and sites
//
// . tries to derive the "site" of m_url from well known url layouts
// . on a match m_site/m_siteLen are filled in (NUL terminated) and true is
//   returned
// . returns false if the url has no "scheme://host/path" shape, if nothing
//   was recognized, or if the resulting site would not fit in MAX_SITE_LEN
// . recognized layouts:
//   1) user-directory prefixes applied to every domain, like /users/fred/
//   2) a few hosts whose first path component identifies the site
bool SiteGetter::setRecognizedSite ( ) {

	// clear just in case
	g_errno = 0;

	// get path of url
	char *p = m_url;
	for ( ; *p && *p != ':' ; p++ );
	// error?
	if ( *p != ':' ) return false;
	// the colon must be followed by "//", otherwise skipping 3 bytes
	// below could step past the terminating NUL of a url like "x:"
	if ( p[1] != '/' || p[2] != '/' ) return false;
	// skip ://
	p += 3;
	// save host ptr
	char *host = p;
	// then another / for the path
	for ( ; *p && *p != '/' ; p++ );
	// error?
	if ( *p != '/' ) return false;
	//
	// ok, "p" now points to the path
	//
	char *path = p;

	// convenience vars
	int32_t  len = 0;

	// . deal with site indicators
	// . these are applied to all domains uniformly
	// . if it is xyz.com/users/  use xyz.com/users/fred/ as the site

	// a lot of times these were not indivual blogs, but the blog subsite
	// of a site... http://dccc.org/blog/P4575/
	//if ( strncasecmp(p,"/blogs/"       , 7) == 0 ) len = 7;
	//if ( strncasecmp(p,"/blog/"        , 6) == 0 ) len = 6;
	// commented out a bunch cuz they were profiles mostly, not blogs...
	if ( strncasecmp(p,"/~"            , 2) == 0 ) len = 2;
	// assume this is a username. skip the first /
	//if ( sitepathdepth == 1                      ) len = 1;
	if ( strncasecmp(p,"/users/"       , 7) == 0 ) len = 7;
	if ( strncasecmp(p,"/user/"        , 6) == 0 ) len = 6;
	if ( strncasecmp(p,"/members/"     , 9) == 0 ) len = 9;
	if ( strncasecmp(p,"/membres/"     , 9) == 0 ) len = 9;
	if ( strncasecmp(p,"/member/"      , 8) == 0 ) len = 8;
	if ( strncasecmp(p,"/membre/"      , 8) == 0 ) len = 8;
	if ( strncasecmp(p,"/member.php?u=",14) == 0 ) len = 14;

	// point to after the /users/, /blogs/, /user/, /blog/ or /~xxx/
	p += len;
	// assume there is NOT an alnum char after this
	bool username = false;
	// . skip to next / OR ?
	// . stop at . or -, because we do not allow those in usernames and
	//   they are often indicative of filenames without file extensions
	// . no, fix http://www.rus-obr.ru/users/maksim-sokolov (no - or _ or.)
	// . the loop only runs when one of the prefixes above matched (len>0)
	while ( len && *p && *p!= '/'&&*p!='?' ) {
		// sometimes usernames are numbers!!!
		//if ( is_alpha_a(*p) ) username = true;
		// http://stackoverflow.com/users/271376/sigterm
		if ( is_alnum_a(*p) ) username = true;
		p++;
	}
	// if we hit this, not a username
	//if ( *p=='.' || *p == '-' || *p == '_' ) username = false;
	// did we get a match?
	// . www.cits.ucsb.edu/users/michael-osborne
	// . www.cits.ucsb.edu/users/michael-osborne/
	// . after /blog/ or /~ should be another / or \0, not a period,
	//   because that indicates probably a filename, which is not right,
	//   because we are expecting a username!
	// . the +6 leaves room for "www." plus a trailing '/' plus the NUL
	if ( username && p - host + 6 < MAX_SITE_LEN ) {
		// jump up here to store
	storeIt:
		// for parsing
		char *x = m_site;
		// store www first if its a domain only url
		if ( ! m_hasSubdomain ) {
			gbmemcpy ( x , "www." , 4 );
			x += 4;
		}
		// store it
		gbmemcpy ( x , host , p - host );
		x += p - host;
		// set the length of it
		m_siteLen = x - m_site;
		// make it end on a '/' if we can
		if ( m_site[m_siteLen-1] != '/' &&
		     // watch out for /?uid=xxxx crap
		     m_site[m_siteLen-1] != '=' ) {
			// force the / then
			m_site[m_siteLen] = '/';
			m_siteLen++;
		}
		// null term the site
		m_site [ m_siteLen ] = '\0';
		return true;
	}


	//
	// popular homesteads
	//
	int32_t depth = 0;
	// term host: temporarily cut the url at the path so that strstr()
	// only searches the hostname
	char c = *path;
	*path = '\0';
	if ( strstr(host,"vimeo.com"      ) ) depth = 1;
	if ( strstr(host,"www.myspace.com") ) depth = 1;
	if ( strstr(host,"twitter.com"    ) ) depth = 1;
	if ( strstr(host,"www.facebook.com") ) depth = 1;
	// revert
	*path = c;

	// return false to indicate no recognized site detected
	if ( ! depth ) return false;

	// skip over the initial root / after the hostname
	p = path + 1;

	// no path really? root path? just return the hostname then
	if ( ! *p  && path - host + 6 < MAX_SITE_LEN ) {
		// for parsing
		char *x = m_site;
		// store www first if its a domain only url
		if ( ! m_hasSubdomain ) {
			gbmemcpy ( x , "www." , 4 );
			x += 4;
		}
		// store it
		gbmemcpy ( x , host , path - host );
		x += path - host;
		m_siteLen = x - m_site;
		m_site [ m_siteLen ] = '\0';
		return true;
	}

	// for depth: advance to the slash closing the "depth"-th path
	// component, or to the end of the url if there are fewer
	for ( ; *p ; p++ ) 
		if ( *p == '/' && --depth == 0 ) break;

	// would not fit into m_site
	if ( p - host + 6 >= MAX_SITE_LEN ) return false;

	// store host plus the leading path component(s) as the site
	goto storeIt;
}

Example #15

0

Show file

File: Words.cpp Project: privacore/open-source-search-engine

// . splits the text at "s" into a sequence of "words" and appends them to
//   the m_words/m_wordLens/m_wordIds/m_nodes (and m_tagIds, if allocated)
//   arrays, bumping m_numWords for each
// . three kinds of words are produced:
//   - tag words     (only if m_hasTags), one word per tag
//   - punct words   (runs of non-alnum chars), word id is 0
//   - alnum words   (also counted in m_numAlnumWords)
// . scanning stops at a NUL byte, once i reaches nodeLen, or once
//   m_numWords reaches m_preCount
// . computeWordIds: if true each alnum word gets hash64Lower_utf8() of its
//   text as word id; otherwise m_wordIds is left untouched for alnum words
// . always returns true; a word count above m_preCount is treated as a
//   logic error and triggers gbshutdownLogicError()
bool Words::addWords( char *s, int32_t nodeLen, bool computeWordIds ) {
	// byte offset of the scan position in "s"
	int32_t  i = 0;
	// byte offset where the current alnum word started
	int32_t  j;
	int32_t  wlen;

	// set once an apostrophe has been absorbed into the current alnum word
	bool hadApostrophe = false;

	// script of the most recently seen word char, used to break words
	// when the script changes
	UCScript oldScript = ucScriptCommon;
	UCScript saved;
	UCProps props;

 uptop:

	// bad utf8 can cause a breach
	if ( i >= nodeLen ) {
		goto done;
	}

	// end of string
	if ( ! s[i] ) {
		goto done;
	}

	// not an alnum char? then we have a tag or a punct word
	if ( !is_alnum_utf8( s + i ) ) {
		// no room left for another word
		if ( m_numWords >= m_preCount ) {
			goto done;
		}

		// tag?
		if ( s[i]=='<' && m_hasTags && isTagStart(s+i) ) {
			// get the tag id
			if( m_tagIds ) {
				if ( s[i + 1] == '/' ) {
					// skip over /
					m_tagIds[m_numWords] = ::getTagId( s + i + 2 );
					// flag it as an end tag
					m_tagIds[m_numWords] |= BACKBIT;
				} else {
					m_tagIds[m_numWords] = ::getTagId( s + i + 1 );
				}
			}

			m_words[m_numWords] = s + i;
			m_wordIds[m_numWords] = 0LL;

			// skip till end
			int32_t tagLen = getTagLen( s + i );
			m_wordLens[m_numWords] = tagLen;
			m_nodes[m_numWords] = 0;
			m_numWords++;

			// advance
			i += tagLen;
			goto uptop;
		}

		// it is a punct word, find end of it
		char *start = s+i;
		for ( ; s[i] ; i += getUtf8CharSize(s+i)) {
			// stop on < if we got tags
			if ( s[i] == '<' && m_hasTags ) {
				break;
			}

			// if we are simple ascii, skip quickly
			if ( is_ascii(s[i]) ) {
				// accumulate NON-alnum chars
				if ( ! is_alnum_a(s[i]) ) {
					continue;
				}

				// update
				oldScript = ucScriptCommon;

				// otherwise, stop we got alnum
				break;
			}

			// if we are utf8 we stop on special props
			UChar32 c = utf8Decode ( s+i );

			// keep going while it is not a word char
			if ( ! ucIsWordChar ( c ) ) {
				continue;
			}

			// update first though
			oldScript = ucGetScript ( c );

			// then stop
			break;
		}
		// record the punct word; it has no word id, node or tag id
		m_words        [ m_numWords  ] = start;
		m_wordLens     [ m_numWords  ] = s+i - start;
		m_wordIds      [ m_numWords  ] = 0LL;
		m_nodes        [ m_numWords  ] = 0;

		if (m_tagIds) {
			m_tagIds[m_numWords] = 0;
		}

		m_numWords++;
		goto uptop;
	}

	// get an alnum word
	j = i;
 again:
	// advance i to the first char that ends the alnum word. we come back
	// here after absorbing an apostrophe to scan the rest of the word
	for ( ; s[i] ; i += getUtf8CharSize(s+i) ) {
		// simple ascii?
		if ( is_ascii(s[i]) ) {
			// accumulate alnum chars
			if ( is_alnum_a(s[i]) ) continue;
			// update
			oldScript = ucScriptCommon;
			// otherwise, stop we got punct
			break;
		}
		// get the code point of the utf8 char
		UChar32 c = utf8Decode ( s+i );
		// get props
		props = ucProperties ( c );
		// good stuff?
		if ( props & (UC_IGNORABLE|UC_EXTEND) ) continue;
		// stop? if UC_WORCHAR is set, that means its an alnum
		if ( ! ( props & UC_WORDCHAR ) ) {
			// reset script between words
			oldScript = ucScriptCommon;
			break;
		}
		// save it
		saved = oldScript;
		// update here
		oldScript = ucGetScript(c);
		// treat ucScriptLatin (30) as common so we can have latin1
		// like char without breaking the word!
		if ( oldScript == ucScriptLatin ) oldScript = ucScriptCommon;
		// stop on this crap too i guess. like japanes chars?
		if ( props & ( UC_IDEOGRAPH | UC_HIRAGANA | UC_THAI ) ) {
			// include it
			i += getUtf8CharSize(s+i);
			// but stop
			break;
		}
		// script change?
		if ( saved != oldScript ) break;
	}
	
	// . java++, A++, C++ exception
	// . A+, C+, exception
	// . TODO: consider putting in Bits.cpp w/ D_CAN_BE_IN_PHRASE
	// . the plus signs are only absorbed if no alnum char follows them
	if ( s[i]=='+' ) {
		if ( s[i+1]=='+' && !is_alnum_utf8(&s[i+2]) ) i += 2;
		else if ( !is_alnum_utf8(&s[i+1]) ) i++;
	}
	// . c#, j#, ...
	if ( s[i]=='#' && !is_alnum_utf8(&s[i+1]) ) i++;

	// comma is ok if like ,ddd!d
	// i.e. a word of 1 to 3 digits followed by groups of ",ddd"
	if ( s[i]==',' && 
	     i-j <= 3 &&
	     is_digit(s[i-1]) ) {
		// if word so far is 2 or 3 chars, make sure digits
		if ( i-j >= 2 && ! is_digit(s[i-2]) ) goto nogo;
		if ( i-j >= 3 && ! is_digit(s[i-3]) ) goto nogo;
		// scan forward
		while ( s[i] == ',' &&
		        is_digit(s[i+1]) &&
		        is_digit(s[i+2]) &&
		        is_digit(s[i+3]) &&
		        ! is_digit(s[i+4]) ) {
			i += 4;
		}
	}

	// decimal point?
	if ( s[i] == '.' &&
	     is_digit(s[i-1]) &&
	     is_digit(s[i+1]) ) {
		// allow the decimal point
		i++;
		// skip over string of digits
		while ( is_digit(s[i]) ) i++;
	}
	
 nogo:

	// allow for words like we're dave's and i'm
	// only one apostrophe per word is absorbed
	if ( s[i] == '\'' && s[i + 1] && is_alnum_utf8( &s[i + 1] ) && !hadApostrophe ) {
		i++;
		hadApostrophe = true;
		goto again;
	}
	// reset for the next alnum word
	hadApostrophe = false;
	
	// get word length
	wlen = i - j;
	// no room left for another word
	if ( m_numWords >= m_preCount ) goto done;
	m_words   [ m_numWords  ] = &s[j];
	m_wordLens[ m_numWords  ] = wlen;

	if ( computeWordIds ) {
		int64_t h = hash64Lower_utf8(&s[j],wlen);
		m_wordIds [m_numWords] = h;
	}

	m_nodes[m_numWords] = 0;
	if (m_tagIds) m_tagIds[m_numWords] = 0;
	m_numWords++;
	m_numAlnumWords++;
	// get a punct word
	goto uptop;

 done:
	// bad programming warning
	if ( m_numWords > m_preCount ) {
		log(LOG_LOGIC, "build: words: set: Fix counting routine.");
		gbshutdownLogicError();
	}

	return true;
}

Example #16

0

Show file

File: Log.cpp Project: FlavioFalcao/open-source-search-engine

// . formats "msg" into a single line and writes it to m_fd, or to stderr
//   if m_fd is negative, then records the msg in the m_error* arrays
// . "now" is the timestamp to print; if it is 0 the clock synced with
//   host #0 is used instead
// . the leading alphanumeric label of "msg", plus one following ':' and one
//   following ' ', is stripped from the printed line
// . "asterisk" is not used by this routine
// . returns true only if shouldLog() filtered the msg out, returns the
//   result of logLater() when called from a signal handler, and returns
//   false in every other case (including after a successful write)
bool Log::logR ( long long now , long type , char *msg , bool asterisk ,
		 bool forced ) {

	// return true if we should not log this
	if ( ! forced && ! shouldLog ( type , msg ) ) return true;
	// if we're in a sig handler hand the msg to logLater() instead of
	// formatting and writing it here
	if ( g_inSigHandler ) 
		return logLater ( now , type , msg , NULL );
	// get "msg"'s length
	long msgLen = gbstrlen ( msg );

#ifdef PTHREADS
	// lock for threads
	pthread_mutex_lock ( &s_lock );
#endif

	// do a timestamp, too. use the time synced with host #0 because
	// it is easier to debug because all log timestamps are in sync.
	if ( now == 0 ) now = gettimeofdayInMillisecondsGlobalNoCore();

	// . skip all logging if power out, we do not want to screw things up
	// . allow logging for 10 seconds after power out though
	if ( ! g_process.m_powerIsOn && now - g_process.m_powerOffTime >10000){
#ifdef PTHREADS
		pthread_mutex_unlock ( &s_lock );
#endif
		return false;
	}

	// . chop off any spaces at the end of the msg
	// . check the length BEFORE indexing so an empty msg never reads
	//   msg[-1]
	while ( msgLen > 0 && is_wspace_a ( msg [ msgLen - 1 ] ) ) msgLen--;
	// get this pid
	pid_t pid = getpidtid();
	// a tmp buffer to build the line in
	char tt [ MAX_LINE_LEN ];
	// start out as the empty string so the gbstrlen() calls below are
	// well defined even when nothing gets printed into the buffer
	tt[0] = '\0';
	char *p    = tt;
	char *pend = tt + MAX_LINE_LEN;

	// print timestamp and hostid, padding the hostid based on how many
	// hosts are in the network. nothing is printed for 100000+ hosts.
	if ( m_logTimestamps ) {
		if ( g_hostdb.m_numHosts <= 999 ) 
			sprintf ( p , "%llu %03li ",
				  now , g_hostdb.m_hostId );
		else if ( g_hostdb.m_numHosts <= 9999 ) 
			sprintf ( p , "%llu %04li ",
				  now , g_hostdb.m_hostId );
		else if ( g_hostdb.m_numHosts <= 99999 ) 
			sprintf ( p , "%llu %05li ",
				  now , g_hostdb.m_hostId );
		p += gbstrlen ( p );
	}

	// msg resource
	char *x = msg;
	// skip over the leading alnum label, we do not print it for now
	while ( p < pend && *x && is_alnum_a(*x) ) x++;
	// print the thread id if it differs from s_pid
	if ( pid != s_pid && s_pid != -1 ) {
		sprintf ( p , "[%lu] " , (unsigned long)pid );
		p += gbstrlen ( p );
	}
	// then message itself. "avail" leaves room for the terminating \0.
	long avail = (MAX_LINE_LEN) - (p - tt) - 1;
	if ( msgLen > avail ) msgLen = avail;
	if ( *x == ':' ) x++;
	if ( *x == ' ' ) x++;
	strncpy ( p , x , avail );
	// strncpy() does not write a \0 if "x" has "avail" or more bytes,
	// so always terminate ourselves before measuring the result
	p[avail] = '\0';
	p += gbstrlen(p);
	// back up over spaces, but never back up past the buffer start
	while ( p > tt && p[-1] == ' ' ) p--;
	*p ='\0';
	// the total length, not including the \0
	long tlen = p - tt;

	// make sure we have room in m_buf and in the arrays, dumping the
	// log first if we do not.
	// NOTE: TODO: this is shaky -- fix it!
	if ( m_bufPtr + tlen  >= 1024 * 32 ||  m_numErrors  >= MAX_LOG_MSGS){
		// this sets m_bufPtr to 0
		if ( ! dumpLog ( ) ) {
			fprintf(stderr,"Log::log: could not dump to file!\n");
#ifdef PTHREADS
			pthread_mutex_unlock ( &s_lock );
#endif
			return false;
		}
	}
	// . filter out nasty chars from the message
	// . replace each byte of a binary char with a '.'
	char cs;
	char *ttp    = tt;
	char *ttpend = tt + tlen;
	for ( ; ttp < ttpend ; ttp += cs ) {
		cs = getUtf8CharSize ( ttp );
		if ( is_binary_utf8 ( ttp ) ) {
			// stop at the end of the line so a truncated
			// multibyte char at the very end can not make us
			// write over the \0 or past the end of tt[]
			for ( long k = 0 ; k < cs && ttp < ttpend ; k++ )
				*ttp++ = '.';
			// careful not to skip the already skipped bytes
			cs = 0;
			continue;
		}
		// convert \n's, \r's and \t's to spaces
		if ( *ttp == '\n' ) *ttp = ' ';
		if ( *ttp == '\r' ) *ttp = ' ';
		if ( *ttp == '\t' ) *ttp = ' ';
	}

	if ( m_fd >= 0 ) {
		write ( m_fd , tt , tlen );
		write ( m_fd , "\n", 1 );
	}
	else {
		// print it out for now
		fprintf ( stderr, "%s\n", tt );
	}

	// set the stuff in the array. note this stores the caller's
	// pointer and the trimmed length of the ORIGINAL msg, not of tt[].
	m_errorMsg      [m_numErrors] = msg;
	m_errorMsgLen   [m_numErrors] = msgLen;
	m_errorTime     [m_numErrors] = now;
	m_errorType     [m_numErrors] = type;
	// increase the # of errors
	m_numErrors++;

#ifdef PTHREADS
	// unlock for threads
	pthread_mutex_unlock ( &s_lock );
#endif
	return false;
}

Example #17

0

Show file

File: PageAddColl.cpp Project: exename/open-source-search-engine

// . builds and sends the "add collection" / "delete collection" admin page
// . "s" is the socket to reply on and "r" is the request whose parms
//   ("addcoll", "delcoll", "action", "guide") are read below
// . "add" true prints the add-collection form, false prints the checklist
//   of existing collections to delete
// . for XML/JSON reply formats this only checks that the needed parm was
//   supplied and then sends an error or success reply
// . nothing visible in this function adds or deletes a collection itself;
//   presumably that is handled elsewhere -- TODO confirm
// . returns whatever the g_httpServer send call used for the reply returns
bool sendPageAddDelColl ( TcpSocket *s , HttpRequest *r , bool add ) {
	// get collection name
	//int32_t  nclen;
	//char *nc   = r->getString ( "nc" , &nclen );
	//int32_t  cpclen;
	//char *cpc  = r->getString ( "cpc" , &cpclen );

	// start with a clean error state, it is checked further below
	g_errno = 0;

	//bool cast = r->getLong("cast",0);

	// error text for the html page. stays NULL until the g_errno check
	// after printAdminTop() below.
	const char *msg = NULL;

	// if any host in network is dead, do not do this
	//if ( g_hostdb.hasDeadHost() ) msg = "A host in the network is dead.";

	char format = r->getReplyFormat();


	// xml and json callers just get an error or success reply, no page
	if ( format == FORMAT_XML || format == FORMAT_JSON ) {
		// no addcoll given?
		int32_t  page = g_pages.getDynamicPageNumber ( r );
		const char *addcoll = r->getString("addcoll",NULL);
		const char *delcoll = r->getString("delcoll",NULL);
		// also accept the camel-cased spelling of each parm
		if ( ! addcoll ) addcoll = r->getString("addColl",NULL);
		if ( ! delcoll ) delcoll = r->getString("delColl",NULL);
		if ( page == PAGE_ADDCOLL && ! addcoll ) {
			g_errno = EBADENGINEER;
			// NOTE(review): this "msg" shadows the outer "msg"
			const char *msg = "no addcoll parm provided";
			return g_httpServer.sendErrorReply(s,g_errno,msg,NULL);
		}
		if ( page == PAGE_DELCOLL && ! delcoll ) {
			g_errno = EBADENGINEER;
			// NOTE(review): this "msg" shadows the outer "msg"
			const char *msg = "no delcoll parm provided";
			return g_httpServer.sendErrorReply(s,g_errno,msg,NULL);
		}
		return g_httpServer.sendSuccessReply(s,format);
	}

	// error?
	const char *action = r->getString("action",NULL);
	const char *addColl = r->getString("addcoll",NULL);


	// the page is built in "p", which is handed this 64k stack buffer
	char  buf [ 64*1024 ];
	SafeBuf p(buf, 64*1024);


	//
	// CLOUD SEARCH ENGINE SUPPORT - GIGABOT ERRORS
	//

	// "gmsg" points into "gtmp" and is non-NULL if the name is bad
	SafeBuf gtmp;
	char *gmsg = NULL;
	// is it too big?
	if ( action && addColl && strlen(addColl) > MAX_COLL_LEN ) {
		gtmp.safePrintf("search engine name is too long");
		gmsg = gtmp.getBufStart();
	}
	// from Collectiondb.cpp::addNewColl() ensure coll name is legit
	// "x" stops on the first char that is not alnum, '-' or '_'
	const char *x = addColl;
	for ( ; x && *x ; x++ ) {
		if ( is_alnum_a(*x) ) continue;
		if ( *x == '-' ) continue;
		if ( *x == '_' ) continue; // underscore now allowed
		break;
	}
	// did we stop before the end of the name? then it has a bad char.
	// NOTE(review): this appends to gtmp, so if the too-long msg was
	// printed above both messages end up in the same buffer.
	// NOTE(review): addColl and *x come from the request and are
	// printed into html here without any visible escaping -- confirm
	// whether that is safe.
	if ( x && *x ) {
		g_errno = EBADENGINEER;
		gtmp.safePrintf("<font color=red>Error. \"%s\" is a "
				"malformed name because it "
				"contains the '%c' character.</font><br><br>",
				addColl,*x);
		gmsg = gtmp.getBufStart();
	}

	//
	// END GIGABOT ERRORS
	//



	//
	// CLOUD SEARCH ENGINE SUPPORT
	//
	// if added the coll successfully, do not print same page, jump to
	// printing the basic settings page so they can add sites to it.
	// crap, this GET request, "r", is missing the "c" parm sometimes.
	// we need to use the "addcoll" parm anyway. maybe print a meta
	// redirect then?
	// NOTE(review): getLong() result is stored in a char here, so it is
	// narrowed -- confirm only small values are expected for "guide".
	char guide = r->getLong("guide",0);
	// do not redirect if gmsg is set, there was a problem with the name
	// ("msg" is still NULL at this point, so "! msg" is always true)
	if ( action && ! msg && format == FORMAT_HTML && guide && ! gmsg ) {
		//return g_parms.sendPageGeneric ( s, r, PAGE_BASIC_SETTINGS );
		// just redirect to it
		// if addColl is NULL nothing was printed into "p", so the
		// page sent below is whatever "p" holds at that point
		if ( addColl )
			p.safePrintf("<meta http-equiv=Refresh "
				      "content=\"0; URL=/admin/settings"
				      "?guide=1&c=%s\">",
				      addColl);
		return g_httpServer.sendDynamicPage (s,
						     p.getBufStart(),
						     p.length());
	}


	// print standard header
	g_pages.printAdminTop ( &p , s , r , NULL, 
				"onload=document."
				"getElementById('acbox').focus();");


	// turn any error code set so far into text for the page
	if ( g_errno ) {
		msg = mstrerror( g_errno );
	}

	// the red error banner is only printed outside of guide mode
	if ( msg && ! guide ) {
		const char *cc = "deleting";
		if ( add ) cc = "adding";
		p.safePrintf (
			  "<center>\n"
			  "<font color=red>"
			  "<b>Error %s collection: %s. "
			  "See log file for details.</b>"
			  "</font>"
			  "</center><br>\n",cc,msg);
	}

	//
	// CLOUD SEARCH ENGINE SUPPORT
	//
	// in guide mode the name problem text, "gmsg", is passed along here
	if ( add && guide )
		printGigabotAdvice ( &p , PAGE_ADDCOLL , r , gmsg );



	// print the add collection box
	// this branch always returns, so everything after it is only
	// reached when "add" is false
	if ( add /*&& (! nc[0] || g_errno ) */ ) {

		// table title depends on guide mode
		const char *t1 = "Add Collection";
		if ( guide ) t1 = "Add Search Engine";

		p.safePrintf (
			  "<center>\n<table %s>\n"
			   "<tr class=hdrow><td colspan=2>"
			  "<center><b>%s</b></center>"
			  "</td></tr>\n"
			  ,TABLE_STYLE
			  ,t1
			      );
		// so does the noun used in the name prompt
		const char *t2 = "collection";
		if ( guide ) t2 = "search engine";
		// pre-fill the text box with the submitted name, if any
		// NOTE(review): "str" is request data printed into an html
		// attribute without any visible escaping -- confirm.
		const char *str = addColl;
		if ( ! addColl ) str = "";
		p.safePrintf (
			      "<tr bgcolor=#%s>"
			      "<td><b>name of new %s to add</td>\n"
			      "<td><input type=text name=addcoll size=30 "
			      "id=acbox "
			      "value=\"%s\">"
			      "</td></tr>\n"
			      , LIGHT_BLUE
			      , t2 
			      , str
			      );

		// don't show the clone box if we are under gigabot the guide
		if ( ! guide )
			p.safePrintf(
				     "<tr bgcolor=#%s>"
				     "<td><b>clone settings from this "
				     "collection</b>"
				     "<br><font size=1>Copy settings from "
				     "this pre-existing collection. Leave "
				     "blank to "
				     "accept default values.</font></td>\n"
				     "<td><input type=text name=clonecoll "
				     "size=30>"
				     "</td>"
				     "</tr>"
				     , LIGHT_BLUE
				     );

		// collection pwds
		p.safePrintf(
			     "<tr bgcolor=#%s>"
			     "<td><b>collection passwords"
			     "</b>"
			     "<br><font size=1>List of white space separated "
			     "passwords allowed to adminster collection."
			     "</font>"
			     "</td>\n"
			     "<td><input type=text name=collpwd "
			     "size=60>"
			     "</td>"
			     "</tr>"
			     , LIGHT_BLUE
			     );

		// ips box for security
		p.safePrintf(
			     "<tr bgcolor=#%s>"
			     "<td><b>collection ips"
			     "</b>"

			     "<br><font size=1>List of white space separated "
			     "IPs allowed to adminster collection."
			     "</font>"

			     "</td>\n"
			     "<td><input type=text name=collips "
			     "size=60>"
			     "</td>"
			     "</tr>"
			     , LIGHT_BLUE
			     );

		// now list collections from which to copy the config
		//p.safePrintf (
		//	  "<tr><td><b>copy configuration from this "
		//	  "collection</b><br><font size=1>Leave blank to "
		//	  "accept default values.</font></td>\n"
		//	  "<td><input type=text name=cpc value=\"%s\" size=30>"
		//	  "</td></tr>\n",coll);
		p.safePrintf ( "</table></center><br>\n");

		// wrap up the form started by printAdminTop
		g_pages.printAdminBottom ( &p );
		int32_t bufLen = p.length();
		return g_httpServer.sendDynamicPage (s,p.getBufStart(),bufLen);
	}

	// if we added a collection, print its page
	//if ( add && nc[0] && ! g_errno ) 
	//	return g_parms.sendPageGeneric2 ( s , r , PAGE_SEARCH ,
	//					  nc , pwd );

	// no collections in use? then skip the delete checklist entirely
	if ( g_collectiondb.m_numRecsUsed <= 0 ) goto skip;

	// print all collections out in a checklist so you can check the
	// ones you want to delete, the values will be the id of that collectn
	p.safePrintf (
		  "<center>\n<table %s>\n"
		  "<tr class=hdrow><td><center><b>Delete Collections"
		  "</b></center></td></tr>\n"
		  "<tr bgcolor=#%s><td>"
		  "<center><b>Select the collections you wish to delete. "
		  //"<font color=red>This feature is currently under "
		  //"development.</font>"
		  "</b></center></td></tr>\n"
		  "<tr bgcolor=#%s><td>"
		  // table within a table
		  "<center><table width=20%%>\n",
		  TABLE_STYLE,
		  LIGHT_BLUE,
		  DARK_BLUE
		      );

	// one checkbox row per collection, empty slots are skipped
	for ( int32_t i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
		CollectionRec *cr = g_collectiondb.m_recs[i];
		if ( ! cr ) continue;
		// the checkbox value printed here is the collection name
		p.safePrintf (
			  "<tr bgcolor=#%s><td>"
			  "<input type=checkbox name=delcoll value=\"%s\"> "
			  "%s</td></tr>\n",
			  DARK_BLUE,
			  cr->m_coll,cr->m_coll);
	}
	p.safePrintf( "</table></center></td></tr></table><br>\n" );
skip:
	// wrap up the form started by printAdminTop
	g_pages.printAdminBottom ( &p );
	int32_t bufLen = p.length();
	return g_httpServer.sendDynamicPage (s,p.getBufStart(),bufLen);
}

Example #18

0

Show file

File: SearchInput.cpp Project: BKJackson/open-source-search-engine

// . sets m_sbuf1, m_sbuf2 and m_sbuf3 from the already-parsed search parms
// . m_sbuf1 is the advanced query: the sites/site/gblang/url/link
//   restrictions, the natural query, the quoted phrases, the plus and minus
//   terms and the mininlinks keyword
// . m_sbuf2 is the query to be used for spell checking and for display: it
//   gets only the natural query, the quoted phrases and the plus and minus
//   terms
// . m_sbuf3 just gets m_query2
// . also sets m_sitesQueryLen, m_displayQuery, m_displayQueryLen and m_qe
// . all three buffers are \0 terminated on return
// . currently always returns true
bool SearchInput::setQueryBuffers ( ) {

	m_sbuf1.reset();
	m_sbuf2.reset();
	m_sbuf3.reset();

	// figure out the query's charset, defaulting to utf-8 if none was
	// given or if we do not recognize it
	short qcs = csUTF8;
	if (m_queryCharset && m_queryCharsetLen){
		qcs = get_iana_charset(m_queryCharset, m_queryCharsetLen);
		if (qcs == csUnknown) qcs = csUTF8;
	}
	// NOTE: nothing below consumes csStr. every piece of the query is
	// copied into the buffers verbatim, without charset conversion.
	char *csStr = NULL;
	csStr = get_charset_str(qcs);

	// prepend sites terms as "(site:a UOR site:b ) | "
	long numSites = 0;
	if ( m_sites && m_sites[0] ) {
		char *s = m_sites;
		m_sbuf1.pushChar('(');
		for ( ;; ) {
			// skip to the start of the next site
			while ( *s && ! is_alnum_a(*s) ) s++;
			// bail if done
			if ( ! *s ) break;
			// the site runs until the next white space
			char *t = s;
			while ( *t && ! is_wspace_a(*t) ) t++;
			long len = t - s;
			// add site: term
			if ( numSites > 0 ) m_sbuf1.safeStrcpy ( " UOR " );
			m_sbuf1.safeStrcpy ( "site:" );
			m_sbuf1.safeMemcpy ( s , len );
			m_sbuf1.pushChar(' ');
			s = t;
			numSites++;
		}
		m_sbuf1.safePrintf(") | ");
		// inc totalLen
		m_sitesQueryLen = m_sitesLen + (numSites * 10);
	}
	// append site: term
	if ( m_siteLen > 0 ) {
		if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
		m_sbuf1.safePrintf("+site:");
		m_sbuf1.safeMemcpy(m_site,m_siteLen);
	}

	// append gblang: term
	if( m_gblang > 0 ) {
		if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
		m_sbuf1.safePrintf( "+gblang:%li |", m_gblang );
	}
	// append url: term
	if ( m_urlLen > 0 ) {
		if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
		m_sbuf1.safeStrcpy ( "+url:");
		m_sbuf1.safeMemcpy ( m_url , m_urlLen );
	}
	// append link: term
	if ( m_linkLen > 0 ) {
		if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
		m_sbuf1.safeStrcpy ( "+link:");
		m_sbuf1.safeMemcpy ( m_link , m_linkLen );
	}
	// append the natural query
	if ( m_queryLen > 0 ) {
		if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
		m_sbuf1.safeMemcpy ( m_query , m_queryLen );
		// add to spell checked buf, too
		if ( m_sbuf2.length() ) m_sbuf2.pushChar(' ');
		m_sbuf2.safeMemcpy ( m_query , m_queryLen );
	}
	// the secondary query goes into its own buffer
	if ( m_query2Len > 0 ) {
		if ( m_sbuf3.length() ) m_sbuf3.pushChar(' ');
		m_sbuf3.safeMemcpy ( m_query2 , m_query2Len );
	}
	// append quoted phrases to query as +"phrase"
	if ( m_quoteLen1 > 0 ) {
		if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
		m_sbuf1.safeStrcpy("+\"");
		m_sbuf1.safeMemcpy ( m_quote1 , m_quoteLen1 );
		m_sbuf1.safeStrcpy("\"");
		// add to spell checked buf, too
		if ( m_sbuf2.length() ) m_sbuf2.pushChar(' ');
		m_sbuf2.safeStrcpy("+\"");
		m_sbuf2.safeMemcpy ( m_quote1 , m_quoteLen1 );
		m_sbuf2.safeStrcpy("\"");
	}
	if ( m_quoteLen2 > 0 ) {
		if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
		m_sbuf1.safeStrcpy("+\"");
		m_sbuf1.safeMemcpy ( m_quote2 , m_quoteLen2 );
		m_sbuf1.safeStrcpy("\"");
		// add to spell checked buf, too
		if ( m_sbuf2.length() ) m_sbuf2.pushChar(' ');
		m_sbuf2.safeStrcpy("+\"");
		m_sbuf2.safeMemcpy ( m_quote2 , m_quoteLen2 );
		m_sbuf2.safeStrcpy("\"");
	}

	// . append plus terms, each one prefixed with a '+'
	// . a term is either a run of non-space chars or a "quoted phrase"
	if ( m_plusLen > 0 ) {
		char *s = m_plus, *send = m_plus + m_plusLen;
		if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
		if ( m_sbuf2.length() ) m_sbuf2.pushChar(' ');
		while (s < send) {
			// skip leading white space. always check the bounds
			// before looking at the char.
			while (s < send && isspace((unsigned char)*s)) s++;
			// stop if only white space was left
			if (s >= send) break;
			// find the end of this term
			char *s2 = s+1;
			if (*s == '\"') {
				// if there's no closing quote just treat
				// the end of the line as such
				while (s2 < send && *s2 != '\"') s2++;
				// include the closing quote in the term
				if (s2 < send) s2++;
			} else {
				while (s2 < send &&
				       !isspace((unsigned char)*s2)) s2++;
			}
			m_sbuf1.pushChar('+');
			m_sbuf2.pushChar('+');
			m_sbuf1.safeMemcpy ( s , s2 - s );
			m_sbuf2.safeMemcpy ( s , s2 - s );
			// advance to the next term, if any, so we only add
			// a separating space when another term follows
			s = s2;
			while (s < send && isspace((unsigned char)*s)) s++;
			if (s < send) {
				if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
				if ( m_sbuf2.length() ) m_sbuf2.pushChar(' ');
			}
		}

	}  
	// . append minus terms, each one prefixed with a '-'
	// . parsed exactly like the plus terms above
	if ( m_minusLen > 0 ) {
		char *s = m_minus, *send = m_minus + m_minusLen;
		if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
		if ( m_sbuf2.length() ) m_sbuf2.pushChar(' ');
		while (s < send) {
			// skip leading white space. always check the bounds
			// before looking at the char.
			while (s < send && isspace((unsigned char)*s)) s++;
			// stop if only white space was left
			if (s >= send) break;
			// find the end of this term
			char *s2 = s+1;
			if (*s == '\"') {
				// if there's no closing quote just treat
				// the end of the line as such
				while (s2 < send && *s2 != '\"') s2++;
				// include the closing quote in the term
				if (s2 < send) s2++;
			} else {
				while (s2 < send &&
				       !isspace((unsigned char)*s2)) s2++;
			}
			m_sbuf1.pushChar('-');
			m_sbuf2.pushChar('-');
			m_sbuf1.safeMemcpy ( s , s2 - s );
			m_sbuf2.safeMemcpy ( s , s2 - s );
			// advance to the next term, if any, so we only add
			// a separating space when another term follows
			s = s2;
			while (s < send && isspace((unsigned char)*s)) s++;
			if (s < send) {
				if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
				if ( m_sbuf2.length() ) m_sbuf2.pushChar(' ');
			}
		}
	}
	// append gbkeyword:numinlinks if they have &mininlinks=X, X>0
	long minInlinks = m_hr->getLong("mininlinks",0);
	if ( minInlinks > 0 ) {
		if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
		m_sbuf1.safePrintf ( "gbkeyword:numinlinks");
	}

	// null terms
	m_sbuf1.pushChar('\0');
	m_sbuf2.pushChar('\0');
	m_sbuf3.pushChar('\0');

	// the natural query, as held in the spell check buffer
	m_displayQuery = m_sbuf2.getBufStart();

	if ( ! m_displayQuery ) m_displayQuery = "";

	// do not display leading spaces
	while ( *m_displayQuery == ' ' ) m_displayQuery++;

	m_displayQueryLen = gbstrlen(m_displayQuery);

	// urlencoded display query
	urlEncode(m_qe,
		  MAX_QUERY_LEN*2,
		  m_displayQuery,
		  m_displayQueryLen);
	
	return true;
}