C++ (Cpp) getUtf8CharSize Exemples

Exemple #1

0

Afficher le fichier

Fichier : Words.cpp Projet : BillWangCS/open-source-search-engine

// Quick, rough estimate of how many words are in the buffer [p, p+plen).
// . walks alternating runs of non-alnum and alnum utf8 characters and
//   counts one word per run, plus one extra per '<' seen in a punct run
// . "niceness" is only handed to QUICKPOLL() while scanning
// . this url gives a m_preCount that is too low. why?
//   http://go.tfol.com/163/speed.asp
long countWords ( char *p , long plen , long niceness ) {
	char *const end = p + plen;
	long total = 1;

	// the body runs at least once, even when the buffer is empty
	do {
		// run of punctuation
		while ( p < end && ! is_alnum_utf8 ( p ) ) {
			// breathe
			QUICKPOLL ( niceness );
			// in case being set from xml tags, count as words now
			if ( *p == '<' ) total++;
			p += getUtf8CharSize ( p );
		}
		total++;

		// run of alnum
		while ( p < end && is_alnum_utf8 ( p ) ) {
			// breathe
			QUICKPOLL ( niceness );
			p += getUtf8CharSize ( p );
		}
		total++;
	} while ( p < end );

	// some extra for good measure
	return total + 10;
}

Exemple #2

0

Afficher le fichier

Fichier : Words.cpp Projet : privacore/open-source-search-engine

// Quick, rough estimate of how many words are in the buffer [p, p+plen).
// . each pass consumes one run of non-alnum utf8 chars followed by one run
//   of alnum utf8 chars and counts both as a word
// . every '<' inside a punct run is counted as an extra word
// . this url gives a m_preCount that is too low. why?
//   http://go.tfol.com/163/speed.asp
static int32_t countWords ( const char *p , int32_t plen ) {
	const char *const end = p + plen;
	int32_t total = 1;

	while ( p < end ) {
		// run of punctuation
		while ( p < end && ! is_alnum_utf8 ( p ) ) {
			// in case being set from xml tags, count as words now
			if ( *p == '<' ) total++;
			p += getUtf8CharSize ( p );
		}

		// run of alnum
		while ( p < end && is_alnum_utf8 ( p ) ) {
			p += getUtf8CharSize ( p );
		}

		// one for the punct run, one for the alnum run
		total += 2;
	}

	// some extra for good measure
	return total + 10;
}

Exemple #3

0

Afficher le fichier

Fichier : fctypes.cpp Projet : privacore/open-source-search-engine

// Structural check of the utf8 byte sequence in [txt, txt+tlen).
// . returns true for a NULL or empty buffer
// . returns false if a non-ascii byte does not have both hi bits set, if
//   getUtf8CharSize() reports 1 for a non-ascii byte, if a multi-byte
//   character is cut off by the end of the buffer, or if any of the first
//   three trailing bytes is not of the form 10xxxxxx
// . never reads at or beyond txt+tlen
bool verifyUtf8 ( const char *txt , int32_t tlen ) {
	if ( ! txt  || tlen <= 0 ) return true;
	char size;
	const char *p = txt;
	const char *pend = txt + tlen;
	for ( ; p < pend ; p += size ) {
		size = getUtf8CharSize(p);
		// skip if ascii
		if ( ! (p[0] & 0x80) ) continue;
		// ok, it's a utf8 char, it must have both hi bits set
		if ( (p[0] & 0xc0) != 0xc0 ) return false;
		// if only one byte, we are done..  how can that be?
		if ( size == 1 ) return false;
		// the character claims more bytes than are left in the buffer.
		// bail BEFORE looking at p[1..3] so we never read past pend.
		if ( size > pend - p ) return false;
		//if ( ! utf8IsSane ( p[0] ) ) return false;
		// successive utf8 chars must have & 0xc0 be equal to 0x80
		// but the first char it must equal 0xc0, both set
		if ( (p[1] & 0xc0) != 0x80 ) return false;
		if ( size == 2 ) continue;
		if ( (p[2] & 0xc0) != 0x80 ) return false;
		if ( size == 3 ) continue;
		if ( (p[3] & 0xc0) != 0x80 ) return false;
	}
	// we must land exactly on the end of the buffer
	if ( p != pend ) return false;
	return true;
}

Exemple #4

0

Afficher le fichier

Fichier : Unicode.cpp Projet : DeadNumbers/open-source-search-engine

int32_t stripAccentMarks (char *outbuf, int32_t outbufsize,
		       unsigned char *p, int32_t inbuflen) {
	char *s = (char *)p;
	char *send = (char *)p + inbuflen;
	int32_t cs;
	char *dst = outbuf;
	for ( ; s < send ; s += cs ) {
		// how big is this character?
		cs = getUtf8CharSize(s);
		// convert the utf8 character to UChar32
		UChar32 uc = utf8Decode ( s );
		// break "uc" into decomposition of UChar32s
		UChar32 ttt[32];
		int32_t klen = recursiveKDExpand(uc,ttt,32);
		if(klen>32){char *xx=NULL;*xx=0;}
		// sanity
		if ( dst + 5 > outbuf+outbufsize ) return -1;
		// if the same, leave it! it had no accent marks or other
		// modifiers...
		if ( klen <= 1 ) {
			gbmemcpy ( dst , s , cs );
			dst += cs;
			continue;
		}
		// take the first one as the stripped
		// convert back to utf8
		int32_t stored = utf8Encode ( ttt[0] , dst );
		// skip over the stored utf8 char
		dst += stored;
	}
	// sanity. breach check
	if ( dst > outbuf+outbufsize ) { char *xx=NULL;*xx=0; }
	// return # of bytes stored into outbuf
	return dst - outbuf;
}

Exemple #5

0

Afficher le fichier

Fichier : Unicode.cpp Projet : BKJackson/open-source-search-engine

// Convert the utf8 text in [p, p+inbuflen) to single-byte characters.
// . one-byte characters are copied as-is
// . two-byte characters whose lead byte is 0xc3..0xc6 are mapped through
//   the ascii_c3..ascii_c6 tables, indexed by (second byte - 0x80)
// . returns the number of bytes stored into outbuf; stops early, without
//   error, once outbuf is full
// . returns -1 on anything it cannot convert: a character longer than two
//   bytes, an unsupported lead byte, a second byte outside 0x80..0xbf, or a
//   two-byte character cut off by the end of the input
// . does not write a terminating \0
long utf8ToAscii(char *outbuf, long outbufsize,
		 unsigned char *p, long inbuflen) { // inbuf

	char *dst = outbuf;
	unsigned char *pend = p + inbuflen;
	char *dend = outbuf + outbufsize;
	char cs;
	for ( ; p < pend ; p += cs ) {
		// do not breach
		if ( dst >= dend ) break;
		// get the size
		cs = getUtf8CharSize(p);
		// deal with one ascii char quickly
		if ( cs == 1 ) {
			*dst++ = *p;
			continue;
		}
		// we do not know how to convert this!
		if ( cs != 2 ) return -1;
		// the second byte would lie beyond the input: do not read it
		if ( pend - p < 2 ) return -1;
		// pick the table for this lead byte
		char *table ;
		if      ( *p == 0xc3 ) table = ascii_c3;
		else if ( *p == 0xc4 ) table = ascii_c4;
		else if ( *p == 0xc5 ) table = ascii_c5;
		else if ( *p == 0xc6 ) table = ascii_c6;
		else return -1;

		// second byte must be a continuation byte, 0x80..0xbf
		if ( p[1] < 0x80 ) return -1;
		if ( p[1] > 0xbf ) return -1;

		*dst++ = table[p[1]-0x80];
	}
	return dst - outbuf;
}

Exemple #6

0

Afficher le fichier

Fichier : fctypes.cpp Projet : privacore/open-source-search-engine

// Returns true if any utf8 character in [s, send) is alphabetic.
// . single-byte characters are tested with is_alpha_a()
// . multi-byte characters are tested with is_alpha_utf8()
bool has_alpha_utf8 ( char *s , char *send ) {
	while ( s < send ) {
		const char charLen = getUtf8CharSize ( s );
		if ( charLen == 1 ) {
			if ( is_alpha_a ( *s ) ) return true;
		}
		else if ( is_alpha_utf8 ( s ) ) {
			return true;
		}
		s += charLen;
	}
	return false;
}

Exemple #7

0

Afficher le fichier

Fichier : Words.cpp Projet : privacore/open-source-search-engine

// Quick, rough estimate of how many words are in the \0-terminated string p.
// . each pass consumes one run of non-alnum utf8 chars followed by one run
//   of alnum utf8 chars and counts both as a word
// . every '<' inside a punct run is counted as an extra word
static int32_t countWords ( const char *p ) {
	int32_t total = 1;

	while ( *p ) {
		// run of punctuation
		while ( *p && ! is_alnum_utf8 ( p ) ) {
			// in case being set from xml tags, count as words now
			if ( *p == '<' ) total++;
			p += getUtf8CharSize ( p );
		}

		// run of alnum
		while ( *p && is_alnum_utf8 ( p ) ) {
			p += getUtf8CharSize ( p );
		}

		// one for the punct run, one for the alnum run
		total += 2;
	}

	// some extra for good measure
	return total + 10;
}

Exemple #8

0

Afficher le fichier

Fichier : Synonyms.cpp Projet : nikhs/open-source-search-engine

// Add the accent-stripped form of word "w" (wlen bytes) as a synonym.
// . does nothing and returns true if the word is longer than 200 bytes,
//   has no multi-byte utf8 character, cannot be stripped, is unchanged by
//   stripping, or its hash is already in dedup table "dt"
// . otherwise records the stripped word in the m_*Ptr arrays and appends
//   it, \0-terminated, to m_synWordBuf
// . return false and set g_errno on error
bool Synonyms::addStripped ( char *w , long wlen , HashTableX *dt ) {
	// avoid overflow
	if ( wlen > 200 ) return true;

	// require utf8
	bool hadUtf8 = false;
	char size;
	for ( long i = 0 ; i < wlen ; i += size ) {
		size = getUtf8CharSize(w+i);
		if ( size == 1 ) continue;
		hadUtf8 = true;
		break;
	}
	if ( ! hadUtf8 ) return true;

	// filter out accent marks. hold back the last byte of abuf so there
	// is always room for the \0 we add below.
	char abuf[256];
	//long alen = utf8ToAscii(abuf,256,(unsigned char *)w,wlen);
	long alen = stripAccentMarks(abuf,(long)sizeof(abuf)-1,
				     (unsigned char *)w,wlen);
	// skip if can't convert to ascii... (unsupported letter)
	if ( alen < 0 ) return true;

	// stripAccentMarks() only returns a length, so terminate the string
	// ourselves. safeStrcpy() below would otherwise run off the end of
	// the stripped word into uninitialized stack bytes.
	abuf[alen] = '\0';

	// if same as original word, skip
	if ( wlen==alen && strncmp(abuf,w,wlen) == 0 ) return true;

	// hash it
	uint64_t h2 = hash64Lower_utf8(abuf,alen);
	// do not add dups
	if ( dt->isInTable ( &h2 ) ) return true;
	// add to dedup table. return false with g_errno set
	if ( ! dt->addKey ( &h2 ) ) return false;



	// store that
	*m_aidsPtr++ = h2;
	*m_wids0Ptr++ = 0LL;
	*m_wids1Ptr++ = 0LL;
	*m_termPtrsPtr++ = NULL;
	*m_termOffsPtr++ = m_synWordBuf.length();
	*m_termLensPtr++ = alen;
	*m_numAlnumWordsPtr++ = 1;
	*m_numAlnumWordsInBasePtr++ = 1;
	*m_srcPtr++ = SOURCE_GENERATED;

	m_synWordBuf.safeStrcpy(abuf);
	m_synWordBuf.pushChar('\0');

	return true;
}

Exemple #9

0

Afficher le fichier

Fichier : Words.cpp Projet : BillWangCS/open-source-search-engine

unsigned char getCharacterLanguage ( char *utf8Char ) {
	// romantic?
	char cs = getUtf8CharSize ( utf8Char );
	// can't say what language it is
	if ( cs == 1 ) return langUnknown;
	// convert to 32 bit unicode
	UChar32 c = utf8Decode ( utf8Char );
	UCScript us = ucGetScript ( c );
	// arabic? this also returns for persian!! fix?
	if ( us == ucScriptArabic ) 
		return langArabic;
	if ( us == ucScriptCyrillic )
		return langRussian;
	if ( us == ucScriptHebrew )
		return langHebrew;
	if ( us == ucScriptGreek )
		return langGreek;

	return langUnknown;
}

Exemple #10

0

Afficher le fichier

Fichier : Title.cpp Projet : lemire/open-source-search-engine

// . copy just words in [t0,t1) of "w" into m_title and set m_titleLen
// . '<' and '>' are stored as "&lt;" and "&gt;", characters for which
//   isUtf8UnwantedSymbols() is true are dropped
// . if the copy stops early (buffer full or m_maxTitleLen characters
//   reached) the title is cut at the last remembered punctuation position
//   and "..." is appended
// . returns true on success, including the "no title" case where reset()
//   is called
// . returns false, with an empty title, if the words would need more than
//   MAX_TITLE_LEN bytes
// . NOTE(review): the header this was copied from says g_errno is set on
//   error, but no g_errno assignment is visible in this function
bool Title::copyTitle(Words *w, int32_t t0, int32_t t1) {
	// shortcuts to the word pointers, lengths and count
	const char *const *wp    = w->getWords();
	const int32_t     *wlens = w->getWordLens();
	int32_t            nw    = w->getNumWords();

	// sanity check: an inverted range is a logic error, force a core
	if ( t1 < t0 ) { char *xx = NULL; *xx = 0; }

	// don't breach number of words
	if ( t1 > nw ) {
		t1 = nw;
	}

	// no title?
	if ( nw == 0 || t0 == t1 ) {
		reset();
		return true;
	}

	// one past the last byte of the last word in the range
	const char *end = wp[t1-1] + wlens[t1-1] ;

	// bytes spanned by the words, from start of word t0 to end of t1-1
	int32_t need = end - wp[t0];

	// add 3 bytes for "..." and 1 for \0
	// NOTE(review): the code adds 5, one more than the 3+1 described
	need += 5;

	// return false if could not hold the title
	if ( need > MAX_TITLE_LEN ) {
		m_title[0] = '\0';
		m_titleLen = 0;
		log("query: Could not alloc %" PRId32" bytes for title.",need);
		return false;
	}

	// point to the title to transcribe
	const char *src    = wp[t0];
	const char *srcEnd = end;

	// include a \" or \' that sits right before the first word
	if ( t0 > 0 && ( src[-1] == '\'' || src[-1] == '\"' ) ) {
		src--;
	}

	// and remove terminating ':', ' ', '-', newline, carriage return or '|'
	for ( ; 
	      srcEnd > src && 
		      (srcEnd[-1] == ':' || 
		       srcEnd[-1] == ' ' ||
		       srcEnd[-1] == '-' ||
		       srcEnd[-1] == '\n' ||
		       srcEnd[-1] == '\r' ||
		       srcEnd[-1] == '|'   )    ; 
	      srcEnd-- );

	// store in here
	char *dst    = m_title;

	// leave room for "...\0"
	char *dstEnd = m_title + need - 4;

	// size of character in bytes, usually 1
	char cs ;

	// point to last punct char. starts at the beginning of the title, so
	// it is never NULL and the "else" branch near the end is not taken
	char *lastp = dst;//NULL;

	// number of characters (not bytes) looked at so far
	int32_t charCount = 0;
	// copy [src,srcEnd) into "dst". the loop increment advances BOTH
	// pointers by cs, so branches that store a different number of bytes
	// adjust dst before they "continue" to compensate.
	for ( ; src < srcEnd ; src += cs , dst += cs ) {
		// get src size
		cs = getUtf8CharSize ( src );

		// break if we are full!
		if ( dst + cs >= dstEnd ) {
			break;
		}

		// or hit our max char limit
		if ( charCount++ >= m_maxTitleLen ) {
			break;
		}

		// skip unwanted character. nothing is stored, so pre-subtract
		// cs to cancel the dst += cs of the loop increment
		if (isUtf8UnwantedSymbols(src)) {
			dst -= cs;
			continue;
		}

		// remember last punct for cutting purposes
		if ( ! is_alnum_utf8 ( src ) ) {
			lastp = dst;
		}

		// '<' is always stored as the html entity "&lt;"
		if ( *src == '<' ) {
			if ( dst + 4 >= dstEnd ) {
				break;
			}

			gbmemcpy ( dst , "&lt;" , 4 );
			// 4 bytes stored; the loop increment adds the other cs
			dst += 4 - cs;
			continue;
		}

		// '>' is always stored as the html entity "&gt;"
		if ( *src == '>' ) {
			if ( dst + 4 >= dstEnd ) {
				break;
			}

			gbmemcpy ( dst , "&gt;" , 4 );
			// 4 bytes stored; the loop increment adds the other cs
			dst += 4 - cs;
			continue;
		}

		// if more than 1 byte in char, use gbmemcpy
		if ( cs == 1 ) {
			*dst = *src;
		} else {
			gbmemcpy ( dst , src , cs );
		}
	}

	// null term always
	*dst = '\0';
	
	// we stopped before the end of the source, so the title is truncated.
	// do not split a word in the middle! cut at the last punct instead.
	if ( src < srcEnd ) { 
		if ( lastp ) {
			gbmemcpy ( lastp , "...\0" , 4 );
			dst = lastp + 3;
		} else {
			gbmemcpy ( dst   , "...\0" , 4 );
			dst += 3;
		}
	}

	// set size. does not include the terminating \0
	m_titleLen = dst - m_title;

	return true;
}

Exemple #11

0

Afficher le fichier

Fichier : Log.cpp Projet : privacore/open-source-search-engine

// Format one log line into a stack buffer and write it to the log file
// (m_fd) or, if there is no log file, to stderr.
// . "now" is the timestamp in milliseconds; 0 means "look it up here"
// . "type" is the log level, printed via getTypeString()
// . "msg" is the \0-terminated message; one leading ':' and one leading
//   ' ' are skipped
// . "forced" bypasses the shouldLog() filter
// . the line is truncated to fit MAX_LINE_LEN and bytes for which
//   is_binary_utf8() is true are replaced with '.'
// . returns true if logging is disabled or the message was filtered out,
//   false in every other case (including after a successful write)
bool Log::logR ( int64_t now, int32_t type, const char *msg, bool forced ) {
	if ( ! g_loggingEnabled ) {
		return true;
	}

	// return true if we should not log this
	if ( ! forced && ! shouldLog ( type , msg ) ) {
		return true;
	}

	// get "msg"'s length
	int32_t msgLen = strlen ( msg );

	ScopedLock sl(s_lock);

	// do a timestamp, too. use the time synced with host #0 because
	// it is easier to debug because all log timestamps are in sync.
	if ( now == 0 ) now = gettimeofdayInMillisecondsGlobalNoCore();

	// . skip all logging if power out, we do not want to screw things up
	// . allow logging for 10 seconds after power out though
	if ( ! g_process.m_powerIsOn && now - g_process.m_powerOffTime >10000){
		return false;
	}

	// chop off any spaces at the end of the msg. test the length FIRST
	// so an empty msg never reads msg[-1].
	// NOTE(review): msgLen is not used to bound the copy below, so this
	// trimming currently has no effect on what gets logged.
	while ( msgLen > 0 && is_wspace_a ( msg [ msgLen - 1 ] ) ) msgLen--;

	// a tmp buffer
	char tt [ MAX_LINE_LEN ];
	char *p    = tt;


	if (m_logPrefix) {
		if ( m_logTimestamps ) {
			if( m_logReadableTimestamps ) {
				time_t now_t = (time_t)(now / 1000);
				struct tm tm_buf;
				struct tm *stm = localtime_r(&now_t,&tm_buf);

				p += sprintf ( p , "%04d%02d%02d-%02d%02d%02d-%03d %04" PRId32" ", stm->tm_year+1900,stm->tm_mon+1,stm->tm_mday,stm->tm_hour,stm->tm_min,stm->tm_sec,(int)(now%1000), g_hostdb.m_hostId );
			} else {
				// pad the host id to the width the cluster
				// size calls for
				if ( g_hostdb.getNumHosts() <= 999 )
					p += sprintf ( p , "%" PRIu64 " %03" PRId32 " ", (uint64_t)now , g_hostdb.m_hostId );
				else if ( g_hostdb.getNumHosts() <= 9999 )
					p += sprintf ( p , "%" PRIu64" %04" PRId32" ", (uint64_t)now , g_hostdb.m_hostId );
				else if ( g_hostdb.getNumHosts() <= 99999 )
					p += sprintf ( p , "%" PRIu64" %05" PRId32" ", (uint64_t)now , g_hostdb.m_hostId );
			}
		}

		// Get thread id. pthread_self instead?
		unsigned tid=(unsigned)syscall(SYS_gettid);
		p += sprintf(p, "%06u ", tid);

		// Log level
		p += sprintf(p, "%s ", getTypeString(type));
	}

	// then message itself
	const char *x = msg;
	// room left in tt, keeping the very last byte back for the \0
	int32_t avail = (MAX_LINE_LEN) - (p - tt) - 1;
	if ( msgLen > avail ) msgLen = avail;
	if ( *x == ':' ) x++;
	if ( *x == ' ' ) x++;
	strncpy ( p , x , avail );
	// strncpy() does not terminate when the message fills all "avail"
	// bytes. use the byte we held back so strlen() below stays inside tt.
	p[avail] = '\0';
	// capitalize for consistency. no, makes grepping log msgs harder.
	//if ( is_alpha_a(*p) ) *p = to_upper_a(*p);
	p += strlen(p);
	// back up over spaces, but never to before the start of the buffer
	while ( p > tt && p[-1] == ' ' ) p--;
	// end in period or ? or !
	//if ( p[-1] != '?' && p[-1] != '.' && p[-1] != '!' )
	//	*p++ = '.';
	*p ='\0';
	// the total length, not including the \0
	int32_t tlen = p - tt;

	// . filter out nasty chars from the message
	// . replace each of their bytes with a '.'
	char cs;
	char *ttp    = tt;
	char *ttpend = tt + tlen;
	for ( ; ttp < ttpend ; ttp += cs ) {
		cs = getUtf8CharSize ( ttp );
		if ( is_binary_utf8 ( ttp ) ) {
			// a truncated multi-byte char at the very end may
			// claim more bytes than remain, so stop at ttpend
			// rather than overwriting the \0 and beyond
			for ( int32_t k = 0 ; k < cs && ttp < ttpend ; k++ )
				*ttp++ = '.';
			// careful not to skip the already skipped bytes
			cs = 0;
			continue;
		}
	}

	// . if filesize would be too big then make a new log file
	// . should make a new m_fd
	if ( m_logFileSize + tlen+1 > MAXLOGFILESIZE && g_conf.m_logToFile )
		makeNewLogFile();

	if ( m_fd >= 0 ) {
		write ( m_fd , tt , tlen );
		write ( m_fd , "\n", 1 );
		m_logFileSize += tlen + 1;
	}
	else {
		// print it out for now
		fprintf ( stderr, "%s\n", tt );
	}

	return false;
}

Exemple #12

0

Afficher le fichier

Fichier : Words.cpp Projet : BillWangCS/open-source-search-engine

// Split the \0-terminated text "s" into words and append them to the
// m_words / m_wordLens / m_wordIds (and, when allocated, m_tagIds) arrays.
// . words alternate between punctuation words, alnum words and, when
//   m_hasTags is set, whole tags starting at '<'
// . scanning stops at offset "nodeLen", at the first \0 byte, or as soon as
//   m_numWords reaches m_preCount
// . "computeWordIds" controls whether alnum words get a hash64Lower_utf8()
//   word id
// . "niceness" is only handed to QUICKPOLL() while scanning
// . sets m_totalLen from the last stored word
// . always returns true; cores if more than m_preCount words were stored
bool Words::addWords(char *s,long nodeLen,bool computeWordIds, long niceness) {
	long  i = 0;
	long  j;
	//long  k = 0;
	long  wlen;
	//unsigned long e;
	//long  skip;
	// only the disabled code near the bottom ever incremented this
	long badCount = 0;

	// true while we are re-scanning after the one apostrophe we allow
	// inside a word
	bool hadApostrophe = false;

	// script of the last word char seen, used to split on script changes
	UCScript oldScript = ucScriptCommon;
	UCScript saved;
	UCProps props;

 uptop:

	// bad utf8 can cause a breach
	if ( i >= nodeLen ) goto done;

	if ( ! s[i] ) goto done;

	if ( ! is_alnum_utf8(s+i) ) { // && m_numWords < m_preCount ) {

		if ( m_numWords >= m_preCount ) goto done;

		// tag?
		if ( s[i]=='<' && m_hasTags && isTagStart(s+i) ) {
			// get the tag id. every other write to m_tagIds in
			// this function allows for it being NULL, so do the
			// same here instead of writing through a NULL ptr.
			if ( m_tagIds ) {
				if ( s[i+1]=='/' ) {
					// skip over /
					m_tagIds [m_numWords] = ::getTagId(s+i+2);
					m_tagIds [m_numWords] |= BACKBIT;
				}
				else
					m_tagIds [m_numWords] = ::getTagId(s+i+1);
			}
			// word start
			m_words    [m_numWords] = s + i;
			m_wordIds  [m_numWords] = 0LL;
			// skip till end
			long tagLen = getTagLen(s+i); // ,niceness);
			m_wordLens [m_numWords] = tagLen;
			m_numWords++;
			// advance
			i += tagLen;
			goto uptop;
		}

		// it is a punct word, find end of it
		char *start = s+i;
		//for (;s[i] && ! is_alnum_utf8(s+i);i+=getUtf8CharSize(s+i));
		for ( ; s[i] ; i += getUtf8CharSize(s+i)){
			// stop on < if we got tags
			if ( s[i] == '<' && m_hasTags ) break;
			// breathe
			QUICKPOLL(niceness);
			// if we are simple ascii, skip quickly
			if ( is_ascii(s[i]) ) {
				// accumulate NON-alnum chars
				if ( ! is_alnum_a(s[i]) ) continue;
				// update
				oldScript = ucScriptCommon;
				// otherwise, stop we got alnum
				break;
			}
			// if we are utf8 we stop on special props
			UChar32 c = utf8Decode ( s+i );
			// stop if word char
			if ( ! ucIsWordChar ( c ) ) continue;
			// update first though
			oldScript = ucGetScript ( c );
			// then stop
			break;
		}
		m_words        [ m_numWords  ] = start;
		m_wordLens     [ m_numWords  ] = s+i - start;
		m_wordIds      [ m_numWords  ] = 0LL;
		if (m_tagIds) m_tagIds[m_numWords] = 0;
		m_numWords++;
		goto uptop;
	}

	// get an alnum word. "j" remembers where it starts.
	j = i;
 again:
	//for ( ; is_alnum_utf8 (&s[i] ) ; i += getUtf8CharSize(s+i) );
	for ( ; s[i] ; i += getUtf8CharSize(s+i) ) {
		// breathe
		QUICKPOLL(niceness);
		// simple ascii?
		if ( is_ascii(s[i]) ) {
			// accumulate alnum chars
			if ( is_alnum_a(s[i]) ) continue;
			// update
			oldScript = ucScriptCommon;
			// otherwise, stop we got punct
			break;
		}
		// get the code point of the utf8 char
		UChar32 c = utf8Decode ( s+i );
		// get props
		props = ucProperties ( c );
		// good stuff?
		if ( props & (UC_IGNORABLE|UC_EXTEND) ) continue;
		// stop? if UC_WORDCHAR is set, that means its an alnum
		if ( ! ( props & UC_WORDCHAR ) ) {
			// reset script between words
			oldScript = ucScriptCommon;
			break;
		}
		// save it
		saved = oldScript;
		// update here
		oldScript = ucGetScript(c);
		// treat ucScriptLatin (30) as common so we can have latin1
		// like char without breaking the word!
		if ( oldScript == ucScriptLatin ) oldScript = ucScriptCommon;
		// stop on this crap too i guess. like japanese chars?
		if ( props & ( UC_IDEOGRAPH | UC_HIRAGANA | UC_THAI ) ) {
			// include it
			i += getUtf8CharSize(s+i);
			// but stop
			break;
		}
		// script change?
		if ( saved != oldScript ) break;
	}
	
	// . java++, A++, C++ exception
	// . A+, C+, exception
	// . TODO: consider putting in Bits.cpp w/ D_CAN_BE_IN_PHRASE
	if ( s[i]=='+' ) {
		if ( s[i+1]=='+' && !is_alnum_utf8(&s[i+2]) ) i += 2;
		else if ( !is_alnum_utf8(&s[i+1]) ) i++;
	}
	// . c#, j#, ...
	if ( s[i]=='#' && !is_alnum_utf8(&s[i+1]) ) i++;
	
	// allow for words like we're dave's and i'm. only one apostrophe
	// per word: hadApostrophe blocks a second trip through "again".
	if(s[i]=='\''&&s[i+1]&&is_alnum_utf8(&s[i+1])&&!hadApostrophe){
		i++;
		hadApostrophe = true;
		goto again;
	}
	hadApostrophe = false;
	
	// get word length
	wlen = i - j;
	if ( m_numWords >= m_preCount ) goto done;
	m_words   [ m_numWords  ] = &s[j];
	m_wordLens[ m_numWords  ] = wlen;
	// . Lars says it's better to leave the accented chars intact
	// . google agrees
	// . but what about "re'sume"?
	if ( computeWordIds ) {
		long long h = hash64Lower_utf8(&s[j],wlen);
		m_wordIds [m_numWords] = h;
		// until we get an accent removal algo, comment this
		// out and possibly use the query synonym pipeline
		// to search without accents. MDW
		//long long h2 = hash64AsciiLowerE(&s[j],wlen);
		//if ( h2 != h ) m_stripWordIds [m_numWords] = h2;
		//else           m_stripWordIds [m_numWords] = 0LL;
		//m_stripWordIds[m_numWords] = 0;
	}
	if (m_tagIds) m_tagIds[m_numWords] = 0;
	m_numWords++;
	m_numAlnumWords++;
	// break on \0 or MAX_WORDS
	//if ( ! s[i] ) goto done;
	// get a punct word
	goto uptop;
	/*
	  j = i;
	  // delineate the "punctuation" word
	  for ( ; s[i] && !is_alnum_utf8(&s[i]);i+=getUtf8CharSize(s+i));
	  // bad utf8 could cause us to breach the node, so watch out!
	  if ( i > nodeLen ) {
	  badCount++;
	  i = nodeLen;
	  }
	  // get word length
	  wlen = i - j;
	  if ( m_numWords >= m_preCount ) goto done;
	  m_words        [m_numWords  ] = &s[j];
	  m_wordLens     [m_numWords  ] = wlen;
	  m_wordIds      [m_numWords  ] = 0LL;
	  if (m_tagIds) m_tagIds[m_numWords] = 0;
	  m_numWords++;
	*/

 done:
	// bad programming warning: we stored more words than were
	// pre-counted, so log it and force a core
	if ( m_numWords > m_preCount ) {
		log(LOG_LOGIC,
		    "build: words: set: Fix counting routine.");
		char *xx = NULL; *xx = 0;
	}
	// compute total length: from "s" to the end of the last word
	if ( m_numWords <= 0 ) m_totalLen = 0;
	else m_totalLen = m_words[m_numWords-1] - s + m_wordLens[m_numWords-1];

	if ( badCount )
		log("words: had %li bad utf8 chars",badCount);

	return true;
}

Exemple #13

0

Afficher le fichier

Fichier : Words.cpp Projet : BillWangCS/open-source-search-engine

// Fill this Words object from the nodes [node1, node2) of "xml".
// . tag nodes become one word each; text nodes are split by addWords()
// . node2 < 0 means "through the last node"
// . "computeWordIds" and "niceness" are passed through to addWords()
// . returns true on success or when there is nothing to do, false if the
//   word buffers could not be allocated
// . cores if called again with the same Xml, or if node1 > node2
bool Words::set ( Xml *xml, 
		  bool computeWordIds , 
		  long niceness ,
		  long node1 ,
		  long node2 ) {
	// prevent setting with the same string
	if ( m_xml == xml ) { char *xx=NULL;*xx=0; }
	reset();
	m_xml     = xml;
	m_version = xml->getVersion();

	// one-time self test of the utf8 helpers
	if ( ! s_tested ) {
		// only do once
		s_tested = true;
		// a curling quote in unicode
		long quote = 0x201c; // 0x235e;
		// room for a leading space plus the encoded quote
		char buf[5];
		buf[0] = ' ';
		char *enc = buf + 1;
		// encode it into utf8 right after the space
		long stored = utf8Encode ( quote , enc );
		// must take 3 bytes
		if ( stored != 3 ) { char *xx=NULL; *xx=0; }
		// and the size lookup must agree
		long encSize = getUtf8CharSize(enc);
		if ( encSize != 3 ) { char *xx=NULL; *xx=0; }
		// and it must be classified as punctuation
		if ( ! is_punct_utf8 ( enc ) ) { char *xx=NULL;*xx=0; }
		// make sure can pair across
		//unsigned char bits = getPunctuationBits  ( dst , 4 );
		// must be able to pair across
		//if ( ! ( bits & D_CAN_PAIR_ACROSS ) ) { char *xx=NULL;*xx=0;}
	}

	// if xml is empty, bail
	if ( ! xml->getContent() ) return true;

	long numNodes = xml->getNumNodes();
	if ( numNodes <= 0 ) return true;

	// . can be given a range, if node2 is -1 that means all!
	// . range is half-open: [node1, node2)
	if ( node2 < 0 ) node2 = numNodes;
	// sanity check
	if ( node1 > node2 ) { char *xx=NULL;*xx=0; }

	// the bytes spanned by the node range
	char *start = xml->getNode(node1);
	char *end   = xml->getNode(node2-1) + xml->getNodeLen(node2-1);
	long  span  = end - start;

	// estimate the word count and allocate based on it
	m_preCount = countWords( start , span , niceness );
	if ( ! allocateWordBuffers(m_preCount, true)) return false;

	// walk the nodes until we run out of nodes or of pre-counted slots
	for ( long n = node1 ; n < node2 && m_numWords < m_preCount ; n++ ) {
		// get the nth node
		char *node    = xml->getNode   (n);
		long  nodeLen = xml->getNodeLen(n);

		if ( xml->isTag(n) ) {
			// a tag is stored as a single word
			m_words    [m_numWords] = node;
			m_wordLens [m_numWords] = nodeLen;
			m_tagIds   [m_numWords] = xml->getNodeId(n);
			m_wordIds  [m_numWords] = 0LL;
			m_nodes    [m_numWords] = n;
			// we have less than 127 HTML tags, so set 
			// the high bit for back tags
			if ( xml->isBackTag(n)) {
				m_tagIds[m_numWords] |= BACKBIT;
			}
			//log(LOG_DEBUG, "Words: Word %ld: got tag %s%s (%d)", 
			//    m_numWords,
			//    isBackTag(m_numWords)?"/":"",
			//    g_nodes[getTagId(m_numWords)].m_nodeName,
			//    getTagId(m_numWords));

			m_numWords++;
			// used by XmlDoc.cpp
			m_numTags++;
		}
		else {
			// text node: \0-terminate it in place for addWords()
			// and put the overwritten byte back afterwards
			char savedByte = node[nodeLen];
			node[nodeLen] = '\0';
			addWords(node,nodeLen,computeWordIds,niceness);
			node[nodeLen] = savedByte;
		}
	}
	return true;
}

Exemple #14

0

Afficher le fichier

Fichier : Words.cpp Projet : privacore/open-source-search-engine

// Split the \0-terminated text "s" into words and append them to the
// m_words / m_wordLens / m_wordIds / m_nodes (and, when allocated,
// m_tagIds) arrays.
// . words alternate between punctuation words, alnum words and, when
//   m_hasTags is set, whole tags starting at '<'
// . alnum words may absorb a trailing "+", "++" or "#", comma-grouped
//   digits, a decimal fraction, and a single inner apostrophe
// . scanning stops at offset "nodeLen", at the first \0 byte, or as soon as
//   m_numWords reaches m_preCount
// . "computeWordIds" controls whether alnum words get a hash64Lower_utf8()
//   word id
// . always returns true; calls gbshutdownLogicError() if more than
//   m_preCount words were stored
bool Words::addWords( char *s, int32_t nodeLen, bool computeWordIds ) {
	// i is the current byte offset into s, j the start of the alnum word
	int32_t  i = 0;
	int32_t  j;
	int32_t  wlen;

	// true while we are re-scanning after the one apostrophe we allow
	// inside a word
	bool hadApostrophe = false;

	// script of the last word char seen, used to split on script changes
	UCScript oldScript = ucScriptCommon;
	UCScript saved;
	UCProps props;

 uptop:

	// bad utf8 can cause a breach
	if ( i >= nodeLen ) {
		goto done;
	}

	if ( ! s[i] ) {
		goto done;
	}

	// not at an alnum char: store a tag word or a punctuation word
	if ( !is_alnum_utf8( s + i ) ) {
		if ( m_numWords >= m_preCount ) {
			goto done;
		}

		// tag?
		if ( s[i]=='<' && m_hasTags && isTagStart(s+i) ) {
			// get the tag id
			if( m_tagIds ) {
				if ( s[i + 1] == '/' ) {
					// skip over / and mark it as a back tag
					m_tagIds[m_numWords] = ::getTagId( s + i + 2 );
					m_tagIds[m_numWords] |= BACKBIT;
				} else {
					m_tagIds[m_numWords] = ::getTagId( s + i + 1 );
				}
			}

			// the whole tag is one word with no word id
			m_words[m_numWords] = s + i;
			m_wordIds[m_numWords] = 0LL;

			// skip till end
			int32_t tagLen = getTagLen( s + i );
			m_wordLens[m_numWords] = tagLen;
			m_nodes[m_numWords] = 0;
			m_numWords++;

			// advance
			i += tagLen;
			goto uptop;
		}

		// it is a punct word, find end of it
		char *start = s+i;
		for ( ; s[i] ; i += getUtf8CharSize(s+i)) {
			// stop on < if we got tags
			if ( s[i] == '<' && m_hasTags ) {
				break;
			}

			// if we are simple ascii, skip quickly
			if ( is_ascii(s[i]) ) {
				// accumulate NON-alnum chars
				if ( ! is_alnum_a(s[i]) ) {
					continue;
				}

				// update
				oldScript = ucScriptCommon;

				// otherwise, stop we got alnum
				break;
			}

			// if we are utf8 we stop on special props
			UChar32 c = utf8Decode ( s+i );

			// keep going unless it is a word char
			if ( ! ucIsWordChar ( c ) ) {
				continue;
			}

			// update first though
			oldScript = ucGetScript ( c );

			// then stop
			break;
		}
		// store the punctuation word, it has no word id
		m_words        [ m_numWords  ] = start;
		m_wordLens     [ m_numWords  ] = s+i - start;
		m_wordIds      [ m_numWords  ] = 0LL;
		m_nodes        [ m_numWords  ] = 0;

		if (m_tagIds) {
			m_tagIds[m_numWords] = 0;
		}

		m_numWords++;
		goto uptop;
	}

	// get an alnum word
	j = i;
 again:
	for ( ; s[i] ; i += getUtf8CharSize(s+i) ) {
		// simple ascii?
		if ( is_ascii(s[i]) ) {
			// accumulate alnum chars
			if ( is_alnum_a(s[i]) ) continue;
			// update
			oldScript = ucScriptCommon;
			// otherwise, stop we got punct
			break;
		}
		// get the code point of the utf8 char
		UChar32 c = utf8Decode ( s+i );
		// get props
		props = ucProperties ( c );
		// good stuff?
		if ( props & (UC_IGNORABLE|UC_EXTEND) ) continue;
		// stop? if UC_WORDCHAR is set, that means its an alnum
		if ( ! ( props & UC_WORDCHAR ) ) {
			// reset script between words
			oldScript = ucScriptCommon;
			break;
		}
		// save it
		saved = oldScript;
		// update here
		oldScript = ucGetScript(c);
		// treat ucScriptLatin (30) as common so we can have latin1
		// like char without breaking the word!
		if ( oldScript == ucScriptLatin ) oldScript = ucScriptCommon;
		// stop on this crap too i guess. like japanese chars?
		if ( props & ( UC_IDEOGRAPH | UC_HIRAGANA | UC_THAI ) ) {
			// include it
			i += getUtf8CharSize(s+i);
			// but stop
			break;
		}
		// script change?
		if ( saved != oldScript ) break;
	}
	
	// . java++, A++, C++ exception
	// . A+, C+, exception
	// . TODO: consider putting in Bits.cpp w/ D_CAN_BE_IN_PHRASE
	if ( s[i]=='+' ) {
		if ( s[i+1]=='+' && !is_alnum_utf8(&s[i+2]) ) i += 2;
		else if ( !is_alnum_utf8(&s[i+1]) ) i++;
	}
	// . c#, j#, ...
	if ( s[i]=='#' && !is_alnum_utf8(&s[i+1]) ) i++;

	// comma is ok if like ,ddd!d
	// only considered when the word so far is at most 3 bytes long and
	// its last byte is a digit
	if ( s[i]==',' && 
	     i-j <= 3 &&
	     is_digit(s[i-1]) ) {
		// if word so far is 2 or 3 chars, make sure digits
		if ( i-j >= 2 && ! is_digit(s[i-2]) ) goto nogo;
		if ( i-j >= 3 && ! is_digit(s[i-3]) ) goto nogo;
		// scan forward over each ",ddd" group that is not followed
		// by a fourth digit
		while ( s[i] == ',' &&
		        is_digit(s[i+1]) &&
		        is_digit(s[i+2]) &&
		        is_digit(s[i+3]) &&
		        ! is_digit(s[i+4]) ) {
			i += 4;
		}
	}

	// decimal point? must have a digit on both sides
	if ( s[i] == '.' &&
	     is_digit(s[i-1]) &&
	     is_digit(s[i+1]) ) {
		// allow the decimal point
		i++;
		// skip over string of digits
		while ( is_digit(s[i]) ) i++;
	}
	
 nogo:

	// allow for words like we're dave's and i'm. only one apostrophe
	// per word: hadApostrophe blocks a second trip through "again".
	if ( s[i] == '\'' && s[i + 1] && is_alnum_utf8( &s[i + 1] ) && !hadApostrophe ) {
		i++;
		hadApostrophe = true;
		goto again;
	}
	hadApostrophe = false;
	
	// get word length
	wlen = i - j;
	if ( m_numWords >= m_preCount ) goto done;
	m_words   [ m_numWords  ] = &s[j];
	m_wordLens[ m_numWords  ] = wlen;

	// hash the word into its word id only if asked to
	if ( computeWordIds ) {
		int64_t h = hash64Lower_utf8(&s[j],wlen);
		m_wordIds [m_numWords] = h;
	}

	m_nodes[m_numWords] = 0;
	if (m_tagIds) m_tagIds[m_numWords] = 0;
	m_numWords++;
	m_numAlnumWords++;
	// get a punct word
	goto uptop;

 done:
	// bad programming warning: we stored more words than were pre-counted
	if ( m_numWords > m_preCount ) {
		log(LOG_LOGIC, "build: words: set: Fix counting routine.");
		gbshutdownLogicError();
	}

	return true;
}

Exemple #15

0

Afficher le fichier

Fichier : Summary.cpp Projet : exename/open-source-search-engine

// . score a candidate summary excerpt (a "window") built around match #mm
// . the window is the half-open interval [a,b) where a and b are word #'s
//   in the Words array that match #mm points to
// . "lasta" is the left edge used by the previous call; the window is not
//   grown to the left past it. it is overwritten with this call's left edge.
// . "besta"/"bestb" receive the window. both are set to -1 (and 0 is
//   returned) when the match can not be used: its word # is out of range,
//   the word is already used, or it sits in a script/style/select/title
//   section.
// . "gotIt" is indexed by query word # and is incremented (capped at 100)
//   for every query word matched inside the window
// . "retired" is indexed by query word # and is only read here
// . "maxExcerptLen" limits the window size, measured with the Pos array
// . returns the score of the window, which may be negative
int64_t Summary::getBestWindow ( Matches *matches, int32_t mm, int32_t *lasta,
                                 int32_t *besta, int32_t *bestb, char *gotIt,
                                 char *retired, int32_t maxExcerptLen ) {
	// get the window around match #mm
	Match *m = &matches->m_matches[mm];

	// what is the word # of match #mm?
	int32_t matchWordNum = m->m_wordNum;

	// what Words/Pos/Bits classes is this match in?
	Words *words = m->m_words;
	Section **sp = NULL;
	int32_t *pos = m->m_pos->m_pos;

	// use "m_swbits" not "m_bits", that is what Bits::setForSummary() uses
	const swbit_t *bb = m->m_bits->m_swbits;

	// shortcut
	if ( m->m_sections ) {
		sp = m->m_sections->m_sectionPtrs;
	}

	int32_t nw = words->getNumWords();
	int64_t *wids = words->getWordIds();
	nodeid_t *tids = words->getTagIds();

	// . sanity check
	// . this prevents a core i've seen
	if ( matchWordNum >= nw ) {
		log("summary: got overflow condition for q=%s",m_q->m_orig);

		// assume no best window
		*besta = -1;
		*bestb = -1;
		*lasta = matchWordNum;
		return 0;
	}

	// . bail if the match word was already used in another excerpt or
	//   lives in a section we never show
	int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_IN_TITLE;
	if ( (bb[matchWordNum] & D_USED) || ( sp && (sp[matchWordNum]->m_flags & badFlags) ) ) {
		// assume no best window
		*besta = -1;
		*bestb = -1;
		*lasta = matchWordNum;
		return 0;
	}

	// . "a" is the left fence post of the window (it is a word # in Words)
	// . go to the left as far as we can 
	// . thus we decrement "a"
	int32_t a = matchWordNum;

	// "posa" is the character position of the END of word #a
	int32_t posa = pos[a+1];
	int32_t firstFrag = -1;
	bool startOnQuote = false;
	bool goodStart = false;
	int32_t wordCount = 0;

	// . decrease "a" as long as we stay within maxExcerptLen
	// . avoid duplicating windows by using "lasta", the last "a" of the
	//   previous call to getBestWindow(). This can happen if our last
	//   central query term was close to this one.
	for ( ; a > 0 && posa - pos[a-1] < maxExcerptLen && a > *lasta; a-- ) {
		// . don't include any "dead zone", 
		// . dead zones have already been used for the summary, and
		//   we are getting a second/third/... excerpt here now then
		// stop if its the start of a sentence, too
		// stop before title word
		if ( (bb[a-1] & D_USED) || (bb[a] & D_STARTS_SENTENCE) || ( bb[a-1] & D_IN_TITLE )) {
			goodStart = true;
			break;
		}

		// don't go beyond an LI, TR, P or DIV tag
		if ( tids && ( tids[a-1] == TAG_LI ||
		               tids[a-1] == TAG_TR ||
		               tids[a-1] == TAG_P  ||
		               tids[a-1] == TAG_DIV ) ) {
			goodStart = true;
			break;
		}

		// stop if its the start of a quoted sentence
		if ( a+1<nw && (bb[a+1] & D_IN_QUOTES) && 
		     words->getWord(a)[0] == '\"' ){
			startOnQuote = true;
			goodStart    = true;
			break;
		}

		// find out the first instance of a fragment (comma, etc)
		// watch out! because frag also means 's' in there's
		if ( ( bb[a] & D_STARTS_FRAG ) && !(bb[a-1] & D_IS_STRONG_CONNECTOR) && firstFrag == -1 ) {
			firstFrag = a;
		}

		if ( wids[a] ) {
			wordCount++;
		}
	}

	// if didn't find a good start, then start at the start of the frag
	if ( !goodStart && firstFrag != -1 ) {
		a = firstFrag;
	}

	// don't let punct or tag word start a line, unless a quote
	if ( a < matchWordNum && !wids[a] && words->getWord(a)[0] != '\"' ){
		while ( a < matchWordNum && !wids[a] ) a++;
		
		// do not break right after a "strong connector", like 
		// apostrophe
		while ( a < matchWordNum && a > 0 && 
			( bb[a-1] & D_IS_STRONG_CONNECTOR ) )
			a++;
		
		// don't let punct or tag word start a line
		while ( a < matchWordNum && !wids[a] ) a++;
	}

	// remember, b is not included in the summary, the summary is [a,b-1]
	// remember to include all words in a matched phrase
	int32_t b = matchWordNum + m->m_numWords ;
	int32_t endQuoteWordNum = -1;
	int32_t numTagsCrossed = 0;

	for ( ; b <= nw; b++ ) {
		if ( b == nw ) {
			break;
		}

		if ( pos[b+1] - pos[a] >= maxExcerptLen ) {
			break;
		}
		
		if ( startOnQuote && words->getWord(b)[0] == '\"' ) {
			endQuoteWordNum = b;
		}

		// don't include any dead zone, those are already-used samples
		if ( bb[b] & D_USED ) {
			break;
		}

		// stop on a title word
		if ( bb[b] & D_IN_TITLE ) {
			break;
		}

		if ( wids[b] ) {
			wordCount++;
		}

		// don't go beyond an LI or TR backtag
		if ( tids && ( tids[b] == (BACKBIT|TAG_LI) ||
		               tids[b] == (BACKBIT|TAG_TR) ) ) {
			numTagsCrossed++;

			// try to have atleast 10 words in the summary
			if ( wordCount > 10 ) {
				break;
			}
		}

		// go beyond a P or DIV backtag in case the earlier char is a
		// ':'. This came from a special case for wikipedia pages 
		// eg. http://en.wikipedia.org/wiki/Flyover
		if ( tids && ( tids[b] == (BACKBIT|TAG_P)  ||
		               tids[b] == (BACKBIT|TAG_DIV) )) {
			numTagsCrossed++;

			// try to have atleast 10 words in the summary
			if ( wordCount > 10 && words->getWord(b-1)[0] != ':' ) {
				break;
			}
		}
	}

	// the matched phrase may claim words beyond the end of the array.
	// clamp before anything below indexes with b-1 or b-2.
	if ( b > nw ) {
		b = nw;
	}

	// don't end on a lot of punct words
	if ( b > matchWordNum && !wids[b-1]){
		// remove more than one punct words. this only trims while we
		// have seen an end quote and are still to the right of it.
		// "b >= 2" keeps the [b-2] index inside the array.
		while ( b >= 2 && b > matchWordNum && !wids[b-2] && endQuoteWordNum != -1 && b > endQuoteWordNum ) {
			b--;
		}
		
		// do not break right after a "strong connector", like apostrophe
		while ( b >= 2 && b > matchWordNum && (bb[b-2] & D_IS_STRONG_CONNECTOR) ) {
			b--;
		}
	}

	Match *ms = matches->m_matches;

	// make m_matches.m_matches[mi] the first match in our [a,b) window
	int32_t mi ;

	// . the match at the center of the window is match #"mm", so that
	//   matches->m_matches[mm] is the Match class
	// . set "mi" to it and back up "mi" as long as its word # is >= a
	for ( mi = mm ; mi > 0 && ms[mi-1].m_wordNum >=a ; mi-- )
		;

	// now get the score of this excerpt. Also mark all the represented 
	// query words. Mark the represented query words in the array that
	// comes to us. also mark how many times the same word is repeated in
	// this summary.
	int64_t score = 0LL;

	// is a url contained in the summary, that looks bad! punish!
	bool hasUrl = false;

	// the word count we did above was just an approximate. count it right
	wordCount = 0;

	// debug text, only filled in when g_conf.m_logDebugSummary is set
	SafeBuf xp;

	// score every word in [a,b)
	for ( int32_t i = a ; i < b ; i++ ) {
		// debug print out: copy the word minus any binary chars
		if ( g_conf.m_logDebugSummary ) {
			int32_t len = words->getWordLen(i);
			char cs;
			for (int32_t k=0;k<len; k+=cs ) {
				const char *c = words->getWord(i)+k;
				cs = getUtf8CharSize(c);
				if ( is_binary_utf8 ( c ) ) {
					continue;
				}
				xp.safeMemcpy ( c , cs );
				xp.nullTerm();
			}
		}

		// skip if in bad section: script, style, select, title
		if ( sp && (sp[i]->m_flags & badFlags) ) {
			continue;
		}

		// don't count just numeric words
		if ( words->isNum(i) ) {
			continue;
		}

		// check if there is a url. best way to check for '://'
		if ( wids && !wids[i] ) {
			const char *wrd = words->getWord(i);
			int32_t  wrdLen = words->getWordLen(i);
			if ( wrdLen == 3 && wrd[0] == ':' && wrd[1] == '/' &&  wrd[2] == '/' ) {
				hasUrl = true;
			}
		}

		// skip if not wid
		if ( ! wids[i] ) {
			continue;
		}

		// just make every word 100 pts
		int32_t t = 100;

		// penalize it if in one of these sections
		if ( bb[i] & ( D_IN_PARENS | D_IN_SUP | D_IN_LIST ) ) {
			t /= 2;
		}

		// boost it if in bold or italics
		if ( bb[i] & D_IN_BOLDORITALICS ) {
			t *= 2;
		}

		// add the score for this word
		score += t;

		// print the score, "t"
		if ( g_conf.m_logDebugSummary ) {
			xp.safePrintf("(%" PRId32")",t);
		}

		// count the alpha words we got
		wordCount++;

		// if no matches left, skip
		if ( mi >= matches->m_numMatches ) {
			continue;
		}

		// get the match
		Match *next = &ms[mi];

		// skip if not a match
		if ( i != next->m_wordNum ) {
			continue;
		}

		// must be a match in this class
		if ( next->m_words != words ) {
			continue;
		}

		// advance it
		mi++;

		// which query word # does it match
		int32_t qwn = next->m_qwordNum;

		if ( qwn < 0 || qwn >= m_q->m_numWords ){g_process.shutdownAbort(true);}

		// undo old score
		score -= t;

		// add 100000 per match
		t = 100000;

		// scale by the per-query-word weight (original note: based
		// on tf, goes from 0.1 to 1.0)
		t = (int32_t)((float)t * m_wordWeights [ qwn ]);

		// a query stop word contributes nothing (this used to be 10000)
		if ( m_q->m_qwords[qwn].m_isQueryStopWord ) {
			t = 0;//10000;
		}

		// penalize it if in one of these sections
		if ( bb[i] & ( D_IN_PARENS | D_IN_SUP | D_IN_LIST ) ) {
			t /= 2;
		}

		if ( gotIt[qwn] > 0 ) {
			// have we matched it in this [a,b) already?
			if ( gotIt[qwn] == 1 ) {
				t /= 15;
			} else {
				// if we have more than 2 matches in the same window,
				// it may not give a good summary. give a heavy penalty
				t -= 200000;
			}
		} else if ( retired [qwn] > 0 ) {
			// have we matched it already in a winning window?
			t /= 12;
		}

		// add it back
		score += t;

		if ( g_conf.m_logDebugSummary ) {
			xp.safePrintf ("[%" PRId32"]{qwn=%" PRId32",ww=%f}",t,qwn,
				       m_wordWeights[qwn]);
		}

		// inc the query word count for this window
		if ( gotIt[qwn] < 100 ) {
			gotIt[qwn]++;
		}
	}

	// only used by the debug log below, which prints 32-bit values
	int32_t oldScore = (int32_t)score;
	
	// apply the bonus if it starts a sentence or a fragment
	// only apply if the score is positive and if the wordcount is decent
	if ( score > 0 && wordCount > 7 ){
		// a match can give us 10k to 100k pts based on the weights
		// so we don't want to overwhelm that too much, so let's make
		// this an 8k bonus if it starts a sentence
		if ( bb[a] & D_STARTS_SENTENCE ) {
			score += 8000;
		} else if ( bb[a] & D_STARTS_FRAG ) {
			// likewise, 4k for a fragment, like after a comma
			score += 4000;
		}

		// 1k if the match word is within 7 word slots of the start
		// of the window (the original note says roughly 3 alphawords)
		if ( matchWordNum - a < 7 ) {
			score += 1000;
		}
	}

	// a summary isn't really a summary if its less than 7 words.
	// reduce the score, but still give it a decent score.
	// minus 20k.
	if ( wordCount < 7 ) {
		score -= 20000;
	}

	// summaries that cross a lot of tags are usually bad, penalize them
	if ( numTagsCrossed > 1 ) {
		score -= (numTagsCrossed * 20000);
	}

	if ( hasUrl ) {
		score -= 8000;
	}

	// show it
	if ( g_conf.m_logDebugSummary ) {
		log(LOG_DEBUG, "sum: score=%08" PRId32" prescore=%08" PRId32" a=%05" PRId32" b=%05" PRId32" %s",
		     (int32_t)score,oldScore,(int32_t)a,(int32_t)b,
		     xp.getBufStart());
	}

	// set lasta, besta, bestb
	*lasta = a;
	*besta = a;
	*bestb = b;

	return score;
}

Exemple #16

0

Afficher le fichier

Fichier : Pos.cpp Projet : harkhuang/open-source-search-engine

// . set the filtered position of each word
// . used by Summary.cpp to determine how many chars are in the summary,
//   be those chars single byte or utf8 chars that are 4 bytes
// . two modes, selected by "f":
//   - f == NULL: fill m_pos[i] for every word i in [a,b) with the CHARACTER
//     offset at which the word starts, and m_pos[nw] with the final offset.
//     the array lives in m_localBuf, in the caller's "buf" (if it is
//     bigger than what we need) or in freshly allocated memory.
//   - f != NULL: write the filtered text of words [a,b) into [f,fend) and
//     store its length in bytes in *len. m_pos is left alone.
// . filtering collapses back-to-back whitespace into one space, turns '|'
//   into ',', writes '*' for an <li> tag and ". " for any other breaking tag
// . b == -1 means "up to the last word"
// . "sections" is not used
// . returns false if the position array can not be allocated, or if the
//   output had to be truncated before any whitespace break was written
//   (*len is set to 0 in that case). a truncated output is otherwise cut
//   back to the last whitespace break.
bool Pos::set ( Words  *words  ,
                Sections *sections ,
                char   *f   ,
                char   *fend,
                long   *len ,
                long    a   ,
                long    b   ,
                char   *buf ,
                long    bufSize ) {

    // free m_buf in case this is a second call
    if ( ! f ) reset();

    long        nw    = words->getNumWords();
    long       *wlens = words->m_wordLens;
    nodeid_t   *tids  = words->getTagIds(); // m_tagIds;
    char      **wp    = words->m_words;

    // save start point for filtering
    char *fstart = f;

    // -1 is the default value
    if ( b == -1 ) b = nw;

    // bytes needed for nw+1 positions. m_pos is an array of "long", so
    // size it with sizeof(long); a hard-coded 4 is too small wherever
    // long is 8 bytes and m_pos[nw] would land outside the buffer.
    long need = (nw+1) * (long)sizeof(long);

    // do not destroy m_pos/m_numWords if only filtering into a buffer
    if ( f ) goto skip;

    m_needsFree = false;

    m_buf = m_localBuf;
    if ( need > POS_LOCALBUFSIZE && need < bufSize )
        m_buf = buf;
    else if ( need > POS_LOCALBUFSIZE ) {
        m_buf = (char *)mmalloc(need,"Pos");
        m_needsFree = true;
    }
    // bail on error
    if ( ! m_buf ) return false;
    m_bufSize = need;
    m_pos      = (long *)m_buf;
    m_numWords = nw;

skip:
    // this is the CHARACTER count.
    long pos = 0;
    // set once the output buffer can not hold the next piece
    bool trunc = false;
    char *p , *pend;

    // where the last whitespace was written into the output buffer
    char* lastBreak = NULL;

    // flag for stopping back-to-back spaces. only count those as one char.
    bool lastSpace = false;
    long maxCharSize = 4; // we are utf8
    for ( long i = a ; i < b ; i++ ) {
        if (trunc) break;
        // set pos for the ith word to "pos"
        if ( ! f ) m_pos[i] = pos;

        // is tag?
        if ( tids && tids[i] ) {
            // if not breaking, does nothing
            if ( ! g_nodes[tids[i]&0x7f].m_isBreaking ) continue;
            // list tag? <li>
            if ( tids[i] == TAG_LI ) {
                if ( f ) {
                    if ((fend - f > maxCharSize)) {
                        *f++ = '*';
                    }
                    else {
                        trunc = true;
                    }
                }
                pos++;
                lastSpace = false;
                continue;
            }
            // if had a previous breaking tag and no non-tag
            // word after it, do not count back-to-back spaces
            if ( lastSpace ) continue;
            // every other breaking tag is written as ". ", but not at
            // the very start of the output. (this used to apply to <br>
            // only, with a single space for the rest.)
            if ( f && f != fstart ) {
                if ((fend-f>2*maxCharSize)) {
                    *f++ = '.';
                    *f++ = ' ';
                }
                else trunc = true;
            }
            // counts as two chars: the period and the space
            pos += 2;
            lastSpace = true;
            continue;
        }

        // scan through all chars discounting back-to-back spaces

        // assume filters out to the same # of chars
        p    = wp[i] ;
        pend = p + wlens[i];
        unsigned char cs = 0;
        for ( ; p < pend ; p += cs ) {
            // get size
            cs = getUtf8CharSize(p);
            // do not count space if one before
            if ( is_wspace_utf8 (p) ) {
                if ( lastSpace ) continue;
                lastSpace = true;
                // are we filtering?
                if ( f ) {
                    if (fend-f > 1 ) {
                        // remember it so a truncated output can be
                        // cut back to here
                        lastBreak = f;
                        *f++ = ' ';
                    }
                    else trunc = true;
                }
                pos++;
                continue;
            }
            if ( f ) {
                if (fend-f > cs) {
                    // change '|' to commas
                    if ( *p == '|' )
                        *f++ = ',';
                    else if ( cs == 1 )
                        *f++ = *p;
                    else {
                        memcpy(f,p,cs);
                        f += cs;
                    }
                }
                else trunc = true;
            }

            pos++;
            lastSpace = false;
        }
    }
    // trunc is only ever set while filtering, so f and len are set here
    if (trunc) {
        if(lastBreak == NULL) {
            *len = 0;
            return false;
        }
        else if(f) f = lastBreak;
    }
    // set pos for the END of the last word here (used in Summary.cpp)
    if ( ! f ) m_pos[nw] = pos;
    // otherwise report how many bytes were written
    else {
        *len = f - fstart;
    }
    // NULL terminate f if there is room. nothing to terminate when we
    // were only computing positions.
    if ( f && fend-f > maxCharSize) {
        *f = '\0';
    }
    // Success
    return true;
}

Exemple #17

0

Afficher le fichier

Fichier : Log.cpp Projet : FlavioFalcao/open-source-search-engine

// . format one log message and write it to the log file descriptor, or to
//   stderr when m_fd is negative
// . "now" is the timestamp in milliseconds. 0 means "look it up here".
// . "type" is the message type, handed to shouldLog() and recorded
// . "msg" is the \0 terminated message. a leading alnum label and the
//   ": " after it are stripped from the written line.
// . "asterisk" is not used
// . "forced" bypasses the shouldLog() filter
// . returns true if the message was filtered out, whatever logLater()
//   returns when called from a signal handler, and false in every other
//   case, including after the line was written
bool Log::logR ( long long now , long type , char *msg , bool asterisk ,
		 bool forced ) {

	// return true if we should not log this
	if ( ! forced && ! shouldLog ( type , msg ) ) return true;
	// can we log if we're a sig handler? don't take chances
	if ( g_inSigHandler ) 
		return logLater ( now , type , msg , NULL );
	// get "msg"'s length
	long msgLen = gbstrlen ( msg );

#ifdef PTHREADS
	// lock for threads
	pthread_mutex_lock ( &s_lock );
#endif

	// do a timestamp, too. use the time synced with host #0 because
	// it is easier to debug because all log timestamps are in sync.
	if ( now == 0 ) now = gettimeofdayInMillisecondsGlobalNoCore();

	// . skip all logging if power out, we do not want to screw things up
	// . allow logging for 10 seconds after power out though
	if ( ! g_process.m_powerIsOn && now - g_process.m_powerOffTime >10000){
#ifdef PTHREADS
		pthread_mutex_unlock ( &s_lock );
#endif
		return false;
	}

	// chop off any spaces at the end of the msg. test the length first
	// so an empty msg never has msg[-1] read.
	while ( msgLen > 0 && is_wspace_a ( msg [ msgLen - 1 ] ) ) msgLen--;
	// get this pid
	pid_t pid = getpidtid();
	// a tmp buffer. start it out empty so the gbstrlen() calls below
	// are safe even when no prefix gets printed into it.
	char tt [ MAX_LINE_LEN ];
	tt[0] = '\0';
	char *p    = tt;
	char *pend = tt + MAX_LINE_LEN;

	// print timestamp and hostid, padding the hostid to the width
	// the cluster size calls for
	if ( m_logTimestamps ) {
		if ( g_hostdb.m_numHosts <= 999 ) 
			sprintf ( p , "%llu %03li ",
				  now , g_hostdb.m_hostId );
		else if ( g_hostdb.m_numHosts <= 9999 ) 
			sprintf ( p , "%llu %04li ",
				  now , g_hostdb.m_hostId );
		else if ( g_hostdb.m_numHosts <= 99999 ) 
			sprintf ( p , "%llu %05li ",
				  now , g_hostdb.m_hostId );
		p += gbstrlen ( p );
	}

	// msg resource
	char *x = msg;
	// ignore the leading alnum label for now...
	while ( p < pend && *x && is_alnum_a(*x) ) x++;
	// thread id if in "thread"
	if ( pid != s_pid && s_pid != -1 ) {
		sprintf ( p , "[%lu] " , (unsigned long)pid );
		p += gbstrlen ( p );
	}
	// then message itself. leave one byte for the \0.
	long avail = (MAX_LINE_LEN) - (p - tt) - 1;
	if ( avail < 0 ) avail = 0;
	if ( msgLen > avail ) msgLen = avail;
	if ( *x == ':' ) x++;
	if ( *x == ' ' ) x++;
	strncpy ( p , x , avail );
	// strncpy() does not write a \0 when x has avail or more bytes, so
	// terminate it ourselves. p+avail is the last byte of tt.
	p[avail] = '\0';
	// capitalize for consistency. no, makes grepping log msgs harder.
	//if ( is_alpha_a(*p) ) *p = to_upper_a(*p);
	p += gbstrlen(p);
	// back up over spaces, but never before the start of the buffer
	while ( p > tt && p[-1] == ' ' ) p--;
	*p ='\0';
	// the total length, not including the \0
	long tlen = p - tt;

	// make sure we have room in m_buf and in the arrays before recording
	// this message. dump what we have if not.
	// NOTE: TODO: this is shaky -- fix it!
	if ( m_bufPtr + tlen  >= 1024 * 32 ||  m_numErrors  >= MAX_LOG_MSGS){
		// this sets m_bufPtr to 0
		if ( ! dumpLog ( ) ) {
			fprintf(stderr,"Log::log: could not dump to file!\n");
#ifdef PTHREADS
			pthread_mutex_unlock ( &s_lock );
#endif
			return false;
		}
	}
	// . filter out nasty chars from the message
	// . replace each of their bytes with a '.'
	char cs;
	char *ttp    = tt;
	char *ttpend = tt + tlen;
	for ( ; ttp < ttpend ; ttp += cs ) {
		cs = getUtf8CharSize ( ttp );
		if ( is_binary_utf8 ( ttp ) ) {
			// a char cut off at the end of the line can claim
			// more bytes than are left, so stop at ttpend
			for ( long k = 0 ; k < cs && ttp < ttpend ; k++ )
				*ttp++ = '.';
			// careful not to skip the already skipped bytes
			cs = 0;
			continue;
		}
		// convert \n's, \r's and \t's to spaces
		if ( *ttp == '\n' ) *ttp = ' ';
		if ( *ttp == '\r' ) *ttp = ' ';
		if ( *ttp == '\t' ) *ttp = ' ';
	}

	if ( m_fd >= 0 ) {
		write ( m_fd , tt , tlen );
		write ( m_fd , "\n", 1 );
	}
	else {
		// print it out for now
		fprintf ( stderr, "%s\n", tt );
	}

	// set the stuff in the array
	m_errorMsg      [m_numErrors] = msg;
	m_errorMsgLen   [m_numErrors] = msgLen;
	m_errorTime     [m_numErrors] = now;
	m_errorType     [m_numErrors] = type;
	// increase the # of errors
	m_numErrors++;

#ifdef PTHREADS
	// unlock for threads
	pthread_mutex_unlock ( &s_lock );
#endif
	return false;
}

Exemple #18

0

Afficher le fichier

Fichier : Json.cpp Projet : DeadNumbers/open-source-search-engine

// . parse the \0 terminated string "json" into a sequence of JsonItems
//   stored back to back in m_sb, each followed by its \0 terminated,
//   decoded value (field names are stored \0 terminated as well)
// . "niceness" is just passed through to safeDecodeJSONToUtf8()
// . m_sb is sized up front and must not reallocate while parsing because
//   the name pointers point into it. we deliberately crash if it did.
// . returns the start of m_sb's buffer cast to a JsonItem ptr
// . returns NULL if "json" is NULL, if memory can not be reserved, if the
//   nesting is deeper than MAXJSONPARENTS, or, with g_errno set to
//   EBADJSONPARSER, if a string shows up where it can not be handled
JsonItem *Json::parseJsonStringIntoJsonItems ( char *json , int32_t niceness ) {

	m_prev = NULL;

	m_stackPtr = 0;
	m_sb.purge();

	JsonItem *ji = NULL;

	if ( ! json ) return NULL;

	// how much space will we need to avoid any reallocs?
	char *p = json;
	bool inQuote = false;
	int32_t need = 0;
	for ( ; *p ; p++ ) {
		// ignore any escaped char. also \x1234
		if ( *p == '\\' ) {
			if ( p[1] ) p++;
			continue;
		}
		if ( *p == '\"' )
			inQuote = ! inQuote;
		if ( inQuote ) 
			continue;
		if ( *p == '{' ||
		     *p == ',' ||
		     *p == '[' ||
		     *p == ':' )
			// +1 for null terminating string of each item
			need += sizeof(JsonItem) +1;
	}
	// plus the length of the string to store it decoded etc.
	need += p - json;
	// plus a \0 for the value and a \0 for the name of each jsonitem
	need += 2;
	// prevent cores for now
	need += 10;
	// . to prevent safebuf from reallocating do this
	// . safeMemcpy() calls reserve(m_length+len) and reserves
	//   tries to alloc m_length + (m_length+len) so since,
	//   m_length+len should never be more than "need" we need to
	//   double up here
	need *= 2;
	// this should be enough
	if ( ! m_sb.reserve ( need ) ) return NULL;
	// for testing if we realloc
	char *mem = m_sb.getBufStart();

	int32_t  size;

	// the "name cursor": the field name that the next value belongs to
	char *NAME = NULL;
	int32_t  NAMELEN = 0;

	// reset p
	p = json;
	// json maybe bad utf8 causing us to miss the \0 char, so use "pend"
	char *pend = json + gbstrlen(json);

	// scan
	for ( ; p < pend ; p += size ) {
		// get size
		size = getUtf8CharSize ( p );

		// skip spaces
		if ( is_wspace_a (*p) )
			continue;

		// skip commas
		if ( *p == ',' ) continue;

		// did we hit a '{'? that means the existing json item
		// is a parent of the item(s) inside the {}'s
		if ( *p == '{' ) {
			// . this indicates the start of a json object
			// . it takes the current name, like "stats" in
			//   \"stats\":{\"fetchTime\":2069,....}
			ji = addNewItem();
			if ( ! ji ) return NULL;
			// current ji is an object type then
			ji->m_type = JT_OBJECT;
			// set the name
			ji->m_name    = NAME;
			ji->m_nameLen = NAMELEN;
			// this goes on the stack
			if ( m_stackPtr >= MAXJSONPARENTS ) return NULL;
			m_stack[m_stackPtr++] = ji;
			// and null this
			ji = NULL;
			continue;
		}
		// pop the stack?
		if ( *p == '}' ) {
			// just pop it and restore name cursor
			if ( m_stackPtr > 0 ) {
				JsonItem *px = m_stack[m_stackPtr-1];
				NAME    = px->m_name;
				NAMELEN = px->m_nameLen;
				m_stackPtr--;
			}
			continue;
		}
		// array of things?
		if ( *p == '[' ) {
			// make a newitem to put on stack
			ji = addNewItem();
			if ( ! ji ) return NULL;
			// current ji is an array type then
			ji->m_type = JT_ARRAY;
			// start of array hack. HACK! remember where the '['
			// is so the matching ']' can compute the length.
			ji->m_valueArray = p;
			// set the name
			ji->m_name    = NAME;
			ji->m_nameLen = NAMELEN;
			// init to a bogus value. should be set below.
			// at least this should avoid a core in XmlDoc.cpp
			// getTokenizedDiffbotReply()
			ji->m_valueLen = 0;
			// this goes on the stack
			if ( m_stackPtr >= MAXJSONPARENTS ) return NULL;
			m_stack[m_stackPtr++] = ji;
			ji = NULL;
			continue;
		}
		// pop the stack?
		if ( *p == ']' ) {
			// just pop it and restore name cursor
			if ( m_stackPtr > 0 ) {
				JsonItem *px = m_stack[m_stackPtr-1];
				NAME    = px->m_name;
				NAMELEN = px->m_nameLen;
				// start of array hack. HACK!
				char *start = (char *)px->m_valueArray;//Long;
				// include ending ']' in length of array
				px->m_valueLen = p - start + 1;
				m_stackPtr--;
			}
			continue;
		}

		// a quote?
		if ( *p == '\"' ) {
			// find end of quote
			char *end = p + 1;
			for ( ; *end ; end++ ) {
				// skip two chars if escaped
				if ( *end == '\\' && end[1] ) {
					end++; 
					continue;
				}
				// this quote is unescaped then
				if ( *end == '\"' ) break;
			}
			// step over the closing quote. if the string was
			// never closed then "end" sits on the terminating \0
			// and we must not walk past it.
			char *x = end;
			if ( *x ) x++;
			// skip spaces
			for ( ; *x && is_wspace_a(*x) ; x++ );
			// define the string
			char *str  = p + 1;
			int32_t  slen = end - str;
			// . if a colon follows, it was a field
			if ( *x == ':' ) {

				// we can't be the first thing in the safebuf
				// json must start with { or [ i guess
				// otherwise getFirstItem() won't work!
				if ( m_sb.m_length==0 ) {
					g_errno = EBADJSONPARSER;
					return NULL;
				}

				// let's push this now so we can \0 term
				char *savedStr = m_sb.getBuf();
				m_sb.safeMemcpy ( str , slen );
				m_sb.pushChar('\0');
				// just set the name cursor
				NAME    = savedStr;//str;
				NAMELEN = slen;
			}
			// . otherwise, it was field value, so index it
			// . TODO: later make field names compounded to
			//   better represent nesting?
			// . added 'else if (NAME){' fix for json=\"too small\"
			else if ( NAME ) {
				// make a new one in safebuf
				ji = addNewItem();
				if ( ! ji ) return NULL;
				// we are a string
				ji->m_type = JT_STRING;
				// use name cursor
				ji->m_name    = NAME;
				ji->m_nameLen = NAMELEN;
				// get length decoded
				int32_t curr = m_sb.length();
				// store decoded string right after jsonitem
				if ( !m_sb.safeDecodeJSONToUtf8 (str,slen,
								 niceness ))
					return NULL;
				// store length decoded json
				ji->m_valueLen = m_sb.length() - curr;
				// end with a \0
				m_sb.pushChar('\0');
				// ok, this one is done
				ji = NULL;
			}
			else {
				log("json: fieldless name in json");
				g_errno = EBADJSONPARSER;
				return NULL;
			}
			// skip over the string
			size = 0;
			p    = x;
			continue;
		}

		// true or false?
		if ( (*p == 't' && strncmp(p,"true",4)==0) ||
		     (*p == 'f' && strncmp(p,"false",5)==0) ) {
			// make a new one
			ji = addNewItem();
			if ( ! ji ) return NULL;
			// copy the literal as a string as well
			int32_t curr = m_sb.length();
			// what is the length of it?
			int32_t slen = 4;
			ji->m_valueLong = 1;
			ji->m_valueDouble = 1.0;
			if ( *p == 'f' ) {
				slen = 5;
				ji->m_valueLong = 0;
				ji->m_valueDouble = 0;
			}
			// store decoded string right after jsonitem
			if ( !m_sb.safeDecodeJSONToUtf8 (p,slen,niceness))
				return NULL;
			// store length decoded json
			ji->m_valueLen = m_sb.length() - curr;
			// end with a \0
			m_sb.pushChar('\0');
			// booleans are stored as numbers, 1 or 0
			ji->m_type = JT_NUMBER;
			// use name cursor
			ji->m_name    = NAME;
			ji->m_nameLen = NAMELEN;
			ji = NULL;
			// only step over the first letter. the remaining
			// letters match nothing and get skipped one by one.
			size = 1;
			continue;
		}
			


		// if we hit a digit they might not be in quotes like
		// "crawled":123
		if ( is_digit ( *p ) ||
		     // like .123 ?
		     ( *p == '.' && is_digit(p[1]) ) ) {
			// find end of the number
			char *end = p + 1;
			// . allow '.' for decimal numbers
			// . TODO: allow E for exponent
			for ( ; *end && (is_digit(*end) || *end=='.');end++) ;
			// define the string
			char *str  = p;
			// make a new one
			ji = addNewItem();
			if ( ! ji ) return NULL;
			// back up over negative sign?
			if ( str > json && str[-1] == '-' ) str--;
			// measure AFTER backing up over the sign, otherwise
			// the stored string loses its last digit
			int32_t  slen = end - str;
			// decode
			ji->m_valueLong = atol(str);
			ji->m_valueDouble = atof(str);
			// copy the number as a string as well
			int32_t curr = m_sb.length();
			// store decoded string right after jsonitem
			if ( !m_sb.safeDecodeJSONToUtf8 ( str, slen,niceness))
				return NULL;
			// store length decoded json
			ji->m_valueLen = m_sb.length() - curr;
			// end with a \0
			m_sb.pushChar('\0');
			ji->m_type = JT_NUMBER;
			// use name cursor
			ji->m_name    = NAME;
			ji->m_nameLen = NAMELEN;
			ji = NULL;
			// skip over the string
			size = 0;
			p    = end;
			continue;
		}
	}

	// for testing if we realloc. the name pointers handed out above
	// point into the old buffer, so crash on purpose if it moved.
	char *memEnd = m_sb.getBufStart();
	if ( mem != memEnd ) { char *xx=NULL;*xx=0; }

	return (JsonItem *)m_sb.getBufStart();
}