C++ (Cpp) ucProperties示例

示例#1

0

显示文件

文件： create_ucd_tables.cpp 项目： DeadNumbers/open-source-search-engine

void handleDerivedNormalizationProps(u_int32_t line, char **col, 
				     u_int32_t colCount) {
	//printf("Line %"INT32": ", line);
	//for (u_int32_t i=0;i<colCount;i++) 
	//	printf("'%s' ", col[i]);
	//printf("\n");
	char *range = NULL;
	UChar32 codePointStart = strtol(col[0], &range, 16);
	UChar32 codePointEnd = codePointStart;
	if (range && range[0] == '.' && range[1] == '.')
		codePointEnd = strtol(range+2, NULL, 16);
	for (UChar32 c = codePointStart ; c <= codePointEnd ; c++) {
		//printf("U+%04x ", c);
		// get current props, if any
		UCProps props = ucProperties(c);

		if (!strncmp(col[1], "NFKC_QC", 7))
			props |= UC_NFKC_QC_NO;
		else if (!strncmp(col[1], "Full_Composition_Exclusion", 26)){
			g_excludeCount++;
			props |= UC_COMP_EX;
			//printf("Excluding %4x props: %04x\n", c, props);
		}
		
		if (props) g_ucProps.setValue(c, &props);
	}
	//printf("\n");
	
}

示例#2

0

显示文件

文件： create_ucd_tables.cpp 项目： DeadNumbers/open-source-search-engine

void handleScripts(u_int32_t, char **col, u_int32_t colCount){
	char *range = NULL;
	UChar32 codePointStart = strtol(col[0], &range, 16);
	UChar32 codePointEnd = codePointStart;
	if (range && range[0] == '.' && range[1] == '.')
		codePointEnd = strtol(range+2, NULL, 16);
	for (UChar32 c = codePointStart ; c <= codePointEnd ; c++) {
		UCProps props = ucProperties(c);
		//void *p = g_ucProps.getValue(c);
		//if (p) props = *(u_char*)p;
		UCScript s = ucScriptCommon;
		for (int j=0; j < ucScriptNumScripts; j++) {
			if (!strcmp(col[1], g_ucScriptNames[j])){
				s = j;
				g_ucScripts.setValue(c, &j);
			}
		}
		if (s == ucScriptThai) props |= UC_THAI;
		else if (s == ucScriptHiragana) props |= UC_HIRAGANA;
		else if (s == ucScriptKatakana) props |= UC_KATAKANA;
		else if (s == ucScriptKatakana_Or_Hiragana) 
			props |= UC_KATAKANA|UC_HIRAGANA;
		if (props)
			g_ucProps.setValue(c, &props);
	}

}

示例#3

0

显示文件

文件： create_ucd_tables.cpp 项目： DeadNumbers/open-source-search-engine

void handleDerivedCoreProps(u_int32_t line, char **col, u_int32_t colCount) {
	//printf("Line %"INT32": ", line);
	//for (u_int32_t i=0;i<colCount;i++) 
	//	printf("'%s' ", col[i]);
	//printf("\n");
	char *range = NULL;
	UChar32 codePointStart = strtol(col[0], &range, 16);
	UChar32 codePointEnd = codePointStart;
	if (range && range[0] == '.' && range[1] == '.')
		codePointEnd = strtol(range+2, NULL, 16);
	for (UChar32 c = codePointStart ; c <= codePointEnd ; c++) {
		//printf("U+%04x ", c);
		// get current props, if any
		UCProps props = ucProperties(c);
		if (!strncmp(col[1], "Alphabetic", 10))
			props |= UC_ALPHA | UC_WORDCHAR;
		else if (!strncmp(col[1], "Default_Ignorable_Code_Point", 28))
			props |= UC_IGNORABLE;
		else if (!strncmp(col[1], "Lowercase", 9))
			props |= UC_LOWER | UC_WORDCHAR;
		else if (!strncmp(col[1], "Uppercase", 9))
			props |= UC_UPPER | UC_WORDCHAR;
		else if (!strncmp(col[1], "Grapheme_Extend", 15))
			props |= UC_WORDCHAR;
		if (props)
			g_ucProps.setValue(c, &props);
// 		if (c == ' ' && (props&UC_WORDCHAR)) 
// 			printf("Yow: line %"INT32"\n", line);
// 		if (c == 0 && props)
// 			printf("!!!\nHey: line %"INT32"!!!\n\n", line);
	}
	//printf("\n");
	
}

示例#4

0

显示文件

文件： create_ucd_tables.cpp 项目： DeadNumbers/open-source-search-engine

void handlePropList(u_int32_t line, char **col, u_int32_t colCount) {
	//printf("Line %"INT32": ", line);
	//for (u_int32_t i=0;i<colCount;i++) 
	//	printf("'%s' ", col[i]);
	//printf("\n");
	char *range = NULL;
	UChar32 codePointStart = strtol(col[0], &range, 16);
	UChar32 codePointEnd = codePointStart;
	if (range && range[0] == '.' && range[1] == '.')
		codePointEnd = strtol(range+2, NULL, 16);
	for (UChar32 c = codePointStart ; c <= codePointEnd ; c++) {
		//printf("U+%04x ", c);
		// get current props, if any
		UCProps props = ucProperties(c);
		//void *p = g_ucProps.getValue(c);
		//if (p) props = *(u_char*)p;
		if (!strncmp(col[1], "Ideographic", 11))
			props |= UC_IDEOGRAPH | UC_WORDCHAR;
		else if (!strncmp(col[1], "Unified_Ideograph", 17))
			props |= UC_IDEOGRAPH | UC_WORDCHAR;
		else if (!strncmp(col[1], "White_Space", 11))
			props |= UC_WHITESPACE;

		if (props)
			g_ucProps.setValue(c, &props);
	}
	//printf("\n");
	
}

示例#5

0

显示文件

文件： create_ucd_tables.cpp 项目： DeadNumbers/open-source-search-engine

void handleUnicodeData(u_int32_t line, char **col, u_int32_t colCount) {

	UChar32 codePoint = strtol(col[0], NULL, 16);

// 	if ((colCount < 14) || (codePoint == 0)){
// 		printf("line %"INT32": no data (%"INT32" cols)\n", line, colCount);
// 		return;
// 	}
	char *name = col[1];
	char *category = col[2];
	u_char combiningClass = strtol(col[3], NULL, 10);
	char *decompStr = col[5];
	UChar32 ucMapping = strtol(col[12],NULL, 16);
	UChar32 lcMapping = strtol(col[13],NULL, 16);
	
	// Set general category
	//g_ucCategory.setValue(codePoint, (void*)category);
	UCProps props = ucProperties(codePoint);
	if (category[0] == 'L') props |= UC_ALPHA | UC_WORDCHAR;
	else if (category[0] == 'N') props |= UC_DIGIT | UC_WORDCHAR;
	else if (category[0] == 'Z') props |= UC_WHITESPACE;
	if (props)
		g_ucProps.setValue(codePoint, &props);
	
	if (lcMapping) 
		g_ucLowerMap.setValue(codePoint, (void*)&lcMapping);
	if (ucMapping) 
		g_ucUpperMap.setValue(codePoint, (void*)&ucMapping);
	if (combiningClass)
		g_ucCombiningClass.setValue(codePoint, (void*)&combiningClass);

	if (decompStr && decompStr[0]){
		
		u_char decompCount = 0;
		UChar32 decomp[32];
		bool kompat = false;
		// Get decomposition
		char *p = decompStr;
		int decompLen = gbstrlen(decompStr);
		while (p < decompStr+decompLen) {
			char *pend = p;
			while (*pend && *pend != ' ') pend++;
			*pend = '\0';
			if (p[0] == '<') kompat = true;
			else{
				decomp[decompCount++] = strtol(p, NULL, 16);
			}
			p = pend+1;
		}

//  		printf ("Code Point U+%04"XINT32", %s: %s (%d chars)\n", 
//  			codePoint, name, kompat?"(Kompatable)":"", decompCount);
// 		g_decompCount++;
// 		if (decompStr[0] != '<')
		bool fullComp=false;
		if (!kompat && !(props & UC_COMP_EX)) {
			// set up canonical combining table
			g_canonicalDecompCount++;
// 			printf("%4x:", codePoint);
// 			for (int i = 0; i<decompCount;i++)
// 				printf(" %4x", decomp[i]);
// 			printf("\n");
			fullComp = true;
		}
		setKDValue(codePoint, decomp, decompCount, fullComp);
	    	// JAB: we now have Kompatible and Canonical decompositions
		if (!kompat)
			setCDValue(codePoint, decomp, decompCount);
	}
}

示例#6

0

显示文件

文件： Words.cpp 项目： BillWangCS/open-source-search-engine

// Splits the text at s into words and appends them to the m_words /
// m_wordLens / m_wordIds (and m_tagIds) arrays, advancing m_numWords.
// . three kinds of words are produced: tag words (only if m_hasTags),
//   punctuation words (runs of non-word chars) and alnum words
// . s: text to scan; scanning stops at a NUL byte or when the byte index
//   reaches nodeLen at the top of the main loop
// . computeWordIds: if true, each alnum word is hashed with
//   hash64Lower_utf8 into m_wordIds
// . niceness: handed to QUICKPOLL inside the scanning loops
// . no more than m_preCount words are stored
// . always returns true
bool Words::addWords(char *s,long nodeLen,bool computeWordIds, long niceness) {
	// current byte offset into s
	long  i = 0;
	// byte offset where the current alnum word started
	long  j;
	//long  k = 0;
	long  wlen;
	//unsigned long e;
	//long  skip;
	// NOTE(review): only the commented-out code near the end ever
	// increments this, so the log call at the bottom cannot fire as is
	long badCount = 0;

	// allows a single apostrophe inside an alnum word
	bool hadApostrophe = false;

	// script of the most recent word char; used to split an alnum word
	// when the script changes
	UCScript oldScript = ucScriptCommon;
	UCScript saved;
	UCProps props;

 uptop:

	// bad utf8 can cause a breach
	if ( i >= nodeLen ) goto done;

	if ( ! s[i] ) goto done;

	if ( ! is_alnum_utf8(s+i) ) { // && m_numWords < m_preCount ) {

		if ( m_numWords >= m_preCount ) goto done;

		// tag?
		if ( s[i]=='<' && m_hasTags && isTagStart(s+i) ) {
			// get the tag id
			// NOTE(review): m_tagIds is written here without the
			// NULL check used for punct/alnum words below;
			// presumably m_hasTags implies it is allocated --
			// confirm
			if ( s[i+1]=='/' ) {
				// skip over /
				m_tagIds [m_numWords] = ::getTagId(s+i+2);
				m_tagIds [m_numWords] |= BACKBIT;
			}
			else
				m_tagIds [m_numWords] = ::getTagId(s+i+1);
			// word start
			m_words    [m_numWords] = s + i;
			m_wordIds  [m_numWords] = 0LL;
			// skip till end
			long tagLen = getTagLen(s+i); // ,niceness);
			m_wordLens [m_numWords] = tagLen;
			m_numWords++;
			// advance
			i += tagLen;
			goto uptop;
		}

		// it is a punct word, find end of it
		char *start = s+i;
		//for (;s[i] && ! is_alnum_utf8(s+i);i+=getUtf8CharSize(s+i));
		for ( ; s[i] ; i += getUtf8CharSize(s+i)){
			// stop on < if we got tags
			if ( s[i] == '<' && m_hasTags ) break;
			// breathe
			QUICKPOLL(niceness);
			// if we are simple ascii, skip quickly
			if ( is_ascii(s[i]) ) {
				// accumulate NON-alnum chars
				if ( ! is_alnum_a(s[i]) ) continue;
				// update
				oldScript = ucScriptCommon;
				// otherwise, stop we got alnum
				break;
			}
			// if we are utf8 we stop on special props
			UChar32 c = utf8Decode ( s+i );
			// stop if word char
			if ( ! ucIsWordChar ( c ) ) continue;
			// update first though
			// NOTE(review): unlike the alnum loop below, Latin is
			// not folded into ucScriptCommon here -- confirm that
			// is intended
			oldScript = ucGetScript ( c );
			// then stop
			break;
		}
		// record the punct word; it has no word id and no tag id
		m_words        [ m_numWords  ] = start;
		m_wordLens     [ m_numWords  ] = s+i - start;
		m_wordIds      [ m_numWords  ] = 0LL;
		if (m_tagIds) m_tagIds[m_numWords] = 0;
		m_numWords++;
		goto uptop;
	}

	// get an alnum word
	j = i;
 again:
	//for ( ; is_alnum_utf8 (&s[i] ) ; i += getUtf8CharSize(s+i) );
	for ( ; s[i] ; i += getUtf8CharSize(s+i) ) {
		// breathe
		QUICKPOLL(niceness);
		// simple ascii?
		if ( is_ascii(s[i]) ) {
			// accumulate alnum chars
			if ( is_alnum_a(s[i]) ) continue;
			// update
			oldScript = ucScriptCommon;
			// otherwise, stop we got punct
			break;
		}
		// get the code point of the utf8 char
		UChar32 c = utf8Decode ( s+i );
		// get props
		props = ucProperties ( c );
		// ignorable/extending chars stay inside the word
		if ( props & (UC_IGNORABLE|UC_EXTEND) ) continue;
		// stop? if UC_WORDCHAR is set, that means it is an alnum
		if ( ! ( props & UC_WORDCHAR ) ) {
			// reset script between words
			oldScript = ucScriptCommon;
			break;
		}
		// save it
		saved = oldScript;
		// update here
		oldScript = ucGetScript(c);
		// treat ucScriptLatin (30) as common so we can have latin1
		// like char without breaking the word!
		if ( oldScript == ucScriptLatin ) oldScript = ucScriptCommon;
		// ideograph, hiragana and thai chars end the word right
		// after themselves
		if ( props & ( UC_IDEOGRAPH | UC_HIRAGANA | UC_THAI ) ) {
			// include it
			i += getUtf8CharSize(s+i);
			// but stop
			break;
		}
		// script change? end the word before this char
		if ( saved != oldScript ) break;
	}
	
	// . java++, A++, C++ exception
	// . A+, C+, exception
	// . TODO: consider putting in Bits.cpp w/ D_CAN_BE_IN_PHRASE
	if ( s[i]=='+' ) {
		if ( s[i+1]=='+' && !is_alnum_utf8(&s[i+2]) ) i += 2;
		else if ( !is_alnum_utf8(&s[i+1]) ) i++;
	}
	// . c#, j#, ...
	if ( s[i]=='#' && !is_alnum_utf8(&s[i+1]) ) i++;
	
	// allow for words like we're dave's and i'm
	// (only one apostrophe per word: hadApostrophe blocks a second one)
	if(s[i]=='\''&&s[i+1]&&is_alnum_utf8(&s[i+1])&&!hadApostrophe){
		i++;
		hadApostrophe = true;
		goto again;
	}
	hadApostrophe = false;
	
	// get word length
	wlen = i - j;
	if ( m_numWords >= m_preCount ) goto done;
	m_words   [ m_numWords  ] = &s[j];
	m_wordLens[ m_numWords  ] = wlen;
	// . Lars says it's better to leave the accented chars intact
	// . google agrees
	// . but what about "re'sume"?
	if ( computeWordIds ) {
		long long h = hash64Lower_utf8(&s[j],wlen);
		m_wordIds [m_numWords] = h;
		// until we get an accent removal algo, comment this
		// out and possibly use the query synonym pipeline
		// to search without accents. MDW
		//long long h2 = hash64AsciiLowerE(&s[j],wlen);
		//if ( h2 != h ) m_stripWordIds [m_numWords] = h2;
		//else           m_stripWordIds [m_numWords] = 0LL;
		//m_stripWordIds[m_numWords] = 0;
	}
	if (m_tagIds) m_tagIds[m_numWords] = 0;
	m_numWords++;
	m_numAlnumWords++;
	// break on \0 or MAX_WORDS
	//if ( ! s[i] ) goto done;
	// get a punct word
	goto uptop;
	/*
	  j = i;
	  // delineate the "punctuation" word
	  for ( ; s[i] && !is_alnum_utf8(&s[i]);i+=getUtf8CharSize(s+i));
	  // bad utf8 could cause us to breach the node, so watch out!
	  if ( i > nodeLen ) {
	  badCount++;
	  i = nodeLen;
	  }
	  // get word length
	  wlen = i - j;
	  if ( m_numWords >= m_preCount ) goto done;
	  m_words        [m_numWords  ] = &s[j];
	  m_wordLens     [m_numWords  ] = wlen;
	  m_wordIds      [m_numWords  ] = 0LL;
	  if (m_tagIds) m_tagIds[m_numWords] = 0;
	  m_numWords++;
	*/

 done:
	// bad programming warning
	if ( m_numWords > m_preCount ) {
		log(LOG_LOGIC,
		    "build: words: set: Fix counting routine.");
		// writes through a NULL pointer (presumably a deliberate
		// crash on this logic error)
		char *xx = NULL; *xx = 0;
	}
	// compute total length: offset of the end of the last word from s
	if ( m_numWords <= 0 ) m_totalLen = 0;
	else m_totalLen = m_words[m_numWords-1] - s + m_wordLens[m_numWords-1];

	if ( badCount )
		log("words: had %li bad utf8 chars",badCount);

	return true;
}

示例#7

0

显示文件

文件： Words.cpp 项目： privacore/open-source-search-engine

// Splits the text at s into words and appends them to the m_words /
// m_wordLens / m_wordIds / m_nodes (and m_tagIds) arrays, advancing
// m_numWords.
// . three kinds of words are produced: tag words (only if m_hasTags),
//   punctuation words (runs of non-word chars) and alnum words
// . s: text to scan; scanning stops at a NUL byte or when the byte index
//   reaches nodeLen at the top of the main loop
// . computeWordIds: if true, each alnum word is hashed with
//   hash64Lower_utf8 into m_wordIds
// . no more than m_preCount words are stored
// . always returns true
bool Words::addWords( char *s, int32_t nodeLen, bool computeWordIds ) {
	// current byte offset into s
	int32_t  i = 0;
	// byte offset where the current alnum word started
	int32_t  j;
	int32_t  wlen;

	// allows a single apostrophe inside an alnum word
	bool hadApostrophe = false;

	// script of the most recent word char; used to split an alnum word
	// when the script changes
	UCScript oldScript = ucScriptCommon;
	UCScript saved;
	UCProps props;

 uptop:

	// bad utf8 can cause a breach
	if ( i >= nodeLen ) {
		goto done;
	}

	if ( ! s[i] ) {
		goto done;
	}

	if ( !is_alnum_utf8( s + i ) ) {
		if ( m_numWords >= m_preCount ) {
			goto done;
		}

		// tag?
		if ( s[i]=='<' && m_hasTags && isTagStart(s+i) ) {
			// get the tag id
			if( m_tagIds ) {
				if ( s[i + 1] == '/' ) {
					// skip over /
					m_tagIds[m_numWords] = ::getTagId( s + i + 2 );
					m_tagIds[m_numWords] |= BACKBIT;
				} else {
					m_tagIds[m_numWords] = ::getTagId( s + i + 1 );
				}
			}

			// the tag word starts at the '<' and has no word id
			m_words[m_numWords] = s + i;
			m_wordIds[m_numWords] = 0LL;

			// skip till end
			int32_t tagLen = getTagLen( s + i );
			m_wordLens[m_numWords] = tagLen;
			m_nodes[m_numWords] = 0;
			m_numWords++;

			// advance
			i += tagLen;
			goto uptop;
		}

		// it is a punct word, find end of it
		char *start = s+i;
		for ( ; s[i] ; i += getUtf8CharSize(s+i)) {
			// stop on < if we got tags
			if ( s[i] == '<' && m_hasTags ) {
				break;
			}

			// if we are simple ascii, skip quickly
			if ( is_ascii(s[i]) ) {
				// accumulate NON-alnum chars
				if ( ! is_alnum_a(s[i]) ) {
					continue;
				}

				// update
				oldScript = ucScriptCommon;

				// otherwise, stop we got alnum
				break;
			}

			// if we are utf8 we stop on special props
			UChar32 c = utf8Decode ( s+i );

			// stop if word char
			if ( ! ucIsWordChar ( c ) ) {
				continue;
			}

			// update first though
			// NOTE(review): unlike the alnum loop below, Latin is
			// not folded into ucScriptCommon here -- confirm that
			// is intended
			oldScript = ucGetScript ( c );

			// then stop
			break;
		}
		// record the punct word; it has no word id and no tag id
		m_words        [ m_numWords  ] = start;
		m_wordLens     [ m_numWords  ] = s+i - start;
		m_wordIds      [ m_numWords  ] = 0LL;
		m_nodes        [ m_numWords  ] = 0;

		if (m_tagIds) {
			m_tagIds[m_numWords] = 0;
		}

		m_numWords++;
		goto uptop;
	}

	// get an alnum word
	j = i;
 again:
	for ( ; s[i] ; i += getUtf8CharSize(s+i) ) {
		// simple ascii?
		if ( is_ascii(s[i]) ) {
			// accumulate alnum chars
			if ( is_alnum_a(s[i]) ) continue;
			// update
			oldScript = ucScriptCommon;
			// otherwise, stop we got punct
			break;
		}
		// get the code point of the utf8 char
		UChar32 c = utf8Decode ( s+i );
		// get props
		props = ucProperties ( c );
		// ignorable/extending chars stay inside the word
		if ( props & (UC_IGNORABLE|UC_EXTEND) ) continue;
		// stop? if UC_WORDCHAR is set, that means it is an alnum
		if ( ! ( props & UC_WORDCHAR ) ) {
			// reset script between words
			oldScript = ucScriptCommon;
			break;
		}
		// save it
		saved = oldScript;
		// update here
		oldScript = ucGetScript(c);
		// treat ucScriptLatin (30) as common so we can have latin1
		// like char without breaking the word!
		if ( oldScript == ucScriptLatin ) oldScript = ucScriptCommon;
		// ideograph, hiragana and thai chars end the word right
		// after themselves
		if ( props & ( UC_IDEOGRAPH | UC_HIRAGANA | UC_THAI ) ) {
			// include it
			i += getUtf8CharSize(s+i);
			// but stop
			break;
		}
		// script change? end the word before this char
		if ( saved != oldScript ) break;
	}
	
	// . java++, A++, C++ exception
	// . A+, C+, exception
	// . TODO: consider putting in Bits.cpp w/ D_CAN_BE_IN_PHRASE
	if ( s[i]=='+' ) {
		if ( s[i+1]=='+' && !is_alnum_utf8(&s[i+2]) ) i += 2;
		else if ( !is_alnum_utf8(&s[i+1]) ) i++;
	}
	// . c#, j#, ...
	if ( s[i]=='#' && !is_alnum_utf8(&s[i+1]) ) i++;

	// comma is ok if like ,ddd!d
	// . only tried when the word so far is 1 to 3 bytes, all digits
	// . NOTE(review): s[i-1] is read here and in the decimal check
	//   below; this assumes i > 0 at this point -- confirm
	if ( s[i]==',' && 
	     i-j <= 3 &&
	     is_digit(s[i-1]) ) {
		// if word so far is 2 or 3 chars, make sure digits
		// (on failure nogo also skips the decimal point check)
		if ( i-j >= 2 && ! is_digit(s[i-2]) ) goto nogo;
		if ( i-j >= 3 && ! is_digit(s[i-3]) ) goto nogo;
		// scan forward, absorbing each ",ddd" group that is not
		// followed by a fourth digit
		while ( s[i] == ',' &&
		        is_digit(s[i+1]) &&
		        is_digit(s[i+2]) &&
		        is_digit(s[i+3]) &&
		        ! is_digit(s[i+4]) ) {
			i += 4;
		}
	}

	// decimal point? must have a digit on both sides
	if ( s[i] == '.' &&
	     is_digit(s[i-1]) &&
	     is_digit(s[i+1]) ) {
		// allow the decimal point
		i++;
		// skip over string of digits
		while ( is_digit(s[i]) ) i++;
	}
	
 nogo:

	// allow for words like we're dave's and i'm
	// (only one apostrophe per word: hadApostrophe blocks a second one)
	if ( s[i] == '\'' && s[i + 1] && is_alnum_utf8( &s[i + 1] ) && !hadApostrophe ) {
		i++;
		hadApostrophe = true;
		goto again;
	}
	hadApostrophe = false;
	
	// get word length
	wlen = i - j;
	if ( m_numWords >= m_preCount ) goto done;
	m_words   [ m_numWords  ] = &s[j];
	m_wordLens[ m_numWords  ] = wlen;

	// the word id is the hash of the word's bytes
	if ( computeWordIds ) {
		int64_t h = hash64Lower_utf8(&s[j],wlen);
		m_wordIds [m_numWords] = h;
	}

	m_nodes[m_numWords] = 0;
	if (m_tagIds) m_tagIds[m_numWords] = 0;
	m_numWords++;
	m_numAlnumWords++;
	// get a punct word
	goto uptop;

 done:
	// bad programming warning
	if ( m_numWords > m_preCount ) {
		log(LOG_LOGIC, "build: words: set: Fix counting routine.");
		gbshutdownLogicError();
	}

	return true;
}